From d6b8bf87c2a7cec3f89fba97b5536bb697b8c424 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Mon, 8 Jun 2026 16:39:33 +0200
Subject: [PATCH] =?UTF-8?q?fix:=204=20Bugs=20gemeinsam=20=E2=80=94=20B22?=
 =?UTF-8?q?=20PDF=20+=20B17=20Walk-Fallback=20+=20company=5Fname=20+=20Pla?=
 =?UTF-8?q?usibility-Fallback?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(1) B22 Cross-Domain (fix #59):
  Elli-Test fand AGB auf logpay.de NICHT obwohl URL in doc_entries
  korrekt. Vermutete Ursache: Discovery-Phase A drops/überschreibt
  Original-URL bei PDF-Fetch-Fail (word_count=0).
  Fix: _collect_audit_urls() iteriert über state.doc_entries +
  rejected_url + req.documents — Cross-Domain-Hosting ist
  unabhängig vom Text-Inhalt. Plus Trace-Logging für künftige
  Diagnose. Dedup per (doc_type, host_sld).

(2) B17 Audit-Walk-Fail-Fallback (fix #60):
  BMW v5 hatte audit_walk=None ohne Mail-Hinweis. Vermutlich
  180s-Timeout bei OneTrust-CMP-Banner-Tour.
  Fix: Timeout 180s → 300s. Plus: Bei Fail wird ein Hinweis-
  Stub mit error-Grund in state["audit_walk"] + HTML-Block
  geschrieben — Reviewer sieht den Fail statt silent-skip.

(3) company_name + origin_domain im Backend (fix #61):
  Frontend sendet seit ec03317 die zwei Felder — Backend ignorierte
  sie.
  Fix: ComplianceCheckRequest-Schema um company_name +
  origin_domain erweitert. phase_e_email priorisiert User-Input
  vor URL-Heuristik für site_name. Bei origin_domain ohne
  ableitbare doc_entries-domain wird der User-Input als domain
  übernommen.

(4) Plausibility-LLM Fallback-Modell (fix #62):
  qwen3:30b-a3b liefert auf großen DSEs (BMW 122 FAIL) gehäuft
  leere format='json'-Responses — der Circuit-Breaker griff zwar,
  aber die Phase blieb nutzlos.
  Fix: Default-Modell auf qwen2.5:7b umgestellt (4× kleiner,
  zuverlässiger bei format=json, ausreichendes Reasoning für
  PASS/MODIFY/DROP-Klassifikation). Plus Strategy-C eingeführt
  — Fallback-Modell (llama3.2:3b) wenn primary leer bleibt.
  BATCH_SIZE 4 → 3. ENV-Switches PLAUSIBILITY_LLM_MODEL +
  PLAUSIBILITY_FALLBACK_MODEL für Tuning.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../compliance/api/agent_check/_b17_wiring.py | 34 ++++++++--
 .../api/agent_check/_phase_e_email.py         | 12 +++-
 .../compliance/api/agent_check/_schemas.py    |  5 ++
 .../services/cross_domain_doc_check.py        | 57 +++++++++++++---
 .../services/finding_plausibility_check.py    | 65 ++++++++++++-------
 5 files changed, 138 insertions(+), 35 deletions(-)

diff --git a/backend-compliance/compliance/api/agent_check/_b17_wiring.py b/backend-compliance/compliance/api/agent_check/_b17_wiring.py
index d2de8ce1..4ff8a2fe 100644
--- a/backend-compliance/compliance/api/agent_check/_b17_wiring.py
+++ b/backend-compliance/compliance/api/agent_check/_b17_wiring.py
@@ -57,20 +57,46 @@ async def run_b17(state: dict) -> None:
         return
 
     walk: dict = {}
+    walk_error: str | None = None
     try:
-        async with httpx.AsyncClient(timeout=180.0) as c:
+        async with httpx.AsyncClient(timeout=300.0) as c:
             r = await c.post(
                 f"{CONSENT_TESTER_URL}/scan-audit-walk",
                 json={"url": homepage, "dwell_s": 4.0, "max_links": 8},
-                timeout=180.0,
+                timeout=300.0,
             )
             if r.status_code == 200:
                 walk = r.json()
+            else:
+                walk_error = f"consent-tester HTTP {r.status_code}"
     except Exception as e:
-        logger.warning("B17 audit-walk request failed: %s", e)
-        return
+        walk_error = f"{type(e).__name__}: {str(e)[:120]}"
+        logger.warning("B17 audit-walk request failed: %s", walk_error)
 
     if not walk or not walk.get("walk_id"):
+        # Fallback stub so the audit report shows a notice instead of
+        # "audit_walk: None"; the reviewer sees the failure.
+        from html import escape  # homepage/error text are untrusted in HTML
+        state["audit_walk"] = {
+            "walk_id": "",
+            "url": homepage,
+            "video": {},
+            "actions": [],
+            "annotations": [],
+            "error": walk_error or "unknown (no walk_id returned)",
+        }
+        state["audit_walk_html"] = (
+            "<div style='margin:24px 0;padding:16px;border-left:4px solid #f59e0b;"
+            "background:#fef3c7;border-radius:4px;'>"
+            "<h2 style='margin:0 0 8px;color:#92400e;font-size:16px;'>"
+            "⚠️ Audit-Walk konnte nicht aufgezeichnet werden</h2>"
+            "<p style='margin:0;font-size:13px;color:#92400e;'>"
+            f"Site: <code>{escape(str(homepage))}</code> · Ursache: "
+            f"<code>{escape(walk_error or 'unknown')}</code>. Mögliche "
+            "Gründe: komplexes CMP-Banner (lange Tour-Zeit), Anti-Bot-"
+            "Protection, oder consent-tester überlastet.</p>"
+            "</div>"
+        )
         return
 
     # Stufe-5: annotierte Screenshots pro Finding. Schickt die
diff --git a/backend-compliance/compliance/api/agent_check/_phase_e_email.py b/backend-compliance/compliance/api/agent_check/_phase_e_email.py
index 409465a8..65c9e160 100644
--- a/backend-compliance/compliance/api/agent_check/_phase_e_email.py
+++ b/backend-compliance/compliance/api/agent_check/_phase_e_email.py
@@ -36,7 +36,17 @@ def run_phase_e(state: dict) -> None:
     doc_count = len([r for r in results if not r.error])
     url_company = _company_name_from_url(doc_entries)
     domain = _extract_domain(doc_entries)
-    site_name = url_company or domain or "Unbekannt"
+    # Priority: user input (req.company_name) > URL heuristic > "Unbekannt"
+    req_company = (getattr(req, "company_name", None) or "").strip()
+    req_domain = (getattr(req, "origin_domain", None) or "").strip()
+    site_name = req_company or url_company or domain or "Unbekannt"
+    if req_domain and not domain:
+        # No URL-derived domain: use user input. removeprefix, not lstrip (char set)
+        from urllib.parse import urlparse
+        try:
+            domain = urlparse(req_domain).netloc.removeprefix("www.") or req_domain
+        except ValueError:
+            domain = req_domain
     _update(check_id, "E-Mail wird versendet...", 98)
 
     # A1: bundle cookie-evidence slices into a ZIP attachment so the
diff --git a/backend-compliance/compliance/api/agent_check/_schemas.py b/backend-compliance/compliance/api/agent_check/_schemas.py
index d4625533..94aa7591 100644
--- a/backend-compliance/compliance/api/agent_check/_schemas.py
+++ b/backend-compliance/compliance/api/agent_check/_schemas.py
@@ -28,6 +28,11 @@ class ComplianceCheckRequest(BaseModel):
     # Rechtsform, Konzern, MA, Besondere Daten, Drittland). Wird im
     # Snapshot persistiert und filtert die MC-Auswertung (P72).
     scan_context: dict | None = None
+    # Frontend-eingegebene Firma + Origin-Domain. Priorisiert vor
+    # LLM-extracted_profile-Inferenz. Wenn leer: Fallback auf Heuristik
+    # aus URL-Domains und DSE-Text.
+    company_name: str | None = None
+    origin_domain: str | None = None
 
 
 class ComplianceCheckStartResponse(BaseModel):
diff --git a/backend-compliance/compliance/services/cross_domain_doc_check.py b/backend-compliance/compliance/services/cross_domain_doc_check.py
index 773812a6..ddb042d8 100644
--- a/backend-compliance/compliance/services/cross_domain_doc_check.py
+++ b/backend-compliance/compliance/services/cross_domain_doc_check.py
@@ -87,17 +87,52 @@ def _site_origin_sld(state: dict) -> str:
     return max(counter, key=counter.get)
 
 
-def check_cross_domain_docs(state: dict) -> list[dict]:
-    """Emit findings for doc_entries whose URL has a different SLD
-    than the site origin."""
-    primary = _site_origin_sld(state)
-    if not primary:
-        return []
-    findings: list[dict] = []
+def _collect_audit_urls(state: dict) -> list[tuple[str, str]]:
+    """Collect unique (doc_type, url) pairs in first-seen order from each
+    state["doc_entries"] item's "url" and "rejected_url", then from the
+    "documents" of state["req"]. Pairs with an empty URL or doc_type are
+    skipped. Discovery may lose original URLs (PDF fetch fail, auto-
+    reclassify), but cross-domain hosting is independent of file content.
+    """
+    seen: set[tuple[str, str]] = set()
+    out: list[tuple[str, str]] = []
     for e in (state.get("doc_entries") or []):
         url = (e.get("url") or "").strip()
         doc_type = (e.get("doc_type") or "").lower()
-        if not url or "://" not in url:
+        if url and doc_type and (doc_type, url) not in seen:
+            seen.add((doc_type, url))
+            out.append((doc_type, url))
+        # rejected_url is the original URL that discovery rejected
+        rej = (e.get("rejected_url") or "").strip()
+        if rej and doc_type and (doc_type, rej) not in seen:
+            seen.add((doc_type, rej))
+            out.append((doc_type, rej))
+    # Fallback: req.documents — entered explicitly by the user
+    req = state.get("req")
+    if req is not None:
+        for d in getattr(req, "documents", []) or []:
+            url = (getattr(d, "url", "") or "").strip()
+            doc_type = (getattr(d, "doc_type", "") or "").lower()
+            if url and doc_type and (doc_type, url) not in seen:
+                seen.add((doc_type, url))
+                out.append((doc_type, url))
+    return out
+
+
+def check_cross_domain_docs(state: dict) -> list[dict]:
+    """Emit findings for doc-URLs whose host has a different SLD
+    than the site origin."""
+    primary = _site_origin_sld(state)
+    if not primary:
+        logger.info("B22 cross-domain: kein primary SLD ermittelbar")
+        return []
+    findings: list[dict] = []
+    audit_urls = _collect_audit_urls(state)
+    logger.info("B22 cross-domain: primary=%s, prüfe %d URL(s)",
+                primary, len(audit_urls))
+    emitted_keys: set[tuple[str, str]] = set()
+    for doc_type, url in audit_urls:
+        if "://" not in url:
             continue
         try:
             host = urlparse(url).netloc
@@ -106,6 +141,12 @@ def check_cross_domain_docs(state: dict) -> list[dict]:
             continue
         if not url_sld or url_sld == primary:
             continue
+        # Dedup pro (doc_type, host_sld) damit rejected_url + url nicht
+        # doppelt gemeldet werden
+        e_key = (doc_type, url_sld)
+        if e_key in emitted_keys:
+            continue
+        emitted_keys.add(e_key)
         # Cross-Domain detected
         severity = _SEVERITY_BY_DOC.get(doc_type, "MEDIUM")
         doc_label = {
diff --git a/backend-compliance/compliance/services/finding_plausibility_check.py b/backend-compliance/compliance/services/finding_plausibility_check.py
index 6789edd2..294bd374 100644
--- a/backend-compliance/compliance/services/finding_plausibility_check.py
+++ b/backend-compliance/compliance/services/finding_plausibility_check.py
@@ -50,11 +50,19 @@ import httpx
 logger = logging.getLogger(__name__)
 
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
-MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen3:30b-a3b")
-# Reduced from 8 → 4 to fight qwen3 empty-response-on-large-prompts bug.
-# 4 items × ~500 token/item + 2000 system + 1500 excerpt = ~5500 token total,
-# well within qwen3's safe range for format='json'.
-BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "4"))
+# Default-Modell als ENV-Switch konfigurierbar. qwen3:30b-a3b ist
+# bestes Reasoning, aber gibt bei großen DSEs gerne leere Responses
+# unter format='json'. qwen2.5:7b ist 4× kleiner, deutlich
+# zuverlässiger, leicht schwächeres Reasoning aber für die einfache
+# Plausibility-Klassifikation (PASS/MODIFY/DROP) ausreichend.
+MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen2.5:7b")
+# Fallback-Modell (Strategy C), wenn das Primary-Modell in Strategy A
+# und B leer bleibt — also vor dem Batch-Split (Strategy D). Default
+# ist ein kleines, robustes Modell.
+FALLBACK_MODEL = os.getenv("PLAUSIBILITY_FALLBACK_MODEL", "llama3.2:3b")
+# Mit kleinerem Modell könnten größere Batches funktionieren — wir
+# bleiben aber konservativ, damit ein Modell-Fail nicht die ganze Phase killt.
+BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "3"))
 TIMEOUT = float(os.getenv("PLAUSIBILITY_TIMEOUT_S", "45.0"))
 # Reduced excerpt 4000 → 1500 chars (same reason).
 DOC_EXCERPT_CHARS = int(os.getenv("PLAUSIBILITY_DOC_EXCERPT", "1500"))
@@ -173,33 +181,46 @@ async def _ask_llm_batch(items: list[dict], doc_title: str,
     """Send a batch of up to BATCH_SIZE findings to the LLM.
 
     Resilience strategy (P125 fix for empty-response bug):
-      A. format='json' (strict) — current default
-      B. If A returns empty: format='' (loose), extract JSON manually
-      C. If B also empty AND batch >2: split batch + recurse
-      D. Else: give up, return {} (callers stamp llm_skipped=true)
+      A. primary MODEL + format='json' (strict)
+      B. If A returns empty: primary MODEL + format='' (loose), extract JSON manually
+      C. If B also empty: FALLBACK_MODEL + format='json' (smaller, more robust model)
+      D. If batch >2: split + recurse
+      E. Else: give up, return {} (callers stamp llm_skipped=true)
     """
     user_prompt = _build_user_prompt(items, doc_title, doc_excerpt)
-    base_body = {
-        "model": MODEL,
-        "messages": [
-            {"role": "system", "content": _SYSTEM_PROMPT},
-            {"role": "user", "content": user_prompt},
-        ],
-        "stream": False,
-        "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
-    }
+
+    def _body(model: str) -> dict:
+        return {
+            "model": model,
+            "messages": [
+                {"role": "system", "content": _SYSTEM_PROMPT},
+                {"role": "user", "content": user_prompt},
+            ],
+            "stream": False,
+            "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
+        }
+
     out: dict[str, dict] = {}
     input_ids = [it["id"] for it in items]
     try:
-        # Strategy A: format='json'
-        content = await _post_llm({**base_body, "format": "json"})
+        # Strategy A: primary + format='json'
+        content = await _post_llm({**_body(MODEL), "format": "json"})
         if not content:
-            # Strategy B: format-free, parse-on-our-side
+            # Strategy B: primary + format-free
             logger.info(
                 "plausibility A→empty, trying B (format-free) batch=%d",
                 len(items),
             )
-            content = await _post_llm(base_body)
+            content = await _post_llm(_body(MODEL))
+        if not content and FALLBACK_MODEL and FALLBACK_MODEL != MODEL:
+            # Strategy C: fallback-model + format='json'
+            logger.info(
+                "plausibility A+B empty, trying C (fallback=%s) batch=%d",
+                FALLBACK_MODEL, len(items),
+            )
+            content = await _post_llm(
+                {**_body(FALLBACK_MODEL), "format": "json"},
+            )
 
         if not content:
             # Strategy C: split + recurse