From d6b8bf87c2a7cec3f89fba97b5536bb697b8c424 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 8 Jun 2026 16:39:33 +0200 Subject: [PATCH] =?UTF-8?q?fix:=204=20Bugs=20gemeinsam=20=E2=80=94=20B22?= =?UTF-8?q?=20PDF=20+=20B17=20Walk-Fallback=20+=20company=5Fname=20+=20Pla?= =?UTF-8?q?usibility-Fallback?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (1) B22 Cross-Domain (fix #59): Elli-Test fand AGB auf logpay.de NICHT obwohl URL in doc_entries korrekt. Vermutete Ursache: Discovery-Phase A drops/überschreibt Original-URL bei PDF-Fetch-Fail (word_count=0). Fix: _collect_audit_urls() iteriert über state.doc_entries + rejected_url + req.documents — Cross-Domain-Hosting ist unabhängig vom Text-Inhalt. Plus Trace-Logging für künftige Diagnose. Dedup per (doc_type, host_sld). (2) B17 Audit-Walk-Fail-Fallback (fix #60): BMW v5 hatte audit_walk=None ohne Mail-Hinweis. Vermutlich 180s-Timeout bei OneTrust-CMP-Banner-Tour. Fix: Timeout 180s → 300s. Plus: Bei Fail wird ein Hinweis- Stub mit error-Grund in state["audit_walk"] + HTML-Block geschrieben — Reviewer sieht den Fail statt silent-skip. (3) company_name + origin_domain im Backend (fix #61): Frontend sendet seit ec03317 die zwei Felder — Backend ignorierte sie. Fix: ComplianceCheckRequest-Schema um company_name + origin_domain erweitert. phase_e_email priorisiert User-Input vor URL-Heuristik für site_name. Bei origin_domain ohne ableitbare doc_entries-domain wird der User-Input als domain übernommen. (4) Plausibility-LLM Fallback-Modell (fix #62): qwen3:30b-a3b liefert auf großen DSEs (BMW 122 FAIL) gehäuft leere format='json'-Responses — Circuit-Breaker griff aber Phase blieb nutzlos. Fix: Default-Modell auf qwen2.5:7b umgestellt (4× kleiner, zuverlässiger bei format=json, ausreichendes Reasoning für PASS/MODIFY/DROP-Klassifikation). Plus Strategy-C eingeführt — Fallback-Modell (llama3.2:3b) wenn primary leer bleibt. BATCH_SIZE 4 → 3. 
ENV-Switches PLAUSIBILITY_LLM_MODEL + PLAUSIBILITY_FALLBACK_MODEL für Tuning. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../compliance/api/agent_check/_b17_wiring.py | 34 ++++++++-- .../api/agent_check/_phase_e_email.py | 12 +++- .../compliance/api/agent_check/_schemas.py | 5 ++ .../services/cross_domain_doc_check.py | 57 +++++++++++++--- .../services/finding_plausibility_check.py | 65 ++++++++++++------- 5 files changed, 138 insertions(+), 35 deletions(-) diff --git a/backend-compliance/compliance/api/agent_check/_b17_wiring.py b/backend-compliance/compliance/api/agent_check/_b17_wiring.py index d2de8ce1..4ff8a2fe 100644 --- a/backend-compliance/compliance/api/agent_check/_b17_wiring.py +++ b/backend-compliance/compliance/api/agent_check/_b17_wiring.py @@ -57,20 +57,46 @@ async def run_b17(state: dict) -> None: return walk: dict = {} + walk_error: str | None = None try: - async with httpx.AsyncClient(timeout=180.0) as c: + async with httpx.AsyncClient(timeout=300.0) as c: r = await c.post( f"{CONSENT_TESTER_URL}/scan-audit-walk", json={"url": homepage, "dwell_s": 4.0, "max_links": 8}, - timeout=180.0, + timeout=300.0, ) if r.status_code == 200: walk = r.json() + else: + walk_error = f"consent-tester HTTP {r.status_code}" except Exception as e: - logger.warning("B17 audit-walk request failed: %s", e) - return + walk_error = f"{type(e).__name__}: {str(e)[:120]}" + logger.warning("B17 audit-walk request failed: %s", walk_error) if not walk or not walk.get("walk_id"): + # Fallback-Stub damit Audit-Report einen Hinweis bekommt + # statt "audit_walk: None". Reviewer sieht den Fail. + state["audit_walk"] = { + "walk_id": "", + "url": homepage, + "video": {}, + "actions": [], + "annotations": [], + "error": walk_error or "unknown (no walk_id returned)", + } + state["audit_walk_html"] = ( + "
" + "

" + "⚠️ Audit-Walk konnte nicht aufgezeichnet werden" + "

" + f"

" + f"Site: {homepage} · Ursache: " + f"{walk_error or 'unknown'}. Mögliche " + "Gründe: komplexes CMP-Banner (lange Tour-Zeit), Anti-Bot-" + "Protection, oder consent-tester überlastet.

" + "
" + ) return # Stufe-5: annotierte Screenshots pro Finding. Schickt die diff --git a/backend-compliance/compliance/api/agent_check/_phase_e_email.py b/backend-compliance/compliance/api/agent_check/_phase_e_email.py index 409465a8..65c9e160 100644 --- a/backend-compliance/compliance/api/agent_check/_phase_e_email.py +++ b/backend-compliance/compliance/api/agent_check/_phase_e_email.py @@ -36,7 +36,17 @@ def run_phase_e(state: dict) -> None: doc_count = len([r for r in results if not r.error]) url_company = _company_name_from_url(doc_entries) domain = _extract_domain(doc_entries) - site_name = url_company or domain or "Unbekannt" + # Priority: user input (req.company_name) > URL heuristic > "Unbekannt" + req_company = (getattr(req, "company_name", None) or "").strip() + req_domain = (getattr(req, "origin_domain", None) or "").strip() + site_name = req_company or url_company or domain or "Unbekannt" + if req_domain and not domain: + # No domain derivable from the URLs: use the user input; removeprefix drops only a literal "www." (lstrip would eat any leading 'w'/'.') + from urllib.parse import urlparse + try: + domain = urlparse(req_domain).netloc.removeprefix("www.") or req_domain + except Exception: + domain = req_domain _update(check_id, "E-Mail wird versendet...", 98) # A1: bundle cookie-evidence slices into a ZIP attachment so the diff --git a/backend-compliance/compliance/api/agent_check/_schemas.py b/backend-compliance/compliance/api/agent_check/_schemas.py index d4625533..94aa7591 100644 --- a/backend-compliance/compliance/api/agent_check/_schemas.py +++ b/backend-compliance/compliance/api/agent_check/_schemas.py @@ -28,6 +28,11 @@ class ComplianceCheckRequest(BaseModel): # Rechtsform, Konzern, MA, Besondere Daten, Drittland). Wird im # Snapshot persistiert und filtert die MC-Auswertung (P72). scan_context: dict | None = None + # Frontend-eingegebene Firma + Origin-Domain. Priorisiert vor + # LLM-extracted_profile-Inferenz. Wenn leer: Fallback auf Heuristik + # aus URL-Domains und DSE-Text. 
+ company_name: str | None = None + origin_domain: str | None = None class ComplianceCheckStartResponse(BaseModel): diff --git a/backend-compliance/compliance/services/cross_domain_doc_check.py b/backend-compliance/compliance/services/cross_domain_doc_check.py index 773812a6..ddb042d8 100644 --- a/backend-compliance/compliance/services/cross_domain_doc_check.py +++ b/backend-compliance/compliance/services/cross_domain_doc_check.py @@ -87,17 +87,52 @@ def _site_origin_sld(state: dict) -> str: return max(counter, key=counter.get) -def check_cross_domain_docs(state: dict) -> list[dict]: - """Emit findings for doc_entries whose URL has a different SLD - than the site origin.""" - primary = _site_origin_sld(state) - if not primary: - return [] - findings: list[dict] = [] +def _collect_audit_urls(state: dict) -> list[tuple[str, str]]: + """Collect unique (doc_type, url) pairs from both sources: the + doc_entries in state (each entry's url and rejected_url, after + discovery) and req.documents (the user's original input). Discovery + can lose original URLs (PDF fetch failure, auto-reclassify), but + cross-domain hosting is legally independent of the file's text. 
+ """ + seen: set[tuple[str, str]] = set() + out: list[tuple[str, str]] = [] for e in (state.get("doc_entries") or []): url = (e.get("url") or "").strip() doc_type = (e.get("doc_type") or "").lower() - if not url or "://" not in url: + if url and doc_type and (doc_type, url) not in seen: + seen.add((doc_type, url)) + out.append((doc_type, url)) + # rejected_url is the original URL that discovery rejected + rej = (e.get("rejected_url") or "").strip() + if rej and doc_type and (doc_type, rej) not in seen: + seen.add((doc_type, rej)) + out.append((doc_type, rej)) + # Fallback: req.documents — the user entered these explicitly + req = state.get("req") + if req is not None: + for d in getattr(req, "documents", []) or []: + url = (getattr(d, "url", "") or "").strip() + doc_type = (getattr(d, "doc_type", "") or "").lower() + if url and doc_type and (doc_type, url) not in seen: + seen.add((doc_type, url)) + out.append((doc_type, url)) + return out + + +def check_cross_domain_docs(state: dict) -> list[dict]: + """Emit findings for doc-URLs whose host has a different SLD + than the site origin.""" + primary = _site_origin_sld(state) + if not primary: + logger.info("B22 cross-domain: kein primary SLD ermittelbar") + return [] + findings: list[dict] = [] + audit_urls = _collect_audit_urls(state) + logger.info("B22 cross-domain: primary=%s, prüfe %d URL(s)", + primary, len(audit_urls)) + emitted_keys: set[tuple[str, str]] = set() + for doc_type, url in audit_urls: + if "://" not in url: continue try: host = urlparse(url).netloc @@ -106,6 +141,12 @@ def check_cross_domain_docs(state: dict) -> list[dict]: continue if not url_sld or url_sld == primary: continue + # Dedupe per (doc_type, host_sld) so rejected_url and url are not + # reported twice + e_key = (doc_type, url_sld) + if e_key in emitted_keys: + continue + emitted_keys.add(e_key) # Cross-Domain detected severity = _SEVERITY_BY_DOC.get(doc_type, "MEDIUM") doc_label = { diff --git 
a/backend-compliance/compliance/services/finding_plausibility_check.py b/backend-compliance/compliance/services/finding_plausibility_check.py index 6789edd2..294bd374 100644 --- a/backend-compliance/compliance/services/finding_plausibility_check.py +++ b/backend-compliance/compliance/services/finding_plausibility_check.py @@ -50,11 +50,19 @@ import httpx logger = logging.getLogger(__name__) OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") -MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen3:30b-a3b") -# Reduced from 8 → 4 to fight qwen3 empty-response-on-large-prompts bug. -# 4 items × ~500 token/item + 2000 system + 1500 excerpt = ~5500 token total, -# well within qwen3's safe range for format='json'. -BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "4")) +# Default model, configurable via ENV switch. qwen3:30b-a3b has the +# best reasoning but tends to return empty responses on large DSEs +# under format='json'. qwen2.5:7b is 4× smaller, noticeably more +# reliable, with slightly weaker reasoning that is still sufficient for +# the simple plausibility classification (PASS/MODIFY/DROP). +MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen2.5:7b") +# Fallback model, tried as Strategy C when the primary model returned +# nothing for both Strategy A and Strategy B. The default is a +# small, robust model. +FALLBACK_MODEL = os.getenv("PLAUSIBILITY_FALLBACK_MODEL", "llama3.2:3b") +# A smaller model might cope with larger batches, but stay conservative +# (default lowered 4 → 3) so one model failure does not sink the whole phase. +BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "3")) TIMEOUT = float(os.getenv("PLAUSIBILITY_TIMEOUT_S", "45.0")) # Reduced excerpt 4000 → 1500 chars (same reason). DOC_EXCERPT_CHARS = int(os.getenv("PLAUSIBILITY_DOC_EXCERPT", "1500")) @@ -173,33 +181,46 @@ async def _ask_llm_batch(items: list[dict], doc_title: str, """Send a batch of up to BATCH_SIZE findings to the LLM. Resilience strategy (P125 fix for empty-response bug): - A. 
format='json' (strict) — current default - B. If A returns empty: format='' (loose), extract JSON manually - C. If B also empty AND batch >2: split batch + recurse - D. Else: give up, return {} (callers stamp llm_skipped=true) + A. primary MODEL + format='json' (strict) + B. primary MODEL + format='' (loose), parse JSON manuell + C. FALLBACK_MODEL + format='json' (kleineres robusteres Modell) + D. If batch >2: split + recurse + E. Else: give up, return {} (callers stamp llm_skipped=true) """ user_prompt = _build_user_prompt(items, doc_title, doc_excerpt) - base_body = { - "model": MODEL, - "messages": [ - {"role": "system", "content": _SYSTEM_PROMPT}, - {"role": "user", "content": user_prompt}, - ], - "stream": False, - "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500}, - } + + def _body(model: str) -> dict: + return { + "model": model, + "messages": [ + {"role": "system", "content": _SYSTEM_PROMPT}, + {"role": "user", "content": user_prompt}, + ], + "stream": False, + "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500}, + } + out: dict[str, dict] = {} input_ids = [it["id"] for it in items] try: - # Strategy A: format='json' - content = await _post_llm({**base_body, "format": "json"}) + # Strategy A: primary + format='json' + content = await _post_llm({**_body(MODEL), "format": "json"}) if not content: - # Strategy B: format-free, parse-on-our-side + # Strategy B: primary + format-free logger.info( "plausibility A→empty, trying B (format-free) batch=%d", len(items), ) - content = await _post_llm(base_body) + content = await _post_llm(_body(MODEL)) + if not content and FALLBACK_MODEL and FALLBACK_MODEL != MODEL: + # Strategy C: fallback-model + format='json' + logger.info( + "plausibility A+B empty, trying C (fallback=%s) batch=%d", + FALLBACK_MODEL, len(items), + ) + content = await _post_llm( + {**_body(FALLBACK_MODEL), "format": "json"}, + ) if not content: # Strategy C: split + recurse