diff --git a/backend-compliance/compliance/api/agent_check/_b17_wiring.py b/backend-compliance/compliance/api/agent_check/_b17_wiring.py index d2de8ce1..4ff8a2fe 100644 --- a/backend-compliance/compliance/api/agent_check/_b17_wiring.py +++ b/backend-compliance/compliance/api/agent_check/_b17_wiring.py @@ -57,20 +57,46 @@ async def run_b17(state: dict) -> None: return walk: dict = {} + walk_error: str | None = None try: - async with httpx.AsyncClient(timeout=180.0) as c: + async with httpx.AsyncClient(timeout=300.0) as c: r = await c.post( f"{CONSENT_TESTER_URL}/scan-audit-walk", json={"url": homepage, "dwell_s": 4.0, "max_links": 8}, - timeout=180.0, + timeout=300.0, ) if r.status_code == 200: walk = r.json() + else: + walk_error = f"consent-tester HTTP {r.status_code}" except Exception as e: - logger.warning("B17 audit-walk request failed: %s", e) - return + walk_error = f"{type(e).__name__}: {str(e)[:120]}" + logger.warning("B17 audit-walk request failed: %s", walk_error) if not walk or not walk.get("walk_id"): + # Fallback-Stub damit Audit-Report einen Hinweis bekommt + # statt "audit_walk: None". Reviewer sieht den Fail. + state["audit_walk"] = { + "walk_id": "", + "url": homepage, + "video": {}, + "actions": [], + "annotations": [], + "error": walk_error or "unknown (no walk_id returned)", + } + state["audit_walk_html"] = ( + "
" + "

" + "⚠️ Audit-Walk konnte nicht aufgezeichnet werden" + "

" + f"

" + f"Site: {homepage} · Ursache: " + f"{walk_error or 'unknown'}. Mögliche " + "Gründe: komplexes CMP-Banner (lange Tour-Zeit), Anti-Bot-" + "Protection, oder consent-tester überlastet.

" + "
" + ) return # Stufe-5: annotierte Screenshots pro Finding. Schickt die diff --git a/backend-compliance/compliance/api/agent_check/_phase_e_email.py b/backend-compliance/compliance/api/agent_check/_phase_e_email.py index 409465a8..65c9e160 100644 --- a/backend-compliance/compliance/api/agent_check/_phase_e_email.py +++ b/backend-compliance/compliance/api/agent_check/_phase_e_email.py @@ -36,7 +36,17 @@ def run_phase_e(state: dict) -> None: doc_count = len([r for r in results if not r.error]) url_company = _company_name_from_url(doc_entries) domain = _extract_domain(doc_entries) - site_name = url_company or domain or "Unbekannt" + # Priorität: User-Input (req.company_name) > URL-Heuristik > "Unbekannt" + req_company = (getattr(req, "company_name", None) or "").strip() + req_domain = (getattr(req, "origin_domain", None) or "").strip() + site_name = req_company or url_company or domain or "Unbekannt" + if req_domain and not domain: + # Falls keine domain aus URLs ableitbar war: User-Input verwenden + from urllib.parse import urlparse + try: + domain = urlparse(req_domain).netloc.removeprefix("www.") or req_domain + except Exception: + domain = req_domain _update(check_id, "E-Mail wird versendet...", 98) # A1: bundle cookie-evidence slices into a ZIP attachment so the diff --git a/backend-compliance/compliance/api/agent_check/_schemas.py b/backend-compliance/compliance/api/agent_check/_schemas.py index d4625533..94aa7591 100644 --- a/backend-compliance/compliance/api/agent_check/_schemas.py +++ b/backend-compliance/compliance/api/agent_check/_schemas.py @@ -28,6 +28,11 @@ class ComplianceCheckRequest(BaseModel): # Rechtsform, Konzern, MA, Besondere Daten, Drittland). Wird im # Snapshot persistiert und filtert die MC-Auswertung (P72). scan_context: dict | None = None + # Frontend-eingegebene Firma + Origin-Domain. Priorisiert vor + # LLM-extracted_profile-Inferenz. Wenn leer: Fallback auf Heuristik + # aus URL-Domains und DSE-Text. 
+ company_name: str | None = None + origin_domain: str | None = None class ComplianceCheckStartResponse(BaseModel): diff --git a/backend-compliance/compliance/services/cross_domain_doc_check.py b/backend-compliance/compliance/services/cross_domain_doc_check.py index 773812a6..ddb042d8 100644 --- a/backend-compliance/compliance/services/cross_domain_doc_check.py +++ b/backend-compliance/compliance/services/cross_domain_doc_check.py @@ -87,17 +87,52 @@ def _site_origin_sld(state: dict) -> str: return max(counter, key=counter.get) -def check_cross_domain_docs(state: dict) -> list[dict]: - """Emit findings for doc_entries whose URL has a different SLD - than the site origin.""" - primary = _site_origin_sld(state) - if not primary: - return [] - findings: list[dict] = [] +def _collect_audit_urls(state: dict) -> list[tuple[str, str]]: + """Sammle (doc_type, url) aus BEIDEN Quellen — state.doc_entries + (nach Discovery) UND req.documents (USER-Original-Input). Discovery + kann Original-URLs verlieren (PDF-Fetch-Fail, Auto-Reclassify), aber + Cross-Domain-Hosting ist juristisch unabhängig vom Text-Inhalt + der Datei. 
+ """ + seen: set[tuple[str, str]] = set() + out: list[tuple[str, str]] = [] for e in (state.get("doc_entries") or []): url = (e.get("url") or "").strip() doc_type = (e.get("doc_type") or "").lower() - if not url or "://" not in url: + if url and doc_type and (doc_type, url) not in seen: + seen.add((doc_type, url)) + out.append((doc_type, url)) + # rejected_url ist die Original-URL die Discovery rejected hat + rej = (e.get("rejected_url") or "").strip() + if rej and doc_type and (doc_type, rej) not in seen: + seen.add((doc_type, rej)) + out.append((doc_type, rej)) + # Fallback: req.documents — USER hat sie explizit eingegeben + req = state.get("req") + if req is not None: + for d in getattr(req, "documents", []) or []: + url = (getattr(d, "url", "") or "").strip() + doc_type = (getattr(d, "doc_type", "") or "").lower() + if url and doc_type and (doc_type, url) not in seen: + seen.add((doc_type, url)) + out.append((doc_type, url)) + return out + + +def check_cross_domain_docs(state: dict) -> list[dict]: + """Emit findings for doc-URLs whose host has a different SLD + than the site origin.""" + primary = _site_origin_sld(state) + if not primary: + logger.info("B22 cross-domain: kein primary SLD ermittelbar") + return [] + findings: list[dict] = [] + audit_urls = _collect_audit_urls(state) + logger.info("B22 cross-domain: primary=%s, prüfe %d URL(s)", + primary, len(audit_urls)) + emitted_keys: set[tuple[str, str]] = set() + for doc_type, url in audit_urls: + if "://" not in url: continue try: host = urlparse(url).netloc @@ -106,6 +141,12 @@ def check_cross_domain_docs(state: dict) -> list[dict]: continue if not url_sld or url_sld == primary: continue + # Dedup pro (doc_type, host_sld) damit rejected_url + url nicht + # doppelt gemeldet werden + e_key = (doc_type, url_sld) + if e_key in emitted_keys: + continue + emitted_keys.add(e_key) # Cross-Domain detected severity = _SEVERITY_BY_DOC.get(doc_type, "MEDIUM") doc_label = { diff --git 
a/backend-compliance/compliance/services/finding_plausibility_check.py b/backend-compliance/compliance/services/finding_plausibility_check.py index 6789edd2..294bd374 100644 --- a/backend-compliance/compliance/services/finding_plausibility_check.py +++ b/backend-compliance/compliance/services/finding_plausibility_check.py @@ -50,11 +50,19 @@ import httpx logger = logging.getLogger(__name__) OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") -MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen3:30b-a3b") -# Reduced from 8 → 4 to fight qwen3 empty-response-on-large-prompts bug. -# 4 items × ~500 token/item + 2000 system + 1500 excerpt = ~5500 token total, -# well within qwen3's safe range for format='json'. -BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "4")) +# Default-Modell als ENV-Switch konfigurierbar. qwen3:30b-a3b ist +# bestes Reasoning, aber gibt bei großen DSEs gerne leere Responses +# unter format='json'. qwen2.5:7b ist 4× kleiner, deutlich +# zuverlässiger, leicht schwächeres Reasoning aber für die einfache +# Plausibility-Klassifikation (PASS/MODIFY/DROP) ausreichend. +MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen2.5:7b") +# Fallback-Modell wenn das primary trotz Retries nichts liefert +# (Strategy A → B → C → D-Schritte erschöpft). Default ist ein +# kleines, robustes Modell. +FALLBACK_MODEL = os.getenv("PLAUSIBILITY_FALLBACK_MODEL", "llama3.2:3b") +# Mit kleinerem Modell können größere Batches funktionieren — aber +# konservativ bleiben damit Single-Modell-Fail nicht ganz Phase killt. +BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "3")) TIMEOUT = float(os.getenv("PLAUSIBILITY_TIMEOUT_S", "45.0")) # Reduced excerpt 4000 → 1500 chars (same reason). DOC_EXCERPT_CHARS = int(os.getenv("PLAUSIBILITY_DOC_EXCERPT", "1500")) @@ -173,33 +181,46 @@ async def _ask_llm_batch(items: list[dict], doc_title: str, """Send a batch of up to BATCH_SIZE findings to the LLM. Resilience strategy (P125 fix for empty-response bug): - A. 
format='json' (strict) — current default - B. If A returns empty: format='' (loose), extract JSON manually - C. If B also empty AND batch >2: split batch + recurse - D. Else: give up, return {} (callers stamp llm_skipped=true) + A. primary MODEL + format='json' (strict) + B. primary MODEL + format='' (loose), parse JSON manuell + C. FALLBACK_MODEL + format='json' (kleineres robusteres Modell) + D. If batch >2: split + recurse + E. Else: give up, return {} (callers stamp llm_skipped=true) """ user_prompt = _build_user_prompt(items, doc_title, doc_excerpt) - base_body = { - "model": MODEL, - "messages": [ - {"role": "system", "content": _SYSTEM_PROMPT}, - {"role": "user", "content": user_prompt}, - ], - "stream": False, - "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500}, - } + + def _body(model: str) -> dict: + return { + "model": model, + "messages": [ + {"role": "system", "content": _SYSTEM_PROMPT}, + {"role": "user", "content": user_prompt}, + ], + "stream": False, + "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500}, + } + out: dict[str, dict] = {} input_ids = [it["id"] for it in items] try: - # Strategy A: format='json' - content = await _post_llm({**base_body, "format": "json"}) + # Strategy A: primary + format='json' + content = await _post_llm({**_body(MODEL), "format": "json"}) if not content: - # Strategy B: format-free, parse-on-our-side + # Strategy B: primary + format-free logger.info( "plausibility A→empty, trying B (format-free) batch=%d", len(items), ) - content = await _post_llm(base_body) + content = await _post_llm(_body(MODEL)) + if not content and FALLBACK_MODEL and FALLBACK_MODEL != MODEL: + # Strategy C: fallback-model + format='json' + logger.info( + "plausibility A+B empty, trying C (fallback=%s) batch=%d", + FALLBACK_MODEL, len(items), + ) + content = await _post_llm( + {**_body(FALLBACK_MODEL), "format": "json"}, + ) if not content: # Strategy C: split + recurse