diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index b1e7891f..7f3fe20a 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -687,24 +687,42 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): cmp_vendors = extract_vendors_from_payloads( cookie_payloads, owner_name=owner_name, ) - # V3 fallback: no named CMP captured but we have substantive - # cookie text → ask Qwen/OVH to extract vendor list from the text. - # Skip on very short text (likely navigation) to save LLM cost. - if not cmp_vendors and cookie_text and len(cookie_text.split()) >= 500: + # P52: LLM-Fallback nicht nur wenn 0 Vendors, sondern auch + # wenn die strukturierten Quellen < 5 Vendors lieferten und + # der Cookie-Text substantiell ist. So holt sich VW-typische + # Setups (Generic CMP, 28 Cookies aber 0 cmp_payloads) noch + # ihre echten Vendors aus dem Text. + if (len(cmp_vendors) < 5 + and cookie_text and len(cookie_text.split()) >= 500): from compliance.services.vendor_llm_extractor import ( extract_vendors_via_llm, ) from compliance.services.vendor_classifier import classify _update(check_id, "Vendor-Liste per LLM extrahieren...", 94) - cmp_vendors = await extract_vendors_via_llm(cookie_text) - # LLM path doesn't run through extract_vendors_from_payloads, - # so classify here. - for v in cmp_vendors: + llm_vendors = await extract_vendors_via_llm(cookie_text) + # P52: classify die LLM-Vendors und MERGE mit existing + # statt zu ueberschreiben. 
+ existing_names = {(v.get("name") or "").strip().lower() + for v in cmp_vendors} + added_llm = 0 + for v in llm_vendors: + nm = (v.get("name") or "").strip() + if not nm or nm.lower() in existing_names: + continue v["recipient_type"] = classify( - vendor_name=v.get("name", ""), + vendor_name=nm, category=v.get("category", ""), owner_name=owner_name, ) + v.setdefault("source", "llm_cascade") + cmp_vendors.append(v) + existing_names.add(nm.lower()) + added_llm += 1 + if added_llm: + logger.info( + "P52 LLM-Cascade: +%d Vendors (total: %d)", + added_llm, len(cmp_vendors), + ) # P57: Phase G vendor_details als zusätzliche Vendor-Quelle. # Wenn extract_vendors_from_payloads weniger findet als # Phase G's Info-Click-Through (z.B. Mercedes-Settings nicht @@ -1543,11 +1561,31 @@ async def _autodiscover_missing( """ from urllib.parse import urlparse - # Submitted doc_types (those the user actually entered URL or text for). + # VW-Fix: nur Doc-Types mit substantieller Text-Ausbeute zaehlen + # als 'submitted'. Wenn der User eine URL eingegeben hat aber die + # 404 liefert (VW cookie-richtlinie.html), oder der Crawler weniger + # als 200 Zeichen extrahiert (SPA-Shell), als 'missing' behandeln + # damit der Discovery-Pass alternative URLs probiert. + _MIN_USEFUL_CHARS = 200 submitted_types = { e["doc_type"] for e in doc_entries - if e.get("text") or (e.get("url") or "").strip() + if len((e.get("text") or "").strip()) >= _MIN_USEFUL_CHARS } + # Markiere die fehlgeschlagenen URL-Submissions damit der Discovery + # ihre URL nicht erneut probiert (waere sinnlos). 
+ failed_urls: set[str] = { + (e.get("url") or "").strip() + for e in doc_entries + if (e.get("url") or "").strip() + and len((e.get("text") or "").strip()) < _MIN_USEFUL_CHARS + } + if failed_urls: + logger.info( + "VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery " + "soll Alternativen probieren: %s", + len(failed_urls), _MIN_USEFUL_CHARS, + ", ".join(list(failed_urls)[:3]), + ) # Map alias types to canonical submitted_canon = { "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types @@ -1657,16 +1695,21 @@ async def _autodiscover_missing( if canon and canon in missing and canon not in by_type: by_type[canon] = d - # Append a new entry for every missing canonical type. Auto-discovered + # Append/Update entry for every missing canonical type. Auto-discovered # ones get the text/URL filled; ungratched ones stay empty so the # padding step renders them as 'Auf der Website nicht gefunden'. + # VW-Fix: wenn schon ein leerer entry existiert (URL gesetzt, aber + # fetch hat 0/Mini-Text geliefert), in-place updaten statt duplizieren. filled = 0 for dt in missing: - new_entry: dict = { + existing = next((e for e in doc_entries + if e.get("doc_type") == dt), None) + new_entry: dict = existing if existing else { "doc_type": dt, "url": "", "text": "", "word_count": 0, "auto_discovered": False, "discovery_attempted": True, "cmp_payloads": [], } + new_entry["discovery_attempted"] = True d = by_type.get(dt) if d: full = d.get("full_text") or d.get("text_preview") or "" @@ -1685,21 +1728,24 @@ async def _autodiscover_missing( full = cmp_merged if len(full.split()) >= 100: new_entry["text"] = full + # Behalte die original URL als "rejected_url" damit Audit + # zeigt 'X war 404, wir haben Y gefunden'. 
def check_banner_copyability(banner_result: dict) -> dict | None:
    """P51a — flag banners whose markup prevents copying the banner text.

    Inspects the banner HTML of the ``initial`` phase (falling back to
    ``before_accept`` when ``initial`` is missing or empty). The HTML is
    capped at 50,000 characters and lower-cased, then searched by plain
    substring match for CSS / inline-handler snippets that disable text
    selection or copying.

    Rationale given by the check's author: Art. 7(2) GDPR — consent must be
    requested in an intelligible form that allows later review.

    Args:
        banner_result: Banner scan result. Expected to hold a ``phases``
            mapping whose phase dicts carry a ``banner_html`` string.

    Returns:
        A finding dict with the keys ``severity``, ``code``, ``label``,
        ``detail`` and ``legal_basis`` (``detail`` names at most three
        matched signals), or ``None`` when the input has no usable banner
        HTML or no signal matched.
    """
    if not isinstance(banner_result, dict):
        return None
    phases = banner_result.get("phases")
    if not isinstance(phases, dict):
        return None
    # An empty ``initial`` phase is falsy, so ``before_accept`` is used then.
    initial = phases.get("initial") or phases.get("before_accept")
    if not isinstance(initial, dict):
        return None
    raw_html = initial.get("banner_html")
    if not isinstance(raw_html, str):
        return None
    html = raw_html[:50000].lower()
    if not html:
        return None
    # NOTE: the unprefixed signals are substrings of the vendor-prefixed
    # ones, so a single prefixed declaration can yield two hits.
    blocked_signals = [
        "user-select:none", "user-select: none",
        "-webkit-user-select:none", "-webkit-user-select: none",
        "-moz-user-select:none", "pointer-events:none",
        "oncopy=\"return false", "onselectstart=\"return false",
    ]
    hits = [s for s in blocked_signals if s in html]
    if not hits:
        return None
    return {
        "severity": "MEDIUM",
        "code": "banner_not_copyable",
        "label": "Banner-Text laesst sich nicht kopieren "
                 "(user-select:none / oncopy disabled)",
        "detail": (
            f'Im Banner-HTML gefunden: {", ".join(hits[:3])}. Der Nutzer '
            "kann den Banner-Text nicht in eine Mail / Doku einfuegen, was "
            "die spaetere Pruefung erschwert. Empfehlung: das CSS entfernen "
            "oder explizit auf 'auto' setzen."
        ),
        "legal_basis": "Art. 7 (1)+(2) DSGVO + EDPB 5/2020 — Einwilligungen "
                       "muessen in verstaendlicher und zugaenglicher Form "
                       "erteilt werden; eine spaetere Pruefung darf nicht "
                       "technisch erschwert werden.",
    }


def check_consent_history(banner_result: dict) -> dict | None:
    """P51b — flag banners that expose no way to view one's consent history.

    For every dict-valued phase, the first 5,000 characters of
    ``banner_text`` and the first 20,000 characters of ``banner_html`` are
    joined with spaces, lower-cased and searched for known phrases that
    indicate a "my consents" / consent-history view.

    Rationale given by the check's author: Art. 7(3) GDPR — withdrawing
    consent must be as easy as giving it, which presupposes that users can
    see what they consented to.

    Args:
        banner_result: Banner scan result. Expected to hold a ``phases``
            mapping of phase name to phase dict.

    Returns:
        ``None`` when there is no banner text to inspect or when one of the
        history phrases is present; otherwise a finding dict with the keys
        ``severity``, ``code``, ``label``, ``detail`` and ``legal_basis``.
    """
    if not isinstance(banner_result, dict):
        return None
    phases = banner_result.get("phases")
    if not isinstance(phases, dict):
        return None
    blob_parts: list[str] = []
    for ph in phases.values():
        if not isinstance(ph, dict):
            continue
        text = ph.get("banner_text")
        html = ph.get("banner_html")
        blob_parts.append(text[:5000] if isinstance(text, str) else "")
        blob_parts.append(html[:20000] if isinstance(html, str) else "")
    blob = " ".join(blob_parts).lower()
    # Joining empty parts still yields separator spaces, so the emptiness
    # test must look at the stripped blob; otherwise a scan without any
    # banner content would be reported as "history missing".
    if not blob.strip():
        return None
    history_signals = [
        "meine einwilligung", "consent-historie", "consent history",
        "einwilligungshistorie", "einwilligungs-historie",
        "ihre einwilligungen", "datenschutz-cockpit",
        "privacy dashboard", "einwilligungs-protokoll",
        "consent record", "consent log",
    ]
    if any(s in blob for s in history_signals):
        return None
    # NOTE(review): the detail text mentions linked footer areas, but this
    # function only reads the banner phases — confirm the scanner includes
    # footer markup in ``banner_html`` if that wording is to stay accurate.
    return {
        "severity": "MEDIUM",
        "code": "consent_history_missing",
        "label": "Keine sichtbare Consent-Historie / 'Meine Einwilligungen'-Ansicht",
        "detail": (
            "Im Banner und in den verlinkten Footer-Bereichen ist keine "
            "Moeglichkeit erkennbar, die eigene Einwilligungs-Historie "
            "einzusehen oder zu exportieren. Empfehlung: einen "
            "'Meine Einwilligungen'-Bereich verlinken (Borlabs / Cookiebot / "
            "Usercentrics bieten dafuer fertige Komponenten)."
        ),
        "legal_basis": "Art. 7 (3) DSGVO + EDPB 5/2020 — der Widerruf muss "
                       "ebenso einfach sein wie die Erteilung, was eine "
                       "Sichtbarmachung der eigenen Einwilligungen voraussetzt.",
    }