diff --git a/.claude/rules/loc-exceptions.txt b/.claude/rules/loc-exceptions.txt index 401ce785..48ffb95a 100644 --- a/.claude/rules/loc-exceptions.txt +++ b/.claude/rules/loc-exceptions.txt @@ -122,9 +122,9 @@ consent-sdk/src/mobile/ios/ConsentManager.swift consent-tester/services/dsi_discovery.py # --- backend-compliance: unified compliance check orchestrator --- -# Sequential 7-step pipeline (text resolve, profile detect, check documents, -# banner scan, cross-check, profile extract, report). Phase 5 split target. -backend-compliance/compliance/api/agent_compliance_check_routes.py +# 2026-06-06: REMOVED — file split into agent_check/ subpackage +# (19 files, main module now 347 LOC). Phase 5 target completed. +# [guardrail-change] # --- docs-src: binary office files (not source code) --- # (Also excluded by extension in scripts/check-loc.sh — kept here for legibility.) diff --git a/backend-compliance/compliance/api/agent_check/__init__.py b/backend-compliance/compliance/api/agent_check/__init__.py new file mode 100644 index 00000000..b15c5367 --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/__init__.py @@ -0,0 +1,10 @@ +""" +Subpackage for the compliance-check route — extracted to keep +`agent_compliance_check_routes.py` under the 500-line guardrail. + +The route module still owns the public HTTP endpoints and re-exports +all helpers from this subpackage, so external callers +(`saving_scan_routes`, `agent_migration_routes`, tests) continue to +import them from `compliance.api.agent_compliance_check_routes` +unchanged. +""" diff --git a/backend-compliance/compliance/api/agent_check/_b1_wiring.py b/backend-compliance/compliance/api/agent_check/_b1_wiring.py new file mode 100644 index 00000000..599a893d --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_b1_wiring.py @@ -0,0 +1,105 @@ +"""B1 wiring — Mobile Consent-Reachability check + HTML block. 
+ +Fetches the homepage of the first submitted URL, runs the static +`evaluate_reachability` analysis on the footer, and renders the +result as an HTML block for the audit mail. + +Only renders a block when the check FAILS — a passing site doesn't +need a block. The block is severity-colored and lists the specific +notes that triggered the finding (missing reopen anchor, new-tab +break, browser-deflection language). +""" + +from __future__ import annotations + +import html +import logging + +import httpx + +from compliance.services.consent_reachability_check import ( + evaluate_reachability, +) + +from ._helpers import _update + +logger = logging.getLogger(__name__) + + +async def run_b1(state: dict) -> None: + """Run the reachability check + render HTML. Mutates state in place.""" + req = state["req"] + check_id = state["check_id"] + homepage_url = "" + for d in req.documents: + if d.url: + from urllib.parse import urlparse + p = urlparse(d.url) + if p.scheme and p.netloc: + homepage_url = f"{p.scheme}://{p.netloc}/" + break + if not homepage_url: + return + + _update(check_id, "Mobile Consent-Reachability prüfen...", 95) + try: + async with httpx.AsyncClient( + timeout=20.0, follow_redirects=True, + headers={"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 " + "like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) " + "Version/17.5 Mobile/15E148 Safari/604.1"}, + ) as c: + r = await c.get(homepage_url) + if r.status_code != 200: + logger.info("B1: homepage fetch %s → HTTP %d", homepage_url, r.status_code) + return + page_html = r.text + except Exception as e: + logger.warning("B1: homepage fetch failed: %s", e) + return + + finding = evaluate_reachability(page_html, homepage_url) + state["reachability_finding"] = finding + state["reachability_html"] = _render_block(finding) + logger.info( + "B1 Reachability: passed=%s severity=%s reason=%s", + finding["passed"], finding.get("severity"), + finding.get("severity_reason"), + ) + + +def _render_block(finding: 
dict) -> str: + """Render the reachability finding as an audit-mail HTML block.""" + if finding["passed"]: + return "" + sev = (finding.get("severity") or "").upper() + color = "#dc2626" if sev == "HIGH" else "#f59e0b" + notes_html = "".join( + f"
  • {html.escape(n)}
  • " for n in finding.get("notes") or [] + ) + anchor = finding.get("reopen_anchor") or {} + anchor_html = "" + if anchor: + anchor_html = ( + "

    " + "Gefundener Footer-Link: " + f"{html.escape((anchor.get('text') or '')[:80])} " + f"→ {html.escape((anchor.get('href') or '')[:120])} " + f"(target_class: {html.escape(anchor.get('target_class') or '—')})" + "

    " + ) + return ( + f"
    " + f"

    " + "COOKIE-CONSENT-UX-001 — Mobile Consent-Reachability

    " + f"

    Severity: " + f"{sev} ({html.escape(finding.get('severity_reason') or '')})

    " + "

    " + "Art. 7 Abs. 3 DSGVO: Widerruf muss so einfach wie Erteilung sein. " + "Auf Mobile-Safari konnten wir folgendes Problem feststellen:

    " + f"" + f"{anchor_html}" + "
    " + ) diff --git a/backend-compliance/compliance/api/agent_check/_b3_wiring.py b/backend-compliance/compliance/api/agent_check/_b3_wiring.py new file mode 100644 index 00000000..8f6e1a9d --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_b3_wiring.py @@ -0,0 +1,189 @@ +"""B3 wiring — Cross-doc retention consistency check + HTML block. + +Combines three sources of retention truth per cookie: + + - DSI text (state["doc_texts"]["dse"] or "cookie") + - cookie-table `duration` from cmp_vendors[i]["cookies"][j] + - actual cookie expiry from banner_result["cookies_detailed"][k] + +and produces per-cookie findings + a TH-RETENTION theme summary. Only +renders an HTML block when there are findings to show; the block is +sorted by severity (HIGH first) and shows the top-10 mismatches. +""" + +from __future__ import annotations + +import html +import logging +import time + +from compliance.services.retention_comparator import ( + build_retention_theme_summary, + compare_retention, + extract_retention_claims, +) + +logger = logging.getLogger(__name__) + + +def _actual_max_age_seconds(cookie: dict) -> float | None: + """Get cookie Max-Age in seconds. + + Playwright gives us `expires` as a Unix timestamp (seconds-since- + epoch). Some sources give `max_age` directly. -1 / 0 means session + cookie (no expiry) — return None to signal that. + """ + ma = cookie.get("max_age") + if isinstance(ma, (int, float)) and ma > 0: + return float(ma) + exp = cookie.get("expires") + if isinstance(exp, (int, float)) and exp > 0: + delta = exp - time.time() + if delta > 0: + return float(delta) + return None + + +def run_b3(state: dict) -> None: + """Cross-doc retention check + render HTML. 
Mutates state in place.""" + doc_texts = state["doc_texts"] + cmp_vendors = state["cmp_vendors"] + banner_result = state["banner_result"] + + dsi_text = doc_texts.get("dse") or doc_texts.get("cookie") or "" + if not dsi_text: + return + + cookie_records: list[dict] = [] + cookie_names: list[str] = [] + vendor_names: list[str] = [] + for v in cmp_vendors or []: + vname = (v.get("name") or "").strip() + if vname: + vendor_names.append(vname) + for c in (v.get("cookies") or []): + cname = (c.get("name") or "").strip() + if not cname: + continue + duration = (c.get("duration") or c.get("persistence") + or c.get("expiry") or "") + cookie_names.append(cname) + cookie_records.append({ + "name": cname, + "vendor": vname, + "table_duration": duration, + "actual_max_age": None, + }) + + if not cookie_records: + return + + # Match actual max_age from banner_result.cookies_detailed + if banner_result: + cookies_detailed = banner_result.get("cookies_detailed") or [] + by_name: dict[str, dict] = {} + for c in cookies_detailed: + n = (c.get("name") or "").lower() + if n: + by_name[n] = c + for rec in cookie_records: + nm = rec["name"].lower() + if nm in by_name: + rec["actual_max_age"] = _actual_max_age_seconds(by_name[nm]) + + claims = extract_retention_claims(dsi_text, cookie_names, vendor_names) + + findings: list[dict] = [] + for rec in cookie_records: + finding = compare_retention( + cookie_name=rec["name"], + table_duration=rec["table_duration"], + actual_max_age_seconds=rec["actual_max_age"], + dsi_claims=claims, + vendor_name=rec["vendor"] or None, + ) + findings.append(finding) + + summary = build_retention_theme_summary(findings) + state["retention_findings"] = findings + state["retention_theme_summary"] = summary + state["retention_html"] = _render_block(summary, findings) + logger.info( + "B3 Retention: %d findings, %d passed, %d failed, %d incomplete", + summary["total"], summary["passed"], summary["failed"], + summary["incomplete"], + ) + + +def _fmt_days(d: float | 
None) -> str: + if d is None: + return "—" + if d < 1: + return f"{int(d * 24)}h" + if d < 30: + return f"{int(d)}d" + if d < 365: + return f"{int(d / 30)}mo" + return f"{d / 365:.1f}y" + + +def _render_block(summary: dict, findings: list[dict]) -> str: + if summary["total"] == 0: + return "" + failed_findings = [f for f in findings if not f.get("matches") + and f.get("severity_reason") != "incomplete"] + if not failed_findings: + return "" # all OK, no block needed + # Sort by severity (HIGH first) then diff_days desc + sev_rank = {"HIGH": 0, "MEDIUM": 1, "LOW": 2} + failed_findings.sort(key=lambda f: ( + sev_rank.get((f.get("severity") or "").upper(), 9), + -(f.get("diff_days") or 0), + )) + rows = [] + for f in failed_findings[:10]: + sev = (f.get("severity") or "").upper() + color = ("#dc2626" if sev == "HIGH" + else "#f59e0b" if sev == "MEDIUM" else "#64748b") + rows.append( + "" + f"" + f"{html.escape(f.get('cookie_name') or '—')}" + f"" + f"{html.escape((f.get('vendor_name') or '—'))}" + f"" + f"DSI: {_fmt_days(f.get('dsi_days'))} • " + f"Tabelle: {_fmt_days(f.get('table_days'))} • " + f"Realität: {_fmt_days(f.get('actual_days'))}" + f"" + f"{sev} ({html.escape(f.get('mismatch_type') or '—')})" + "" + ) + total = summary["total"] + passed = summary["passed"] + failed = summary["failed"] + incomplete = summary["incomplete"] + return ( + "
    " + "

    " + "TH-RETENTION — Speicherdauer-Konsistenz (DSI ↔ Cookie-Tabelle ↔ Realität)" + "

    " + "

    " + f"{total} Cookies verglichen: " + f"{passed} ✓ / " + f"{failed} ✗ / " + f"{incomplete} ?

    " + "" + "" + "" + "" + "" + "" + "" + f"{''.join(rows)}" + "
    CookieVendorWerteMismatch
    " + "
    " + ) diff --git a/backend-compliance/compliance/api/agent_check/_constants.py b/backend-compliance/compliance/api/agent_check/_constants.py new file mode 100644 index 00000000..628f45de --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_constants.py @@ -0,0 +1,93 @@ +"""Module-level constants + shared job state for the compliance-check +route. + +`_compliance_check_jobs` is the SINGLE source of truth for in-flight +job progress. Other modules MUST import the same object — never +re-declare it — otherwise progress updates land in a detached dict. +""" + +from __future__ import annotations + +# Internal hostname of the consent-tester container. +CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094" + +# In-memory job registry. Keyed by check_id. Values: +# {"status": "running"|"completed"|"failed"|"skipped_tdm", +# "progress": str, "progress_pct": int, "result": dict, ...} +# Read/written by: +# - agent_compliance_check_routes (start/status/_run/_update) +# - saving_scan_routes (start) +# - agent_migration_routes (status mirror) +_compliance_check_jobs: dict[str, dict] = {} + + +# Canonical doc types in the same order the frontend +# ComplianceCheckTab renders them. The route pads `results` to always +# include an entry for each — missing rows are flagged as 'Nicht +# eingereicht' or 'Auf der Website nicht gefunden'. +# +# DSB-Kontakt is NOT canonical: per GDPR practice the DSB is named +# inside the DSI/datenschutz document (email or contact block), not as +# a separate page. We check 'DSB benannt' as a sub-check of the DSE. +_ALL_DOC_TYPES = [ + "dse", "impressum", "social_media", "cookie", + "agb", "nutzungsbedingungen", "widerruf", +] + + +# Human-readable labels per doc_type. Used in the report + emails. 
+_DOC_TYPE_LABELS = { + "dse": "Datenschutzerklaerung", + "datenschutz": "Datenschutzerklaerung", + "privacy": "Datenschutzerklaerung", + "impressum": "Impressum", + "agb": "AGB", + "widerruf": "Widerrufsbelehrung", + "cookie": "Cookie-Richtlinie", + "avv": "Auftragsverarbeitung", + "loeschkonzept": "Loeschkonzept", + "dsfa": "Datenschutz-Folgenabschaetzung", + "social_media": "Social Media Datenschutz", + "nutzungsbedingungen": "Nutzungsbedingungen", + "dsb": "DSB-Kontakt", + # P74: Legal-Notice / Rechtliche Hinweise (IP, Forward-Looking, Risiko) + "legal_notice": "Rechtliche Hinweise", + # P96: Digital Services Act-Pflichtangaben (Art. 12+17 DSA) + "dsa": "DSA-Pflichtangaben", + # P97: Lizenzhinweise Dritter (OSS-Compliance) + "lizenzhinweise": "Lizenzhinweise Dritter", +} + + +# Title/URL keywords → canonical doc_type. Order matters: most-specific first. +_DISCOVERY_RULES: list[tuple[str, tuple[str, ...]]] = [ + ("cookie", ("cookie", "kuche", "biscuit", "cookies-")), + ("widerruf", ("widerruf", "rueckgabe", "rückgabe", "cancellation", + "right-of-withdrawal", "ruecktritts", "rücktritts")), + ("social_media", ("social-media", "soziale-medien", "social_media", + "social-media-policy")), + # P23: 'terms-and-conditions' kann Allgemeine Geschaeftsbedingungen ODER + # Nutzungsbedingungen meinen. Discovery-Funktion klassifiziert spaeter + # praeziser per Titel + Inhalt. 
Hier nur Url-Hint: + ("agb", ("/agb", "geschaeftsbedingungen", "geschäftsbedingungen", + "general-terms")), + ("nutzungsbedingungen", ("nutzungsbedingung", "nutzungsbedingungen", + "terms-of-use", "terms-and-conditions", + "nutzungsordnung", "terms-of-service", + "allgemeine-nutzungsbedingungen")), + ("dsb", ("datenschutzbeauftragt", "data-protection-officer", + "dpo-contact", "/dsb")), + ("impressum", ("impressum", "imprint", "legal-notice", "site-notice", + "anbieterkennzeichnung", "legal-disclaimer-pool")), + ("dse", ("data-privacy", "datenschutz", "data-protection", + "privacy-policy", "privacy-notice", "dsgvo", + "data_privacy", "datenschutzinformation")), +] + + +# Compound TLDs that count as 2 labels when extracting the second-level +# domain (e.g. shop.example.co.uk → 'example', not 'co'). +_COMPOUND_TLDS = { + "co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in", + "com.au", "com.br", "com.mx", "com.tr", "com.sg", +} diff --git a/backend-compliance/compliance/api/agent_check/_discovery.py b/backend-compliance/compliance/api/agent_check/_discovery.py new file mode 100644 index 00000000..e7e4d392 --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_discovery.py @@ -0,0 +1,230 @@ +"""Auto-discovery of missing canonical doc-types. + +For each canonical type the user did NOT submit, try to find it on the +homepage of the URLs they DID submit. Also follow same-owner subdomains +mentioned in the submitted text (BMW Group → bmwgroup.com etc.). + +Discovered docs are classified by `_classify_discovered_doc` and merged +back into `doc_entries`; entries that stayed empty get +`discovery_attempted=True` so the padding step can differentiate +"Nicht eingereicht" from "Auf der Website nicht gefunden". 
+""" + +from __future__ import annotations + +import logging +import re +from urllib.parse import urlparse + +import httpx + +from ._constants import _ALL_DOC_TYPES, CONSENT_TESTER_URL +from ._helpers import _classify_discovered_doc, _update + +logger = logging.getLogger(__name__) + + +async def _autodiscover_missing( + check_id: str, + doc_entries: list[dict], + doc_texts: dict[str, str], + url_text_cache: dict[str, str], +) -> None: + """For each canonical doc_type the user did not submit, try to find + the corresponding document on the homepage of the site they DID submit. + + Modifies doc_entries in place: fills text/url/word_count and sets + `auto_discovered=True`. Marks `discovery_attempted=True` on every + missing entry (even when nothing was found) so the report can + distinguish 'Nicht eingereicht' from 'Auf der Website nicht gefunden'. + """ + # VW-Fix: nur Doc-Types mit substantieller Text-Ausbeute zaehlen + # als 'submitted'. Wenn der User eine URL eingegeben hat aber die + # 404 liefert (VW cookie-richtlinie.html), oder der Crawler weniger + # als 200 Zeichen extrahiert (SPA-Shell), als 'missing' behandeln + # damit der Discovery-Pass alternative URLs probiert. + _MIN_USEFUL_CHARS = 200 + submitted_types = { + e["doc_type"] for e in doc_entries + if len((e.get("text") or "").strip()) >= _MIN_USEFUL_CHARS + } + # Markiere die fehlgeschlagenen URL-Submissions damit der Discovery + # ihre URL nicht erneut probiert (waere sinnlos). 
+ failed_urls: set[str] = { + (e.get("url") or "").strip() + for e in doc_entries + if (e.get("url") or "").strip() + and len((e.get("text") or "").strip()) < _MIN_USEFUL_CHARS + } + if failed_urls: + logger.info( + "VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery " + "soll Alternativen probieren: %s", + len(failed_urls), _MIN_USEFUL_CHARS, + ", ".join(list(failed_urls)[:3]), + ) + # Map alias types to canonical + submitted_canon = { + "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types + } + # Missing = canonical types the user did NOT submit + missing = set(_ALL_DOC_TYPES) - submitted_canon + if not missing: + return + + # Pick the most common base (scheme://netloc) from submitted URLs. + bases: dict[str, int] = {} + for e in doc_entries: + u = (e.get("url") or "").strip() + if u and "://" in u: + p = urlparse(u) + base = f"{p.scheme}://{p.netloc}" + bases[base] = bases.get(base, 0) + 1 + if not bases: + # No submitted URL at all — nothing to crawl from. Add empty + # placeholders (with discovery_attempted=False) so the padding + # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden'). + for dt in missing: + doc_entries.append({ + "doc_type": dt, "url": "", "text": "", "word_count": 0, + "auto_discovered": False, "discovery_attempted": False, + }) + return + + # Build crawl plan: primary base + any related domains mentioned in + # the submitted texts that share the owner's SLD. Example: BMW Group + # text mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de. 
+ primary_base = max(bases, key=bases.get) + "/" + crawl_bases: list[str] = [primary_base] + primary_netloc = urlparse(primary_base).netloc.lower().lstrip("www.") + owner_token = primary_netloc.split(".")[0] # 'bmw' + + if owner_token and len(owner_token) >= 3: + domain_re = re.compile( + r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token) + + r"[a-z0-9\-]*\.[a-z]{2,}", + re.IGNORECASE, + ) + seen_bases = {primary_base} + for entry in doc_entries: + text = entry.get("text") or "" + for m in domain_re.finditer(text): + p = urlparse(m.group(0)) + base = f"{p.scheme}://{p.netloc}/" + base_netloc = p.netloc.lower().lstrip("www.") + if base_netloc == primary_netloc: + continue + if base in seen_bases: + continue + seen_bases.add(base) + crawl_bases.append(base) + if len(crawl_bases) >= 3: + break + if len(crawl_bases) >= 3: + break + + _update( + check_id, + f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...", + 18, + ) + + discovered: list[dict] = [] + disc_payloads: list[dict] = [] + disc_cookie_texts: list[str] = [] + for base in crawl_bases: + try: + async with httpx.AsyncClient(timeout=300.0) as client: # P90: 180s -> 300s + resp = await client.post( + f"{CONSENT_TESTER_URL}/dsi-discovery", + json={"url": base, "max_documents": 15}, + timeout=300.0, # P90: 180s -> 300s + ) + if resp.status_code != 200: + logger.warning("auto-discovery: HTTP %d for %s", + resp.status_code, base) + continue + body = resp.json() + discovered.extend(body.get("documents", []) or []) + disc_payloads.extend(body.get("cmp_payloads") or []) + cmp_text = body.get("cmp_cookie_text") or "" + if cmp_text: + disc_cookie_texts.append(cmp_text) + logger.info("auto-discovery on %s: %d docs, %d CMP payloads, " + "cmp_cookie_text=%d words", base, + len(body.get("documents", []) or []), + len(body.get("cmp_payloads") or []), + len(cmp_text.split())) + except Exception as e: + # P90: verbose exception fuer Diagnose + logger.warning("auto-discovery failed 
for %s: %s (%s)", + base, str(e) or "(empty)", type(e).__name__) + + # Classify each discovered doc into a canonical doc_type + by_type: dict[str, dict] = {} + for d in discovered: + title = (d.get("title") or "").lower() + url = (d.get("url") or "").lower() + wc = d.get("word_count") or 0 + if wc < 100: + continue + canon = _classify_discovered_doc(title, url) + if canon and canon in missing and canon not in by_type: + by_type[canon] = d + + # Append/Update entry for every missing canonical type. Auto-discovered + # ones get the text/URL filled; ungratched ones stay empty so the + # padding step renders them as 'Auf der Website nicht gefunden'. + # VW-Fix: wenn schon ein leerer entry existiert (URL gesetzt, aber + # fetch hat 0/Mini-Text geliefert), in-place updaten statt duplizieren. + filled = 0 + for dt in missing: + existing = next((e for e in doc_entries + if e.get("doc_type") == dt), None) + new_entry: dict = existing if existing else { + "doc_type": dt, "url": "", "text": "", "word_count": 0, + "auto_discovered": False, "discovery_attempted": True, + "cmp_payloads": [], + } + new_entry["discovery_attempted"] = True + d = by_type.get(dt) + if d: + full = d.get("full_text") or d.get("text_preview") or "" + # For cookie: prefer the CMP-reconstructed text when it's + # substantially richer than the auto-discovered DOM extraction. + # BMW homepage CMP yields ~1800 words of authoritative policy; + # DOM extraction typically yields ~600 words of site chrome. + if dt == "cookie" and disc_cookie_texts: + cmp_merged = "\n\n".join(disc_cookie_texts) + if len(cmp_merged.split()) > len(full.split()): + logger.info( + "cookie: using CMP-reconstructed text (%d words) " + "instead of DOM (%d words)", + len(cmp_merged.split()), len(full.split()), + ) + full = cmp_merged + if len(full.split()) >= 100: + new_entry["text"] = full + # Behalte die original URL als "rejected_url" damit Audit + # zeigt 'X war 404, wir haben Y gefunden'. 
+ if existing and (existing.get("url") or "").strip() in failed_urls: + new_entry["rejected_url"] = existing.get("url") + new_entry["url"] = d.get("url", "") + new_entry["word_count"] = len(full.split()) + new_entry["auto_discovered"] = True + if dt == "cookie" and disc_payloads: + new_entry["cmp_payloads"] = disc_payloads + doc_texts[dt] = full + filled += 1 + logger.info( + "auto-discovered %s on %s: %s (%d words)%s", + dt, base, d.get("url", "")[:80], new_entry["word_count"], + " [REPLACED failed URL]" if existing else "", + ) + if not existing: + doc_entries.append(new_entry) + + logger.info( + "auto-discovery: filled %d/%d missing types from %s", + filled, len(missing), base, + ) diff --git a/backend-compliance/compliance/api/agent_check/_fetch.py b/backend-compliance/compliance/api/agent_check/_fetch.py new file mode 100644 index 00000000..52c96e7e --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_fetch.py @@ -0,0 +1,142 @@ +"""URL → text fetch helper for the compliance-check pipeline. + +Tries the consent-tester service first (Playwright, full JS render + +CMP capture). On any failure or empty result, falls back to a direct +HTTP GET with an identifiable User-Agent and per-domain rate limiting. + +For cookie/dse/social_media doc types we cap discovery to 1 sub-page +(the policy itself is authoritative). For Impressum/AGB/Widerruf and +similar enterprise-split pages we follow up to 3 sub-pages. +""" + +from __future__ import annotations + +import logging +import re as _re + +import httpx + +from ._constants import CONSENT_TESTER_URL + +logger = logging.getLogger(__name__) + + +async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]: + """Fetch text from URL via consent-tester, with HTTP fallback. + + Returns (text, cmp_payloads). cmp_payloads is the raw CMP JSON captured + during navigation (ePaaS, OneTrust, …) — empty when no CMP fired or + HTTP fallback was used. 
Backend turns payloads into structured vendor + records for the VVT table in the email. + """ + # 1. Consent-tester (Playwright-based, full JS rendering). + # max_documents depends on doc_type: + # - cookie/dse/social_media: self-extract (often + CMP capture) is + # authoritative, sub-pages dilute the policy text. max=1. + # - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar + # enterprise sites split this across 3-4 short sub-pages + # (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows + # them. The 15s networkidle bail (dsi_helpers) keeps timing safe. + short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"} + max_docs = 1 if (doc_type or "") in short_extract_types else 3 + try: + # P90: 120s reicht nicht fuer BMW-Impressum (Auto-Discovery folgt + # 3 Sub-Docs). 240s gibt Spielraum. Mercedes faellt aktuell mit + # 120s auch oft an Akamai-Latenz. + async with httpx.AsyncClient(timeout=240.0) as client: + resp = await client.post( + f"{CONSENT_TESTER_URL}/dsi-discovery", + json={"url": url, "max_documents": max_docs}, + timeout=240.0, + ) + if resp.status_code == 200: + payload = resp.json() + docs = payload.get("documents", []) + cmp_payloads = payload.get("cmp_payloads") or [] + cmp_cookie_text = payload.get("cmp_cookie_text") or "" + # D — wenn der consent-tester HTML-Tabellen aus dem DOM + # extrahiert hat, in die cmp_payloads als "generic_table" + # einschleusen damit das Backend sie via cookies_table_parser + # verarbeiten kann. 
+ for doc in (docs or []): + for tbl in (doc.get("tables") or []): + if not tbl or len(tbl) < 3: + continue + cmp_payloads.append({ + "kind": "html_table", + "url": doc.get("url", ""), + "rows": tbl, + }) + if docs: + texts = [] + for doc in docs: + t = doc.get("full_text", "") or doc.get("text_preview", "") or "" + if t and len(t) > 50: + texts.append(t) + merged = "\n\n".join(texts) + # For cookie/dse/social_media: when CMP reconstruction is + # substantially richer than DOM extraction, use it. This + # fixes the BMW case where DOM yields ~600 words of + # navigation but the ePaaS payload reconstructs to ~1800 + # words of actual cookie policy. + if (doc_type in short_extract_types + and cmp_cookie_text + and len(cmp_cookie_text.split()) > len(merged.split())): + logger.info( + "Preferring CMP-reconstructed text for %s on %s " + "(%d words CMP vs %d words DOM)", + doc_type, url, + len(cmp_cookie_text.split()), + len(merged.split()), + ) + merged = cmp_cookie_text + if merged and len(merged.split()) > 100: + if len(texts) > 1: + logger.info("Merged %d docs from %s (%d words)", + len(texts), url, len(merged.split())) + return merged, cmp_payloads + # P90-Bug-Fix: auch wenn DSE-Text zu kurz fuer 100-Wort- + # Schwelle ist, die captured CMP-Payloads NICHT verwerfen. + # BMW-Bug: DSE liefert 10 Wort SPA-Shell, aber ePaaS-JSON + # (393KB) wurde captured. Backend braucht die fuer + # extract_vendors_from_payloads (VVT-Tabelle). + if cmp_payloads: + logger.info( + "P90: keeping %d CMP payloads for %s despite " + "short text (%d words) — HTTP fallback runs in parallel", + len(cmp_payloads), url, + len((merged or cmp_cookie_text).split()), + ) + fallback_text = merged or cmp_cookie_text or "" + return fallback_text, cmp_payloads + except Exception as e: + # P90: verbose exception fuer Diagnose (war vorher empty) + logger.warning("Consent-tester fetch failed for %s: %s (%s)", + url, str(e) or "(empty)", type(e).__name__) + + # 2. 
Fallback: direct HTTP fetch (works for SSR pages like BMW). + # P7: kenntlicher UA + per-Domain Rate-Limit. + try: + from compliance.services.compliance_user_agent import ( + default_request_headers, DomainRateLimiter, + ) + async with httpx.AsyncClient( + timeout=30.0, follow_redirects=True, + headers=default_request_headers(), + ) as client: + async with DomainRateLimiter(url): + resp = await client.get(url) + if resp.status_code == 200 and "text/html" in resp.headers.get("content-type", ""): + html = resp.text + # Strip HTML tags, decode entities + text = _re.sub(r"]*>.*?", " ", html, flags=_re.DOTALL | _re.IGNORECASE) + text = _re.sub(r"]*>.*?", " ", text, flags=_re.DOTALL | _re.IGNORECASE) + text = _re.sub(r"<[^>]+>", " ", text) + text = _re.sub(r"\s+", " ", text).strip() + if len(text.split()) > 100: + logger.info("HTTP fallback for %s: %d words", url, len(text.split())) + return text, [] + except Exception as e: + logger.warning("HTTP fallback failed for %s: %s", url, e) + + return "", [] diff --git a/backend-compliance/compliance/api/agent_check/_helpers.py b/backend-compliance/compliance/api/agent_check/_helpers.py new file mode 100644 index 00000000..4c8d5d28 --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_helpers.py @@ -0,0 +1,228 @@ +"""Pure helpers for the compliance-check route — no I/O, no async. + +Grouped here because each is small and they share the same constants +imports. Splitting further would not improve readability. 
def _update(check_id: str, msg: str, pct: int | None = None) -> None:
    """Record a progress message (and optionally a percentage) on a job.

    Args:
        check_id: Key of the job in the in-memory ``_compliance_check_jobs``
            registry. An unknown key raises ``KeyError``.
        msg: Progress text stored under ``"progress"``.
        pct: Optional completion percentage. It is coerced to ``int`` and
            clamped to 0..100; ``None`` leaves the stored value untouched.
    """
    job = _compliance_check_jobs[check_id]
    job["progress"] = msg
    if pct is not None:
        job["progress_pct"] = max(0, min(100, int(pct)))


def _doc_type_label(doc_type: str) -> str:
    """Return the display label for *doc_type* (upper-cased key as fallback)."""
    return _DOC_TYPE_LABELS.get(doc_type, doc_type.upper())


def _classify_discovered_doc(title: str, url: str) -> str | None:
    """Map a discovered document (title + URL) to a canonical doc_type.

    The first rule in ``_DISCOVERY_RULES`` with a keyword contained in
    ``"<title> <url>"`` wins. Returns ``None`` when no rule matches.
    """
    # NOTE(review): plain case-sensitive substring test — presumably callers
    # pass lower-cased title/URL; verify against the discovery code.
    haystack = f"{title} {url}"
    for canon, keywords in _DISCOVERY_RULES:
        if any(kw in haystack for kw in keywords):
            return canon
    return None


def _host_without_www(url: str) -> str:
    """Return the lower-cased host of *url* without port, userinfo or ``www.``.

    ``ParseResult.hostname`` is used instead of ``netloc`` because ``netloc``
    still contains ``user:pass@`` and ``:port``, which must not leak into
    domain names or company-name heuristics.
    """
    host = urlparse(url).hostname or ""
    return host[4:] if host.startswith("www.") else host


def _extract_domain(doc_entries: list[dict]) -> str | None:
    """Extract the host (without ``www.``) from the first absolute URL.

    Only the first entry whose ``"url"`` contains ``"://"`` is considered.

    Returns:
        The lower-cased host without port/userinfo, or ``None`` when no
        entry has an absolute URL or that URL has no host.
    """
    for entry in doc_entries:
        url = entry.get("url") or ""
        if "://" in url:
            return _host_without_www(url) or None
    return None


def _company_name_from_url(doc_entries: list[dict]) -> str | None:
    """Derive a display company name from the entered URLs.

    Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"),
    upper-case short acronyms (<= 4 chars, no hyphens), title-case the rest.
    Entries without an absolute URL or with a single-label host are skipped.

    Examples:
        www.bmw.de          -> BMW
        mercedes-benz.de    -> Mercedes-Benz
        shop.example.co.uk  -> Example
        juris.de            -> Juris
    """
    for entry in doc_entries:
        url = entry.get("url") or ""
        if "://" not in url:
            continue
        parts = _host_without_www(url).split(".")
        if len(parts) < 2:
            continue
        # Compound TLDs (.co.uk etc.) push the registrable label one step left.
        if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS:
            sld = parts[-3]
        else:
            sld = parts[-2]
        if not sld:
            continue
        if len(sld) <= 4 and "-" not in sld:
            return sld.upper()
        return "-".join(p.capitalize() for p in sld.split("-"))
    return None


def _get_skip_types(profile) -> dict[str, str]:
    """Return the doc_types to skip entirely, each with a reason message.

    Currently only used for the OEM-configurator pattern (BMW/Audi/Mercedes):
    when the site does not sell directly, terms and conditions, withdrawal
    policy and terms of use are not mandatory on the website because they
    are handed out by the contracted dealer.

    Args:
        profile: Business profile; only its optional ``no_direct_sales``
            attribute is read.

    Returns:
        Mapping ``doc_type -> reason``; empty when nothing is skipped.
    """
    if not getattr(profile, "no_direct_sales", False):
        return {}
    msg = (
        "Nicht anwendbar — die Webseite schliesst keinen Direkt-"
        "Kaufvertrag (OEM-Konfigurator-Pattern, Vertrag laeuft "
        "ueber Vertragshaendler). AGB/Widerruf werden beim "
        "Haendler ausgehaendigt."
    )
    return {
        "agb": msg,
        "widerruf": msg,
        "nutzungsbedingungen": msg,
    }


def _apply_profile_filter(result, profile, doc_type: str):
    """Adjust individual checks based on the business-profile context.

    Mutates the checks of *result* in place and returns *result*.

    Args:
        result: Object with a ``checks`` list; each check exposes ``id``,
            ``label``, ``passed``, ``hint``, ``skipped`` and ``severity``.
        profile: Business profile; ``needs_odr``, ``business_type`` and
            ``is_regulated_profession`` are read.
        doc_type: Canonical type of the document the checks belong to.
    """
    for check in result.checks:
        cid = check.id.lower()
        label = check.label.lower()

        # ODR/OS-Link: relevant ONLY for B2C online shops. The check's
        # default hint is written for B2B (it explains why it's not
        # relevant) — for B2C we must replace it with action-oriented
        # guidance, otherwise the report contradicts itself.
        if "odr" in cid or "os-link" in cid or "streitbeilegung" in label:
            if profile.needs_odr:
                if not check.passed:
                    check.hint = (
                        "Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 "
                        "auf die OS-Plattform (https://ec.europa.eu/consumers/odr) "
                        "verlinken — klickbarer Link, nicht nur Text. Zusaetzlich "
                        "§36 VSBG: angeben, ob Sie an Verbraucher-"
                        "Streitbeilegungsverfahren teilnehmen (oder nicht)."
                    )
            else:
                check.skipped = True
                check.hint = "Nicht relevant (kein B2C Online-Shop)"

        # Withdrawal policy: flag the entire document as unnecessary for B2B.
        if doc_type == "widerruf" and profile.business_type not in ("b2c", "unknown"):
            check.severity = "INFO"
            if not check.passed:
                check.hint = (
                    "Als B2B-Unternehmen benoetigen Sie keine Widerrufsbelehrung "
                    "(§355 BGB gilt nur fuer Verbrauchervertraege). "
                    "Empfehlung: Entfernen Sie die Widerrufsbelehrung von "
                    "Ihrer Website, da sie Verwirrung stiften kann."
                )

        # Chamber / professional-code checks only apply to regulated professions.
        if "kammer" in cid or "berufsordnung" in label:
            if not profile.is_regulated_profession:
                check.skipped = True
                check.hint = "Nicht relevant (kein regulierter Beruf)"

    return result


def _pad_results_with_missing(
    results: list,
    discovery_attempted: set[str] | None = None,
) -> list:
    """Ensure every canonical doc_type has an entry in the results list.

    Doc_types the user did not submit AND auto-discovery did not find get
    a placeholder DocCheckResult. The error message distinguishes:
      - 'Auf der Website nicht gefunden' (discovery was attempted)
      - 'Nicht eingereicht' (no submitted URLs to crawl from)

    Preserves the canonical ordering from _ALL_DOC_TYPES so the report
    layout is stable. Results whose type is not canonical are appended
    afterwards in their original order.
    """
    from ..agent_doc_check_routes import DocCheckResult
    attempted = discovery_attempted or set()

    def canonical(doc_type: str) -> str:
        # "datenschutz" / "privacy" are aliases of the canonical "dse" row.
        return "dse" if doc_type in ("datenschutz", "privacy") else doc_type

    # NOTE(review): when several results share one canonical type only the
    # last one is kept; earlier ones are dropped from the output. Confirm
    # that this is intended.
    by_type: dict[str, object] = {}
    for r in results:
        by_type[canonical(r.doc_type)] = r

    ordered: list = []
    for dt in _ALL_DOC_TYPES:
        if dt in by_type:
            ordered.append(by_type[dt])
            continue
        if dt in attempted:
            msg = ("Auf der Website nicht gefunden — bitte URL des "
                   "Dokuments manuell eintragen, falls vorhanden")
        else:
            msg = "Nicht eingereicht — Quelle nicht angegeben"
        ordered.append(DocCheckResult(
            label=_doc_type_label(dt),
            url="",
            doc_type=dt,
            word_count=0,
            completeness_pct=0,
            correctness_pct=0,
            checks=[],
            findings_count=0,
            error=msg,
            scenario="missing",
        ))

    ordered.extend(
        r for r in results if canonical(r.doc_type) not in _ALL_DOC_TYPES
    )
    return ordered


def _result_to_dict(r) -> dict:
    """Convert a DocCheckResult-like object to a JSON-serializable dict.

    Each check is reduced to a fixed set of nine attributes; ``scenario``
    defaults to ``""`` when the result does not carry one.
    """
    fields = ("id", "label", "passed", "severity", "matched_text",
              "level", "parent", "skipped", "hint")
    return {
        "label": r.label, "url": r.url, "doc_type": r.doc_type,
        "word_count": r.word_count, "completeness_pct": r.completeness_pct,
        "correctness_pct": r.correctness_pct,
        "checks": [{f: getattr(c, f) for f in fields} for c in r.checks],
        "findings_count": r.findings_count, "error": r.error,
        "scenario": getattr(r, "scenario", ""),
    }


def _build_profile_html(profile) -> str:
    """Render *profile* as HTML by delegating to the report module.

    The import is function-local, as in the original code.
    """
    from ..agent_doc_check_report import build_profile_html
    return build_profile_html(profile)
async def run_compliance_check(check_id: str, req) -> None:
    """Background task: check all documents with business-profile context.

    Runs the phases in a fixed order on one shared mutable ``state`` dict.
    Any exception raised by a phase is logged and recorded on the job entry
    (``status="failed"``, ``error`` truncated to 500 characters); it is not
    re-raised.

    Args:
        check_id: Key of the job in ``_compliance_check_jobs``.
        req: The compliance-check request; passed through to ``new_state``.
    """
    state = new_state(check_id, req)
    try:
        # Phase A: TDM gate + Step 1 (resolve / discover / split / dedup)
        continue_run = await run_phase_a(state)
        if not continue_run:
            return  # TDM denied — job already marked skipped_tdm
        # Phase B: Step 2 (profile detect) + Step 3 (per-doc checks)
        await run_phase_b(state)
        # Phase C: Step 3b-d (banner + cross-check + TCF) + Step 4
        await run_phase_c(state)
        # Phase D-1/D-2: Step 5 vendor extraction + finalize
        await run_phase_d1(state)
        await run_phase_d2(state)
        # B1 + B3: cross-cutting checks that need the finalized vendor
        # list + DSI text. Render their own HTML blocks consumed by
        # phase D-3 bot's full_html composition.
        await run_b1(state)
        run_b3(state)
        # Phase D-3 top/mid/bot: Step 5 HTML blocks
        await run_phase_d3_top(state)
        await run_phase_d3_mid(state)
        await run_phase_d3_bot(state)
        # Phase E: Step 6 send mail (with A1 ZIP attachment)
        run_phase_e(state)
        # Phase F: Step 7 persist + audit log + unified findings
        run_phase_f(state)
    except Exception as e:
        logger.error("Compliance check %s failed: %s",
                     check_id, e, exc_info=True)
        _compliance_check_jobs[check_id]["status"] = "failed"
        _compliance_check_jobs[check_id]["error"] = str(e)[:500]


# P15 priority: when several doc_types point at the same source document,
# the type that appears first in this tuple keeps the text.
_DOC_PRIORITY = ("dse", "impressum", "cookie", "widerruf", "agb",
                 "nutzungsbedingungen", "social_media", "dsb")


def _dedupe_same_source(doc_entries: list, doc_texts: dict) -> None:
    """P15: blank out doc_types that reference the same document as another.

    Two entries count as the same document when the first 1000 characters
    of their stripped text are equal. The entry whose doc_type comes first
    in ``_DOC_PRIORITY`` stays; every later one gets ``text=""``,
    ``word_count=0``, ``url=""`` and ``dup_of=<primary doc_type>``, and its
    doc_type is removed from *doc_texts*. Only the first non-empty entry per
    doc_type is considered. Both arguments are mutated in place.
    """
    # Keyed on the text prefix itself (not on hash()) so that a hash
    # collision can never mark two different documents as duplicates.
    primary_by_prefix: dict = {}
    for dt in _DOC_PRIORITY:
        entry = next((e for e in doc_entries
                      if e.get("doc_type") == dt and e.get("text")), None)
        if not entry:
            continue
        prefix = (entry.get("text") or "").strip()[:1000]
        primary = primary_by_prefix.get(prefix)
        if primary is None:
            primary_by_prefix[prefix] = dt
            continue
        logger.info(
            "P15 dedup: doc_type=%s referenziert dasselbe Dokument "
            "wie %s (URL=%s) -> als Duplikat markiert.",
            # "url" may be present with value None (pasted text without URL).
            dt, primary, (entry.get("url") or "")[:60],
        )
        entry["text"] = ""
        entry["word_count"] = 0
        entry["url"] = ""
        entry["dup_of"] = primary
        doc_texts.pop(dt, None)


async def run_phase_a(state: dict) -> bool:
    """Run TDM gate + Step 1 + Step 1a-c + P15 dedup. Mutate state in place.

    Reads ``state["check_id"]`` and ``state["req"]``; on success writes
    ``doc_texts``, ``doc_entries``, ``url_text_cache``,
    ``pasted_table_vendors`` and ``placement_findings``.

    Returns:
        ``True`` to continue with the next phase, ``False`` when the run
        was aborted because of a TDM reservation (the job entry is then
        already marked ``skipped_tdm``).
    """
    check_id = state["check_id"]
    req = state["req"]

    # Reset anchor-locator cache per run (avoid cross-run leak).
    # Best effort: a failure must not abort the run, but is no longer silent.
    try:
        from compliance.services.doc_anchor_locator import reset_cache
        reset_cache()
    except Exception as exc:
        logger.debug("doc_anchor_locator cache reset skipped: %s", exc)

    # P7: TDM reservation check of the base domain (§ 44b UrhG).
    # On reserved/denied: end the run immediately, no crawl.
    try:
        from compliance.services.tdm_reservation_check import (
            check_tdm_reservation, is_crawl_allowed,
        )
        first_url = next(
            (d.url for d in req.documents if d.url), "",
        )
        if first_url:
            tdm = await check_tdm_reservation(first_url)
            _compliance_check_jobs[check_id]["tdm"] = tdm
            # P12: with tdm_override + reason the run is NOT aborted, only
            # documented. An override without a reason (>= 10 chars) is
            # ignored.
            override_active = (
                req.tdm_override
                and len((req.tdm_override_reason or "").strip()) >= 10
            )
            if not is_crawl_allowed(tdm) and not override_active:
                _compliance_check_jobs[check_id]["status"] = "skipped_tdm"
                _compliance_check_jobs[check_id]["error"] = (
                    f"TDM-Vorbehalt fuer {tdm.get('domain')} erkannt "
                    f"(status={tdm.get('status')}) — Crawl nach § 44b "
                    f"UrhG nicht zulaessig. Signals: "
                    f"{[s.get('src') for s in tdm.get('signals', [])]}"
                )
                _compliance_check_jobs[check_id]["progress_pct"] = 100
                logger.info("TDM-skip check_id=%s domain=%s status=%s",
                            check_id, tdm.get("domain"), tdm.get("status"))
                return False
            if override_active and not is_crawl_allowed(tdm):
                _compliance_check_jobs[check_id]["tdm_override"] = {
                    "reason": req.tdm_override_reason.strip()[:500],
                    "original_status": tdm.get("status"),
                }
                logger.warning(
                    "TDM-Override aktiv: check_id=%s domain=%s "
                    "status=%s reason=%r",
                    check_id, tdm.get("domain"), tdm.get("status"),
                    req.tdm_override_reason.strip()[:80],
                )
    except Exception as e:
        # A failing TDM check must not block the run.
        logger.warning("TDM-check failed (proceeding): %s", e)

    # Step 1: Resolve texts (fetch from URL if needed) — 0-30%
    _update(check_id, "Texte werden geladen...", 1)
    doc_texts: dict[str, str] = {}
    doc_entries: list[dict] = []

    # Cache fetched URLs to detect duplicates
    url_text_cache: dict[str, str] = {}

    n_docs = max(1, len(req.documents))
    # Vendors from user-pasted cookie tables (no LLM needed) — merged into
    # cmp_vendors in a later phase.
    pasted_table_vendors: list[dict] = []
    for i, doc in enumerate(req.documents):
        pct = int(1 + (i / n_docs) * 29)
        _update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct)
        text = (doc.text or "").strip()
        input_source = "url"
        cmp_payloads: list[dict] = []
        if text:
            input_source = "text"
            if doc.url:
                input_source = "text+url"  # user filled in both fields
                logger.info(
                    "doc_type=%s: User hat URL UND Text geliefert — "
                    "Text gewinnt, URL wird als Quellen-Referenz behalten",
                    doc.doc_type,
                )
        elif doc.url:
            url_key = doc.url.strip().rstrip("/").lower()
            if url_key in url_text_cache:
                text = url_text_cache[url_key]
            else:
                text, cmp_payloads = await _fetch_text(doc.url, doc_type=doc.doc_type)
                if text:
                    url_text_cache[url_key] = text

        # Auto-reclassify check: if the user pasted text into the wrong
        # doc-type field (e.g. imprint text into the privacy policy),
        # detect that and re-tag if the classifier says so.
        actual_doc_type = doc.doc_type
        reclassify_hint: dict | None = None
        if input_source.startswith("text") and len(text) >= 500:
            try:
                from compliance.services.doc_type_classifier import (
                    detect_mismatch,
                )
                reclassify_hint = detect_mismatch(doc.doc_type, text)
                if reclassify_hint and reclassify_hint["action"] == "reclassify":
                    actual_doc_type = reclassify_hint["detected"]
                    logger.info(
                        "doc_type AUTO-RECLASSIFY: deklariert=%s "
                        "erkannt=%s (score %d vs %d) — uebernehme erkannten Typ",
                        doc.doc_type, actual_doc_type,
                        reclassify_hint["detected_score"],
                        reclassify_hint["declared_score"],
                    )
            except Exception as e:
                logger.warning("doc_type_classifier failed: %s", e)

        # Cookie table: if the user pasted a table, parse it
        # deterministically (no LLM needed) and derive the vendors directly.
        if input_source.startswith("text") and actual_doc_type == "cookie":
            try:
                from compliance.services.cookies_table_parser import (
                    parse_cookie_table,
                )
                tab_vendors = parse_cookie_table(text)
                if tab_vendors:
                    pasted_table_vendors.extend(tab_vendors)
                    logger.info(
                        "Cookie-Tabelle erkannt im pasted Text — "
                        "%d Vendors / %d Cookies deterministisch geparst",
                        len(tab_vendors),
                        sum(len(v.get("cookies", [])) for v in tab_vendors),
                    )
            except Exception as e:
                logger.warning("cookies_table_parser failed: %s", e)

        if text:
            doc_texts[actual_doc_type] = text
        # Entries are recorded even without text so that later phases can
        # report a failed fetch for a submitted URL.
        doc_entries.append({
            "doc_type": actual_doc_type,
            "declared_doc_type": doc.doc_type,
            "url": doc.url,
            "text": text,
            "word_count": len(text.split()) if text else 0,
            "auto_discovered": False,
            "discovery_attempted": False,
            "cmp_payloads": cmp_payloads,
            "input_source": input_source,
            "reclassify_hint": reclassify_hint,
        })

    # Step 1a-bis: AUTO-DISCOVERY
    await _autodiscover_missing(
        check_id, doc_entries, doc_texts, url_text_cache,
    )

    # Step 1b: Section splitting — two cases:
    # 1. Same URL used for multiple doc_types → split by heading
    # 2. DSI text contains Cookie/Social-Media sections → auto-fill empty rows
    from compliance.services.section_splitter import (
        split_shared_texts, auto_fill_from_dsi, cross_search_documents,
    )
    split_shared_texts(doc_entries, url_text_cache)
    auto_fill_from_dsi(doc_entries)

    # Step 1c: Cross-document search — find doc_types in wrong documents (30-35%)
    _update(check_id, "Dokumente werden uebergreifend durchsucht...", 32)
    placement_findings = cross_search_documents(doc_entries)

    # Refresh doc_texts after all splitting/searching
    for entry in doc_entries:
        if entry.get("text"):
            doc_texts[entry["doc_type"]] = entry["text"]

    # P15: dedupe — when several doc_types reference THE SAME document
    # (e.g. the user enters /datenschutz for dse + cookie + widerruf) only
    # the primary doc_type keeps its text.
    _dedupe_same_source(doc_entries, doc_texts)

    state["doc_texts"] = doc_texts
    state["doc_entries"] = doc_entries
    state["url_text_cache"] = url_text_cache
    state["pasted_table_vendors"] = pasted_table_vendors
    state["placement_findings"] = placement_findings
    return True
# <script>/<style> elements are removed together with their content before
# the generic tag strip, so that JS/CSS source never reaches the profiler.
_SCRIPT_BLOCK_RE = _re.compile(r"<script[^>]*>.*?</script>",
                               _re.DOTALL | _re.IGNORECASE)
_STYLE_BLOCK_RE = _re.compile(r"<style[^>]*>.*?</style>",
                              _re.DOTALL | _re.IGNORECASE)


def _html_to_plain_text(markup: str) -> str:
    """Reduce HTML *markup* to whitespace-normalized visible text.

    Script and style elements are dropped including their content, every
    remaining tag is replaced by a space and runs of whitespace collapse
    to a single space.
    """
    text = _SCRIPT_BLOCK_RE.sub(" ", markup)
    text = _STYLE_BLOCK_RE.sub(" ", text)
    text = _re.sub(r"<[^>]+>", " ", text)
    return _re.sub(r"\s+", " ", text).strip()


async def run_phase_b(state: dict) -> None:
    """Detect business profile + check each document. Mutates state in place.

    Reads ``check_id``, ``req``, ``doc_texts``, ``doc_entries`` and
    ``placement_findings`` from *state*; writes ``profile``,
    ``profile_dict``, ``business_scope``, ``results`` and
    ``total_findings``.
    """
    from urllib.parse import urlparse

    check_id = state["check_id"]
    req = state["req"]
    doc_texts = state["doc_texts"]
    doc_entries = state["doc_entries"]
    placement_findings = state["placement_findings"]

    # Step 2: Detect business profile (35-40%)
    from compliance.services.business_profiler import detect_business_profile
    _update(check_id, "Geschaeftsmodell wird erkannt...", 37)
    # P16: also feed the homepage text into profile detection
    # (no_direct_sales / B2B indicators such as "CE-Zertifizierung" or
    # "Schulungen" often appear only in the homepage menu, not in the
    # mandatory legal texts).
    profile_input = dict(doc_texts)
    try:
        base_url = ""
        # NOTE(review): the first entry with a parseable absolute URL wins;
        # the collapsed source does not show whether the original stopped
        # at the first URL even when it was not parseable.
        for candidate in doc_entries:
            if candidate.get("url"):
                p = urlparse(candidate["url"])
                if p.scheme and p.netloc:
                    base_url = f"{p.scheme}://{p.netloc}/"
                    break
        if base_url:
            async with httpx.AsyncClient(
                timeout=8.0, follow_redirects=True,
                headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                         "AppleWebKit/537.36 HeadlessChrome/120.0.0.0"},
            ) as _hc:
                _hr = await _hc.get(base_url)
                if _hr.status_code == 200 and "text/html" in _hr.headers.get(
                        "content-type", ""):
                    # Only the first 60000 characters of the page are
                    # considered; at most 20000 characters of text are kept.
                    _html = _html_to_plain_text(_hr.text[:60000])
                    if len(_html.split()) > 30:
                        profile_input["__homepage"] = _html[:20000]
                        logger.info("P16 homepage merged for profile: %d words",
                                    len(_html.split()))
    except Exception as e:
        # The homepage is an optional extra signal — never fail the run on it.
        logger.debug("homepage fetch for profile failed: %s", e)
    profile = await detect_business_profile(profile_input)
    profile_dict = asdict(profile)

    # Step 3: Check each document
    from ..agent_doc_check_routes import CheckItem, DocCheckResult
    results: list[DocCheckResult] = []
    total_findings = 0
    use_agent_flag = req.use_agent or os.getenv(
        "COMPLIANCE_USE_AGENT", "false",
    ).lower() == "true"

    # Filter out doc_types that don't apply to this business profile
    skip_types = _get_skip_types(profile)

    # Derive business_scope hints for the MC filter (O1 — Doc-type Scope-Flag).
    # MCs that explicitly require a feature (e.g. 'biometric_processing',
    # 'ai_decision_making', 'child_targeting') get dropped when the
    # detected profile doesn't declare it.
    business_scope: set[str] = set()
    for svc in (getattr(profile, "detected_services", []) or []):
        business_scope.add(str(svc).lower())
    if (getattr(profile, "business_type", "") or "").lower() == "b2c":
        business_scope.add("b2c")
    if getattr(profile, "has_online_shop", False):
        business_scope.add("ecommerce")
    if getattr(profile, "is_regulated_profession", False):
        business_scope.add("regulated_profession")

    # Document checks: 40-80%
    n_entries = max(1, len(doc_entries))
    for i, entry in enumerate(doc_entries):
        text = entry["text"]
        doc_type = entry["doc_type"]
        label = _doc_type_label(doc_type)
        url = entry["url"]

        if doc_type in skip_types:
            results.append(DocCheckResult(
                label=label, url=url, doc_type=doc_type,
                error=skip_types[doc_type],
            ))
            continue

        pct = int(40 + (i / n_entries) * 40)
        _update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)

        if not text or len(text) < 50:
            # P15: duplicate doc that was deduped against a primary doc
            if entry.get("dup_of"):
                results.append(DocCheckResult(
                    label=label, url="", doc_type=doc_type,
                    error=f"Nicht separat vorhanden — wird im Dokument "
                          f"'{_doc_type_label(entry['dup_of'])}' "
                          f"mit-geprueft.",
                ))
                continue
            # P24: the DPO contact is a mandatory item of the privacy
            # policy (Art. 13(1)(b) GDPR) — a missing separate DPO document
            # is NOT an error; the DPO check happens in the privacy policy.
            if doc_type == "dsb" and not (entry.get("url") or "").strip():
                results.append(DocCheckResult(
                    label=label, url="", doc_type=doc_type,
                    error="Nicht separat vorhanden — DSB-Kontaktdaten "
                          "werden in der Datenschutzerklaerung als "
                          "Pflichtangabe nach Art. 13(1)(b) DSGVO geprueft.",
                ))
                continue
            # Empty entry — either from auto-discovery padding (no URL
            # to fetch) or from a fetch that returned nothing. If there
            # was a URL we keep the error so the user knows the fetch
            # failed; otherwise let the padding step label it
            # 'Nicht eingereicht' / 'Auf der Website nicht gefunden'.
            if (entry.get("url") or "").strip():
                results.append(DocCheckResult(
                    label=label, url=url, doc_type=doc_type,
                    error="Kein Text vorhanden oder zu kurz",
                ))
            continue

        result = await _check_single(
            text, doc_type, label, url,
            entry["word_count"], use_agent_flag,
            business_scope=business_scope,
            business_profile={"no_direct_sales": getattr(profile, "no_direct_sales", False)},
        )

        # Apply profile context filter
        result = _apply_profile_filter(result, profile, doc_type)

        # Add placement findings — but only if the regex checks confirm
        # the text doesn't match. If completeness >= 50%, the text IS the
        # right doc_type despite missing cross-search keywords.
        if result.completeness_pct < 50:
            for pf in placement_findings:
                if pf.get("doc_type") == doc_type:
                    result.checks.insert(0, CheckItem(**{
                        k: v for k, v in pf.items() if k != "doc_type"
                    }))

        results.append(result)
        total_findings += result.findings_count

    state["profile"] = profile
    state["profile_dict"] = profile_dict
    state["business_scope"] = business_scope
    state["results"] = results
    state["total_findings"] = total_findings
def _homepage_origin(url) -> str:
    """Return ``scheme://host`` of *url*, or ``""`` if it is not absolute.

    Guards against scheme-less input such as ``"example.de/datenschutz"``,
    for which a naive ``f"{scheme}://{netloc}"`` yields the truthy but
    useless string ``"://"``.
    """
    if not url:
        return ""
    from urllib.parse import urlparse
    parsed = urlparse(url)
    if not (parsed.scheme and parsed.netloc):
        return ""
    return f"{parsed.scheme}://{parsed.netloc}"


async def run_phase_c(state: dict) -> None:
    """Run banner scan + cross-checks + profile extraction. Mutates state.

    Reads ``check_id``, ``req``, ``doc_texts``, ``doc_entries``, ``results``
    and ``profile_dict`` from *state*; writes ``banner_result``,
    ``banner_url``, ``tcf_vendors``, ``vvt_entries``, ``extracted_profile``
    and the padded ``results``.
    """
    check_id = state["check_id"]
    req = state["req"]
    doc_texts = state["doc_texts"]
    doc_entries = state["doc_entries"]
    results = state["results"]
    profile_dict = state["profile_dict"]

    # Step 3b: Banner-Check (automatic). The homepage (path stripped) of
    # the first submitted document's URL is scanned.
    # NOTE(review): only documents[0] is considered — other phases use the
    # first document that HAS a URL; confirm whether that is wanted here.
    banner_result = None
    first_doc_url = req.documents[0].url if req.documents else ""
    banner_url = _homepage_origin(first_doc_url)
    if banner_url:
        _update(check_id, "Cookie-Banner wird geprueft...", 82)
        try:
            # P50: +10min for vendor-detail-phase
            async with httpx.AsyncClient(timeout=900.0) as client:
                resp = await client.post(
                    f"{CONSENT_TESTER_URL}/scan",
                    json={"url": banner_url, "timeout_per_phase": 10},
                )
                if resp.status_code == 200:
                    banner_result = resp.json()
        except Exception as e:
            # Best effort: the report is still built without banner data.
            logger.warning(
                "Banner check failed: %s (%s)", e or "", type(e).__name__,
            )

    # Step 3c: Cross-check banner vs. cookie policy (88-90%)
    if banner_result and "cookie" in doc_texts:
        from compliance.services.banner_cookie_cross_check import (
            cross_check_banner_vs_cookie,
        )
        from ..agent_doc_check_routes import CheckItem
        _update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...", 89)
        cross_findings = cross_check_banner_vs_cookie(
            banner_result, doc_texts["cookie"],
        )
        if cross_findings:
            for r in results:
                if r.doc_type == "cookie":
                    for cf in cross_findings:
                        r.checks.append(CheckItem(**cf))
                    # Recompute correctness over the non-skipped level-2
                    # checks, now including the cross-check findings.
                    l2 = [c for c in r.checks if c.level == 2 and not c.skipped]
                    l2p = sum(1 for c in l2 if c.passed)
                    r.correctness_pct = round(l2p / len(l2) * 100) if l2 else 0

    # Step 3d: TCF Vendor cross-check against DSI
    # `or []` also covers a scanner response with "tcf_vendors": null.
    tcf_vendors = (banner_result.get("tcf_vendors") or []) if banner_result else []
    vvt_entries: list[dict] = []
    if tcf_vendors and "dse" in doc_texts:
        _update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...", 91)
        from compliance.services.banner_cookie_cross_check import (
            cross_check_vendors_vs_dsi,
        )
        from compliance.services.vendor_vvt_mapper import map_vendors_to_vvt
        from ..agent_doc_check_routes import CheckItem
        vendor_findings = cross_check_vendors_vs_dsi(tcf_vendors, doc_texts["dse"])
        if vendor_findings:
            for r in results:
                if r.doc_type == "dse":
                    for vf in vendor_findings:
                        r.checks.append(CheckItem(**vf))
        # NOTE(review): VVT mapping is done whenever vendors and a DSE text
        # exist, independent of findings — nesting reconstructed from a
        # collapsed source, please confirm.
        vvt_entries = map_vendors_to_vvt(tcf_vendors)

    # Step 4: Extract profile hints from documents (92-95%)
    _update(check_id, "Profil wird aus Dokumenten extrahiert...", 93)
    from compliance.services.profile_extractor import (
        extract_profile_from_documents,
    )
    extracted_profile = extract_profile_from_documents(doc_texts, profile_dict)

    # Step 4b: Determine scenario per document
    for r in results:
        if r.error:
            r.scenario = "skip"
        elif r.completeness_pct < 30:
            r.scenario = "regenerate"
        elif r.completeness_pct < 95:
            r.scenario = "fix"
        else:
            r.scenario = "import"

    # Step 4c: Always render all 8 canonical doc types. Missing types
    # are differentiated:
    #   - Discovery was tried but found nothing -> 'Auf der Website
    #     nicht gefunden' (suggest user provides URL manually)
    #   - No submitted URLs at all -> 'Nicht eingereicht'
    attempted = {
        e["doc_type"] for e in doc_entries if e.get("discovery_attempted")
    }
    results = _pad_results_with_missing(results, discovery_attempted=attempted)

    state["banner_result"] = banner_result
    state["banner_url"] = banner_url
    state["tcf_vendors"] = tcf_vendors
    state["vvt_entries"] = vvt_entries
    state["extracted_profile"] = extracted_profile
    state["results"] = results
Mutates state in place.""" + check_id = state["check_id"] + doc_entries = state["doc_entries"] + doc_texts = state["doc_texts"] + banner_result = state["banner_result"] + pasted_table_vendors = state["pasted_table_vendors"] + + cmp_vendors: list[dict] = [] + cookie_payloads: list[dict] = [] + cookie_text = "" + cookie_evidence_slices: list[dict] | None = None + cookie_evidence_meta: dict | None = None + + try: + from compliance.services.vendor_extractor import ( + extract_vendors_from_payloads, + ) + + # P30: aggregate cmp_payloads from ALL doc_entries — sites + # like Mercedes load Usercentrics only on the homepage, so the + # JSON gets captured during DSE/Impressum discovery, not in the + # cookies.html fetch. Dedup by URL since the same payload is + # captured on every page load. + seen_cmp_urls: set[str] = set() + for e in doc_entries: + for p in (e.get("cmp_payloads") or []): + p_url = p.get("url") or "" + if p_url and p_url in seen_cmp_urls: + continue + seen_cmp_urls.add(p_url) + cookie_payloads.append(p) + if e.get("doc_type") == "cookie" and e.get("text"): + cookie_text = e["text"] + # P48: also pull cmp_payloads from the Banner-Scan (homepage 3-phase + # consent test). Mercedes' Usercentrics-JSON is captured there even + # when not in DSI-Discovery of static legal pages. + if banner_result: + for p in (banner_result.get("cmp_payloads") or []): + p_url = p.get("url") or "" + if p_url and p_url in seen_cmp_urls: + continue + seen_cmp_urls.add(p_url) + cookie_payloads.append(p) + if cookie_payloads: + logger.info("P48: %d CMP-payloads available for vendor-extract " + "(after Banner-Scan merge)", len(cookie_payloads)) + # P17-D: Fallback wenn cookie via P15 deduped wurde — nutze DSE-Text + # sofern Cookie-Begriffe drin sind, damit LLM-Vendor-Extract trotzdem + # greifen kann. 
+ if not cookie_text and not cookie_payloads: + dse_t = doc_texts.get("dse", "") + if dse_t and any(w in dse_t.lower() for w in + ("cookie", "tracking", "google analytics", "consent")): + cookie_text = dse_t + logger.info("P17-D: vendor-extract Fallback auf DSE (Cookie deduped)") + owner_name = _company_name_from_url(doc_entries) or "" + if cookie_payloads: + cmp_vendors = extract_vendors_from_payloads( + cookie_payloads, owner_name=owner_name, + ) + + # P52: LLM-Fallback nicht nur wenn 0 Vendors, sondern auch wenn die + # strukturierten Quellen < 5 Vendors lieferten und der Cookie-Text + # substantiell ist. + if (len(cmp_vendors) < 5 + and cookie_text and len(cookie_text.split()) >= 500): + from compliance.services.vendor_llm_extractor import ( + extract_vendors_via_llm, + ) + from compliance.services.vendor_classifier import classify + _update(check_id, "Vendor-Liste per LLM extrahieren...", 94) + llm_vendors = await extract_vendors_via_llm(cookie_text) + existing_names = {(v.get("name") or "").strip().lower() + for v in cmp_vendors} + added_llm = 0 + for v in llm_vendors: + nm = (v.get("name") or "").strip() + if not nm or nm.lower() in existing_names: + continue + v["recipient_type"] = classify( + vendor_name=nm, + category=v.get("category", ""), + owner_name=owner_name, + ) + v.setdefault("source", "llm_cascade") + cmp_vendors.append(v) + existing_names.add(nm.lower()) + added_llm += 1 + if added_llm: + logger.info("P52 LLM-Cascade: +%d Vendors (total: %d)", + added_llm, len(cmp_vendors)) + + # P57: Phase G vendor_details als zusätzliche Vendor-Quelle. 
+ if banner_result: + vd_list = banner_result.get("vendor_details") or [] + vd_list = [v for v in vd_list if v.get("name") != "__TDM_OPTOUT__"] + existing_names = {(v.get("name") or "").strip().lower() + for v in cmp_vendors} + added = 0 + for d in vd_list: + n = (d.get("name") or "").strip() + if not n or n.lower() in existing_names: + continue + if n.lower() in ("technisch erforderlich", "analyse und statistik", + "marketing", "alles auswählen", + "alles auswaehlen"): + continue + from compliance.services.vendor_classifier import classify + cmp_vendors.append({ + "name": n, + "country": "", + "purpose": d.get("description", "")[:500], + "category": "", + "opt_out_url": d.get("opt_out_url", ""), + "privacy_policy_url": d.get("privacy_url", ""), + "persistence": d.get("retention", ""), + "cookies": d.get("cookies", []), + "processing_company": d.get("processing_company", ""), + "address": d.get("address", ""), + "purposes": d.get("purposes", []), + "technologies": d.get("technologies", []), + "recipient_type": classify( + vendor_name=n, category="", owner_name=owner_name, + ), + }) + existing_names.add(n.lower()) + added += 1 + if added: + logger.info("P57: added %d new vendors from Phase G (total: %d)", + added, len(cmp_vendors)) + + # D — HTML-Tabellen aus DOM + for pl in (cookie_payloads or []): + if pl.get("kind") != "html_table": + continue + rows = pl.get("rows") or [] + if len(rows) < 3: + continue + try: + from compliance.services.cookies_table_parser import ( + parse_cookie_table as _parse_ct_d, + ) + table_text = "\n".join(rows) + d_vendors = _parse_ct_d(table_text) + if d_vendors: + existing_d = {(v.get("name") or "").strip().lower() + for v in cmp_vendors} + added_d = 0 + for v in d_vendors: + nm = (v.get("name") or "").strip() + if not nm or nm.lower() in existing_d: + continue + v.setdefault("source", "html_table_dom") + cmp_vendors.append(v) + existing_d.add(nm.lower()) + added_d += 1 + if added_d: + logger.info("D HTML-Table-DOM-Parse: +%d Vendors 
aus " + "%d-Zeilen-Tabelle (total: %d)", + added_d, len(rows), len(cmp_vendors)) + except Exception as e: + logger.warning("html_table parse failed: %s", e) + + # B — cookies_table_parser auch auf gecrawltem Cookie-Text + if cookie_text and len(cookie_text) >= 500: + try: + from compliance.services.cookies_table_parser import ( + parse_cookie_table as _parse_ct, + parse_flat_cookie_text as _parse_flat, + ) + crawled_table_vendors = _parse_ct(cookie_text) + if not crawled_table_vendors: + crawled_table_vendors = _parse_flat(cookie_text) + if crawled_table_vendors: + existing = {(v.get("name") or "").strip().lower() + for v in cmp_vendors} + added_c = 0 + for v in crawled_table_vendors: + nm = (v.get("name") or "").strip() + if not nm or nm.lower() in existing: + continue + v.setdefault("source", "table_crawled") + cmp_vendors.append(v) + existing.add(nm.lower()) + added_c += 1 + if added_c: + logger.info("B Crawled-Tabellen-Parse: +%d Vendors " + "(total: %d)", added_c, len(cmp_vendors)) + except Exception as e: + logger.warning("crawled-table-parse failed: %s", e) + + # C — Screenshot + Tesseract-OCR (auch Quelle für A1 ZIP-Anhang) + cookie_url_for_shot = "" + for _e in doc_entries: + if _e.get("doc_type") == "cookie" and _e.get("url"): + cookie_url_for_shot = _e["url"]; break + if cookie_url_for_shot: + try: + from compliance.services.cookie_screenshot_ocr import ( + capture_cookie_evidence_slices, + cookies_to_vendor_records, + ocr_slices_extract_cookies, + ) + from compliance.services.cookies_table_parser import ( + _guess_vendor as _gv, + ) + _update(check_id, + "Cookie-Richtlinie wird fotografiert " + "(lueckenlose Beweiskette)...", 92) + ev = await capture_cookie_evidence_slices( + cookie_url_for_shot, check_id=check_id, + viewport_h=1024, overlap_px=200, max_slices=40, + ) + if ev.get("slices"): + cookie_evidence_slices = ev["slices"] + cookie_evidence_meta = { + "total_height_px": ev.get("total_height_px"), + "width_px": ev.get("width_px"), + 
"accepted_banner": ev.get("accepted_banner"), + "expanded": ev.get("expanded"), + "url": ev.get("url"), + "slice_count": len(ev["slices"]), + } + _update(check_id, "Tesseract OCR über alle Slices...", 93) + ocr_cookies, ocr_stats = ocr_slices_extract_cookies( + ev["slices"], + ) + if ocr_cookies: + ocr_vendors = cookies_to_vendor_records( + ocr_cookies, guess_vendor_fn=_gv, + ) + existing = {(v.get("name") or "").strip().lower() + for v in cmp_vendors} + added_v = 0 + for v in ocr_vendors: + nm = (v.get("name") or "").strip() + if not nm: + continue + if nm.lower() in existing: + for ex in cmp_vendors: + if (ex.get("name") or "").strip().lower() == nm.lower(): + ex_names = { + (c.get("name") or "").lower() + for c in (ex.get("cookies") or []) + } + for c in (v.get("cookies") or []): + if c["name"].lower() not in ex_names: + ex.setdefault("cookies", []).append(c) + ex_names.add(c["name"].lower()) + cur_src = ex.get("source", "") + if "tesseract_ocr" not in cur_src: + ex["source"] = (cur_src + ";tesseract_ocr").strip(";") + break + continue + cmp_vendors.append(v) + existing.add(nm.lower()) + added_v += 1 + logger.info( + "C Tesseract-OCR: +%d Vendors / %d Cookies " + "(über %d Slices, total: %d)", + added_v, len(ocr_cookies), + ocr_stats.get("slices", 0), len(cmp_vendors), + ) + except Exception as e: + logger.warning("Tesseract-OCR pipeline failed: %s (%s)", + str(e) or "(no msg)", type(e).__name__) + + # User-pasted Cookie-Tabelle (deterministisch, kein LLM): + # die hat IMMER Vorrang weil 100% genau. 
"""Phase D-2 — Vendor finalize: enrich + normalize + library fallback.

Covers (in the original Step 5 of `_run_compliance_check`):
  - Cookie-library fallback (P52 Lite) — when < 20 vendors but many
    after-accept cookies, resolve via library
  - Vendor normalizer (Google-family dedup, garbage filter)
  - Detail-modal enrichment from Phase G (P50) + TDM-opt-out sentinel
  - Cookie-behavior validator (P59b) — 3-tier severity findings
  - Implicit cookies detection (P61) — GTM brings GA/GCL/DoubleClick
  - validate_vendor_urls + score_vendors + cookie-function classify
  - Vendor redundancy (O4) + EU alternatives + cost/savings
"""

from __future__ import annotations

import logging
import re
from urllib.parse import urlparse

logger = logging.getLogger(__name__)

# Name of the pseudo-vendor entry that carries the TDM opt-out notice inside
# banner_result["vendor_details"]; it must never be treated as a real vendor.
_TDM_SENTINEL = "__TDM_OPTOUT__"

# Address keyword -> country code, tried in this order. Each alternation is
# wrapped in a group so the \b anchors apply to EVERY keyword (without the
# group, "irland" matched inside "Nordirland" and "usa" inside "Usage").
_COUNTRY_PATTERNS = (
    ("DE", re.compile(
        r"\b(?:deutschland|germany|berlin|m(?:ue|ü)nchen|hamburg|stuttgart)\b",
        re.I,
    )),
    ("IE", re.compile(r"\b(?:ireland|irland|dublin)\b", re.I)),
    ("US", re.compile(
        r"\b(?:usa|united states|california|new york|delaware)\b", re.I,
    )),
)


def _name_key(record: dict) -> str:
    """Return the normalized (stripped, lower-cased) name used for dedup."""
    return (record.get("name") or "").strip().lower()


def _country_from_address(address: str) -> str:
    """Return "DE"/"IE"/"US" for a recognised address keyword, else ""."""
    for code, pattern in _COUNTRY_PATTERNS:
        if pattern.search(address):
            return code
    return ""


def _first_party_domain(banner_url: str) -> str:
    """Return the netloc of *banner_url* without a leading "www."."""
    if not banner_url:
        return ""
    host = urlparse(banner_url).netloc
    # Strip only the prefix; "www." in the middle of a host is significant.
    return host[4:] if host.startswith("www.") else host


def _find_detail(key: str, details_by_name: dict) -> dict:
    """Return the detail record for vendor *key* (exact, then substring).

    An empty key never matches: "" is a substring of every name and would
    otherwise attach the first detail record to a nameless vendor.
    Returns an empty dict when nothing matches.
    """
    if not key:
        return {}
    exact = details_by_name.get(key)
    if exact:
        return exact
    for detail_name, detail in details_by_name.items():
        if key in detail_name or detail_name in key:
            return detail
    return {}


def _enrich_vendor(vendor: dict, detail: dict) -> None:
    """Fill gaps in *vendor* from its detail-modal record (in place)."""
    if not vendor.get("country") and (
        detail.get("processing_company") or detail.get("address")
    ):
        # `or ""` guards against an explicit None address.
        country = _country_from_address(detail.get("address") or "")
        if country:
            vendor["country"] = country
    if not vendor.get("purpose"):
        vendor["purpose"] = (detail.get("description") or "")[:500]
    if not vendor.get("opt_out_url"):
        vendor["opt_out_url"] = detail.get("opt_out_url", "")
    if not vendor.get("privacy_policy_url"):
        vendor["privacy_policy_url"] = detail.get("privacy_url", "")
    if not vendor.get("cookies"):
        vendor["cookies"] = detail.get("cookies", [])
    vendor["purposes"] = detail.get("purposes", [])
    vendor["technologies"] = detail.get("technologies", [])
    if not vendor.get("persistence"):
        vendor["persistence"] = detail.get("retention", "")
    vendor["processing_company"] = detail.get("processing_company", "")
    vendor["address"] = detail.get("address", "")


async def run_phase_d2(state: dict) -> None:
    """Run the vendor finalize stages + redundancy analysis.

    Reads ``cmp_vendors``, ``banner_result``, ``banner_url``, ``profile``
    and ``business_scope`` (required) plus ``cookie_text`` (optional) from
    *state*. Writes ``cmp_vendors``, ``tdm_opt_out_notice``,
    ``cookie_behavior_findings`` and ``redundancy_report`` back; may also
    add ``cookie_behavior_findings`` / ``implicit_vendor_findings`` to
    ``banner_result``. Mutates *state* in place and returns ``None``.

    Every stage is best-effort: a failure is logged as a warning and the
    remaining state is still written.
    """
    cmp_vendors = state["cmp_vendors"]
    cookie_text = state.get("cookie_text", "")
    banner_result = state["banner_result"]
    banner_url = state["banner_url"]
    profile = state["profile"]
    business_scope = state["business_scope"]

    tdm_opt_out_notice = ""
    cookie_behavior_findings: list[dict] = []
    redundancy_report = None

    try:
        from compliance.services.cookie_link_validator import (
            score_vendors,
            validate_vendor_urls,
        )

        # Cookie-library fallback (P52 Lite): always tried while fewer than
        # 20 vendors are known; the library resolves further vendors from
        # cookie names and the cookie document text.
        if banner_result and len(cmp_vendors) < 20:
            try:
                from compliance.services.cookie_to_vendor_fallback import (
                    fallback_vendors_for_run,
                )
                from database import SessionLocal as _SLfb

                _fb_db = _SLfb()
                try:
                    extra = fallback_vendors_for_run(
                        _fb_db, banner_result, len(cmp_vendors),
                        cookie_doc_text=cookie_text,
                    )
                    if extra:
                        count_before = len(cmp_vendors)
                        known = {_name_key(v) for v in cmp_vendors}
                        for v in extra:
                            key = _name_key(v)
                            if not key or key in known:
                                continue
                            cmp_vendors.append(v)
                            # Also de-duplicate within `extra` itself.
                            known.add(key)
                        logger.info(
                            "Cookie-Library-Fallback: cmp_vendors %d -> %d",
                            count_before, len(cmp_vendors),
                        )
                finally:
                    _fb_db.close()
            except Exception as e:
                logger.warning("Cookie-Library-Fallback skipped: %s", e)

        # Vendor normalizer: dedup (Google family etc.) + garbage filter.
        try:
            from compliance.services.vendor_normalizer import (
                normalize_vendors as _norm_v,
            )

            cmp_vendors = _norm_v(cmp_vendors)
        except Exception as e:
            logger.warning("vendor_normalizer skipped: %s", e)

        # P50: enrich vendors with per-vendor detail-modal extracts.
        if cmp_vendors and banner_result:
            vendor_details = banner_result.get("vendor_details") or []
            # P50f: pull the TDM opt-out sentinel out of the vendor list.
            tdm_sentinel = next(
                (v for v in vendor_details if v.get("name") == _TDM_SENTINEL),
                None,
            )
            if tdm_sentinel:
                tdm_opt_out_notice = tdm_sentinel.get("description", "")
                logger.info("P50f: TDM opt-out — skipped detail-enrichment for vendors")
            vendor_details = [
                v for v in vendor_details if v.get("name") != _TDM_SENTINEL
            ]
            if vendor_details:
                details_by_name = {}
                for d in vendor_details:
                    n = _name_key(d)
                    if n:
                        details_by_name[n] = d
                enriched = 0
                for v in cmp_vendors:
                    d = _find_detail(_name_key(v), details_by_name)
                    if not d:
                        continue
                    _enrich_vendor(v, d)
                    enriched += 1
                logger.info("P50: enriched %d/%d vendors with detail-modal data",
                            enriched, len(cmp_vendors))

        # P59b: cookie-behavior validator.
        if banner_result:
            cookies_detailed = banner_result.get("cookies_detailed") or []
            if cookies_detailed:
                cb_session = None
                try:
                    from database import SessionLocal
                    from compliance.services.cookie_behavior_validator import (
                        validate_cookie_behavior,
                    )

                    fp_domain = _first_party_domain(banner_url)
                    cb_session = SessionLocal()
                    cookie_behavior_findings = validate_cookie_behavior(
                        cb_session, cookies_detailed,
                        network_requests=[],  # TODO Layer B in P59d
                        first_party_domain=fp_domain,
                    ) or []
                    if cookie_behavior_findings:
                        sevs = {f["severity"] for f in cookie_behavior_findings}
                        logger.info(
                            "P59b: Cookie-Behavior-Check %d findings (severities: %s) "
                            "ueber %d Cookies",
                            len(cookie_behavior_findings),
                            sorted(sevs), len(cookies_detailed),
                        )
                        banner_result["cookie_behavior_findings"] = (
                            cookie_behavior_findings
                        )
                    else:
                        logger.info(
                            "P59b: Cookie-Behavior-Check 0 findings ueber %d Cookies "
                            "(library miss / clean)", len(cookies_detailed),
                        )
                except Exception as cb_err:
                    logger.warning("P59b Cookie-Behavior-Check failed: %s", cb_err)
                finally:
                    if cb_session is not None:
                        try:
                            cb_session.close()
                        except Exception as close_err:
                            # Closing is best-effort; record instead of hiding.
                            logger.debug("P59b session close failed: %s", close_err)

        # P61: implicit ("smuggled-in") cookies brought along by a vendor.
        if banner_result and cmp_vendors:
            try:
                from compliance.services.vendor_package_cookies import (
                    detect_implicit_cookies,
                )

                declared = [v["name"] for v in cmp_vendors if v.get("name")]
                actual_cookies: list[str] = []
                for phase_data in (banner_result.get("phases") or {}).values():
                    if not isinstance(phase_data, dict):
                        continue
                    for ck in (phase_data.get("cookies") or []):
                        if isinstance(ck, dict) and ck.get("name"):
                            actual_cookies.append(ck["name"])
                implicit_findings = detect_implicit_cookies(
                    declared, actual_cookies_set=actual_cookies or None,
                )
                if implicit_findings:
                    banner_result["implicit_vendor_findings"] = implicit_findings
                    logger.info(
                        "P61: %d implicit vendor-package items detected "
                        "(%d cookies + %d vendors)",
                        len(implicit_findings),
                        sum(1 for f in implicit_findings
                            if f["implicit"]["type"] == "cookie"),
                        sum(1 for f in implicit_findings
                            if f["implicit"]["type"] == "vendor"),
                    )
            except Exception as p61_err:
                logger.warning("P61 implicit-vendor detection failed: %s", p61_err)

        if cmp_vendors:
            logger.info("VVT: %d vendors extracted, validating links",
                        len(cmp_vendors))
            cmp_vendors = await validate_vendor_urls(cmp_vendors)
            cmp_vendors = score_vendors(cmp_vendors)
            try:
                from compliance.services.cookie_function_classifier import (
                    annotate_vendor_cookies,
                )

                cmp_vendors = [annotate_vendor_cookies(v) for v in cmp_vendors]
            except Exception as e:
                logger.warning("Cookie function classification skipped: %s", e)
    except Exception as e:
        logger.warning("VVT vendor finalize skipped: %s", e)

    # Vendor redundancy + EU alternatives + cost/savings (O4).
    try:
        from compliance.services.vendor_cost_estimator import infer_company_tier
        from compliance.services.vendor_redundancy import (
            analyze as analyze_redundancy,
        )

        if cmp_vendors:
            bp_dict = {
                "type": getattr(profile, "business_type", ""),
                "features": list(business_scope),
            }
            ctier = infer_company_tier(bp_dict)
            redundancy_report = analyze_redundancy(cmp_vendors, company_tier=ctier)
            logger.info(
                "Redundanz: %d Kategorien mit Mehrfach-Anbietern, "
                "Spar-Schaetzung %s pro Jahr (company_tier=%s)",
                redundancy_report["summary"]["redundancy_count"],
                redundancy_report["summary"]["estimated_saving_pct"],
                ctier,
            )
    except Exception as e:
        logger.warning("Vendor redundancy analysis skipped: %s", e)

    state["cmp_vendors"] = cmp_vendors
    state["tdm_opt_out_notice"] = tdm_opt_out_notice
    state["cookie_behavior_findings"] = cookie_behavior_findings
    state["redundancy_report"] = redundancy_report
"""Phase D-3-Bot — Bottom HTML blocks + final composition.

Covers (in the original Step 5):
  - P71 JC-vs-AVV decision tree (only when DSE ambig)
  - P6/P53/P55 industry context + site history
  - P106 internal-checks block
  - P85 banner screenshot
  - A audit-quality checks (banner-detect failure, thin vendor extract)
  - P82 GF one-pager
  - Doc-input warnings (user pasted text into the wrong field)
  - P86 industry benchmark
  - P84 diff mode (since-last-run delta)
  - Final HTML composition

NOTE: in the original code `audit_quality_findings` was used by
build_gf_one_pager_html BEFORE it was initialised — a silent
UnboundLocalError caught by the surrounding try/except, so the
gf_one_pager block effectively never rendered. Here we run
audit-quality FIRST so the data is actually available.
"""

from __future__ import annotations

import logging

logger = logging.getLogger(__name__)


async def run_phase_d3_bot(state: dict) -> None:
    """Render the bottom blocks and assemble ``full_html``.

    Reads ``check_id``, ``req``, ``doc_entries``, ``doc_texts``,
    ``banner_result``, ``cmp_vendors``, ``mc_split``, ``scorecard`` and
    ``html_blocks`` (required) plus ``prev_scorecard``, ``mismatches``,
    ``site_name_for_exec``, ``domain_for_exec``, ``reachability_html`` and
    ``retention_html`` (optional) from *state*. Writes
    ``audit_quality_findings`` and ``full_html``. Mutates *state* in place.

    Each block is best-effort: a failing renderer is logged and contributes
    an empty string. The final composition tolerates ``None`` fragments and
    missing ``html_blocks`` keys so one bad block cannot lose the whole mail.
    """
    check_id = state["check_id"]
    req = state["req"]
    doc_entries = state["doc_entries"]
    doc_texts = state["doc_texts"]
    banner_result = state["banner_result"]
    cmp_vendors = state["cmp_vendors"]
    mc_split = state["mc_split"]
    scorecard = state["scorecard"]
    prev_scorecard = state.get("prev_scorecard")
    mismatches = state.get("mismatches") or []
    site_name_for_exec = state.get("site_name_for_exec", "")
    domain_for_exec = state.get("domain_for_exec")
    html_blocks = state["html_blocks"]

    # P71: JC-vs-AVV decision tree
    jc_decision_html = ""
    try:
        from compliance.services.jc_avv_decision import (
            build_jc_avv_decision_html,
        )

        jc_decision_html = build_jc_avv_decision_html(doc_texts.get("dse"))
    except Exception as e:
        logger.warning("P71 jc_avv_decision skipped: %s", e)

    # P6/P53/P55 — industry context + site history
    industry_ctx_html = ""
    try:
        from compliance.services.industry_library import (
            build_industry_context_block_html,
            load_site_profile,
        )
        from database import SessionLocal as _SLib

        _ind_db = _SLib()
        try:
            ind = (req.scan_context or {}).get("industry")
            site_prof = load_site_profile(_ind_db, domain_for_exec or "")
            industry_ctx_html = build_industry_context_block_html(ind, site_prof)
        finally:
            _ind_db.close()
    except Exception as e:
        logger.warning("industry context skipped: %s", e)

    # P106 — internal-checks block
    internal_checks_html = ""
    try:
        from compliance.services.mc_audit_type import (
            build_internal_checks_block_html,
        )

        ic = (mc_split or {}).get("internal_checks") or []
        if ic:
            internal_checks_html = build_internal_checks_block_html(ic)
            logger.info("P106: %d interne Checks (statt FAIL) im Block",
                        len(ic))
    except Exception as e:
        logger.warning("P106 internal_checks_html skipped: %s", e)

    # P85 — banner screenshot
    banner_shot_html = ""
    try:
        from compliance.services.banner_screenshot_block import (
            build_banner_screenshot_html,
        )

        banner_shot_html = build_banner_screenshot_html(banner_result)
    except Exception as e:
        logger.warning("P85 banner-screenshot skipped: %s", e)

    # A — audit-quality checks (run BEFORE gf_one_pager so the data is
    # available — original code had this inverted, causing an
    # UnboundLocalError that was silently caught).
    audit_quality_html = ""
    audit_quality_findings: list[dict] = []
    try:
        from compliance.services.audit_quality_checks import (
            build_audit_quality_block_html,
            run_all as run_audit_quality,
        )

        cookie_text_for_aq = doc_texts.get("cookie") or ""
        audit_quality_findings = run_audit_quality(
            banner_result, cookie_text_for_aq, cmp_vendors, doc_entries,
        )
        if audit_quality_findings:
            audit_quality_html = build_audit_quality_block_html(audit_quality_findings)
            logger.info("audit-quality: %d Vorbehalte erkannt",
                        len(audit_quality_findings))
    except Exception as e:
        logger.warning("audit-quality-checks failed: %s", e)

    # P82: GF one-pager (now receives the filled audit_quality_findings)
    gf_one_pager_html = ""
    try:
        from compliance.services.gf_one_pager import build_gf_one_pager_html

        gf_one_pager_html = build_gf_one_pager_html(
            site_name=site_name_for_exec,
            scorecard=scorecard,
            previous_scorecard=prev_scorecard,
            banner_result=banner_result,
            library_mismatch_findings=mismatches,
            scan_context=req.scan_context,
            audit_quality_findings=audit_quality_findings,
        )
    except Exception as e:
        logger.warning("P82 GF-1-pager skipped: %s", e)

    # Doc-input warnings — when the user pasted text into the wrong field
    input_warn_html = ""
    try:
        from compliance.services.doc_input_warnings import (
            build_warnings_block_html,
            collect_warnings,
        )

        warns = collect_warnings(doc_entries)
        if warns:
            input_warn_html = build_warnings_block_html(warns)
            logger.info("doc-input-warnings: %d Mismatches gefunden", len(warns))
    except Exception as e:
        logger.warning("doc-input-warnings skipped: %s", e)

    # P86: industry benchmark
    bench_html = ""
    try:
        from compliance.services.industry_benchmark import (
            _extract_score,
            build_benchmark_html,
            compute_benchmark,
        )
        from database import SessionLocal as _SLb

        industry = (req.scan_context or {}).get("industry")
        curr_score = _extract_score(banner_result)
        if industry and curr_score is not None:
            _b_db = _SLb()
            try:
                bench = compute_benchmark(
                    _b_db, industry, curr_score, check_id,
                )
                if bench:
                    bench_html = build_benchmark_html(bench)
            finally:
                _b_db.close()
    except Exception as e:
        logger.warning("P86 industry-benchmark skipped: %s", e)

    # P84: diff mode
    diff_html = ""
    try:
        from compliance.services.run_diff import (
            build_diff_block_html,
            compute_diff,
        )
        from database import SessionLocal as _SL

        _diff_db = _SL()
        try:
            diff = compute_diff(
                _diff_db, check_id, domain_for_exec or "",
                banner_result, scorecard,
            )
            if diff:
                diff_html = build_diff_block_html(diff)
        finally:
            _diff_db.close()
    except Exception as e:
        logger.warning("P84 diff-mode skipped: %s", e)

    # B1 / B3 cross-cutting findings (own renderers, may be empty).
    reachability_html = state.get("reachability_html", "")
    retention_html = state.get("retention_html", "")

    missing_blocks: list[str] = []

    def block(key: str):
        """Return html_blocks[key], remembering keys that are absent."""
        if key not in html_blocks:
            missing_blocks.append(key)
        return html_blocks.get(key)

    # Order is sales-optimised.
    # B1 (Reachability) sits next to critical because it's an Art.7-Abs.3
    # finding. B3 (Retention) sits next to cookie_audit because both
    # are 3-source comparisons of cookie metadata.
    parts = [
        gf_one_pager_html, audit_quality_html, input_warn_html,
        bench_html, diff_html,
        block("critical_html"), reachability_html,
        block("scope_disclaimer_html"),
        block("exec_summary_html"),
        block("cookie_arch_html"), block("summary_html"),
        block("scanned_html"), block("profile_html"),
        block("scorecard_html"), internal_checks_html,
        block("redundancy_html"),
        industry_ctx_html,
        banner_shot_html,
        block("providers_html"), block("banner_deep_html"),
        block("cookie_audit_html"), retention_html,
        block("tcf_authority_html"),
        block("entropy_html"),
        block("network_trace_html"),
        block("library_mismatch_html"),
        block("consistency_html"), block("signals_html"),
        block("solutions_html"),
        jc_decision_html,
        block("vvt_html"), block("report_html"),
    ]
    # A renderer that returned None (or a block that was never produced)
    # must not crash the composition with a TypeError/KeyError.
    full_html = "".join(part or "" for part in parts)
    if missing_blocks:
        logger.warning("full_html: html_blocks entries missing: %s",
                       ", ".join(missing_blocks))

    state["audit_quality_findings"] = audit_quality_findings
    state["full_html"] = full_html
"""Phase D-3-Mid — Mid HTML blocks (P62/P103/P104/P105/audit/mismatch/signals).

Covers (in the original Step 5):
  - P62 scope disclaimer
  - P103 cookie-value entropy + P104 network tracing
  - P105 IAB TCF authority cross-reference
  - Cookie compliance audit (3-source comparison, central USP)
  - P102 cookie classification check (library mismatch)
  - P35/P77/P78 doc-text signals
  - P92/P94 banner consistency
  - P73 MC solution generator (LLM suggestions per HIGH fail)
"""

from __future__ import annotations

import logging

logger = logging.getLogger(__name__)


async def run_phase_d3_mid(state: dict) -> None:
    """Render the mid-section HTML blocks.

    Reads ``doc_entries``, ``doc_texts``, ``banner_result``,
    ``cmp_vendors``, ``fails_by_doc`` and ``html_blocks`` from *state*
    (all required). Adds nine ``*_html`` entries to ``html_blocks`` and
    writes ``cookie_audit`` and ``mismatches``. Mutates *state* in place.

    Each block is best-effort: a failing check is logged as a warning and
    its block stays an empty string. P103 and P104 are guarded separately
    so that a failure in one does not suppress the other.
    """
    doc_entries = state["doc_entries"]
    doc_texts = state["doc_texts"]
    banner_result = state["banner_result"]
    cmp_vendors = state["cmp_vendors"]
    fails_by_doc = state["fails_by_doc"]
    html_blocks = state["html_blocks"]

    # P62: marketing-manager disclaimer
    scope_disclaimer_html = ""
    try:
        from ..scope_disclaimer import build_scope_disclaimer_html

        scope_disclaimer_html = build_scope_disclaimer_html()
    except Exception as e:
        logger.warning("Scope-disclaimer block skipped: %s", e)

    # P103 — cookie-value entropy
    entropy_html = ""
    try:
        from compliance.services.cookie_value_entropy import (
            build_entropy_block_html,
            check_cookies_for_entropy_mismatch,
        )

        cookies_detailed = (banner_result or {}).get("cookies_detailed") or []
        entropy_findings = check_cookies_for_entropy_mismatch(cookies_detailed)
        if entropy_findings:
            entropy_html = build_entropy_block_html(entropy_findings)
            logger.info("P103 Entropy: %d Findings", len(entropy_findings))
    except Exception as e:
        logger.warning("P103/P104 entropy/network-trace skipped: %s", e)

    # P104 — network tracing (independent of P103)
    network_trace_html = ""
    try:
        from compliance.services.cookie_network_tracer import (
            build_network_trace_block_html,
            trace_cookie_network,
        )

        cookies_detailed = (banner_result or {}).get("cookies_detailed") or []
        # First submitted document that carries a URL.
        primary_url = ""
        for entry in doc_entries:
            if entry.get("url"):
                primary_url = entry["url"]
                break
        net_findings = trace_cookie_network(cookies_detailed, primary_url)
        if net_findings:
            network_trace_html = build_network_trace_block_html(net_findings)
            logger.info("P104 Network-Trace: %d Findings", len(net_findings))
    except Exception as e:
        logger.warning("P103/P104 entropy/network-trace skipped: %s", e)

    # P105 — IAB TCF authority cross-reference
    tcf_authority_html = ""
    try:
        from compliance.services.tcf_vendor_authority import (
            build_tcf_authority_block_html,
            cross_reference_with_tcf,
        )
        from database import SessionLocal as _SLtcf

        _tcf_db = _SLtcf()
        try:
            tcf_findings = cross_reference_with_tcf(_tcf_db, cmp_vendors)
            if tcf_findings:
                tcf_authority_html = build_tcf_authority_block_html(tcf_findings)
                logger.info(
                    "TCF-Authority: %d Vendor-Discrepancies gefunden",
                    len(tcf_findings),
                )
        finally:
            _tcf_db.close()
    except Exception as e:
        logger.warning("TCF-Authority-Check skipped: %s", e)

    # Cookie compliance audit (3-source comparison — central USP)
    cookie_audit: dict = {}
    cookie_audit_html = ""
    try:
        from compliance.services.cookie_compliance_audit import (
            audit_cookie_compliance,
            build_cookie_audit_block_html,
        )
        from database import SessionLocal as _SLca

        _ca_db = _SLca()
        try:
            # `or {}` keeps state["cookie_audit"] a dict even when the
            # audit returns nothing.
            cookie_audit = audit_cookie_compliance(
                _ca_db, doc_texts.get("cookie") or doc_texts.get("dse"),
                banner_result,
            ) or {}
            if (cookie_audit.get("declared_count")
                    or cookie_audit.get("browser_count")):
                cookie_audit_html = build_cookie_audit_block_html(cookie_audit)
                # Only one of the two counts is guaranteed truthy here, so
                # default the other to 0 for the %d placeholders.
                logger.info(
                    "Cookie-Audit: %d deklariert, %d im Browser, "
                    "%d undokumentiert, %d compliant",
                    cookie_audit.get("declared_count") or 0,
                    cookie_audit.get("browser_count") or 0,
                    len(cookie_audit.get("undeclared_in_browser") or []),
                    len(cookie_audit.get("compliant") or []),
                )
        finally:
            _ca_db.close()
    except Exception as e:
        logger.warning("cookie-compliance-audit skipped: %s", e)

    # P102: cookie classification check
    library_mismatch_html = ""
    mismatches: list[dict] = []
    try:
        from compliance.services.cookie_library_mismatch import (
            build_mismatch_block_html,
            detect_mismatches,
        )
        from database import SessionLocal

        cookie_doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or ""
        # Cookie names from every scan phase; entries may be plain strings
        # or dicts carrying a "name".
        all_cookies_seen: list[str] = []
        if banner_result:
            for ph in (banner_result.get("phases") or {}).values():
                if not isinstance(ph, dict):
                    continue
                for ck in (ph.get("cookies") or []):
                    if isinstance(ck, str):
                        all_cookies_seen.append(ck)
                    elif isinstance(ck, dict) and ck.get("name"):
                        all_cookies_seen.append(ck["name"])
        if all_cookies_seen and cookie_doc_for_check:
            _mm_db = SessionLocal()
            try:
                mismatches = detect_mismatches(
                    _mm_db, all_cookies_seen, cookie_doc_for_check,
                )
                if mismatches:
                    library_mismatch_html = build_mismatch_block_html(mismatches)
                    logger.info(
                        "P102: %d Cookie-Mismatches gefunden", len(mismatches),
                    )
            finally:
                _mm_db.close()
    except Exception as e:
        logger.warning("P102 mismatch detection failed: %s", e)

    # P35 + P77 + P78: text-signal checks
    signals_html = ""
    try:
        from compliance.services.doc_text_signals import (
            build_signals_block_html,
            run_all as run_signal_checks,
        )

        cookie_doc_missing = not bool(doc_texts.get("cookie"))
        sig_findings = run_signal_checks(
            banner_result, doc_texts, cookie_doc_missing,
        )
        if sig_findings:
            signals_html = build_signals_block_html(sig_findings)
    except Exception as e:
        logger.warning("P35/P77/P78 signals-check failed: %s", e)

    # P92 + P94: banner consistency
    consistency_html = ""
    try:
        from compliance.services.banner_consistency_checks import (
            build_consistency_block_html,
            run_all as run_consistency_checks,
        )

        cookie_doc_for_check = (doc_texts.get("cookie")
                                or doc_texts.get("dse") or "")
        cons_findings = run_consistency_checks(
            banner_result or {}, cookie_doc_for_check, cmp_vendors,
            doc_texts=doc_texts,
        )
        if cons_findings:
            consistency_html = build_consistency_block_html(cons_findings)
            logger.info("P92/P94: %d Konsistenz-Findings", len(cons_findings))
    except Exception as e:
        logger.warning("P92/P94 consistency-check failed: %s", e)

    # P73: MC solution generator — LLM suggestions per HIGH fail
    solutions_html = ""
    try:
        from compliance.services.mc_solution_generator import (
            build_solutions_block_html,
            generate_solutions_for_fails,
        )

        all_solutions: list[dict] = []
        for dt, fails in fails_by_doc.items():
            if not fails:
                continue
            doc_txt = doc_texts.get(dt) or doc_texts.get("dse") or ""
            if len(doc_txt) < 500:
                continue
            sols = await generate_solutions_for_fails(
                fails, doc_txt, dt, limit=3,
            )
            all_solutions.extend(sols)
            if len(all_solutions) >= 8:
                break
        # At most 8 suggestions are rendered; log the rendered count.
        rendered_solutions = all_solutions[:8]
        if rendered_solutions:
            solutions_html = build_solutions_block_html(rendered_solutions)
            logger.info("P73: %d MC-Solutions generiert", len(rendered_solutions))
    except Exception as e:
        logger.warning("P73 MC-Solution-Generator skipped: %s", e)

    html_blocks.update({
        "scope_disclaimer_html": scope_disclaimer_html,
        "entropy_html": entropy_html,
        "network_trace_html": network_trace_html,
        "tcf_authority_html": tcf_authority_html,
        "cookie_audit_html": cookie_audit_html,
        "library_mismatch_html": library_mismatch_html,
        "signals_html": signals_html,
        "consistency_html": consistency_html,
        "solutions_html": solutions_html,
    })
    state["cookie_audit"] = cookie_audit
    state["mismatches"] = mismatches
"""Phase D-3-Top — Top-of-mail HTML blocks.

Covers (in the original Step 5 of `_run_compliance_check`):
  - Summary / Scanned-URLs / Provider-list / Banner-deep / VVT HTML
  - MC-scorecard aggregation (all_mc_checks + scorecard) + trend lookup
  - P106 mc_audit_type split (internal_checks vs. verifiable_fails)
  - Profile HTML / Redundancy HTML
  - P1 Executive Summary
  - P18 Critical Findings block
  - P10 Cookie-Policy-Architecture detection
"""

from __future__ import annotations

import logging

from ._helpers import _build_profile_html, _company_name_from_url, _extract_domain

logger = logging.getLogger(__name__)


async def run_phase_d3_top(state: dict) -> None:
    """Render the top-of-mail HTML blocks.

    Reads ``req``, ``results``, ``doc_entries``, ``doc_texts``,
    ``banner_result``, ``vvt_entries``, ``cmp_vendors`` and ``profile``
    (required) plus ``redundancy_report`` and ``cookie_payloads``
    (optional) from *state*. Writes ``scorecard``, ``prev_scorecard``,
    ``mc_split``, ``fails_by_doc``, ``site_name_for_exec``,
    ``domain_for_exec`` and a fresh ``html_blocks`` dict. Mutates *state*
    in place and returns ``None``.
    """
    req = state["req"]
    results = state["results"]
    doc_entries = state["doc_entries"]
    doc_texts = state["doc_texts"]
    banner_result = state["banner_result"]
    vvt_entries = state["vvt_entries"]
    cmp_vendors = state["cmp_vendors"]
    profile = state["profile"]
    redundancy_report = state.get("redundancy_report")

    from ..agent_doc_check_banner import build_banner_deep_html
    from ..agent_doc_check_critical import build_critical_findings_html
    from ..agent_doc_check_exec_summary import build_exec_summary_html
    from ..agent_doc_check_extras import build_vvt_table_html
    from ..agent_doc_check_redundancy import build_redundancy_html
    from ..agent_doc_check_report import (
        build_html_report,
        build_management_summary,
        build_provider_list_html,
        build_scanned_urls_html,
    )
    from ..agent_doc_check_scorecard import build_scorecard_html
    from compliance.services.mc_scorecard import build_scorecard

    summary_html = build_management_summary(results)
    scanned_html = build_scanned_urls_html(doc_entries)
    providers_html = build_provider_list_html(banner_result, vvt_entries)
    # P18: deep block with phases + quality score + per-category tracker
    banner_deep_html = build_banner_deep_html(banner_result)
    vvt_html = build_vvt_table_html(cmp_vendors)

    # MC scorecard aggregated across ALL docs (DSGVO/TDDDG/BGB/...)
    all_mc_checks: list[dict] = []
    fails_by_doc: dict[str, list[dict]] = {}
    for result in results:
        for check in result.checks:
            if not check.id.startswith("mc-"):
                continue
            record = {
                "id": check.id, "label": check.label, "passed": check.passed,
                "severity": check.severity, "skipped": check.skipped,
                "regulation": check.regulation,
                "hint": getattr(check, "hint", "") or "",
            }
            all_mc_checks.append(record)
            failed = not check.passed and not check.skipped
            if failed and (check.severity or "").upper() in ("CRITICAL", "HIGH"):
                fails_by_doc.setdefault(result.doc_type, []).append(record)

    # P106 — audit-type classification per MC
    mc_split: dict = {"internal_checks": [], "verifiable_fails": all_mc_checks}
    try:
        from compliance.services.mc_audit_type import (
            annotate_mc_results, split_by_audit_type,
        )

        annotate_mc_results(all_mc_checks)
        mc_split = split_by_audit_type(all_mc_checks)
        # After a successful split every verifiable fail is grouped under
        # the "dse" key, replacing the per-doc grouping built above.
        fails_by_doc = {}
        for verifiable in mc_split.get("verifiable_fails") or []:
            fails_by_doc.setdefault("dse", []).append(verifiable)
    except Exception as e:
        logger.warning("P106 mc_audit_type skipped: %s", e)

    if all_mc_checks:
        scorecard = build_scorecard(all_mc_checks)
    else:
        scorecard = {}

    # Trend: load previous scorecard for the same tenant + domain
    prev_scorecard: dict | None = None
    if scorecard:
        try:
            from compliance.services.compliance_audit_log import (
                list_runs_for_tenant,
            )

            earlier_runs = list_runs_for_tenant(
                req.recipient or "",
                base_domain=_extract_domain(doc_entries) or "",
                limit=1,
            )
            if earlier_runs:
                prev_scorecard = earlier_runs[0].get("scorecard")
        except Exception as e:
            logger.debug("trend lookup skipped: %s", e)

    if scorecard:
        scorecard_html = build_scorecard_html(
            scorecard, previous_scorecard=prev_scorecard,
        )
    else:
        scorecard_html = ""

    report_html = build_html_report(results, None, doc_texts)
    profile_html = _build_profile_html(profile)

    # O4: vendor redundancy / EU alternatives + cost-savings block
    redundancy_html = build_redundancy_html(redundancy_report)

    # P1: executive summary
    company_from_url = _company_name_from_url(doc_entries)
    domain_for_exec = _extract_domain(doc_entries)
    site_name_for_exec = company_from_url or domain_for_exec or ""
    exec_summary_html = build_exec_summary_html(
        scorecard=scorecard,
        previous_scorecard=prev_scorecard,
        cmp_vendors=cmp_vendors,
        redundancy_report=redundancy_report,
        site_name=site_name_for_exec,
    )

    # P18: critical-findings block
    critical_html = ""
    try:
        critical_html = build_critical_findings_html(
            banner_result=banner_result,
            scorecard=scorecard,
            results=results,
        )
    except Exception as e:
        logger.warning("Critical-findings block skipped: %s", e)

    # P10: cookie-policy architecture detection (recognise the BMW pattern)
    cookie_arch_html = ""
    try:
        from compliance.services.cookie_policy_architecture import (
            build_architecture_html,
            detect_architecture,
        )

        arch_url = ""
        arch_text = doc_texts.get("cookie", "")
        arch_payloads: list[dict] = []
        cookie_entry = next(
            (entry for entry in doc_entries
             if (entry.get("doc_type") or "").lower()
             in ("cookie", "cookie_policy")),
            None,
        )
        if cookie_entry is not None:
            arch_url = cookie_entry.get("url", "")
            arch_payloads = cookie_entry.get("cmp_payloads") or []

        # P17-A: fallback when the cookie doc was deduped via P15
        if not arch_text:
            dse_text = doc_texts.get("dse", "")
            keywords = ("cookie", "tracking", "google analytics", "consent")
            if dse_text and any(word in dse_text.lower() for word in keywords):
                arch_text = dse_text
                dse_entry = next(
                    (entry for entry in doc_entries
                     if entry.get("doc_type") == "dse"),
                    {},
                )
                arch_url = dse_entry.get("url", "")
                arch_payloads = dse_entry.get("cmp_payloads") or []
                logger.info("P17-A: cookie-arch fallback auf DSE")

        if arch_text:
            arch = detect_architecture(
                doc_url=arch_url,
                doc_text=arch_text,
                cmp_payloads=arch_payloads,
                homepage_cmp_payloads=state.get("cookie_payloads") or [],
            )
            cookie_arch_html = build_architecture_html(arch)
            logger.info("cookie-arch: layer=%s versioned=%s risk=%s",
                        arch["layer_separation"], arch["versioned"],
                        arch["risk_label"])
    except Exception as e:
        logger.warning("cookie-architecture detection failed: %s", e)

    state["scorecard"] = scorecard
    state["prev_scorecard"] = prev_scorecard
    state["mc_split"] = mc_split
    state["fails_by_doc"] = fails_by_doc
    state["site_name_for_exec"] = site_name_for_exec
    state["domain_for_exec"] = domain_for_exec
    state["html_blocks"] = {
        "summary_html": summary_html,
        "scanned_html": scanned_html,
        "providers_html": providers_html,
        "banner_deep_html": banner_deep_html,
        "vvt_html": vvt_html,
        "scorecard_html": scorecard_html,
        "report_html": report_html,
        "profile_html": profile_html,
        "redundancy_html": redundancy_html,
        "exec_summary_html": exec_summary_html,
        "critical_html": critical_html,
        "cookie_arch_html": cookie_arch_html,
    }
+""" + +from __future__ import annotations + +import logging + +from compliance.services.smtp_sender import send_email + +from ._helpers import _company_name_from_url, _extract_domain, _update + +logger = logging.getLogger(__name__) + + +def run_phase_e(state: dict) -> None: + """Build site label, optional ZIP attachment, send mail. Mutate state.""" + check_id = state["check_id"] + req = state["req"] + results = state["results"] + doc_entries = state["doc_entries"] + full_html = state["full_html"] + cookie_evidence_slices = state.get("cookie_evidence_slices") + cookie_evidence_meta = state.get("cookie_evidence_meta") + + # Derive site name primarily from entered URL. + # The extracted_profile.companyName is often noisy (e.g. picks up + # juris.de from legal references). Domain-derived name is more + # predictable for the GF email subject. + doc_count = len([r for r in results if not r.error]) + url_company = _company_name_from_url(doc_entries) + domain = _extract_domain(doc_entries) + site_name = url_company or domain or "Unbekannt" + _update(check_id, "E-Mail wird versendet...", 98) + + # A1: bundle cookie-evidence slices into a ZIP attachment so the + # audit chain reaches the recipient. Each slice has its own + # SHA-256 + capture timestamp; manifest.json + audit_metadata.json + # make the chain verifiable for an external auditor. 
+ evidence_attachments: list[dict] = [] + if cookie_evidence_slices: + try: + from compliance.services.evidence_zip_builder import ( + build_evidence_zip, + ) + zip_bytes = build_evidence_zip( + slices=cookie_evidence_slices, + meta=cookie_evidence_meta, + check_id=check_id, + ) + evidence_attachments.append({ + "filename": f"evidence-{check_id[:8]}.zip", + "data": zip_bytes, + "mime": "application/zip", + }) + except Exception as e: + logger.warning("A1 evidence-zip build failed: %s", e) + + email_result = send_email( + recipient=req.recipient, + subject=f"[COMPLIANCE-CHECK] {site_name} — {doc_count} Dokumente geprueft", + body_html=full_html, + attachments=evidence_attachments or None, + ) + + state["email_result"] = email_result + state["site_name"] = site_name + state["domain"] = domain + state["doc_count"] = doc_count diff --git a/backend-compliance/compliance/api/agent_check/_phase_f_persist.py b/backend-compliance/compliance/api/agent_check/_phase_f_persist.py new file mode 100644 index 00000000..413a9ef6 --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_phase_f_persist.py @@ -0,0 +1,166 @@ +"""Phase F — Build response + persist snapshot/audit-log/unified-findings. + +Covers (in the original `_run_compliance_check`): + - Step 7 Build response dict, mark job as completed + - P80 Persist raw scan data so we can replay the audit pipeline + without re-crawling (7min → 5sec test cycle) + - SQLite audit log (compliance.api/audit endpoints + trend view A6) + - P5 Unified findings (MC + Pflichtangaben + Vendor + Redundanz + in one searchable table behind /agent/findings/) +""" + +from __future__ import annotations + +import logging +from datetime import datetime, timezone + +from ._constants import _compliance_check_jobs +from ._helpers import _result_to_dict + +logger = logging.getLogger(__name__) + + +def run_phase_f(state: dict) -> None: + """Build response + persist. 
Mutates state in place.""" + check_id = state["check_id"] + req = state["req"] + results = state["results"] + profile = state["profile"] + profile_dict = state["profile_dict"] + extracted_profile = state["extracted_profile"] + banner_result = state["banner_result"] + tcf_vendors = state["tcf_vendors"] + vvt_entries = state["vvt_entries"] + cmp_vendors = state["cmp_vendors"] + cookie_audit = state["cookie_audit"] + total_findings = state["total_findings"] + email_result = state["email_result"] + doc_entries = state["doc_entries"] + doc_texts = state["doc_texts"] + redundancy_report = state.get("redundancy_report") + scorecard = state["scorecard"] + site_name = state.get("site_name", "") + domain = state.get("domain", "") + doc_count = state.get("doc_count", 0) + + response = { + "check_id": check_id, + "results": [_result_to_dict(r) for r in results], + "business_profile": profile_dict, + "extracted_profile": extracted_profile, + # P18: vollen consent-tester-Output durchreichen statt nur 4 Felder. + # phases (before/after-accept/reject) + banner_checks.violations + + # category_tests werden vom Renderer + Critical-Findings-Block genutzt. 
+ "banner_result": ({ + "detected": banner_result.get("banner_detected", False), + "provider": banner_result.get("banner_provider", ""), + "violations": len((banner_result.get("banner_checks") or {}) + .get("violations", [])), + "tcf_vendor_count": len(tcf_vendors), + "completeness_pct": banner_result.get("completeness_pct"), + "correctness_pct": banner_result.get("correctness_pct"), + "phases": banner_result.get("phases", {}), + "banner_checks": banner_result.get("banner_checks", {}), + "category_tests": banner_result.get("category_tests", []), + "structured_checks": banner_result.get("structured_checks", []), + "summary": banner_result.get("summary", {}), + } if banner_result else None), + "tcf_vendors": vvt_entries if tcf_vendors else [], + "cmp_vendors": cmp_vendors, + "cookie_audit": cookie_audit if cookie_audit else None, + "total_documents": len(results), + "total_findings": total_findings, + "email_status": email_result.get("status", "failed"), + "checked_at": datetime.now(timezone.utc).isoformat(), + } + + _compliance_check_jobs[check_id]["status"] = "completed" + _compliance_check_jobs[check_id]["result"] = response + _compliance_check_jobs[check_id]["progress"] = "Fertig" + _compliance_check_jobs[check_id]["progress_pct"] = 100 + + # P80: persist raw scan data so we can replay audit pipeline + # without re-crawling (7min -> 5sec test cycle). 
+ try: + from database import SessionLocal + from compliance.services.check_snapshot import save_snapshot + snap_db = SessionLocal() + try: + save_snapshot( + snap_db, + check_id=check_id, + doc_entries=doc_entries, + banner_result=banner_result, + profile=profile, + cmp_vendors=cmp_vendors, + scan_context=req.scan_context, # P79 + site_label=site_name, + notes=f"recipient={req.recipient}", + ) + finally: + snap_db.close() + except Exception as snap_err: + logger.warning("P80 snapshot save skipped: %s", snap_err) + + # Persist to sidecar SQLite audit log — enables /audit endpoints + # (A5 admin tab) and trend view (A6). Best-effort; failures here + # do not affect the user-facing response. + try: + from compliance.services.compliance_audit_log import record_check_run + from compliance.services.mc_scorecard import full_audit_records + audit_rows: list[dict] = [] + for r in results: + doc_mc = [c for c in r.checks if c.id.startswith("mc-")] + audit_rows.extend(full_audit_records( + [{"id": c.id, "label": c.label, "passed": c.passed, + "severity": c.severity, "skipped": c.skipped, + "regulation": c.regulation, "matched_text": c.matched_text, + "hint": c.hint, "level": c.level} + for c in doc_mc], + check_id=check_id, + doc_type=r.doc_type, + )) + record_check_run( + check_id=check_id, + tenant_id=req.recipient or "", + site_name=site_name, + base_domain=domain or "", + doc_count=doc_count, + scorecard=scorecard, + vvt_summary={ + "total": len(cmp_vendors), + "internal": sum(1 for v in cmp_vendors + if (v.get("recipient_type") or "").upper() + in ("INTERNAL", "GROUP_COMPANY")), + "external": sum(1 for v in cmp_vendors + if (v.get("recipient_type") or "").upper() + in ("PROCESSOR", "CONTROLLER")), + }, + mc_records=audit_rows, + ) + from compliance.services.compliance_audit_log import record_check_payload + record_check_payload( + check_id=check_id, + vendors=cmp_vendors, + profile=extracted_profile, + banner=banner_result, + ) + # Unified findings (P5): bundle MC + 
Pflichtangaben + Vendor + + # Redundanz in one searchable table behind /agent/findings/. + try: + from compliance.services.unified_findings_collector import collect + from compliance.services.unified_findings_store import record_findings + unified = collect( + check_id=check_id, + results=results, + cmp_vendors=cmp_vendors, + redundancy_report=redundancy_report, + doc_texts=doc_texts, + ) + record_findings(check_id, unified) + except Exception as e: + logger.warning("Unified findings collect failed: %s", e) + except Exception as e: + logger.warning("Audit persistence skipped: %s", e) + + state["response"] = response diff --git a/backend-compliance/compliance/api/agent_check/_schemas.py b/backend-compliance/compliance/api/agent_check/_schemas.py new file mode 100644 index 00000000..d4625533 --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_schemas.py @@ -0,0 +1,44 @@ +"""Pydantic request/response schemas for the compliance-check route.""" + +from __future__ import annotations + +from pydantic import BaseModel + + +class ExtractTextRequest(BaseModel): + url: str + + +class DocumentInput(BaseModel): + doc_type: str # dse, agb, impressum, cookie, widerruf, avv, loeschkonzept, etc. + url: str = "" + text: str = "" # text has priority over URL + + +class ComplianceCheckRequest(BaseModel): + documents: list[DocumentInput] + use_agent: bool = False + recipient: str = "dsb@breakpilot.local" + # P12: Override fuer TDM-Vorbehalt bei dokumentierter Kunden-Erlaubnis. + # Pflichtfeld tdm_override_reason wenn tdm_override=True + # (z.B. "Auftragsbeziehung Safetykon GmbH, Email Hr. X 18.05.2026"). + tdm_override: bool = False + tdm_override_reason: str = "" + # P79: 8-Feld Pre-Scan-Wizard (Branche, B2B/B2C, Direkt-Vertrieb, + # Rechtsform, Konzern, MA, Besondere Daten, Drittland). Wird im + # Snapshot persistiert und filtert die MC-Auswertung (P72). 
+ scan_context: dict | None = None + + +class ComplianceCheckStartResponse(BaseModel): + check_id: str + status: str = "running" + + +class ComplianceCheckStatusResponse(BaseModel): + check_id: str + status: str + progress: str = "" + progress_pct: int = 0 + result: dict | None = None + error: str = "" diff --git a/backend-compliance/compliance/api/agent_check/_single_check.py b/backend-compliance/compliance/api/agent_check/_single_check.py new file mode 100644 index 00000000..48ced787 --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_single_check.py @@ -0,0 +1,118 @@ +"""Per-document regex + MC + LLM checks for the compliance-check route. + +Each document goes through: + 1. regex completeness/correctness checklist + 2. Master Control evaluation (all MCs for this doc_type) + 3. LLM verification of failed regex checks (overturns where evidence + was missed by the regex) + 4. Cookie-only: opt-out + privacy-policy URL health-check +""" + +from __future__ import annotations + +import logging + +logger = logging.getLogger(__name__) + + +async def _check_single( + text: str, doc_type: str, label: str, url: str, + word_count: int, use_agent: bool, + business_scope: set[str] | None = None, + business_profile: dict | None = None, +): + """Run regex + MC checks on a single document.""" + from compliance.services.doc_checks.runner import check_document_completeness + from compliance.services.rag_document_checker import check_document_with_controls + from ..agent_doc_check_routes import CheckItem, DocCheckResult + + # Regex checklist + findings = check_document_completeness(text, doc_type, label, url, + business_profile=business_profile) + + all_checks: list[CheckItem] = [] + completeness = 0 + correctness = 0 + + for f in findings: + if "SCORE" in f.get("code", ""): + for c in f.get("all_checks", []): + all_checks.append(CheckItem( + id=c["id"], label=c["label"], passed=c["passed"], + severity=c["severity"], matched_text=c.get("matched_text", ""), + 
+ level=c.get("level", 1), parent=c.get("parent"), + skipped=c.get("skipped", False), hint=c.get("hint", ""), + )) + completeness = f.get("completeness_pct", 0) + correctness = f.get("correctness_pct", 0) + + # Master Control checks (all MCs for this doc_type, see max_controls below) + try: + # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has + # 1874 across 8 types; regex matching is cheap and dominates + # well under 1s per doc). Caps remain on the LLM-enrich step + # (top-10 FAILs) so cost stays bounded. + mc_results = await check_document_with_controls( + text, doc_type, label, max_controls=0, use_agent=use_agent, + business_scope=business_scope, + ) + if mc_results: + for mc in mc_results: + all_checks.append(CheckItem(**mc)) + l2 = [c for c in all_checks if c.level == 2 and not c.skipped] + l2_passed = sum(1 for c in l2 if c.passed) + correctness = round(l2_passed / len(l2) * 100) if l2 else 0 + except Exception as e: + logger.warning("MC check skipped for %s: %s", label, e) + + # LLM verification of regex fails + failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint] + if failed: + try: + from compliance.services.doc_checks.llm_verify import verify_failed_checks + overturns = await verify_failed_checks( + text, + [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed], + label, + ) + for c in all_checks: + if c.id in overturns and overturns[c.id]["overturned"]: + c.passed = True + c.matched_text = f"[LLM] {overturns[c.id]['evidence']}" + l2_active = [c for c in all_checks if c.level == 2 and not c.skipped] + l2_passed = sum(1 for c in l2_active if c.passed) + if l2_active: + correctness = round(l2_passed / len(l2_active) * 100) + except Exception as e: + logger.warning("LLM verification skipped: %s", e) + + # Cookie-policy only: actively HTTP-probe the Opt-Out + Privacy-Policy + # URLs the document advertises. Broken links make individual provider + # entries non-compliant under Art. 7(3) DSGVO. 
+ if doc_type == "cookie": + try: + from compliance.services.cookie_link_validator import ( + extract_links, validate_links, build_check_items, + ) + links = extract_links(text) + if links: + logger.info("Cookie-link validator: %d urls extracted from %s", + len(links), label) + validated = await validate_links(links) + for item in build_check_items(validated): + all_checks.append(CheckItem(**item)) + # Re-compute correctness with the new L2 items + l2_active = [c for c in all_checks if c.level == 2 and not c.skipped] + l2_passed = sum(1 for c in l2_active if c.passed) + if l2_active: + correctness = round(l2_passed / len(l2_active) * 100) + except Exception as e: + logger.warning("Cookie-link validation skipped for %s: %s", label, e) + + non_score = [f for f in findings if "SCORE" not in f.get("code", "")] + return DocCheckResult( + label=label, url=url, doc_type=doc_type, + word_count=word_count or len(text.split()), + completeness_pct=completeness, correctness_pct=correctness, + checks=all_checks, findings_count=len(non_score), + ) diff --git a/backend-compliance/compliance/api/agent_check/_state.py b/backend-compliance/compliance/api/agent_check/_state.py new file mode 100644 index 00000000..76beb3fb --- /dev/null +++ b/backend-compliance/compliance/api/agent_check/_state.py @@ -0,0 +1,58 @@ +"""Shared state for the compliance-check pipeline. + +The 7-step pipeline accumulates ~60 named values that flow across +phases (doc_entries, profile, results, banner_result, cmp_vendors, +scorecard, HTML blocks, …). Rather than threading 60 parameters +through each function, we pass one mutable `CheckState` dict. + +Phases read what they need with `state[key]` and write their outputs +with `state[key] = value`. This is intentionally untyped: enforcing +strict typing would require freezing the schema before all phases +landed, and the report-building phase routinely adds new optional +keys (P1, P10, P50, P59b, P82, P103, P104, P106, …). 
+ +`new_state(check_id, req)` initialises the dict with the few +keys that must exist from the start. +""" + +from __future__ import annotations + + +def new_state(check_id: str, req) -> dict: + """Create a fresh state dict for a check run. + + Pre-populates a few keys that downstream phases assume exist + (e.g. `cmp_vendors` defaulting to `[]`). + """ + return { + "check_id": check_id, + "req": req, + # Phase-1 outputs + "doc_texts": {}, + "doc_entries": [], + "url_text_cache": {}, + "pasted_table_vendors": [], + "placement_findings": [], + # Phase-2/3/4 outputs + "profile": None, + "profile_dict": {}, + "results": [], + "total_findings": 0, + "business_scope": set(), + "banner_result": None, + "banner_url": "", + "tcf_vendors": [], + "vvt_entries": [], + "extracted_profile": {}, + # Phase-5 outputs + "cmp_vendors": [], + "cookie_audit": {}, + "cookie_evidence_slices": None, + "cookie_evidence_meta": None, + "scorecard": {}, + "full_html": "", + "audit_quality_findings": [], + # Phase-6/7 outputs + "email_result": {"status": "skipped"}, + "site_name": "", + } diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 36ac607d..324326cc 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -4,72 +4,70 @@ Unified Compliance Check Routes — check all documents in one request. 
POST /compliance/agent/extract-text — extract text from a URL POST /compliance/agent/compliance-check — unified check for all documents GET /compliance/agent/compliance-check/{check_id} — poll status + +Phase 5 split (2026-06-06): the original 2700-line monolith is now +decomposed into the `agent_check/` subpackage: + - _orchestrator.py — thin run_compliance_check pipeline + - _phase_a_resolve.py — TDM + Step 1 (resolve / discover / split) + - _phase_b_profile_check.py — Step 2 + Step 3 (profile + doc checks) + - _phase_c_banner.py — Step 3b-d (banner + cross-check + TCF) + Step 4 + - _phase_d1_vendors_raw.py / _phase_d2_vendors_finalize.py — Step 5 + vendor extraction + finalize + - _phase_d3_blocks_top.py / mid / bot — Step 5 HTML blocks + - _phase_e_email.py — Step 6 (with A1 ZIP-Anhang) + - _phase_f_persist.py — Step 7 (snapshot + audit log + unified findings) + - _helpers.py / _constants.py / _state.py / _schemas.py — shared + +External callers (saving_scan_routes, agent_migration_routes, tests) +keep importing helpers from THIS module — everything is re-exported. 
""" +from __future__ import annotations + import asyncio import logging -import os -import re import uuid as _uuid -from dataclasses import asdict -from datetime import datetime, timezone import httpx from fastapi import APIRouter -from pydantic import BaseModel -from compliance.services.smtp_sender import send_email +# ── Re-exports: external callers import these from THIS module ────── +from .agent_check._constants import ( # noqa: F401 + CONSENT_TESTER_URL, + _ALL_DOC_TYPES, + _COMPOUND_TLDS, + _DISCOVERY_RULES, + _DOC_TYPE_LABELS, + _compliance_check_jobs, +) +from .agent_check._discovery import _autodiscover_missing # noqa: F401 +from .agent_check._fetch import _fetch_text # noqa: F401 +from .agent_check._helpers import ( # noqa: F401 + _apply_profile_filter, + _build_profile_html, + _classify_discovered_doc, + _company_name_from_url, + _doc_type_label, + _extract_domain, + _get_skip_types, + _pad_results_with_missing, + _result_to_dict, + _update, +) +from .agent_check._orchestrator import run_compliance_check as _run_compliance_check # noqa: F401 +from .agent_check._schemas import ( + ComplianceCheckRequest, + ComplianceCheckStartResponse, + ComplianceCheckStatusResponse, + DocumentInput, + ExtractTextRequest, +) +from .agent_check._single_check import _check_single # noqa: F401 logger = logging.getLogger(__name__) router = APIRouter(prefix="/compliance/agent", tags=["agent"]) -CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094" - -# In-memory job store (same pattern as doc-check) -_compliance_check_jobs: dict[str, dict] = {} - - -# ── Models ─────────────────────────────────────────────────────────── - -class ExtractTextRequest(BaseModel): - url: str - - -class DocumentInput(BaseModel): - doc_type: str # dse, agb, impressum, cookie, widerruf, avv, loeschkonzept, etc. 
- url: str = "" - text: str = "" # text has priority over URL - - -class ComplianceCheckRequest(BaseModel): - documents: list[DocumentInput] - use_agent: bool = False - recipient: str = "dsb@breakpilot.local" - # P12: Override fuer TDM-Vorbehalt bei dokumentierter Kunden-Erlaubnis. - # Pflichtfeld tdm_override_reason wenn tdm_override=True - # (z.B. "Auftragsbeziehung Safetykon GmbH, Email Hr. X 18.05.2026"). - tdm_override: bool = False - tdm_override_reason: str = "" - # P79: 8-Feld Pre-Scan-Wizard (Branche, B2B/B2C, Direkt-Vertrieb, - # Rechtsform, Konzern, MA, Besondere Daten, Drittland). Wird im - # Snapshot persistiert und filtert die MC-Auswertung (P72). - scan_context: dict | None = None - - -class ComplianceCheckStartResponse(BaseModel): - check_id: str - status: str = "running" - - -class ComplianceCheckStatusResponse(BaseModel): - check_id: str - status: str - progress: str = "" - progress_pct: int = 0 - result: dict | None = None - error: str = "" - # ── Extract text endpoint ──────────────────────────────────────────── @@ -214,15 +212,12 @@ async def benchmark( anonymized: bool = False, limit: int = 50, ): - """P107 — Branchen-Benchmark-Cockpit Endpoint. - industry: 'automotive' / 'banking' / etc (optional) - sites: comma-separated site_label list (optional) - anonymized: bool — wenn true, Hersteller-Namen → 'OEM 1/2/3' - """ + """P107 — Branchen-Benchmark-Cockpit Endpoint.""" from database import SessionLocal from compliance.services.benchmark_extractor import ( - load_snapshots_for_benchmark, anonymize_kpis, + anonymize_kpis, build_benchmark_summary, + load_snapshots_for_benchmark, ) site_list = [s.strip() for s in sites.split(",") if s.strip()] if sites else None db = SessionLocal() @@ -245,9 +240,7 @@ async def benchmark( @router.post("/admin/tcf-ingest") async def tcf_ingest(): - """P105 — IAB TCF Vendor-Liste ingestieren / refreshen. - Idempotent: holt aktuelle GVL und upserted in compliance.cookie_library - mit source='iab_tcf_v2'. 
Aufruf ein paar Mal pro Jahr ausreichend.""" + """P105 — IAB TCF Vendor-Liste ingestieren / refreshen.""" from database import SessionLocal from compliance.services.tcf_vendor_authority import ( fetch_and_ingest_tcf_vendors, @@ -306,2344 +299,6 @@ async def replay_snapshot( db.close() -async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): - """Background task: check all documents with business-profile context.""" - try: - from compliance.services.business_profiler import detect_business_profile - from compliance.services.doc_checks.runner import check_document_completeness - from compliance.services.rag_document_checker import check_document_with_controls - from .agent_doc_check_routes import CheckItem, DocCheckResult - from .agent_doc_check_report import build_html_report - - # Reset anchor-locator cache per run (avoid cross-run leak) - try: - from compliance.services.doc_anchor_locator import reset_cache - reset_cache() - except Exception: - pass - - # P7: TDM-Reservation-Check der Base-Domain (§ 44b UrhG). - # Bei reserved/denied: Run sofort beenden, kein Crawl. - try: - from compliance.services.tdm_reservation_check import ( - check_tdm_reservation, is_crawl_allowed, - ) - first_url = next( - (d.url for d in req.documents if d.url), "", - ) - if first_url: - tdm = await check_tdm_reservation(first_url) - _compliance_check_jobs[check_id]["tdm"] = tdm - # P12: Bei tdm_override + Reason wird NICHT abgebrochen, - # sondern nur dokumentiert. Override ohne Reason wird ignoriert. - override_active = ( - req.tdm_override - and len((req.tdm_override_reason or "").strip()) >= 10 - ) - if not is_crawl_allowed(tdm) and not override_active: - _compliance_check_jobs[check_id]["status"] = "skipped_tdm" - _compliance_check_jobs[check_id]["error"] = ( - f"TDM-Vorbehalt fuer {tdm.get('domain')} erkannt " - f"(status={tdm.get('status')}) — Crawl nach § 44b " - f"UrhG nicht zulaessig. 
Signals: " - f"{[s.get('src') for s in tdm.get('signals', [])]}" - ) - _compliance_check_jobs[check_id]["progress_pct"] = 100 - logger.info("TDM-skip check_id=%s domain=%s status=%s", - check_id, tdm.get("domain"), tdm.get("status")) - return - if override_active and not is_crawl_allowed(tdm): - _compliance_check_jobs[check_id]["tdm_override"] = { - "reason": req.tdm_override_reason.strip()[:500], - "original_status": tdm.get("status"), - } - logger.warning( - "TDM-Override aktiv: check_id=%s domain=%s " - "status=%s reason=%r", - check_id, tdm.get("domain"), tdm.get("status"), - req.tdm_override_reason.strip()[:80], - ) - except Exception as e: - logger.warning("TDM-check failed (proceeding): %s", e) - - # Step 1: Resolve texts (fetch from URL if needed) — 0-30% - _update(check_id, "Texte werden geladen...", 1) - doc_texts: dict[str, str] = {} - doc_entries: list[dict] = [] - - # Cache fetched URLs to detect duplicates - url_text_cache: dict[str, str] = {} - - n_docs = max(1, len(req.documents)) - # User-pasted-Tabellen-Vendors (kein LLM noetig) — werden weiter - # unten in cmp_vendors gemerged. - pasted_table_vendors: list[dict] = [] - for i, doc in enumerate(req.documents): - pct = int(1 + (i / n_docs) * 29) - _update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct) - text = (doc.text or "").strip() - input_source = "url" - cmp_payloads: list[dict] = [] - if text: - input_source = "text" - if doc.url: - input_source = "text+url" # User hat beide gefuellt - logger.info( - "doc_type=%s: User hat URL UND Text geliefert — " - "Text gewinnt, URL wird als Quellen-Referenz behalten", - doc.doc_type, - ) - elif doc.url: - url_key = doc.url.strip().rstrip("/").lower() - if url_key in url_text_cache: - text = url_text_cache[url_key] - else: - text, cmp_payloads = await _fetch_text(doc.url, doc_type=doc.doc_type) - if text: - url_text_cache[url_key] = text - - # Auto-Reclassify-Check: wenn der user Text in das falsche - # Doc-Type-Feld kopiert hat (z.B. 
Impressum-Text in DSE), - # erkennen und ggf. umtaggen. - actual_doc_type = doc.doc_type - reclassify_hint: dict | None = None - if input_source.startswith("text") and len(text) >= 500: - try: - from compliance.services.doc_type_classifier import ( - detect_mismatch, - ) - reclassify_hint = detect_mismatch(doc.doc_type, text) - if reclassify_hint and reclassify_hint["action"] == "reclassify": - actual_doc_type = reclassify_hint["detected"] - logger.info( - "doc_type AUTO-RECLASSIFY: deklariert=%s " - "erkannt=%s (score %d vs %d) — uebernehme erkannten Typ", - doc.doc_type, actual_doc_type, - reclassify_hint["detected_score"], - reclassify_hint["declared_score"], - ) - except Exception as e: - logger.warning("doc_type_classifier failed: %s", e) - - # Cookie-Tabelle: wenn User Tabelle reinkopiert hat, deterministisch - # parsen (kein LLM noetig) und Vendors gleich ableiten. - if input_source.startswith("text") and actual_doc_type == "cookie": - try: - from compliance.services.cookies_table_parser import ( - parse_cookie_table, - ) - tab_vendors = parse_cookie_table(text) - if tab_vendors: - pasted_table_vendors.extend(tab_vendors) - logger.info( - "Cookie-Tabelle erkannt im pasted Text — " - "%d Vendors / %d Cookies deterministisch geparst", - len(tab_vendors), - sum(len(v.get("cookies", [])) for v in tab_vendors), - ) - except Exception as e: - logger.warning("cookies_table_parser failed: %s", e) - - if text: - doc_texts[actual_doc_type] = text - doc_entries.append({ - "doc_type": actual_doc_type, - "declared_doc_type": doc.doc_type, - "url": doc.url, - "text": text, - "word_count": len(text.split()) if text else 0, - "auto_discovered": False, - "discovery_attempted": False, - "cmp_payloads": cmp_payloads, - "input_source": input_source, - "reclassify_hint": reclassify_hint, - }) - - # Step 1a-bis: AUTO-DISCOVERY. For each canonical doc_type the user - # did NOT submit a URL/text for, try to find it on the homepage of - # the submitted URLs. 
This bridges the gap between "user knows the - # exact URL" (rare) and "user pasted the homepage" (common). - await _autodiscover_missing( - check_id, doc_entries, doc_texts, url_text_cache, - ) - - # Step 1b: Section splitting — two cases: - # 1. Same URL used for multiple doc_types → split by heading - # 2. DSI text contains Cookie/Social-Media sections → auto-fill empty rows - from compliance.services.section_splitter import ( - split_shared_texts, auto_fill_from_dsi, cross_search_documents, - ) - split_shared_texts(doc_entries, url_text_cache) - auto_fill_from_dsi(doc_entries) - - # Step 1c: Cross-document search — find doc_types in wrong documents (30-35%) - _update(check_id, "Dokumente werden uebergreifend durchsucht...", 32) - placement_findings = cross_search_documents(doc_entries) - - # Refresh doc_texts after all splitting/searching - for entry in doc_entries: - if entry.get("text"): - doc_texts[entry["doc_type"]] = entry["text"] - - # P15: Dedupe — wenn mehrere Doc-Types DASSELBE Dokument referenzieren - # (z.B. Safetykon: User gibt /datenschutz fuer dse + cookie + widerruf), - # behalten wir nur den primaeren Doc-Type. Andere: leeren + note. 
- # Priorität: dse > impressum > cookie > widerruf > agb > nutzungsbedingungen - _DOC_PRIORITY = ["dse", "impressum", "cookie", "widerruf", "agb", - "nutzungsbedingungen", "social_media", "dsb"] - seen_text_hash: dict[int, str] = {} - for dt in _DOC_PRIORITY: - entry = next((e for e in doc_entries if e.get("doc_type") == dt - and e.get("text")), None) - if not entry: - continue - text_hash = hash((entry.get("text") or "").strip()[:1000]) - if text_hash in seen_text_hash: - primary = seen_text_hash[text_hash] - logger.info( - "P15 dedup: doc_type=%s referenziert dasselbe Dokument " - "wie %s (URL=%s) -> als Duplikat markiert.", - dt, primary, entry.get("url", "")[:60], - ) - entry["text"] = "" - entry["word_count"] = 0 - entry["url"] = "" - entry["dup_of"] = primary - doc_texts.pop(dt, None) - else: - seen_text_hash[text_hash] = dt - - # Step 2: Detect business profile (35-40%) - _update(check_id, "Geschaeftsmodell wird erkannt...", 37) - # P16: Homepage-Text mit fuer Profile-Detection (no_direct_sales - # B2B-Indikatoren wie "CE-Zertifizierung" / "Schulungen" stehen oft - # nur im Homepage-Menue, nicht im Pflichttext). 
- profile_input = dict(doc_texts) - try: - base_url = "" - for e in doc_entries: - if e.get("url"): - from urllib.parse import urlparse - p = urlparse(e["url"]) - if p.scheme and p.netloc: - base_url = f"{p.scheme}://{p.netloc}/" - break - if base_url: - import re as _re - async with httpx.AsyncClient( - timeout=8.0, follow_redirects=True, - headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) " - "AppleWebKit/537.36 HeadlessChrome/120.0.0.0"}, - ) as _hc: - _hr = await _hc.get(base_url) - if _hr.status_code == 200 and "text/html" in _hr.headers.get( - "content-type", ""): - _html = _hr.text[:60000] - _html = _re.sub(r"]*>.*?", " ", - _html, flags=_re.DOTALL | _re.IGNORECASE) - _html = _re.sub(r"]*>.*?", " ", - _html, flags=_re.DOTALL | _re.IGNORECASE) - _html = _re.sub(r"<[^>]+>", " ", _html) - _html = _re.sub(r"\s+", " ", _html).strip() - if len(_html.split()) > 30: - profile_input["__homepage"] = _html[:20000] - logger.info("P16 homepage merged for profile: %d words", - len(_html.split())) - except Exception as e: - logger.debug("homepage fetch for profile failed: %s", e) - profile = await detect_business_profile(profile_input) - profile_dict = asdict(profile) - - # Step 3: Check each document - results: list[DocCheckResult] = [] - total_findings = 0 - use_agent_flag = req.use_agent or os.getenv( - "COMPLIANCE_USE_AGENT", "false" - ).lower() == "true" - - # Filter out doc_types that don't apply to this business profile - skip_types = _get_skip_types(profile) - - # Derive business_scope hints for the MC filter (O1 — Doc-type Scope-Flag). - # MCs that explicitly require a feature (e.g. 'biometric_processing', - # 'ai_decision_making', 'child_targeting') get dropped when the - # detected profile doesn't declare it. 
- business_scope: set[str] = set() - for svc in (getattr(profile, "detected_services", []) or []): - business_scope.add(str(svc).lower()) - if (getattr(profile, "business_type", "") or "").lower() == "b2c": - business_scope.add("b2c") - if getattr(profile, "has_online_shop", False): - business_scope.add("ecommerce") - if getattr(profile, "is_regulated_profession", False): - business_scope.add("regulated_profession") - - # Document checks: 40-80% - n_entries = max(1, len(doc_entries)) - for i, entry in enumerate(doc_entries): - text = entry["text"] - doc_type = entry["doc_type"] - label = _doc_type_label(doc_type) - url = entry["url"] - - if doc_type in skip_types: - results.append(DocCheckResult( - label=label, url=url, doc_type=doc_type, - error=skip_types[doc_type], - )) - continue - - pct = int(40 + (i / n_entries) * 40) - _update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct) - - if not text or len(text) < 50: - # P15: duplicate doc that was deduped against a primary doc - if entry.get("dup_of"): - results.append(DocCheckResult( - label=label, url="", doc_type=doc_type, - error=f"Nicht separat vorhanden — wird im Dokument " - f"'{_doc_type_label(entry['dup_of'])}' " - f"mit-geprueft.", - )) - continue - # P24: DSB-Kontakt ist Pflichtangabe in der DSE (Art. 13(1)(b) - # DSGVO) — wenn kein separates DSB-Dokument vorliegt, ist das - # KEIN Fehler. DSB-Pruefung passiert ohnehin in der DSE. - if doc_type == "dsb" and not (entry.get("url") or "").strip(): - results.append(DocCheckResult( - label=label, url="", doc_type=doc_type, - error="Nicht separat vorhanden — DSB-Kontaktdaten " - "werden in der Datenschutzerklaerung als " - "Pflichtangabe nach Art. 13(1)(b) DSGVO geprueft.", - )) - continue - # Empty entry — either from auto-discovery padding (no URL - # to fetch) or from a fetch that returned nothing. 
If there - # was a URL we keep the error so the user knows the fetch - # failed; otherwise let the padding step label it - # 'Nicht eingereicht' / 'Auf der Website nicht gefunden'. - if (entry.get("url") or "").strip(): - results.append(DocCheckResult( - label=label, url=url, doc_type=doc_type, - error="Kein Text vorhanden oder zu kurz", - )) - continue - - result = await _check_single( - text, doc_type, label, url, - entry["word_count"], use_agent_flag, - business_scope=business_scope, - business_profile={"no_direct_sales": getattr(profile, "no_direct_sales", False)}, - ) - - # Apply profile context filter - result = _apply_profile_filter(result, profile, doc_type) - - # Add placement findings — but only if the regex checks confirm - # the text doesn't match. If completeness >= 50%, the text IS the - # right doc_type despite missing cross-search keywords. - if result.completeness_pct < 50: - for pf in placement_findings: - if pf.get("doc_type") == doc_type: - result.checks.insert(0, CheckItem(**{ - k: v for k, v in pf.items() if k != "doc_type" - })) - - results.append(result) - total_findings += result.findings_count - - # Step 3b: Banner-Check (automatic, uses first URL or homepage) - banner_result = None - banner_url = req.documents[0].url if req.documents and req.documents[0].url else "" - # Use the homepage (strip path) for banner check - if banner_url: - from urllib.parse import urlparse - parsed = urlparse(banner_url) - banner_url = f"{parsed.scheme}://{parsed.netloc}" - if banner_url: - _update(check_id, "Cookie-Banner wird geprueft...", 82) - try: - async with httpx.AsyncClient(timeout=900.0) as client: # P50: +10min for vendor-detail-phase - resp = await client.post( - f"{CONSENT_TESTER_URL}/scan", - json={"url": banner_url, "timeout_per_phase": 10}, - ) - if resp.status_code == 200: - banner_result = resp.json() - except Exception as e: - logger.warning( - "Banner check failed: %s (%s)", e or "", type(e).__name__ - ) - - # Step 3c: Cross-check Banner vs 
Cookie-Richtlinie (88-90%) - if banner_result and "cookie" in doc_texts: - _update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...", 89) - cross_findings = _cross_check_banner_vs_cookie( - banner_result, doc_texts["cookie"], - ) - if cross_findings: - for r in results: - if r.doc_type == "cookie": - for cf in cross_findings: - r.checks.append(CheckItem(**cf)) - l2 = [c for c in r.checks if c.level == 2 and not c.skipped] - l2p = sum(1 for c in l2 if c.passed) - r.correctness_pct = round(l2p / len(l2) * 100) if l2 else 0 - - # Step 3d: TCF Vendor cross-check against DSI - tcf_vendors = banner_result.get("tcf_vendors", []) if banner_result else [] - vvt_entries: list[dict] = [] - if tcf_vendors and "dse" in doc_texts: - _update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...", 91) - from compliance.services.banner_cookie_cross_check import cross_check_vendors_vs_dsi - from compliance.services.vendor_vvt_mapper import map_vendors_to_vvt - vendor_findings = cross_check_vendors_vs_dsi(tcf_vendors, doc_texts["dse"]) - if vendor_findings: - for r in results: - if r.doc_type == "dse": - for vf in vendor_findings: - r.checks.append(CheckItem(**vf)) - vvt_entries = map_vendors_to_vvt(tcf_vendors) - - # Step 4: Extract profile hints from documents (92-95%) - _update(check_id, "Profil wird aus Dokumenten extrahiert...", 93) - from compliance.services.profile_extractor import extract_profile_from_documents - extracted_profile = extract_profile_from_documents(doc_texts, profile_dict) - - # Step 4b: Determine scenario per document - for r in results: - if r.error: - r.scenario = "skip" - elif r.completeness_pct < 30: - r.scenario = "regenerate" - elif r.completeness_pct < 95: - r.scenario = "fix" - else: - r.scenario = "import" - - # Step 4c: Always render all 8 canonical doc types. 
Missing types - # are differentiated: - # - Discovery was tried but found nothing -> 'Auf der Website - # nicht gefunden' (suggest user provides URL manually) - # - No submitted URLs at all -> 'Nicht eingereicht' - attempted = { - e["doc_type"] for e in doc_entries if e.get("discovery_attempted") - } - results = _pad_results_with_missing(results, discovery_attempted=attempted) - - # Step 5: Build report with management summary (95-98%) - _update(check_id, "Report wird erstellt...", 96) - from .agent_doc_check_report import ( - build_management_summary, - build_scanned_urls_html, - build_provider_list_html, - ) - from .agent_doc_check_extras import build_vvt_table_html - - # Extract structured vendor records from any CMP payloads captured - # for the cookie doc (BMW ePaaS, OneTrust, etc.), validate their - # opt-out + privacy URLs concurrently, score each entry. - cmp_vendors: list[dict] = [] - try: - from compliance.services.vendor_extractor import ( - extract_vendors_from_payloads, - ) - from compliance.services.cookie_link_validator import ( - validate_vendor_urls, score_vendors, - ) - cookie_payloads = [] - cookie_text = "" - # P30: aggregate cmp_payloads from ALL doc_entries — sites - # like Mercedes load Usercentrics only on the homepage, so - # the JSON gets captured during DSE/Impressum discovery, not - # in the cookies.html fetch. Dedup by URL since the same - # payload is captured on every page load. - seen_cmp_urls: set[str] = set() - for e in doc_entries: - for p in (e.get("cmp_payloads") or []): - p_url = p.get("url") or "" - if p_url and p_url in seen_cmp_urls: - continue - seen_cmp_urls.add(p_url) - cookie_payloads.append(p) - if e.get("doc_type") == "cookie" and e.get("text"): - cookie_text = e["text"] - # P48: also pull cmp_payloads from the Banner-Scan (homepage - # 3-phase consent test). Mercedes' Usercentrics-JSON is - # captured there even when not in DSI-Discovery of static - # legal pages. 
- if banner_result: - for p in (banner_result.get("cmp_payloads") or []): - p_url = p.get("url") or "" - if p_url and p_url in seen_cmp_urls: - continue - seen_cmp_urls.add(p_url) - cookie_payloads.append(p) - if cookie_payloads: - logger.info("P48: %d CMP-payloads available for vendor-extract (after Banner-Scan merge)", - len(cookie_payloads)) - # P17-D: Fallback wenn cookie via P15 deduped wurde — nutze DSE-Text - # sofern Cookie-Begriffe drin sind, damit LLM-Vendor-Extract trotzdem - # greifen kann. - if not cookie_text and not cookie_payloads: - dse_t = doc_texts.get("dse", "") - if dse_t and any(w in dse_t.lower() for w in - ("cookie", "tracking", "google analytics", "consent")): - cookie_text = dse_t - logger.info("P17-D: vendor-extract Fallback auf DSE (Cookie deduped)") - # Site-owner derived from the submitted URLs — drives the - # INTERNAL/GROUP_COMPANY classification of vendor records. - owner_name = _company_name_from_url(doc_entries) or "" - if cookie_payloads: - cmp_vendors = extract_vendors_from_payloads( - cookie_payloads, owner_name=owner_name, - ) - # P52: LLM-Fallback nicht nur wenn 0 Vendors, sondern auch - # wenn die strukturierten Quellen < 5 Vendors lieferten und - # der Cookie-Text substantiell ist. So holt sich VW-typische - # Setups (Generic CMP, 28 Cookies aber 0 cmp_payloads) noch - # ihre echten Vendors aus dem Text. - if (len(cmp_vendors) < 5 - and cookie_text and len(cookie_text.split()) >= 500): - from compliance.services.vendor_llm_extractor import ( - extract_vendors_via_llm, - ) - from compliance.services.vendor_classifier import classify - _update(check_id, "Vendor-Liste per LLM extrahieren...", 94) - llm_vendors = await extract_vendors_via_llm(cookie_text) - # P52: classify die LLM-Vendors und MERGE mit existing - # statt zu ueberschreiben. 
- existing_names = {(v.get("name") or "").strip().lower() - for v in cmp_vendors} - added_llm = 0 - for v in llm_vendors: - nm = (v.get("name") or "").strip() - if not nm or nm.lower() in existing_names: - continue - v["recipient_type"] = classify( - vendor_name=nm, - category=v.get("category", ""), - owner_name=owner_name, - ) - v.setdefault("source", "llm_cascade") - cmp_vendors.append(v) - existing_names.add(nm.lower()) - added_llm += 1 - if added_llm: - logger.info( - "P52 LLM-Cascade: +%d Vendors (total: %d)", - added_llm, len(cmp_vendors), - ) - # P57: Phase G vendor_details als zusätzliche Vendor-Quelle. - # Wenn extract_vendors_from_payloads weniger findet als - # Phase G's Info-Click-Through (z.B. Mercedes-Settings nicht - # erkannt als usercentrics-kind), die Phase-G-Namen als - # eigenständige Vendors hinzufügen. - if banner_result: - vd_list = banner_result.get("vendor_details") or [] - vd_list = [v for v in vd_list if v.get("name") != "__TDM_OPTOUT__"] - existing_names = {(v.get("name") or "").strip().lower() - for v in cmp_vendors} - added = 0 - for d in vd_list: - n = (d.get("name") or "").strip() - if not n or n.lower() in existing_names: - continue - # Skip generic category-labels (Mercedes-Kategorien) - if n.lower() in ("technisch erforderlich", "analyse und statistik", - "marketing", "alles auswählen", - "alles auswaehlen"): - continue - from compliance.services.vendor_classifier import classify - cmp_vendors.append({ - "name": n, - "country": "", - "purpose": d.get("description", "")[:500], - "category": "", - "opt_out_url": d.get("opt_out_url", ""), - "privacy_policy_url": d.get("privacy_url", ""), - "persistence": d.get("retention", ""), - "cookies": d.get("cookies", []), - "processing_company": d.get("processing_company", ""), - "address": d.get("address", ""), - "purposes": d.get("purposes", []), - "technologies": d.get("technologies", []), - "recipient_type": classify( - vendor_name=n, category="", owner_name=owner_name, - ), - }) - 
existing_names.add(n.lower()) - added += 1 - if added: - logger.info("P57: added %d new vendors from Phase G (total: %d)", - added, len(cmp_vendors)) - - # D — HTML-Tabellen die der consent-tester aus dem DOM - # extrahiert hat: direkt deterministisch parsen (hoechste - # Genauigkeit, keine LLM-Halluzinationen). - for pl in (cookie_payloads or []): - if pl.get("kind") != "html_table": - continue - rows = pl.get("rows") or [] - if len(rows) < 3: - continue - try: - from compliance.services.cookies_table_parser import ( - parse_cookie_table as _parse_ct_d, - ) - table_text = "\n".join(rows) - d_vendors = _parse_ct_d(table_text) - if d_vendors: - existing_d = {(v.get("name") or "").strip().lower() - for v in cmp_vendors} - added_d = 0 - for v in d_vendors: - nm = (v.get("name") or "").strip() - if not nm or nm.lower() in existing_d: - continue - v.setdefault("source", "html_table_dom") - cmp_vendors.append(v) - existing_d.add(nm.lower()) - added_d += 1 - if added_d: - logger.info( - "D HTML-Table-DOM-Parse: +%d Vendors aus " - "%d-Zeilen-Tabelle (total: %d)", - added_d, len(rows), len(cmp_vendors), - ) - except Exception as e: - logger.warning("html_table parse failed: %s", e) - - # B — cookies_table_parser auch auf gecrawltem Cookie-Text. - # Erst Standard-Parse (Tab/Pipe-getrennt). Wenn der nichts - # findet (kein Separator), Flat-Pattern-Parse fuer Sites wie - # VW die ihre Tabelle als flachen Text liefern. 
- if cookie_text and len(cookie_text) >= 500: - try: - from compliance.services.cookies_table_parser import ( - parse_cookie_table as _parse_ct, - parse_flat_cookie_text as _parse_flat, - ) - crawled_table_vendors = _parse_ct(cookie_text) - if not crawled_table_vendors: - crawled_table_vendors = _parse_flat(cookie_text) - if crawled_table_vendors: - existing = {(v.get("name") or "").strip().lower() - for v in cmp_vendors} - added_c = 0 - for v in crawled_table_vendors: - nm = (v.get("name") or "").strip() - if not nm or nm.lower() in existing: - continue - v.setdefault("source", "table_crawled") - cmp_vendors.append(v) - existing.add(nm.lower()) - added_c += 1 - if added_c: - logger.info( - "B Crawled-Tabellen-Parse: +%d Vendors " - "(total: %d)", - added_c, len(cmp_vendors), - ) - except Exception as e: - logger.warning("crawled-table-parse failed: %s", e) - - # C — Screenshot + Tesseract-OCR der Cookie-Richtlinie. - # Overlapping scrolling screenshots (jede Slice ueberlappt die - # vorherige um overlap_px Pixel) → lueckenlose Beweiskette. - # Pro Slice Tesseract OCR + parse_ocr_cookie_table; Dedup nach - # Cookie-Name über alle Slices. Site-unabhaengig, deterministisch. 
- cookie_url_for_shot = "" - for _e in doc_entries: - if _e.get("doc_type") == "cookie" and _e.get("url"): - cookie_url_for_shot = _e["url"]; break - cookie_evidence_slices: list[dict] | None = None - cookie_evidence_meta: dict | None = None - if cookie_url_for_shot: - try: - from compliance.services.cookie_screenshot_ocr import ( - capture_cookie_evidence_slices, - ocr_slices_extract_cookies, - cookies_to_vendor_records, - ) - from compliance.services.cookies_table_parser import ( - _guess_vendor as _gv, - ) - _update(check_id, - "Cookie-Richtlinie wird fotografiert (lueckenlose Beweiskette)...", - 92) - ev = await capture_cookie_evidence_slices( - cookie_url_for_shot, check_id=check_id, - viewport_h=1024, overlap_px=200, max_slices=40, - ) - if ev.get("slices"): - cookie_evidence_slices = ev["slices"] # ZIP-Anhang - cookie_evidence_meta = { - "total_height_px": ev.get("total_height_px"), - "width_px": ev.get("width_px"), - "accepted_banner": ev.get("accepted_banner"), - "expanded": ev.get("expanded"), - "url": ev.get("url"), - "slice_count": len(ev["slices"]), - } - _update(check_id, - "Tesseract OCR über alle Slices...", 93) - ocr_cookies, ocr_stats = ocr_slices_extract_cookies( - ev["slices"], - ) - if ocr_cookies: - ocr_vendors = cookies_to_vendor_records( - ocr_cookies, guess_vendor_fn=_gv, - ) - existing = { - (v.get("name") or "").strip().lower() - for v in cmp_vendors - } - added_v = 0 - for v in ocr_vendors: - nm = (v.get("name") or "").strip() - if not nm: - continue - if nm.lower() in existing: - for ex in cmp_vendors: - if (ex.get("name") or "").strip().lower() == nm.lower(): - ex_names = { - (c.get("name") or "").lower() - for c in (ex.get("cookies") or []) - } - for c in (v.get("cookies") or []): - if c["name"].lower() not in ex_names: - ex.setdefault("cookies", []).append(c) - ex_names.add(c["name"].lower()) - cur_src = ex.get("source", "") - if "tesseract_ocr" not in cur_src: - ex["source"] = (cur_src + ";tesseract_ocr").strip(";") - break - 
continue - cmp_vendors.append(v) - existing.add(nm.lower()) - added_v += 1 - logger.info( - "C Tesseract-OCR: +%d Vendors / %d Cookies " - "(über %d Slices, total: %d)", - added_v, len(ocr_cookies), - ocr_stats.get("slices", 0), len(cmp_vendors), - ) - except Exception as e: - logger.warning( - "Tesseract-OCR pipeline failed: %s (%s)", - str(e) or "(no msg)", type(e).__name__, - ) - - # User-pasted Cookie-Tabelle (deterministisch, kein LLM): - # die hat IMMER Vorrang weil 100% genau. - if pasted_table_vendors: - existing = {(v.get("name") or "").strip().lower() - for v in cmp_vendors} - added_p = 0 - for v in pasted_table_vendors: - nm = (v.get("name") or "").strip() - if not nm or nm.lower() in existing: - continue - cmp_vendors.append(v) - existing.add(nm.lower()) - added_p += 1 - if added_p: - logger.info( - "Pasted-Tabellen-Merge: +%d Vendors (total: %d)", - added_p, len(cmp_vendors), - ) - - # Cookie-Library-Fallback (P52 Lite): wenn weiterhin wenige - # Vendors aber viele after_accept-Cookies, aus Library auflösen. - # VW-Lehre: 6 LLM-Grob-Vendors reichen NICHT — die Library - # holt 30+ weitere aus den Cookie-Namen + Cookie-Doc-Pattern. - # Schwelle: immer probieren wenn < 20 Vendors. 
- if banner_result and len(cmp_vendors) < 20: - try: - from compliance.services.cookie_to_vendor_fallback import ( - fallback_vendors_for_run, - ) - from database import SessionLocal as _SLfb - _fb_db = _SLfb() - try: - extra = fallback_vendors_for_run( - _fb_db, banner_result, len(cmp_vendors), - cookie_doc_text=cookie_text, - ) - if extra: - existing_names = {(v.get("name") or "").strip().lower() - for v in cmp_vendors} - for v in extra: - if v["name"].lower() in existing_names: - continue - cmp_vendors.append(v) - logger.info( - "Cookie-Library-Fallback: cmp_vendors %d -> %d", - len(cmp_vendors) - len(extra), len(cmp_vendors), - ) - finally: - _fb_db.close() - except Exception as e: - logger.warning("Cookie-Library-Fallback skipped: %s", e) - - # Vendor-Normalizer: Dedup (Google-Familie etc) + Garbage-Filter - try: - from compliance.services.vendor_normalizer import ( - normalize_vendors as _norm_v, - ) - cmp_vendors = _norm_v(cmp_vendors) - except Exception as e: - logger.warning("vendor_normalizer skipped: %s", e) - - # P50: enrich vendors with per-vendor detail-modal-extracts - # (description, opt-out URL, privacy URL, cookies). Detail - # comes from Phase G Info-button-click-through in /scan. - tdm_opt_out_notice = "" - if cmp_vendors and banner_result: - vendor_details = banner_result.get("vendor_details") or [] - # P50f: filter out TDM-opt-out sentinel - tdm_sentinel = next((v for v in vendor_details - if v.get("name") == "__TDM_OPTOUT__"), None) - if tdm_sentinel: - tdm_opt_out_notice = tdm_sentinel.get("description", "") - logger.info("P50f: TDM opt-out — skipped detail-enrichment for vendors") - vendor_details = [v for v in vendor_details - if v.get("name") != "__TDM_OPTOUT__"] - if vendor_details: - details_by_name = {} - for d in vendor_details: - n = (d.get("name") or "").strip().lower() - if n: - details_by_name[n] = d - enriched = 0 - for v in cmp_vendors: - key = (v.get("name") or "").strip().lower() - # Substring fallback for fuzzy matches (e.g. 
- # "Google Analytics" detail-name may differ slightly) - d = details_by_name.get(key) - if not d: - for dn, dv in details_by_name.items(): - if key in dn or dn in key: - d = dv - break - if not d: - continue - if not v.get("country") and (d.get("processing_company") or d.get("address")): - # Heuristic country extract from address (DE/EU keywords) - addr = d.get("address", "") - if re.search(r"\b(deutschland|germany|berlin|m(?:ue|ü)nchen|hamburg|stuttgart)\b", addr, re.I): - v["country"] = "DE" - elif re.search(r"\bireland|irland|dublin\b", addr, re.I): - v["country"] = "IE" - elif re.search(r"\busa|united states|california|new york|delaware\b", addr, re.I): - v["country"] = "US" - if not v.get("purpose"): - v["purpose"] = d.get("description", "")[:500] - if not v.get("opt_out_url"): - v["opt_out_url"] = d.get("opt_out_url", "") - if not v.get("privacy_policy_url"): - v["privacy_policy_url"] = d.get("privacy_url", "") - if not v.get("cookies"): - v["cookies"] = d.get("cookies", []) - v["purposes"] = d.get("purposes", []) - v["technologies"] = d.get("technologies", []) - if not v.get("persistence"): - v["persistence"] = d.get("retention", "") - v["processing_company"] = d.get("processing_company", "") - v["address"] = d.get("address", "") - enriched += 1 - logger.info("P50: enriched %d/%d vendors with detail-modal data", - enriched, len(cmp_vendors)) - # P59b: Cookie-Behavior-Validator — pruefe alle gesetzten Cookies - # gegen unsere Library, generiere 3-Tier-Severity-Findings. - # Background-Task hat keinen DB-Dependency-Inject -> SessionLocal - # selber oeffnen + sauber schliessen. 
- cookie_behavior_findings: list[dict] = [] - if banner_result: - cookies_detailed = banner_result.get("cookies_detailed") or [] - if cookies_detailed: - cb_session = None - try: - from database import SessionLocal - from compliance.services.cookie_behavior_validator import ( - validate_cookie_behavior, - ) - from urllib.parse import urlparse - fp_domain = "" - if banner_url: - fp_domain = urlparse(banner_url).netloc.replace("www.", "") - cb_session = SessionLocal() - cookie_behavior_findings = validate_cookie_behavior( - cb_session, cookies_detailed, - network_requests=[], # TODO Layer B in P59d - first_party_domain=fp_domain, - ) - if cookie_behavior_findings: - sevs = {f["severity"] for f in cookie_behavior_findings} - logger.info( - "P59b: Cookie-Behavior-Check %d findings " - "(severities: %s) ueber %d Cookies", - len(cookie_behavior_findings), - sorted(sevs), - len(cookies_detailed), - ) - banner_result["cookie_behavior_findings"] = ( - cookie_behavior_findings - ) - else: - logger.info( - "P59b: Cookie-Behavior-Check 0 findings " - "ueber %d Cookies (library miss / clean)", - len(cookies_detailed), - ) - except Exception as cb_err: - logger.warning("P59b Cookie-Behavior-Check failed: %s", cb_err) - finally: - if cb_session is not None: - try: - cb_session.close() - except Exception: - pass - - # P61: "Untergeschobene Cookies" — wenn z.B. Google Tag Manager - # deklariert ist, kommen GA + GCL_AU + DoubleClick automatisch mit. - # Findings landen im banner_result fuer Mail-Render. 
- if banner_result and cmp_vendors: - try: - from compliance.services.vendor_package_cookies import ( - detect_implicit_cookies, - ) - declared = [v.get("name", "") for v in cmp_vendors if v.get("name")] - actual_cookies: list[str] = [] - for phase_data in (banner_result.get("phases") or {}).values(): - if isinstance(phase_data, dict): - for ck in (phase_data.get("cookies") or []): - if isinstance(ck, dict) and ck.get("name"): - actual_cookies.append(ck["name"]) - implicit_findings = detect_implicit_cookies( - declared, actual_cookies_set=actual_cookies or None, - ) - if implicit_findings: - banner_result["implicit_vendor_findings"] = implicit_findings - logger.info( - "P61: %d implicit vendor-package items detected " - "(%d cookies + %d vendors)", - len(implicit_findings), - sum(1 for f in implicit_findings if f["implicit"]["type"] == "cookie"), - sum(1 for f in implicit_findings if f["implicit"]["type"] == "vendor"), - ) - except Exception as p61_err: - logger.warning("P61 implicit-vendor detection failed: %s", p61_err) - - if cmp_vendors: - logger.info("VVT: %d vendors extracted, validating links", - len(cmp_vendors)) - cmp_vendors = await validate_vendor_urls(cmp_vendors) - cmp_vendors = score_vendors(cmp_vendors) - # Enrich each vendor with per-cookie functional roles - try: - from compliance.services.cookie_function_classifier import ( - annotate_vendor_cookies, - ) - cmp_vendors = [annotate_vendor_cookies(v) for v in cmp_vendors] - except Exception as e: - logger.warning("Cookie function classification skipped: %s", e) - except Exception as e: - logger.warning("VVT vendor extraction skipped: %s", e) - - # Vendor-Redundanz + EU-Alternativen + Cost/Savings (O4) - redundancy_report = None - try: - from compliance.services.vendor_redundancy import analyze as analyze_redundancy - from compliance.services.vendor_cost_estimator import infer_company_tier - if cmp_vendors: - # Company-Tier aus business_profile ableiten — beeinflusst die - # Cost-Range so dass z.B. 
fuer DAX-Konzerne nicht starter-Preise - # die untere Schranke duruecken. - bp_dict = { - "type": getattr(profile, "business_type", ""), - "features": list(business_scope), - } - ctier = infer_company_tier(bp_dict) - redundancy_report = analyze_redundancy(cmp_vendors, company_tier=ctier) - logger.info( - "Redundanz: %d Kategorien mit Mehrfach-Anbietern, " - "Spar-Schaetzung %s pro Jahr (company_tier=%s)", - redundancy_report["summary"]["redundancy_count"], - redundancy_report["summary"]["estimated_saving_pct"], - ctier, - ) - except Exception as e: - logger.warning("Vendor redundancy analysis skipped: %s", e) - - summary_html = build_management_summary(results) - scanned_html = build_scanned_urls_html(doc_entries) - providers_html = build_provider_list_html(banner_result, vvt_entries) - # P18: Deep-Block mit Phases + Quality-Score + Per-Category-Tracker - from .agent_doc_check_banner import build_banner_deep_html - banner_deep_html = build_banner_deep_html(banner_result) - vvt_html = build_vvt_table_html(cmp_vendors) - - # MC scorecard aggregated across ALL docs in this run (DSGVO/TDDDG/ - # BGB/...). Sits at the top so the GF sees the regulation-by- - # regulation view before drilling into per-doc details. - from compliance.services.mc_scorecard import build_scorecard - from .agent_doc_check_scorecard import build_scorecard_html - all_mc_checks: list[dict] = [] - # P73: pro-doc Fails sammeln um Solution-Generator pro Doc-Type - # mit dem korrekten doc_text aufzurufen. 
- fails_by_doc: dict[str, list[dict]] = {} - for r in results: - for c in r.checks: - if c.id.startswith("mc-"): - rec = { - "id": c.id, "label": c.label, "passed": c.passed, - "severity": c.severity, "skipped": c.skipped, - "regulation": c.regulation, - "hint": getattr(c, "hint", "") or "", - } - all_mc_checks.append(rec) - if (not c.passed and not c.skipped - and (c.severity or "").upper() in ("CRITICAL", "HIGH")): - fails_by_doc.setdefault(r.doc_type, []).append(rec) - # P106 — Audit-Type-Klassifizierung pro MC. Interne Prozess-/ - # Doku-Checks werden NICHT als FAIL gewertet sondern als CHECK - # (manuelle Pruefung beim DSB notwendig). - try: - from compliance.services.mc_audit_type import ( - annotate_mc_results, split_by_audit_type, - ) - annotate_mc_results(all_mc_checks) - mc_split = split_by_audit_type(all_mc_checks) - # Fails-by-doc neu aufbauen: nur noch echte verifiable Fails - fails_by_doc = {} - for r in mc_split.get("verifiable_fails") or []: - fails_by_doc.setdefault("dse", []).append(r) - except Exception as e: - logger.warning("P106 mc_audit_type skipped: %s", e) - mc_split = {"internal_checks": [], "verifiable_fails": all_mc_checks} - scorecard = build_scorecard(all_mc_checks) if all_mc_checks else {} - # Trend: load previous scorecard for the same tenant + domain so the - # email can show delta indicators (A6). 
- prev_scorecard: dict | None = None - if scorecard: - try: - from compliance.services.compliance_audit_log import ( - list_runs_for_tenant, - ) - tenant_id_for_trend = req.recipient or "" - base_domain_for_trend = _extract_domain(doc_entries) or "" - prev_runs = list_runs_for_tenant( - tenant_id_for_trend, - base_domain=base_domain_for_trend, - limit=1, - ) - if prev_runs: - prev_scorecard = prev_runs[0].get("scorecard") - except Exception as e: - logger.debug("trend lookup skipped: %s", e) - scorecard_html = ( - build_scorecard_html(scorecard, previous_scorecard=prev_scorecard) - if scorecard else "" - ) - - report_html = build_html_report(results, None, doc_texts) - profile_html = _build_profile_html(profile) - - # O4: Vendor-Redundanz / EU-Alternativen + Cost-Savings-Block - from .agent_doc_check_redundancy import build_redundancy_html - redundancy_html = build_redundancy_html(redundancy_report) - - # P1: Executive-Summary GANZ oben — CFO/GF sieht 4 KPIs + 2 CTAs. - from .agent_doc_check_exec_summary import build_exec_summary_html - # Site-Name fuer Header bestimmen (gleiche Logik wie Email-Subject) - url_company_for_exec = _company_name_from_url(doc_entries) - domain_for_exec = _extract_domain(doc_entries) - site_name_for_exec = url_company_for_exec or domain_for_exec or "" - exec_summary_html = build_exec_summary_html( - scorecard=scorecard, - previous_scorecard=prev_scorecard, - cmp_vendors=cmp_vendors, - redundancy_report=redundancy_report, - site_name=site_name_for_exec, - ) - - # P18: Critical-Findings-Block (rot oben, mit Sofortmassnahmen + - # Quellen + Bussgeld-Praezedenz). Wird nur gerendert wenn echte - # kritische Verstoesse vorliegen. 
- critical_html = "" - try: - from .agent_doc_check_critical import build_critical_findings_html - critical_html = build_critical_findings_html( - banner_result=banner_result, - scorecard=scorecard, - results=results, - ) - except Exception as e: - logger.warning("Critical-findings block skipped: %s", e) - - # P10: Cookie-Policy-Architecture-Detection (BMW-Pattern erkennen) - cookie_arch_html = "" - try: - from compliance.services.cookie_policy_architecture import ( - detect_architecture, build_architecture_html, - ) - cookie_doc_url = "" - cookie_doc_text = doc_texts.get("cookie", "") - cookie_cmp_payloads: list[dict] = [] - for e in doc_entries: - if (e.get("doc_type") or "").lower() in ("cookie", "cookie_policy"): - cookie_doc_url = e.get("url", "") - cookie_cmp_payloads = e.get("cmp_payloads") or [] - break - # P17-A: Fallback wenn Cookie-Doc via P15 deduped wurde — nutze - # den DSE-Text wenn er Cookie-Schluesselwoerter enthaelt. - if not cookie_doc_text: - dse_text = doc_texts.get("dse", "") - if dse_text and any(w in dse_text.lower() for w in - ("cookie", "tracking", "google analytics", - "consent")): - cookie_doc_text = dse_text - dse_entry = next((e for e in doc_entries - if e.get("doc_type") == "dse"), {}) - cookie_doc_url = dse_entry.get("url", "") - cookie_cmp_payloads = dse_entry.get("cmp_payloads") or [] - logger.info("P17-A: cookie-arch fallback auf DSE (Cookie-Doc deduped)") - if cookie_doc_text: - arch = detect_architecture( - doc_url=cookie_doc_url, - doc_text=cookie_doc_text, - cmp_payloads=cookie_cmp_payloads, - homepage_cmp_payloads=cmp_payloads or [], - ) - cookie_arch_html = build_architecture_html(arch) - logger.info("cookie-arch: layer=%s versioned=%s risk=%s", - arch["layer_separation"], arch["versioned"], arch["risk_label"]) - except Exception as e: - logger.warning("cookie-architecture detection failed: %s", e) - - # Reihenfolge — Sales-optimiert: - # 1) Exec-Summary (KPIs + Saving + CTAs) - # 2) summary_html (Konkrete Aufgaben fuer die 
Geschaeftsfuehrung) - # 3) scanned_urls (Quellen-Transparenz) - # 4) profile_html (Erkanntes Geschaeftsmodell) - # 5) scorecard_html (MC-Scorecard) - # 6) redundancy_html (Optimierungspotenzial — direkt nach Compliance-Score) - # 7) providers_html + vvt_html (Vendor-Liste) - # 8) report_html (Doc-Pruefung Details) - # P62: Marketing-Manager-Disclaimer — was wir sehen vs nicht sehen - scope_disclaimer_html = "" - try: - from .scope_disclaimer import build_scope_disclaimer_html - scope_disclaimer_html = build_scope_disclaimer_html() - except Exception as e: - logger.warning("Scope-disclaimer block skipped: %s", e) - - # P103 + P104 — Cookie-Value-Entropy + Network-Tracing (Stufe 3 + 4) - entropy_html = "" - network_trace_html = "" - try: - from compliance.services.cookie_value_entropy import ( - check_cookies_for_entropy_mismatch, build_entropy_block_html, - ) - from compliance.services.cookie_network_tracer import ( - trace_cookie_network, build_network_trace_block_html, - ) - cookies_detailed = (banner_result or {}).get("cookies_detailed") or [] - entropy_findings = check_cookies_for_entropy_mismatch(cookies_detailed) - if entropy_findings: - entropy_html = build_entropy_block_html(entropy_findings) - logger.info("P103 Entropy: %d Findings", len(entropy_findings)) - primary_url = "" - for e_ in doc_entries: - if e_.get("url"): - primary_url = e_["url"]; break - net_findings = trace_cookie_network(cookies_detailed, primary_url) - if net_findings: - network_trace_html = build_network_trace_block_html(net_findings) - logger.info("P104 Network-Trace: %d Findings", len(net_findings)) - except Exception as e: - logger.warning("P103/P104 entropy/network-trace skipped: %s", e) - - # P105 — IAB TCF Authority-Cross-Reference (Stufe 5) - tcf_authority_html = "" - try: - from compliance.services.tcf_vendor_authority import ( - cross_reference_with_tcf, build_tcf_authority_block_html, - ) - from database import SessionLocal as _SLtcf - _tcf_db = _SLtcf() - try: - tcf_findings = 
cross_reference_with_tcf(_tcf_db, cmp_vendors) - if tcf_findings: - tcf_authority_html = build_tcf_authority_block_html(tcf_findings) - logger.info( - "TCF-Authority: %d Vendor-Discrepancies gefunden", - len(tcf_findings), - ) - finally: - _tcf_db.close() - except Exception as e: - logger.warning("TCF-Authority-Check skipped: %s", e) - - # COOKIE-COMPLIANCE-AUDIT (3-Quellen-Vergleich) — das ist der - # zentrale USP: deklariert in Richtlinie vs tatsaechlich im - # Browser geladen vs Library-Match. - cookie_audit = {} - cookie_audit_html = "" - try: - from compliance.services.cookie_compliance_audit import ( - audit_cookie_compliance, build_cookie_audit_block_html, - ) - from database import SessionLocal as _SLca - _ca_db = _SLca() - try: - cookie_audit = audit_cookie_compliance( - _ca_db, doc_texts.get("cookie") or doc_texts.get("dse"), - banner_result, - ) - if cookie_audit and (cookie_audit.get("declared_count") or - cookie_audit.get("browser_count")): - cookie_audit_html = build_cookie_audit_block_html(cookie_audit) - logger.info( - "Cookie-Audit: %d deklariert, %d im Browser, " - "%d undokumentiert, %d compliant", - cookie_audit.get("declared_count"), - cookie_audit.get("browser_count"), - len(cookie_audit.get("undeclared_in_browser") or []), - len(cookie_audit.get("compliant") or []), - ) - finally: - _ca_db.close() - except Exception as e: - logger.warning("cookie-compliance-audit skipped: %s", e) - - # P102: Cookie-Klassifikations-Pruefung (deklariert vs Library) - library_mismatch_html = "" - mismatches: list[dict] = [] - try: - from compliance.services.cookie_library_mismatch import ( - detect_mismatches, build_mismatch_block_html, - ) - from database import SessionLocal - cookie_doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or "" - all_cookies_seen: list[str] = [] - if banner_result: - for ph in (banner_result.get("phases") or {}).values(): - if isinstance(ph, dict): - for ck in (ph.get("cookies") or []): - if isinstance(ck, str): - 
all_cookies_seen.append(ck) - elif isinstance(ck, dict) and ck.get("name"): - all_cookies_seen.append(ck["name"]) - if all_cookies_seen and cookie_doc_for_check: - _mm_db = SessionLocal() - try: - mismatches = detect_mismatches( - _mm_db, all_cookies_seen, cookie_doc_for_check, - ) - if mismatches: - library_mismatch_html = build_mismatch_block_html(mismatches) - logger.info( - "P102: %d Cookie-Mismatches gefunden", len(mismatches) - ) - finally: - _mm_db.close() - except Exception as e: - logger.warning("P102 mismatch detection failed: %s", e) - - # P35 + P77 + P78: Textsignal-Checks (Save-Label, Cookies-in-DSE, - # JC-Klausel im DSE) - signals_html = "" - try: - from compliance.services.doc_text_signals import ( - run_all as run_signal_checks, - build_signals_block_html, - ) - cookie_doc_missing = not bool(doc_texts.get("cookie")) - sig_findings = run_signal_checks( - banner_result, doc_texts, cookie_doc_missing, - ) - if sig_findings: - signals_html = build_signals_block_html(sig_findings) - except Exception as e: - logger.warning("P35/P77/P78 signals-check failed: %s", e) - - # P92 + P94: Banner-Konsistenz (CMP-Tool kaputt / Banner-vs-Doc-Diff) - consistency_html = "" - try: - from compliance.services.banner_consistency_checks import ( - run_all as run_consistency_checks, - build_consistency_block_html, - ) - cookie_doc_for_check = (doc_texts.get("cookie") - or doc_texts.get("dse") or "") - cons_findings = run_consistency_checks( - banner_result or {}, cookie_doc_for_check, cmp_vendors, - doc_texts=doc_texts, - ) - if cons_findings: - consistency_html = build_consistency_block_html(cons_findings) - logger.info("P92/P94: %d Konsistenz-Findings", len(cons_findings)) - except Exception as e: - logger.warning("P92/P94 consistency-check failed: %s", e) - - # P73: MC-Solution-Generator — LLM-Vorschlaege pro HIGH-Fail. - # Max 5 Solutions pro Doc-Type um Latenz < 60s zu halten. 
- solutions_html = "" - try: - from compliance.services.mc_solution_generator import ( - generate_solutions_for_fails, build_solutions_block_html, - ) - all_solutions: list[dict] = [] - for dt, fails in fails_by_doc.items(): - if not fails: - continue - doc_txt = doc_texts.get(dt) or doc_texts.get("dse") or "" - if not doc_txt or len(doc_txt) < 500: - continue - sols = await generate_solutions_for_fails( - fails, doc_txt, dt, limit=3, - ) - all_solutions.extend(sols) - if len(all_solutions) >= 8: - break # global cap - if all_solutions: - solutions_html = build_solutions_block_html(all_solutions[:8]) - logger.info("P73: %d MC-Solutions generiert", len(all_solutions)) - except Exception as e: - logger.warning("P73 MC-Solution-Generator skipped: %s", e) - - # P71: JC-vs-AVV Entscheidungsbaum (nur wenn DSE ambig) - jc_decision_html = "" - try: - from compliance.services.jc_avv_decision import ( - build_jc_avv_decision_html, - ) - jc_decision_html = build_jc_avv_decision_html(doc_texts.get("dse")) - except Exception as e: - logger.warning("P71 jc_avv_decision skipped: %s", e) - - # P6/P53/P55 — Branchen-Kontext + Site-History - industry_ctx_html = "" - try: - from compliance.services.industry_library import ( - build_industry_context_block_html, load_site_profile, - ) - from database import SessionLocal as _SLib - _ind_db = _SLib() - try: - ind = (req.scan_context or {}).get("industry") if req.scan_context else None - site_prof = load_site_profile(_ind_db, domain_for_exec or "") - industry_ctx_html = build_industry_context_block_html(ind, site_prof) - finally: - _ind_db.close() - except Exception as e: - logger.warning("industry context skipped: %s", e) - - # P106 — Internal-Checks-Block (interne Prozesse / Doku-Pflichten) - internal_checks_html = "" - try: - from compliance.services.mc_audit_type import ( - build_internal_checks_block_html, - ) - ic = (mc_split or {}).get("internal_checks") or [] - if ic: - internal_checks_html = build_internal_checks_block_html(ic) - 
logger.info( - "P106: %d interne Checks (statt FAIL) im Block", - len(ic), - ) - except Exception as e: - logger.warning("P106 internal_checks_html skipped: %s", e) - - # P85 — Banner-Screenshot fuer visuellen Beweis (zwischen - # GF-1-Pager und Detail-Bloecken) - banner_shot_html = "" - try: - from compliance.services.banner_screenshot_block import ( - build_banner_screenshot_html, - ) - banner_shot_html = build_banner_screenshot_html(banner_result) - except Exception as e: - logger.warning("P85 banner-screenshot skipped: %s", e) - - # P82: GF-1-Pager ganz oben in der Mail — 5-Bullet-Zusammenfassung - # damit die GF nicht 124k Char lesen muss. - gf_one_pager_html = "" - try: - from compliance.services.gf_one_pager import build_gf_one_pager_html - gf_one_pager_html = build_gf_one_pager_html( - site_name=site_name_for_exec, - scorecard=scorecard, - previous_scorecard=prev_scorecard, - banner_result=banner_result, - library_mismatch_findings=mismatches, - scan_context=req.scan_context, - audit_quality_findings=audit_quality_findings, - ) - except Exception as e: - logger.warning("P82 GF-1-pager skipped: %s", e) - - # A — Audit-Quality-Checks: Banner-Detect-Failure, Vendor-Extract - # auffaellig duenn, URL-Fetch fehlgeschlagen → IMMER prominent zeigen. 
- audit_quality_html = "" - audit_quality_findings: list[dict] = [] - try: - from compliance.services.audit_quality_checks import ( - run_all as run_audit_quality, build_audit_quality_block_html, - ) - cookie_text_for_aq = doc_texts.get("cookie") or "" - audit_quality_findings = run_audit_quality( - banner_result, cookie_text_for_aq, cmp_vendors, doc_entries, - ) - if audit_quality_findings: - audit_quality_html = build_audit_quality_block_html(audit_quality_findings) - logger.info( - "audit-quality: %d Vorbehalte erkannt", - len(audit_quality_findings), - ) - except Exception as e: - logger.warning("audit-quality-checks failed: %s", e) - - # Doc-Input-Warnings — wenn User Text ins falsche Feld gepastet hat - input_warn_html = "" - try: - from compliance.services.doc_input_warnings import ( - collect_warnings, build_warnings_block_html, - ) - warns = collect_warnings(doc_entries) - if warns: - input_warn_html = build_warnings_block_html(warns) - logger.info("doc-input-warnings: %d Mismatches gefunden", len(warns)) - except Exception as e: - logger.warning("doc-input-warnings skipped: %s", e) - - # P86: Branchen-Benchmark (nur wenn scan_context.industry gesetzt) - bench_html = "" - try: - from database import SessionLocal as _SLb - from compliance.services.industry_benchmark import ( - compute_benchmark, build_benchmark_html, _extract_score, - ) - industry = (req.scan_context or {}).get("industry") if req.scan_context else None - curr_score = _extract_score(banner_result) - if industry and curr_score is not None: - _b_db = _SLb() - try: - bench = compute_benchmark( - _b_db, industry, curr_score, check_id, - ) - if bench: - bench_html = build_benchmark_html(bench) - finally: - _b_db.close() - except Exception as e: - logger.warning("P86 industry-benchmark skipped: %s", e) - - # P84: Diff-Mode — "Seit letztem Lauf X Findings weg, Y neue". 
- diff_html = "" - try: - from database import SessionLocal as _SL - from compliance.services.run_diff import ( - compute_diff, build_diff_block_html, - ) - _diff_db = _SL() - try: - diff = compute_diff( - _diff_db, check_id, domain_for_exec or "", - banner_result, scorecard, - ) - if diff: - diff_html = build_diff_block_html(diff) - finally: - _diff_db.close() - except Exception as e: - logger.warning("P84 diff-mode skipped: %s", e) - - full_html = ( - gf_one_pager_html + audit_quality_html + input_warn_html - + bench_html + diff_html - + critical_html + scope_disclaimer_html + exec_summary_html - + cookie_arch_html + summary_html + scanned_html + profile_html - + scorecard_html + internal_checks_html + redundancy_html - + industry_ctx_html - + banner_shot_html - + providers_html + banner_deep_html - + cookie_audit_html - + tcf_authority_html - + entropy_html - + network_trace_html - + library_mismatch_html - + consistency_html + signals_html + solutions_html - + jc_decision_html - + vvt_html + report_html - ) - - # Step 6: Send email — derive site name primarily from entered URL. - # The extracted_profile.companyName is often noisy (e.g. picks up - # juris.de from legal references). Domain-derived name is more - # predictable for the GF email subject. - doc_count = len([r for r in results if not r.error]) - url_company = _company_name_from_url(doc_entries) - domain = _extract_domain(doc_entries) - site_name = url_company or domain or "Unbekannt" - _update(check_id, "E-Mail wird versendet...", 98) - email_result = send_email( - recipient=req.recipient, - subject=f"[COMPLIANCE-CHECK] {site_name} — {doc_count} Dokumente geprueft", - body_html=full_html, - ) - - # Step 7: Store result - response = { - "check_id": check_id, - "results": [_result_to_dict(r) for r in results], - "business_profile": profile_dict, - "extracted_profile": extracted_profile, - # P18: vollen consent-tester-Output durchreichen statt nur 4 Felder. 
- # phases (before/after-accept/reject) + banner_checks.violations + - # category_tests werden vom Renderer + Critical-Findings-Block genutzt. - "banner_result": ({ - "detected": banner_result.get("banner_detected", False), - "provider": banner_result.get("banner_provider", ""), - "violations": len((banner_result.get("banner_checks") or {}) - .get("violations", [])), - "tcf_vendor_count": len(tcf_vendors), - "completeness_pct": banner_result.get("completeness_pct"), - "correctness_pct": banner_result.get("correctness_pct"), - "phases": banner_result.get("phases", {}), - "banner_checks": banner_result.get("banner_checks", {}), - "category_tests": banner_result.get("category_tests", []), - "structured_checks": banner_result.get("structured_checks", []), - "summary": banner_result.get("summary", {}), - } if banner_result else None), - "tcf_vendors": vvt_entries if tcf_vendors else [], - "cmp_vendors": cmp_vendors, - "cookie_audit": cookie_audit if cookie_audit else None, - "total_documents": len(results), - "total_findings": total_findings, - "email_status": email_result.get("status", "failed"), - "checked_at": datetime.now(timezone.utc).isoformat(), - } - - _compliance_check_jobs[check_id]["status"] = "completed" - _compliance_check_jobs[check_id]["result"] = response - _compliance_check_jobs[check_id]["progress"] = "Fertig" - _compliance_check_jobs[check_id]["progress_pct"] = 100 - - # P80: persist raw scan data so we can replay audit pipeline - # without re-crawling (7min -> 5sec test cycle). 
- try: - from database import SessionLocal - from compliance.services.check_snapshot import save_snapshot - snap_db = SessionLocal() - try: - save_snapshot( - snap_db, - check_id=check_id, - doc_entries=doc_entries, - banner_result=banner_result, - profile=profile, - cmp_vendors=cmp_vendors, - scan_context=req.scan_context, # P79 - site_label=site_name, - notes=f"recipient={req.recipient}", - ) - finally: - snap_db.close() - except Exception as snap_err: - logger.warning("P80 snapshot save skipped: %s", snap_err) - - # Persist to sidecar SQLite audit log — enables /audit endpoints - # (A5 admin tab) and trend view (A6). Best-effort; failures here - # do not affect the user-facing response. - try: - from compliance.services.compliance_audit_log import record_check_run - from compliance.services.mc_scorecard import full_audit_records - audit_rows: list[dict] = [] - for r in results: - doc_mc = [c for c in r.checks if c.id.startswith("mc-")] - audit_rows.extend(full_audit_records( - [{"id": c.id, "label": c.label, "passed": c.passed, - "severity": c.severity, "skipped": c.skipped, - "regulation": c.regulation, "matched_text": c.matched_text, - "hint": c.hint, "level": c.level} - for c in doc_mc], - check_id=check_id, - doc_type=r.doc_type, - )) - record_check_run( - check_id=check_id, - tenant_id=req.recipient or "", - site_name=site_name, - base_domain=domain or "", - doc_count=doc_count, - scorecard=scorecard, - vvt_summary={ - "total": len(cmp_vendors), - "internal": sum(1 for v in cmp_vendors - if (v.get("recipient_type") or "").upper() - in ("INTERNAL", "GROUP_COMPANY")), - "external": sum(1 for v in cmp_vendors - if (v.get("recipient_type") or "").upper() - in ("PROCESSOR", "CONTROLLER")), - }, - mc_records=audit_rows, - ) - from compliance.services.compliance_audit_log import record_check_payload - record_check_payload( - check_id=check_id, - vendors=cmp_vendors, - profile=extracted_profile, - banner=banner_result, - ) - # Unified findings (P5): bundle MC + 
Pflichtangaben + Vendor + - # Redundanz in one searchable table behind /agent/findings/. - try: - from compliance.services.unified_findings_collector import collect - from compliance.services.unified_findings_store import record_findings - unified = collect( - check_id=check_id, - results=results, - cmp_vendors=cmp_vendors, - redundancy_report=redundancy_report, - doc_texts=doc_texts, - ) - record_findings(check_id, unified) - except Exception as e: - logger.warning("Unified findings collect failed: %s", e) - except Exception as e: - logger.warning("Audit persistence skipped: %s", e) - - except Exception as e: - logger.error("Compliance check %s failed: %s", check_id, e, exc_info=True) - _compliance_check_jobs[check_id]["status"] = "failed" - _compliance_check_jobs[check_id]["error"] = str(e)[:500] - - -def _update(check_id: str, msg: str, pct: int | None = None): - job = _compliance_check_jobs[check_id] - job["progress"] = msg - if pct is not None: - job["progress_pct"] = max(0, min(100, int(pct))) - - -async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]: - """Fetch text from URL via consent-tester, with HTTP fallback. - - Returns (text, cmp_payloads). cmp_payloads is the raw CMP JSON captured - during navigation (ePaaS, OneTrust, …) — empty when no CMP fired or - HTTP fallback was used. Backend turns payloads into structured vendor - records for the VVT table in the email. - """ - # 1. Consent-tester (Playwright-based, full JS rendering). - # max_documents depends on doc_type: - # - cookie/dse/social_media: self-extract (often + CMP capture) is - # authoritative, sub-pages dilute the policy text. max=1. - # - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar - # enterprise sites split this across 3-4 short sub-pages - # (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows - # them. The 15s networkidle bail (dsi_helpers) keeps timing safe. 
- short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"} - max_docs = 1 if (doc_type or "") in short_extract_types else 3 - try: - # P90: 120s reicht nicht fuer BMW-Impressum (Auto-Discovery folgt - # 3 Sub-Docs). 240s gibt Spielraum. Mercedes faellt aktuell mit - # 120s auch oft an Akamai-Latenz. - async with httpx.AsyncClient(timeout=240.0) as client: - resp = await client.post( - f"{CONSENT_TESTER_URL}/dsi-discovery", - json={"url": url, "max_documents": max_docs}, - timeout=240.0, - ) - if resp.status_code == 200: - payload = resp.json() - docs = payload.get("documents", []) - cmp_payloads = payload.get("cmp_payloads") or [] - cmp_cookie_text = payload.get("cmp_cookie_text") or "" - # D — wenn der consent-tester HTML-Tabellen aus dem DOM - # extrahiert hat, in die cmp_payloads als "generic_table" - # einschleusen damit das Backend sie via cookies_table_parser - # verarbeiten kann. - for doc in (docs or []): - for tbl in (doc.get("tables") or []): - if not tbl or len(tbl) < 3: - continue - cmp_payloads.append({ - "kind": "html_table", - "url": doc.get("url", ""), - "rows": tbl, - }) - if docs: - texts = [] - for doc in docs: - t = doc.get("full_text", "") or doc.get("text_preview", "") or "" - if t and len(t) > 50: - texts.append(t) - merged = "\n\n".join(texts) - # For cookie/dse/social_media: when CMP reconstruction is - # substantially richer than DOM extraction, use it. This - # fixes the BMW case where DOM yields ~600 words of - # navigation but the ePaaS payload reconstructs to ~1800 - # words of actual cookie policy. 
- if (doc_type in short_extract_types - and cmp_cookie_text - and len(cmp_cookie_text.split()) > len(merged.split())): - logger.info( - "Preferring CMP-reconstructed text for %s on %s " - "(%d words CMP vs %d words DOM)", - doc_type, url, - len(cmp_cookie_text.split()), - len(merged.split()), - ) - merged = cmp_cookie_text - if merged and len(merged.split()) > 100: - if len(texts) > 1: - logger.info("Merged %d docs from %s (%d words)", - len(texts), url, len(merged.split())) - return merged, cmp_payloads - # P90-Bug-Fix: auch wenn DSE-Text zu kurz fuer 100-Wort- - # Schwelle ist, die captured CMP-Payloads NICHT verwerfen. - # BMW-Bug: DSE liefert 10 Wort SPA-Shell, aber ePaaS-JSON - # (393KB) wurde captured. Backend braucht die fuer - # extract_vendors_from_payloads (VVT-Tabelle). - if cmp_payloads: - logger.info( - "P90: keeping %d CMP payloads for %s despite " - "short text (%d words) — HTTP fallback runs in parallel", - len(cmp_payloads), url, - len((merged or cmp_cookie_text).split()), - ) - fallback_text = merged or cmp_cookie_text or "" - return fallback_text, cmp_payloads - except Exception as e: - # P90: verbose exception fuer Diagnose (war vorher empty) - logger.warning("Consent-tester fetch failed for %s: %s (%s)", - url, str(e) or "(empty)", type(e).__name__) - - # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW). - # P7: kenntlicher UA + per-Domain Rate-Limit. 
- try: - import re as _re - from compliance.services.compliance_user_agent import ( - default_request_headers, DomainRateLimiter, - ) - async with httpx.AsyncClient( - timeout=30.0, follow_redirects=True, - headers=default_request_headers(), - ) as client: - async with DomainRateLimiter(url): - resp = await client.get(url) - if resp.status_code == 200 and "text/html" in resp.headers.get("content-type", ""): - html = resp.text - # Strip HTML tags, decode entities - text = _re.sub(r"]*>.*?", " ", html, flags=_re.DOTALL | _re.IGNORECASE) - text = _re.sub(r"]*>.*?", " ", text, flags=_re.DOTALL | _re.IGNORECASE) - text = _re.sub(r"<[^>]+>", " ", text) - text = _re.sub(r"\s+", " ", text).strip() - if len(text.split()) > 100: - logger.info("HTTP fallback for %s: %d words", url, len(text.split())) - return text, [] - except Exception as e: - logger.warning("HTTP fallback failed for %s: %s", url, e) - - return "", [] - - -async def _autodiscover_missing( - check_id: str, - doc_entries: list[dict], - doc_texts: dict[str, str], - url_text_cache: dict[str, str], -) -> None: - """For each canonical doc_type the user did not submit, try to find - the corresponding document on the homepage of the site they DID submit. - - Modifies doc_entries in place: fills text/url/word_count and sets - `auto_discovered=True`. Marks `discovery_attempted=True` on every - missing entry (even when nothing was found) so the report can - distinguish 'Nicht eingereicht' from 'Auf der Website nicht gefunden'. - """ - from urllib.parse import urlparse - - # VW-Fix: nur Doc-Types mit substantieller Text-Ausbeute zaehlen - # als 'submitted'. Wenn der User eine URL eingegeben hat aber die - # 404 liefert (VW cookie-richtlinie.html), oder der Crawler weniger - # als 200 Zeichen extrahiert (SPA-Shell), als 'missing' behandeln - # damit der Discovery-Pass alternative URLs probiert. 
- _MIN_USEFUL_CHARS = 200 - submitted_types = { - e["doc_type"] for e in doc_entries - if len((e.get("text") or "").strip()) >= _MIN_USEFUL_CHARS - } - # Markiere die fehlgeschlagenen URL-Submissions damit der Discovery - # ihre URL nicht erneut probiert (waere sinnlos). - failed_urls: set[str] = { - (e.get("url") or "").strip() - for e in doc_entries - if (e.get("url") or "").strip() - and len((e.get("text") or "").strip()) < _MIN_USEFUL_CHARS - } - if failed_urls: - logger.info( - "VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery " - "soll Alternativen probieren: %s", - len(failed_urls), _MIN_USEFUL_CHARS, - ", ".join(list(failed_urls)[:3]), - ) - # Map alias types to canonical - submitted_canon = { - "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types - } - # Missing = canonical types the user did NOT submit - missing = set(_ALL_DOC_TYPES) - submitted_canon - if not missing: - return - - # Pick the most common base (scheme://netloc) from submitted URLs. - bases: dict[str, int] = {} - for e in doc_entries: - u = (e.get("url") or "").strip() - if u and "://" in u: - p = urlparse(u) - base = f"{p.scheme}://{p.netloc}" - bases[base] = bases.get(base, 0) + 1 - if not bases: - # No submitted URL at all — nothing to crawl from. Add empty - # placeholders (with discovery_attempted=False) so the padding - # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden'). - for dt in missing: - doc_entries.append({ - "doc_type": dt, "url": "", "text": "", "word_count": 0, - "auto_discovered": False, "discovery_attempted": False, - }) - return - - # Build crawl plan: primary base + any related domains mentioned in - # the submitted texts that share the owner's SLD. Example: BMW Group - # text mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de. 
- primary_base = max(bases, key=bases.get) + "/" - crawl_bases: list[str] = [primary_base] - primary_netloc = urlparse(primary_base).netloc.lower().lstrip("www.") - owner_token = primary_netloc.split(".")[0] # 'bmw' - - if owner_token and len(owner_token) >= 3: - domain_re = re.compile( - r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token) - + r"[a-z0-9\-]*\.[a-z]{2,}", - re.IGNORECASE, - ) - seen_bases = {primary_base} - for entry in doc_entries: - text = entry.get("text") or "" - for m in domain_re.finditer(text): - p = urlparse(m.group(0)) - base = f"{p.scheme}://{p.netloc}/" - base_netloc = p.netloc.lower().lstrip("www.") - if base_netloc == primary_netloc: - continue - if base in seen_bases: - continue - seen_bases.add(base) - crawl_bases.append(base) - if len(crawl_bases) >= 3: - break - if len(crawl_bases) >= 3: - break - - _update( - check_id, - f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...", - 18, - ) - - discovered: list[dict] = [] - disc_payloads: list[dict] = [] - disc_cookie_texts: list[str] = [] - for base in crawl_bases: - try: - async with httpx.AsyncClient(timeout=300.0) as client: # P90: 180s -> 300s - resp = await client.post( - f"{CONSENT_TESTER_URL}/dsi-discovery", - json={"url": base, "max_documents": 15}, - timeout=300.0, # P90: 180s -> 300s - ) - if resp.status_code != 200: - logger.warning("auto-discovery: HTTP %d for %s", - resp.status_code, base) - continue - body = resp.json() - discovered.extend(body.get("documents", []) or []) - disc_payloads.extend(body.get("cmp_payloads") or []) - cmp_text = body.get("cmp_cookie_text") or "" - if cmp_text: - disc_cookie_texts.append(cmp_text) - logger.info("auto-discovery on %s: %d docs, %d CMP payloads, " - "cmp_cookie_text=%d words", base, - len(body.get("documents", []) or []), - len(body.get("cmp_payloads") or []), - len(cmp_text.split())) - except Exception as e: - # P90: verbose exception fuer Diagnose - logger.warning("auto-discovery failed 
for %s: %s (%s)", - base, str(e) or "(empty)", type(e).__name__) - - # Classify each discovered doc into a canonical doc_type - by_type: dict[str, dict] = {} - for d in discovered: - title = (d.get("title") or "").lower() - url = (d.get("url") or "").lower() - wc = d.get("word_count") or 0 - if wc < 100: - continue - canon = _classify_discovered_doc(title, url) - if canon and canon in missing and canon not in by_type: - by_type[canon] = d - - # Append/Update entry for every missing canonical type. Auto-discovered - # ones get the text/URL filled; ungratched ones stay empty so the - # padding step renders them as 'Auf der Website nicht gefunden'. - # VW-Fix: wenn schon ein leerer entry existiert (URL gesetzt, aber - # fetch hat 0/Mini-Text geliefert), in-place updaten statt duplizieren. - filled = 0 - for dt in missing: - existing = next((e for e in doc_entries - if e.get("doc_type") == dt), None) - new_entry: dict = existing if existing else { - "doc_type": dt, "url": "", "text": "", "word_count": 0, - "auto_discovered": False, "discovery_attempted": True, - "cmp_payloads": [], - } - new_entry["discovery_attempted"] = True - d = by_type.get(dt) - if d: - full = d.get("full_text") or d.get("text_preview") or "" - # For cookie: prefer the CMP-reconstructed text when it's - # substantially richer than the auto-discovered DOM extraction. - # BMW homepage CMP yields ~1800 words of authoritative policy; - # DOM extraction typically yields ~600 words of site chrome. - if dt == "cookie" and disc_cookie_texts: - cmp_merged = "\n\n".join(disc_cookie_texts) - if len(cmp_merged.split()) > len(full.split()): - logger.info( - "cookie: using CMP-reconstructed text (%d words) " - "instead of DOM (%d words)", - len(cmp_merged.split()), len(full.split()), - ) - full = cmp_merged - if len(full.split()) >= 100: - new_entry["text"] = full - # Behalte die original URL als "rejected_url" damit Audit - # zeigt 'X war 404, wir haben Y gefunden'. 
- if existing and (existing.get("url") or "").strip() in failed_urls: - new_entry["rejected_url"] = existing.get("url") - new_entry["url"] = d.get("url", "") - new_entry["word_count"] = len(full.split()) - new_entry["auto_discovered"] = True - if dt == "cookie" and disc_payloads: - new_entry["cmp_payloads"] = disc_payloads - doc_texts[dt] = full - filled += 1 - logger.info( - "auto-discovered %s on %s: %s (%d words)%s", - dt, base, d.get("url", "")[:80], new_entry["word_count"], - " [REPLACED failed URL]" if existing else "", - ) - if not existing: - doc_entries.append(new_entry) - - logger.info( - "auto-discovery: filled %d/%d missing types from %s", - filled, len(missing), base, - ) - - -# Title/URL keywords → canonical doc_type. Order matters: most-specific first. -_DISCOVERY_RULES: list[tuple[str, tuple[str, ...]]] = [ - ("cookie", ("cookie", "kuche", "biscuit", "cookies-")), - ("widerruf", ("widerruf", "rueckgabe", "rückgabe", "cancellation", - "right-of-withdrawal", "ruecktritts", "rücktritts")), - ("social_media", ("social-media", "soziale-medien", "social_media", - "social-media-policy")), - # P23: 'terms-and-conditions' kann Allgemeine Geschaeftsbedingungen ODER - # Nutzungsbedingungen meinen. Discovery-Funktion klassifiziert spaeter - # praeziser per Titel + Inhalt. 
Hier nur Url-Hint: - ("agb", ("/agb", "geschaeftsbedingungen", "geschäftsbedingungen", - "general-terms")), - ("nutzungsbedingungen", ("nutzungsbedingung", "nutzungsbedingungen", - "terms-of-use", "terms-and-conditions", - "nutzungsordnung", "terms-of-service", - "allgemeine-nutzungsbedingungen")), - ("dsb", ("datenschutzbeauftragt", "data-protection-officer", - "dpo-contact", "/dsb")), - ("impressum", ("impressum", "imprint", "legal-notice", "site-notice", - "anbieterkennzeichnung", "legal-disclaimer-pool")), - ("dse", ("data-privacy", "datenschutz", "data-protection", - "privacy-policy", "privacy-notice", "dsgvo", - "data_privacy", "datenschutzinformation")), -] - - -def _classify_discovered_doc(title: str, url: str) -> str | None: - """Map a discovered doc (by its title + URL) to one of our 8 canonical types.""" - haystack = f"{title} {url}" - for canon, keywords in _DISCOVERY_RULES: - if any(kw in haystack for kw in keywords): - return canon - return None - - -async def _check_single( - text: str, doc_type: str, label: str, url: str, - word_count: int, use_agent: bool, - business_scope: set[str] | None = None, - business_profile: dict | None = None, -): - """Run regex + MC checks on a single document.""" - from compliance.services.doc_checks.runner import check_document_completeness - from compliance.services.rag_document_checker import check_document_with_controls - from .agent_doc_check_routes import CheckItem, DocCheckResult - - # Regex checklist - findings = check_document_completeness(text, doc_type, label, url, - business_profile=business_profile) - - all_checks: list[CheckItem] = [] - completeness = 0 - correctness = 0 - - for f in findings: - if "SCORE" in f.get("code", ""): - for c in f.get("all_checks", []): - all_checks.append(CheckItem( - id=c["id"], label=c["label"], passed=c["passed"], - severity=c["severity"], matched_text=c.get("matched_text", ""), - level=c.get("level", 1), parent=c.get("parent"), - skipped=c.get("skipped", False), 
hint=c.get("hint", ""), - )) - completeness = f.get("completeness_pct", 0) - correctness = f.get("correctness_pct", 0) - - # Master Control checks (top 20 by severity to avoid noise) - try: - # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has - # 1874 across 8 types; regex matching is cheap and dominates - # well under 1s per doc). Caps remain on the LLM-enrich step - # (top-10 FAILs) so cost stays bounded. - mc_results = await check_document_with_controls( - text, doc_type, label, max_controls=0, use_agent=use_agent, - business_scope=business_scope, - ) - if mc_results: - for mc in mc_results: - all_checks.append(CheckItem(**mc)) - l2 = [c for c in all_checks if c.level == 2 and not c.skipped] - l2_passed = sum(1 for c in l2 if c.passed) - correctness = round(l2_passed / len(l2) * 100) if l2 else 0 - except Exception as e: - logger.warning("MC check skipped for %s: %s", label, e) - - # LLM verification of regex fails - failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint] - if failed: - try: - from compliance.services.doc_checks.llm_verify import verify_failed_checks - overturns = await verify_failed_checks( - text, - [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed], - label, - ) - for c in all_checks: - if c.id in overturns and overturns[c.id]["overturned"]: - c.passed = True - c.matched_text = f"[LLM] {overturns[c.id]['evidence']}" - l2_active = [c for c in all_checks if c.level == 2 and not c.skipped] - l2_passed = sum(1 for c in l2_active if c.passed) - if l2_active: - correctness = round(l2_passed / len(l2_active) * 100) - except Exception as e: - logger.warning("LLM verification skipped: %s", e) - - # Cookie-policy only: actively HTTP-probe the Opt-Out + Privacy-Policy - # URLs the document advertises. Broken links make individual provider - # entries non-compliant under Art. 7(3) DSGVO. 
- if doc_type == "cookie": - try: - from compliance.services.cookie_link_validator import ( - extract_links, validate_links, build_check_items, - ) - links = extract_links(text) - if links: - logger.info("Cookie-link validator: %d urls extracted from %s", - len(links), label) - validated = await validate_links(links) - for item in build_check_items(validated): - all_checks.append(CheckItem(**item)) - # Re-compute correctness with the new L2 items - l2_active = [c for c in all_checks if c.level == 2 and not c.skipped] - l2_passed = sum(1 for c in l2_active if c.passed) - if l2_active: - correctness = round(l2_passed / len(l2_active) * 100) - except Exception as e: - logger.warning("Cookie-link validation skipped for %s: %s", label, e) - - non_score = [f for f in findings if "SCORE" not in f.get("code", "")] - return DocCheckResult( - label=label, url=url, doc_type=doc_type, - word_count=word_count or len(text.split()), - completeness_pct=completeness, correctness_pct=correctness, - checks=all_checks, findings_count=len(non_score), - ) - - -def _pad_results_with_missing( - results: list, - discovery_attempted: set[str] | None = None, -) -> list: - """Ensure every canonical doc_type has an entry in the results list. - - Doc_types the user did not submit AND auto-discovery did not find get - a placeholder DocCheckResult. The error message distinguishes: - - 'Auf der Website nicht gefunden' (discovery was attempted) - - 'Nicht eingereicht' (no submitted URLs to crawl from) - - Preserves the canonical ordering from _ALL_DOC_TYPES so the report - layout is stable. 
# Two-label public suffixes: for these the registrable name is the third
# label from the right, not the second.
_COMPOUND_TLDS = {
    "co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in",
    "com.au", "com.br", "com.mx", "com.tr", "com.sg",
}


def _extract_domain(doc_entries: list[dict]) -> str | None:
    """Return the lowercased host (without a leading ``www.``) of the first usable URL.

    Entries whose ``url`` is empty, has no ``://``, or parses to an empty
    host are skipped, so a later entry can still supply the domain.

    Args:
        doc_entries: Dicts that may carry a ``url`` key.

    Returns:
        The network location of the first usable URL, or ``None`` when no
        entry yields one.
    """
    from urllib.parse import urlparse

    for entry in doc_entries:
        url = entry.get("url", "")
        if not url or "://" not in url:
            continue
        host = urlparse(url).netloc.lower()
        if host.startswith("www."):
            host = host[4:]
        # Keep scanning instead of giving up on the first URL with an
        # empty host (e.g. "file:///...").
        if host:
            return host
    return None


def _company_name_from_url(doc_entries: list[dict]) -> str | None:
    """Derive a display company name from the entered URLs.

    Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"),
    uppercase short acronyms (<=4 chars, no hyphens), and capitalise each
    hyphen-separated part of anything longer. The first entry that yields a
    name wins.

    Examples:
        www.bmw.de              -> BMW
        mercedes-benz.de        -> Mercedes-Benz
        shop.example.co.uk      -> Example
        shop.example.co.uk:8443 -> Example
        juris.de                -> Juris

    Args:
        doc_entries: Dicts that may carry a ``url`` key.

    Returns:
        The derived name, or ``None`` when no entry has a URL with at least
        two host labels.
    """
    from urllib.parse import urlparse

    for entry in doc_entries:
        url = entry.get("url", "")
        if not url or "://" not in url:
            continue
        # `hostname` (unlike `netloc`) drops userinfo and the port; a
        # trailing ":8443" would otherwise defeat the compound-TLD lookup.
        host = urlparse(url).hostname or ""
        if host.startswith("www."):
            host = host[4:]
        parts = host.split(".")
        if len(parts) < 2:
            continue
        # Handle compound TLDs (.co.uk etc.)
        if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS:
            sld = parts[-3]
        else:
            sld = parts[-2]
        if not sld:
            continue
        if len(sld) <= 4 and "-" not in sld:
            return sld.upper()
        return "-".join(p.capitalize() for p in sld.split("-"))
    return None


def _get_skip_types(profile) -> dict[str, str]:
    """Return doc_types to skip entirely, each mapped to a reason message.

    Currently used mainly for the OEM configurator pattern
    (BMW/Audi/Mercedes): when the site does no direct sales, AGB, Widerruf
    and Nutzungsbedingungen are not mandatory on the website because they
    are handed out by the authorised dealer.

    Args:
        profile: Business profile; only its optional ``no_direct_sales``
            attribute is read (a missing attribute counts as ``False``).

    Returns:
        ``{doc_type: reason}`` for the skipped types, or an empty dict when
        nothing is skipped.
    """
    if not getattr(profile, "no_direct_sales", False):
        return {}
    msg = (
        "Nicht anwendbar — die Webseite schliesst keinen Direkt-"
        "Kaufvertrag (OEM-Konfigurator-Pattern, Vertrag laeuft "
        "ueber Vertragshaendler). AGB/Widerruf werden beim "
        "Haendler ausgehaendigt."
    )
    return {
        "agb": msg,
        "widerruf": msg,
        "nutzungsbedingungen": msg,
    }
- if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower(): - if profile.needs_odr: - if not check.passed: - check.hint = ( - "Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 " - "auf die OS-Plattform (https://ec.europa.eu/consumers/odr) " - "verlinken — klickbarer Link, nicht nur Text. Zusaetzlich " - "§36 VSBG: angeben, ob Sie an Verbraucher-" - "Streitbeilegungsverfahren teilnehmen (oder nicht)." - ) - else: - check.skipped = True - check.hint = "Nicht relevant (kein B2C Online-Shop)" - - # Widerruf: Flag entire document as unnecessary for B2B - if doc_type == "widerruf" and profile.business_type not in ("b2c", "unknown"): - check.severity = "INFO" - if not check.passed: - check.hint = ( - "Als B2B-Unternehmen benoetigen Sie keine Widerrufsbelehrung " - "(§355 BGB gilt nur fuer Verbrauchervertraege). " - "Empfehlung: Entfernen Sie die Widerrufsbelehrung von " - "Ihrer Website, da sie Verwirrung stiften kann." - ) - - # Regulated profession: check for Kammer info - if "kammer" in cid or "berufsordnung" in check.label.lower(): - if not profile.is_regulated_profession: - check.skipped = True - check.hint = "Nicht relevant (kein regulierter Beruf)" - - return result - - -# ── Helpers ────────────────────────────────────────────────────────── - -_DOC_TYPE_LABELS = { - "dse": "Datenschutzerklaerung", - "datenschutz": "Datenschutzerklaerung", - "privacy": "Datenschutzerklaerung", - "impressum": "Impressum", - "agb": "AGB", - "widerruf": "Widerrufsbelehrung", - "cookie": "Cookie-Richtlinie", - "avv": "Auftragsverarbeitung", - "loeschkonzept": "Loeschkonzept", - "dsfa": "Datenschutz-Folgenabschaetzung", - "social_media": "Social Media Datenschutz", - "nutzungsbedingungen": "Nutzungsbedingungen", - "dsb": "DSB-Kontakt", - # P74: Legal-Notice / Rechtliche Hinweise (IP, Forward-Looking, Risiko) - "legal_notice": "Rechtliche Hinweise", - # P96: Digital Services Act-Pflichtangaben (Art. 
def _doc_type_label(doc_type: str) -> str:
    """Return the display label for *doc_type*.

    The label comes from ``_DOC_TYPE_LABELS``; a type without an entry falls
    back to its upper-cased key.
    """
    fallback = doc_type.upper()
    return _DOC_TYPE_LABELS.get(doc_type, fallback)


def _result_to_dict(r) -> dict:
    """Flatten a DocCheckResult into a dict of plain attribute values.

    Each entry of ``r.checks`` becomes a dict restricted to a fixed set of
    check fields. ``scenario`` is optional on the result and defaults to an
    empty string.
    """
    check_fields = ("id", "label", "passed", "severity", "matched_text",
                    "level", "parent", "skipped", "hint")

    payload = {
        "label": r.label,
        "url": r.url,
        "doc_type": r.doc_type,
        "word_count": r.word_count,
        "completeness_pct": r.completeness_pct,
        "correctness_pct": r.correctness_pct,
    }
    serialized_checks = []
    for check in r.checks:
        serialized_checks.append(
            {name: getattr(check, name) for name in check_fields}
        )
    payload["checks"] = serialized_checks
    payload["findings_count"] = r.findings_count
    payload["error"] = r.error
    payload["scenario"] = getattr(r, "scenario", "")
    return payload


def _build_profile_html(profile) -> str:
    """Render the profile HTML by delegating to the report module."""
    # Imported at call time, as in the original.
    from .agent_doc_check_report import build_profile_html as render_profile
    return render_profile(profile)
@router.get("/audit/{check_id}") diff --git a/backend-compliance/compliance/services/consent_reachability_check.py b/backend-compliance/compliance/services/consent_reachability_check.py new file mode 100644 index 00000000..d47d61fc --- /dev/null +++ b/backend-compliance/compliance/services/consent_reachability_check.py @@ -0,0 +1,278 @@ +""" +B1 — Cookie-Consent-UX-001: Mobile Reachability of Consent Settings. + +DSGVO Art. 7 Abs. 3 requires that withdrawing consent must be as +easy as giving it. EDPB Cookie Banner Taskforce Report (2023) and +DSK OH Digitale Dienste v1.2 (2024) both demand a permanent, directly +reachable way to change cookie preferences — typically a Footer link +labelled "Cookie-Einstellungen" that re-opens the CMP in place. + +Common anti-patterns we want to flag: + - Footer points to a Cookie-Policy *page* in a new tab, no CMP + - Footer only offers "more info" but no "manage settings" + - Only mention is a verbal reference to browser settings inside the + privacy-policy text + - Mobile footer hides the link in a multi-level accordion + +This module does the STATIC HTML analysis. The dynamic part (mobile +viewport rendering, tap-target measurement, click-behaviour +verification) is performed by consent-tester via Playwright and feeds +back into `evaluate_combined` in a later phase. + +Pure module — no DB, no network. Tests live in +tests/test_consent_reachability_check.py. +""" + +from __future__ import annotations + +import logging +import re +from html.parser import HTMLParser +from urllib.parse import urljoin, urlparse + +logger = logging.getLogger(__name__) + +# Phrases that suggest "open the consent manager" rather than "show +# more info / open a policy page". 
+_REOPEN_PHRASES = ( + "cookie-einstellungen", "cookie einstellungen", + "cookie-präferenzen", "cookie praeferenzen", "cookie-praferenzen", + "cookie-einwilligung", "einwilligung verwalten", + "consent manager", "consent settings", "consent-einstellungen", + "datenschutz-einstellungen", "datenschutzeinstellungen", + "cookies verwalten", "manage cookies", "manage preferences", + "privacy settings", "privacy preferences", + "tracking-einstellungen", +) + +# Weaker — these usually point at a policy page, not the CMP itself. +_INFO_ONLY_PHRASES = ( + "cookie-richtlinie", "cookie richtlinie", "cookie-policy", + "cookie policy", "cookies (information)", + "datenschutz", "datenschutzerklärung", "privacy policy", + "weitere informationen", "more information", +) + +# Phrases that try to shift the burden to the user's browser — +# Bundesländer-Datenschutzbeauftragte explicitly call this insufficient. +_BROWSER_DEFLECTION_PHRASES = ( + "browser-einstellungen", "browsereinstellungen", + "einstellungen ihres browsers", "browser settings", + "in ihrem browser", "über ihren browser", +) + + +class _AnchorCollector(HTMLParser): + """Collects and ') + anchors = find_consent_anchors_in_footer(html) + assert anchors[0]["intent"] == "reopen_cmp" + + def test_info_only_link_to_policy(self): + html = _wrap('Cookie-Richtlinie') + anchors = find_consent_anchors_in_footer(html) + assert len(anchors) == 1 + assert anchors[0]["intent"] == "info_only" + + def test_browser_deflection_link(self): + html = _wrap('Browser-Einstellungen') + anchors = find_consent_anchors_in_footer(html) + assert anchors[0]["intent"] == "browser_deflect" + + def test_ignores_anchors_outside_footer(self): + html = ('' + 'Cookie-Einstellungen' + '' + '') + assert find_consent_anchors_in_footer(html) == [] + + def test_role_contentinfo_treated_as_footer(self): + html = ('' + '') + anchors = find_consent_anchors_in_footer(html) + assert len(anchors) == 1 + + def test_class_with_footer_treated_as_footer(self): + html = 
('' + '') + anchors = find_consent_anchors_in_footer(html) + assert len(anchors) == 1 + + def test_empty_html(self): + assert find_consent_anchors_in_footer("") == [] + + def test_malformed_html(self): + # broken markup shouldn't crash + anchors = find_consent_anchors_in_footer("