diff --git a/.claude/rules/loc-exceptions.txt b/.claude/rules/loc-exceptions.txt
index 401ce785..48ffb95a 100644
--- a/.claude/rules/loc-exceptions.txt
+++ b/.claude/rules/loc-exceptions.txt
@@ -122,9 +122,9 @@ consent-sdk/src/mobile/ios/ConsentManager.swift
consent-tester/services/dsi_discovery.py
# --- backend-compliance: unified compliance check orchestrator ---
-# Sequential 7-step pipeline (text resolve, profile detect, check documents,
-# banner scan, cross-check, profile extract, report). Phase 5 split target.
-backend-compliance/compliance/api/agent_compliance_check_routes.py
+# 2026-06-06: REMOVED — file split into agent_check/ subpackage
+# (19 files, main module now 347 LOC). Phase 5 target completed.
+# [guardrail-change]
# --- docs-src: binary office files (not source code) ---
# (Also excluded by extension in scripts/check-loc.sh — kept here for legibility.)
diff --git a/backend-compliance/compliance/api/agent_check/__init__.py b/backend-compliance/compliance/api/agent_check/__init__.py
new file mode 100644
index 00000000..b15c5367
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/__init__.py
@@ -0,0 +1,10 @@
+"""
+Subpackage for the compliance-check route — extracted to keep
+`agent_compliance_check_routes.py` under the 500-line guardrail.
+
+The route module still owns the public HTTP endpoints and re-exports
+all helpers from this subpackage, so external callers
+(`saving_scan_routes`, `agent_migration_routes`, tests) continue to
+import them from `compliance.api.agent_compliance_check_routes`
+unchanged.
+"""
diff --git a/backend-compliance/compliance/api/agent_check/_b1_wiring.py b/backend-compliance/compliance/api/agent_check/_b1_wiring.py
new file mode 100644
index 00000000..599a893d
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_b1_wiring.py
@@ -0,0 +1,105 @@
+"""B1 wiring — Mobile Consent-Reachability check + HTML block.
+
+Fetches the homepage of the first submitted URL, runs the static
+`evaluate_reachability` analysis on the footer, and renders the
+result as an HTML block for the audit mail.
+
+Only renders a block when the check FAILS — a passing site doesn't
+need a block. The block is severity-colored and lists the specific
+notes that triggered the finding (missing reopen anchor, new-tab
+break, browser-deflection language).
+"""
+
+from __future__ import annotations
+
+import html
+import logging
+
+import httpx
+
+from compliance.services.consent_reachability_check import (
+ evaluate_reachability,
+)
+
+from ._helpers import _update
+
+logger = logging.getLogger(__name__)
+
+
+async def run_b1(state: dict) -> None:
+ """Run the reachability check + render HTML. Mutates state in place.
+
+ Reads ``state["req"]`` (its ``documents``, each exposing a ``url``
+ attribute) and ``state["check_id"]``. On success writes
+ ``state["reachability_finding"]`` and ``state["reachability_html"]``.
+ Returns without writing either key when no homepage URL could be
+ derived from the submitted documents, when the homepage does not
+ answer HTTP 200, or when the fetch raises.
+ """
+ req = state["req"]
+ check_id = state["check_id"]
+ # Reduce a submitted document URL to its site root: scheme://host/
+ homepage_url = ""
+ for d in req.documents:
+ if d.url:
+ from urllib.parse import urlparse
+ p = urlparse(d.url)
+ if p.scheme and p.netloc:
+ homepage_url = f"{p.scheme}://{p.netloc}/"
+ break
+ if not homepage_url:
+ return
+
+ _update(check_id, "Mobile Consent-Reachability prüfen...", 95)
+ # Fetch the homepage with an iPhone Safari User-Agent, following
+ # redirects. This step is best-effort: failures are logged and the
+ # function returns without a finding instead of raising.
+ try:
+ async with httpx.AsyncClient(
+ timeout=20.0, follow_redirects=True,
+ headers={"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 "
+ "like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) "
+ "Version/17.5 Mobile/15E148 Safari/604.1"},
+ ) as c:
+ r = await c.get(homepage_url)
+ if r.status_code != 200:
+ logger.info("B1: homepage fetch %s → HTTP %d", homepage_url, r.status_code)
+ return
+ page_html = r.text
+ except Exception as e:
+ logger.warning("B1: homepage fetch failed: %s", e)
+ return
+
+ # Static analysis of the fetched HTML; the rendered block is "" when
+ # the finding reports passed (see _render_block).
+ finding = evaluate_reachability(page_html, homepage_url)
+ state["reachability_finding"] = finding
+ state["reachability_html"] = _render_block(finding)
+ logger.info(
+ "B1 Reachability: passed=%s severity=%s reason=%s",
+ finding["passed"], finding.get("severity"),
+ finding.get("severity_reason"),
+ )
+
+
+def _render_block(finding: dict) -> str:
+ """Render the reachability finding as an audit-mail HTML block.
+
+ Returns "" when ``finding["passed"]`` is truthy. Otherwise builds a
+ block from ``severity``, ``severity_reason``, ``notes`` and the
+ optional ``reopen_anchor`` dict (``text`` cut to 80 chars, ``href`` cut
+ to 120 chars, ``target_class``). Notes, severity_reason and the anchor
+ fields are passed through ``html.escape``.
+
+ NOTE(review): in this copy the HTML tags inside the string literals
+ appear to have been stripped (empty literals, literals broken across
+ lines) — confirm the markup against the repository version.
+ """
+ if finding["passed"]:
+ return ""
+ sev = (finding.get("severity") or "").upper()
+ # Red for HIGH, amber for every other severity value.
+ color = "#dc2626" if sev == "HIGH" else "#f59e0b"
+ notes_html = "".join(
+ f"
{html.escape(n)}" for n in finding.get("notes") or []
+ )
+ anchor = finding.get("reopen_anchor") or {}
+ anchor_html = ""
+ if anchor:
+ anchor_html = (
+ ""
+ "Gefundener Footer-Link: "
+ f"{html.escape((anchor.get('text') or '')[:80])} "
+ f"→ {html.escape((anchor.get('href') or '')[:120])} "
+ f"(target_class: {html.escape(anchor.get('target_class') or '—')})"
+ "
"
+ )
+ return (
+ f""
+ f"
"
+ "COOKIE-CONSENT-UX-001 — Mobile Consent-Reachability
"
+ f"
Severity: "
+ f"{sev} ({html.escape(finding.get('severity_reason') or '')})
"
+ "
"
+ "Art. 7 Abs. 3 DSGVO: Widerruf muss so einfach wie Erteilung sein. "
+ "Auf Mobile-Safari konnten wir folgendes Problem feststellen:
"
+ f"
"
+ f"{anchor_html}"
+ "
"
+ )
diff --git a/backend-compliance/compliance/api/agent_check/_b3_wiring.py b/backend-compliance/compliance/api/agent_check/_b3_wiring.py
new file mode 100644
index 00000000..8f6e1a9d
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_b3_wiring.py
@@ -0,0 +1,189 @@
+"""B3 wiring — Cross-doc retention consistency check + HTML block.
+
+Combines three sources of retention truth per cookie:
+
+ - DSI text (state["doc_texts"]["dse"] or "cookie")
+ - cookie-table `duration` from cmp_vendors[i]["cookies"][j]
+ - actual cookie expiry from banner_result["cookies_detailed"][k]
+
+and produces per-cookie findings + a TH-RETENTION theme summary. Only
+renders an HTML block when there are findings to show; the block is
+sorted by severity (HIGH first) and shows the top-10 mismatches.
+"""
+
+from __future__ import annotations
+
+import html
+import logging
+import time
+
+from compliance.services.retention_comparator import (
+ build_retention_theme_summary,
+ compare_retention,
+ extract_retention_claims,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _actual_max_age_seconds(cookie: dict) -> float | None:
+    """Return the cookie's lifetime in seconds, or None if it has none.
+
+    A positive numeric ``max_age`` wins and is returned as-is. Otherwise a
+    positive numeric ``expires`` is read as a Unix timestamp (seconds
+    since epoch) and converted to the time remaining from now. Missing,
+    non-numeric, zero or negative values — and timestamps already in the
+    past — yield None (treated as a session cookie).
+    """
+    def _is_positive_number(value: object) -> bool:
+        return isinstance(value, (int, float)) and value > 0
+
+    max_age = cookie.get("max_age")
+    if _is_positive_number(max_age):
+        return float(max_age)
+
+    expires_at = cookie.get("expires")
+    if not _is_positive_number(expires_at):
+        return None
+
+    remaining = expires_at - time.time()
+    return float(remaining) if remaining > 0 else None
+
+
+def run_b3(state: dict) -> None:
+    """Run the cross-document retention check and render its HTML block.
+
+    Reads ``state["doc_texts"]``, ``state["cmp_vendors"]`` and
+    ``state["banner_result"]``. Returns without touching ``state`` when
+    there is neither a "dse" nor a "cookie" text, or when the vendor list
+    contains no named cookie. Otherwise writes
+    ``state["retention_findings"]``, ``state["retention_theme_summary"]``
+    and ``state["retention_html"]``.
+    """
+    texts = state["doc_texts"]
+    vendors = state["cmp_vendors"]
+    banner = state["banner_result"]
+
+    dsi_text = texts.get("dse") or texts.get("cookie") or ""
+    if not dsi_text:
+        return
+
+    # One record per named cookie in the CMP vendor table.
+    records: list[dict] = []
+    vendor_names: list[str] = []
+    for vendor in vendors or []:
+        vendor_label = (vendor.get("name") or "").strip()
+        if vendor_label:
+            vendor_names.append(vendor_label)
+        for cookie in vendor.get("cookies") or []:
+            cookie_label = (cookie.get("name") or "").strip()
+            if cookie_label:
+                records.append({
+                    "name": cookie_label,
+                    "vendor": vendor_label,
+                    "table_duration": (cookie.get("duration")
+                                       or cookie.get("persistence")
+                                       or cookie.get("expiry") or ""),
+                    "actual_max_age": None,
+                })
+    if not records:
+        return
+    cookie_names = [rec["name"] for rec in records]
+
+    # Attach the observed lifetime from the banner scan, matching cookie
+    # names case-insensitively (the last observed cookie of a name wins).
+    if banner:
+        observed: dict[str, dict] = {}
+        for detail in banner.get("cookies_detailed") or []:
+            key = (detail.get("name") or "").lower()
+            if key:
+                observed[key] = detail
+        for rec in records:
+            hit = observed.get(rec["name"].lower())
+            if hit is not None:
+                rec["actual_max_age"] = _actual_max_age_seconds(hit)
+
+    claims = extract_retention_claims(dsi_text, cookie_names, vendor_names)
+
+    findings: list[dict] = [
+        compare_retention(
+            cookie_name=rec["name"],
+            table_duration=rec["table_duration"],
+            actual_max_age_seconds=rec["actual_max_age"],
+            dsi_claims=claims,
+            vendor_name=rec["vendor"] or None,
+        )
+        for rec in records
+    ]
+
+    summary = build_retention_theme_summary(findings)
+    state["retention_findings"] = findings
+    state["retention_theme_summary"] = summary
+    state["retention_html"] = _render_block(summary, findings)
+    logger.info(
+        "B3 Retention: %d findings, %d passed, %d failed, %d incomplete",
+        summary["total"], summary["passed"], summary["failed"],
+        summary["incomplete"],
+    )
+
+
+def _fmt_days(d: float | None) -> str:
+    """Format a duration given in days as a compact label.
+
+    None -> "—"; under 1 day -> whole hours ("12h"); under 30 days ->
+    whole days ("10d"); under 365 days -> whole 30-day months ("2mo");
+    otherwise years with one decimal ("1.5y").
+    """
+    if d is None:
+        return "—"
+    # (exclusive upper bound in days, formatter) — first match wins.
+    scales = (
+        (1, lambda days: f"{int(days * 24)}h"),
+        (30, lambda days: f"{int(days)}d"),
+        (365, lambda days: f"{int(days / 30)}mo"),
+    )
+    for upper_bound, render in scales:
+        if d < upper_bound:
+            return render(d)
+    return f"{d / 365:.1f}y"
+
+
+def _render_block(summary: dict, findings: list[dict]) -> str:
+ """Render the TH-RETENTION audit-mail block for failed findings.
+
+ Returns "" when ``summary["total"]`` is 0 or when no finding is a
+ real failure (a failure is a finding without truthy ``matches`` whose
+ ``severity_reason`` is not "incomplete"). Otherwise renders up to ten
+ failures, ordered HIGH, MEDIUM, LOW, other, and within a rank by
+ ``diff_days`` descending, plus the total/passed/failed/incomplete
+ counters from ``summary``.
+
+ NOTE(review): in this copy the HTML tags inside the string literals
+ appear to have been stripped (empty literals, literals broken across
+ lines) — confirm the markup against the repository version.
+ """
+ if summary["total"] == 0:
+ return ""
+ failed_findings = [f for f in findings if not f.get("matches")
+ and f.get("severity_reason") != "incomplete"]
+ if not failed_findings:
+ return "" # only matching / incomplete findings — no block needed
+ # Sort by severity (HIGH first) then diff_days desc
+ sev_rank = {"HIGH": 0, "MEDIUM": 1, "LOW": 2}
+ failed_findings.sort(key=lambda f: (
+ sev_rank.get((f.get("severity") or "").upper(), 9),
+ -(f.get("diff_days") or 0),
+ ))
+ rows = []
+ # Only the ten highest-ranked mismatches are rendered.
+ for f in failed_findings[:10]:
+ sev = (f.get("severity") or "").upper()
+ color = ("#dc2626" if sev == "HIGH"
+ else "#f59e0b" if sev == "MEDIUM" else "#64748b")
+ rows.append(
+ ""
+ f""
+ f"{html.escape(f.get('cookie_name') or '—')} | "
+ f""
+ f"{html.escape((f.get('vendor_name') or '—'))} | "
+ f""
+ f"DSI: {_fmt_days(f.get('dsi_days'))} • "
+ f"Tabelle: {_fmt_days(f.get('table_days'))} • "
+ f"Realität: {_fmt_days(f.get('actual_days'))} | "
+ f""
+ f"{sev} ({html.escape(f.get('mismatch_type') or '—')}) | "
+ "
"
+ )
+ total = summary["total"]
+ passed = summary["passed"]
+ failed = summary["failed"]
+ incomplete = summary["incomplete"]
+ return (
+ ""
+ "
"
+ "TH-RETENTION — Speicherdauer-Konsistenz (DSI ↔ Cookie-Tabelle ↔ Realität)"
+ "
"
+ "
"
+ f"{total} Cookies verglichen: "
+ f"{passed} ✓ / "
+ f"{failed} ✗ / "
+ f"{incomplete} ?
"
+ "
"
+ ""
+ "| Cookie | "
+ "Vendor | "
+ "Werte | "
+ "Mismatch | "
+ "
"
+ f"{''.join(rows)}"
+ "
"
+ "
"
+ )
diff --git a/backend-compliance/compliance/api/agent_check/_constants.py b/backend-compliance/compliance/api/agent_check/_constants.py
new file mode 100644
index 00000000..628f45de
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_constants.py
@@ -0,0 +1,93 @@
+"""Module-level constants + shared job state for the compliance-check
+route.
+
+`_compliance_check_jobs` is the SINGLE source of truth for in-flight
+job progress. Other modules MUST import the same object — never
+re-declare it — otherwise progress updates land in a detached dict.
+"""
+
+from __future__ import annotations
+
+# Internal hostname of the consent-tester container.
+CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"
+
+# In-memory job registry. Keyed by check_id. Values:
+# {"status": "running"|"completed"|"failed"|"skipped_tdm",
+# "progress": str, "progress_pct": int, "result": dict, ...}
+# Read/written by:
+# - agent_compliance_check_routes (start/status/_run/_update)
+# - saving_scan_routes (start)
+# - agent_migration_routes (status mirror)
+_compliance_check_jobs: dict[str, dict] = {}
+
+
+# Canonical doc types in the same order the frontend
+# ComplianceCheckTab renders them. The route pads `results` to always
+# include an entry for each — missing rows are flagged as 'Nicht
+# eingereicht' or 'Auf der Website nicht gefunden'.
+#
+# DSB-Kontakt is NOT canonical: per GDPR practice the DSB is named
+# inside the DSI/datenschutz document (email or contact block), not as
+# a separate page. We check 'DSB benannt' as a sub-check of the DSE.
+_ALL_DOC_TYPES = [
+ "dse", "impressum", "social_media", "cookie",
+ "agb", "nutzungsbedingungen", "widerruf",
+]
+
+
+# Human-readable labels per doc_type. Used in the report + emails.
+_DOC_TYPE_LABELS = {
+ "dse": "Datenschutzerklaerung",
+ "datenschutz": "Datenschutzerklaerung",
+ "privacy": "Datenschutzerklaerung",
+ "impressum": "Impressum",
+ "agb": "AGB",
+ "widerruf": "Widerrufsbelehrung",
+ "cookie": "Cookie-Richtlinie",
+ "avv": "Auftragsverarbeitung",
+ "loeschkonzept": "Loeschkonzept",
+ "dsfa": "Datenschutz-Folgenabschaetzung",
+ "social_media": "Social Media Datenschutz",
+ "nutzungsbedingungen": "Nutzungsbedingungen",
+ "dsb": "DSB-Kontakt",
+ # P74: Legal-Notice / Rechtliche Hinweise (IP, Forward-Looking, Risiko)
+ "legal_notice": "Rechtliche Hinweise",
+ # P96: Digital Services Act-Pflichtangaben (Art. 12+17 DSA)
+ "dsa": "DSA-Pflichtangaben",
+ # P97: Lizenzhinweise Dritter (OSS-Compliance)
+ "lizenzhinweise": "Lizenzhinweise Dritter",
+}
+
+
+# Title/URL keywords → canonical doc_type. Order matters: most-specific first.
+# Each entry is (doc_type, keywords); keywords are plain substrings.
+_DISCOVERY_RULES: list[tuple[str, tuple[str, ...]]] = [
+ ("cookie", ("cookie", "kuche", "biscuit", "cookies-")),
+ ("widerruf", ("widerruf", "rueckgabe", "rückgabe", "cancellation",
+ "right-of-withdrawal", "ruecktritts", "rücktritts")),
+ ("social_media", ("social-media", "soziale-medien", "social_media",
+ "social-media-policy")),
+ # P23: 'terms-and-conditions' can mean either general terms and
+ # conditions (AGB) or terms of use (Nutzungsbedingungen). The discovery
+ # function classifies more precisely later by title + content. This is
+ # only a URL hint:
+ ("agb", ("/agb", "geschaeftsbedingungen", "geschäftsbedingungen",
+ "general-terms")),
+ ("nutzungsbedingungen", ("nutzungsbedingung", "nutzungsbedingungen",
+ "terms-of-use", "terms-and-conditions",
+ "nutzungsordnung", "terms-of-service",
+ "allgemeine-nutzungsbedingungen")),
+ ("dsb", ("datenschutzbeauftragt", "data-protection-officer",
+ "dpo-contact", "/dsb")),
+ ("impressum", ("impressum", "imprint", "legal-notice", "site-notice",
+ "anbieterkennzeichnung", "legal-disclaimer-pool")),
+ ("dse", ("data-privacy", "datenschutz", "data-protection",
+ "privacy-policy", "privacy-notice", "dsgvo",
+ "data_privacy", "datenschutzinformation")),
+]
+
+
+# Compound TLDs that count as 2 labels when extracting the second-level
+# domain (e.g. shop.example.co.uk → 'example', not 'co').
+_COMPOUND_TLDS = {
+ "co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in",
+ "com.au", "com.br", "com.mx", "com.tr", "com.sg",
+}
diff --git a/backend-compliance/compliance/api/agent_check/_discovery.py b/backend-compliance/compliance/api/agent_check/_discovery.py
new file mode 100644
index 00000000..e7e4d392
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_discovery.py
@@ -0,0 +1,230 @@
+"""Auto-discovery of missing canonical doc-types.
+
+For each canonical type the user did NOT submit, try to find it on the
+homepage of the URLs they DID submit. Also follow same-owner subdomains
+mentioned in the submitted text (BMW Group → bmwgroup.com etc.).
+
+Discovered docs are classified by `_classify_discovered_doc` and merged
+back into `doc_entries`; entries that stayed empty get
+`discovery_attempted=True` so the padding step can differentiate
+"Nicht eingereicht" from "Auf der Website nicht gefunden".
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from urllib.parse import urlparse
+
+import httpx
+
+from ._constants import _ALL_DOC_TYPES, CONSENT_TESTER_URL
+from ._helpers import _classify_discovered_doc, _update
+
+logger = logging.getLogger(__name__)
+
+
+def _strip_www_prefix(netloc: str) -> str:
+    """Lower-case *netloc* and drop one literal leading ``www.`` label."""
+    host = netloc.lower()
+    # str.lstrip("www.") strips the character set {'w', '.'}, which mangles
+    # hosts such as "web.de" -> "eb.de"; slice off the exact prefix instead.
+    return host[4:] if host.startswith("www.") else host
+
+
+async def _autodiscover_missing(
+    check_id: str,
+    doc_entries: list[dict],
+    doc_texts: dict[str, str],
+    url_text_cache: dict[str, str],
+) -> None:
+    """For each canonical doc_type the user did not submit, try to find
+    the corresponding document on the homepage of the site they DID submit.
+
+    Modifies doc_entries in place: fills text/url/word_count and sets
+    `auto_discovered=True`. Marks `discovery_attempted=True` on every
+    missing entry (even when nothing was found) so the report can
+    distinguish 'Nicht eingereicht' from 'Auf der Website nicht gefunden'.
+
+    Args:
+        check_id: Job id used for the progress update.
+        doc_entries: Submitted document entries; extended / updated in place.
+        doc_texts: doc_type -> text map; receives the text of every
+            auto-discovered document.
+        url_text_cache: Accepted for interface compatibility; not used by
+            this function.
+
+    Crawl failures are logged and skipped; this function does not raise
+    for an unreachable consent-tester.
+    """
+    # VW fix: only doc types with substantial extracted text count as
+    # 'submitted'. A submitted URL that returned a 404 (VW
+    # cookie-richtlinie.html) or fewer than 200 characters (SPA shell) is
+    # treated as 'missing' so the discovery pass looks for alternatives.
+    _MIN_USEFUL_CHARS = 200
+    submitted_types = {
+        e["doc_type"] for e in doc_entries
+        if len((e.get("text") or "").strip()) >= _MIN_USEFUL_CHARS
+    }
+    # Submitted URLs that yielded too little text. Used for the log line
+    # below and to record `rejected_url` when a replacement is discovered.
+    failed_urls: set[str] = {
+        (e.get("url") or "").strip()
+        for e in doc_entries
+        if (e.get("url") or "").strip()
+        and len((e.get("text") or "").strip()) < _MIN_USEFUL_CHARS
+    }
+    if failed_urls:
+        logger.info(
+            "VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery "
+            "soll Alternativen probieren: %s",
+            len(failed_urls), _MIN_USEFUL_CHARS,
+            ", ".join(list(failed_urls)[:3]),
+        )
+    # Map alias types to canonical
+    submitted_canon = {
+        "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
+    }
+    # Missing = canonical types the user did NOT submit
+    missing = set(_ALL_DOC_TYPES) - submitted_canon
+    if not missing:
+        return
+
+    # Pick the most common base (scheme://netloc) from submitted URLs.
+    bases: dict[str, int] = {}
+    for e in doc_entries:
+        u = (e.get("url") or "").strip()
+        if u and "://" in u:
+            p = urlparse(u)
+            base = f"{p.scheme}://{p.netloc}"
+            bases[base] = bases.get(base, 0) + 1
+    if not bases:
+        # No submitted URL at all — nothing to crawl from. Add empty
+        # placeholders (with discovery_attempted=False) so the padding
+        # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden').
+        for dt in missing:
+            doc_entries.append({
+                "doc_type": dt, "url": "", "text": "", "word_count": 0,
+                "auto_discovered": False, "discovery_attempted": False,
+            })
+        return
+
+    # Build crawl plan: primary base + any related domains mentioned in
+    # the submitted texts that share the owner's SLD. Example: BMW Group
+    # text mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de.
+    primary_base = max(bases, key=bases.get) + "/"
+    crawl_bases: list[str] = [primary_base]
+    primary_netloc = _strip_www_prefix(urlparse(primary_base).netloc)
+    owner_token = primary_netloc.split(".")[0]  # 'bmw'
+
+    if owner_token and len(owner_token) >= 3:
+        domain_re = re.compile(
+            r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token)
+            + r"[a-z0-9\-]*\.[a-z]{2,}",
+            re.IGNORECASE,
+        )
+        seen_bases = {primary_base}
+        for entry in doc_entries:
+            text = entry.get("text") or ""
+            for m in domain_re.finditer(text):
+                p = urlparse(m.group(0))
+                base = f"{p.scheme}://{p.netloc}/"
+                base_netloc = _strip_www_prefix(p.netloc)
+                if base_netloc == primary_netloc:
+                    continue
+                if base in seen_bases:
+                    continue
+                seen_bases.add(base)
+                crawl_bases.append(base)
+                # Cap the plan at three bases (primary + two related).
+                if len(crawl_bases) >= 3:
+                    break
+            if len(crawl_bases) >= 3:
+                break
+
+    _update(
+        check_id,
+        f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...",
+        18,
+    )
+
+    # Each discovered doc is stored together with the base it came from so
+    # the per-document log line can name the right site.
+    discovered: list[tuple[str, dict]] = []
+    disc_payloads: list[dict] = []
+    disc_cookie_texts: list[str] = []
+    for base in crawl_bases:
+        try:
+            async with httpx.AsyncClient(timeout=300.0) as client:  # P90: 180s -> 300s
+                resp = await client.post(
+                    f"{CONSENT_TESTER_URL}/dsi-discovery",
+                    json={"url": base, "max_documents": 15},
+                    timeout=300.0,  # P90: 180s -> 300s
+                )
+                if resp.status_code != 200:
+                    logger.warning("auto-discovery: HTTP %d for %s",
+                                   resp.status_code, base)
+                    continue
+                body = resp.json()
+                discovered.extend(
+                    (base, doc) for doc in (body.get("documents", []) or [])
+                )
+                disc_payloads.extend(body.get("cmp_payloads") or [])
+                cmp_text = body.get("cmp_cookie_text") or ""
+                if cmp_text:
+                    disc_cookie_texts.append(cmp_text)
+                logger.info("auto-discovery on %s: %d docs, %d CMP payloads, "
+                            "cmp_cookie_text=%d words", base,
+                            len(body.get("documents", []) or []),
+                            len(body.get("cmp_payloads") or []),
+                            len(cmp_text.split()))
+        except Exception as e:
+            # P90: verbose exception for diagnosis. Best-effort: a failing
+            # base is skipped, the remaining bases are still crawled.
+            logger.warning("auto-discovery failed for %s: %s (%s)",
+                           base, str(e) or "(empty)", type(e).__name__)
+
+    # Classify each discovered doc into a canonical doc_type; the first
+    # sufficiently long doc per type wins.
+    by_type: dict[str, tuple[str, dict]] = {}
+    for source_base, d in discovered:
+        title = (d.get("title") or "").lower()
+        url = (d.get("url") or "").lower()
+        wc = d.get("word_count") or 0
+        if wc < 100:
+            continue
+        canon = _classify_discovered_doc(title, url)
+        if canon and canon in missing and canon not in by_type:
+            by_type[canon] = (source_base, d)
+
+    # Append/Update entry for every missing canonical type. Auto-discovered
+    # ones get the text/URL filled; unmatched ones stay empty so the
+    # padding step renders them as 'Auf der Website nicht gefunden'.
+    # VW fix: when an empty entry already exists (URL set, but the fetch
+    # returned no / tiny text), update it in place instead of duplicating.
+    filled = 0
+    for dt in missing:
+        existing = next((e for e in doc_entries
+                         if e.get("doc_type") == dt), None)
+        new_entry: dict = existing if existing else {
+            "doc_type": dt, "url": "", "text": "", "word_count": 0,
+            "auto_discovered": False, "discovery_attempted": True,
+            "cmp_payloads": [],
+        }
+        new_entry["discovery_attempted"] = True
+        source_base, d = by_type.get(dt, ("", None))
+        if d:
+            full = d.get("full_text") or d.get("text_preview") or ""
+            # For cookie: prefer the CMP-reconstructed text when it's
+            # substantially richer than the auto-discovered DOM extraction.
+            # BMW homepage CMP yields ~1800 words of authoritative policy;
+            # DOM extraction typically yields ~600 words of site chrome.
+            if dt == "cookie" and disc_cookie_texts:
+                cmp_merged = "\n\n".join(disc_cookie_texts)
+                if len(cmp_merged.split()) > len(full.split()):
+                    logger.info(
+                        "cookie: using CMP-reconstructed text (%d words) "
+                        "instead of DOM (%d words)",
+                        len(cmp_merged.split()), len(full.split()),
+                    )
+                    full = cmp_merged
+            if len(full.split()) >= 100:
+                new_entry["text"] = full
+                # Keep the original URL as "rejected_url" so the audit can
+                # show 'X was 404, we found Y'.
+                if existing and (existing.get("url") or "").strip() in failed_urls:
+                    new_entry["rejected_url"] = existing.get("url")
+                new_entry["url"] = d.get("url", "")
+                new_entry["word_count"] = len(full.split())
+                new_entry["auto_discovered"] = True
+                if dt == "cookie" and disc_payloads:
+                    new_entry["cmp_payloads"] = disc_payloads
+                doc_texts[dt] = full
+                filled += 1
+                # Log the base the document was actually discovered on, not
+                # whatever base the crawl loop happened to end with.
+                logger.info(
+                    "auto-discovered %s on %s: %s (%d words)%s",
+                    dt, source_base, d.get("url", "")[:80], new_entry["word_count"],
+                    " [REPLACED failed URL]" if existing else "",
+                )
+        if not existing:
+            doc_entries.append(new_entry)
+
+    logger.info(
+        "auto-discovery: filled %d/%d missing types from %s",
+        filled, len(missing), ", ".join(crawl_bases),
+    )
diff --git a/backend-compliance/compliance/api/agent_check/_fetch.py b/backend-compliance/compliance/api/agent_check/_fetch.py
new file mode 100644
index 00000000..52c96e7e
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_fetch.py
@@ -0,0 +1,142 @@
+"""URL → text fetch helper for the compliance-check pipeline.
+
+Tries the consent-tester service first (Playwright, full JS render +
+CMP capture). On any failure or empty result, falls back to a direct
+HTTP GET with an identifiable User-Agent and per-domain rate limiting.
+
+For cookie/dse/social_media doc types we cap discovery to 1 sub-page
+(the policy itself is authoritative). For Impressum/AGB/Widerruf and
+similar enterprise-split pages we follow up to 3 sub-pages.
+"""
+
+from __future__ import annotations
+
+import logging
+import re as _re
+
+import httpx
+
+from ._constants import CONSENT_TESTER_URL
+
+logger = logging.getLogger(__name__)
+
+
+async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
+ """Fetch text from URL via consent-tester, with HTTP fallback.
+
+ Returns (text, cmp_payloads). cmp_payloads is the raw CMP JSON captured
+ during navigation (ePaaS, OneTrust, …) — empty when no CMP fired or
+ HTTP fallback was used. Backend turns payloads into structured vendor
+ records for the VVT table in the email.
+
+ Returns ("", []) when neither path produced a result. The HTTP
+ fallback only accepts a 200 response with a text/html content type
+ that yields more than 100 words. Exceptions from either path are
+ logged, not raised.
+ """
+ # 1. Consent-tester (Playwright-based, full JS rendering).
+ # max_documents depends on doc_type:
+ # - cookie/dse/social_media: self-extract (often + CMP capture) is
+ # authoritative, sub-pages dilute the policy text. max=1.
+ # - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar
+ # enterprise sites split this across 3-4 short sub-pages
+ # (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows
+ # them. The 15s networkidle bail (dsi_helpers) keeps timing safe.
+ short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"}
+ max_docs = 1 if (doc_type or "") in short_extract_types else 3
+ try:
+ # P90: 120s is not enough for the BMW Impressum (auto-discovery
+ # follows 3 sub-docs). 240s leaves headroom. Mercedes currently also
+ # often fails at 120s because of Akamai latency.
+ async with httpx.AsyncClient(timeout=240.0) as client:
+ resp = await client.post(
+ f"{CONSENT_TESTER_URL}/dsi-discovery",
+ json={"url": url, "max_documents": max_docs},
+ timeout=240.0,
+ )
+ if resp.status_code == 200:
+ payload = resp.json()
+ docs = payload.get("documents", [])
+ cmp_payloads = payload.get("cmp_payloads") or []
+ cmp_cookie_text = payload.get("cmp_cookie_text") or ""
+ # D — when the consent-tester extracted HTML tables from the DOM,
+ # inject them into cmp_payloads as kind "html_table" entries so the
+ # backend can process them (per the original note: via
+ # cookies_table_parser). Tables with fewer than 3 rows are skipped.
+ for doc in (docs or []):
+ for tbl in (doc.get("tables") or []):
+ if not tbl or len(tbl) < 3:
+ continue
+ cmp_payloads.append({
+ "kind": "html_table",
+ "url": doc.get("url", ""),
+ "rows": tbl,
+ })
+ if docs:
+ texts = []
+ for doc in docs:
+ t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
+ # Ignore fragments of 50 characters or fewer.
+ if t and len(t) > 50:
+ texts.append(t)
+ merged = "\n\n".join(texts)
+ # For cookie/dse/social_media: when CMP reconstruction is
+ # substantially richer than DOM extraction, use it. This
+ # fixes the BMW case where DOM yields ~600 words of
+ # navigation but the ePaaS payload reconstructs to ~1800
+ # words of actual cookie policy.
+ if (doc_type in short_extract_types
+ and cmp_cookie_text
+ and len(cmp_cookie_text.split()) > len(merged.split())):
+ logger.info(
+ "Preferring CMP-reconstructed text for %s on %s "
+ "(%d words CMP vs %d words DOM)",
+ doc_type, url,
+ len(cmp_cookie_text.split()),
+ len(merged.split()),
+ )
+ merged = cmp_cookie_text
+ if merged and len(merged.split()) > 100:
+ if len(texts) > 1:
+ logger.info("Merged %d docs from %s (%d words)",
+ len(texts), url, len(merged.split()))
+ return merged, cmp_payloads
+ # P90 bug fix: even when the DSE text is too short for the
+ # 100-word threshold, do NOT discard the captured CMP payloads.
+ # BMW bug: the DSE yields a 10-word SPA shell, but the ePaaS JSON
+ # (393KB) was captured. The backend needs it for
+ # extract_vendors_from_payloads (VVT table).
+ if cmp_payloads:
+ logger.info(
+ "P90: keeping %d CMP payloads for %s despite "
+ "short text (%d words) — HTTP fallback runs in parallel",
+ len(cmp_payloads), url,
+ len((merged or cmp_cookie_text).split()),
+ )
+ fallback_text = merged or cmp_cookie_text or ""
+ return fallback_text, cmp_payloads
+ except Exception as e:
+ # P90: verbose exception for diagnosis (was empty before)
+ logger.warning("Consent-tester fetch failed for %s: %s (%s)",
+ url, str(e) or "(empty)", type(e).__name__)
+
+ # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW).
+ # P7: identifiable UA + per-domain rate limit.
+ try:
+ from compliance.services.compliance_user_agent import (
+ default_request_headers, DomainRateLimiter,
+ )
+ async with httpx.AsyncClient(
+ timeout=30.0, follow_redirects=True,
+ headers=default_request_headers(),
+ ) as client:
+ async with DomainRateLimiter(url):
+ resp = await client.get(url)
+ if resp.status_code == 200 and "text/html" in resp.headers.get("content-type", ""):
+ html = resp.text
+ # Strip HTML tags and collapse whitespace (no entity decoding
+ # happens here).
+ # NOTE(review): the first two patterns are empty in this copy —
+ # presumably script/style-removal regexes whose angle-bracket
+ # content was lost; confirm against the repository version.
+ text = _re.sub(r"", " ", html, flags=_re.DOTALL | _re.IGNORECASE)
+ text = _re.sub(r"", " ", text, flags=_re.DOTALL | _re.IGNORECASE)
+ text = _re.sub(r"<[^>]+>", " ", text)
+ text = _re.sub(r"\s+", " ", text).strip()
+ if len(text.split()) > 100:
+ logger.info("HTTP fallback for %s: %d words", url, len(text.split()))
+ return text, []
+ except Exception as e:
+ logger.warning("HTTP fallback failed for %s: %s", url, e)
+
+ return "", []
diff --git a/backend-compliance/compliance/api/agent_check/_helpers.py b/backend-compliance/compliance/api/agent_check/_helpers.py
new file mode 100644
index 00000000..4c8d5d28
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_helpers.py
@@ -0,0 +1,228 @@
+"""Pure helpers for the compliance-check route — no I/O, no async.
+
+Grouped here because each is small and they share the same constants
+imports. Splitting further would not improve readability.
+"""
+
+from __future__ import annotations
+
+import logging
+from urllib.parse import urlparse
+
+from ._constants import (
+ _ALL_DOC_TYPES,
+ _COMPOUND_TLDS,
+ _DISCOVERY_RULES,
+ _DOC_TYPE_LABELS,
+ _compliance_check_jobs,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _update(check_id: str, msg: str, pct: int | None = None) -> None:
+ """Store a progress message (and, if given, a percentage) on the in-memory job entry."""
+ job = _compliance_check_jobs[check_id]
+ job["progress"] = msg
+ if pct is not None:
+ job["progress_pct"] = max(0, min(100, int(pct)))  # clamp to the 0-100 range
+
+
+def _doc_type_label(doc_type: str) -> str:
+ return _DOC_TYPE_LABELS.get(doc_type, doc_type.upper())  # unmapped types fall back to the upper-cased key
+
+
+def _classify_discovered_doc(title: str, url: str) -> str | None:
+ """Return the canonical type of the first discovery rule with a keyword in title/URL, else None."""
+ haystack = f"{title} {url}"
+ for canon, keywords in _DISCOVERY_RULES:
+ if any(kw in haystack for kw in keywords):  # plain substring test; no case folding happens here
+ return canon
+ return None
+
+
+def _extract_domain(doc_entries: list[dict]) -> str | None:
+ """Return the host (no "www.", port or credentials) of the first entry with an absolute URL."""
+ for entry in doc_entries:
+ url = entry.get("url", "")
+ if url and "://" in url:
+ host = urlparse(url).hostname or ""  # hostname is lower-cased and drops port/userinfo
+ if host.startswith("www."):
+ host = host[4:]
+ return host or None
+ return None
+
+
+def _company_name_from_url(doc_entries: list[dict]) -> str | None:
+ """Derive a display company name from the entered URLs.
+
+ Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"),
+ uppercase short acronyms (<=4 chars, no hyphens), title-case the rest.
+
+ Examples:
+ www.bmw.de -> BMW
+ mercedes-benz.de -> Mercedes-Benz
+ shop.example.co.uk -> Example
+ juris.de -> Juris
+ """
+ for entry in doc_entries:
+ url = entry.get("url", "")
+ if not url or "://" not in url:
+ continue
+ host = urlparse(url).hostname or ""  # drops port/userinfo so they cannot leak into the name
+ if host.startswith("www."):
+ host = host[4:]
+ parts = host.split(".")
+ if len(parts) < 2:
+ continue
+ # Handle compound TLDs (.co.uk etc.)
+ if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS:
+ sld = parts[-3]
+ else:
+ sld = parts[-2]
+ if not sld:
+ continue
+ if len(sld) <= 4 and "-" not in sld:
+ return sld.upper()
+ return "-".join(p.capitalize() for p in sld.split("-"))
+ return None
+
+
+def _get_skip_types(profile) -> dict[str, str]:
+ """Doc_types to skip entirely with a per-type reason message.
+
+ Currently used mainly for the OEM configurator pattern (BMW/Audi/Mercedes):
+ when the site does no direct sales, terms and conditions, withdrawal notice
+ and terms of use are not mandatory on the website — they are handed
+ out by the authorised dealer instead.
+ """
+ if getattr(profile, "no_direct_sales", False):
+ msg = (
+ "Nicht anwendbar — die Webseite schliesst keinen Direkt-"
+ "Kaufvertrag (OEM-Konfigurator-Pattern, Vertrag laeuft "
+ "ueber Vertragshaendler). AGB/Widerruf werden beim "
+ "Haendler ausgehaendigt."
+ )
+ return {
+ "agb": msg,
+ "widerruf": msg,
+ "nutzungsbedingungen": msg,
+ }
+ return {}
+
+
+def _apply_profile_filter(result, profile, doc_type: str):
+ """Adjust the checks of ``result`` in place for the business profile and return ``result``.
+
+ Touches hint/skipped/severity of ODR/OS-link, withdrawal ("widerruf") and chamber checks.
+ """
+ for check in result.checks:
+ cid = check.id.lower()
+
+ # ODR/OS-Link: relevant ONLY for B2C online shops. The check's
+ # default hint is written for B2B (it explains why it's not
+ # relevant) — for B2C we must replace it with action-oriented
+ # guidance, otherwise the report contradicts itself.
+ if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower():
+ if profile.needs_odr:
+ if not check.passed:
+ check.hint = (
+ "Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 "
+ "auf die OS-Plattform (https://ec.europa.eu/consumers/odr) "
+ "verlinken — klickbarer Link, nicht nur Text. Zusaetzlich "
+ "§36 VSBG: angeben, ob Sie an Verbraucher-"
+ "Streitbeilegungsverfahren teilnehmen (oder nicht)."
+ )
+ else:
+ check.skipped = True
+ check.hint = "Nicht relevant (kein B2C Online-Shop)"
+
+ # Withdrawal doc: applies when business_type is neither "b2c" nor "unknown"
+ if doc_type == "widerruf" and profile.business_type not in ("b2c", "unknown"):
+ check.severity = "INFO"
+ if not check.passed:
+ check.hint = (
+ "Als B2B-Unternehmen benoetigen Sie keine Widerrufsbelehrung "
+ "(§355 BGB gilt nur fuer Verbrauchervertraege). "
+ "Empfehlung: Entfernen Sie die Widerrufsbelehrung von "
+ "Ihrer Website, da sie Verwirrung stiften kann."
+ )
+
+ # Chamber / professional-code checks are skipped unless the profession is regulated
+ if "kammer" in cid or "berufsordnung" in check.label.lower():
+ if not profile.is_regulated_profession:
+ check.skipped = True
+ check.hint = "Nicht relevant (kein regulierter Beruf)"
+
+ return result
+
+
+def _pad_results_with_missing(
+ results: list,
+ discovery_attempted: set[str] | None = None,
+) -> list:
+ """Ensure every canonical doc_type has an entry in the results list.
+
+ Doc_types the user did not submit AND auto-discovery did not find get
+ a placeholder DocCheckResult. The error message distinguishes:
+ - 'Auf der Website nicht gefunden' (doc_type is in discovery_attempted)
+ - 'Nicht eingereicht' (doc_type is not in discovery_attempted)
+
+ Preserves the canonical ordering from _ALL_DOC_TYPES so the report
+ layout is stable; results of non-canonical types are appended at the end.
+ """
+ from ..agent_doc_check_routes import DocCheckResult
+ attempted = discovery_attempted or set()
+
+ by_type: dict[str, object] = {}
+ for r in results:
+ canon = "dse" if r.doc_type in ("datenschutz", "privacy") else r.doc_type  # aliases fold into "dse"
+ by_type[canon] = r  # a later result of the same canonical type replaces the earlier one
+
+ ordered: list = []
+ for dt in _ALL_DOC_TYPES:
+ if dt in by_type:
+ ordered.append(by_type[dt])
+ continue
+ if dt in attempted:
+ msg = ("Auf der Website nicht gefunden — bitte URL des "
+ "Dokuments manuell eintragen, falls vorhanden")
+ else:
+ msg = "Nicht eingereicht — Quelle nicht angegeben"
+ ordered.append(DocCheckResult(
+ label=_doc_type_label(dt),
+ url="",
+ doc_type=dt,
+ word_count=0,
+ completeness_pct=0,
+ correctness_pct=0,
+ checks=[],
+ findings_count=0,
+ error=msg,
+ scenario="missing",
+ ))
+
+ extras = [r for r in results  # results whose (alias-folded) type is not canonical
+ if (r.doc_type if r.doc_type not in ("datenschutz", "privacy") else "dse")
+ not in _ALL_DOC_TYPES]
+ ordered.extend(extras)
+ return ordered
+
+
+def _result_to_dict(r) -> dict:
+ """Flatten a DocCheckResult and its checks into a plain dict; a missing scenario becomes ""."""
+ fields = ("id", "label", "passed", "severity", "matched_text",
+ "level", "parent", "skipped", "hint")  # attributes copied from every check item
+ return {
+ "label": r.label, "url": r.url, "doc_type": r.doc_type,
+ "word_count": r.word_count, "completeness_pct": r.completeness_pct,
+ "correctness_pct": r.correctness_pct,
+ "checks": [{f: getattr(c, f) for f in fields} for c in r.checks],
+ "findings_count": r.findings_count, "error": r.error,
+ "scenario": getattr(r, "scenario", ""),
+ }
+
+
+def _build_profile_html(profile) -> str:
+ from ..agent_doc_check_report import build_profile_html  # function-scope import (presumably avoids an import cycle — confirm)
+ return build_profile_html(profile)
diff --git a/backend-compliance/compliance/api/agent_check/_orchestrator.py b/backend-compliance/compliance/api/agent_check/_orchestrator.py
new file mode 100644
index 00000000..3fcbb4f1
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_orchestrator.py
@@ -0,0 +1,69 @@
+"""Thin orchestrator — runs the 6 phases of the compliance check.
+
+The original `_run_compliance_check` was a 1620-line monolith. It is
+now decomposed into six phases (A=resolve, B=profile+check,
+C=banner+extract, D=report-build [D1 raw vendors, D2 finalize,
+D3-top/mid/bot blocks], E=email, F=persist), each in its own module.
+
+State flows through a single mutable `dict` (see `_state.new_state`).
+This intentionally trades type safety for additive flexibility: the
+report-building phase routinely adds new optional keys for each new
+HTML block, and a typed dataclass would freeze the schema before the
+new blocks could land.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ._b1_wiring import run_b1
+from ._b3_wiring import run_b3
+from ._constants import _compliance_check_jobs
+from ._phase_a_resolve import run_phase_a
+from ._phase_b_profile_check import run_phase_b
+from ._phase_c_banner import run_phase_c
+from ._phase_d1_vendors_raw import run_phase_d1
+from ._phase_d2_vendors_finalize import run_phase_d2
+from ._phase_d3_blocks_bot import run_phase_d3_bot
+from ._phase_d3_blocks_mid import run_phase_d3_mid
+from ._phase_d3_blocks_top import run_phase_d3_top
+from ._phase_e_email import run_phase_e
+from ._phase_f_persist import run_phase_f
+from ._state import new_state
+
+logger = logging.getLogger(__name__)
+
+
+async def run_compliance_check(check_id: str, req) -> None:
+ """Background task: run phases A-F in order on one shared state dict; any error marks the job failed."""
+ state = new_state(check_id, req)
+ try:
+ # Phase A: TDM gate + Step 1 (resolve / discover / split / dedup)
+ continue_run = await run_phase_a(state)
+ if not continue_run:
+ return # TDM denied — job already marked skipped_tdm
+ # Phase B: Step 2 (profile detect) + Step 3 (per-doc checks)
+ await run_phase_b(state)
+ # Phase C: Step 3b-d (banner + cross-check + TCF) + Step 4
+ await run_phase_c(state)
+ # Phase D-1/D-2: Step 5 vendor extraction + finalize
+ await run_phase_d1(state)
+ await run_phase_d2(state)
+ # B1 + B3: cross-cutting checks that need the finalized vendor
+ # list + DSI text. Render their own HTML blocks consumed by
+ # phase D-3 bot's full_html composition.
+ await run_b1(state)
+ run_b3(state)
+ # Phase D-3 top/mid/bot: Step 5 HTML blocks
+ await run_phase_d3_top(state)
+ await run_phase_d3_mid(state)
+ await run_phase_d3_bot(state)
+ # Phase E: Step 6 send mail (with A1 ZIP attachment)
+ run_phase_e(state)
+ # Phase F: Step 7 persist + audit log + unified findings
+ run_phase_f(state)
+ except Exception as e:
+ logger.error("Compliance check %s failed: %s",
+ check_id, e, exc_info=True)
+ _compliance_check_jobs[check_id]["status"] = "failed"
+ _compliance_check_jobs[check_id]["error"] = str(e)[:500]  # stored message is capped at 500 chars
diff --git a/backend-compliance/compliance/api/agent_check/_phase_a_resolve.py b/backend-compliance/compliance/api/agent_check/_phase_a_resolve.py
new file mode 100644
index 00000000..b6bfa679
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_phase_a_resolve.py
@@ -0,0 +1,232 @@
+"""Phase A — TDM gate + text resolution + section split + dedup.
+
+Covers (in the original `_run_compliance_check`):
+ - TDM-reservation pre-check (§ 44b UrhG)
+ - Step 1 Resolve texts (URL fetch / pasted text / auto-reclassify)
+ - Step 1a Auto-discovery of missing canonical doc_types
+ - Step 1b Section splitting (shared URL → multiple doc_types,
+ DSI → Cookie/Social-Media auto-fill)
+ - Step 1c Cross-document keyword search
+ - P15 Dedup of doc_types referencing the same source document
+
+Returns True to continue, False if the run was aborted (TDM denied).
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ._constants import _compliance_check_jobs
+from ._discovery import _autodiscover_missing
+from ._fetch import _fetch_text
+from ._helpers import _update
+
+logger = logging.getLogger(__name__)
+
+
+async def run_phase_a(state: dict) -> bool:
+ """Run TDM gate + Step 1 + Step 1a-c + P15 dedup; mutates state, returns False on a TDM abort."""
+ check_id = state["check_id"]
+ req = state["req"]
+
+ # Reset anchor-locator cache per run (avoid cross-run leak)
+ try:
+ from compliance.services.doc_anchor_locator import reset_cache
+ reset_cache()
+ except Exception as e:
+ logger.debug("doc_anchor_locator cache reset skipped: %s", e)
+
+ # P7: TDM reservation check of the base domain (§ 44b UrhG).
+ # On reserved/denied: end the run immediately, no crawl.
+ try:
+ from compliance.services.tdm_reservation_check import (
+ check_tdm_reservation, is_crawl_allowed,
+ )
+ first_url = next(
+ (d.url for d in req.documents if d.url), "",
+ )
+ if first_url:
+ tdm = await check_tdm_reservation(first_url)
+ _compliance_check_jobs[check_id]["tdm"] = tdm
+ # P12: with tdm_override + reason the run is NOT aborted, only
+ # documented. An override without a reason (>= 10 chars) is ignored.
+ override_active = (
+ req.tdm_override
+ and len((req.tdm_override_reason or "").strip()) >= 10
+ )
+ if not is_crawl_allowed(tdm) and not override_active:
+ _compliance_check_jobs[check_id]["status"] = "skipped_tdm"
+ _compliance_check_jobs[check_id]["error"] = (
+ f"TDM-Vorbehalt fuer {tdm.get('domain')} erkannt "
+ f"(status={tdm.get('status')}) — Crawl nach § 44b "
+ f"UrhG nicht zulaessig. Signals: "
+ f"{[s.get('src') for s in tdm.get('signals', [])]}"
+ )
+ _compliance_check_jobs[check_id]["progress_pct"] = 100
+ logger.info("TDM-skip check_id=%s domain=%s status=%s",
+ check_id, tdm.get("domain"), tdm.get("status"))
+ return False
+ if override_active and not is_crawl_allowed(tdm):
+ _compliance_check_jobs[check_id]["tdm_override"] = {
+ "reason": req.tdm_override_reason.strip()[:500],
+ "original_status": tdm.get("status"),
+ }
+ logger.warning(
+ "TDM-Override aktiv: check_id=%s domain=%s "
+ "status=%s reason=%r",
+ check_id, tdm.get("domain"), tdm.get("status"),
+ req.tdm_override_reason.strip()[:80],
+ )
+ except Exception as e:
+ logger.warning("TDM-check failed (proceeding): %s", e)
+
+ # Step 1: Resolve texts (fetch from URL if needed) — 0-30%
+ _update(check_id, "Texte werden geladen...", 1)
+ doc_texts: dict[str, str] = {}
+ doc_entries: list[dict] = []
+
+ # Cache fetched URLs to detect duplicates
+ url_text_cache: dict[str, str] = {}
+
+ n_docs = max(1, len(req.documents))
+ # Vendors from user-pasted tables (no LLM needed) — merged into
+ # cmp_vendors later on.
+ pasted_table_vendors: list[dict] = []
+ for i, doc in enumerate(req.documents):
+ pct = int(1 + (i / n_docs) * 29)
+ _update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct)
+ text = (doc.text or "").strip()
+ input_source = "url"
+ cmp_payloads: list[dict] = []
+ if text:
+ input_source = "text"
+ if doc.url:
+ input_source = "text+url" # user filled in both fields
+ logger.info(
+ "doc_type=%s: User hat URL UND Text geliefert — "
+ "Text gewinnt, URL wird als Quellen-Referenz behalten",
+ doc.doc_type,
+ )
+ elif doc.url:
+ url_key = doc.url.strip().rstrip("/").lower()
+ if url_key in url_text_cache:
+ text = url_text_cache[url_key]
+ else:
+ text, cmp_payloads = await _fetch_text(doc.url, doc_type=doc.doc_type)
+ if text:
+ url_text_cache[url_key] = text
+
+ # Auto-reclassify check: if the user pasted text into the wrong
+ # doc-type field (e.g. imprint text into the privacy-policy field),
+ # detect that and re-tag the entry where appropriate.
+ actual_doc_type = doc.doc_type
+ reclassify_hint: dict | None = None
+ if input_source.startswith("text") and len(text) >= 500:
+ try:
+ from compliance.services.doc_type_classifier import (
+ detect_mismatch,
+ )
+ reclassify_hint = detect_mismatch(doc.doc_type, text)
+ if reclassify_hint and reclassify_hint["action"] == "reclassify":
+ actual_doc_type = reclassify_hint["detected"]
+ logger.info(
+ "doc_type AUTO-RECLASSIFY: deklariert=%s "
+ "erkannt=%s (score %d vs %d) — uebernehme erkannten Typ",
+ doc.doc_type, actual_doc_type,
+ reclassify_hint["detected_score"],
+ reclassify_hint["declared_score"],
+ )
+ except Exception as e:
+ logger.warning("doc_type_classifier failed: %s", e)
+
+ # Cookie table: if the user pasted a table, parse it deterministically
+ # (no LLM needed) and derive the vendors right away.
+ if input_source.startswith("text") and actual_doc_type == "cookie":
+ try:
+ from compliance.services.cookies_table_parser import (
+ parse_cookie_table,
+ )
+ tab_vendors = parse_cookie_table(text)
+ if tab_vendors:
+ pasted_table_vendors.extend(tab_vendors)
+ logger.info(
+ "Cookie-Tabelle erkannt im pasted Text — "
+ "%d Vendors / %d Cookies deterministisch geparst",
+ len(tab_vendors),
+ sum(len(v.get("cookies", [])) for v in tab_vendors),
+ )
+ except Exception as e:
+ logger.warning("cookies_table_parser failed: %s", e)
+
+ if text:
+ doc_texts[actual_doc_type] = text
+ doc_entries.append({
+ "doc_type": actual_doc_type,
+ "declared_doc_type": doc.doc_type,
+ "url": doc.url,
+ "text": text,
+ "word_count": len(text.split()) if text else 0,
+ "auto_discovered": False,
+ "discovery_attempted": False,
+ "cmp_payloads": cmp_payloads,
+ "input_source": input_source,
+ "reclassify_hint": reclassify_hint,
+ })
+
+ # Step 1a-bis: AUTO-DISCOVERY
+ await _autodiscover_missing(
+ check_id, doc_entries, doc_texts, url_text_cache,
+ )
+
+ # Step 1b: Section splitting — two cases:
+ # 1. Same URL used for multiple doc_types → split by heading
+ # 2. DSI text contains Cookie/Social-Media sections → auto-fill empty rows
+ from compliance.services.section_splitter import (
+ split_shared_texts, auto_fill_from_dsi, cross_search_documents,
+ )
+ split_shared_texts(doc_entries, url_text_cache)
+ auto_fill_from_dsi(doc_entries)
+
+ # Step 1c: Cross-document search — find doc_types in wrong documents (30-35%)
+ _update(check_id, "Dokumente werden uebergreifend durchsucht...", 32)
+ placement_findings = cross_search_documents(doc_entries)
+
+ # Refresh doc_texts after all splitting/searching
+ for entry in doc_entries:
+ if entry.get("text"):
+ doc_texts[entry["doc_type"]] = entry["text"]
+
+ # P15: dedupe — when several doc types reference THE SAME document
+ # (e.g. Safetykon: user enters /datenschutz for dse + cookie + widerruf),
+ # only the primary doc type is kept. The others are emptied and get a
+ # "dup_of" note. Priority is the order of _DOC_PRIORITY below.
+ _DOC_PRIORITY = ["dse", "impressum", "cookie", "widerruf", "agb",
+ "nutzungsbedingungen", "social_media", "dsb"]
+ seen_text_hash: dict[int, str] = {}
+ for dt in _DOC_PRIORITY:
+ entry = next((e for e in doc_entries if e.get("doc_type") == dt
+ and e.get("text")), None)
+ if not entry:
+ continue
+ text_hash = hash((entry.get("text") or "").strip()[:1000])
+ if text_hash in seen_text_hash:
+ primary = seen_text_hash[text_hash]
+ logger.info(
+ "P15 dedup: doc_type=%s referenziert dasselbe Dokument "
+ "wie %s (URL=%s) -> als Duplikat markiert.",
+ dt, primary, (entry.get("url") or "")[:60],
+ )
+ entry["text"] = ""
+ entry["word_count"] = 0
+ entry["url"] = ""
+ entry["dup_of"] = primary
+ doc_texts.pop(dt, None)
+ else:
+ seen_text_hash[text_hash] = dt
+
+ state["doc_texts"] = doc_texts
+ state["doc_entries"] = doc_entries
+ state["url_text_cache"] = url_text_cache
+ state["pasted_table_vendors"] = pasted_table_vendors
+ state["placement_findings"] = placement_findings
+ return True
diff --git a/backend-compliance/compliance/api/agent_check/_phase_b_profile_check.py b/backend-compliance/compliance/api/agent_check/_phase_b_profile_check.py
new file mode 100644
index 00000000..b19c5ed9
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_phase_b_profile_check.py
@@ -0,0 +1,183 @@
+"""Phase B — Business-profile detection + per-document checks.
+
+Covers (in the original `_run_compliance_check`):
+ - Step 2 Detect business profile (with optional homepage merge for
+ P16 keywords)
+ - Step 3 Run regex + MC + LLM checks on each submitted document
+ (`_check_single`), applying skip rules + profile filter
+ + placement findings
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import re as _re
+from dataclasses import asdict
+
+import httpx
+
+from ._helpers import (
+ _apply_profile_filter,
+ _doc_type_label,
+ _get_skip_types,
+ _update,
+)
+from ._single_check import _check_single
+
+logger = logging.getLogger(__name__)
+
+
+async def run_phase_b(state: dict) -> None:
+ """Detect the business profile, then check each document entry; results are written to state."""
+ check_id = state["check_id"]
+ req = state["req"]
+ doc_texts = state["doc_texts"]
+ doc_entries = state["doc_entries"]
+ placement_findings = state["placement_findings"]
+
+ # Step 2: Detect business profile (35-40%)
+ from compliance.services.business_profiler import detect_business_profile
+ _update(check_id, "Geschaeftsmodell wird erkannt...", 37)
+ # P16: include the homepage text in profile detection (no_direct_sales
+ # B2B indicators such as "CE-Zertifizierung" / "Schulungen" often appear
+ # only in the homepage menu, not in the mandatory legal texts).
+ profile_input = dict(doc_texts)
+ try:
+ base_url = ""
+ for e in doc_entries:
+ if e.get("url"):
+ from urllib.parse import urlparse
+ p = urlparse(e["url"])
+ if p.scheme and p.netloc:
+ base_url = f"{p.scheme}://{p.netloc}/"
+ break
+ if base_url:
+ async with httpx.AsyncClient(
+ timeout=8.0, follow_redirects=True,
+ headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
+ "AppleWebKit/537.36 HeadlessChrome/120.0.0.0"},
+ ) as _hc:
+ _hr = await _hc.get(base_url)
+ if _hr.status_code == 200 and "text/html" in _hr.headers.get(
+ "content-type", ""):
+ _html = _hr.text[:60000]
+ _html = _re.sub(r"<script[^>]*>.*?</script>", " ",
+ _html, flags=_re.DOTALL | _re.IGNORECASE)
+ _html = _re.sub(r"<style[^>]*>.*?</style>", " ",
+ _html, flags=_re.DOTALL | _re.IGNORECASE)
+ _html = _re.sub(r"<[^>]+>", " ", _html)
+ _html = _re.sub(r"\s+", " ", _html).strip()
+ if len(_html.split()) > 30:
+ profile_input["__homepage"] = _html[:20000]
+ logger.info("P16 homepage merged for profile: %d words",
+ len(_html.split()))
+ except Exception as e:
+ logger.debug("homepage fetch for profile failed: %s", e)
+ profile = await detect_business_profile(profile_input)
+ profile_dict = asdict(profile)
+
+ # Step 3: Check each document
+ from ..agent_doc_check_routes import CheckItem, DocCheckResult
+ results: list[DocCheckResult] = []
+ total_findings = 0
+ use_agent_flag = req.use_agent or os.getenv(
+ "COMPLIANCE_USE_AGENT", "false",
+ ).lower() == "true"
+
+ # Filter out doc_types that don't apply to this business profile
+ skip_types = _get_skip_types(profile)
+
+ # Derive business_scope hints for the MC filter (O1 — Doc-type Scope-Flag).
+ # MCs that explicitly require a feature (e.g. 'biometric_processing',
+ # 'ai_decision_making', 'child_targeting') get dropped when the
+ # detected profile doesn't declare it.
+ business_scope: set[str] = set()
+ for svc in (getattr(profile, "detected_services", []) or []):
+ business_scope.add(str(svc).lower())
+ if (getattr(profile, "business_type", "") or "").lower() == "b2c":
+ business_scope.add("b2c")
+ if getattr(profile, "has_online_shop", False):
+ business_scope.add("ecommerce")
+ if getattr(profile, "is_regulated_profession", False):
+ business_scope.add("regulated_profession")
+
+ # Document checks: 40-80%
+ n_entries = max(1, len(doc_entries))
+ for i, entry in enumerate(doc_entries):
+ text = entry["text"]
+ doc_type = entry["doc_type"]
+ label = _doc_type_label(doc_type)
+ url = entry["url"]
+
+ if doc_type in skip_types:
+ results.append(DocCheckResult(
+ label=label, url=url, doc_type=doc_type,
+ error=skip_types[doc_type],
+ ))
+ continue
+
+ pct = int(40 + (i / n_entries) * 40)
+ _update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)
+
+ if not text or len(text) < 50:
+ # P15: duplicate doc that was deduped against a primary doc
+ if entry.get("dup_of"):
+ results.append(DocCheckResult(
+ label=label, url="", doc_type=doc_type,
+ error=f"Nicht separat vorhanden — wird im Dokument "
+ f"'{_doc_type_label(entry['dup_of'])}' "
+ f"mit-geprueft.",
+ ))
+ continue
+ # P24: the DPO contact is mandatory content of the privacy policy
+ # (Art. 13(1)(b) GDPR) — a missing standalone DPO document is
+ # NOT an error. The DPO check runs against the privacy policy anyway.
+ if doc_type == "dsb" and not (entry.get("url") or "").strip():
+ results.append(DocCheckResult(
+ label=label, url="", doc_type=doc_type,
+ error="Nicht separat vorhanden — DSB-Kontaktdaten "
+ "werden in der Datenschutzerklaerung als "
+ "Pflichtangabe nach Art. 13(1)(b) DSGVO geprueft.",
+ ))
+ continue
+ # Empty entry — either from auto-discovery padding (no URL
+ # to fetch) or from a fetch that returned nothing. If there
+ # was a URL we keep the error so the user knows the fetch
+ # failed; otherwise let the padding step label it
+ # 'Nicht eingereicht' / 'Auf der Website nicht gefunden'.
+ if (entry.get("url") or "").strip():
+ results.append(DocCheckResult(
+ label=label, url=url, doc_type=doc_type,
+ error="Kein Text vorhanden oder zu kurz",
+ ))
+ continue
+
+ result = await _check_single(
+ text, doc_type, label, url,
+ entry["word_count"], use_agent_flag,
+ business_scope=business_scope,
+ business_profile={"no_direct_sales": getattr(profile, "no_direct_sales", False)},
+ )
+
+ # Apply profile context filter
+ result = _apply_profile_filter(result, profile, doc_type)
+
+ # Add placement findings — but only if the regex checks confirm
+ # the text doesn't match. If completeness >= 50%, the text IS the
+ # right doc_type despite missing cross-search keywords.
+ if result.completeness_pct < 50:
+ for pf in placement_findings:
+ if pf.get("doc_type") == doc_type:
+ result.checks.insert(0, CheckItem(**{
+ k: v for k, v in pf.items() if k != "doc_type"
+ }))
+
+ results.append(result)
+ total_findings += result.findings_count
+
+ state["profile"] = profile
+ state["profile_dict"] = profile_dict
+ state["business_scope"] = business_scope
+ state["results"] = results
+ state["total_findings"] = total_findings
diff --git a/backend-compliance/compliance/api/agent_check/_phase_c_banner.py b/backend-compliance/compliance/api/agent_check/_phase_c_banner.py
new file mode 100644
index 00000000..00ec3384
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_phase_c_banner.py
@@ -0,0 +1,129 @@
+"""Phase C — Banner scan + Cookie/DSE cross-check + TCF check + profile extract.
+
+Covers (in the original `_run_compliance_check`):
+ - Step 3b Cookie-banner scan via consent-tester /scan (homepage,
+ 3-phase consent test)
+ - Step 3c Cross-check banner findings vs. cookie-policy text
+ - Step 3d TCF vendor vs. DSI cross-check + VVT entries
+ - Step 4 Extract profile hints from documents
+ - Step 4b Determine scenario per document (skip / regenerate / fix /
+ import)
+ - Step 4c Pad missing canonical doc_types so the report always shows
+ every checklist row
+"""
+
+from __future__ import annotations
+
+import logging
+
+import httpx
+
+from ._constants import CONSENT_TESTER_URL
+from ._helpers import _pad_results_with_missing, _update
+
+logger = logging.getLogger(__name__)
+
+
+async def run_phase_c(state: dict) -> None:
+ """Run banner scan, cross-checks, profile extraction and result padding; writes the outcome to state."""
+ check_id = state["check_id"]
+ req = state["req"]
+ doc_texts = state["doc_texts"]
+ doc_entries = state["doc_entries"]
+ results = state["results"]
+ profile_dict = state["profile_dict"]
+
+ # Step 3b: Banner-Check (automatic, uses the first submitted URL's homepage)
+ banner_result = None
+ banner_url = next((d.url for d in (req.documents or []) if d.url), "")
+ # Use the homepage (strip path) for banner check
+ if banner_url:
+ from urllib.parse import urlparse
+ parsed = urlparse(banner_url)
+ banner_url = f"{parsed.scheme}://{parsed.netloc}" if parsed.scheme and parsed.netloc else ""
+ if banner_url:
+ _update(check_id, "Cookie-Banner wird geprueft...", 82)
+ try:
+ async with httpx.AsyncClient(timeout=900.0) as client: # P50: +10min for vendor-detail-phase
+ resp = await client.post(
+ f"{CONSENT_TESTER_URL}/scan",
+ json={"url": banner_url, "timeout_per_phase": 10},
+ )
+ if resp.status_code == 200:
+ banner_result = resp.json()
+ except Exception as e:
+ logger.warning(
+ "Banner check failed: %s (%s)", e or "", type(e).__name__,
+ )
+
+ # Step 3c: Cross-check Banner vs Cookie-Richtlinie (88-90%)
+ if banner_result and "cookie" in doc_texts:
+ from compliance.services.banner_cookie_cross_check import (
+ cross_check_banner_vs_cookie,
+ )
+ from ..agent_doc_check_routes import CheckItem
+ _update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...", 89)
+ cross_findings = cross_check_banner_vs_cookie(
+ banner_result, doc_texts["cookie"],
+ )
+ if cross_findings:
+ for r in results:
+ if r.doc_type == "cookie":
+ for cf in cross_findings:
+ r.checks.append(CheckItem(**cf))
+ l2 = [c for c in r.checks if c.level == 2 and not c.skipped]
+ l2p = sum(1 for c in l2 if c.passed)
+ r.correctness_pct = round(l2p / len(l2) * 100) if l2 else 0
+
+ # Step 3d: TCF Vendor cross-check against DSI
+ tcf_vendors = banner_result.get("tcf_vendors", []) if banner_result else []
+ vvt_entries: list[dict] = []
+ if tcf_vendors and "dse" in doc_texts:
+ _update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...", 91)
+ from compliance.services.banner_cookie_cross_check import (
+ cross_check_vendors_vs_dsi,
+ )
+ from compliance.services.vendor_vvt_mapper import map_vendors_to_vvt
+ from ..agent_doc_check_routes import CheckItem
+ vendor_findings = cross_check_vendors_vs_dsi(tcf_vendors, doc_texts["dse"])
+ if vendor_findings:
+ for r in results:
+ if r.doc_type == "dse":
+ for vf in vendor_findings:
+ r.checks.append(CheckItem(**vf))
+ vvt_entries = map_vendors_to_vvt(tcf_vendors)
+
+ # Step 4: Extract profile hints from documents (92-95%)
+ _update(check_id, "Profil wird aus Dokumenten extrahiert...", 93)
+ from compliance.services.profile_extractor import (
+ extract_profile_from_documents,
+ )
+ extracted_profile = extract_profile_from_documents(doc_texts, profile_dict)
+
+ # Step 4b: Determine scenario per document
+ for r in results:
+ if r.error:
+ r.scenario = "skip"
+ elif r.completeness_pct < 30:
+ r.scenario = "regenerate"
+ elif r.completeness_pct < 95:
+ r.scenario = "fix"
+ else:
+ r.scenario = "import"
+
+ # Step 4c: Always render all 8 canonical doc types. Missing types
+ # are differentiated:
+ # - Discovery was tried but found nothing -> 'Auf der Website
+ # nicht gefunden' (suggest user provides URL manually)
+ # - No submitted URLs at all -> 'Nicht eingereicht'
+ attempted = {
+ e["doc_type"] for e in doc_entries if e.get("discovery_attempted")
+ }
+ results = _pad_results_with_missing(results, discovery_attempted=attempted)
+
+ state["banner_result"] = banner_result
+ state["banner_url"] = banner_url
+ state["tcf_vendors"] = tcf_vendors
+ state["vvt_entries"] = vvt_entries
+ state["extracted_profile"] = extracted_profile
+ state["results"] = results
diff --git a/backend-compliance/compliance/api/agent_check/_phase_d1_vendors_raw.py b/backend-compliance/compliance/api/agent_check/_phase_d1_vendors_raw.py
new file mode 100644
index 00000000..ca75e24d
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_phase_d1_vendors_raw.py
@@ -0,0 +1,315 @@
+"""Phase D-1 — Vendor-extraction raw stages.
+
+Covers (in the original Step 5 of `_run_compliance_check`):
+ - Aggregate cmp_payloads from all doc_entries + banner_result (P30/P48)
+ - Fallback: use DSE text when cookie was deduped (P17-D)
+ - Extract structured vendor records from CMP payloads
+ - LLM-cascade fallback when structured extract yields < 5 vendors (P52)
+ - Phase-G vendor-details append (P57)
+ - HTML-table DOM parse (Stage D)
+ - Crawled cookie-table parse (Stage B)
+ - Tesseract OCR over evidence slices (Stage C) — also captures the
+ cookie_evidence_slices used by A1 e-mail attachment
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ._helpers import _company_name_from_url, _update
+
+logger = logging.getLogger(__name__)
+
+
+async def run_phase_d1(state: dict) -> None:
+ """Vendor-extract raw stages. Mutates state in place."""
+ check_id = state["check_id"]
+ doc_entries = state["doc_entries"]
+ doc_texts = state["doc_texts"]
+ banner_result = state["banner_result"]
+ pasted_table_vendors = state["pasted_table_vendors"]
+
+ cmp_vendors: list[dict] = []
+ cookie_payloads: list[dict] = []
+ cookie_text = ""
+ cookie_evidence_slices: list[dict] | None = None
+ cookie_evidence_meta: dict | None = None
+
+ try:
+ from compliance.services.vendor_extractor import (
+ extract_vendors_from_payloads,
+ )
+
+ # P30: aggregate cmp_payloads from ALL doc_entries — sites
+ # like Mercedes load Usercentrics only on the homepage, so the
+ # JSON gets captured during DSE/Impressum discovery, not in the
+ # cookies.html fetch. Dedup by URL since the same payload is
+ # captured on every page load.
+ seen_cmp_urls: set[str] = set()
+ for e in doc_entries:
+ for p in (e.get("cmp_payloads") or []):
+ p_url = p.get("url") or ""
+ if p_url and p_url in seen_cmp_urls:  # payloads without a URL are never deduped
+ continue
+ seen_cmp_urls.add(p_url)
+ cookie_payloads.append(p)
+ if e.get("doc_type") == "cookie" and e.get("text"):
+ cookie_text = e["text"]  # a later cookie entry overwrites an earlier one
+ # P48: also pull cmp_payloads from the Banner-Scan (homepage 3-phase
+ # consent test). Mercedes' Usercentrics-JSON is captured there even
+ # when not in DSI-Discovery of static legal pages.
+ if banner_result:
+ for p in (banner_result.get("cmp_payloads") or []):
+ p_url = p.get("url") or ""
+ if p_url and p_url in seen_cmp_urls:
+ continue
+ seen_cmp_urls.add(p_url)
+ cookie_payloads.append(p)
+ if cookie_payloads:
+ logger.info("P48: %d CMP-payloads available for vendor-extract "
+ "(after Banner-Scan merge)", len(cookie_payloads))
+ # P17-D: fallback when the cookie doc was deduped via P15 — use the DSE
+ # text if it contains cookie-related terms, so that the LLM vendor
+ # extract can still kick in.
+ if not cookie_text and not cookie_payloads:
+ dse_t = doc_texts.get("dse", "")
+ if dse_t and any(w in dse_t.lower() for w in
+ ("cookie", "tracking", "google analytics", "consent")):
+ cookie_text = dse_t
+ logger.info("P17-D: vendor-extract Fallback auf DSE (Cookie deduped)")
+ owner_name = _company_name_from_url(doc_entries) or ""
+ if cookie_payloads:
+ cmp_vendors = extract_vendors_from_payloads(
+ cookie_payloads, owner_name=owner_name,
+ )
+
+ # P52: LLM fallback not only for 0 vendors, but also when the
+ # structured sources yielded < 5 vendors and the cookie text is
+ # substantial (>= 500 whitespace-separated words).
+ if (len(cmp_vendors) < 5
+ and cookie_text and len(cookie_text.split()) >= 500):
+ from compliance.services.vendor_llm_extractor import (
+ extract_vendors_via_llm,
+ )
+ from compliance.services.vendor_classifier import classify
+ _update(check_id, "Vendor-Liste per LLM extrahieren...", 94)
+ llm_vendors = await extract_vendors_via_llm(cookie_text)
+ existing_names = {(v.get("name") or "").strip().lower()
+ for v in cmp_vendors}
+ added_llm = 0
+ for v in llm_vendors:
+ nm = (v.get("name") or "").strip()
+ if not nm or nm.lower() in existing_names:
+ continue
+ v["recipient_type"] = classify(
+ vendor_name=nm,
+ category=v.get("category", ""),
+ owner_name=owner_name,
+ )
+ v.setdefault("source", "llm_cascade")
+ cmp_vendors.append(v)
+ existing_names.add(nm.lower())
+ added_llm += 1
+ if added_llm:
+ logger.info("P52 LLM-Cascade: +%d Vendors (total: %d)",
+ added_llm, len(cmp_vendors))
+
+ # P57: Phase G vendor_details as an additional vendor source.
+ if banner_result:
+ vd_list = banner_result.get("vendor_details") or []
+ vd_list = [v for v in vd_list if v.get("name") != "__TDM_OPTOUT__"]  # drop the "__TDM_OPTOUT__" marker entry
+ existing_names = {(v.get("name") or "").strip().lower()
+ for v in cmp_vendors}
+ added = 0
+ for d in vd_list:
+ n = (d.get("name") or "").strip()
+ if not n or n.lower() in existing_names:
+ continue
+ if n.lower() in ("technisch erforderlich", "analyse und statistik",
+ "marketing", "alles auswählen",
+ "alles auswaehlen"):
+ continue  # presumably banner category/toggle labels, not vendors — TODO confirm
+ from compliance.services.vendor_classifier import classify
+ cmp_vendors.append({
+ "name": n,
+ "country": "",
+ "purpose": d.get("description", "")[:500],
+ "category": "",
+ "opt_out_url": d.get("opt_out_url", ""),
+ "privacy_policy_url": d.get("privacy_url", ""),
+ "persistence": d.get("retention", ""),
+ "cookies": d.get("cookies", []),
+ "processing_company": d.get("processing_company", ""),
+ "address": d.get("address", ""),
+ "purposes": d.get("purposes", []),
+ "technologies": d.get("technologies", []),
+ "recipient_type": classify(
+ vendor_name=n, category="", owner_name=owner_name,
+ ),
+ })
+ existing_names.add(n.lower())
+ added += 1
+ if added:
+ logger.info("P57: added %d new vendors from Phase G (total: %d)",
+ added, len(cmp_vendors))
+
+ # D — HTML tables from the DOM
+ for pl in (cookie_payloads or []):
+ if pl.get("kind") != "html_table":
+ continue
+ rows = pl.get("rows") or []
+ if len(rows) < 3:  # skip tables with fewer than 3 rows
+ continue
+ try:
+ from compliance.services.cookies_table_parser import (
+ parse_cookie_table as _parse_ct_d,
+ )
+ table_text = "\n".join(rows)
+ d_vendors = _parse_ct_d(table_text)
+ if d_vendors:
+ existing_d = {(v.get("name") or "").strip().lower()
+ for v in cmp_vendors}
+ added_d = 0
+ for v in d_vendors:
+ nm = (v.get("name") or "").strip()
+ if not nm or nm.lower() in existing_d:
+ continue
+ v.setdefault("source", "html_table_dom")
+ cmp_vendors.append(v)
+ existing_d.add(nm.lower())
+ added_d += 1
+ if added_d:
+ logger.info("D HTML-Table-DOM-Parse: +%d Vendors aus "
+ "%d-Zeilen-Tabelle (total: %d)",
+ added_d, len(rows), len(cmp_vendors))
+ except Exception as e:
+ logger.warning("html_table parse failed: %s", e)
+
+ # B — also run cookies_table_parser on the crawled cookie text
+ if cookie_text and len(cookie_text) >= 500:  # only for texts of at least 500 characters
+ try:
+ from compliance.services.cookies_table_parser import (
+ parse_cookie_table as _parse_ct,
+ parse_flat_cookie_text as _parse_flat,
+ )
+ crawled_table_vendors = _parse_ct(cookie_text)
+ if not crawled_table_vendors:  # fall back to the flat-text parser
+ crawled_table_vendors = _parse_flat(cookie_text)
+ if crawled_table_vendors:
+ existing = {(v.get("name") or "").strip().lower()
+ for v in cmp_vendors}
+ added_c = 0
+ for v in crawled_table_vendors:
+ nm = (v.get("name") or "").strip()
+ if not nm or nm.lower() in existing:
+ continue
+ v.setdefault("source", "table_crawled")
+ cmp_vendors.append(v)
+ existing.add(nm.lower())
+ added_c += 1
+ if added_c:
+ logger.info("B Crawled-Tabellen-Parse: +%d Vendors "
+ "(total: %d)", added_c, len(cmp_vendors))
+ except Exception as e:
+ logger.warning("crawled-table-parse failed: %s", e)
+
+ # C — screenshot + Tesseract OCR (also the source for the A1 ZIP attachment)
+ cookie_url_for_shot = ""
+ for _e in doc_entries:
+ if _e.get("doc_type") == "cookie" and _e.get("url"):
+ cookie_url_for_shot = _e["url"]; break  # first cookie entry that has a URL
+ if cookie_url_for_shot:
+ try:
+ from compliance.services.cookie_screenshot_ocr import (
+ capture_cookie_evidence_slices,
+ cookies_to_vendor_records,
+ ocr_slices_extract_cookies,
+ )
+ from compliance.services.cookies_table_parser import (
+ _guess_vendor as _gv,
+ )
+ _update(check_id,
+ "Cookie-Richtlinie wird fotografiert "
+ "(lueckenlose Beweiskette)...", 92)
+ ev = await capture_cookie_evidence_slices(
+ cookie_url_for_shot, check_id=check_id,
+ viewport_h=1024, overlap_px=200, max_slices=40,
+ )
+ if ev.get("slices"):
+ cookie_evidence_slices = ev["slices"]
+ cookie_evidence_meta = {
+ "total_height_px": ev.get("total_height_px"),
+ "width_px": ev.get("width_px"),
+ "accepted_banner": ev.get("accepted_banner"),
+ "expanded": ev.get("expanded"),
+ "url": ev.get("url"),
+ "slice_count": len(ev["slices"]),
+ }
+ _update(check_id, "Tesseract OCR über alle Slices...", 93)
+ ocr_cookies, ocr_stats = ocr_slices_extract_cookies(
+ ev["slices"],
+ )
+ if ocr_cookies:
+ ocr_vendors = cookies_to_vendor_records(
+ ocr_cookies, guess_vendor_fn=_gv,
+ )
+ existing = {(v.get("name") or "").strip().lower()
+ for v in cmp_vendors}
+ added_v = 0
+ for v in ocr_vendors:
+ nm = (v.get("name") or "").strip()
+ if not nm:
+ continue
+ if nm.lower() in existing:  # known vendor: merge OCR cookies into the existing record
+ for ex in cmp_vendors:
+ if (ex.get("name") or "").strip().lower() == nm.lower():
+ ex_names = {
+ (c.get("name") or "").lower()
+ for c in (ex.get("cookies") or [])
+ }
+ for c in (v.get("cookies") or []):
+ if c["name"].lower() not in ex_names:
+ ex.setdefault("cookies", []).append(c)
+ ex_names.add(c["name"].lower())
+ cur_src = ex.get("source", "")
+ if "tesseract_ocr" not in cur_src:
+ ex["source"] = (cur_src + ";tesseract_ocr").strip(";")  # strip(";") avoids a leading ";" when source was empty
+ break
+ continue
+ cmp_vendors.append(v)
+ existing.add(nm.lower())
+ added_v += 1
+ logger.info(
+ "C Tesseract-OCR: +%d Vendors / %d Cookies "
+ "(über %d Slices, total: %d)",
+ added_v, len(ocr_cookies),
+ ocr_stats.get("slices", 0), len(cmp_vendors),
+ )
+ except Exception as e:
+ logger.warning("Tesseract-OCR pipeline failed: %s (%s)",
+ str(e) or "(no msg)", type(e).__name__)
+
+ # User-pasted cookie table (deterministic, no LLM):
+ # it ALWAYS takes precedence because it is 100% accurate. NOTE(review): the merge below skips names already present — confirm intent.
+ if pasted_table_vendors:
+ existing = {(v.get("name") or "").strip().lower()
+ for v in cmp_vendors}
+ added_p = 0
+ for v in pasted_table_vendors:
+ nm = (v.get("name") or "").strip()
+ if not nm or nm.lower() in existing:
+ continue
+ cmp_vendors.append(v)
+ existing.add(nm.lower())
+ added_p += 1
+ if added_p:
+ logger.info("Pasted-Tabellen-Merge: +%d Vendors (total: %d)",
+ added_p, len(cmp_vendors))
+ except Exception as e:  # best-effort: log and keep whatever was collected so far
+ logger.warning("VVT vendor extraction skipped: %s", e)
+
+ state["cmp_vendors"] = cmp_vendors
+ state["cookie_payloads"] = cookie_payloads
+ state["cookie_text"] = cookie_text
+ state["cookie_evidence_slices"] = cookie_evidence_slices
+ state["cookie_evidence_meta"] = cookie_evidence_meta
diff --git a/backend-compliance/compliance/api/agent_check/_phase_d2_vendors_finalize.py b/backend-compliance/compliance/api/agent_check/_phase_d2_vendors_finalize.py
new file mode 100644
index 00000000..344b89a8
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_phase_d2_vendors_finalize.py
@@ -0,0 +1,250 @@
+"""Phase D-2 — Vendor finalize: enrich + normalize + library fallback.
+
+Covers (in the original Step 5 of `_run_compliance_check`):
+ - Cookie-Library-Fallback (P52 Lite) — when < 20 vendors but many
+ after-accept cookies, resolve via library
+ - Vendor-Normalizer (Google-Familie dedup, garbage filter)
+ - Detail-modal enrichment from Phase G (P50) + TDM-opt-out sentinel
+ - Cookie-Behavior-Validator (P59b) — 3-Tier severity findings
+ - Implicit cookies detection (P61) — GTM brings GA/GCL/DoubleClick
+ - validate_vendor_urls + score_vendors + cookie-function classify
+ - Vendor-Redundanz (O4) + EU-Alternativen + Cost/Savings
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+
+logger = logging.getLogger(__name__)
+
+
+async def run_phase_d2(state: dict) -> None:
+ """Vendor finalize stages + redundancy. Mutates state in place."""
+ cmp_vendors = state["cmp_vendors"]
+ cookie_text = state.get("cookie_text", "")
+ banner_result = state["banner_result"]
+ banner_url = state["banner_url"]
+ profile = state["profile"]
+ business_scope = state["business_scope"]
+
+ tdm_opt_out_notice = ""
+ cookie_behavior_findings: list[dict] = []
+ redundancy_report = None
+
+ try:
+ from compliance.services.cookie_link_validator import (
+ score_vendors, validate_vendor_urls,
+ )
+
+ # Cookie-Library-Fallback (P52 Lite): wenn weiterhin wenige
+ # Vendors aber viele after_accept-Cookies, aus Library auflösen.
+ # VW-Lehre: 6 LLM-Grob-Vendors reichen NICHT — die Library
+ # holt 30+ weitere aus den Cookie-Namen + Cookie-Doc-Pattern.
+ # Schwelle: immer probieren wenn < 20 Vendors.
+ if banner_result and len(cmp_vendors) < 20:
+ try:
+ from compliance.services.cookie_to_vendor_fallback import (
+ fallback_vendors_for_run,
+ )
+ from database import SessionLocal as _SLfb
+ _fb_db = _SLfb()
+ try:
+ extra = fallback_vendors_for_run(
+ _fb_db, banner_result, len(cmp_vendors),
+ cookie_doc_text=cookie_text,
+ )
+ if extra:
+ existing_names = {(v.get("name") or "").strip().lower()
+ for v in cmp_vendors}
+ for v in extra:
+ if v["name"].lower() in existing_names:
+ continue
+ cmp_vendors.append(v)
+ logger.info(
+ "Cookie-Library-Fallback: cmp_vendors %d -> %d",
+ len(cmp_vendors) - len(extra), len(cmp_vendors),
+ )
+ finally:
+ _fb_db.close()
+ except Exception as e:
+ logger.warning("Cookie-Library-Fallback skipped: %s", e)
+
+ # Vendor-Normalizer: Dedup (Google-Familie etc) + Garbage-Filter
+ try:
+ from compliance.services.vendor_normalizer import (
+ normalize_vendors as _norm_v,
+ )
+ cmp_vendors = _norm_v(cmp_vendors)
+ except Exception as e:
+ logger.warning("vendor_normalizer skipped: %s", e)
+
+ # P50: enrich vendors with per-vendor detail-modal-extracts
+ if cmp_vendors and banner_result:
+ vendor_details = banner_result.get("vendor_details") or []
+ # P50f: filter out TDM-opt-out sentinel
+ tdm_sentinel = next((v for v in vendor_details
+ if v.get("name") == "__TDM_OPTOUT__"), None)
+ if tdm_sentinel:
+ tdm_opt_out_notice = tdm_sentinel.get("description", "")
+ logger.info("P50f: TDM opt-out — skipped detail-enrichment for vendors")
+ vendor_details = [v for v in vendor_details
+ if v.get("name") != "__TDM_OPTOUT__"]
+ if vendor_details:
+ details_by_name = {}
+ for d in vendor_details:
+ n = (d.get("name") or "").strip().lower()
+ if n:
+ details_by_name[n] = d
+ enriched = 0
+ for v in cmp_vendors:
+ key = (v.get("name") or "").strip().lower()
+ d = details_by_name.get(key)
+ if not d:
+ for dn, dv in details_by_name.items():
+ if key in dn or dn in key:
+ d = dv
+ break
+ if not d:
+ continue
+ if not v.get("country") and (d.get("processing_company") or d.get("address")):
+ addr = d.get("address", "")
+ if re.search(r"\b(deutschland|germany|berlin|m(?:ue|ü)nchen|hamburg|stuttgart)\b", addr, re.I):
+ v["country"] = "DE"
+ elif re.search(r"\bireland|irland|dublin\b", addr, re.I):
+ v["country"] = "IE"
+ elif re.search(r"\busa|united states|california|new york|delaware\b", addr, re.I):
+ v["country"] = "US"
+ if not v.get("purpose"):
+ v["purpose"] = d.get("description", "")[:500]
+ if not v.get("opt_out_url"):
+ v["opt_out_url"] = d.get("opt_out_url", "")
+ if not v.get("privacy_policy_url"):
+ v["privacy_policy_url"] = d.get("privacy_url", "")
+ if not v.get("cookies"):
+ v["cookies"] = d.get("cookies", [])
+ v["purposes"] = d.get("purposes", [])
+ v["technologies"] = d.get("technologies", [])
+ if not v.get("persistence"):
+ v["persistence"] = d.get("retention", "")
+ v["processing_company"] = d.get("processing_company", "")
+ v["address"] = d.get("address", "")
+ enriched += 1
+ logger.info("P50: enriched %d/%d vendors with detail-modal data",
+ enriched, len(cmp_vendors))
+
+ # P59b: Cookie-Behavior-Validator
+ if banner_result:
+ cookies_detailed = banner_result.get("cookies_detailed") or []
+ if cookies_detailed:
+ cb_session = None
+ try:
+ from database import SessionLocal
+ from compliance.services.cookie_behavior_validator import (
+ validate_cookie_behavior,
+ )
+ from urllib.parse import urlparse
+ fp_domain = ""
+ if banner_url:
+ fp_domain = urlparse(banner_url).netloc.replace("www.", "")
+ cb_session = SessionLocal()
+ cookie_behavior_findings = validate_cookie_behavior(
+ cb_session, cookies_detailed,
+ network_requests=[], # TODO Layer B in P59d
+ first_party_domain=fp_domain,
+ )
+ if cookie_behavior_findings:
+ sevs = {f["severity"] for f in cookie_behavior_findings}
+ logger.info(
+ "P59b: Cookie-Behavior-Check %d findings (severities: %s) "
+ "ueber %d Cookies",
+ len(cookie_behavior_findings),
+ sorted(sevs), len(cookies_detailed),
+ )
+ banner_result["cookie_behavior_findings"] = (
+ cookie_behavior_findings
+ )
+ else:
+ logger.info(
+ "P59b: Cookie-Behavior-Check 0 findings ueber %d Cookies "
+ "(library miss / clean)", len(cookies_detailed),
+ )
+ except Exception as cb_err:
+ logger.warning("P59b Cookie-Behavior-Check failed: %s", cb_err)
+ finally:
+ if cb_session is not None:
+ try:
+ cb_session.close()
+ except Exception:
+ pass
+
+ # P61: "Untergeschobene Cookies"
+ if banner_result and cmp_vendors:
+ try:
+ from compliance.services.vendor_package_cookies import (
+ detect_implicit_cookies,
+ )
+ declared = [v.get("name", "") for v in cmp_vendors if v.get("name")]
+ actual_cookies: list[str] = []
+ for phase_data in (banner_result.get("phases") or {}).values():
+ if isinstance(phase_data, dict):
+ for ck in (phase_data.get("cookies") or []):
+ if isinstance(ck, dict) and ck.get("name"):
+ actual_cookies.append(ck["name"])
+ implicit_findings = detect_implicit_cookies(
+ declared, actual_cookies_set=actual_cookies or None,
+ )
+ if implicit_findings:
+ banner_result["implicit_vendor_findings"] = implicit_findings
+ logger.info(
+ "P61: %d implicit vendor-package items detected "
+ "(%d cookies + %d vendors)",
+ len(implicit_findings),
+ sum(1 for f in implicit_findings if f["implicit"]["type"] == "cookie"),
+ sum(1 for f in implicit_findings if f["implicit"]["type"] == "vendor"),
+ )
+ except Exception as p61_err:
+ logger.warning("P61 implicit-vendor detection failed: %s", p61_err)
+
+ if cmp_vendors:
+ logger.info("VVT: %d vendors extracted, validating links",
+ len(cmp_vendors))
+ cmp_vendors = await validate_vendor_urls(cmp_vendors)
+ cmp_vendors = score_vendors(cmp_vendors)
+ try:
+ from compliance.services.cookie_function_classifier import (
+ annotate_vendor_cookies,
+ )
+ cmp_vendors = [annotate_vendor_cookies(v) for v in cmp_vendors]
+ except Exception as e:
+ logger.warning("Cookie function classification skipped: %s", e)
+ except Exception as e:
+ logger.warning("VVT vendor finalize skipped: %s", e)
+
+ # Vendor-Redundanz + EU-Alternativen + Cost/Savings (O4)
+ try:
+ from compliance.services.vendor_cost_estimator import infer_company_tier
+ from compliance.services.vendor_redundancy import (
+ analyze as analyze_redundancy,
+ )
+ if cmp_vendors:
+ bp_dict = {
+ "type": getattr(profile, "business_type", ""),
+ "features": list(business_scope),
+ }
+ ctier = infer_company_tier(bp_dict)
+ redundancy_report = analyze_redundancy(cmp_vendors, company_tier=ctier)
+ logger.info(
+ "Redundanz: %d Kategorien mit Mehrfach-Anbietern, "
+ "Spar-Schaetzung %s pro Jahr (company_tier=%s)",
+ redundancy_report["summary"]["redundancy_count"],
+ redundancy_report["summary"]["estimated_saving_pct"],
+ ctier,
+ )
+ except Exception as e:
+ logger.warning("Vendor redundancy analysis skipped: %s", e)
+
+ state["cmp_vendors"] = cmp_vendors
+ state["tdm_opt_out_notice"] = tdm_opt_out_notice
+ state["cookie_behavior_findings"] = cookie_behavior_findings
+ state["redundancy_report"] = redundancy_report
diff --git a/backend-compliance/compliance/api/agent_check/_phase_d3_blocks_bot.py b/backend-compliance/compliance/api/agent_check/_phase_d3_blocks_bot.py
new file mode 100644
index 00000000..49f6ac6d
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_phase_d3_blocks_bot.py
@@ -0,0 +1,220 @@
+"""Phase D-3-Bot — Bottom HTML blocks + final composition.
+
+Covers (in the original Step 5):
+ - P71 JC-vs-AVV decision tree (only when the DSE is ambiguous)
+ - P6/P53/P55 Branchen-Kontext + Site-History
+ - P106 Internal-Checks-Block
+ - P85 Banner-Screenshot
+ - A Audit-Quality-Checks (banner-detect failure, thin vendor extract)
+ - P82 GF-1-Pager
+ - Doc-Input-Warnings (user pasted text into the wrong field)
+ - P86 Branchen-Benchmark
+ - P84 Diff-Mode (since-last-run delta)
+ - Final HTML composition
+
+NOTE: in the original code `audit_quality_findings` was used by
+build_gf_one_pager_html BEFORE it was initialised — a silent
+UnboundLocalError caught by the surrounding try/except, so the
+gf_one_pager block effectively never rendered. Here we run
+audit-quality FIRST so the data is actually available.
+"""
+
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+async def run_phase_d3_bot(state: dict) -> None:
+ """Bottom blocks + assemble full_html. Mutates state in place."""
+ check_id = state["check_id"]
+ req = state["req"]
+ doc_entries = state["doc_entries"]
+ doc_texts = state["doc_texts"]
+ banner_result = state["banner_result"]
+ cmp_vendors = state["cmp_vendors"]
+ mc_split = state["mc_split"]
+ scorecard = state["scorecard"]
+ prev_scorecard = state.get("prev_scorecard")
+ mismatches = state.get("mismatches") or []
+ site_name_for_exec = state.get("site_name_for_exec", "")
+ domain_for_exec = state.get("domain_for_exec")
+ html_blocks = state["html_blocks"]
+
+ # P71: JC-vs-AVV decision tree
+ jc_decision_html = ""
+ try:
+ from compliance.services.jc_avv_decision import (
+ build_jc_avv_decision_html,
+ )
+ jc_decision_html = build_jc_avv_decision_html(doc_texts.get("dse"))
+ except Exception as e:
+ logger.warning("P71 jc_avv_decision skipped: %s", e)
+
+ # P6/P53/P55 — industry context + site history
+ industry_ctx_html = ""
+ try:
+ from compliance.services.industry_library import (
+ build_industry_context_block_html, load_site_profile,
+ )
+ from database import SessionLocal as _SLib
+ _ind_db = _SLib()
+ try:
+ ind = (req.scan_context or {}).get("industry") if req.scan_context else None  # None when scan_context is empty or has no "industry"
+ site_prof = load_site_profile(_ind_db, domain_for_exec or "")
+ industry_ctx_html = build_industry_context_block_html(ind, site_prof)
+ finally:
+ _ind_db.close()
+ except Exception as e:
+ logger.warning("industry context skipped: %s", e)
+
+ # P106 — Internal-Checks-Block
+ internal_checks_html = ""
+ try:
+ from compliance.services.mc_audit_type import (
+ build_internal_checks_block_html,
+ )
+ ic = (mc_split or {}).get("internal_checks") or []
+ if ic:
+ internal_checks_html = build_internal_checks_block_html(ic)
+ logger.info("P106: %d interne Checks (statt FAIL) im Block",
+ len(ic))
+ except Exception as e:
+ logger.warning("P106 internal_checks_html skipped: %s", e)
+
+ # P85 — Banner-Screenshot
+ banner_shot_html = ""
+ try:
+ from compliance.services.banner_screenshot_block import (
+ build_banner_screenshot_html,
+ )
+ banner_shot_html = build_banner_screenshot_html(banner_result)
+ except Exception as e:
+ logger.warning("P85 banner-screenshot skipped: %s", e)
+
+ # A — Audit-Quality-Checks (run BEFORE gf_one_pager so the data is
+ # available — original code had this inverted, causing
+ # UnboundLocalError silently caught).
+ audit_quality_html = ""
+ audit_quality_findings: list[dict] = []
+ try:
+ from compliance.services.audit_quality_checks import (
+ build_audit_quality_block_html, run_all as run_audit_quality,
+ )
+ cookie_text_for_aq = doc_texts.get("cookie") or ""
+ audit_quality_findings = run_audit_quality(
+ banner_result, cookie_text_for_aq, cmp_vendors, doc_entries,
+ )
+ if audit_quality_findings:
+ audit_quality_html = build_audit_quality_block_html(audit_quality_findings)
+ logger.info("audit-quality: %d Vorbehalte erkannt",
+ len(audit_quality_findings))
+ except Exception as e:
+ logger.warning("audit-quality-checks failed: %s", e)
+
+ # P82: GF-1-Pager (now has the audit_quality_findings filled)
+ gf_one_pager_html = ""
+ try:
+ from compliance.services.gf_one_pager import build_gf_one_pager_html
+ gf_one_pager_html = build_gf_one_pager_html(
+ site_name=site_name_for_exec,
+ scorecard=scorecard,
+ previous_scorecard=prev_scorecard,
+ banner_result=banner_result,
+ library_mismatch_findings=mismatches,
+ scan_context=req.scan_context,
+ audit_quality_findings=audit_quality_findings,
+ )
+ except Exception as e:
+ logger.warning("P82 GF-1-pager skipped: %s", e)
+
+ # Doc-input warnings — when the user pasted text into the wrong field
+ input_warn_html = ""
+ try:
+ from compliance.services.doc_input_warnings import (
+ build_warnings_block_html, collect_warnings,
+ )
+ warns = collect_warnings(doc_entries)
+ if warns:
+ input_warn_html = build_warnings_block_html(warns)
+ logger.info("doc-input-warnings: %d Mismatches gefunden", len(warns))
+ except Exception as e:
+ logger.warning("doc-input-warnings skipped: %s", e)
+
+ # P86: industry benchmark
+ bench_html = ""
+ try:
+ from compliance.services.industry_benchmark import (
+ _extract_score, build_benchmark_html, compute_benchmark,
+ )
+ from database import SessionLocal as _SLb
+ industry = (req.scan_context or {}).get("industry") if req.scan_context else None
+ curr_score = _extract_score(banner_result)
+ if industry and curr_score is not None:  # the DB session is only opened when both are present
+ _b_db = _SLb()
+ try:
+ bench = compute_benchmark(
+ _b_db, industry, curr_score, check_id,
+ )
+ if bench:
+ bench_html = build_benchmark_html(bench)
+ finally:
+ _b_db.close()
+ except Exception as e:
+ logger.warning("P86 industry-benchmark skipped: %s", e)
+
+ # P84: Diff-Mode
+ diff_html = ""
+ try:
+ from compliance.services.run_diff import (
+ build_diff_block_html, compute_diff,
+ )
+ from database import SessionLocal as _SL
+ _diff_db = _SL()
+ try:
+ diff = compute_diff(
+ _diff_db, check_id, domain_for_exec or "",
+ banner_result, scorecard,
+ )
+ if diff:
+ diff_html = build_diff_block_html(diff)
+ finally:
+ _diff_db.close()
+ except Exception as e:
+ logger.warning("P84 diff-mode skipped: %s", e)
+
+ # B1 / B3 cross-cutting findings (own renderers, may be empty).
+ reachability_html = state.get("reachability_html", "")
+ retention_html = state.get("retention_html", "")
+
+ # Block order — optimized for sales.
+ # B1 (Reachability) sits next to critical because it's an Art.7-Abs.3
+ # finding. B3 (Retention) sits next to cookie_audit because both
+ # are 3-source comparisons of cookie metadata.
+ full_html = (
+ gf_one_pager_html + audit_quality_html + input_warn_html
+ + bench_html + diff_html
+ + html_blocks["critical_html"] + reachability_html
+ + html_blocks["scope_disclaimer_html"]
+ + html_blocks["exec_summary_html"]
+ + html_blocks["cookie_arch_html"] + html_blocks["summary_html"]
+ + html_blocks["scanned_html"] + html_blocks["profile_html"]
+ + html_blocks["scorecard_html"] + internal_checks_html
+ + html_blocks["redundancy_html"]
+ + industry_ctx_html
+ + banner_shot_html
+ + html_blocks["providers_html"] + html_blocks["banner_deep_html"]
+ + html_blocks["cookie_audit_html"] + retention_html
+ + html_blocks["tcf_authority_html"]
+ + html_blocks["entropy_html"]
+ + html_blocks["network_trace_html"]
+ + html_blocks["library_mismatch_html"]
+ + html_blocks["consistency_html"] + html_blocks["signals_html"]
+ + html_blocks["solutions_html"]
+ + jc_decision_html
+ + html_blocks["vvt_html"] + html_blocks["report_html"]
+ )
+
+ state["audit_quality_findings"] = audit_quality_findings
+ state["full_html"] = full_html
diff --git a/backend-compliance/compliance/api/agent_check/_phase_d3_blocks_mid.py b/backend-compliance/compliance/api/agent_check/_phase_d3_blocks_mid.py
new file mode 100644
index 00000000..b080a5c5
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_phase_d3_blocks_mid.py
@@ -0,0 +1,221 @@
+"""Phase D-3-Mid — Mid HTML blocks (P62/P103/P104/P105/audit/mismatch/signals).
+
+Covers (in the original Step 5):
+ - P62 Scope-Disclaimer
+ - P103 Cookie-Value-Entropy + P104 Network-Tracing
+ - P105 IAB TCF Authority cross-reference
+ - Cookie-Compliance-Audit (3-Quellen-Vergleich, central USP)
+ - P102 cookie classification check (library mismatch)
+ - P35/P77/P78 Doc-Text signals
+ - P92/P94 Banner-Konsistenz
+ - P73 MC-Solution-Generator (LLM suggestions per HIGH-Fail)
+"""
+
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+async def run_phase_d3_mid(state: dict) -> None:
+ """Mid HTML blocks. Mutates state in place."""
+ doc_entries = state["doc_entries"]
+ doc_texts = state["doc_texts"]
+ banner_result = state["banner_result"]
+ cmp_vendors = state["cmp_vendors"]
+ fails_by_doc = state["fails_by_doc"]
+ html_blocks = state["html_blocks"]
+
+ # P62: Marketing-Manager-Disclaimer
+ scope_disclaimer_html = ""
+ try:
+ from ..scope_disclaimer import build_scope_disclaimer_html
+ scope_disclaimer_html = build_scope_disclaimer_html()
+ except Exception as e:
+ logger.warning("Scope-disclaimer block skipped: %s", e)
+
+ # P103 + P104 — Cookie-Value-Entropy + Network-Tracing
+ entropy_html = ""
+ network_trace_html = ""
+ try:
+ from compliance.services.cookie_network_tracer import (
+ build_network_trace_block_html,
+ trace_cookie_network,
+ )
+ from compliance.services.cookie_value_entropy import (
+ build_entropy_block_html,
+ check_cookies_for_entropy_mismatch,
+ )
+ cookies_detailed = (banner_result or {}).get("cookies_detailed") or []
+ entropy_findings = check_cookies_for_entropy_mismatch(cookies_detailed)
+ if entropy_findings:
+ entropy_html = build_entropy_block_html(entropy_findings)
+ logger.info("P103 Entropy: %d Findings", len(entropy_findings))
+ primary_url = ""
+ for e_ in doc_entries:
+ if e_.get("url"):
+ primary_url = e_["url"]; break  # first entry that has a URL
+ net_findings = trace_cookie_network(cookies_detailed, primary_url)
+ if net_findings:
+ network_trace_html = build_network_trace_block_html(net_findings)
+ logger.info("P104 Network-Trace: %d Findings", len(net_findings))
+ except Exception as e:
+ logger.warning("P103/P104 entropy/network-trace skipped: %s", e)
+
+ # P105 — IAB TCF Authority-Cross-Reference
+ tcf_authority_html = ""
+ try:
+ from compliance.services.tcf_vendor_authority import (
+ build_tcf_authority_block_html, cross_reference_with_tcf,
+ )
+ from database import SessionLocal as _SLtcf
+ _tcf_db = _SLtcf()
+ try:
+ tcf_findings = cross_reference_with_tcf(_tcf_db, cmp_vendors)
+ if tcf_findings:
+ tcf_authority_html = build_tcf_authority_block_html(tcf_findings)
+ logger.info(
+ "TCF-Authority: %d Vendor-Discrepancies gefunden",
+ len(tcf_findings),
+ )
+ finally:
+ _tcf_db.close()
+ except Exception as e:
+ logger.warning("TCF-Authority-Check skipped: %s", e)
+
+ # COOKIE-COMPLIANCE-AUDIT (3-source comparison — central USP)
+ cookie_audit: dict = {}
+ cookie_audit_html = ""
+ try:
+ from compliance.services.cookie_compliance_audit import (
+ audit_cookie_compliance, build_cookie_audit_block_html,
+ )
+ from database import SessionLocal as _SLca
+ _ca_db = _SLca()
+ try:
+ cookie_audit = audit_cookie_compliance(
+ _ca_db, doc_texts.get("cookie") or doc_texts.get("dse"),  # falls back to the DSE text when there is no cookie doc
+ banner_result,
+ )
+ if cookie_audit and (cookie_audit.get("declared_count") or
+ cookie_audit.get("browser_count")):
+ cookie_audit_html = build_cookie_audit_block_html(cookie_audit)
+ logger.info(
+ "Cookie-Audit: %d deklariert, %d im Browser, "
+ "%d undokumentiert, %d compliant",
+ cookie_audit.get("declared_count"),
+ cookie_audit.get("browser_count"),
+ len(cookie_audit.get("undeclared_in_browser") or []),
+ len(cookie_audit.get("compliant") or []),
+ )
+ finally:
+ _ca_db.close()
+ except Exception as e:
+ logger.warning("cookie-compliance-audit skipped: %s", e)
+
+ # P102: cookie classification check
+ library_mismatch_html = ""
+ mismatches: list[dict] = []
+ try:
+ from compliance.services.cookie_library_mismatch import (
+ build_mismatch_block_html, detect_mismatches,
+ )
+ from database import SessionLocal
+ cookie_doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or ""
+ all_cookies_seen: list[str] = []
+ if banner_result:
+ for ph in (banner_result.get("phases") or {}).values():
+ if isinstance(ph, dict):
+ for ck in (ph.get("cookies") or []):
+ if isinstance(ck, str):
+ all_cookies_seen.append(ck)
+ elif isinstance(ck, dict) and ck.get("name"):
+ all_cookies_seen.append(ck["name"])
+ if all_cookies_seen and cookie_doc_for_check:
+ _mm_db = SessionLocal()
+ try:
+ mismatches = detect_mismatches(
+ _mm_db, all_cookies_seen, cookie_doc_for_check,
+ )
+ if mismatches:
+ library_mismatch_html = build_mismatch_block_html(mismatches)
+ logger.info(
+ "P102: %d Cookie-Mismatches gefunden", len(mismatches),
+ )
+ finally:
+ _mm_db.close()
+ except Exception as e:
+ logger.warning("P102 mismatch detection failed: %s", e)
+
+ # P35 + P77 + P78: text-signal checks
+ signals_html = ""
+ try:
+ from compliance.services.doc_text_signals import (
+ build_signals_block_html, run_all as run_signal_checks,
+ )
+ cookie_doc_missing = not bool(doc_texts.get("cookie"))
+ sig_findings = run_signal_checks(
+ banner_result, doc_texts, cookie_doc_missing,
+ )
+ if sig_findings:
+ signals_html = build_signals_block_html(sig_findings)
+ except Exception as e:
+ logger.warning("P35/P77/P78 signals-check failed: %s", e)
+
+ # P92 + P94: banner consistency
+ consistency_html = ""
+ try:
+ from compliance.services.banner_consistency_checks import (
+ build_consistency_block_html, run_all as run_consistency_checks,
+ )
+ cookie_doc_for_check = (doc_texts.get("cookie")
+ or doc_texts.get("dse") or "")
+ cons_findings = run_consistency_checks(
+ banner_result or {}, cookie_doc_for_check, cmp_vendors,
+ doc_texts=doc_texts,
+ )
+ if cons_findings:
+ consistency_html = build_consistency_block_html(cons_findings)
+ logger.info("P92/P94: %d Konsistenz-Findings", len(cons_findings))
+ except Exception as e:
+ logger.warning("P92/P94 consistency-check failed: %s", e)
+
+ # P73: MC-Solution-Generator — LLM suggestions per HIGH fail
+ solutions_html = ""
+ try:
+ from compliance.services.mc_solution_generator import (
+ build_solutions_block_html, generate_solutions_for_fails,
+ )
+ all_solutions: list[dict] = []
+ for dt, fails in fails_by_doc.items():
+ if not fails:
+ continue
+ doc_txt = doc_texts.get(dt) or doc_texts.get("dse") or ""
+ if not doc_txt or len(doc_txt) < 500:  # skip documents shorter than 500 characters
+ continue
+ sols = await generate_solutions_for_fails(
+ fails, doc_txt, dt, limit=3,
+ )
+ all_solutions.extend(sols)
+ if len(all_solutions) >= 8:  # stop generating once 8 solutions are collected
+ break
+ if all_solutions:
+ solutions_html = build_solutions_block_html(all_solutions[:8])
+ logger.info("P73: %d MC-Solutions generiert", len(all_solutions))  # NOTE(review): logs the untruncated count; only the first 8 are rendered
+ except Exception as e:
+ logger.warning("P73 MC-Solution-Generator skipped: %s", e)
+
+ html_blocks.update({
+ "scope_disclaimer_html": scope_disclaimer_html,
+ "entropy_html": entropy_html,
+ "network_trace_html": network_trace_html,
+ "tcf_authority_html": tcf_authority_html,
+ "cookie_audit_html": cookie_audit_html,
+ "library_mismatch_html": library_mismatch_html,
+ "signals_html": signals_html,
+ "consistency_html": consistency_html,
+ "solutions_html": solutions_html,
+ })
+ state["cookie_audit"] = cookie_audit
+ state["mismatches"] = mismatches
diff --git a/backend-compliance/compliance/api/agent_check/_phase_d3_blocks_top.py b/backend-compliance/compliance/api/agent_check/_phase_d3_blocks_top.py
new file mode 100644
index 00000000..bf4e5840
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_phase_d3_blocks_top.py
@@ -0,0 +1,198 @@
+"""Phase D-3-Top — Top-of-mail HTML blocks.
+
+Covers (in the original Step 5 of `_run_compliance_check`):
+ - Summary / Scanned-URLs / Provider-list / Banner-deep / VVT HTML
+ - MC-scorecard aggregation (all_mc_checks + scorecard) + trend lookup
+ - P106 mc_audit_type split (internal_checks vs. verifiable_fails)
+ - Profile HTML / Redundancy HTML
+ - P1 Executive Summary
+ - P18 Critical Findings block
+ - P10 Cookie-Policy-Architecture detection
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ._helpers import _build_profile_html, _company_name_from_url, _extract_domain
+
+logger = logging.getLogger(__name__)
+
+
+async def run_phase_d3_top(state: dict) -> None:
+ """Build the top-of-mail HTML blocks and MC scorecard data; mutates `state` in place."""
+ req = state["req"]
+ results = state["results"]
+ doc_entries = state["doc_entries"]
+ doc_texts = state["doc_texts"]
+ banner_result = state["banner_result"]
+ vvt_entries = state["vvt_entries"]
+ cmp_vendors = state["cmp_vendors"]
+ profile = state["profile"]
+ redundancy_report = state.get("redundancy_report")
+
+ from ..agent_doc_check_banner import build_banner_deep_html
+ from ..agent_doc_check_critical import build_critical_findings_html
+ from ..agent_doc_check_exec_summary import build_exec_summary_html
+ from ..agent_doc_check_extras import build_vvt_table_html
+ from ..agent_doc_check_redundancy import build_redundancy_html
+ from ..agent_doc_check_report import (
+ build_html_report,
+ build_management_summary,
+ build_provider_list_html,
+ build_scanned_urls_html,
+ )
+ from ..agent_doc_check_scorecard import build_scorecard_html
+ from compliance.services.mc_scorecard import build_scorecard
+
+ summary_html = build_management_summary(results)
+ scanned_html = build_scanned_urls_html(doc_entries)
+ providers_html = build_provider_list_html(banner_result, vvt_entries)
+ # P18: deep block with phases + quality score + per-category tracker
+ banner_deep_html = build_banner_deep_html(banner_result)
+ vvt_html = build_vvt_table_html(cmp_vendors)
+
+ # MC scorecard aggregated across ALL docs (DSGVO/TDDDG/BGB/...)
+ all_mc_checks: list[dict] = []
+ fails_by_doc: dict[str, list[dict]] = {}
+ for r in results:
+ for c in r.checks:
+ if c.id.startswith("mc-"):
+ rec = {
+ "id": c.id, "label": c.label, "passed": c.passed,
+ "severity": c.severity, "skipped": c.skipped,
+ "regulation": c.regulation,
+ "hint": getattr(c, "hint", "") or "",
+ }
+ all_mc_checks.append(rec)
+ if (not c.passed and not c.skipped
+ and (c.severity or "").upper() in ("CRITICAL", "HIGH")):
+ fails_by_doc.setdefault(r.doc_type, []).append(rec)
+ # P106: audit-type classification per MC. On success fails_by_doc is rebuilt with every verifiable fail filed under "dse" (NOTE(review): this drops the per-doc_type grouping built above — confirm intended)
+ mc_split: dict = {"internal_checks": [], "verifiable_fails": all_mc_checks}
+ try:
+ from compliance.services.mc_audit_type import (
+ annotate_mc_results, split_by_audit_type,
+ )
+ annotate_mc_results(all_mc_checks)
+ mc_split = split_by_audit_type(all_mc_checks)
+ fails_by_doc = {}
+ for r in mc_split.get("verifiable_fails") or []:
+ fails_by_doc.setdefault("dse", []).append(r)
+ except Exception as e:
+ logger.warning("P106 mc_audit_type skipped: %s", e)
+ scorecard = build_scorecard(all_mc_checks) if all_mc_checks else {}
+
+ # Trend: load previous scorecard for the same tenant + domain (the recipient address is used as tenant id)
+ prev_scorecard: dict | None = None
+ if scorecard:
+ try:
+ from compliance.services.compliance_audit_log import (
+ list_runs_for_tenant,
+ )
+ tenant_id_for_trend = req.recipient or ""
+ base_domain_for_trend = _extract_domain(doc_entries) or ""
+ prev_runs = list_runs_for_tenant(
+ tenant_id_for_trend,
+ base_domain=base_domain_for_trend,
+ limit=1,
+ )
+ if prev_runs:
+ prev_scorecard = prev_runs[0].get("scorecard")
+ except Exception as e:
+ logger.debug("trend lookup skipped: %s", e)
+ scorecard_html = (
+ build_scorecard_html(scorecard, previous_scorecard=prev_scorecard)
+ if scorecard else ""
+ )
+
+ report_html = build_html_report(results, None, doc_texts)
+ profile_html = _build_profile_html(profile)
+
+ # O4: vendor redundancy / EU alternatives + cost-savings block
+ redundancy_html = build_redundancy_html(redundancy_report)
+
+ # P1: executive summary
+ url_company_for_exec = _company_name_from_url(doc_entries)
+ domain_for_exec = _extract_domain(doc_entries)
+ site_name_for_exec = url_company_for_exec or domain_for_exec or ""
+ exec_summary_html = build_exec_summary_html(
+ scorecard=scorecard,
+ previous_scorecard=prev_scorecard,
+ cmp_vendors=cmp_vendors,
+ redundancy_report=redundancy_report,
+ site_name=site_name_for_exec,
+ )
+
+ # P18: critical-findings block (best-effort; failure leaves the block empty)
+ critical_html = ""
+ try:
+ critical_html = build_critical_findings_html(
+ banner_result=banner_result,
+ scorecard=scorecard,
+ results=results,
+ )
+ except Exception as e:
+ logger.warning("Critical-findings block skipped: %s", e)
+
+ # P10: cookie-policy architecture detection (recognise the BMW pattern)
+ cookie_arch_html = ""
+ try:
+ from compliance.services.cookie_policy_architecture import (
+ build_architecture_html,
+ detect_architecture,
+ )
+ cookie_doc_url = ""
+ cookie_doc_text = doc_texts.get("cookie", "")
+ cookie_cmp_payloads: list[dict] = []
+ for e in doc_entries:
+ if (e.get("doc_type") or "").lower() in ("cookie", "cookie_policy"):
+ cookie_doc_url = e.get("url", "")
+ cookie_cmp_payloads = e.get("cmp_payloads") or []
+ break
+ # P17-A: fallback when the cookie doc was deduplicated by P15 — use the DSE text if it mentions cookie/tracking terms
+ if not cookie_doc_text:
+ dse_text = doc_texts.get("dse", "")
+ if dse_text and any(w in dse_text.lower() for w in
+ ("cookie", "tracking", "google analytics",
+ "consent")):
+ cookie_doc_text = dse_text
+ dse_entry = next((e for e in doc_entries
+ if e.get("doc_type") == "dse"), {})
+ cookie_doc_url = dse_entry.get("url", "")
+ cookie_cmp_payloads = dse_entry.get("cmp_payloads") or []
+ logger.info("P17-A: cookie-arch fallback auf DSE")
+ if cookie_doc_text:
+ arch = detect_architecture(
+ doc_url=cookie_doc_url,
+ doc_text=cookie_doc_text,
+ cmp_payloads=cookie_cmp_payloads,
+ homepage_cmp_payloads=state.get("cookie_payloads") or [],
+ )
+ cookie_arch_html = build_architecture_html(arch)
+ logger.info("cookie-arch: layer=%s versioned=%s risk=%s",
+ arch["layer_separation"], arch["versioned"],
+ arch["risk_label"])
+ except Exception as e:
+ logger.warning("cookie-architecture detection failed: %s", e)
+
+ state["scorecard"] = scorecard
+ state["prev_scorecard"] = prev_scorecard
+ state["mc_split"] = mc_split
+ state["fails_by_doc"] = fails_by_doc
+ state["site_name_for_exec"] = site_name_for_exec
+ state["domain_for_exec"] = domain_for_exec
+ state["html_blocks"] = {
+ "summary_html": summary_html,
+ "scanned_html": scanned_html,
+ "providers_html": providers_html,
+ "banner_deep_html": banner_deep_html,
+ "vvt_html": vvt_html,
+ "scorecard_html": scorecard_html,
+ "report_html": report_html,
+ "profile_html": profile_html,
+ "redundancy_html": redundancy_html,
+ "exec_summary_html": exec_summary_html,
+ "critical_html": critical_html,
+ "cookie_arch_html": cookie_arch_html,
+ }
diff --git a/backend-compliance/compliance/api/agent_check/_phase_e_email.py b/backend-compliance/compliance/api/agent_check/_phase_e_email.py
new file mode 100644
index 00000000..466d239a
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_phase_e_email.py
@@ -0,0 +1,75 @@
+"""Phase E — Send compliance-check email, with A1 ZIP-Anhang.
+
+Original Step 6 of `_run_compliance_check`, extended with the A1
+attachment: when the Tesseract pipeline captured evidence slices,
+bundle them into evidence-{check_id}.zip (manifest.json +
+audit_metadata.json + slice_NNN.png) and attach to the e-mail. The
+attachment makes the evidence chain portable so a DSB / lawyer can
+hand it to an external auditor or supervisory authority.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from compliance.services.smtp_sender import send_email
+
+from ._helpers import _company_name_from_url, _extract_domain, _update
+
+logger = logging.getLogger(__name__)
+
+
+def run_phase_e(state: dict) -> None:
+ """Derive the site label, build the optional evidence ZIP, send the mail; writes results to `state`."""
+ check_id = state["check_id"]
+ req = state["req"]
+ results = state["results"]
+ doc_entries = state["doc_entries"]
+ full_html = state["full_html"]
+ cookie_evidence_slices = state.get("cookie_evidence_slices")
+ cookie_evidence_meta = state.get("cookie_evidence_meta")
+
+ # Derive site name primarily from entered URL.
+ # The extracted_profile.companyName is often noisy (e.g. picks up
+ # juris.de from legal references). Domain-derived name is more
+ # predictable for the GF email subject.
+ doc_count = len([r for r in results if not r.error]) # only results without an error are counted
+ url_company = _company_name_from_url(doc_entries)
+ domain = _extract_domain(doc_entries)
+ site_name = url_company or domain or "Unbekannt"
+ _update(check_id, "E-Mail wird versendet...", 98)
+
+ # A1: bundle cookie-evidence slices into a ZIP attachment so the
+ # audit chain reaches the recipient. Each slice has its own
+ # SHA-256 + capture timestamp; manifest.json + audit_metadata.json
+ # make the chain verifiable for an external auditor.
+ evidence_attachments: list[dict] = []
+ if cookie_evidence_slices:
+ try:
+ from compliance.services.evidence_zip_builder import (
+ build_evidence_zip,
+ )
+ zip_bytes = build_evidence_zip(
+ slices=cookie_evidence_slices,
+ meta=cookie_evidence_meta,
+ check_id=check_id,
+ )
+ evidence_attachments.append({
+ "filename": f"evidence-{check_id[:8]}.zip",
+ "data": zip_bytes,
+ "mime": "application/zip",
+ })
+ except Exception as e:
+ logger.warning("A1 evidence-zip build failed: %s", e) # best-effort: the mail is still sent without the attachment
+
+ email_result = send_email(
+ recipient=req.recipient,
+ subject=f"[COMPLIANCE-CHECK] {site_name} — {doc_count} Dokumente geprueft",
+ body_html=full_html,
+ attachments=evidence_attachments or None,
+ )
+
+ state["email_result"] = email_result
+ state["site_name"] = site_name
+ state["domain"] = domain
+ state["doc_count"] = doc_count
diff --git a/backend-compliance/compliance/api/agent_check/_phase_f_persist.py b/backend-compliance/compliance/api/agent_check/_phase_f_persist.py
new file mode 100644
index 00000000..413a9ef6
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_phase_f_persist.py
@@ -0,0 +1,166 @@
+"""Phase F — Build response + persist snapshot/audit-log/unified-findings.
+
+Covers (in the original `_run_compliance_check`):
+ - Step 7 Build response dict, mark job as completed
+ - P80 Persist raw scan data so we can replay the audit pipeline
+ without re-crawling (7min → 5sec test cycle)
+ - SQLite audit log (compliance.api/audit endpoints + trend view A6)
+ - P5 Unified findings (MC + Pflichtangaben + Vendor + Redundanz
+ in one searchable table behind /agent/findings/)
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import datetime, timezone
+
+from ._constants import _compliance_check_jobs
+from ._helpers import _result_to_dict
+
+logger = logging.getLogger(__name__)
+
+
+def run_phase_f(state: dict) -> None:
+ """Build the response dict, mark the job completed, then persist snapshot / audit log / unified findings (each best-effort)."""
+ check_id = state["check_id"]
+ req = state["req"]
+ results = state["results"]
+ profile = state["profile"]
+ profile_dict = state["profile_dict"]
+ extracted_profile = state["extracted_profile"]
+ banner_result = state["banner_result"]
+ tcf_vendors = state["tcf_vendors"]
+ vvt_entries = state["vvt_entries"]
+ cmp_vendors = state["cmp_vendors"]
+ cookie_audit = state["cookie_audit"]
+ total_findings = state["total_findings"]
+ email_result = state["email_result"]
+ doc_entries = state["doc_entries"]
+ doc_texts = state["doc_texts"]
+ redundancy_report = state.get("redundancy_report")
+ scorecard = state["scorecard"]
+ site_name = state.get("site_name", "")
+ domain = state.get("domain", "")
+ doc_count = state.get("doc_count", 0)
+
+ response = {
+ "check_id": check_id,
+ "results": [_result_to_dict(r) for r in results],
+ "business_profile": profile_dict,
+ "extracted_profile": extracted_profile,
+ # P18: pass the full consent-tester output through instead of only 4 fields.
+ # phases (before/after-accept/reject) + banner_checks.violations +
+ # category_tests are used by the renderer + critical-findings block.
+ "banner_result": ({
+ "detected": banner_result.get("banner_detected", False),
+ "provider": banner_result.get("banner_provider", ""),
+ "violations": len((banner_result.get("banner_checks") or {})
+ .get("violations", [])),
+ "tcf_vendor_count": len(tcf_vendors),
+ "completeness_pct": banner_result.get("completeness_pct"),
+ "correctness_pct": banner_result.get("correctness_pct"),
+ "phases": banner_result.get("phases", {}),
+ "banner_checks": banner_result.get("banner_checks", {}),
+ "category_tests": banner_result.get("category_tests", []),
+ "structured_checks": banner_result.get("structured_checks", []),
+ "summary": banner_result.get("summary", {}),
+ } if banner_result else None),
+ "tcf_vendors": vvt_entries if tcf_vendors else [], # NOTE(review): emits vvt_entries (not tcf_vendors) under this key — confirm intended
+ "cmp_vendors": cmp_vendors,
+ "cookie_audit": cookie_audit if cookie_audit else None,
+ "total_documents": len(results),
+ "total_findings": total_findings,
+ "email_status": email_result.get("status", "failed"),
+ "checked_at": datetime.now(timezone.utc).isoformat(),
+ }
+
+ _compliance_check_jobs[check_id]["status"] = "completed"
+ _compliance_check_jobs[check_id]["result"] = response
+ _compliance_check_jobs[check_id]["progress"] = "Fertig"
+ _compliance_check_jobs[check_id]["progress_pct"] = 100
+
+ # P80: persist raw scan data so we can replay audit pipeline
+ # without re-crawling (7min -> 5sec test cycle).
+ try:
+ from database import SessionLocal
+ from compliance.services.check_snapshot import save_snapshot
+ snap_db = SessionLocal()
+ try:
+ save_snapshot(
+ snap_db,
+ check_id=check_id,
+ doc_entries=doc_entries,
+ banner_result=banner_result,
+ profile=profile,
+ cmp_vendors=cmp_vendors,
+ scan_context=req.scan_context, # P79
+ site_label=site_name,
+ notes=f"recipient={req.recipient}",
+ )
+ finally:
+ snap_db.close()
+ except Exception as snap_err:
+ logger.warning("P80 snapshot save skipped: %s", snap_err)
+
+ # Persist to sidecar SQLite audit log — enables /audit endpoints
+ # (A5 admin tab) and trend view (A6). Best-effort; failures here
+ # do not affect the user-facing response.
+ try:
+ from compliance.services.compliance_audit_log import record_check_run
+ from compliance.services.mc_scorecard import full_audit_records
+ audit_rows: list[dict] = []
+ for r in results:
+ doc_mc = [c for c in r.checks if c.id.startswith("mc-")]
+ audit_rows.extend(full_audit_records(
+ [{"id": c.id, "label": c.label, "passed": c.passed,
+ "severity": c.severity, "skipped": c.skipped,
+ "regulation": c.regulation, "matched_text": c.matched_text,
+ "hint": c.hint, "level": c.level}
+ for c in doc_mc],
+ check_id=check_id,
+ doc_type=r.doc_type,
+ ))
+ record_check_run(
+ check_id=check_id,
+ tenant_id=req.recipient or "",
+ site_name=site_name,
+ base_domain=domain or "",
+ doc_count=doc_count,
+ scorecard=scorecard,
+ vvt_summary={
+ "total": len(cmp_vendors),
+ "internal": sum(1 for v in cmp_vendors
+ if (v.get("recipient_type") or "").upper()
+ in ("INTERNAL", "GROUP_COMPANY")),
+ "external": sum(1 for v in cmp_vendors
+ if (v.get("recipient_type") or "").upper()
+ in ("PROCESSOR", "CONTROLLER")),
+ },
+ mc_records=audit_rows,
+ )
+ from compliance.services.compliance_audit_log import record_check_payload
+ record_check_payload(
+ check_id=check_id,
+ vendors=cmp_vendors,
+ profile=extracted_profile,
+ banner=banner_result,
+ )
+ # Unified findings (P5): bundle MC + mandatory disclosures + vendor +
+ # redundancy in one searchable table behind /agent/findings/.
+ try:
+ from compliance.services.unified_findings_collector import collect
+ from compliance.services.unified_findings_store import record_findings
+ unified = collect(
+ check_id=check_id,
+ results=results,
+ cmp_vendors=cmp_vendors,
+ redundancy_report=redundancy_report,
+ doc_texts=doc_texts,
+ )
+ record_findings(check_id, unified)
+ except Exception as e:
+ logger.warning("Unified findings collect failed: %s", e)
+ except Exception as e:
+ logger.warning("Audit persistence skipped: %s", e)
+
+ state["response"] = response
diff --git a/backend-compliance/compliance/api/agent_check/_schemas.py b/backend-compliance/compliance/api/agent_check/_schemas.py
new file mode 100644
index 00000000..d4625533
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_schemas.py
@@ -0,0 +1,44 @@
+"""Pydantic request/response schemas for the compliance-check route."""
+
+from __future__ import annotations
+
+from pydantic import BaseModel
+
+
+class ExtractTextRequest(BaseModel):
+ url: str # URL whose text should be extracted (request body of the extract-text endpoint)
+
+
+class DocumentInput(BaseModel):
+ doc_type: str # document kind: dse, agb, impressum, cookie, widerruf, avv, loeschkonzept, etc.
+ url: str = "" # optional source URL; empty when only pasted text is supplied
+ text: str = "" # pasted document text; takes priority over the URL
+
+
+class ComplianceCheckRequest(BaseModel):
+ documents: list[DocumentInput]
+ use_agent: bool = False
+ recipient: str = "dsb@breakpilot.local"
+ # P12: override for the TDM reservation when customer permission is documented.
+ # tdm_override_reason is mandatory when tdm_override=True
+ # (e.g. "Auftragsbeziehung Safetykon GmbH, Email Hr. X 18.05.2026").
+ tdm_override: bool = False
+ tdm_override_reason: str = ""
+ # P79: 8-field pre-scan wizard (industry, B2B/B2C, direct sales,
+ # legal form, corporate group, employees, special data, third country). It is
+ # persisted in the snapshot and filters the MC evaluation (P72).
+ scan_context: dict | None = None
+
+
+class ComplianceCheckStartResponse(BaseModel):
+ check_id: str # job identifier; presumably the id used to poll the status endpoint — confirm against the route
+ status: str = "running"
+
+
+class ComplianceCheckStatusResponse(BaseModel):
+ check_id: str
+ status: str
+ progress: str = "" # progress message text
+ progress_pct: int = 0 # progress percentage; the pipeline sets 100 on completion
+ result: dict | None = None # response payload; presumably only present once the job has finished — confirm against the route
+ error: str = ""
diff --git a/backend-compliance/compliance/api/agent_check/_single_check.py b/backend-compliance/compliance/api/agent_check/_single_check.py
new file mode 100644
index 00000000..48ced787
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_single_check.py
@@ -0,0 +1,118 @@
+"""Per-document regex + MC + LLM checks for the compliance-check route.
+
+Each document goes through:
+ 1. regex completeness/correctness checklist
+ 2. Master Control evaluation (all MCs for this doc_type)
+ 3. LLM verification of failed regex checks (overturns where evidence
+ was missed by the regex)
+ 4. Cookie-only: opt-out + privacy-policy URL health-check
+"""
+
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+async def _check_single(
+ text: str, doc_type: str, label: str, url: str,
+ word_count: int, use_agent: bool,
+ business_scope: set[str] | None = None,
+ business_profile: dict | None = None,
+):
+ """Run regex, MC, LLM-verification and (cookie docs only) link checks on one document; returns a DocCheckResult."""
+ from compliance.services.doc_checks.runner import check_document_completeness
+ from compliance.services.rag_document_checker import check_document_with_controls
+ from ..agent_doc_check_routes import CheckItem, DocCheckResult
+
+ # Regex checklist
+ findings = check_document_completeness(text, doc_type, label, url,
+ business_profile=business_profile)
+
+ all_checks: list[CheckItem] = []
+ completeness = 0
+ correctness = 0
+
+ for f in findings:
+ if "SCORE" in f.get("code", ""):
+ for c in f.get("all_checks", []):
+ all_checks.append(CheckItem(
+ id=c["id"], label=c["label"], passed=c["passed"],
+ severity=c["severity"], matched_text=c.get("matched_text", ""),
+ level=c.get("level", 1), parent=c.get("parent"),
+ skipped=c.get("skipped", False), hint=c.get("hint", ""),
+ ))
+ completeness = f.get("completeness_pct", 0)
+ correctness = f.get("correctness_pct", 0)
+
+ # Master Control checks (ALL MCs for this doc_type — see max_controls=0 below)
+ try:
+ # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has
+ # 1874 across 8 types; regex matching is cheap and dominates
+ # well under 1s per doc). Caps remain on the LLM-enrich step
+ # (top-10 FAILs) so cost stays bounded.
+ mc_results = await check_document_with_controls(
+ text, doc_type, label, max_controls=0, use_agent=use_agent,
+ business_scope=business_scope,
+ )
+ if mc_results:
+ for mc in mc_results:
+ all_checks.append(CheckItem(**mc))
+ l2 = [c for c in all_checks if c.level == 2 and not c.skipped]
+ l2_passed = sum(1 for c in l2 if c.passed)
+ correctness = round(l2_passed / len(l2) * 100) if l2 else 0 # NOTE(review): resets correctness to 0 without active L2 checks, unlike the later recomputations — confirm intended
+ except Exception as e:
+ logger.warning("MC check skipped for %s: %s", label, e)
+
+ # LLM verification of failed checks that carry a hint
+ failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint]
+ if failed:
+ try:
+ from compliance.services.doc_checks.llm_verify import verify_failed_checks
+ overturns = await verify_failed_checks(
+ text,
+ [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
+ label,
+ )
+ for c in all_checks:
+ if c.id in overturns and overturns[c.id]["overturned"]:
+ c.passed = True
+ c.matched_text = f"[LLM] {overturns[c.id]['evidence']}"
+ l2_active = [c for c in all_checks if c.level == 2 and not c.skipped]
+ l2_passed = sum(1 for c in l2_active if c.passed)
+ if l2_active:
+ correctness = round(l2_passed / len(l2_active) * 100)
+ except Exception as e:
+ logger.warning("LLM verification skipped: %s", e)
+
+ # Cookie-policy only: actively HTTP-probe the Opt-Out + Privacy-Policy
+ # URLs the document advertises. Broken links make individual provider
+ # entries non-compliant under Art. 7(3) DSGVO.
+ if doc_type == "cookie":
+ try:
+ from compliance.services.cookie_link_validator import (
+ extract_links, validate_links, build_check_items,
+ )
+ links = extract_links(text)
+ if links:
+ logger.info("Cookie-link validator: %d urls extracted from %s",
+ len(links), label)
+ validated = await validate_links(links)
+ for item in build_check_items(validated):
+ all_checks.append(CheckItem(**item))
+ # Re-compute correctness with the new L2 items
+ l2_active = [c for c in all_checks if c.level == 2 and not c.skipped]
+ l2_passed = sum(1 for c in l2_active if c.passed)
+ if l2_active:
+ correctness = round(l2_passed / len(l2_active) * 100)
+ except Exception as e:
+ logger.warning("Cookie-link validation skipped for %s: %s", label, e)
+
+ non_score = [f for f in findings if "SCORE" not in f.get("code", "")] # findings_count excludes the SCORE summary findings
+ return DocCheckResult(
+ label=label, url=url, doc_type=doc_type,
+ word_count=word_count or len(text.split()),
+ completeness_pct=completeness, correctness_pct=correctness,
+ checks=all_checks, findings_count=len(non_score),
+ )
diff --git a/backend-compliance/compliance/api/agent_check/_state.py b/backend-compliance/compliance/api/agent_check/_state.py
new file mode 100644
index 00000000..76beb3fb
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_check/_state.py
@@ -0,0 +1,58 @@
+"""Shared state for the compliance-check pipeline.
+
+The 7-step pipeline accumulates ~60 named values that flow across
+phases (doc_entries, profile, results, banner_result, cmp_vendors,
+scorecard, HTML blocks, …). Rather than threading 60 parameters
+through each function, we pass one mutable state dict (a plain `dict`).
+
+Phases read what they need with `state[key]` and write their outputs
+with `state[key] = value`. This is intentionally untyped: enforcing
+strict typing would require freezing the schema before all phases
+landed, and the report-building phase routinely adds new optional
+keys (P1, P10, P50, P59b, P82, P103, P104, P106, …).
+
+`new_state(check_id, req)` initialises the dict with the few
+keys that must exist from the start.
+"""
+
+from __future__ import annotations
+
+
+def new_state(check_id: str, req) -> dict:
+ """Create a fresh state dict for a check run.
+
+ Pre-populates keys that downstream phases read (e.g. `cmp_vendors`
+ defaulting to `[]`); phases add further keys such as `html_blocks`.
+ """
+ return {
+ "check_id": check_id,
+ "req": req,
+ # Phase-1 outputs
+ "doc_texts": {},
+ "doc_entries": [],
+ "url_text_cache": {},
+ "pasted_table_vendors": [],
+ "placement_findings": [],
+ # Phase-2/3/4 outputs
+ "profile": None,
+ "profile_dict": {},
+ "results": [],
+ "total_findings": 0,
+ "business_scope": set(),
+ "banner_result": None,
+ "banner_url": "",
+ "tcf_vendors": [],
+ "vvt_entries": [],
+ "extracted_profile": {},
+ # Phase-5 outputs
+ "cmp_vendors": [],
+ "cookie_audit": {},
+ "cookie_evidence_slices": None,
+ "cookie_evidence_meta": None,
+ "scorecard": {},
+ "full_html": "",
+ "audit_quality_findings": [],
+ # Phase-6/7 outputs
+ "email_result": {"status": "skipped"}, # default used until the e-mail phase stores the real send result
+ "site_name": "",
+ }
diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py
index 36ac607d..324326cc 100644
--- a/backend-compliance/compliance/api/agent_compliance_check_routes.py
+++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py
@@ -4,72 +4,70 @@ Unified Compliance Check Routes — check all documents in one request.
POST /compliance/agent/extract-text — extract text from a URL
POST /compliance/agent/compliance-check — unified check for all documents
GET /compliance/agent/compliance-check/{check_id} — poll status
+
+Phase 5 split (2026-06-06): the original 2700-line monolith is now
+decomposed into the `agent_check/` subpackage:
+ - _orchestrator.py — thin run_compliance_check pipeline
+ - _phase_a_resolve.py — TDM + Step 1 (resolve / discover / split)
+ - _phase_b_profile_check.py — Step 2 + Step 3 (profile + doc checks)
+ - _phase_c_banner.py — Step 3b-d (banner + cross-check + TCF) + Step 4
+ - _phase_d1_vendors_raw.py / _phase_d2_vendors_finalize.py — Step 5
+ vendor extraction + finalize
+ - _phase_d3_blocks_top.py / mid / bot — Step 5 HTML blocks
+ - _phase_e_email.py — Step 6 (with A1 ZIP-Anhang)
+ - _phase_f_persist.py — Step 7 (snapshot + audit log + unified findings)
+ - _helpers.py / _constants.py / _state.py / _schemas.py — shared
+
+External callers (saving_scan_routes, agent_migration_routes, tests)
+keep importing helpers from THIS module — everything is re-exported.
"""
+from __future__ import annotations
+
import asyncio
import logging
-import os
-import re
import uuid as _uuid
-from dataclasses import asdict
-from datetime import datetime, timezone
import httpx
from fastapi import APIRouter
-from pydantic import BaseModel
-from compliance.services.smtp_sender import send_email
+# ── Re-exports: external callers import these from THIS module ──────
+from .agent_check._constants import ( # noqa: F401
+ CONSENT_TESTER_URL,
+ _ALL_DOC_TYPES,
+ _COMPOUND_TLDS,
+ _DISCOVERY_RULES,
+ _DOC_TYPE_LABELS,
+ _compliance_check_jobs,
+)
+from .agent_check._discovery import _autodiscover_missing # noqa: F401
+from .agent_check._fetch import _fetch_text # noqa: F401
+from .agent_check._helpers import ( # noqa: F401
+ _apply_profile_filter,
+ _build_profile_html,
+ _classify_discovered_doc,
+ _company_name_from_url,
+ _doc_type_label,
+ _extract_domain,
+ _get_skip_types,
+ _pad_results_with_missing,
+ _result_to_dict,
+ _update,
+)
+from .agent_check._orchestrator import run_compliance_check as _run_compliance_check # noqa: F401
+from .agent_check._schemas import (
+ ComplianceCheckRequest,
+ ComplianceCheckStartResponse,
+ ComplianceCheckStatusResponse,
+ DocumentInput,
+ ExtractTextRequest,
+)
+from .agent_check._single_check import _check_single # noqa: F401
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
-CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"
-
-# In-memory job store (same pattern as doc-check)
-_compliance_check_jobs: dict[str, dict] = {}
-
-
-# ── Models ───────────────────────────────────────────────────────────
-
-class ExtractTextRequest(BaseModel):
- url: str
-
-
-class DocumentInput(BaseModel):
- doc_type: str # dse, agb, impressum, cookie, widerruf, avv, loeschkonzept, etc.
- url: str = ""
- text: str = "" # text has priority over URL
-
-
-class ComplianceCheckRequest(BaseModel):
- documents: list[DocumentInput]
- use_agent: bool = False
- recipient: str = "dsb@breakpilot.local"
- # P12: Override fuer TDM-Vorbehalt bei dokumentierter Kunden-Erlaubnis.
- # Pflichtfeld tdm_override_reason wenn tdm_override=True
- # (z.B. "Auftragsbeziehung Safetykon GmbH, Email Hr. X 18.05.2026").
- tdm_override: bool = False
- tdm_override_reason: str = ""
- # P79: 8-Feld Pre-Scan-Wizard (Branche, B2B/B2C, Direkt-Vertrieb,
- # Rechtsform, Konzern, MA, Besondere Daten, Drittland). Wird im
- # Snapshot persistiert und filtert die MC-Auswertung (P72).
- scan_context: dict | None = None
-
-
-class ComplianceCheckStartResponse(BaseModel):
- check_id: str
- status: str = "running"
-
-
-class ComplianceCheckStatusResponse(BaseModel):
- check_id: str
- status: str
- progress: str = ""
- progress_pct: int = 0
- result: dict | None = None
- error: str = ""
-
# ── Extract text endpoint ────────────────────────────────────────────
@@ -214,15 +212,12 @@ async def benchmark(
anonymized: bool = False,
limit: int = 50,
):
- """P107 — Branchen-Benchmark-Cockpit Endpoint.
- industry: 'automotive' / 'banking' / etc (optional)
- sites: comma-separated site_label list (optional)
- anonymized: bool — wenn true, Hersteller-Namen → 'OEM 1/2/3'
- """
+ """P107 — Branchen-Benchmark-Cockpit Endpoint."""
from database import SessionLocal
from compliance.services.benchmark_extractor import (
- load_snapshots_for_benchmark, anonymize_kpis,
+ anonymize_kpis,
build_benchmark_summary,
+ load_snapshots_for_benchmark,
)
site_list = [s.strip() for s in sites.split(",") if s.strip()] if sites else None
db = SessionLocal()
@@ -245,9 +240,7 @@ async def benchmark(
@router.post("/admin/tcf-ingest")
async def tcf_ingest():
- """P105 — IAB TCF Vendor-Liste ingestieren / refreshen.
- Idempotent: holt aktuelle GVL und upserted in compliance.cookie_library
- mit source='iab_tcf_v2'. Aufruf ein paar Mal pro Jahr ausreichend."""
+ """P105 — IAB TCF Vendor-Liste ingestieren / refreshen."""
from database import SessionLocal
from compliance.services.tcf_vendor_authority import (
fetch_and_ingest_tcf_vendors,
@@ -306,2344 +299,6 @@ async def replay_snapshot(
db.close()
-async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
- """Background task: check all documents with business-profile context."""
- try:
- from compliance.services.business_profiler import detect_business_profile
- from compliance.services.doc_checks.runner import check_document_completeness
- from compliance.services.rag_document_checker import check_document_with_controls
- from .agent_doc_check_routes import CheckItem, DocCheckResult
- from .agent_doc_check_report import build_html_report
-
- # Reset anchor-locator cache per run (avoid cross-run leak)
- try:
- from compliance.services.doc_anchor_locator import reset_cache
- reset_cache()
- except Exception:
- pass
-
- # P7: TDM-Reservation-Check der Base-Domain (§ 44b UrhG).
- # Bei reserved/denied: Run sofort beenden, kein Crawl.
- try:
- from compliance.services.tdm_reservation_check import (
- check_tdm_reservation, is_crawl_allowed,
- )
- first_url = next(
- (d.url for d in req.documents if d.url), "",
- )
- if first_url:
- tdm = await check_tdm_reservation(first_url)
- _compliance_check_jobs[check_id]["tdm"] = tdm
- # P12: Bei tdm_override + Reason wird NICHT abgebrochen,
- # sondern nur dokumentiert. Override ohne Reason wird ignoriert.
- override_active = (
- req.tdm_override
- and len((req.tdm_override_reason or "").strip()) >= 10
- )
- if not is_crawl_allowed(tdm) and not override_active:
- _compliance_check_jobs[check_id]["status"] = "skipped_tdm"
- _compliance_check_jobs[check_id]["error"] = (
- f"TDM-Vorbehalt fuer {tdm.get('domain')} erkannt "
- f"(status={tdm.get('status')}) — Crawl nach § 44b "
- f"UrhG nicht zulaessig. Signals: "
- f"{[s.get('src') for s in tdm.get('signals', [])]}"
- )
- _compliance_check_jobs[check_id]["progress_pct"] = 100
- logger.info("TDM-skip check_id=%s domain=%s status=%s",
- check_id, tdm.get("domain"), tdm.get("status"))
- return
- if override_active and not is_crawl_allowed(tdm):
- _compliance_check_jobs[check_id]["tdm_override"] = {
- "reason": req.tdm_override_reason.strip()[:500],
- "original_status": tdm.get("status"),
- }
- logger.warning(
- "TDM-Override aktiv: check_id=%s domain=%s "
- "status=%s reason=%r",
- check_id, tdm.get("domain"), tdm.get("status"),
- req.tdm_override_reason.strip()[:80],
- )
- except Exception as e:
- logger.warning("TDM-check failed (proceeding): %s", e)
-
- # Step 1: Resolve texts (fetch from URL if needed) — 0-30%
- _update(check_id, "Texte werden geladen...", 1)
- doc_texts: dict[str, str] = {}
- doc_entries: list[dict] = []
-
- # Cache fetched URLs to detect duplicates
- url_text_cache: dict[str, str] = {}
-
- n_docs = max(1, len(req.documents))
- # User-pasted-Tabellen-Vendors (kein LLM noetig) — werden weiter
- # unten in cmp_vendors gemerged.
- pasted_table_vendors: list[dict] = []
- for i, doc in enumerate(req.documents):
- pct = int(1 + (i / n_docs) * 29)
- _update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct)
- text = (doc.text or "").strip()
- input_source = "url"
- cmp_payloads: list[dict] = []
- if text:
- input_source = "text"
- if doc.url:
- input_source = "text+url" # User hat beide gefuellt
- logger.info(
- "doc_type=%s: User hat URL UND Text geliefert — "
- "Text gewinnt, URL wird als Quellen-Referenz behalten",
- doc.doc_type,
- )
- elif doc.url:
- url_key = doc.url.strip().rstrip("/").lower()
- if url_key in url_text_cache:
- text = url_text_cache[url_key]
- else:
- text, cmp_payloads = await _fetch_text(doc.url, doc_type=doc.doc_type)
- if text:
- url_text_cache[url_key] = text
-
- # Auto-Reclassify-Check: wenn der user Text in das falsche
- # Doc-Type-Feld kopiert hat (z.B. Impressum-Text in DSE),
- # erkennen und ggf. umtaggen.
- actual_doc_type = doc.doc_type
- reclassify_hint: dict | None = None
- if input_source.startswith("text") and len(text) >= 500:
- try:
- from compliance.services.doc_type_classifier import (
- detect_mismatch,
- )
- reclassify_hint = detect_mismatch(doc.doc_type, text)
- if reclassify_hint and reclassify_hint["action"] == "reclassify":
- actual_doc_type = reclassify_hint["detected"]
- logger.info(
- "doc_type AUTO-RECLASSIFY: deklariert=%s "
- "erkannt=%s (score %d vs %d) — uebernehme erkannten Typ",
- doc.doc_type, actual_doc_type,
- reclassify_hint["detected_score"],
- reclassify_hint["declared_score"],
- )
- except Exception as e:
- logger.warning("doc_type_classifier failed: %s", e)
-
- # Cookie-Tabelle: wenn User Tabelle reinkopiert hat, deterministisch
- # parsen (kein LLM noetig) und Vendors gleich ableiten.
- if input_source.startswith("text") and actual_doc_type == "cookie":
- try:
- from compliance.services.cookies_table_parser import (
- parse_cookie_table,
- )
- tab_vendors = parse_cookie_table(text)
- if tab_vendors:
- pasted_table_vendors.extend(tab_vendors)
- logger.info(
- "Cookie-Tabelle erkannt im pasted Text — "
- "%d Vendors / %d Cookies deterministisch geparst",
- len(tab_vendors),
- sum(len(v.get("cookies", [])) for v in tab_vendors),
- )
- except Exception as e:
- logger.warning("cookies_table_parser failed: %s", e)
-
- if text:
- doc_texts[actual_doc_type] = text
- doc_entries.append({
- "doc_type": actual_doc_type,
- "declared_doc_type": doc.doc_type,
- "url": doc.url,
- "text": text,
- "word_count": len(text.split()) if text else 0,
- "auto_discovered": False,
- "discovery_attempted": False,
- "cmp_payloads": cmp_payloads,
- "input_source": input_source,
- "reclassify_hint": reclassify_hint,
- })
-
- # Step 1a-bis: AUTO-DISCOVERY. For each canonical doc_type the user
- # did NOT submit a URL/text for, try to find it on the homepage of
- # the submitted URLs. This bridges the gap between "user knows the
- # exact URL" (rare) and "user pasted the homepage" (common).
- await _autodiscover_missing(
- check_id, doc_entries, doc_texts, url_text_cache,
- )
-
- # Step 1b: Section splitting — two cases:
- # 1. Same URL used for multiple doc_types → split by heading
- # 2. DSI text contains Cookie/Social-Media sections → auto-fill empty rows
- from compliance.services.section_splitter import (
- split_shared_texts, auto_fill_from_dsi, cross_search_documents,
- )
- split_shared_texts(doc_entries, url_text_cache)
- auto_fill_from_dsi(doc_entries)
-
- # Step 1c: Cross-document search — find doc_types in wrong documents (30-35%)
- _update(check_id, "Dokumente werden uebergreifend durchsucht...", 32)
- placement_findings = cross_search_documents(doc_entries)
-
- # Refresh doc_texts after all splitting/searching
- for entry in doc_entries:
- if entry.get("text"):
- doc_texts[entry["doc_type"]] = entry["text"]
-
- # P15: Dedupe — wenn mehrere Doc-Types DASSELBE Dokument referenzieren
- # (z.B. Safetykon: User gibt /datenschutz fuer dse + cookie + widerruf),
- # behalten wir nur den primaeren Doc-Type. Andere: leeren + note.
- # Priorität: dse > impressum > cookie > widerruf > agb > nutzungsbedingungen
- _DOC_PRIORITY = ["dse", "impressum", "cookie", "widerruf", "agb",
- "nutzungsbedingungen", "social_media", "dsb"]
- seen_text_hash: dict[int, str] = {}
- for dt in _DOC_PRIORITY:
- entry = next((e for e in doc_entries if e.get("doc_type") == dt
- and e.get("text")), None)
- if not entry:
- continue
- text_hash = hash((entry.get("text") or "").strip()[:1000])
- if text_hash in seen_text_hash:
- primary = seen_text_hash[text_hash]
- logger.info(
- "P15 dedup: doc_type=%s referenziert dasselbe Dokument "
- "wie %s (URL=%s) -> als Duplikat markiert.",
- dt, primary, entry.get("url", "")[:60],
- )
- entry["text"] = ""
- entry["word_count"] = 0
- entry["url"] = ""
- entry["dup_of"] = primary
- doc_texts.pop(dt, None)
- else:
- seen_text_hash[text_hash] = dt
-
- # Step 2: Detect business profile (35-40%)
- _update(check_id, "Geschaeftsmodell wird erkannt...", 37)
- # P16: Homepage-Text mit fuer Profile-Detection (no_direct_sales
- # B2B-Indikatoren wie "CE-Zertifizierung" / "Schulungen" stehen oft
- # nur im Homepage-Menue, nicht im Pflichttext).
- profile_input = dict(doc_texts)
- try:
- base_url = ""
- for e in doc_entries:
- if e.get("url"):
- from urllib.parse import urlparse
- p = urlparse(e["url"])
- if p.scheme and p.netloc:
- base_url = f"{p.scheme}://{p.netloc}/"
- break
- if base_url:
- import re as _re
- async with httpx.AsyncClient(
- timeout=8.0, follow_redirects=True,
- headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
- "AppleWebKit/537.36 HeadlessChrome/120.0.0.0"},
- ) as _hc:
- _hr = await _hc.get(base_url)
- if _hr.status_code == 200 and "text/html" in _hr.headers.get(
- "content-type", ""):
- _html = _hr.text[:60000]
- _html = _re.sub(r"", " ",
- _html, flags=_re.DOTALL | _re.IGNORECASE)
- _html = _re.sub(r"", " ",
- _html, flags=_re.DOTALL | _re.IGNORECASE)
- _html = _re.sub(r"<[^>]+>", " ", _html)
- _html = _re.sub(r"\s+", " ", _html).strip()
- if len(_html.split()) > 30:
- profile_input["__homepage"] = _html[:20000]
- logger.info("P16 homepage merged for profile: %d words",
- len(_html.split()))
- except Exception as e:
- logger.debug("homepage fetch for profile failed: %s", e)
- profile = await detect_business_profile(profile_input)
- profile_dict = asdict(profile)
-
- # Step 3: Check each document
- results: list[DocCheckResult] = []
- total_findings = 0
- use_agent_flag = req.use_agent or os.getenv(
- "COMPLIANCE_USE_AGENT", "false"
- ).lower() == "true"
-
- # Filter out doc_types that don't apply to this business profile
- skip_types = _get_skip_types(profile)
-
- # Derive business_scope hints for the MC filter (O1 — Doc-type Scope-Flag).
- # MCs that explicitly require a feature (e.g. 'biometric_processing',
- # 'ai_decision_making', 'child_targeting') get dropped when the
- # detected profile doesn't declare it.
- business_scope: set[str] = set()
- for svc in (getattr(profile, "detected_services", []) or []):
- business_scope.add(str(svc).lower())
- if (getattr(profile, "business_type", "") or "").lower() == "b2c":
- business_scope.add("b2c")
- if getattr(profile, "has_online_shop", False):
- business_scope.add("ecommerce")
- if getattr(profile, "is_regulated_profession", False):
- business_scope.add("regulated_profession")
-
- # Document checks: 40-80%
- n_entries = max(1, len(doc_entries))
- for i, entry in enumerate(doc_entries):
- text = entry["text"]
- doc_type = entry["doc_type"]
- label = _doc_type_label(doc_type)
- url = entry["url"]
-
- if doc_type in skip_types:
- results.append(DocCheckResult(
- label=label, url=url, doc_type=doc_type,
- error=skip_types[doc_type],
- ))
- continue
-
- pct = int(40 + (i / n_entries) * 40)
- _update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)
-
- if not text or len(text) < 50:
- # P15: duplicate doc that was deduped against a primary doc
- if entry.get("dup_of"):
- results.append(DocCheckResult(
- label=label, url="", doc_type=doc_type,
- error=f"Nicht separat vorhanden — wird im Dokument "
- f"'{_doc_type_label(entry['dup_of'])}' "
- f"mit-geprueft.",
- ))
- continue
- # P24: DSB-Kontakt ist Pflichtangabe in der DSE (Art. 13(1)(b)
- # DSGVO) — wenn kein separates DSB-Dokument vorliegt, ist das
- # KEIN Fehler. DSB-Pruefung passiert ohnehin in der DSE.
- if doc_type == "dsb" and not (entry.get("url") or "").strip():
- results.append(DocCheckResult(
- label=label, url="", doc_type=doc_type,
- error="Nicht separat vorhanden — DSB-Kontaktdaten "
- "werden in der Datenschutzerklaerung als "
- "Pflichtangabe nach Art. 13(1)(b) DSGVO geprueft.",
- ))
- continue
- # Empty entry — either from auto-discovery padding (no URL
- # to fetch) or from a fetch that returned nothing. If there
- # was a URL we keep the error so the user knows the fetch
- # failed; otherwise let the padding step label it
- # 'Nicht eingereicht' / 'Auf der Website nicht gefunden'.
- if (entry.get("url") or "").strip():
- results.append(DocCheckResult(
- label=label, url=url, doc_type=doc_type,
- error="Kein Text vorhanden oder zu kurz",
- ))
- continue
-
- result = await _check_single(
- text, doc_type, label, url,
- entry["word_count"], use_agent_flag,
- business_scope=business_scope,
- business_profile={"no_direct_sales": getattr(profile, "no_direct_sales", False)},
- )
-
- # Apply profile context filter
- result = _apply_profile_filter(result, profile, doc_type)
-
- # Add placement findings — but only if the regex checks confirm
- # the text doesn't match. If completeness >= 50%, the text IS the
- # right doc_type despite missing cross-search keywords.
- if result.completeness_pct < 50:
- for pf in placement_findings:
- if pf.get("doc_type") == doc_type:
- result.checks.insert(0, CheckItem(**{
- k: v for k, v in pf.items() if k != "doc_type"
- }))
-
- results.append(result)
- total_findings += result.findings_count
-
- # Step 3b: Banner-Check (automatic, uses first URL or homepage)
- banner_result = None
- banner_url = req.documents[0].url if req.documents and req.documents[0].url else ""
- # Use the homepage (strip path) for banner check
- if banner_url:
- from urllib.parse import urlparse
- parsed = urlparse(banner_url)
- banner_url = f"{parsed.scheme}://{parsed.netloc}"
- if banner_url:
- _update(check_id, "Cookie-Banner wird geprueft...", 82)
- try:
- async with httpx.AsyncClient(timeout=900.0) as client: # P50: +10min for vendor-detail-phase
- resp = await client.post(
- f"{CONSENT_TESTER_URL}/scan",
- json={"url": banner_url, "timeout_per_phase": 10},
- )
- if resp.status_code == 200:
- banner_result = resp.json()
- except Exception as e:
- logger.warning(
- "Banner check failed: %s (%s)", e or "", type(e).__name__
- )
-
- # Step 3c: Cross-check Banner vs Cookie-Richtlinie (88-90%)
- if banner_result and "cookie" in doc_texts:
- _update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...", 89)
- cross_findings = _cross_check_banner_vs_cookie(
- banner_result, doc_texts["cookie"],
- )
- if cross_findings:
- for r in results:
- if r.doc_type == "cookie":
- for cf in cross_findings:
- r.checks.append(CheckItem(**cf))
- l2 = [c for c in r.checks if c.level == 2 and not c.skipped]
- l2p = sum(1 for c in l2 if c.passed)
- r.correctness_pct = round(l2p / len(l2) * 100) if l2 else 0
-
- # Step 3d: TCF Vendor cross-check against DSI
- tcf_vendors = banner_result.get("tcf_vendors", []) if banner_result else []
- vvt_entries: list[dict] = []
- if tcf_vendors and "dse" in doc_texts:
- _update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...", 91)
- from compliance.services.banner_cookie_cross_check import cross_check_vendors_vs_dsi
- from compliance.services.vendor_vvt_mapper import map_vendors_to_vvt
- vendor_findings = cross_check_vendors_vs_dsi(tcf_vendors, doc_texts["dse"])
- if vendor_findings:
- for r in results:
- if r.doc_type == "dse":
- for vf in vendor_findings:
- r.checks.append(CheckItem(**vf))
- vvt_entries = map_vendors_to_vvt(tcf_vendors)
-
- # Step 4: Extract profile hints from documents (92-95%)
- _update(check_id, "Profil wird aus Dokumenten extrahiert...", 93)
- from compliance.services.profile_extractor import extract_profile_from_documents
- extracted_profile = extract_profile_from_documents(doc_texts, profile_dict)
-
- # Step 4b: Determine scenario per document
- for r in results:
- if r.error:
- r.scenario = "skip"
- elif r.completeness_pct < 30:
- r.scenario = "regenerate"
- elif r.completeness_pct < 95:
- r.scenario = "fix"
- else:
- r.scenario = "import"
-
- # Step 4c: Always render all 8 canonical doc types. Missing types
- # are differentiated:
- # - Discovery was tried but found nothing -> 'Auf der Website
- # nicht gefunden' (suggest user provides URL manually)
- # - No submitted URLs at all -> 'Nicht eingereicht'
- attempted = {
- e["doc_type"] for e in doc_entries if e.get("discovery_attempted")
- }
- results = _pad_results_with_missing(results, discovery_attempted=attempted)
-
- # Step 5: Build report with management summary (95-98%)
- _update(check_id, "Report wird erstellt...", 96)
- from .agent_doc_check_report import (
- build_management_summary,
- build_scanned_urls_html,
- build_provider_list_html,
- )
- from .agent_doc_check_extras import build_vvt_table_html
-
- # Extract structured vendor records from any CMP payloads captured
- # for the cookie doc (BMW ePaaS, OneTrust, etc.), validate their
- # opt-out + privacy URLs concurrently, score each entry.
- cmp_vendors: list[dict] = []
- try:
- from compliance.services.vendor_extractor import (
- extract_vendors_from_payloads,
- )
- from compliance.services.cookie_link_validator import (
- validate_vendor_urls, score_vendors,
- )
- cookie_payloads = []
- cookie_text = ""
- # P30: aggregate cmp_payloads from ALL doc_entries — sites
- # like Mercedes load Usercentrics only on the homepage, so
- # the JSON gets captured during DSE/Impressum discovery, not
- # in the cookies.html fetch. Dedup by URL since the same
- # payload is captured on every page load.
- seen_cmp_urls: set[str] = set()
- for e in doc_entries:
- for p in (e.get("cmp_payloads") or []):
- p_url = p.get("url") or ""
- if p_url and p_url in seen_cmp_urls:
- continue
- seen_cmp_urls.add(p_url)
- cookie_payloads.append(p)
- if e.get("doc_type") == "cookie" and e.get("text"):
- cookie_text = e["text"]
- # P48: also pull cmp_payloads from the Banner-Scan (homepage
- # 3-phase consent test). Mercedes' Usercentrics-JSON is
- # captured there even when not in DSI-Discovery of static
- # legal pages.
- if banner_result:
- for p in (banner_result.get("cmp_payloads") or []):
- p_url = p.get("url") or ""
- if p_url and p_url in seen_cmp_urls:
- continue
- seen_cmp_urls.add(p_url)
- cookie_payloads.append(p)
- if cookie_payloads:
- logger.info("P48: %d CMP-payloads available for vendor-extract (after Banner-Scan merge)",
- len(cookie_payloads))
- # P17-D: Fallback wenn cookie via P15 deduped wurde — nutze DSE-Text
- # sofern Cookie-Begriffe drin sind, damit LLM-Vendor-Extract trotzdem
- # greifen kann.
- if not cookie_text and not cookie_payloads:
- dse_t = doc_texts.get("dse", "")
- if dse_t and any(w in dse_t.lower() for w in
- ("cookie", "tracking", "google analytics", "consent")):
- cookie_text = dse_t
- logger.info("P17-D: vendor-extract Fallback auf DSE (Cookie deduped)")
- # Site-owner derived from the submitted URLs — drives the
- # INTERNAL/GROUP_COMPANY classification of vendor records.
- owner_name = _company_name_from_url(doc_entries) or ""
- if cookie_payloads:
- cmp_vendors = extract_vendors_from_payloads(
- cookie_payloads, owner_name=owner_name,
- )
- # P52: LLM-Fallback nicht nur wenn 0 Vendors, sondern auch
- # wenn die strukturierten Quellen < 5 Vendors lieferten und
- # der Cookie-Text substantiell ist. So holt sich VW-typische
- # Setups (Generic CMP, 28 Cookies aber 0 cmp_payloads) noch
- # ihre echten Vendors aus dem Text.
- if (len(cmp_vendors) < 5
- and cookie_text and len(cookie_text.split()) >= 500):
- from compliance.services.vendor_llm_extractor import (
- extract_vendors_via_llm,
- )
- from compliance.services.vendor_classifier import classify
- _update(check_id, "Vendor-Liste per LLM extrahieren...", 94)
- llm_vendors = await extract_vendors_via_llm(cookie_text)
- # P52: classify die LLM-Vendors und MERGE mit existing
- # statt zu ueberschreiben.
- existing_names = {(v.get("name") or "").strip().lower()
- for v in cmp_vendors}
- added_llm = 0
- for v in llm_vendors:
- nm = (v.get("name") or "").strip()
- if not nm or nm.lower() in existing_names:
- continue
- v["recipient_type"] = classify(
- vendor_name=nm,
- category=v.get("category", ""),
- owner_name=owner_name,
- )
- v.setdefault("source", "llm_cascade")
- cmp_vendors.append(v)
- existing_names.add(nm.lower())
- added_llm += 1
- if added_llm:
- logger.info(
- "P52 LLM-Cascade: +%d Vendors (total: %d)",
- added_llm, len(cmp_vendors),
- )
- # P57: Phase G vendor_details als zusätzliche Vendor-Quelle.
- # Wenn extract_vendors_from_payloads weniger findet als
- # Phase G's Info-Click-Through (z.B. Mercedes-Settings nicht
- # erkannt als usercentrics-kind), die Phase-G-Namen als
- # eigenständige Vendors hinzufügen.
- if banner_result:
- vd_list = banner_result.get("vendor_details") or []
- vd_list = [v for v in vd_list if v.get("name") != "__TDM_OPTOUT__"]
- existing_names = {(v.get("name") or "").strip().lower()
- for v in cmp_vendors}
- added = 0
- for d in vd_list:
- n = (d.get("name") or "").strip()
- if not n or n.lower() in existing_names:
- continue
- # Skip generic category-labels (Mercedes-Kategorien)
- if n.lower() in ("technisch erforderlich", "analyse und statistik",
- "marketing", "alles auswählen",
- "alles auswaehlen"):
- continue
- from compliance.services.vendor_classifier import classify
- cmp_vendors.append({
- "name": n,
- "country": "",
- "purpose": d.get("description", "")[:500],
- "category": "",
- "opt_out_url": d.get("opt_out_url", ""),
- "privacy_policy_url": d.get("privacy_url", ""),
- "persistence": d.get("retention", ""),
- "cookies": d.get("cookies", []),
- "processing_company": d.get("processing_company", ""),
- "address": d.get("address", ""),
- "purposes": d.get("purposes", []),
- "technologies": d.get("technologies", []),
- "recipient_type": classify(
- vendor_name=n, category="", owner_name=owner_name,
- ),
- })
- existing_names.add(n.lower())
- added += 1
- if added:
- logger.info("P57: added %d new vendors from Phase G (total: %d)",
- added, len(cmp_vendors))
-
- # D — HTML-Tabellen die der consent-tester aus dem DOM
- # extrahiert hat: direkt deterministisch parsen (hoechste
- # Genauigkeit, keine LLM-Halluzinationen).
- for pl in (cookie_payloads or []):
- if pl.get("kind") != "html_table":
- continue
- rows = pl.get("rows") or []
- if len(rows) < 3:
- continue
- try:
- from compliance.services.cookies_table_parser import (
- parse_cookie_table as _parse_ct_d,
- )
- table_text = "\n".join(rows)
- d_vendors = _parse_ct_d(table_text)
- if d_vendors:
- existing_d = {(v.get("name") or "").strip().lower()
- for v in cmp_vendors}
- added_d = 0
- for v in d_vendors:
- nm = (v.get("name") or "").strip()
- if not nm or nm.lower() in existing_d:
- continue
- v.setdefault("source", "html_table_dom")
- cmp_vendors.append(v)
- existing_d.add(nm.lower())
- added_d += 1
- if added_d:
- logger.info(
- "D HTML-Table-DOM-Parse: +%d Vendors aus "
- "%d-Zeilen-Tabelle (total: %d)",
- added_d, len(rows), len(cmp_vendors),
- )
- except Exception as e:
- logger.warning("html_table parse failed: %s", e)
-
- # B — cookies_table_parser auch auf gecrawltem Cookie-Text.
- # Erst Standard-Parse (Tab/Pipe-getrennt). Wenn der nichts
- # findet (kein Separator), Flat-Pattern-Parse fuer Sites wie
- # VW die ihre Tabelle als flachen Text liefern.
- if cookie_text and len(cookie_text) >= 500:
- try:
- from compliance.services.cookies_table_parser import (
- parse_cookie_table as _parse_ct,
- parse_flat_cookie_text as _parse_flat,
- )
- crawled_table_vendors = _parse_ct(cookie_text)
- if not crawled_table_vendors:
- crawled_table_vendors = _parse_flat(cookie_text)
- if crawled_table_vendors:
- existing = {(v.get("name") or "").strip().lower()
- for v in cmp_vendors}
- added_c = 0
- for v in crawled_table_vendors:
- nm = (v.get("name") or "").strip()
- if not nm or nm.lower() in existing:
- continue
- v.setdefault("source", "table_crawled")
- cmp_vendors.append(v)
- existing.add(nm.lower())
- added_c += 1
- if added_c:
- logger.info(
- "B Crawled-Tabellen-Parse: +%d Vendors "
- "(total: %d)",
- added_c, len(cmp_vendors),
- )
- except Exception as e:
- logger.warning("crawled-table-parse failed: %s", e)
-
- # C — Screenshot + Tesseract-OCR der Cookie-Richtlinie.
- # Overlapping scrolling screenshots (jede Slice ueberlappt die
- # vorherige um overlap_px Pixel) → lueckenlose Beweiskette.
- # Pro Slice Tesseract OCR + parse_ocr_cookie_table; Dedup nach
- # Cookie-Name über alle Slices. Site-unabhaengig, deterministisch.
- cookie_url_for_shot = ""
- for _e in doc_entries:
- if _e.get("doc_type") == "cookie" and _e.get("url"):
- cookie_url_for_shot = _e["url"]; break
- cookie_evidence_slices: list[dict] | None = None
- cookie_evidence_meta: dict | None = None
- if cookie_url_for_shot:
- try:
- from compliance.services.cookie_screenshot_ocr import (
- capture_cookie_evidence_slices,
- ocr_slices_extract_cookies,
- cookies_to_vendor_records,
- )
- from compliance.services.cookies_table_parser import (
- _guess_vendor as _gv,
- )
- _update(check_id,
- "Cookie-Richtlinie wird fotografiert (lueckenlose Beweiskette)...",
- 92)
- ev = await capture_cookie_evidence_slices(
- cookie_url_for_shot, check_id=check_id,
- viewport_h=1024, overlap_px=200, max_slices=40,
- )
- if ev.get("slices"):
- cookie_evidence_slices = ev["slices"] # ZIP-Anhang
- cookie_evidence_meta = {
- "total_height_px": ev.get("total_height_px"),
- "width_px": ev.get("width_px"),
- "accepted_banner": ev.get("accepted_banner"),
- "expanded": ev.get("expanded"),
- "url": ev.get("url"),
- "slice_count": len(ev["slices"]),
- }
- _update(check_id,
- "Tesseract OCR über alle Slices...", 93)
- ocr_cookies, ocr_stats = ocr_slices_extract_cookies(
- ev["slices"],
- )
- if ocr_cookies:
- ocr_vendors = cookies_to_vendor_records(
- ocr_cookies, guess_vendor_fn=_gv,
- )
- existing = {
- (v.get("name") or "").strip().lower()
- for v in cmp_vendors
- }
- added_v = 0
- for v in ocr_vendors:
- nm = (v.get("name") or "").strip()
- if not nm:
- continue
- if nm.lower() in existing:
- for ex in cmp_vendors:
- if (ex.get("name") or "").strip().lower() == nm.lower():
- ex_names = {
- (c.get("name") or "").lower()
- for c in (ex.get("cookies") or [])
- }
- for c in (v.get("cookies") or []):
- if c["name"].lower() not in ex_names:
- ex.setdefault("cookies", []).append(c)
- ex_names.add(c["name"].lower())
- cur_src = ex.get("source", "")
- if "tesseract_ocr" not in cur_src:
- ex["source"] = (cur_src + ";tesseract_ocr").strip(";")
- break
- continue
- cmp_vendors.append(v)
- existing.add(nm.lower())
- added_v += 1
- logger.info(
- "C Tesseract-OCR: +%d Vendors / %d Cookies "
- "(über %d Slices, total: %d)",
- added_v, len(ocr_cookies),
- ocr_stats.get("slices", 0), len(cmp_vendors),
- )
- except Exception as e:
- logger.warning(
- "Tesseract-OCR pipeline failed: %s (%s)",
- str(e) or "(no msg)", type(e).__name__,
- )
-
- # User-pasted Cookie-Tabelle (deterministisch, kein LLM):
- # die hat IMMER Vorrang weil 100% genau.
- if pasted_table_vendors:
- existing = {(v.get("name") or "").strip().lower()
- for v in cmp_vendors}
- added_p = 0
- for v in pasted_table_vendors:
- nm = (v.get("name") or "").strip()
- if not nm or nm.lower() in existing:
- continue
- cmp_vendors.append(v)
- existing.add(nm.lower())
- added_p += 1
- if added_p:
- logger.info(
- "Pasted-Tabellen-Merge: +%d Vendors (total: %d)",
- added_p, len(cmp_vendors),
- )
-
- # Cookie-Library-Fallback (P52 Lite): wenn weiterhin wenige
- # Vendors aber viele after_accept-Cookies, aus Library auflösen.
- # VW-Lehre: 6 LLM-Grob-Vendors reichen NICHT — die Library
- # holt 30+ weitere aus den Cookie-Namen + Cookie-Doc-Pattern.
- # Schwelle: immer probieren wenn < 20 Vendors.
- if banner_result and len(cmp_vendors) < 20:
- try:
- from compliance.services.cookie_to_vendor_fallback import (
- fallback_vendors_for_run,
- )
- from database import SessionLocal as _SLfb
- _fb_db = _SLfb()
- try:
- extra = fallback_vendors_for_run(
- _fb_db, banner_result, len(cmp_vendors),
- cookie_doc_text=cookie_text,
- )
- if extra:
- existing_names = {(v.get("name") or "").strip().lower()
- for v in cmp_vendors}
- for v in extra:
- if v["name"].lower() in existing_names:
- continue
- cmp_vendors.append(v)
- logger.info(
- "Cookie-Library-Fallback: cmp_vendors %d -> %d",
- len(cmp_vendors) - len(extra), len(cmp_vendors),
- )
- finally:
- _fb_db.close()
- except Exception as e:
- logger.warning("Cookie-Library-Fallback skipped: %s", e)
-
- # Vendor-Normalizer: Dedup (Google-Familie etc) + Garbage-Filter
- try:
- from compliance.services.vendor_normalizer import (
- normalize_vendors as _norm_v,
- )
- cmp_vendors = _norm_v(cmp_vendors)
- except Exception as e:
- logger.warning("vendor_normalizer skipped: %s", e)
-
- # P50: enrich vendors with per-vendor detail-modal-extracts
- # (description, opt-out URL, privacy URL, cookies). Detail
- # comes from Phase G Info-button-click-through in /scan.
- tdm_opt_out_notice = ""
- if cmp_vendors and banner_result:
- vendor_details = banner_result.get("vendor_details") or []
- # P50f: filter out TDM-opt-out sentinel
- tdm_sentinel = next((v for v in vendor_details
- if v.get("name") == "__TDM_OPTOUT__"), None)
- if tdm_sentinel:
- tdm_opt_out_notice = tdm_sentinel.get("description", "")
- logger.info("P50f: TDM opt-out — skipped detail-enrichment for vendors")
- vendor_details = [v for v in vendor_details
- if v.get("name") != "__TDM_OPTOUT__"]
- if vendor_details:
- details_by_name = {}
- for d in vendor_details:
- n = (d.get("name") or "").strip().lower()
- if n:
- details_by_name[n] = d
- enriched = 0
- for v in cmp_vendors:
- key = (v.get("name") or "").strip().lower()
- # Substring fallback for fuzzy matches (e.g.
- # "Google Analytics" detail-name may differ slightly)
- d = details_by_name.get(key)
- if not d:
- for dn, dv in details_by_name.items():
- if key in dn or dn in key:
- d = dv
- break
- if not d:
- continue
- if not v.get("country") and (d.get("processing_company") or d.get("address")):
- # Heuristic country extract from address (DE/EU keywords)
- addr = d.get("address", "")
- if re.search(r"\b(deutschland|germany|berlin|m(?:ue|ü)nchen|hamburg|stuttgart)\b", addr, re.I):
- v["country"] = "DE"
- elif re.search(r"\bireland|irland|dublin\b", addr, re.I):
- v["country"] = "IE"
- elif re.search(r"\busa|united states|california|new york|delaware\b", addr, re.I):
- v["country"] = "US"
- if not v.get("purpose"):
- v["purpose"] = d.get("description", "")[:500]
- if not v.get("opt_out_url"):
- v["opt_out_url"] = d.get("opt_out_url", "")
- if not v.get("privacy_policy_url"):
- v["privacy_policy_url"] = d.get("privacy_url", "")
- if not v.get("cookies"):
- v["cookies"] = d.get("cookies", [])
- v["purposes"] = d.get("purposes", [])
- v["technologies"] = d.get("technologies", [])
- if not v.get("persistence"):
- v["persistence"] = d.get("retention", "")
- v["processing_company"] = d.get("processing_company", "")
- v["address"] = d.get("address", "")
- enriched += 1
- logger.info("P50: enriched %d/%d vendors with detail-modal data",
- enriched, len(cmp_vendors))
- # P59b: Cookie-Behavior-Validator — pruefe alle gesetzten Cookies
- # gegen unsere Library, generiere 3-Tier-Severity-Findings.
- # Background-Task hat keinen DB-Dependency-Inject -> SessionLocal
- # selber oeffnen + sauber schliessen.
- cookie_behavior_findings: list[dict] = []
- if banner_result:
- cookies_detailed = banner_result.get("cookies_detailed") or []
- if cookies_detailed:
- cb_session = None
- try:
- from database import SessionLocal
- from compliance.services.cookie_behavior_validator import (
- validate_cookie_behavior,
- )
- from urllib.parse import urlparse
- fp_domain = ""
- if banner_url:
- fp_domain = urlparse(banner_url).netloc.replace("www.", "")
- cb_session = SessionLocal()
- cookie_behavior_findings = validate_cookie_behavior(
- cb_session, cookies_detailed,
- network_requests=[], # TODO Layer B in P59d
- first_party_domain=fp_domain,
- )
- if cookie_behavior_findings:
- sevs = {f["severity"] for f in cookie_behavior_findings}
- logger.info(
- "P59b: Cookie-Behavior-Check %d findings "
- "(severities: %s) ueber %d Cookies",
- len(cookie_behavior_findings),
- sorted(sevs),
- len(cookies_detailed),
- )
- banner_result["cookie_behavior_findings"] = (
- cookie_behavior_findings
- )
- else:
- logger.info(
- "P59b: Cookie-Behavior-Check 0 findings "
- "ueber %d Cookies (library miss / clean)",
- len(cookies_detailed),
- )
- except Exception as cb_err:
- logger.warning("P59b Cookie-Behavior-Check failed: %s", cb_err)
- finally:
- if cb_session is not None:
- try:
- cb_session.close()
- except Exception:
- pass
-
- # P61: "Untergeschobene Cookies" — wenn z.B. Google Tag Manager
- # deklariert ist, kommen GA + GCL_AU + DoubleClick automatisch mit.
- # Findings landen im banner_result fuer Mail-Render.
- if banner_result and cmp_vendors:
- try:
- from compliance.services.vendor_package_cookies import (
- detect_implicit_cookies,
- )
- declared = [v.get("name", "") for v in cmp_vendors if v.get("name")]
- actual_cookies: list[str] = []
- for phase_data in (banner_result.get("phases") or {}).values():
- if isinstance(phase_data, dict):
- for ck in (phase_data.get("cookies") or []):
- if isinstance(ck, dict) and ck.get("name"):
- actual_cookies.append(ck["name"])
- implicit_findings = detect_implicit_cookies(
- declared, actual_cookies_set=actual_cookies or None,
- )
- if implicit_findings:
- banner_result["implicit_vendor_findings"] = implicit_findings
- logger.info(
- "P61: %d implicit vendor-package items detected "
- "(%d cookies + %d vendors)",
- len(implicit_findings),
- sum(1 for f in implicit_findings if f["implicit"]["type"] == "cookie"),
- sum(1 for f in implicit_findings if f["implicit"]["type"] == "vendor"),
- )
- except Exception as p61_err:
- logger.warning("P61 implicit-vendor detection failed: %s", p61_err)
-
- if cmp_vendors:
- logger.info("VVT: %d vendors extracted, validating links",
- len(cmp_vendors))
- cmp_vendors = await validate_vendor_urls(cmp_vendors)
- cmp_vendors = score_vendors(cmp_vendors)
- # Enrich each vendor with per-cookie functional roles
- try:
- from compliance.services.cookie_function_classifier import (
- annotate_vendor_cookies,
- )
- cmp_vendors = [annotate_vendor_cookies(v) for v in cmp_vendors]
- except Exception as e:
- logger.warning("Cookie function classification skipped: %s", e)
- except Exception as e:
- logger.warning("VVT vendor extraction skipped: %s", e)
-
- # Vendor-Redundanz + EU-Alternativen + Cost/Savings (O4)
- redundancy_report = None
- try:
- from compliance.services.vendor_redundancy import analyze as analyze_redundancy
- from compliance.services.vendor_cost_estimator import infer_company_tier
- if cmp_vendors:
- # Company-Tier aus business_profile ableiten — beeinflusst die
- # Cost-Range so dass z.B. fuer DAX-Konzerne nicht starter-Preise
- # die untere Schranke duruecken.
- bp_dict = {
- "type": getattr(profile, "business_type", ""),
- "features": list(business_scope),
- }
- ctier = infer_company_tier(bp_dict)
- redundancy_report = analyze_redundancy(cmp_vendors, company_tier=ctier)
- logger.info(
- "Redundanz: %d Kategorien mit Mehrfach-Anbietern, "
- "Spar-Schaetzung %s pro Jahr (company_tier=%s)",
- redundancy_report["summary"]["redundancy_count"],
- redundancy_report["summary"]["estimated_saving_pct"],
- ctier,
- )
- except Exception as e:
- logger.warning("Vendor redundancy analysis skipped: %s", e)
-
- summary_html = build_management_summary(results)
- scanned_html = build_scanned_urls_html(doc_entries)
- providers_html = build_provider_list_html(banner_result, vvt_entries)
- # P18: Deep-Block mit Phases + Quality-Score + Per-Category-Tracker
- from .agent_doc_check_banner import build_banner_deep_html
- banner_deep_html = build_banner_deep_html(banner_result)
- vvt_html = build_vvt_table_html(cmp_vendors)
-
- # MC scorecard aggregated across ALL docs in this run (DSGVO/TDDDG/
- # BGB/...). Sits at the top so the GF sees the regulation-by-
- # regulation view before drilling into per-doc details.
- from compliance.services.mc_scorecard import build_scorecard
- from .agent_doc_check_scorecard import build_scorecard_html
- all_mc_checks: list[dict] = []
- # P73: pro-doc Fails sammeln um Solution-Generator pro Doc-Type
- # mit dem korrekten doc_text aufzurufen.
- fails_by_doc: dict[str, list[dict]] = {}
- for r in results:
- for c in r.checks:
- if c.id.startswith("mc-"):
- rec = {
- "id": c.id, "label": c.label, "passed": c.passed,
- "severity": c.severity, "skipped": c.skipped,
- "regulation": c.regulation,
- "hint": getattr(c, "hint", "") or "",
- }
- all_mc_checks.append(rec)
- if (not c.passed and not c.skipped
- and (c.severity or "").upper() in ("CRITICAL", "HIGH")):
- fails_by_doc.setdefault(r.doc_type, []).append(rec)
- # P106 — Audit-Type-Klassifizierung pro MC. Interne Prozess-/
- # Doku-Checks werden NICHT als FAIL gewertet sondern als CHECK
- # (manuelle Pruefung beim DSB notwendig).
- try:
- from compliance.services.mc_audit_type import (
- annotate_mc_results, split_by_audit_type,
- )
- annotate_mc_results(all_mc_checks)
- mc_split = split_by_audit_type(all_mc_checks)
- # Fails-by-doc neu aufbauen: nur noch echte verifiable Fails
- fails_by_doc = {}
- for r in mc_split.get("verifiable_fails") or []:
- fails_by_doc.setdefault("dse", []).append(r)
- except Exception as e:
- logger.warning("P106 mc_audit_type skipped: %s", e)
- mc_split = {"internal_checks": [], "verifiable_fails": all_mc_checks}
- scorecard = build_scorecard(all_mc_checks) if all_mc_checks else {}
- # Trend: load previous scorecard for the same tenant + domain so the
- # email can show delta indicators (A6).
- prev_scorecard: dict | None = None
- if scorecard:
- try:
- from compliance.services.compliance_audit_log import (
- list_runs_for_tenant,
- )
- tenant_id_for_trend = req.recipient or ""
- base_domain_for_trend = _extract_domain(doc_entries) or ""
- prev_runs = list_runs_for_tenant(
- tenant_id_for_trend,
- base_domain=base_domain_for_trend,
- limit=1,
- )
- if prev_runs:
- prev_scorecard = prev_runs[0].get("scorecard")
- except Exception as e:
- logger.debug("trend lookup skipped: %s", e)
- scorecard_html = (
- build_scorecard_html(scorecard, previous_scorecard=prev_scorecard)
- if scorecard else ""
- )
-
- report_html = build_html_report(results, None, doc_texts)
- profile_html = _build_profile_html(profile)
-
- # O4: Vendor-Redundanz / EU-Alternativen + Cost-Savings-Block
- from .agent_doc_check_redundancy import build_redundancy_html
- redundancy_html = build_redundancy_html(redundancy_report)
-
- # P1: Executive-Summary GANZ oben — CFO/GF sieht 4 KPIs + 2 CTAs.
- from .agent_doc_check_exec_summary import build_exec_summary_html
- # Site-Name fuer Header bestimmen (gleiche Logik wie Email-Subject)
- url_company_for_exec = _company_name_from_url(doc_entries)
- domain_for_exec = _extract_domain(doc_entries)
- site_name_for_exec = url_company_for_exec or domain_for_exec or ""
- exec_summary_html = build_exec_summary_html(
- scorecard=scorecard,
- previous_scorecard=prev_scorecard,
- cmp_vendors=cmp_vendors,
- redundancy_report=redundancy_report,
- site_name=site_name_for_exec,
- )
-
- # P18: Critical-Findings-Block (rot oben, mit Sofortmassnahmen +
- # Quellen + Bussgeld-Praezedenz). Wird nur gerendert wenn echte
- # kritische Verstoesse vorliegen.
- critical_html = ""
- try:
- from .agent_doc_check_critical import build_critical_findings_html
- critical_html = build_critical_findings_html(
- banner_result=banner_result,
- scorecard=scorecard,
- results=results,
- )
- except Exception as e:
- logger.warning("Critical-findings block skipped: %s", e)
-
- # P10: Cookie-Policy-Architecture-Detection (BMW-Pattern erkennen)
- cookie_arch_html = ""
- try:
- from compliance.services.cookie_policy_architecture import (
- detect_architecture, build_architecture_html,
- )
- cookie_doc_url = ""
- cookie_doc_text = doc_texts.get("cookie", "")
- cookie_cmp_payloads: list[dict] = []
- for e in doc_entries:
- if (e.get("doc_type") or "").lower() in ("cookie", "cookie_policy"):
- cookie_doc_url = e.get("url", "")
- cookie_cmp_payloads = e.get("cmp_payloads") or []
- break
- # P17-A: Fallback wenn Cookie-Doc via P15 deduped wurde — nutze
- # den DSE-Text wenn er Cookie-Schluesselwoerter enthaelt.
- if not cookie_doc_text:
- dse_text = doc_texts.get("dse", "")
- if dse_text and any(w in dse_text.lower() for w in
- ("cookie", "tracking", "google analytics",
- "consent")):
- cookie_doc_text = dse_text
- dse_entry = next((e for e in doc_entries
- if e.get("doc_type") == "dse"), {})
- cookie_doc_url = dse_entry.get("url", "")
- cookie_cmp_payloads = dse_entry.get("cmp_payloads") or []
- logger.info("P17-A: cookie-arch fallback auf DSE (Cookie-Doc deduped)")
- if cookie_doc_text:
- arch = detect_architecture(
- doc_url=cookie_doc_url,
- doc_text=cookie_doc_text,
- cmp_payloads=cookie_cmp_payloads,
- homepage_cmp_payloads=cmp_payloads or [],
- )
- cookie_arch_html = build_architecture_html(arch)
- logger.info("cookie-arch: layer=%s versioned=%s risk=%s",
- arch["layer_separation"], arch["versioned"], arch["risk_label"])
- except Exception as e:
- logger.warning("cookie-architecture detection failed: %s", e)
-
- # Reihenfolge — Sales-optimiert:
- # 1) Exec-Summary (KPIs + Saving + CTAs)
- # 2) summary_html (Konkrete Aufgaben fuer die Geschaeftsfuehrung)
- # 3) scanned_urls (Quellen-Transparenz)
- # 4) profile_html (Erkanntes Geschaeftsmodell)
- # 5) scorecard_html (MC-Scorecard)
- # 6) redundancy_html (Optimierungspotenzial — direkt nach Compliance-Score)
- # 7) providers_html + vvt_html (Vendor-Liste)
- # 8) report_html (Doc-Pruefung Details)
- # P62: Marketing-Manager-Disclaimer — was wir sehen vs nicht sehen
- scope_disclaimer_html = ""
- try:
- from .scope_disclaimer import build_scope_disclaimer_html
- scope_disclaimer_html = build_scope_disclaimer_html()
- except Exception as e:
- logger.warning("Scope-disclaimer block skipped: %s", e)
-
- # P103 + P104 — Cookie-Value-Entropy + Network-Tracing (Stufe 3 + 4)
- entropy_html = ""
- network_trace_html = ""
- try:
- from compliance.services.cookie_value_entropy import (
- check_cookies_for_entropy_mismatch, build_entropy_block_html,
- )
- from compliance.services.cookie_network_tracer import (
- trace_cookie_network, build_network_trace_block_html,
- )
- cookies_detailed = (banner_result or {}).get("cookies_detailed") or []
- entropy_findings = check_cookies_for_entropy_mismatch(cookies_detailed)
- if entropy_findings:
- entropy_html = build_entropy_block_html(entropy_findings)
- logger.info("P103 Entropy: %d Findings", len(entropy_findings))
- primary_url = ""
- for e_ in doc_entries:
- if e_.get("url"):
- primary_url = e_["url"]; break
- net_findings = trace_cookie_network(cookies_detailed, primary_url)
- if net_findings:
- network_trace_html = build_network_trace_block_html(net_findings)
- logger.info("P104 Network-Trace: %d Findings", len(net_findings))
- except Exception as e:
- logger.warning("P103/P104 entropy/network-trace skipped: %s", e)
-
- # P105 — IAB TCF Authority-Cross-Reference (Stufe 5)
- tcf_authority_html = ""
- try:
- from compliance.services.tcf_vendor_authority import (
- cross_reference_with_tcf, build_tcf_authority_block_html,
- )
- from database import SessionLocal as _SLtcf
- _tcf_db = _SLtcf()
- try:
- tcf_findings = cross_reference_with_tcf(_tcf_db, cmp_vendors)
- if tcf_findings:
- tcf_authority_html = build_tcf_authority_block_html(tcf_findings)
- logger.info(
- "TCF-Authority: %d Vendor-Discrepancies gefunden",
- len(tcf_findings),
- )
- finally:
- _tcf_db.close()
- except Exception as e:
- logger.warning("TCF-Authority-Check skipped: %s", e)
-
- # COOKIE-COMPLIANCE-AUDIT (3-Quellen-Vergleich) — das ist der
- # zentrale USP: deklariert in Richtlinie vs tatsaechlich im
- # Browser geladen vs Library-Match.
- cookie_audit = {}
- cookie_audit_html = ""
- try:
- from compliance.services.cookie_compliance_audit import (
- audit_cookie_compliance, build_cookie_audit_block_html,
- )
- from database import SessionLocal as _SLca
- _ca_db = _SLca()
- try:
- cookie_audit = audit_cookie_compliance(
- _ca_db, doc_texts.get("cookie") or doc_texts.get("dse"),
- banner_result,
- )
- if cookie_audit and (cookie_audit.get("declared_count") or
- cookie_audit.get("browser_count")):
- cookie_audit_html = build_cookie_audit_block_html(cookie_audit)
- logger.info(
- "Cookie-Audit: %d deklariert, %d im Browser, "
- "%d undokumentiert, %d compliant",
- cookie_audit.get("declared_count"),
- cookie_audit.get("browser_count"),
- len(cookie_audit.get("undeclared_in_browser") or []),
- len(cookie_audit.get("compliant") or []),
- )
- finally:
- _ca_db.close()
- except Exception as e:
- logger.warning("cookie-compliance-audit skipped: %s", e)
-
- # P102: Cookie-Klassifikations-Pruefung (deklariert vs Library)
- library_mismatch_html = ""
- mismatches: list[dict] = []
- try:
- from compliance.services.cookie_library_mismatch import (
- detect_mismatches, build_mismatch_block_html,
- )
- from database import SessionLocal
- cookie_doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or ""
- all_cookies_seen: list[str] = []
- if banner_result:
- for ph in (banner_result.get("phases") or {}).values():
- if isinstance(ph, dict):
- for ck in (ph.get("cookies") or []):
- if isinstance(ck, str):
- all_cookies_seen.append(ck)
- elif isinstance(ck, dict) and ck.get("name"):
- all_cookies_seen.append(ck["name"])
- if all_cookies_seen and cookie_doc_for_check:
- _mm_db = SessionLocal()
- try:
- mismatches = detect_mismatches(
- _mm_db, all_cookies_seen, cookie_doc_for_check,
- )
- if mismatches:
- library_mismatch_html = build_mismatch_block_html(mismatches)
- logger.info(
- "P102: %d Cookie-Mismatches gefunden", len(mismatches)
- )
- finally:
- _mm_db.close()
- except Exception as e:
- logger.warning("P102 mismatch detection failed: %s", e)
-
- # P35 + P77 + P78: Textsignal-Checks (Save-Label, Cookies-in-DSE,
- # JC-Klausel im DSE)
- signals_html = ""
- try:
- from compliance.services.doc_text_signals import (
- run_all as run_signal_checks,
- build_signals_block_html,
- )
- cookie_doc_missing = not bool(doc_texts.get("cookie"))
- sig_findings = run_signal_checks(
- banner_result, doc_texts, cookie_doc_missing,
- )
- if sig_findings:
- signals_html = build_signals_block_html(sig_findings)
- except Exception as e:
- logger.warning("P35/P77/P78 signals-check failed: %s", e)
-
- # P92 + P94: Banner-Konsistenz (CMP-Tool kaputt / Banner-vs-Doc-Diff)
- consistency_html = ""
- try:
- from compliance.services.banner_consistency_checks import (
- run_all as run_consistency_checks,
- build_consistency_block_html,
- )
- cookie_doc_for_check = (doc_texts.get("cookie")
- or doc_texts.get("dse") or "")
- cons_findings = run_consistency_checks(
- banner_result or {}, cookie_doc_for_check, cmp_vendors,
- doc_texts=doc_texts,
- )
- if cons_findings:
- consistency_html = build_consistency_block_html(cons_findings)
- logger.info("P92/P94: %d Konsistenz-Findings", len(cons_findings))
- except Exception as e:
- logger.warning("P92/P94 consistency-check failed: %s", e)
-
- # P73: MC-Solution-Generator — LLM-Vorschlaege pro HIGH-Fail.
- # Max 5 Solutions pro Doc-Type um Latenz < 60s zu halten.
- solutions_html = ""
- try:
- from compliance.services.mc_solution_generator import (
- generate_solutions_for_fails, build_solutions_block_html,
- )
- all_solutions: list[dict] = []
- for dt, fails in fails_by_doc.items():
- if not fails:
- continue
- doc_txt = doc_texts.get(dt) or doc_texts.get("dse") or ""
- if not doc_txt or len(doc_txt) < 500:
- continue
- sols = await generate_solutions_for_fails(
- fails, doc_txt, dt, limit=3,
- )
- all_solutions.extend(sols)
- if len(all_solutions) >= 8:
- break # global cap
- if all_solutions:
- solutions_html = build_solutions_block_html(all_solutions[:8])
- logger.info("P73: %d MC-Solutions generiert", len(all_solutions))
- except Exception as e:
- logger.warning("P73 MC-Solution-Generator skipped: %s", e)
-
- # P71: JC-vs-AVV Entscheidungsbaum (nur wenn DSE ambig)
- jc_decision_html = ""
- try:
- from compliance.services.jc_avv_decision import (
- build_jc_avv_decision_html,
- )
- jc_decision_html = build_jc_avv_decision_html(doc_texts.get("dse"))
- except Exception as e:
- logger.warning("P71 jc_avv_decision skipped: %s", e)
-
- # P6/P53/P55 — Branchen-Kontext + Site-History
- industry_ctx_html = ""
- try:
- from compliance.services.industry_library import (
- build_industry_context_block_html, load_site_profile,
- )
- from database import SessionLocal as _SLib
- _ind_db = _SLib()
- try:
- ind = (req.scan_context or {}).get("industry") if req.scan_context else None
- site_prof = load_site_profile(_ind_db, domain_for_exec or "")
- industry_ctx_html = build_industry_context_block_html(ind, site_prof)
- finally:
- _ind_db.close()
- except Exception as e:
- logger.warning("industry context skipped: %s", e)
-
- # P106 — Internal-Checks-Block (interne Prozesse / Doku-Pflichten)
- internal_checks_html = ""
- try:
- from compliance.services.mc_audit_type import (
- build_internal_checks_block_html,
- )
- ic = (mc_split or {}).get("internal_checks") or []
- if ic:
- internal_checks_html = build_internal_checks_block_html(ic)
- logger.info(
- "P106: %d interne Checks (statt FAIL) im Block",
- len(ic),
- )
- except Exception as e:
- logger.warning("P106 internal_checks_html skipped: %s", e)
-
- # P85 — Banner-Screenshot fuer visuellen Beweis (zwischen
- # GF-1-Pager und Detail-Bloecken)
- banner_shot_html = ""
- try:
- from compliance.services.banner_screenshot_block import (
- build_banner_screenshot_html,
- )
- banner_shot_html = build_banner_screenshot_html(banner_result)
- except Exception as e:
- logger.warning("P85 banner-screenshot skipped: %s", e)
-
- # P82: GF-1-Pager ganz oben in der Mail — 5-Bullet-Zusammenfassung
- # damit die GF nicht 124k Char lesen muss.
- gf_one_pager_html = ""
- try:
- from compliance.services.gf_one_pager import build_gf_one_pager_html
- gf_one_pager_html = build_gf_one_pager_html(
- site_name=site_name_for_exec,
- scorecard=scorecard,
- previous_scorecard=prev_scorecard,
- banner_result=banner_result,
- library_mismatch_findings=mismatches,
- scan_context=req.scan_context,
- audit_quality_findings=audit_quality_findings,
- )
- except Exception as e:
- logger.warning("P82 GF-1-pager skipped: %s", e)
-
- # A — Audit-Quality-Checks: Banner-Detect-Failure, Vendor-Extract
- # auffaellig duenn, URL-Fetch fehlgeschlagen → IMMER prominent zeigen.
- audit_quality_html = ""
- audit_quality_findings: list[dict] = []
- try:
- from compliance.services.audit_quality_checks import (
- run_all as run_audit_quality, build_audit_quality_block_html,
- )
- cookie_text_for_aq = doc_texts.get("cookie") or ""
- audit_quality_findings = run_audit_quality(
- banner_result, cookie_text_for_aq, cmp_vendors, doc_entries,
- )
- if audit_quality_findings:
- audit_quality_html = build_audit_quality_block_html(audit_quality_findings)
- logger.info(
- "audit-quality: %d Vorbehalte erkannt",
- len(audit_quality_findings),
- )
- except Exception as e:
- logger.warning("audit-quality-checks failed: %s", e)
-
- # Doc-Input-Warnings — wenn User Text ins falsche Feld gepastet hat
- input_warn_html = ""
- try:
- from compliance.services.doc_input_warnings import (
- collect_warnings, build_warnings_block_html,
- )
- warns = collect_warnings(doc_entries)
- if warns:
- input_warn_html = build_warnings_block_html(warns)
- logger.info("doc-input-warnings: %d Mismatches gefunden", len(warns))
- except Exception as e:
- logger.warning("doc-input-warnings skipped: %s", e)
-
- # P86: Branchen-Benchmark (nur wenn scan_context.industry gesetzt)
- bench_html = ""
- try:
- from database import SessionLocal as _SLb
- from compliance.services.industry_benchmark import (
- compute_benchmark, build_benchmark_html, _extract_score,
- )
- industry = (req.scan_context or {}).get("industry") if req.scan_context else None
- curr_score = _extract_score(banner_result)
- if industry and curr_score is not None:
- _b_db = _SLb()
- try:
- bench = compute_benchmark(
- _b_db, industry, curr_score, check_id,
- )
- if bench:
- bench_html = build_benchmark_html(bench)
- finally:
- _b_db.close()
- except Exception as e:
- logger.warning("P86 industry-benchmark skipped: %s", e)
-
- # P84: Diff-Mode — "Seit letztem Lauf X Findings weg, Y neue".
- diff_html = ""
- try:
- from database import SessionLocal as _SL
- from compliance.services.run_diff import (
- compute_diff, build_diff_block_html,
- )
- _diff_db = _SL()
- try:
- diff = compute_diff(
- _diff_db, check_id, domain_for_exec or "",
- banner_result, scorecard,
- )
- if diff:
- diff_html = build_diff_block_html(diff)
- finally:
- _diff_db.close()
- except Exception as e:
- logger.warning("P84 diff-mode skipped: %s", e)
-
- full_html = (
- gf_one_pager_html + audit_quality_html + input_warn_html
- + bench_html + diff_html
- + critical_html + scope_disclaimer_html + exec_summary_html
- + cookie_arch_html + summary_html + scanned_html + profile_html
- + scorecard_html + internal_checks_html + redundancy_html
- + industry_ctx_html
- + banner_shot_html
- + providers_html + banner_deep_html
- + cookie_audit_html
- + tcf_authority_html
- + entropy_html
- + network_trace_html
- + library_mismatch_html
- + consistency_html + signals_html + solutions_html
- + jc_decision_html
- + vvt_html + report_html
- )
-
- # Step 6: Send email — derive site name primarily from entered URL.
- # The extracted_profile.companyName is often noisy (e.g. picks up
- # juris.de from legal references). Domain-derived name is more
- # predictable for the GF email subject.
- doc_count = len([r for r in results if not r.error])
- url_company = _company_name_from_url(doc_entries)
- domain = _extract_domain(doc_entries)
- site_name = url_company or domain or "Unbekannt"
- _update(check_id, "E-Mail wird versendet...", 98)
- email_result = send_email(
- recipient=req.recipient,
- subject=f"[COMPLIANCE-CHECK] {site_name} — {doc_count} Dokumente geprueft",
- body_html=full_html,
- )
-
- # Step 7: Store result
- response = {
- "check_id": check_id,
- "results": [_result_to_dict(r) for r in results],
- "business_profile": profile_dict,
- "extracted_profile": extracted_profile,
- # P18: vollen consent-tester-Output durchreichen statt nur 4 Felder.
- # phases (before/after-accept/reject) + banner_checks.violations +
- # category_tests werden vom Renderer + Critical-Findings-Block genutzt.
- "banner_result": ({
- "detected": banner_result.get("banner_detected", False),
- "provider": banner_result.get("banner_provider", ""),
- "violations": len((banner_result.get("banner_checks") or {})
- .get("violations", [])),
- "tcf_vendor_count": len(tcf_vendors),
- "completeness_pct": banner_result.get("completeness_pct"),
- "correctness_pct": banner_result.get("correctness_pct"),
- "phases": banner_result.get("phases", {}),
- "banner_checks": banner_result.get("banner_checks", {}),
- "category_tests": banner_result.get("category_tests", []),
- "structured_checks": banner_result.get("structured_checks", []),
- "summary": banner_result.get("summary", {}),
- } if banner_result else None),
- "tcf_vendors": vvt_entries if tcf_vendors else [],
- "cmp_vendors": cmp_vendors,
- "cookie_audit": cookie_audit if cookie_audit else None,
- "total_documents": len(results),
- "total_findings": total_findings,
- "email_status": email_result.get("status", "failed"),
- "checked_at": datetime.now(timezone.utc).isoformat(),
- }
-
- _compliance_check_jobs[check_id]["status"] = "completed"
- _compliance_check_jobs[check_id]["result"] = response
- _compliance_check_jobs[check_id]["progress"] = "Fertig"
- _compliance_check_jobs[check_id]["progress_pct"] = 100
-
- # P80: persist raw scan data so we can replay audit pipeline
- # without re-crawling (7min -> 5sec test cycle).
- try:
- from database import SessionLocal
- from compliance.services.check_snapshot import save_snapshot
- snap_db = SessionLocal()
- try:
- save_snapshot(
- snap_db,
- check_id=check_id,
- doc_entries=doc_entries,
- banner_result=banner_result,
- profile=profile,
- cmp_vendors=cmp_vendors,
- scan_context=req.scan_context, # P79
- site_label=site_name,
- notes=f"recipient={req.recipient}",
- )
- finally:
- snap_db.close()
- except Exception as snap_err:
- logger.warning("P80 snapshot save skipped: %s", snap_err)
-
- # Persist to sidecar SQLite audit log — enables /audit endpoints
- # (A5 admin tab) and trend view (A6). Best-effort; failures here
- # do not affect the user-facing response.
- try:
- from compliance.services.compliance_audit_log import record_check_run
- from compliance.services.mc_scorecard import full_audit_records
- audit_rows: list[dict] = []
- for r in results:
- doc_mc = [c for c in r.checks if c.id.startswith("mc-")]
- audit_rows.extend(full_audit_records(
- [{"id": c.id, "label": c.label, "passed": c.passed,
- "severity": c.severity, "skipped": c.skipped,
- "regulation": c.regulation, "matched_text": c.matched_text,
- "hint": c.hint, "level": c.level}
- for c in doc_mc],
- check_id=check_id,
- doc_type=r.doc_type,
- ))
- record_check_run(
- check_id=check_id,
- tenant_id=req.recipient or "",
- site_name=site_name,
- base_domain=domain or "",
- doc_count=doc_count,
- scorecard=scorecard,
- vvt_summary={
- "total": len(cmp_vendors),
- "internal": sum(1 for v in cmp_vendors
- if (v.get("recipient_type") or "").upper()
- in ("INTERNAL", "GROUP_COMPANY")),
- "external": sum(1 for v in cmp_vendors
- if (v.get("recipient_type") or "").upper()
- in ("PROCESSOR", "CONTROLLER")),
- },
- mc_records=audit_rows,
- )
- from compliance.services.compliance_audit_log import record_check_payload
- record_check_payload(
- check_id=check_id,
- vendors=cmp_vendors,
- profile=extracted_profile,
- banner=banner_result,
- )
- # Unified findings (P5): bundle MC + Pflichtangaben + Vendor +
- # Redundanz in one searchable table behind /agent/findings/.
- try:
- from compliance.services.unified_findings_collector import collect
- from compliance.services.unified_findings_store import record_findings
- unified = collect(
- check_id=check_id,
- results=results,
- cmp_vendors=cmp_vendors,
- redundancy_report=redundancy_report,
- doc_texts=doc_texts,
- )
- record_findings(check_id, unified)
- except Exception as e:
- logger.warning("Unified findings collect failed: %s", e)
- except Exception as e:
- logger.warning("Audit persistence skipped: %s", e)
-
- except Exception as e:
- logger.error("Compliance check %s failed: %s", check_id, e, exc_info=True)
- _compliance_check_jobs[check_id]["status"] = "failed"
- _compliance_check_jobs[check_id]["error"] = str(e)[:500]
-
-
-def _update(check_id: str, msg: str, pct: int | None = None):
- job = _compliance_check_jobs[check_id]
- job["progress"] = msg
- if pct is not None:
- job["progress_pct"] = max(0, min(100, int(pct)))
-
-
-async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
- """Fetch text from URL via consent-tester, with HTTP fallback.
-
- Returns (text, cmp_payloads). cmp_payloads is the raw CMP JSON captured
- during navigation (ePaaS, OneTrust, …) — empty when no CMP fired or
- HTTP fallback was used. Backend turns payloads into structured vendor
- records for the VVT table in the email.
- """
- # 1. Consent-tester (Playwright-based, full JS rendering).
- # max_documents depends on doc_type:
- # - cookie/dse/social_media: self-extract (often + CMP capture) is
- # authoritative, sub-pages dilute the policy text. max=1.
- # - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar
- # enterprise sites split this across 3-4 short sub-pages
- # (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows
- # them. The 15s networkidle bail (dsi_helpers) keeps timing safe.
- short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"}
- max_docs = 1 if (doc_type or "") in short_extract_types else 3
- try:
- # P90: 120s reicht nicht fuer BMW-Impressum (Auto-Discovery folgt
- # 3 Sub-Docs). 240s gibt Spielraum. Mercedes faellt aktuell mit
- # 120s auch oft an Akamai-Latenz.
- async with httpx.AsyncClient(timeout=240.0) as client:
- resp = await client.post(
- f"{CONSENT_TESTER_URL}/dsi-discovery",
- json={"url": url, "max_documents": max_docs},
- timeout=240.0,
- )
- if resp.status_code == 200:
- payload = resp.json()
- docs = payload.get("documents", [])
- cmp_payloads = payload.get("cmp_payloads") or []
- cmp_cookie_text = payload.get("cmp_cookie_text") or ""
- # D — wenn der consent-tester HTML-Tabellen aus dem DOM
- # extrahiert hat, in die cmp_payloads als "generic_table"
- # einschleusen damit das Backend sie via cookies_table_parser
- # verarbeiten kann.
- for doc in (docs or []):
- for tbl in (doc.get("tables") or []):
- if not tbl or len(tbl) < 3:
- continue
- cmp_payloads.append({
- "kind": "html_table",
- "url": doc.get("url", ""),
- "rows": tbl,
- })
- if docs:
- texts = []
- for doc in docs:
- t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
- if t and len(t) > 50:
- texts.append(t)
- merged = "\n\n".join(texts)
- # For cookie/dse/social_media: when CMP reconstruction is
- # substantially richer than DOM extraction, use it. This
- # fixes the BMW case where DOM yields ~600 words of
- # navigation but the ePaaS payload reconstructs to ~1800
- # words of actual cookie policy.
- if (doc_type in short_extract_types
- and cmp_cookie_text
- and len(cmp_cookie_text.split()) > len(merged.split())):
- logger.info(
- "Preferring CMP-reconstructed text for %s on %s "
- "(%d words CMP vs %d words DOM)",
- doc_type, url,
- len(cmp_cookie_text.split()),
- len(merged.split()),
- )
- merged = cmp_cookie_text
- if merged and len(merged.split()) > 100:
- if len(texts) > 1:
- logger.info("Merged %d docs from %s (%d words)",
- len(texts), url, len(merged.split()))
- return merged, cmp_payloads
- # P90-Bug-Fix: auch wenn DSE-Text zu kurz fuer 100-Wort-
- # Schwelle ist, die captured CMP-Payloads NICHT verwerfen.
- # BMW-Bug: DSE liefert 10 Wort SPA-Shell, aber ePaaS-JSON
- # (393KB) wurde captured. Backend braucht die fuer
- # extract_vendors_from_payloads (VVT-Tabelle).
- if cmp_payloads:
- logger.info(
- "P90: keeping %d CMP payloads for %s despite "
- "short text (%d words) — HTTP fallback runs in parallel",
- len(cmp_payloads), url,
- len((merged or cmp_cookie_text).split()),
- )
- fallback_text = merged or cmp_cookie_text or ""
- return fallback_text, cmp_payloads
- except Exception as e:
- # P90: verbose exception fuer Diagnose (war vorher empty)
- logger.warning("Consent-tester fetch failed for %s: %s (%s)",
- url, str(e) or "(empty)", type(e).__name__)
-
- # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW).
- # P7: kenntlicher UA + per-Domain Rate-Limit.
- try:
- import re as _re
- from compliance.services.compliance_user_agent import (
- default_request_headers, DomainRateLimiter,
- )
- async with httpx.AsyncClient(
- timeout=30.0, follow_redirects=True,
- headers=default_request_headers(),
- ) as client:
- async with DomainRateLimiter(url):
- resp = await client.get(url)
- if resp.status_code == 200 and "text/html" in resp.headers.get("content-type", ""):
- html = resp.text
- # Strip HTML tags, decode entities
- text = _re.sub(r"", " ", html, flags=_re.DOTALL | _re.IGNORECASE)
- text = _re.sub(r"", " ", text, flags=_re.DOTALL | _re.IGNORECASE)
- text = _re.sub(r"<[^>]+>", " ", text)
- text = _re.sub(r"\s+", " ", text).strip()
- if len(text.split()) > 100:
- logger.info("HTTP fallback for %s: %d words", url, len(text.split()))
- return text, []
- except Exception as e:
- logger.warning("HTTP fallback failed for %s: %s", url, e)
-
- return "", []
-
-
-async def _autodiscover_missing(
- check_id: str,
- doc_entries: list[dict],
- doc_texts: dict[str, str],
- url_text_cache: dict[str, str],
-) -> None:
- """For each canonical doc_type the user did not submit, try to find
- the corresponding document on the homepage of the site they DID submit.
-
- Modifies doc_entries in place: fills text/url/word_count and sets
- `auto_discovered=True`. Marks `discovery_attempted=True` on every
- missing entry (even when nothing was found) so the report can
- distinguish 'Nicht eingereicht' from 'Auf der Website nicht gefunden'.
- """
- from urllib.parse import urlparse
-
- # VW-Fix: nur Doc-Types mit substantieller Text-Ausbeute zaehlen
- # als 'submitted'. Wenn der User eine URL eingegeben hat aber die
- # 404 liefert (VW cookie-richtlinie.html), oder der Crawler weniger
- # als 200 Zeichen extrahiert (SPA-Shell), als 'missing' behandeln
- # damit der Discovery-Pass alternative URLs probiert.
- _MIN_USEFUL_CHARS = 200
- submitted_types = {
- e["doc_type"] for e in doc_entries
- if len((e.get("text") or "").strip()) >= _MIN_USEFUL_CHARS
- }
- # Markiere die fehlgeschlagenen URL-Submissions damit der Discovery
- # ihre URL nicht erneut probiert (waere sinnlos).
- failed_urls: set[str] = {
- (e.get("url") or "").strip()
- for e in doc_entries
- if (e.get("url") or "").strip()
- and len((e.get("text") or "").strip()) < _MIN_USEFUL_CHARS
- }
- if failed_urls:
- logger.info(
- "VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery "
- "soll Alternativen probieren: %s",
- len(failed_urls), _MIN_USEFUL_CHARS,
- ", ".join(list(failed_urls)[:3]),
- )
- # Map alias types to canonical
- submitted_canon = {
- "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
- }
- # Missing = canonical types the user did NOT submit
- missing = set(_ALL_DOC_TYPES) - submitted_canon
- if not missing:
- return
-
- # Pick the most common base (scheme://netloc) from submitted URLs.
- bases: dict[str, int] = {}
- for e in doc_entries:
- u = (e.get("url") or "").strip()
- if u and "://" in u:
- p = urlparse(u)
- base = f"{p.scheme}://{p.netloc}"
- bases[base] = bases.get(base, 0) + 1
- if not bases:
- # No submitted URL at all — nothing to crawl from. Add empty
- # placeholders (with discovery_attempted=False) so the padding
- # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden').
- for dt in missing:
- doc_entries.append({
- "doc_type": dt, "url": "", "text": "", "word_count": 0,
- "auto_discovered": False, "discovery_attempted": False,
- })
- return
-
- # Build crawl plan: primary base + any related domains mentioned in
- # the submitted texts that share the owner's SLD. Example: BMW Group
- # text mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de.
- primary_base = max(bases, key=bases.get) + "/"
- crawl_bases: list[str] = [primary_base]
- primary_netloc = urlparse(primary_base).netloc.lower().lstrip("www.")
- owner_token = primary_netloc.split(".")[0] # 'bmw'
-
- if owner_token and len(owner_token) >= 3:
- domain_re = re.compile(
- r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token)
- + r"[a-z0-9\-]*\.[a-z]{2,}",
- re.IGNORECASE,
- )
- seen_bases = {primary_base}
- for entry in doc_entries:
- text = entry.get("text") or ""
- for m in domain_re.finditer(text):
- p = urlparse(m.group(0))
- base = f"{p.scheme}://{p.netloc}/"
- base_netloc = p.netloc.lower().lstrip("www.")
- if base_netloc == primary_netloc:
- continue
- if base in seen_bases:
- continue
- seen_bases.add(base)
- crawl_bases.append(base)
- if len(crawl_bases) >= 3:
- break
- if len(crawl_bases) >= 3:
- break
-
- _update(
- check_id,
- f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...",
- 18,
- )
-
- discovered: list[dict] = []
- disc_payloads: list[dict] = []
- disc_cookie_texts: list[str] = []
- for base in crawl_bases:
- try:
- async with httpx.AsyncClient(timeout=300.0) as client: # P90: 180s -> 300s
- resp = await client.post(
- f"{CONSENT_TESTER_URL}/dsi-discovery",
- json={"url": base, "max_documents": 15},
- timeout=300.0, # P90: 180s -> 300s
- )
- if resp.status_code != 200:
- logger.warning("auto-discovery: HTTP %d for %s",
- resp.status_code, base)
- continue
- body = resp.json()
- discovered.extend(body.get("documents", []) or [])
- disc_payloads.extend(body.get("cmp_payloads") or [])
- cmp_text = body.get("cmp_cookie_text") or ""
- if cmp_text:
- disc_cookie_texts.append(cmp_text)
- logger.info("auto-discovery on %s: %d docs, %d CMP payloads, "
- "cmp_cookie_text=%d words", base,
- len(body.get("documents", []) or []),
- len(body.get("cmp_payloads") or []),
- len(cmp_text.split()))
- except Exception as e:
- # P90: verbose exception fuer Diagnose
- logger.warning("auto-discovery failed for %s: %s (%s)",
- base, str(e) or "(empty)", type(e).__name__)
-
- # Classify each discovered doc into a canonical doc_type
- by_type: dict[str, dict] = {}
- for d in discovered:
- title = (d.get("title") or "").lower()
- url = (d.get("url") or "").lower()
- wc = d.get("word_count") or 0
- if wc < 100:
- continue
- canon = _classify_discovered_doc(title, url)
- if canon and canon in missing and canon not in by_type:
- by_type[canon] = d
-
- # Append/Update entry for every missing canonical type. Auto-discovered
- # ones get the text/URL filled; ungratched ones stay empty so the
- # padding step renders them as 'Auf der Website nicht gefunden'.
- # VW-Fix: wenn schon ein leerer entry existiert (URL gesetzt, aber
- # fetch hat 0/Mini-Text geliefert), in-place updaten statt duplizieren.
- filled = 0
- for dt in missing:
- existing = next((e for e in doc_entries
- if e.get("doc_type") == dt), None)
- new_entry: dict = existing if existing else {
- "doc_type": dt, "url": "", "text": "", "word_count": 0,
- "auto_discovered": False, "discovery_attempted": True,
- "cmp_payloads": [],
- }
- new_entry["discovery_attempted"] = True
- d = by_type.get(dt)
- if d:
- full = d.get("full_text") or d.get("text_preview") or ""
- # For cookie: prefer the CMP-reconstructed text when it's
- # substantially richer than the auto-discovered DOM extraction.
- # BMW homepage CMP yields ~1800 words of authoritative policy;
- # DOM extraction typically yields ~600 words of site chrome.
- if dt == "cookie" and disc_cookie_texts:
- cmp_merged = "\n\n".join(disc_cookie_texts)
- if len(cmp_merged.split()) > len(full.split()):
- logger.info(
- "cookie: using CMP-reconstructed text (%d words) "
- "instead of DOM (%d words)",
- len(cmp_merged.split()), len(full.split()),
- )
- full = cmp_merged
- if len(full.split()) >= 100:
- new_entry["text"] = full
- # Behalte die original URL als "rejected_url" damit Audit
- # zeigt 'X war 404, wir haben Y gefunden'.
- if existing and (existing.get("url") or "").strip() in failed_urls:
- new_entry["rejected_url"] = existing.get("url")
- new_entry["url"] = d.get("url", "")
- new_entry["word_count"] = len(full.split())
- new_entry["auto_discovered"] = True
- if dt == "cookie" and disc_payloads:
- new_entry["cmp_payloads"] = disc_payloads
- doc_texts[dt] = full
- filled += 1
- logger.info(
- "auto-discovered %s on %s: %s (%d words)%s",
- dt, base, d.get("url", "")[:80], new_entry["word_count"],
- " [REPLACED failed URL]" if existing else "",
- )
- if not existing:
- doc_entries.append(new_entry)
-
- logger.info(
- "auto-discovery: filled %d/%d missing types from %s",
- filled, len(missing), base,
- )
-
-
-# Title/URL keywords → canonical doc_type. Order matters: most-specific first.
-_DISCOVERY_RULES: list[tuple[str, tuple[str, ...]]] = [
- ("cookie", ("cookie", "kuche", "biscuit", "cookies-")),
- ("widerruf", ("widerruf", "rueckgabe", "rückgabe", "cancellation",
- "right-of-withdrawal", "ruecktritts", "rücktritts")),
- ("social_media", ("social-media", "soziale-medien", "social_media",
- "social-media-policy")),
- # P23: 'terms-and-conditions' kann Allgemeine Geschaeftsbedingungen ODER
- # Nutzungsbedingungen meinen. Discovery-Funktion klassifiziert spaeter
- # praeziser per Titel + Inhalt. Hier nur Url-Hint:
- ("agb", ("/agb", "geschaeftsbedingungen", "geschäftsbedingungen",
- "general-terms")),
- ("nutzungsbedingungen", ("nutzungsbedingung", "nutzungsbedingungen",
- "terms-of-use", "terms-and-conditions",
- "nutzungsordnung", "terms-of-service",
- "allgemeine-nutzungsbedingungen")),
- ("dsb", ("datenschutzbeauftragt", "data-protection-officer",
- "dpo-contact", "/dsb")),
- ("impressum", ("impressum", "imprint", "legal-notice", "site-notice",
- "anbieterkennzeichnung", "legal-disclaimer-pool")),
- ("dse", ("data-privacy", "datenschutz", "data-protection",
- "privacy-policy", "privacy-notice", "dsgvo",
- "data_privacy", "datenschutzinformation")),
-]
-
-
-def _classify_discovered_doc(title: str, url: str) -> str | None:
- """Map a discovered doc (by its title + URL) to one of our 8 canonical types."""
- haystack = f"{title} {url}"
- for canon, keywords in _DISCOVERY_RULES:
- if any(kw in haystack for kw in keywords):
- return canon
- return None
-
-
-async def _check_single(
- text: str, doc_type: str, label: str, url: str,
- word_count: int, use_agent: bool,
- business_scope: set[str] | None = None,
- business_profile: dict | None = None,
-):
- """Run regex + MC checks on a single document."""
- from compliance.services.doc_checks.runner import check_document_completeness
- from compliance.services.rag_document_checker import check_document_with_controls
- from .agent_doc_check_routes import CheckItem, DocCheckResult
-
- # Regex checklist
- findings = check_document_completeness(text, doc_type, label, url,
- business_profile=business_profile)
-
- all_checks: list[CheckItem] = []
- completeness = 0
- correctness = 0
-
- for f in findings:
- if "SCORE" in f.get("code", ""):
- for c in f.get("all_checks", []):
- all_checks.append(CheckItem(
- id=c["id"], label=c["label"], passed=c["passed"],
- severity=c["severity"], matched_text=c.get("matched_text", ""),
- level=c.get("level", 1), parent=c.get("parent"),
- skipped=c.get("skipped", False), hint=c.get("hint", ""),
- ))
- completeness = f.get("completeness_pct", 0)
- correctness = f.get("correctness_pct", 0)
-
- # Master Control checks (top 20 by severity to avoid noise)
- try:
- # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has
- # 1874 across 8 types; regex matching is cheap and dominates
- # well under 1s per doc). Caps remain on the LLM-enrich step
- # (top-10 FAILs) so cost stays bounded.
- mc_results = await check_document_with_controls(
- text, doc_type, label, max_controls=0, use_agent=use_agent,
- business_scope=business_scope,
- )
- if mc_results:
- for mc in mc_results:
- all_checks.append(CheckItem(**mc))
- l2 = [c for c in all_checks if c.level == 2 and not c.skipped]
- l2_passed = sum(1 for c in l2 if c.passed)
- correctness = round(l2_passed / len(l2) * 100) if l2 else 0
- except Exception as e:
- logger.warning("MC check skipped for %s: %s", label, e)
-
- # LLM verification of regex fails
- failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint]
- if failed:
- try:
- from compliance.services.doc_checks.llm_verify import verify_failed_checks
- overturns = await verify_failed_checks(
- text,
- [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
- label,
- )
- for c in all_checks:
- if c.id in overturns and overturns[c.id]["overturned"]:
- c.passed = True
- c.matched_text = f"[LLM] {overturns[c.id]['evidence']}"
- l2_active = [c for c in all_checks if c.level == 2 and not c.skipped]
- l2_passed = sum(1 for c in l2_active if c.passed)
- if l2_active:
- correctness = round(l2_passed / len(l2_active) * 100)
- except Exception as e:
- logger.warning("LLM verification skipped: %s", e)
-
- # Cookie-policy only: actively HTTP-probe the Opt-Out + Privacy-Policy
- # URLs the document advertises. Broken links make individual provider
- # entries non-compliant under Art. 7(3) DSGVO.
- if doc_type == "cookie":
- try:
- from compliance.services.cookie_link_validator import (
- extract_links, validate_links, build_check_items,
- )
- links = extract_links(text)
- if links:
- logger.info("Cookie-link validator: %d urls extracted from %s",
- len(links), label)
- validated = await validate_links(links)
- for item in build_check_items(validated):
- all_checks.append(CheckItem(**item))
- # Re-compute correctness with the new L2 items
- l2_active = [c for c in all_checks if c.level == 2 and not c.skipped]
- l2_passed = sum(1 for c in l2_active if c.passed)
- if l2_active:
- correctness = round(l2_passed / len(l2_active) * 100)
- except Exception as e:
- logger.warning("Cookie-link validation skipped for %s: %s", label, e)
-
- non_score = [f for f in findings if "SCORE" not in f.get("code", "")]
- return DocCheckResult(
- label=label, url=url, doc_type=doc_type,
- word_count=word_count or len(text.split()),
- completeness_pct=completeness, correctness_pct=correctness,
- checks=all_checks, findings_count=len(non_score),
- )
-
-
-def _pad_results_with_missing(
- results: list,
- discovery_attempted: set[str] | None = None,
-) -> list:
- """Ensure every canonical doc_type has an entry in the results list.
-
- Doc_types the user did not submit AND auto-discovery did not find get
- a placeholder DocCheckResult. The error message distinguishes:
- - 'Auf der Website nicht gefunden' (discovery was attempted)
- - 'Nicht eingereicht' (no submitted URLs to crawl from)
-
- Preserves the canonical ordering from _ALL_DOC_TYPES so the report
- layout is stable.
- """
- from .agent_doc_check_routes import DocCheckResult
- attempted = discovery_attempted or set()
-
- by_type: dict[str, object] = {}
- for r in results:
- canon = "dse" if r.doc_type in ("datenschutz", "privacy") else r.doc_type
- by_type[canon] = r
-
- ordered: list = []
- for dt in _ALL_DOC_TYPES:
- if dt in by_type:
- ordered.append(by_type[dt])
- continue
- if dt in attempted:
- msg = ("Auf der Website nicht gefunden — bitte URL des "
- "Dokuments manuell eintragen, falls vorhanden")
- else:
- msg = "Nicht eingereicht — Quelle nicht angegeben"
- ordered.append(DocCheckResult(
- label=_doc_type_label(dt),
- url="",
- doc_type=dt,
- word_count=0,
- completeness_pct=0,
- correctness_pct=0,
- checks=[],
- findings_count=0,
- error=msg,
- scenario="missing",
- ))
-
- extras = [r for r in results
- if (r.doc_type if r.doc_type not in ("datenschutz", "privacy") else "dse")
- not in _ALL_DOC_TYPES]
- ordered.extend(extras)
- return ordered
-
-
-_COMPOUND_TLDS = {
- "co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in",
- "com.au", "com.br", "com.mx", "com.tr", "com.sg",
-}
-
-
-def _extract_domain(doc_entries: list[dict]) -> str | None:
- """Extract base domain (without www) from first URL."""
- for entry in doc_entries:
- url = entry.get("url", "")
- if url and "://" in url:
- from urllib.parse import urlparse
- host = urlparse(url).netloc.lower()
- if host.startswith("www."):
- host = host[4:]
- return host or None
- return None
-
-
-def _company_name_from_url(doc_entries: list[dict]) -> str | None:
- """Derive a display company name from the entered URLs.
-
- Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"),
- uppercase short acronyms (<=4 chars, no hyphens), title-case the rest.
-
- Examples:
- www.bmw.de -> BMW
- mercedes-benz.de -> Mercedes-Benz
- shop.example.co.uk -> Example
- juris.de -> Juris
- """
- from urllib.parse import urlparse
-
- for entry in doc_entries:
- url = entry.get("url", "")
- if not url or "://" not in url:
- continue
- host = urlparse(url).netloc.lower()
- if host.startswith("www."):
- host = host[4:]
- parts = host.split(".")
- if len(parts) < 2:
- continue
- # Handle compound TLDs (.co.uk etc.)
- if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS:
- sld = parts[-3]
- else:
- sld = parts[-2]
- if not sld:
- continue
- if len(sld) <= 4 and "-" not in sld:
- return sld.upper()
- return "-".join(p.capitalize() for p in sld.split("-"))
- return None
-
-
-def _get_skip_types(profile) -> dict[str, str]:
- """Doc_types to skip entirely with a per-type reason message.
-
- Heute primaer fuer OEM-Konfigurator-Pattern (BMW/Audi/Mercedes):
- wenn die Site kein Direkt-Vertrieb macht, sind AGB/Widerruf/
- Nutzungsbedingungen nicht Pflicht auf der Website — sie werden
- beim Vertragshaendler ausgehaendigt.
- """
- if getattr(profile, "no_direct_sales", False):
- msg = (
- "Nicht anwendbar — die Webseite schliesst keinen Direkt-"
- "Kaufvertrag (OEM-Konfigurator-Pattern, Vertrag laeuft "
- "ueber Vertragshaendler). AGB/Widerruf werden beim "
- "Haendler ausgehaendigt."
- )
- return {
- "agb": msg,
- "widerruf": msg,
- "nutzungsbedingungen": msg,
- }
- return {}
-
-
-def _apply_profile_filter(result, profile, doc_type: str):
- """Adjust INFO-level checks based on business profile context.
-
- For example: ODR check only relevant for B2C online shops.
- """
- from .agent_doc_check_routes import CheckItem
-
- for check in result.checks:
- cid = check.id.lower()
-
- # ODR/OS-Link: relevant ONLY for B2C online shops. The check's
- # default hint is written for B2B (it explains why it's not
- # relevant) — for B2C we must replace it with action-oriented
- # guidance, otherwise the report contradicts itself.
- if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower():
- if profile.needs_odr:
- if not check.passed:
- check.hint = (
- "Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 "
- "auf die OS-Plattform (https://ec.europa.eu/consumers/odr) "
- "verlinken — klickbarer Link, nicht nur Text. Zusaetzlich "
- "§36 VSBG: angeben, ob Sie an Verbraucher-"
- "Streitbeilegungsverfahren teilnehmen (oder nicht)."
- )
- else:
- check.skipped = True
- check.hint = "Nicht relevant (kein B2C Online-Shop)"
-
- # Widerruf: Flag entire document as unnecessary for B2B
- if doc_type == "widerruf" and profile.business_type not in ("b2c", "unknown"):
- check.severity = "INFO"
- if not check.passed:
- check.hint = (
- "Als B2B-Unternehmen benoetigen Sie keine Widerrufsbelehrung "
- "(§355 BGB gilt nur fuer Verbrauchervertraege). "
- "Empfehlung: Entfernen Sie die Widerrufsbelehrung von "
- "Ihrer Website, da sie Verwirrung stiften kann."
- )
-
- # Regulated profession: check for Kammer info
- if "kammer" in cid or "berufsordnung" in check.label.lower():
- if not profile.is_regulated_profession:
- check.skipped = True
- check.hint = "Nicht relevant (kein regulierter Beruf)"
-
- return result
-
-
-# ── Helpers ──────────────────────────────────────────────────────────
-
-_DOC_TYPE_LABELS = {
- "dse": "Datenschutzerklaerung",
- "datenschutz": "Datenschutzerklaerung",
- "privacy": "Datenschutzerklaerung",
- "impressum": "Impressum",
- "agb": "AGB",
- "widerruf": "Widerrufsbelehrung",
- "cookie": "Cookie-Richtlinie",
- "avv": "Auftragsverarbeitung",
- "loeschkonzept": "Loeschkonzept",
- "dsfa": "Datenschutz-Folgenabschaetzung",
- "social_media": "Social Media Datenschutz",
- "nutzungsbedingungen": "Nutzungsbedingungen",
- "dsb": "DSB-Kontakt",
- # P74: Legal-Notice / Rechtliche Hinweise (IP, Forward-Looking, Risiko)
- "legal_notice": "Rechtliche Hinweise",
- # P96: Digital Services Act-Pflichtangaben (Art. 12+17 DSA)
- "dsa": "DSA-Pflichtangaben",
- # P97: Lizenzhinweise Dritter (OSS-Compliance)
- "lizenzhinweise": "Lizenzhinweise Dritter",
-}
-
-# Canonical doc types in the same order as the frontend ComplianceCheckTab.
-# The route pads `results` to always contain an entry for each — even if
-# the user did not submit a URL — so the email + frontend always show
-# the complete checklist (missing rows marked as 'Nicht eingereicht').
-#
-# DSB-Kontakt is intentionally NOT canonical: per GDPR practice the DSB is
-# named *inside* the DSI/datenschutz document (email or contact block), not
-# as a separate page. We check 'DSB benannt' as a sub-check of the DSE
-# instead. If a tenant insists on a separate DSB document, they can still
-# submit one — it just won't appear as a missing checklist row.
-_ALL_DOC_TYPES = [
- "dse", "impressum", "social_media", "cookie",
- "agb", "nutzungsbedingungen", "widerruf",
-]
-
-
-def _doc_type_label(doc_type: str) -> str:
- return _DOC_TYPE_LABELS.get(doc_type, doc_type.upper())
-
-
-def _result_to_dict(r) -> dict:
- """Convert DocCheckResult to JSON-serializable dict."""
- fields = ("id", "label", "passed", "severity", "matched_text",
- "level", "parent", "skipped", "hint")
- return {
- "label": r.label, "url": r.url, "doc_type": r.doc_type,
- "word_count": r.word_count, "completeness_pct": r.completeness_pct,
- "correctness_pct": r.correctness_pct,
- "checks": [{f: getattr(c, f) for f in fields} for c in r.checks],
- "findings_count": r.findings_count, "error": r.error,
- "scenario": getattr(r, "scenario", ""),
- }
-
-
-def _build_profile_html(profile) -> str:
- from .agent_doc_check_report import build_profile_html
- return build_profile_html(profile)
-
-
-# Cross-check extracted to compliance.services.banner_cookie_cross_check
-from compliance.services.banner_cookie_cross_check import cross_check_banner_vs_cookie as _cross_check_banner_vs_cookie
-
-
# ── Admin: audit drill-down (A5) + trend view (A6) ──────────────────
@router.get("/audit/{check_id}")
diff --git a/backend-compliance/compliance/services/consent_reachability_check.py b/backend-compliance/compliance/services/consent_reachability_check.py
new file mode 100644
index 00000000..d47d61fc
--- /dev/null
+++ b/backend-compliance/compliance/services/consent_reachability_check.py
@@ -0,0 +1,278 @@
+"""
+B1 — Cookie-Consent-UX-001: Mobile Reachability of Consent Settings.
+
+DSGVO Art. 7 Abs. 3 requires that withdrawing consent must be as
+easy as giving it. EDPB Cookie Banner Taskforce Report (2023) and
+DSK OH Digitale Dienste v1.2 (2024) both demand a permanent, directly
+reachable way to change cookie preferences — typically a Footer link
+labelled "Cookie-Einstellungen" that re-opens the CMP in place.
+
+Common anti-patterns we want to flag:
+ - Footer points to a Cookie-Policy *page* in a new tab, no CMP
+ - Footer only offers "more info" but no "manage settings"
+ - Only mention is a verbal reference to browser settings inside the
+ privacy-policy text
+ - Mobile footer hides the link in a multi-level accordion
+
+This module does the STATIC HTML analysis. The dynamic part (mobile
+viewport rendering, tap-target measurement, click-behaviour
+verification) is performed by consent-tester via Playwright and feeds
+back into `evaluate_combined` in a later phase.
+
+Pure module — no DB, no network. Tests live in
+tests/test_consent_reachability_check.py.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from html.parser import HTMLParser
+from urllib.parse import urljoin, urlparse
+
+logger = logging.getLogger(__name__)
+
+# Phrases that suggest "open the consent manager" rather than "show
+# more info / open a policy page".
+_REOPEN_PHRASES = (
+ "cookie-einstellungen", "cookie einstellungen",
+ "cookie-präferenzen", "cookie praeferenzen", "cookie-praferenzen",
+ "cookie-einwilligung", "einwilligung verwalten",
+ "consent manager", "consent settings", "consent-einstellungen",
+ "datenschutz-einstellungen", "datenschutzeinstellungen",
+ "cookies verwalten", "manage cookies", "manage preferences",
+ "privacy settings", "privacy preferences",
+ "tracking-einstellungen",
+)
+
+# Weaker — these usually point at a policy page, not the CMP itself.
+_INFO_ONLY_PHRASES = (
+ "cookie-richtlinie", "cookie richtlinie", "cookie-policy",
+ "cookie policy", "cookies (information)",
+ "datenschutz", "datenschutzerklärung", "privacy policy",
+ "weitere informationen", "more information",
+)
+
+# Phrases that try to shift the burden to the user's browser —
+# Bundesländer-Datenschutzbeauftragte explicitly call this insufficient.
+_BROWSER_DEFLECTION_PHRASES = (
+ "browser-einstellungen", "browsereinstellungen",
+ "einstellungen ihres browsers", "browser settings",
+ "in ihrem browser", "über ihren browser",
+)
+
+
+class _AnchorCollector(HTMLParser):
+ """Collects and