""" Unified Compliance Check Routes — check all documents in one request. POST /compliance/agent/extract-text — extract text from a URL POST /compliance/agent/compliance-check — unified check for all documents GET /compliance/agent/compliance-check/{check_id} — poll status """ import asyncio import logging import os import re import uuid as _uuid from dataclasses import asdict from datetime import datetime, timezone import httpx from fastapi import APIRouter from pydantic import BaseModel from compliance.services.smtp_sender import send_email logger = logging.getLogger(__name__) router = APIRouter(prefix="/compliance/agent", tags=["agent"]) CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094" # In-memory job store (same pattern as doc-check) _compliance_check_jobs: dict[str, dict] = {} # ── Models ─────────────────────────────────────────────────────────── class ExtractTextRequest(BaseModel): url: str class DocumentInput(BaseModel): doc_type: str # dse, agb, impressum, cookie, widerruf, avv, loeschkonzept, etc. url: str = "" text: str = "" # text has priority over URL class ComplianceCheckRequest(BaseModel): documents: list[DocumentInput] use_agent: bool = False recipient: str = "dsb@breakpilot.local" # P12: Override fuer TDM-Vorbehalt bei dokumentierter Kunden-Erlaubnis. # Pflichtfeld tdm_override_reason wenn tdm_override=True # (z.B. "Auftragsbeziehung Safetykon GmbH, Email Hr. X 18.05.2026"). tdm_override: bool = False tdm_override_reason: str = "" # P79: 8-Feld Pre-Scan-Wizard (Branche, B2B/B2C, Direkt-Vertrieb, # Rechtsform, Konzern, MA, Besondere Daten, Drittland). Wird im # Snapshot persistiert und filtert die MC-Auswertung (P72). scan_context: dict | None = None class ComplianceCheckStartResponse(BaseModel): check_id: str status: str = "running" class ComplianceCheckStatusResponse(BaseModel): check_id: str status: str progress: str = "" progress_pct: int = 0 result: dict | None = None error: str = "" # ── Extract text endpoint ──────────────────────────────────────────── @router.post("/extract-text") async def extract_text(req: ExtractTextRequest): """Extract text from a URL via consent-tester DSI discovery. Merges all documents found on the page (sub-pages, accordions, etc.) """ try: async with httpx.AsyncClient(timeout=300.0) as client: resp = await client.post( f"{CONSENT_TESTER_URL}/dsi-discovery", json={"url": req.url, "max_documents": 5}, timeout=300.0, ) if resp.status_code != 200: return { "text": "", "word_count": 0, "title": "", "error": f"HTTP {resp.status_code} von Consent-Tester", } data = resp.json() docs = data.get("documents", []) if not docs: return { "text": "", "word_count": 0, "title": "", "error": "Kein Text extrahierbar", } # Merge all documents (handles multi-page DSIs like BMW) texts = [] for doc in docs: t = doc.get("full_text", "") or doc.get("text_preview", "") or "" if t and len(t) > 50: texts.append(t) text = "\n\n".join(texts) if texts else "" title = docs[0].get("title", "") or docs[0].get("doc_type", "") word_count = len(text.split()) return { "text": text, "word_count": word_count, "title": title, "error": "", } except Exception as e: logger.warning("extract-text failed for %s: %s", req.url, e) return { "text": "", "word_count": 0, "title": "", "error": str(e)[:200], } # ── Unified compliance check ──────────────────────────────────────── @router.post("/compliance-check") async def start_compliance_check(req: ComplianceCheckRequest): """Start async compliance check for all documents.""" check_id = str(_uuid.uuid4())[:8] _compliance_check_jobs[check_id] = { "status": "running", "progress": "Pruefung gestartet...", "progress_pct": 0, "result": None, "error": "", } asyncio.create_task(_run_compliance_check(check_id, req)) return ComplianceCheckStartResponse(check_id=check_id, status="running") @router.get("/compliance-check/{check_id}") async def get_compliance_check_status(check_id: str): """Poll compliance check status.""" job = _compliance_check_jobs.get(check_id) if not job: return {"check_id": check_id, "status": "not_found"} return ComplianceCheckStatusResponse( check_id=check_id, status=job["status"], progress=job.get("progress", ""), progress_pct=job.get("progress_pct", 0), result=job.get("result"), error=job.get("error", ""), ) # ── P80: Snapshot + Replay ─────────────────────────────────────────── @router.get("/snapshots") async def list_snapshots(domain: str = "", limit: int = 20): """P80: list recent snapshots, optionally filtered by site_domain.""" from database import SessionLocal from compliance.services.check_snapshot import list_snapshots_for_domain db = SessionLocal() try: if domain: return {"snapshots": list_snapshots_for_domain(db, domain, limit)} from sqlalchemy import text rows = db.execute( text(""" SELECT id, check_id, site_domain, site_label, created_at, replay_count, notes FROM compliance.compliance_check_snapshots ORDER BY created_at DESC LIMIT :lim """), {"lim": limit}, ).fetchall() return {"snapshots": [ {"id": str(r[0]), "check_id": r[1], "site_domain": r[2], "site_label": r[3], "created_at": str(r[4]), "replay_count": r[5], "notes": r[6]} for r in rows ]} finally: db.close() @router.get("/snapshots/{snapshot_id}") async def get_snapshot(snapshot_id: str): """P80: load full snapshot raw data.""" from fastapi import HTTPException from database import SessionLocal from compliance.services.check_snapshot import load_snapshot db = SessionLocal() try: snap = load_snapshot(db, snapshot_id) if not snap: raise HTTPException(status_code=404, detail="snapshot not found") return snap finally: db.close() @router.post("/admin/tcf-ingest") async def tcf_ingest(): """P105 — IAB TCF Vendor-Liste ingestieren / refreshen. Idempotent: holt aktuelle GVL und upserted in compliance.cookie_library mit source='iab_tcf_v2'. Aufruf ein paar Mal pro Jahr ausreichend.""" from database import SessionLocal from compliance.services.tcf_vendor_authority import ( fetch_and_ingest_tcf_vendors, ) db = SessionLocal() try: return await fetch_and_ingest_tcf_vendors(db) finally: db.close() @router.get("/snapshots/{snapshot_id}/pdf") async def export_snapshot_pdf(snapshot_id: str): """P88 — PDF-Export der Audit-Mail. Liefert application/pdf.""" from fastapi import HTTPException from fastapi.responses import Response from database import SessionLocal from compliance.services.mail_pdf_export import render_snapshot_as_pdf db = SessionLocal() try: pdf = render_snapshot_as_pdf(db, snapshot_id) finally: db.close() if not pdf: raise HTTPException(404, f"Snapshot {snapshot_id} nicht gefunden " "oder PDF-Render fehlgeschlagen.") fname = f"breakpilot-audit-{snapshot_id[:8]}.pdf" return Response( content=pdf, media_type="application/pdf", headers={"Content-Disposition": f'attachment; filename="{fname}"'}, ) @router.post("/snapshots/{snapshot_id}/replay") async def replay_snapshot( snapshot_id: str, recipient: str = "", dry_run: bool = True, ): """P80: replay audit mail render from snapshot. 7min->2sec test cycle. Default dry_run=true just returns rendered HTML size + section breakdown. Pass recipient + dry_run=false to actually send a [REPLAY] mail. """ from database import SessionLocal from compliance.services.check_replay import replay_from_snapshot db = SessionLocal() try: return replay_from_snapshot( db, snapshot_id=snapshot_id, recipient=(recipient if recipient else None), dry_run=dry_run, ) finally: db.close() async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): """Background task: check all documents with business-profile context.""" try: from compliance.services.business_profiler import detect_business_profile from compliance.services.doc_checks.runner import check_document_completeness from compliance.services.rag_document_checker import check_document_with_controls from .agent_doc_check_routes import CheckItem, DocCheckResult from .agent_doc_check_report import build_html_report # Reset anchor-locator cache per run (avoid cross-run leak) try: from compliance.services.doc_anchor_locator import reset_cache reset_cache() except Exception: pass # P7: TDM-Reservation-Check der Base-Domain (§ 44b UrhG). # Bei reserved/denied: Run sofort beenden, kein Crawl. try: from compliance.services.tdm_reservation_check import ( check_tdm_reservation, is_crawl_allowed, ) first_url = next( (d.url for d in req.documents if d.url), "", ) if first_url: tdm = await check_tdm_reservation(first_url) _compliance_check_jobs[check_id]["tdm"] = tdm # P12: Bei tdm_override + Reason wird NICHT abgebrochen, # sondern nur dokumentiert. Override ohne Reason wird ignoriert. override_active = ( req.tdm_override and len((req.tdm_override_reason or "").strip()) >= 10 ) if not is_crawl_allowed(tdm) and not override_active: _compliance_check_jobs[check_id]["status"] = "skipped_tdm" _compliance_check_jobs[check_id]["error"] = ( f"TDM-Vorbehalt fuer {tdm.get('domain')} erkannt " f"(status={tdm.get('status')}) — Crawl nach § 44b " f"UrhG nicht zulaessig. Signals: " f"{[s.get('src') for s in tdm.get('signals', [])]}" ) _compliance_check_jobs[check_id]["progress_pct"] = 100 logger.info("TDM-skip check_id=%s domain=%s status=%s", check_id, tdm.get("domain"), tdm.get("status")) return if override_active and not is_crawl_allowed(tdm): _compliance_check_jobs[check_id]["tdm_override"] = { "reason": req.tdm_override_reason.strip()[:500], "original_status": tdm.get("status"), } logger.warning( "TDM-Override aktiv: check_id=%s domain=%s " "status=%s reason=%r", check_id, tdm.get("domain"), tdm.get("status"), req.tdm_override_reason.strip()[:80], ) except Exception as e: logger.warning("TDM-check failed (proceeding): %s", e) # Step 1: Resolve texts (fetch from URL if needed) — 0-30% _update(check_id, "Texte werden geladen...", 1) doc_texts: dict[str, str] = {} doc_entries: list[dict] = [] # Cache fetched URLs to detect duplicates url_text_cache: dict[str, str] = {} n_docs = max(1, len(req.documents)) # User-pasted-Tabellen-Vendors (kein LLM noetig) — werden weiter # unten in cmp_vendors gemerged. pasted_table_vendors: list[dict] = [] for i, doc in enumerate(req.documents): pct = int(1 + (i / n_docs) * 29) _update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct) text = (doc.text or "").strip() input_source = "url" cmp_payloads: list[dict] = [] if text: input_source = "text" if doc.url: input_source = "text+url" # User hat beide gefuellt logger.info( "doc_type=%s: User hat URL UND Text geliefert — " "Text gewinnt, URL wird als Quellen-Referenz behalten", doc.doc_type, ) elif doc.url: url_key = doc.url.strip().rstrip("/").lower() if url_key in url_text_cache: text = url_text_cache[url_key] else: text, cmp_payloads = await _fetch_text(doc.url, doc_type=doc.doc_type) if text: url_text_cache[url_key] = text # Auto-Reclassify-Check: wenn der user Text in das falsche # Doc-Type-Feld kopiert hat (z.B. Impressum-Text in DSE), # erkennen und ggf. umtaggen. actual_doc_type = doc.doc_type reclassify_hint: dict | None = None if input_source.startswith("text") and len(text) >= 500: try: from compliance.services.doc_type_classifier import ( detect_mismatch, ) reclassify_hint = detect_mismatch(doc.doc_type, text) if reclassify_hint and reclassify_hint["action"] == "reclassify": actual_doc_type = reclassify_hint["detected"] logger.info( "doc_type AUTO-RECLASSIFY: deklariert=%s " "erkannt=%s (score %d vs %d) — uebernehme erkannten Typ", doc.doc_type, actual_doc_type, reclassify_hint["detected_score"], reclassify_hint["declared_score"], ) except Exception as e: logger.warning("doc_type_classifier failed: %s", e) # Cookie-Tabelle: wenn User Tabelle reinkopiert hat, deterministisch # parsen (kein LLM noetig) und Vendors gleich ableiten. if input_source.startswith("text") and actual_doc_type == "cookie": try: from compliance.services.cookies_table_parser import ( parse_cookie_table, ) tab_vendors = parse_cookie_table(text) if tab_vendors: pasted_table_vendors.extend(tab_vendors) logger.info( "Cookie-Tabelle erkannt im pasted Text — " "%d Vendors / %d Cookies deterministisch geparst", len(tab_vendors), sum(len(v.get("cookies", [])) for v in tab_vendors), ) except Exception as e: logger.warning("cookies_table_parser failed: %s", e) if text: doc_texts[actual_doc_type] = text doc_entries.append({ "doc_type": actual_doc_type, "declared_doc_type": doc.doc_type, "url": doc.url, "text": text, "word_count": len(text.split()) if text else 0, "auto_discovered": False, "discovery_attempted": False, "cmp_payloads": cmp_payloads, "input_source": input_source, "reclassify_hint": reclassify_hint, }) # Step 1a-bis: AUTO-DISCOVERY. For each canonical doc_type the user # did NOT submit a URL/text for, try to find it on the homepage of # the submitted URLs. This bridges the gap between "user knows the # exact URL" (rare) and "user pasted the homepage" (common). await _autodiscover_missing( check_id, doc_entries, doc_texts, url_text_cache, ) # Step 1b: Section splitting — two cases: # 1. Same URL used for multiple doc_types → split by heading # 2. DSI text contains Cookie/Social-Media sections → auto-fill empty rows from compliance.services.section_splitter import ( split_shared_texts, auto_fill_from_dsi, cross_search_documents, ) split_shared_texts(doc_entries, url_text_cache) auto_fill_from_dsi(doc_entries) # Step 1c: Cross-document search — find doc_types in wrong documents (30-35%) _update(check_id, "Dokumente werden uebergreifend durchsucht...", 32) placement_findings = cross_search_documents(doc_entries) # Refresh doc_texts after all splitting/searching for entry in doc_entries: if entry.get("text"): doc_texts[entry["doc_type"]] = entry["text"] # P15: Dedupe — wenn mehrere Doc-Types DASSELBE Dokument referenzieren # (z.B. Safetykon: User gibt /datenschutz fuer dse + cookie + widerruf), # behalten wir nur den primaeren Doc-Type. Andere: leeren + note. # Priorität: dse > impressum > cookie > widerruf > agb > nutzungsbedingungen _DOC_PRIORITY = ["dse", "impressum", "cookie", "widerruf", "agb", "nutzungsbedingungen", "social_media", "dsb"] seen_text_hash: dict[int, str] = {} for dt in _DOC_PRIORITY: entry = next((e for e in doc_entries if e.get("doc_type") == dt and e.get("text")), None) if not entry: continue text_hash = hash((entry.get("text") or "").strip()[:1000]) if text_hash in seen_text_hash: primary = seen_text_hash[text_hash] logger.info( "P15 dedup: doc_type=%s referenziert dasselbe Dokument " "wie %s (URL=%s) -> als Duplikat markiert.", dt, primary, entry.get("url", "")[:60], ) entry["text"] = "" entry["word_count"] = 0 entry["url"] = "" entry["dup_of"] = primary doc_texts.pop(dt, None) else: seen_text_hash[text_hash] = dt # Step 2: Detect business profile (35-40%) _update(check_id, "Geschaeftsmodell wird erkannt...", 37) # P16: Homepage-Text mit fuer Profile-Detection (no_direct_sales # B2B-Indikatoren wie "CE-Zertifizierung" / "Schulungen" stehen oft # nur im Homepage-Menue, nicht im Pflichttext). profile_input = dict(doc_texts) try: base_url = "" for e in doc_entries: if e.get("url"): from urllib.parse import urlparse p = urlparse(e["url"]) if p.scheme and p.netloc: base_url = f"{p.scheme}://{p.netloc}/" break if base_url: import re as _re async with httpx.AsyncClient( timeout=8.0, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) " "AppleWebKit/537.36 HeadlessChrome/120.0.0.0"}, ) as _hc: _hr = await _hc.get(base_url) if _hr.status_code == 200 and "text/html" in _hr.headers.get( "content-type", ""): _html = _hr.text[:60000] _html = _re.sub(r"]*>.*?", " ", _html, flags=_re.DOTALL | _re.IGNORECASE) _html = _re.sub(r"]*>.*?", " ", _html, flags=_re.DOTALL | _re.IGNORECASE) _html = _re.sub(r"<[^>]+>", " ", _html) _html = _re.sub(r"\s+", " ", _html).strip() if len(_html.split()) > 30: profile_input["__homepage"] = _html[:20000] logger.info("P16 homepage merged for profile: %d words", len(_html.split())) except Exception as e: logger.debug("homepage fetch for profile failed: %s", e) profile = await detect_business_profile(profile_input) profile_dict = asdict(profile) # Step 3: Check each document results: list[DocCheckResult] = [] total_findings = 0 use_agent_flag = req.use_agent or os.getenv( "COMPLIANCE_USE_AGENT", "false" ).lower() == "true" # Filter out doc_types that don't apply to this business profile skip_types = _get_skip_types(profile) # Derive business_scope hints for the MC filter (O1 — Doc-type Scope-Flag). # MCs that explicitly require a feature (e.g. 'biometric_processing', # 'ai_decision_making', 'child_targeting') get dropped when the # detected profile doesn't declare it. business_scope: set[str] = set() for svc in (getattr(profile, "detected_services", []) or []): business_scope.add(str(svc).lower()) if (getattr(profile, "business_type", "") or "").lower() == "b2c": business_scope.add("b2c") if getattr(profile, "has_online_shop", False): business_scope.add("ecommerce") if getattr(profile, "is_regulated_profession", False): business_scope.add("regulated_profession") # Document checks: 40-80% n_entries = max(1, len(doc_entries)) for i, entry in enumerate(doc_entries): text = entry["text"] doc_type = entry["doc_type"] label = _doc_type_label(doc_type) url = entry["url"] if doc_type in skip_types: results.append(DocCheckResult( label=label, url=url, doc_type=doc_type, error=skip_types[doc_type], )) continue pct = int(40 + (i / n_entries) * 40) _update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct) if not text or len(text) < 50: # P15: duplicate doc that was deduped against a primary doc if entry.get("dup_of"): results.append(DocCheckResult( label=label, url="", doc_type=doc_type, error=f"Nicht separat vorhanden — wird im Dokument " f"'{_doc_type_label(entry['dup_of'])}' " f"mit-geprueft.", )) continue # P24: DSB-Kontakt ist Pflichtangabe in der DSE (Art. 13(1)(b) # DSGVO) — wenn kein separates DSB-Dokument vorliegt, ist das # KEIN Fehler. DSB-Pruefung passiert ohnehin in der DSE. if doc_type == "dsb" and not (entry.get("url") or "").strip(): results.append(DocCheckResult( label=label, url="", doc_type=doc_type, error="Nicht separat vorhanden — DSB-Kontaktdaten " "werden in der Datenschutzerklaerung als " "Pflichtangabe nach Art. 13(1)(b) DSGVO geprueft.", )) continue # Empty entry — either from auto-discovery padding (no URL # to fetch) or from a fetch that returned nothing. If there # was a URL we keep the error so the user knows the fetch # failed; otherwise let the padding step label it # 'Nicht eingereicht' / 'Auf der Website nicht gefunden'. if (entry.get("url") or "").strip(): results.append(DocCheckResult( label=label, url=url, doc_type=doc_type, error="Kein Text vorhanden oder zu kurz", )) continue result = await _check_single( text, doc_type, label, url, entry["word_count"], use_agent_flag, business_scope=business_scope, business_profile={"no_direct_sales": getattr(profile, "no_direct_sales", False)}, ) # Apply profile context filter result = _apply_profile_filter(result, profile, doc_type) # Add placement findings — but only if the regex checks confirm # the text doesn't match. If completeness >= 50%, the text IS the # right doc_type despite missing cross-search keywords. if result.completeness_pct < 50: for pf in placement_findings: if pf.get("doc_type") == doc_type: result.checks.insert(0, CheckItem(**{ k: v for k, v in pf.items() if k != "doc_type" })) results.append(result) total_findings += result.findings_count # Step 3b: Banner-Check (automatic, uses first URL or homepage) banner_result = None banner_url = req.documents[0].url if req.documents and req.documents[0].url else "" # Use the homepage (strip path) for banner check if banner_url: from urllib.parse import urlparse parsed = urlparse(banner_url) banner_url = f"{parsed.scheme}://{parsed.netloc}" if banner_url: _update(check_id, "Cookie-Banner wird geprueft...", 82) try: async with httpx.AsyncClient(timeout=900.0) as client: # P50: +10min for vendor-detail-phase resp = await client.post( f"{CONSENT_TESTER_URL}/scan", json={"url": banner_url, "timeout_per_phase": 10}, ) if resp.status_code == 200: banner_result = resp.json() except Exception as e: logger.warning( "Banner check failed: %s (%s)", e or "", type(e).__name__ ) # Step 3c: Cross-check Banner vs Cookie-Richtlinie (88-90%) if banner_result and "cookie" in doc_texts: _update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...", 89) cross_findings = _cross_check_banner_vs_cookie( banner_result, doc_texts["cookie"], ) if cross_findings: for r in results: if r.doc_type == "cookie": for cf in cross_findings: r.checks.append(CheckItem(**cf)) l2 = [c for c in r.checks if c.level == 2 and not c.skipped] l2p = sum(1 for c in l2 if c.passed) r.correctness_pct = round(l2p / len(l2) * 100) if l2 else 0 # Step 3d: TCF Vendor cross-check against DSI tcf_vendors = banner_result.get("tcf_vendors", []) if banner_result else [] vvt_entries: list[dict] = [] if tcf_vendors and "dse" in doc_texts: _update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...", 91) from compliance.services.banner_cookie_cross_check import cross_check_vendors_vs_dsi from compliance.services.vendor_vvt_mapper import map_vendors_to_vvt vendor_findings = cross_check_vendors_vs_dsi(tcf_vendors, doc_texts["dse"]) if vendor_findings: for r in results: if r.doc_type == "dse": for vf in vendor_findings: r.checks.append(CheckItem(**vf)) vvt_entries = map_vendors_to_vvt(tcf_vendors) # Step 4: Extract profile hints from documents (92-95%) _update(check_id, "Profil wird aus Dokumenten extrahiert...", 93) from compliance.services.profile_extractor import extract_profile_from_documents extracted_profile = extract_profile_from_documents(doc_texts, profile_dict) # Step 4b: Determine scenario per document for r in results: if r.error: r.scenario = "skip" elif r.completeness_pct < 30: r.scenario = "regenerate" elif r.completeness_pct < 95: r.scenario = "fix" else: r.scenario = "import" # Step 4c: Always render all 8 canonical doc types. Missing types # are differentiated: # - Discovery was tried but found nothing -> 'Auf der Website # nicht gefunden' (suggest user provides URL manually) # - No submitted URLs at all -> 'Nicht eingereicht' attempted = { e["doc_type"] for e in doc_entries if e.get("discovery_attempted") } results = _pad_results_with_missing(results, discovery_attempted=attempted) # Step 5: Build report with management summary (95-98%) _update(check_id, "Report wird erstellt...", 96) from .agent_doc_check_report import ( build_management_summary, build_scanned_urls_html, build_provider_list_html, ) from .agent_doc_check_extras import build_vvt_table_html # Extract structured vendor records from any CMP payloads captured # for the cookie doc (BMW ePaaS, OneTrust, etc.), validate their # opt-out + privacy URLs concurrently, score each entry. cmp_vendors: list[dict] = [] try: from compliance.services.vendor_extractor import ( extract_vendors_from_payloads, ) from compliance.services.cookie_link_validator import ( validate_vendor_urls, score_vendors, ) cookie_payloads = [] cookie_text = "" # P30: aggregate cmp_payloads from ALL doc_entries — sites # like Mercedes load Usercentrics only on the homepage, so # the JSON gets captured during DSE/Impressum discovery, not # in the cookies.html fetch. Dedup by URL since the same # payload is captured on every page load. seen_cmp_urls: set[str] = set() for e in doc_entries: for p in (e.get("cmp_payloads") or []): p_url = p.get("url") or "" if p_url and p_url in seen_cmp_urls: continue seen_cmp_urls.add(p_url) cookie_payloads.append(p) if e.get("doc_type") == "cookie" and e.get("text"): cookie_text = e["text"] # P48: also pull cmp_payloads from the Banner-Scan (homepage # 3-phase consent test). Mercedes' Usercentrics-JSON is # captured there even when not in DSI-Discovery of static # legal pages. if banner_result: for p in (banner_result.get("cmp_payloads") or []): p_url = p.get("url") or "" if p_url and p_url in seen_cmp_urls: continue seen_cmp_urls.add(p_url) cookie_payloads.append(p) if cookie_payloads: logger.info("P48: %d CMP-payloads available for vendor-extract (after Banner-Scan merge)", len(cookie_payloads)) # P17-D: Fallback wenn cookie via P15 deduped wurde — nutze DSE-Text # sofern Cookie-Begriffe drin sind, damit LLM-Vendor-Extract trotzdem # greifen kann. if not cookie_text and not cookie_payloads: dse_t = doc_texts.get("dse", "") if dse_t and any(w in dse_t.lower() for w in ("cookie", "tracking", "google analytics", "consent")): cookie_text = dse_t logger.info("P17-D: vendor-extract Fallback auf DSE (Cookie deduped)") # Site-owner derived from the submitted URLs — drives the # INTERNAL/GROUP_COMPANY classification of vendor records. owner_name = _company_name_from_url(doc_entries) or "" if cookie_payloads: cmp_vendors = extract_vendors_from_payloads( cookie_payloads, owner_name=owner_name, ) # P52: LLM-Fallback nicht nur wenn 0 Vendors, sondern auch # wenn die strukturierten Quellen < 5 Vendors lieferten und # der Cookie-Text substantiell ist. So holt sich VW-typische # Setups (Generic CMP, 28 Cookies aber 0 cmp_payloads) noch # ihre echten Vendors aus dem Text. if (len(cmp_vendors) < 5 and cookie_text and len(cookie_text.split()) >= 500): from compliance.services.vendor_llm_extractor import ( extract_vendors_via_llm, ) from compliance.services.vendor_classifier import classify _update(check_id, "Vendor-Liste per LLM extrahieren...", 94) llm_vendors = await extract_vendors_via_llm(cookie_text) # P52: classify die LLM-Vendors und MERGE mit existing # statt zu ueberschreiben. existing_names = {(v.get("name") or "").strip().lower() for v in cmp_vendors} added_llm = 0 for v in llm_vendors: nm = (v.get("name") or "").strip() if not nm or nm.lower() in existing_names: continue v["recipient_type"] = classify( vendor_name=nm, category=v.get("category", ""), owner_name=owner_name, ) v.setdefault("source", "llm_cascade") cmp_vendors.append(v) existing_names.add(nm.lower()) added_llm += 1 if added_llm: logger.info( "P52 LLM-Cascade: +%d Vendors (total: %d)", added_llm, len(cmp_vendors), ) # P57: Phase G vendor_details als zusätzliche Vendor-Quelle. # Wenn extract_vendors_from_payloads weniger findet als # Phase G's Info-Click-Through (z.B. Mercedes-Settings nicht # erkannt als usercentrics-kind), die Phase-G-Namen als # eigenständige Vendors hinzufügen. if banner_result: vd_list = banner_result.get("vendor_details") or [] vd_list = [v for v in vd_list if v.get("name") != "__TDM_OPTOUT__"] existing_names = {(v.get("name") or "").strip().lower() for v in cmp_vendors} added = 0 for d in vd_list: n = (d.get("name") or "").strip() if not n or n.lower() in existing_names: continue # Skip generic category-labels (Mercedes-Kategorien) if n.lower() in ("technisch erforderlich", "analyse und statistik", "marketing", "alles auswählen", "alles auswaehlen"): continue from compliance.services.vendor_classifier import classify cmp_vendors.append({ "name": n, "country": "", "purpose": d.get("description", "")[:500], "category": "", "opt_out_url": d.get("opt_out_url", ""), "privacy_policy_url": d.get("privacy_url", ""), "persistence": d.get("retention", ""), "cookies": d.get("cookies", []), "processing_company": d.get("processing_company", ""), "address": d.get("address", ""), "purposes": d.get("purposes", []), "technologies": d.get("technologies", []), "recipient_type": classify( vendor_name=n, category="", owner_name=owner_name, ), }) existing_names.add(n.lower()) added += 1 if added: logger.info("P57: added %d new vendors from Phase G (total: %d)", added, len(cmp_vendors)) # D — HTML-Tabellen die der consent-tester aus dem DOM # extrahiert hat: direkt deterministisch parsen (hoechste # Genauigkeit, keine LLM-Halluzinationen). for pl in (cookie_payloads or []): if pl.get("kind") != "html_table": continue rows = pl.get("rows") or [] if len(rows) < 3: continue try: from compliance.services.cookies_table_parser import ( parse_cookie_table as _parse_ct_d, ) table_text = "\n".join(rows) d_vendors = _parse_ct_d(table_text) if d_vendors: existing_d = {(v.get("name") or "").strip().lower() for v in cmp_vendors} added_d = 0 for v in d_vendors: nm = (v.get("name") or "").strip() if not nm or nm.lower() in existing_d: continue v.setdefault("source", "html_table_dom") cmp_vendors.append(v) existing_d.add(nm.lower()) added_d += 1 if added_d: logger.info( "D HTML-Table-DOM-Parse: +%d Vendors aus " "%d-Zeilen-Tabelle (total: %d)", added_d, len(rows), len(cmp_vendors), ) except Exception as e: logger.warning("html_table parse failed: %s", e) # B — cookies_table_parser auch auf gecrawltem Cookie-Text. # Erst Standard-Parse (Tab/Pipe-getrennt). Wenn der nichts # findet (kein Separator), Flat-Pattern-Parse fuer Sites wie # VW die ihre Tabelle als flachen Text liefern. if cookie_text and len(cookie_text) >= 500: try: from compliance.services.cookies_table_parser import ( parse_cookie_table as _parse_ct, parse_flat_cookie_text as _parse_flat, ) crawled_table_vendors = _parse_ct(cookie_text) if not crawled_table_vendors: crawled_table_vendors = _parse_flat(cookie_text) if crawled_table_vendors: existing = {(v.get("name") or "").strip().lower() for v in cmp_vendors} added_c = 0 for v in crawled_table_vendors: nm = (v.get("name") or "").strip() if not nm or nm.lower() in existing: continue v.setdefault("source", "table_crawled") cmp_vendors.append(v) existing.add(nm.lower()) added_c += 1 if added_c: logger.info( "B Crawled-Tabellen-Parse: +%d Vendors " "(total: %d)", added_c, len(cmp_vendors), ) except Exception as e: logger.warning("crawled-table-parse failed: %s", e) # User-pasted Cookie-Tabelle (deterministisch, kein LLM): # die hat IMMER Vorrang weil 100% genau. if pasted_table_vendors: existing = {(v.get("name") or "").strip().lower() for v in cmp_vendors} added_p = 0 for v in pasted_table_vendors: nm = (v.get("name") or "").strip() if not nm or nm.lower() in existing: continue cmp_vendors.append(v) existing.add(nm.lower()) added_p += 1 if added_p: logger.info( "Pasted-Tabellen-Merge: +%d Vendors (total: %d)", added_p, len(cmp_vendors), ) # Cookie-Library-Fallback (P52 Lite): wenn weiterhin wenige # Vendors aber viele after_accept-Cookies, aus Library auflösen. # VW-Lehre: 6 LLM-Grob-Vendors reichen NICHT — die Library # holt 30+ weitere aus den Cookie-Namen + Cookie-Doc-Pattern. # Schwelle: immer probieren wenn < 20 Vendors. if banner_result and len(cmp_vendors) < 20: try: from compliance.services.cookie_to_vendor_fallback import ( fallback_vendors_for_run, ) from database import SessionLocal as _SLfb _fb_db = _SLfb() try: extra = fallback_vendors_for_run( _fb_db, banner_result, len(cmp_vendors), cookie_doc_text=cookie_text, ) if extra: existing_names = {(v.get("name") or "").strip().lower() for v in cmp_vendors} for v in extra: if v["name"].lower() in existing_names: continue cmp_vendors.append(v) logger.info( "Cookie-Library-Fallback: cmp_vendors %d -> %d", len(cmp_vendors) - len(extra), len(cmp_vendors), ) finally: _fb_db.close() except Exception as e: logger.warning("Cookie-Library-Fallback skipped: %s", e) # Vendor-Normalizer: Dedup (Google-Familie etc) + Garbage-Filter try: from compliance.services.vendor_normalizer import ( normalize_vendors as _norm_v, ) cmp_vendors = _norm_v(cmp_vendors) except Exception as e: logger.warning("vendor_normalizer skipped: %s", e) # P50: enrich vendors with per-vendor detail-modal-extracts # (description, opt-out URL, privacy URL, cookies). Detail # comes from Phase G Info-button-click-through in /scan. tdm_opt_out_notice = "" if cmp_vendors and banner_result: vendor_details = banner_result.get("vendor_details") or [] # P50f: filter out TDM-opt-out sentinel tdm_sentinel = next((v for v in vendor_details if v.get("name") == "__TDM_OPTOUT__"), None) if tdm_sentinel: tdm_opt_out_notice = tdm_sentinel.get("description", "") logger.info("P50f: TDM opt-out — skipped detail-enrichment for vendors") vendor_details = [v for v in vendor_details if v.get("name") != "__TDM_OPTOUT__"] if vendor_details: details_by_name = {} for d in vendor_details: n = (d.get("name") or "").strip().lower() if n: details_by_name[n] = d enriched = 0 for v in cmp_vendors: key = (v.get("name") or "").strip().lower() # Substring fallback for fuzzy matches (e.g. # "Google Analytics" detail-name may differ slightly) d = details_by_name.get(key) if not d: for dn, dv in details_by_name.items(): if key in dn or dn in key: d = dv break if not d: continue if not v.get("country") and (d.get("processing_company") or d.get("address")): # Heuristic country extract from address (DE/EU keywords) addr = d.get("address", "") if re.search(r"\b(deutschland|germany|berlin|m(?:ue|ü)nchen|hamburg|stuttgart)\b", addr, re.I): v["country"] = "DE" elif re.search(r"\bireland|irland|dublin\b", addr, re.I): v["country"] = "IE" elif re.search(r"\busa|united states|california|new york|delaware\b", addr, re.I): v["country"] = "US" if not v.get("purpose"): v["purpose"] = d.get("description", "")[:500] if not v.get("opt_out_url"): v["opt_out_url"] = d.get("opt_out_url", "") if not v.get("privacy_policy_url"): v["privacy_policy_url"] = d.get("privacy_url", "") if not v.get("cookies"): v["cookies"] = d.get("cookies", []) v["purposes"] = d.get("purposes", []) v["technologies"] = d.get("technologies", []) if not v.get("persistence"): v["persistence"] = d.get("retention", "") v["processing_company"] = d.get("processing_company", "") v["address"] = d.get("address", "") enriched += 1 logger.info("P50: enriched %d/%d vendors with detail-modal data", enriched, len(cmp_vendors)) # P59b: Cookie-Behavior-Validator — pruefe alle gesetzten Cookies # gegen unsere Library, generiere 3-Tier-Severity-Findings. # Background-Task hat keinen DB-Dependency-Inject -> SessionLocal # selber oeffnen + sauber schliessen. cookie_behavior_findings: list[dict] = [] if banner_result: cookies_detailed = banner_result.get("cookies_detailed") or [] if cookies_detailed: cb_session = None try: from database import SessionLocal from compliance.services.cookie_behavior_validator import ( validate_cookie_behavior, ) from urllib.parse import urlparse fp_domain = "" if banner_url: fp_domain = urlparse(banner_url).netloc.replace("www.", "") cb_session = SessionLocal() cookie_behavior_findings = validate_cookie_behavior( cb_session, cookies_detailed, network_requests=[], # TODO Layer B in P59d first_party_domain=fp_domain, ) if cookie_behavior_findings: sevs = {f["severity"] for f in cookie_behavior_findings} logger.info( "P59b: Cookie-Behavior-Check %d findings " "(severities: %s) ueber %d Cookies", len(cookie_behavior_findings), sorted(sevs), len(cookies_detailed), ) banner_result["cookie_behavior_findings"] = ( cookie_behavior_findings ) else: logger.info( "P59b: Cookie-Behavior-Check 0 findings " "ueber %d Cookies (library miss / clean)", len(cookies_detailed), ) except Exception as cb_err: logger.warning("P59b Cookie-Behavior-Check failed: %s", cb_err) finally: if cb_session is not None: try: cb_session.close() except Exception: pass # P61: "Untergeschobene Cookies" — wenn z.B. Google Tag Manager # deklariert ist, kommen GA + GCL_AU + DoubleClick automatisch mit. # Findings landen im banner_result fuer Mail-Render. if banner_result and cmp_vendors: try: from compliance.services.vendor_package_cookies import ( detect_implicit_cookies, ) declared = [v.get("name", "") for v in cmp_vendors if v.get("name")] actual_cookies: list[str] = [] for phase_data in (banner_result.get("phases") or {}).values(): if isinstance(phase_data, dict): for ck in (phase_data.get("cookies") or []): if isinstance(ck, dict) and ck.get("name"): actual_cookies.append(ck["name"]) implicit_findings = detect_implicit_cookies( declared, actual_cookies_set=actual_cookies or None, ) if implicit_findings: banner_result["implicit_vendor_findings"] = implicit_findings logger.info( "P61: %d implicit vendor-package items detected " "(%d cookies + %d vendors)", len(implicit_findings), sum(1 for f in implicit_findings if f["implicit"]["type"] == "cookie"), sum(1 for f in implicit_findings if f["implicit"]["type"] == "vendor"), ) except Exception as p61_err: logger.warning("P61 implicit-vendor detection failed: %s", p61_err) if cmp_vendors: logger.info("VVT: %d vendors extracted, validating links", len(cmp_vendors)) cmp_vendors = await validate_vendor_urls(cmp_vendors) cmp_vendors = score_vendors(cmp_vendors) # Enrich each vendor with per-cookie functional roles try: from compliance.services.cookie_function_classifier import ( annotate_vendor_cookies, ) cmp_vendors = [annotate_vendor_cookies(v) for v in cmp_vendors] except Exception as e: logger.warning("Cookie function classification skipped: %s", e) except Exception as e: logger.warning("VVT vendor extraction skipped: %s", e) # Vendor-Redundanz + EU-Alternativen + Cost/Savings (O4) redundancy_report = None try: from compliance.services.vendor_redundancy import analyze as analyze_redundancy from compliance.services.vendor_cost_estimator import infer_company_tier if cmp_vendors: # Company-Tier aus business_profile ableiten — beeinflusst die # Cost-Range so dass z.B. fuer DAX-Konzerne nicht starter-Preise # die untere Schranke duruecken. bp_dict = { "type": getattr(profile, "business_type", ""), "features": list(business_scope), } ctier = infer_company_tier(bp_dict) redundancy_report = analyze_redundancy(cmp_vendors, company_tier=ctier) logger.info( "Redundanz: %d Kategorien mit Mehrfach-Anbietern, " "Spar-Schaetzung %s pro Jahr (company_tier=%s)", redundancy_report["summary"]["redundancy_count"], redundancy_report["summary"]["estimated_saving_pct"], ctier, ) except Exception as e: logger.warning("Vendor redundancy analysis skipped: %s", e) summary_html = build_management_summary(results) scanned_html = build_scanned_urls_html(doc_entries) providers_html = build_provider_list_html(banner_result, vvt_entries) # P18: Deep-Block mit Phases + Quality-Score + Per-Category-Tracker from .agent_doc_check_banner import build_banner_deep_html banner_deep_html = build_banner_deep_html(banner_result) vvt_html = build_vvt_table_html(cmp_vendors) # MC scorecard aggregated across ALL docs in this run (DSGVO/TDDDG/ # BGB/...). Sits at the top so the GF sees the regulation-by- # regulation view before drilling into per-doc details. from compliance.services.mc_scorecard import build_scorecard from .agent_doc_check_scorecard import build_scorecard_html all_mc_checks: list[dict] = [] # P73: pro-doc Fails sammeln um Solution-Generator pro Doc-Type # mit dem korrekten doc_text aufzurufen. fails_by_doc: dict[str, list[dict]] = {} for r in results: for c in r.checks: if c.id.startswith("mc-"): rec = { "id": c.id, "label": c.label, "passed": c.passed, "severity": c.severity, "skipped": c.skipped, "regulation": c.regulation, "hint": getattr(c, "hint", "") or "", } all_mc_checks.append(rec) if (not c.passed and not c.skipped and (c.severity or "").upper() in ("CRITICAL", "HIGH")): fails_by_doc.setdefault(r.doc_type, []).append(rec) # P106 — Audit-Type-Klassifizierung pro MC. Interne Prozess-/ # Doku-Checks werden NICHT als FAIL gewertet sondern als CHECK # (manuelle Pruefung beim DSB notwendig). try: from compliance.services.mc_audit_type import ( annotate_mc_results, split_by_audit_type, ) annotate_mc_results(all_mc_checks) mc_split = split_by_audit_type(all_mc_checks) # Fails-by-doc neu aufbauen: nur noch echte verifiable Fails fails_by_doc = {} for r in mc_split.get("verifiable_fails") or []: fails_by_doc.setdefault("dse", []).append(r) except Exception as e: logger.warning("P106 mc_audit_type skipped: %s", e) mc_split = {"internal_checks": [], "verifiable_fails": all_mc_checks} scorecard = build_scorecard(all_mc_checks) if all_mc_checks else {} # Trend: load previous scorecard for the same tenant + domain so the # email can show delta indicators (A6). prev_scorecard: dict | None = None if scorecard: try: from compliance.services.compliance_audit_log import ( list_runs_for_tenant, ) tenant_id_for_trend = req.recipient or "" base_domain_for_trend = _extract_domain(doc_entries) or "" prev_runs = list_runs_for_tenant( tenant_id_for_trend, base_domain=base_domain_for_trend, limit=1, ) if prev_runs: prev_scorecard = prev_runs[0].get("scorecard") except Exception as e: logger.debug("trend lookup skipped: %s", e) scorecard_html = ( build_scorecard_html(scorecard, previous_scorecard=prev_scorecard) if scorecard else "" ) report_html = build_html_report(results, None, doc_texts) profile_html = _build_profile_html(profile) # O4: Vendor-Redundanz / EU-Alternativen + Cost-Savings-Block from .agent_doc_check_redundancy import build_redundancy_html redundancy_html = build_redundancy_html(redundancy_report) # P1: Executive-Summary GANZ oben — CFO/GF sieht 4 KPIs + 2 CTAs. from .agent_doc_check_exec_summary import build_exec_summary_html # Site-Name fuer Header bestimmen (gleiche Logik wie Email-Subject) url_company_for_exec = _company_name_from_url(doc_entries) domain_for_exec = _extract_domain(doc_entries) site_name_for_exec = url_company_for_exec or domain_for_exec or "" exec_summary_html = build_exec_summary_html( scorecard=scorecard, previous_scorecard=prev_scorecard, cmp_vendors=cmp_vendors, redundancy_report=redundancy_report, site_name=site_name_for_exec, ) # P18: Critical-Findings-Block (rot oben, mit Sofortmassnahmen + # Quellen + Bussgeld-Praezedenz). Wird nur gerendert wenn echte # kritische Verstoesse vorliegen. critical_html = "" try: from .agent_doc_check_critical import build_critical_findings_html critical_html = build_critical_findings_html( banner_result=banner_result, scorecard=scorecard, results=results, ) except Exception as e: logger.warning("Critical-findings block skipped: %s", e) # P10: Cookie-Policy-Architecture-Detection (BMW-Pattern erkennen) cookie_arch_html = "" try: from compliance.services.cookie_policy_architecture import ( detect_architecture, build_architecture_html, ) cookie_doc_url = "" cookie_doc_text = doc_texts.get("cookie", "") cookie_cmp_payloads: list[dict] = [] for e in doc_entries: if (e.get("doc_type") or "").lower() in ("cookie", "cookie_policy"): cookie_doc_url = e.get("url", "") cookie_cmp_payloads = e.get("cmp_payloads") or [] break # P17-A: Fallback wenn Cookie-Doc via P15 deduped wurde — nutze # den DSE-Text wenn er Cookie-Schluesselwoerter enthaelt. if not cookie_doc_text: dse_text = doc_texts.get("dse", "") if dse_text and any(w in dse_text.lower() for w in ("cookie", "tracking", "google analytics", "consent")): cookie_doc_text = dse_text dse_entry = next((e for e in doc_entries if e.get("doc_type") == "dse"), {}) cookie_doc_url = dse_entry.get("url", "") cookie_cmp_payloads = dse_entry.get("cmp_payloads") or [] logger.info("P17-A: cookie-arch fallback auf DSE (Cookie-Doc deduped)") if cookie_doc_text: arch = detect_architecture( doc_url=cookie_doc_url, doc_text=cookie_doc_text, cmp_payloads=cookie_cmp_payloads, homepage_cmp_payloads=cmp_payloads or [], ) cookie_arch_html = build_architecture_html(arch) logger.info("cookie-arch: layer=%s versioned=%s risk=%s", arch["layer_separation"], arch["versioned"], arch["risk_label"]) except Exception as e: logger.warning("cookie-architecture detection failed: %s", e) # Reihenfolge — Sales-optimiert: # 1) Exec-Summary (KPIs + Saving + CTAs) # 2) summary_html (Konkrete Aufgaben fuer die Geschaeftsfuehrung) # 3) scanned_urls (Quellen-Transparenz) # 4) profile_html (Erkanntes Geschaeftsmodell) # 5) scorecard_html (MC-Scorecard) # 6) redundancy_html (Optimierungspotenzial — direkt nach Compliance-Score) # 7) providers_html + vvt_html (Vendor-Liste) # 8) report_html (Doc-Pruefung Details) # P62: Marketing-Manager-Disclaimer — was wir sehen vs nicht sehen scope_disclaimer_html = "" try: from .scope_disclaimer import build_scope_disclaimer_html scope_disclaimer_html = build_scope_disclaimer_html() except Exception as e: logger.warning("Scope-disclaimer block skipped: %s", e) # P103 + P104 — Cookie-Value-Entropy + Network-Tracing (Stufe 3 + 4) entropy_html = "" network_trace_html = "" try: from compliance.services.cookie_value_entropy import ( check_cookies_for_entropy_mismatch, build_entropy_block_html, ) from compliance.services.cookie_network_tracer import ( trace_cookie_network, build_network_trace_block_html, ) cookies_detailed = (banner_result or {}).get("cookies_detailed") or [] entropy_findings = check_cookies_for_entropy_mismatch(cookies_detailed) if entropy_findings: entropy_html = build_entropy_block_html(entropy_findings) logger.info("P103 Entropy: %d Findings", len(entropy_findings)) primary_url = "" for e_ in doc_entries: if e_.get("url"): primary_url = e_["url"]; break net_findings = trace_cookie_network(cookies_detailed, primary_url) if net_findings: network_trace_html = build_network_trace_block_html(net_findings) logger.info("P104 Network-Trace: %d Findings", len(net_findings)) except Exception as e: logger.warning("P103/P104 entropy/network-trace skipped: %s", e) # P105 — IAB TCF Authority-Cross-Reference (Stufe 5) tcf_authority_html = "" try: from compliance.services.tcf_vendor_authority import ( cross_reference_with_tcf, build_tcf_authority_block_html, ) from database import SessionLocal as _SLtcf _tcf_db = _SLtcf() try: tcf_findings = cross_reference_with_tcf(_tcf_db, cmp_vendors) if tcf_findings: tcf_authority_html = build_tcf_authority_block_html(tcf_findings) logger.info( "TCF-Authority: %d Vendor-Discrepancies gefunden", len(tcf_findings), ) finally: _tcf_db.close() except Exception as e: logger.warning("TCF-Authority-Check skipped: %s", e) # COOKIE-COMPLIANCE-AUDIT (3-Quellen-Vergleich) — das ist der # zentrale USP: deklariert in Richtlinie vs tatsaechlich im # Browser geladen vs Library-Match. cookie_audit = {} cookie_audit_html = "" try: from compliance.services.cookie_compliance_audit import ( audit_cookie_compliance, build_cookie_audit_block_html, ) from database import SessionLocal as _SLca _ca_db = _SLca() try: cookie_audit = audit_cookie_compliance( _ca_db, doc_texts.get("cookie") or doc_texts.get("dse"), banner_result, ) if cookie_audit and (cookie_audit.get("declared_count") or cookie_audit.get("browser_count")): cookie_audit_html = build_cookie_audit_block_html(cookie_audit) logger.info( "Cookie-Audit: %d deklariert, %d im Browser, " "%d undokumentiert, %d compliant", cookie_audit.get("declared_count"), cookie_audit.get("browser_count"), len(cookie_audit.get("undeclared_in_browser") or []), len(cookie_audit.get("compliant") or []), ) finally: _ca_db.close() except Exception as e: logger.warning("cookie-compliance-audit skipped: %s", e) # P102: Cookie-Klassifikations-Pruefung (deklariert vs Library) library_mismatch_html = "" mismatches: list[dict] = [] try: from compliance.services.cookie_library_mismatch import ( detect_mismatches, build_mismatch_block_html, ) from database import SessionLocal cookie_doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or "" all_cookies_seen: list[str] = [] if banner_result: for ph in (banner_result.get("phases") or {}).values(): if isinstance(ph, dict): for ck in (ph.get("cookies") or []): if isinstance(ck, str): all_cookies_seen.append(ck) elif isinstance(ck, dict) and ck.get("name"): all_cookies_seen.append(ck["name"]) if all_cookies_seen and cookie_doc_for_check: _mm_db = SessionLocal() try: mismatches = detect_mismatches( _mm_db, all_cookies_seen, cookie_doc_for_check, ) if mismatches: library_mismatch_html = build_mismatch_block_html(mismatches) logger.info( "P102: %d Cookie-Mismatches gefunden", len(mismatches) ) finally: _mm_db.close() except Exception as e: logger.warning("P102 mismatch detection failed: %s", e) # P35 + P77 + P78: Textsignal-Checks (Save-Label, Cookies-in-DSE, # JC-Klausel im DSE) signals_html = "" try: from compliance.services.doc_text_signals import ( run_all as run_signal_checks, build_signals_block_html, ) cookie_doc_missing = not bool(doc_texts.get("cookie")) sig_findings = run_signal_checks( banner_result, doc_texts, cookie_doc_missing, ) if sig_findings: signals_html = build_signals_block_html(sig_findings) except Exception as e: logger.warning("P35/P77/P78 signals-check failed: %s", e) # P92 + P94: Banner-Konsistenz (CMP-Tool kaputt / Banner-vs-Doc-Diff) consistency_html = "" try: from compliance.services.banner_consistency_checks import ( run_all as run_consistency_checks, build_consistency_block_html, ) cookie_doc_for_check = (doc_texts.get("cookie") or doc_texts.get("dse") or "") cons_findings = run_consistency_checks( banner_result or {}, cookie_doc_for_check, cmp_vendors, doc_texts=doc_texts, ) if cons_findings: consistency_html = build_consistency_block_html(cons_findings) logger.info("P92/P94: %d Konsistenz-Findings", len(cons_findings)) except Exception as e: logger.warning("P92/P94 consistency-check failed: %s", e) # P73: MC-Solution-Generator — LLM-Vorschlaege pro HIGH-Fail. # Max 5 Solutions pro Doc-Type um Latenz < 60s zu halten. solutions_html = "" try: from compliance.services.mc_solution_generator import ( generate_solutions_for_fails, build_solutions_block_html, ) all_solutions: list[dict] = [] for dt, fails in fails_by_doc.items(): if not fails: continue doc_txt = doc_texts.get(dt) or doc_texts.get("dse") or "" if not doc_txt or len(doc_txt) < 500: continue sols = await generate_solutions_for_fails( fails, doc_txt, dt, limit=3, ) all_solutions.extend(sols) if len(all_solutions) >= 8: break # global cap if all_solutions: solutions_html = build_solutions_block_html(all_solutions[:8]) logger.info("P73: %d MC-Solutions generiert", len(all_solutions)) except Exception as e: logger.warning("P73 MC-Solution-Generator skipped: %s", e) # P71: JC-vs-AVV Entscheidungsbaum (nur wenn DSE ambig) jc_decision_html = "" try: from compliance.services.jc_avv_decision import ( build_jc_avv_decision_html, ) jc_decision_html = build_jc_avv_decision_html(doc_texts.get("dse")) except Exception as e: logger.warning("P71 jc_avv_decision skipped: %s", e) # P6/P53/P55 — Branchen-Kontext + Site-History industry_ctx_html = "" try: from compliance.services.industry_library import ( build_industry_context_block_html, load_site_profile, ) from database import SessionLocal as _SLib _ind_db = _SLib() try: ind = (req.scan_context or {}).get("industry") if req.scan_context else None site_prof = load_site_profile(_ind_db, domain_for_exec or "") industry_ctx_html = build_industry_context_block_html(ind, site_prof) finally: _ind_db.close() except Exception as e: logger.warning("industry context skipped: %s", e) # P106 — Internal-Checks-Block (interne Prozesse / Doku-Pflichten) internal_checks_html = "" try: from compliance.services.mc_audit_type import ( build_internal_checks_block_html, ) ic = (mc_split or {}).get("internal_checks") or [] if ic: internal_checks_html = build_internal_checks_block_html(ic) logger.info( "P106: %d interne Checks (statt FAIL) im Block", len(ic), ) except Exception as e: logger.warning("P106 internal_checks_html skipped: %s", e) # P85 — Banner-Screenshot fuer visuellen Beweis (zwischen # GF-1-Pager und Detail-Bloecken) banner_shot_html = "" try: from compliance.services.banner_screenshot_block import ( build_banner_screenshot_html, ) banner_shot_html = build_banner_screenshot_html(banner_result) except Exception as e: logger.warning("P85 banner-screenshot skipped: %s", e) # P82: GF-1-Pager ganz oben in der Mail — 5-Bullet-Zusammenfassung # damit die GF nicht 124k Char lesen muss. gf_one_pager_html = "" try: from compliance.services.gf_one_pager import build_gf_one_pager_html gf_one_pager_html = build_gf_one_pager_html( site_name=site_name_for_exec, scorecard=scorecard, previous_scorecard=prev_scorecard, banner_result=banner_result, library_mismatch_findings=mismatches, scan_context=req.scan_context, audit_quality_findings=audit_quality_findings, ) except Exception as e: logger.warning("P82 GF-1-pager skipped: %s", e) # A — Audit-Quality-Checks: Banner-Detect-Failure, Vendor-Extract # auffaellig duenn, URL-Fetch fehlgeschlagen → IMMER prominent zeigen. audit_quality_html = "" audit_quality_findings: list[dict] = [] try: from compliance.services.audit_quality_checks import ( run_all as run_audit_quality, build_audit_quality_block_html, ) cookie_text_for_aq = doc_texts.get("cookie") or "" audit_quality_findings = run_audit_quality( banner_result, cookie_text_for_aq, cmp_vendors, doc_entries, ) if audit_quality_findings: audit_quality_html = build_audit_quality_block_html(audit_quality_findings) logger.info( "audit-quality: %d Vorbehalte erkannt", len(audit_quality_findings), ) except Exception as e: logger.warning("audit-quality-checks failed: %s", e) # Doc-Input-Warnings — wenn User Text ins falsche Feld gepastet hat input_warn_html = "" try: from compliance.services.doc_input_warnings import ( collect_warnings, build_warnings_block_html, ) warns = collect_warnings(doc_entries) if warns: input_warn_html = build_warnings_block_html(warns) logger.info("doc-input-warnings: %d Mismatches gefunden", len(warns)) except Exception as e: logger.warning("doc-input-warnings skipped: %s", e) # P86: Branchen-Benchmark (nur wenn scan_context.industry gesetzt) bench_html = "" try: from database import SessionLocal as _SLb from compliance.services.industry_benchmark import ( compute_benchmark, build_benchmark_html, _extract_score, ) industry = (req.scan_context or {}).get("industry") if req.scan_context else None curr_score = _extract_score(banner_result) if industry and curr_score is not None: _b_db = _SLb() try: bench = compute_benchmark( _b_db, industry, curr_score, check_id, ) if bench: bench_html = build_benchmark_html(bench) finally: _b_db.close() except Exception as e: logger.warning("P86 industry-benchmark skipped: %s", e) # P84: Diff-Mode — "Seit letztem Lauf X Findings weg, Y neue". diff_html = "" try: from database import SessionLocal as _SL from compliance.services.run_diff import ( compute_diff, build_diff_block_html, ) _diff_db = _SL() try: diff = compute_diff( _diff_db, check_id, domain_for_exec or "", banner_result, scorecard, ) if diff: diff_html = build_diff_block_html(diff) finally: _diff_db.close() except Exception as e: logger.warning("P84 diff-mode skipped: %s", e) full_html = ( gf_one_pager_html + audit_quality_html + input_warn_html + bench_html + diff_html + critical_html + scope_disclaimer_html + exec_summary_html + cookie_arch_html + summary_html + scanned_html + profile_html + scorecard_html + internal_checks_html + redundancy_html + industry_ctx_html + banner_shot_html + providers_html + banner_deep_html + cookie_audit_html + tcf_authority_html + entropy_html + network_trace_html + library_mismatch_html + consistency_html + signals_html + solutions_html + jc_decision_html + vvt_html + report_html ) # Step 6: Send email — derive site name primarily from entered URL. # The extracted_profile.companyName is often noisy (e.g. picks up # juris.de from legal references). Domain-derived name is more # predictable for the GF email subject. doc_count = len([r for r in results if not r.error]) url_company = _company_name_from_url(doc_entries) domain = _extract_domain(doc_entries) site_name = url_company or domain or "Unbekannt" _update(check_id, "E-Mail wird versendet...", 98) email_result = send_email( recipient=req.recipient, subject=f"[COMPLIANCE-CHECK] {site_name} — {doc_count} Dokumente geprueft", body_html=full_html, ) # Step 7: Store result response = { "check_id": check_id, "results": [_result_to_dict(r) for r in results], "business_profile": profile_dict, "extracted_profile": extracted_profile, # P18: vollen consent-tester-Output durchreichen statt nur 4 Felder. # phases (before/after-accept/reject) + banner_checks.violations + # category_tests werden vom Renderer + Critical-Findings-Block genutzt. "banner_result": ({ "detected": banner_result.get("banner_detected", False), "provider": banner_result.get("banner_provider", ""), "violations": len((banner_result.get("banner_checks") or {}) .get("violations", [])), "tcf_vendor_count": len(tcf_vendors), "completeness_pct": banner_result.get("completeness_pct"), "correctness_pct": banner_result.get("correctness_pct"), "phases": banner_result.get("phases", {}), "banner_checks": banner_result.get("banner_checks", {}), "category_tests": banner_result.get("category_tests", []), "structured_checks": banner_result.get("structured_checks", []), "summary": banner_result.get("summary", {}), } if banner_result else None), "tcf_vendors": vvt_entries if tcf_vendors else [], "cmp_vendors": cmp_vendors, "cookie_audit": cookie_audit if cookie_audit else None, "total_documents": len(results), "total_findings": total_findings, "email_status": email_result.get("status", "failed"), "checked_at": datetime.now(timezone.utc).isoformat(), } _compliance_check_jobs[check_id]["status"] = "completed" _compliance_check_jobs[check_id]["result"] = response _compliance_check_jobs[check_id]["progress"] = "Fertig" _compliance_check_jobs[check_id]["progress_pct"] = 100 # P80: persist raw scan data so we can replay audit pipeline # without re-crawling (7min -> 5sec test cycle). try: from database import SessionLocal from compliance.services.check_snapshot import save_snapshot snap_db = SessionLocal() try: save_snapshot( snap_db, check_id=check_id, doc_entries=doc_entries, banner_result=banner_result, profile=profile, cmp_vendors=cmp_vendors, scan_context=req.scan_context, # P79 site_label=site_name, notes=f"recipient={req.recipient}", ) finally: snap_db.close() except Exception as snap_err: logger.warning("P80 snapshot save skipped: %s", snap_err) # Persist to sidecar SQLite audit log — enables /audit endpoints # (A5 admin tab) and trend view (A6). Best-effort; failures here # do not affect the user-facing response. try: from compliance.services.compliance_audit_log import record_check_run from compliance.services.mc_scorecard import full_audit_records audit_rows: list[dict] = [] for r in results: doc_mc = [c for c in r.checks if c.id.startswith("mc-")] audit_rows.extend(full_audit_records( [{"id": c.id, "label": c.label, "passed": c.passed, "severity": c.severity, "skipped": c.skipped, "regulation": c.regulation, "matched_text": c.matched_text, "hint": c.hint, "level": c.level} for c in doc_mc], check_id=check_id, doc_type=r.doc_type, )) record_check_run( check_id=check_id, tenant_id=req.recipient or "", site_name=site_name, base_domain=domain or "", doc_count=doc_count, scorecard=scorecard, vvt_summary={ "total": len(cmp_vendors), "internal": sum(1 for v in cmp_vendors if (v.get("recipient_type") or "").upper() in ("INTERNAL", "GROUP_COMPANY")), "external": sum(1 for v in cmp_vendors if (v.get("recipient_type") or "").upper() in ("PROCESSOR", "CONTROLLER")), }, mc_records=audit_rows, ) from compliance.services.compliance_audit_log import record_check_payload record_check_payload( check_id=check_id, vendors=cmp_vendors, profile=extracted_profile, banner=banner_result, ) # Unified findings (P5): bundle MC + Pflichtangaben + Vendor + # Redundanz in one searchable table behind /agent/findings/. try: from compliance.services.unified_findings_collector import collect from compliance.services.unified_findings_store import record_findings unified = collect( check_id=check_id, results=results, cmp_vendors=cmp_vendors, redundancy_report=redundancy_report, doc_texts=doc_texts, ) record_findings(check_id, unified) except Exception as e: logger.warning("Unified findings collect failed: %s", e) except Exception as e: logger.warning("Audit persistence skipped: %s", e) except Exception as e: logger.error("Compliance check %s failed: %s", check_id, e, exc_info=True) _compliance_check_jobs[check_id]["status"] = "failed" _compliance_check_jobs[check_id]["error"] = str(e)[:500] def _update(check_id: str, msg: str, pct: int | None = None): job = _compliance_check_jobs[check_id] job["progress"] = msg if pct is not None: job["progress_pct"] = max(0, min(100, int(pct))) async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]: """Fetch text from URL via consent-tester, with HTTP fallback. Returns (text, cmp_payloads). cmp_payloads is the raw CMP JSON captured during navigation (ePaaS, OneTrust, …) — empty when no CMP fired or HTTP fallback was used. Backend turns payloads into structured vendor records for the VVT table in the email. """ # 1. Consent-tester (Playwright-based, full JS rendering). # max_documents depends on doc_type: # - cookie/dse/social_media: self-extract (often + CMP capture) is # authoritative, sub-pages dilute the policy text. max=1. # - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar # enterprise sites split this across 3-4 short sub-pages # (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows # them. The 15s networkidle bail (dsi_helpers) keeps timing safe. short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"} max_docs = 1 if (doc_type or "") in short_extract_types else 3 try: # P90: 120s reicht nicht fuer BMW-Impressum (Auto-Discovery folgt # 3 Sub-Docs). 240s gibt Spielraum. Mercedes faellt aktuell mit # 120s auch oft an Akamai-Latenz. async with httpx.AsyncClient(timeout=240.0) as client: resp = await client.post( f"{CONSENT_TESTER_URL}/dsi-discovery", json={"url": url, "max_documents": max_docs}, timeout=240.0, ) if resp.status_code == 200: payload = resp.json() docs = payload.get("documents", []) cmp_payloads = payload.get("cmp_payloads") or [] cmp_cookie_text = payload.get("cmp_cookie_text") or "" # D — wenn der consent-tester HTML-Tabellen aus dem DOM # extrahiert hat, in die cmp_payloads als "generic_table" # einschleusen damit das Backend sie via cookies_table_parser # verarbeiten kann. for doc in (docs or []): for tbl in (doc.get("tables") or []): if not tbl or len(tbl) < 3: continue cmp_payloads.append({ "kind": "html_table", "url": doc.get("url", ""), "rows": tbl, }) if docs: texts = [] for doc in docs: t = doc.get("full_text", "") or doc.get("text_preview", "") or "" if t and len(t) > 50: texts.append(t) merged = "\n\n".join(texts) # For cookie/dse/social_media: when CMP reconstruction is # substantially richer than DOM extraction, use it. This # fixes the BMW case where DOM yields ~600 words of # navigation but the ePaaS payload reconstructs to ~1800 # words of actual cookie policy. if (doc_type in short_extract_types and cmp_cookie_text and len(cmp_cookie_text.split()) > len(merged.split())): logger.info( "Preferring CMP-reconstructed text for %s on %s " "(%d words CMP vs %d words DOM)", doc_type, url, len(cmp_cookie_text.split()), len(merged.split()), ) merged = cmp_cookie_text if merged and len(merged.split()) > 100: if len(texts) > 1: logger.info("Merged %d docs from %s (%d words)", len(texts), url, len(merged.split())) return merged, cmp_payloads # P90-Bug-Fix: auch wenn DSE-Text zu kurz fuer 100-Wort- # Schwelle ist, die captured CMP-Payloads NICHT verwerfen. # BMW-Bug: DSE liefert 10 Wort SPA-Shell, aber ePaaS-JSON # (393KB) wurde captured. Backend braucht die fuer # extract_vendors_from_payloads (VVT-Tabelle). if cmp_payloads: logger.info( "P90: keeping %d CMP payloads for %s despite " "short text (%d words) — HTTP fallback runs in parallel", len(cmp_payloads), url, len((merged or cmp_cookie_text).split()), ) fallback_text = merged or cmp_cookie_text or "" return fallback_text, cmp_payloads except Exception as e: # P90: verbose exception fuer Diagnose (war vorher empty) logger.warning("Consent-tester fetch failed for %s: %s (%s)", url, str(e) or "(empty)", type(e).__name__) # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW). # P7: kenntlicher UA + per-Domain Rate-Limit. try: import re as _re from compliance.services.compliance_user_agent import ( default_request_headers, DomainRateLimiter, ) async with httpx.AsyncClient( timeout=30.0, follow_redirects=True, headers=default_request_headers(), ) as client: async with DomainRateLimiter(url): resp = await client.get(url) if resp.status_code == 200 and "text/html" in resp.headers.get("content-type", ""): html = resp.text # Strip HTML tags, decode entities text = _re.sub(r"]*>.*?", " ", html, flags=_re.DOTALL | _re.IGNORECASE) text = _re.sub(r"]*>.*?", " ", text, flags=_re.DOTALL | _re.IGNORECASE) text = _re.sub(r"<[^>]+>", " ", text) text = _re.sub(r"\s+", " ", text).strip() if len(text.split()) > 100: logger.info("HTTP fallback for %s: %d words", url, len(text.split())) return text, [] except Exception as e: logger.warning("HTTP fallback failed for %s: %s", url, e) return "", [] async def _autodiscover_missing( check_id: str, doc_entries: list[dict], doc_texts: dict[str, str], url_text_cache: dict[str, str], ) -> None: """For each canonical doc_type the user did not submit, try to find the corresponding document on the homepage of the site they DID submit. Modifies doc_entries in place: fills text/url/word_count and sets `auto_discovered=True`. Marks `discovery_attempted=True` on every missing entry (even when nothing was found) so the report can distinguish 'Nicht eingereicht' from 'Auf der Website nicht gefunden'. """ from urllib.parse import urlparse # VW-Fix: nur Doc-Types mit substantieller Text-Ausbeute zaehlen # als 'submitted'. Wenn der User eine URL eingegeben hat aber die # 404 liefert (VW cookie-richtlinie.html), oder der Crawler weniger # als 200 Zeichen extrahiert (SPA-Shell), als 'missing' behandeln # damit der Discovery-Pass alternative URLs probiert. _MIN_USEFUL_CHARS = 200 submitted_types = { e["doc_type"] for e in doc_entries if len((e.get("text") or "").strip()) >= _MIN_USEFUL_CHARS } # Markiere die fehlgeschlagenen URL-Submissions damit der Discovery # ihre URL nicht erneut probiert (waere sinnlos). failed_urls: set[str] = { (e.get("url") or "").strip() for e in doc_entries if (e.get("url") or "").strip() and len((e.get("text") or "").strip()) < _MIN_USEFUL_CHARS } if failed_urls: logger.info( "VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery " "soll Alternativen probieren: %s", len(failed_urls), _MIN_USEFUL_CHARS, ", ".join(list(failed_urls)[:3]), ) # Map alias types to canonical submitted_canon = { "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types } # Missing = canonical types the user did NOT submit missing = set(_ALL_DOC_TYPES) - submitted_canon if not missing: return # Pick the most common base (scheme://netloc) from submitted URLs. bases: dict[str, int] = {} for e in doc_entries: u = (e.get("url") or "").strip() if u and "://" in u: p = urlparse(u) base = f"{p.scheme}://{p.netloc}" bases[base] = bases.get(base, 0) + 1 if not bases: # No submitted URL at all — nothing to crawl from. Add empty # placeholders (with discovery_attempted=False) so the padding # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden'). for dt in missing: doc_entries.append({ "doc_type": dt, "url": "", "text": "", "word_count": 0, "auto_discovered": False, "discovery_attempted": False, }) return # Build crawl plan: primary base + any related domains mentioned in # the submitted texts that share the owner's SLD. Example: BMW Group # text mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de. primary_base = max(bases, key=bases.get) + "/" crawl_bases: list[str] = [primary_base] primary_netloc = urlparse(primary_base).netloc.lower().lstrip("www.") owner_token = primary_netloc.split(".")[0] # 'bmw' if owner_token and len(owner_token) >= 3: domain_re = re.compile( r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token) + r"[a-z0-9\-]*\.[a-z]{2,}", re.IGNORECASE, ) seen_bases = {primary_base} for entry in doc_entries: text = entry.get("text") or "" for m in domain_re.finditer(text): p = urlparse(m.group(0)) base = f"{p.scheme}://{p.netloc}/" base_netloc = p.netloc.lower().lstrip("www.") if base_netloc == primary_netloc: continue if base in seen_bases: continue seen_bases.add(base) crawl_bases.append(base) if len(crawl_bases) >= 3: break if len(crawl_bases) >= 3: break _update( check_id, f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...", 18, ) discovered: list[dict] = [] disc_payloads: list[dict] = [] disc_cookie_texts: list[str] = [] for base in crawl_bases: try: async with httpx.AsyncClient(timeout=300.0) as client: # P90: 180s -> 300s resp = await client.post( f"{CONSENT_TESTER_URL}/dsi-discovery", json={"url": base, "max_documents": 15}, timeout=300.0, # P90: 180s -> 300s ) if resp.status_code != 200: logger.warning("auto-discovery: HTTP %d for %s", resp.status_code, base) continue body = resp.json() discovered.extend(body.get("documents", []) or []) disc_payloads.extend(body.get("cmp_payloads") or []) cmp_text = body.get("cmp_cookie_text") or "" if cmp_text: disc_cookie_texts.append(cmp_text) logger.info("auto-discovery on %s: %d docs, %d CMP payloads, " "cmp_cookie_text=%d words", base, len(body.get("documents", []) or []), len(body.get("cmp_payloads") or []), len(cmp_text.split())) except Exception as e: # P90: verbose exception fuer Diagnose logger.warning("auto-discovery failed for %s: %s (%s)", base, str(e) or "(empty)", type(e).__name__) # Classify each discovered doc into a canonical doc_type by_type: dict[str, dict] = {} for d in discovered: title = (d.get("title") or "").lower() url = (d.get("url") or "").lower() wc = d.get("word_count") or 0 if wc < 100: continue canon = _classify_discovered_doc(title, url) if canon and canon in missing and canon not in by_type: by_type[canon] = d # Append/Update entry for every missing canonical type. Auto-discovered # ones get the text/URL filled; ungratched ones stay empty so the # padding step renders them as 'Auf der Website nicht gefunden'. # VW-Fix: wenn schon ein leerer entry existiert (URL gesetzt, aber # fetch hat 0/Mini-Text geliefert), in-place updaten statt duplizieren. filled = 0 for dt in missing: existing = next((e for e in doc_entries if e.get("doc_type") == dt), None) new_entry: dict = existing if existing else { "doc_type": dt, "url": "", "text": "", "word_count": 0, "auto_discovered": False, "discovery_attempted": True, "cmp_payloads": [], } new_entry["discovery_attempted"] = True d = by_type.get(dt) if d: full = d.get("full_text") or d.get("text_preview") or "" # For cookie: prefer the CMP-reconstructed text when it's # substantially richer than the auto-discovered DOM extraction. # BMW homepage CMP yields ~1800 words of authoritative policy; # DOM extraction typically yields ~600 words of site chrome. if dt == "cookie" and disc_cookie_texts: cmp_merged = "\n\n".join(disc_cookie_texts) if len(cmp_merged.split()) > len(full.split()): logger.info( "cookie: using CMP-reconstructed text (%d words) " "instead of DOM (%d words)", len(cmp_merged.split()), len(full.split()), ) full = cmp_merged if len(full.split()) >= 100: new_entry["text"] = full # Behalte die original URL als "rejected_url" damit Audit # zeigt 'X war 404, wir haben Y gefunden'. if existing and (existing.get("url") or "").strip() in failed_urls: new_entry["rejected_url"] = existing.get("url") new_entry["url"] = d.get("url", "") new_entry["word_count"] = len(full.split()) new_entry["auto_discovered"] = True if dt == "cookie" and disc_payloads: new_entry["cmp_payloads"] = disc_payloads doc_texts[dt] = full filled += 1 logger.info( "auto-discovered %s on %s: %s (%d words)%s", dt, base, d.get("url", "")[:80], new_entry["word_count"], " [REPLACED failed URL]" if existing else "", ) if not existing: doc_entries.append(new_entry) logger.info( "auto-discovery: filled %d/%d missing types from %s", filled, len(missing), base, ) # Title/URL keywords → canonical doc_type. Order matters: most-specific first. _DISCOVERY_RULES: list[tuple[str, tuple[str, ...]]] = [ ("cookie", ("cookie", "kuche", "biscuit", "cookies-")), ("widerruf", ("widerruf", "rueckgabe", "rückgabe", "cancellation", "right-of-withdrawal", "ruecktritts", "rücktritts")), ("social_media", ("social-media", "soziale-medien", "social_media", "social-media-policy")), # P23: 'terms-and-conditions' kann Allgemeine Geschaeftsbedingungen ODER # Nutzungsbedingungen meinen. Discovery-Funktion klassifiziert spaeter # praeziser per Titel + Inhalt. Hier nur Url-Hint: ("agb", ("/agb", "geschaeftsbedingungen", "geschäftsbedingungen", "general-terms")), ("nutzungsbedingungen", ("nutzungsbedingung", "nutzungsbedingungen", "terms-of-use", "terms-and-conditions", "nutzungsordnung", "terms-of-service", "allgemeine-nutzungsbedingungen")), ("dsb", ("datenschutzbeauftragt", "data-protection-officer", "dpo-contact", "/dsb")), ("impressum", ("impressum", "imprint", "legal-notice", "site-notice", "anbieterkennzeichnung", "legal-disclaimer-pool")), ("dse", ("data-privacy", "datenschutz", "data-protection", "privacy-policy", "privacy-notice", "dsgvo", "data_privacy", "datenschutzinformation")), ] def _classify_discovered_doc(title: str, url: str) -> str | None: """Map a discovered doc (by its title + URL) to one of our 8 canonical types.""" haystack = f"{title} {url}" for canon, keywords in _DISCOVERY_RULES: if any(kw in haystack for kw in keywords): return canon return None async def _check_single( text: str, doc_type: str, label: str, url: str, word_count: int, use_agent: bool, business_scope: set[str] | None = None, business_profile: dict | None = None, ): """Run regex + MC checks on a single document.""" from compliance.services.doc_checks.runner import check_document_completeness from compliance.services.rag_document_checker import check_document_with_controls from .agent_doc_check_routes import CheckItem, DocCheckResult # Regex checklist findings = check_document_completeness(text, doc_type, label, url, business_profile=business_profile) all_checks: list[CheckItem] = [] completeness = 0 correctness = 0 for f in findings: if "SCORE" in f.get("code", ""): for c in f.get("all_checks", []): all_checks.append(CheckItem( id=c["id"], label=c["label"], passed=c["passed"], severity=c["severity"], matched_text=c.get("matched_text", ""), level=c.get("level", 1), parent=c.get("parent"), skipped=c.get("skipped", False), hint=c.get("hint", ""), )) completeness = f.get("completeness_pct", 0) correctness = f.get("correctness_pct", 0) # Master Control checks (top 20 by severity to avoid noise) try: # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has # 1874 across 8 types; regex matching is cheap and dominates # well under 1s per doc). Caps remain on the LLM-enrich step # (top-10 FAILs) so cost stays bounded. mc_results = await check_document_with_controls( text, doc_type, label, max_controls=0, use_agent=use_agent, business_scope=business_scope, ) if mc_results: for mc in mc_results: all_checks.append(CheckItem(**mc)) l2 = [c for c in all_checks if c.level == 2 and not c.skipped] l2_passed = sum(1 for c in l2 if c.passed) correctness = round(l2_passed / len(l2) * 100) if l2 else 0 except Exception as e: logger.warning("MC check skipped for %s: %s", label, e) # LLM verification of regex fails failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint] if failed: try: from compliance.services.doc_checks.llm_verify import verify_failed_checks overturns = await verify_failed_checks( text, [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed], label, ) for c in all_checks: if c.id in overturns and overturns[c.id]["overturned"]: c.passed = True c.matched_text = f"[LLM] {overturns[c.id]['evidence']}" l2_active = [c for c in all_checks if c.level == 2 and not c.skipped] l2_passed = sum(1 for c in l2_active if c.passed) if l2_active: correctness = round(l2_passed / len(l2_active) * 100) except Exception as e: logger.warning("LLM verification skipped: %s", e) # Cookie-policy only: actively HTTP-probe the Opt-Out + Privacy-Policy # URLs the document advertises. Broken links make individual provider # entries non-compliant under Art. 7(3) DSGVO. if doc_type == "cookie": try: from compliance.services.cookie_link_validator import ( extract_links, validate_links, build_check_items, ) links = extract_links(text) if links: logger.info("Cookie-link validator: %d urls extracted from %s", len(links), label) validated = await validate_links(links) for item in build_check_items(validated): all_checks.append(CheckItem(**item)) # Re-compute correctness with the new L2 items l2_active = [c for c in all_checks if c.level == 2 and not c.skipped] l2_passed = sum(1 for c in l2_active if c.passed) if l2_active: correctness = round(l2_passed / len(l2_active) * 100) except Exception as e: logger.warning("Cookie-link validation skipped for %s: %s", label, e) non_score = [f for f in findings if "SCORE" not in f.get("code", "")] return DocCheckResult( label=label, url=url, doc_type=doc_type, word_count=word_count or len(text.split()), completeness_pct=completeness, correctness_pct=correctness, checks=all_checks, findings_count=len(non_score), ) def _pad_results_with_missing( results: list, discovery_attempted: set[str] | None = None, ) -> list: """Ensure every canonical doc_type has an entry in the results list. Doc_types the user did not submit AND auto-discovery did not find get a placeholder DocCheckResult. The error message distinguishes: - 'Auf der Website nicht gefunden' (discovery was attempted) - 'Nicht eingereicht' (no submitted URLs to crawl from) Preserves the canonical ordering from _ALL_DOC_TYPES so the report layout is stable. """ from .agent_doc_check_routes import DocCheckResult attempted = discovery_attempted or set() by_type: dict[str, object] = {} for r in results: canon = "dse" if r.doc_type in ("datenschutz", "privacy") else r.doc_type by_type[canon] = r ordered: list = [] for dt in _ALL_DOC_TYPES: if dt in by_type: ordered.append(by_type[dt]) continue if dt in attempted: msg = ("Auf der Website nicht gefunden — bitte URL des " "Dokuments manuell eintragen, falls vorhanden") else: msg = "Nicht eingereicht — Quelle nicht angegeben" ordered.append(DocCheckResult( label=_doc_type_label(dt), url="", doc_type=dt, word_count=0, completeness_pct=0, correctness_pct=0, checks=[], findings_count=0, error=msg, scenario="missing", )) extras = [r for r in results if (r.doc_type if r.doc_type not in ("datenschutz", "privacy") else "dse") not in _ALL_DOC_TYPES] ordered.extend(extras) return ordered _COMPOUND_TLDS = { "co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in", "com.au", "com.br", "com.mx", "com.tr", "com.sg", } def _extract_domain(doc_entries: list[dict]) -> str | None: """Extract base domain (without www) from first URL.""" for entry in doc_entries: url = entry.get("url", "") if url and "://" in url: from urllib.parse import urlparse host = urlparse(url).netloc.lower() if host.startswith("www."): host = host[4:] return host or None return None def _company_name_from_url(doc_entries: list[dict]) -> str | None: """Derive a display company name from the entered URLs. Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"), uppercase short acronyms (<=4 chars, no hyphens), title-case the rest. Examples: www.bmw.de -> BMW mercedes-benz.de -> Mercedes-Benz shop.example.co.uk -> Example juris.de -> Juris """ from urllib.parse import urlparse for entry in doc_entries: url = entry.get("url", "") if not url or "://" not in url: continue host = urlparse(url).netloc.lower() if host.startswith("www."): host = host[4:] parts = host.split(".") if len(parts) < 2: continue # Handle compound TLDs (.co.uk etc.) if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS: sld = parts[-3] else: sld = parts[-2] if not sld: continue if len(sld) <= 4 and "-" not in sld: return sld.upper() return "-".join(p.capitalize() for p in sld.split("-")) return None def _get_skip_types(profile) -> dict[str, str]: """Doc_types to skip entirely with a per-type reason message. Heute primaer fuer OEM-Konfigurator-Pattern (BMW/Audi/Mercedes): wenn die Site kein Direkt-Vertrieb macht, sind AGB/Widerruf/ Nutzungsbedingungen nicht Pflicht auf der Website — sie werden beim Vertragshaendler ausgehaendigt. """ if getattr(profile, "no_direct_sales", False): msg = ( "Nicht anwendbar — die Webseite schliesst keinen Direkt-" "Kaufvertrag (OEM-Konfigurator-Pattern, Vertrag laeuft " "ueber Vertragshaendler). AGB/Widerruf werden beim " "Haendler ausgehaendigt." ) return { "agb": msg, "widerruf": msg, "nutzungsbedingungen": msg, } return {} def _apply_profile_filter(result, profile, doc_type: str): """Adjust INFO-level checks based on business profile context. For example: ODR check only relevant for B2C online shops. """ from .agent_doc_check_routes import CheckItem for check in result.checks: cid = check.id.lower() # ODR/OS-Link: relevant ONLY for B2C online shops. The check's # default hint is written for B2B (it explains why it's not # relevant) — for B2C we must replace it with action-oriented # guidance, otherwise the report contradicts itself. if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower(): if profile.needs_odr: if not check.passed: check.hint = ( "Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 " "auf die OS-Plattform (https://ec.europa.eu/consumers/odr) " "verlinken — klickbarer Link, nicht nur Text. Zusaetzlich " "§36 VSBG: angeben, ob Sie an Verbraucher-" "Streitbeilegungsverfahren teilnehmen (oder nicht)." ) else: check.skipped = True check.hint = "Nicht relevant (kein B2C Online-Shop)" # Widerruf: Flag entire document as unnecessary for B2B if doc_type == "widerruf" and profile.business_type not in ("b2c", "unknown"): check.severity = "INFO" if not check.passed: check.hint = ( "Als B2B-Unternehmen benoetigen Sie keine Widerrufsbelehrung " "(§355 BGB gilt nur fuer Verbrauchervertraege). " "Empfehlung: Entfernen Sie die Widerrufsbelehrung von " "Ihrer Website, da sie Verwirrung stiften kann." ) # Regulated profession: check for Kammer info if "kammer" in cid or "berufsordnung" in check.label.lower(): if not profile.is_regulated_profession: check.skipped = True check.hint = "Nicht relevant (kein regulierter Beruf)" return result # ── Helpers ────────────────────────────────────────────────────────── _DOC_TYPE_LABELS = { "dse": "Datenschutzerklaerung", "datenschutz": "Datenschutzerklaerung", "privacy": "Datenschutzerklaerung", "impressum": "Impressum", "agb": "AGB", "widerruf": "Widerrufsbelehrung", "cookie": "Cookie-Richtlinie", "avv": "Auftragsverarbeitung", "loeschkonzept": "Loeschkonzept", "dsfa": "Datenschutz-Folgenabschaetzung", "social_media": "Social Media Datenschutz", "nutzungsbedingungen": "Nutzungsbedingungen", "dsb": "DSB-Kontakt", # P74: Legal-Notice / Rechtliche Hinweise (IP, Forward-Looking, Risiko) "legal_notice": "Rechtliche Hinweise", # P96: Digital Services Act-Pflichtangaben (Art. 12+17 DSA) "dsa": "DSA-Pflichtangaben", # P97: Lizenzhinweise Dritter (OSS-Compliance) "lizenzhinweise": "Lizenzhinweise Dritter", } # Canonical doc types in the same order as the frontend ComplianceCheckTab. # The route pads `results` to always contain an entry for each — even if # the user did not submit a URL — so the email + frontend always show # the complete checklist (missing rows marked as 'Nicht eingereicht'). # # DSB-Kontakt is intentionally NOT canonical: per GDPR practice the DSB is # named *inside* the DSI/datenschutz document (email or contact block), not # as a separate page. We check 'DSB benannt' as a sub-check of the DSE # instead. If a tenant insists on a separate DSB document, they can still # submit one — it just won't appear as a missing checklist row. _ALL_DOC_TYPES = [ "dse", "impressum", "social_media", "cookie", "agb", "nutzungsbedingungen", "widerruf", ] def _doc_type_label(doc_type: str) -> str: return _DOC_TYPE_LABELS.get(doc_type, doc_type.upper()) def _result_to_dict(r) -> dict: """Convert DocCheckResult to JSON-serializable dict.""" fields = ("id", "label", "passed", "severity", "matched_text", "level", "parent", "skipped", "hint") return { "label": r.label, "url": r.url, "doc_type": r.doc_type, "word_count": r.word_count, "completeness_pct": r.completeness_pct, "correctness_pct": r.correctness_pct, "checks": [{f: getattr(c, f) for f in fields} for c in r.checks], "findings_count": r.findings_count, "error": r.error, "scenario": getattr(r, "scenario", ""), } def _build_profile_html(profile) -> str: from .agent_doc_check_report import build_profile_html return build_profile_html(profile) # Cross-check extracted to compliance.services.banner_cookie_cross_check from compliance.services.banner_cookie_cross_check import cross_check_banner_vs_cookie as _cross_check_banner_vs_cookie # ── Admin: audit drill-down (A5) + trend view (A6) ────────────────── @router.get("/audit/{check_id}") async def audit_drill_down( check_id: str, doc_type: str = "", regulation: str = "", only_failed: bool = False, ): """Return scorecard + filterable MC results for a single check run. Frontend uses this to render the /sdk/agent/audit/ view. """ from compliance.services.compliance_audit_log import ( get_check_run, list_mc_results, ) run = get_check_run(check_id) if not run: return {"check_id": check_id, "found": False} rows = list_mc_results( check_id, doc_type=doc_type or None, regulation=regulation or None, only_failed=only_failed, ) return { "check_id": check_id, "found": True, "run": run, "mc_count": len(rows), "results": rows, } @router.get("/audit/tenant/{tenant_id}") async def audit_tenant_history( tenant_id: str, base_domain: str = "", limit: int = 30, ): """Tenant-level history for the trend view (A6).""" from compliance.services.compliance_audit_log import list_runs_for_tenant runs = list_runs_for_tenant( tenant_id, base_domain=base_domain or None, limit=limit, ) return {"tenant_id": tenant_id, "count": len(runs), "runs": runs}