diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index b0a23e16..5eb82ad1 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -1043,11 +1043,45 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): except Exception as e: logger.warning("Scope-disclaimer block skipped: %s", e) + # P102: Cookie-Klassifikations-Pruefung (deklariert vs Library) + library_mismatch_html = "" + try: + from compliance.services.cookie_library_mismatch import ( + detect_mismatches, build_mismatch_block_html, + ) + from database import SessionLocal + cookie_doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or "" + all_cookies_seen: list[str] = [] + if banner_result: + for ph in (banner_result.get("phases") or {}).values(): + if isinstance(ph, dict): + for ck in (ph.get("cookies") or []): + if isinstance(ck, str): + all_cookies_seen.append(ck) + elif isinstance(ck, dict) and ck.get("name"): + all_cookies_seen.append(ck["name"]) + if all_cookies_seen and cookie_doc_for_check: + _mm_db = SessionLocal() + try: + mismatches = detect_mismatches( + _mm_db, all_cookies_seen, cookie_doc_for_check, + ) + if mismatches: + library_mismatch_html = build_mismatch_block_html(mismatches) + logger.info( + "P102: %d Cookie-Mismatches gefunden", len(mismatches) + ) + finally: + _mm_db.close() + except Exception as e: + logger.warning("P102 mismatch detection failed: %s", e) + full_html = ( critical_html + scope_disclaimer_html + exec_summary_html + cookie_arch_html + summary_html + scanned_html + profile_html + scorecard_html + redundancy_html - + providers_html + banner_deep_html + vvt_html + report_html + + providers_html + banner_deep_html + library_mismatch_html + + vvt_html + report_html ) # Step 6: Send email — derive site name primarily from entered URL. diff --git a/backend-compliance/compliance/services/check_replay.py b/backend-compliance/compliance/services/check_replay.py index e110370d..33dba477 100644 --- a/backend-compliance/compliance/services/check_replay.py +++ b/backend-compliance/compliance/services/check_replay.py @@ -116,6 +116,29 @@ def replay_from_snapshot( except Exception as e: logger.warning("Replay: vvt failed: %s", e) + # P102: Cookie-Klassifikations-Pruefung + try: + from compliance.services.cookie_library_mismatch import ( + detect_mismatches, build_mismatch_block_html, + ) + cookies_seen: list[str] = [] + for ph in (banner_result.get("phases") or {}).values(): + if isinstance(ph, dict): + for ck in (ph.get("cookies") or []): + if isinstance(ck, str): + cookies_seen.append(ck) + elif isinstance(ck, dict) and ck.get("name"): + cookies_seen.append(ck["name"]) + doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or "" + if cookies_seen and doc_for_check: + mm = detect_mismatches(db, cookies_seen, doc_for_check) + if mm: + mm_html = build_mismatch_block_html(mm) + parts.append(mm_html) + section_sizes["library_mismatch"] = len(mm_html) + except Exception as e: + logger.warning("Replay: mismatch block failed: %s", e) + full_html = "".join(parts) result = { diff --git a/backend-compliance/compliance/services/cookie_library_mismatch.py b/backend-compliance/compliance/services/cookie_library_mismatch.py new file mode 100644 index 00000000..6199260e --- /dev/null +++ b/backend-compliance/compliance/services/cookie_library_mismatch.py @@ -0,0 +1,157 @@ +""" +P102 — Cookie-Library-Mismatch-Detection pro Site. + +Vergleicht die in einem Lauf erfassten Cookies (mit deklarierter +Kategorie aus dem Cookie-Doc-Text) gegen die Library +(compliance.cookie_library). Liefert Mismatches: deklariert ≠ Library. + +Genutzt im Mail-Render als neuer Block "Cookie-Klassifikations-Pruefung". +""" + +from __future__ import annotations + +import logging +import re + +from sqlalchemy import text +from sqlalchemy.orm import Session + +logger = logging.getLogger(__name__) + + +_CATEGORY_PATTERNS = [ + (re.compile(r"\b(?:strictly[-\s]?)?(?:notwendig|essential|funktional|" + r"funktionscookie|technisch[- ]?notwendig)\b", re.I), + "essential"), + (re.compile(r"\b(?:tracking|analytics|analyse|statistik|" + r"measurement|performance)\b", re.I), + "statistics"), + (re.compile(r"\b(?:marketing|werbung|advertising|targeting|" + r"drittanbieter[- ]?cookie)\b", re.I), + "marketing"), + (re.compile(r"\b(?:social[-\s]?media|share|like)\b", re.I), + "social_media"), +] + + +def _category_for(name: str, doc_text: str) -> str | None: + if not doc_text or not name: + return None + idx = doc_text.find(name) + if idx < 0: + return None + window = doc_text[max(0, idx - 50):idx + 400] + for pat, cat in _CATEGORY_PATTERNS: + if pat.search(window): + return cat + return None + + +def _load_library(db: Session) -> dict[str, dict]: + rows = db.execute(text( + "SELECT cookie_name, actual_category, vendor_name " + "FROM compliance.cookie_library" + )).fetchall() + return {r[0].lower(): {"category": r[1], "vendor": r[2]} for r in rows} + + +def detect_mismatches( + db: Session, + cookie_names_seen: list[str], + doc_text: str, +) -> list[dict]: + """Returns list of finding dicts.""" + if not cookie_names_seen or not doc_text: + return [] + + lib = _load_library(db) + findings: list[dict] = [] + seen: set[str] = set() + + for cname in cookie_names_seen: + cname = (cname or "").strip() + if not cname or cname.lower() in seen: + continue + seen.add(cname.lower()) + declared = _category_for(cname, doc_text) + if not declared: + continue + lib_entry = lib.get(cname.lower()) + if not lib_entry: + continue + lib_cat = lib_entry["category"] + if lib_cat in (None, "unknown") or lib_cat == declared: + continue + + # HIGH wenn Library sagt Marketing aber Site als essential/statistics + # deklariert (faktische Drittland-/Werbe-Verarbeitung versteckt + # als technische/statistische Notwendigkeit). MEDIUM sonst. + severity = "HIGH" if ( + lib_cat == "marketing" and declared in ("essential", "statistics") + ) else "MEDIUM" + + findings.append({ + "cookie": cname, + "declared_category": declared, + "library_category": lib_cat, + "library_vendor": lib_entry["vendor"], + "severity": severity, + }) + + return findings + + +def build_mismatch_block_html(findings: list[dict]) -> str: + """Render the mismatch findings as a Mail-Block.""" + if not findings: + return "" + + n_high = sum(1 for f in findings if f["severity"] == "HIGH") + items: list[str] = [] + for f in findings[:25]: + sev_color = "#dc2626" if f["severity"] == "HIGH" else "#d97706" + items.append( + f'
  • ' + f'' + f'{f["cookie"]} ' + f'— deklariert als ' + f'{f["declared_category"]}, ' + f'unsere Bibliothek + verbreitete ' + f'Vendor-Doku sagen ' + f'{f["library_category"]} ' + f'(Vendor: {f["library_vendor"]})' + f'
  • ' + ) + + return ( + '
    ' + '
    ' + 'Cookie-Klassifikations-Pruefung
    ' + f'

    ' + f'{len(findings)} Cookie{"s" if len(findings) != 1 else ""}' + f' mit abweichender Klassifikation gefunden' + f'{f" ({n_high} davon mit erhoehter Bedeutung)" if n_high else ""}' + f'

    ' + '

    ' + 'Wir haben die in Ihrer Cookie-Richtlinie deklarierte Kategorie der ' + 'Cookies mit unserer globalen Bibliothek (~2.300 Cookies aus Open-' + 'Cookie-Database + DACH-spezifischen Quellen) und der verbreiteten ' + 'Vendor-Doku abgeglichen. Bei den folgenden Cookies stimmt die ' + 'deklarierte Kategorie nicht mit dem typischerweise erwarteten ' + 'Zweck ueberein. Das ist kein automatischer Verstoss — aber ein ' + 'Pruefanlass: bei Marketing-Cookies braucht es Einwilligung, bei ' + 'als "essential" deklarierten nicht. Empfehlung: mit DSB / ' + 'Marketing-Agentur klaeren ob die Klassifikation korrigiert ' + 'oder die Einwilligung anders eingeholt werden muss.

    ' + '' + '

    Hintergrund: Art. 13(1)(c) DSGVO + EDPB 5/2020 ' + '— der angegebene Verarbeitungszweck muss dem tatsaechlichen ' + 'entsprechen.

    ' + '
    ' + ) diff --git a/backend-compliance/compliance/services/vendor_llm_extractor.py b/backend-compliance/compliance/services/vendor_llm_extractor.py index 8a84180c..715579d3 100644 --- a/backend-compliance/compliance/services/vendor_llm_extractor.py +++ b/backend-compliance/compliance/services/vendor_llm_extractor.py @@ -49,13 +49,19 @@ _SYSTEM_PROMPT = ( async def extract_vendors_via_llm( cookie_text: str, - max_text_chars: int = 12000, + max_text_chars: int = 50000, ) -> list[dict]: - """Run the Qwen → OVH cascade. Returns vendor records (possibly empty).""" + """Run the Qwen → OVH cascade. Returns vendor records (possibly empty). + + max_text_chars: VW-Cookie-Richtlinie hat ~60k chars mit ~100 Cookies in + der Tabelle. Bei 12k waren wir auf die ersten ~5 Cookies begrenzt und + haben nur 1 Vendor extrahiert. 50k deckt VW/BMW/Mercedes komplett ab + und passt in Qwen3-30b-a3b (128k Context) sowie OVH 120B. + """ if not cookie_text or len(cookie_text) < 500: return [] excerpt = cookie_text[:max_text_chars] - user_prompt = f"Cookie-Richtlinie-Text (gekuerzt):\n\n{excerpt}" + user_prompt = f"Cookie-Richtlinie-Text:\n\n{excerpt}" # Stage 1: local Qwen content = await _call_ollama(user_prompt) @@ -82,10 +88,13 @@ async def _call_ollama(user_prompt: str) -> str: {"role": "user", "content": user_prompt}, ], "stream": False, "format": "json", - "options": {"temperature": 0.05, "num_predict": 6000}, + # 16k tokens fuer ~80 Vendors mit je 30 Cookies. War vorher 6k → + # output wurde mittendrin abgeschnitten, JSON unparseable → 0 Vendors. + "options": {"temperature": 0.05, "num_predict": 16000}, } try: - async with httpx.AsyncClient(timeout=120.0) as client: + # Qwen 30b braucht fuer 16k output ~4-6min auf M4 Pro. + async with httpx.AsyncClient(timeout=420.0) as client: resp = await client.post(f"{base.rstrip('/')}/api/chat", json=payload) resp.raise_for_status() return (resp.json().get("message") or {}).get("content", "") @@ -109,7 +118,7 @@ async def _call_ovh(user_prompt: str) -> str: {"role": "system", "content": _SYSTEM_PROMPT}, {"role": "user", "content": user_prompt}, ], - "temperature": 0.05, "max_tokens": 6000, + "temperature": 0.05, "max_tokens": 16000, "response_format": {"type": "json_object"}, } try: diff --git a/backend-compliance/scripts/cookie_library_enrich.py b/backend-compliance/scripts/cookie_library_enrich.py new file mode 100644 index 00000000..8374992f --- /dev/null +++ b/backend-compliance/scripts/cookie_library_enrich.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 +"""P101 — Cookie-Library Auto-Enrich aus Snapshots. + +Geht alle compliance_check_snapshots durch und: + 1. Extrahiert unique (cookie_name, vendor_hint) aus Phase-A/B/C-Cookies + 2. Sammelt deklarierte Kategorie + Speicherdauer pro Cookie pro Site + 3. Vergleicht mit cookie_library (Open-Cookie-Database + DACH) + 4. Reportet: new_cookies, kategorie_mismatches, multi_site_inconsistencies + +Run im Container: + docker exec bp-compliance-backend python /tmp/enrich.py +""" +from __future__ import annotations + +import json +import re +import sys +from collections import defaultdict +from typing import Any + +from database import engine +from sqlalchemy import text + + +def _category_from_text_context(cookie_name: str, doc_text: str) -> str | None: + """Lookup cookie_name in doc_text + extract deklarierte Kategorie aus + der Tabellen-Zeile darum (innerhalb 200 Zeichen davor/danach).""" + if not doc_text or not cookie_name: + return None + idx = doc_text.find(cookie_name) + if idx < 0: + return None + window = doc_text[max(0, idx - 50):idx + 400].lower() + category_patterns = [ + (r"(?:strictly[- ])?(?:notwendig|essential|funktional|funktionscookie|" + r"funktional cookie|technisch notwendig)", "essential"), + (r"(?:tracking|analytics|analyse|statistik|measurement|performance)", + "statistics"), + (r"(?:marketing|werbung|advertising|targeting|drittanbieter)", + "marketing"), + (r"(?:social[- ]?media|share|like|like[- ]?button)", "social_media"), + ] + for pat, cat in category_patterns: + if re.search(pat, window): + return cat + return None + + +def _purpose_text(cookie_name: str, doc_text: str) -> str | None: + """Extract die Zweck-Beschreibung aus dem Doc-Text (Sätze um den Namen).""" + if not doc_text or not cookie_name: + return None + idx = doc_text.find(cookie_name) + if idx < 0: + return None + after = doc_text[idx + len(cookie_name):idx + len(cookie_name) + 400] + sentences = re.split(r"[.\n]", after) + text_lines = [s.strip() for s in sentences if 30 < len(s.strip()) < 300] + return text_lines[0] if text_lines else None + + +def main() -> int: + with engine.connect() as c: + rows = c.execute(text( + "SELECT id, site_domain, doc_entries, banner_result " + "FROM compliance.compliance_check_snapshots" + )).fetchall() + print(f"Loaded {len(rows)} snapshots", file=sys.stderr) + + # cookie_name -> list of observations + observations: dict[str, list[dict]] = defaultdict(list) + + for row in rows: + snap_id, domain, doc_entries, banner_result = row + doc_entries = doc_entries or [] + banner_result = banner_result or {} + + # Build combined doc_text fuer Kategorie-Inference (Cookie-Doc bevorzugt, + # fallback DSE) + doc_text = "" + for e in doc_entries: + if e.get("doc_type") in ("cookie", "dse"): + t = e.get("text") or e.get("full_text") or e.get("text_preview") or "" + if len(t) > len(doc_text): + doc_text = t + + phases = (banner_result or {}).get("phases", {}) + for phase_name in ("before_consent", "after_reject", "after_accept"): + phase = phases.get(phase_name) or {} + if not isinstance(phase, dict): + continue + cookies = phase.get("cookies") or [] + for ck in cookies: + # Snapshots: cookies sind meist string-Listen (Cookie-Namen), + # vereinzelt dicts mit name/domain/max_age. + if isinstance(ck, dict): + cname = (ck.get("name") or "").strip() + cdomain = (ck.get("domain") or "").lstrip(".").lower() + max_age = ck.get("max_age") or ck.get("expires") + else: + cname = str(ck).strip() + cdomain = "" + max_age = None + if not cname or len(cname) > 80: + continue + cat_declared = _category_from_text_context(cname, doc_text) + purpose = _purpose_text(cname, doc_text) + observations[cname].append({ + "site": domain, + "phase": phase_name, + "cookie_domain": cdomain, + "max_age": max_age, + "declared_category": cat_declared, + "declared_purpose": (purpose[:150] if purpose else None), + }) + + print(f"\nUnique cookies observed: {len(observations)}\n") + + # Lookup vs cookie_library + with engine.connect() as c: + lib_rows = c.execute(text( + "SELECT cookie_name, actual_category, vendor_name " + "FROM compliance.cookie_library" + )).fetchall() + lib_lookup = {r[0].lower(): {"category": r[1], "vendor": r[2]} + for r in lib_rows} + + new_cookies: list[str] = [] + mismatches: list[dict] = [] + inconsistencies: list[dict] = [] + + for cname, obs_list in observations.items(): + sites = {o["site"] for o in obs_list} + declared_cats = {o["declared_category"] for o in obs_list + if o["declared_category"]} + + # 1) Multi-Site Inkonsistenz + if len(declared_cats) > 1: + inconsistencies.append({ + "cookie": cname, + "sites": list(sites), + "categories": list(declared_cats), + }) + + # 2) Library lookup + lib_entry = lib_lookup.get(cname.lower()) + if not lib_entry: + new_cookies.append(cname) + continue + + # 3) Mismatch declared vs library + for dc in declared_cats: + if dc and lib_entry["category"] != dc and lib_entry["category"] != "unknown": + mismatches.append({ + "cookie": cname, + "declared_by_site": dc, + "library_says": lib_entry["category"], + "library_vendor": lib_entry["vendor"], + "sites": list(sites), + }) + break + + # === Report === + print("=" * 70) + print(f"AUDIT-REPORT: P101 Cookie-Library Auto-Enrich") + print(f" Snapshots: {len(rows)}") + print(f" Unique cookies observed: {len(observations)}") + print(f" In Library (Open-Cookie-DB + DACH): {len(observations) - len(new_cookies)}") + print(f" NEW (unbekannt): {len(new_cookies)}") + print(f" Mismatches (declared != library): {len(mismatches)}") + print(f" Multi-Site Inkonsistenzen: {len(inconsistencies)}") + print("=" * 70) + + print("\n--- TOP-20 NEW COOKIES (Kandidaten fuer Library-Enrich) ---") + enriched_candidates: list[tuple[str, dict]] = [] + for cname in new_cookies: + obs = observations[cname] + cats = [o["declared_category"] for o in obs if o["declared_category"]] + primary_cat = cats[0] if cats else None + purpose = next((o["declared_purpose"] for o in obs + if o["declared_purpose"]), None) + sites = sorted({o["site"] for o in obs}) + if not primary_cat: + continue # ohne deklarierte Kategorie nicht enrichbar + confidence = min(0.6 + 0.1 * len(sites), 0.95) + enriched_candidates.append((cname, { + "category": primary_cat, + "purpose": purpose, + "sites": sites, + "confidence": confidence, + })) + for cname, info in enriched_candidates[:20]: + print(f" {cname:30s} [{info['category']:12s}] conf={info['confidence']} " + f"sites={info['sites']}") + if info.get("purpose"): + print(f" purpose: {info['purpose'][:100]}") + + print(f"\n--- ALLE MISMATCHES ({len(mismatches)}) ---") + for m in mismatches[:30]: + print(f" {m['cookie']:30s} declared={m['declared_by_site']:12s} " + f"library={m['library_says']:12s} " + f"sites={m['sites']}") + + print(f"\n--- ALLE INKONSISTENZEN ({len(inconsistencies)}) ---") + for i in inconsistencies[:30]: + print(f" {i['cookie']:30s} cats={i['categories']} sites={i['sites']}") + + # Auto-Insert die mit confidence >= 0.75 + print(f"\n--- AUTO-INSERTING in cookie_library (confidence>=0.75) ---") + inserted = 0 + with engine.begin() as c: + for cname, info in enriched_candidates: + if info["confidence"] < 0.75: + continue + r = c.execute(text(""" + INSERT INTO compliance.cookie_library + (cookie_name, domain_pattern, vendor_name, + actual_category, purpose_de, + source_name, source_url, source_license, confidence) + VALUES (:n, '*', 'Mehrere OEMs (BreakPilot-Snapshot)', + :cat, :pd, + 'BreakPilot-Auto-Enrich', 'https://breakpilot.ai', + 'CC-BY-eigene-Sammlung', :cf) + ON CONFLICT DO NOTHING + """), dict( + n=cname[:200], + cat=info["category"], + pd=info.get("purpose") or f"Beobachtet bei {len(info['sites'])} OEMs", + cf=info["confidence"], + )) + inserted += r.rowcount + print(f" inserted: {inserted}") + return 0 + + +if __name__ == "__main__": + sys.exit(main())