diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py
index b0a23e16..5eb82ad1 100644
--- a/backend-compliance/compliance/api/agent_compliance_check_routes.py
+++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py
@@ -1043,11 +1043,45 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
except Exception as e:
logger.warning("Scope-disclaimer block skipped: %s", e)
+ # P102: Cookie-Klassifikations-Pruefung (deklariert vs Library)
+ library_mismatch_html = ""
+ try:
+ from compliance.services.cookie_library_mismatch import (
+ detect_mismatches, build_mismatch_block_html,
+ )
+ from database import SessionLocal
+ cookie_doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or ""
+ all_cookies_seen: list[str] = []
+ if banner_result:
+ for ph in (banner_result.get("phases") or {}).values():
+ if isinstance(ph, dict):
+ for ck in (ph.get("cookies") or []):
+ if isinstance(ck, str):
+ all_cookies_seen.append(ck)
+ elif isinstance(ck, dict) and ck.get("name"):
+ all_cookies_seen.append(ck["name"])
+ if all_cookies_seen and cookie_doc_for_check:
+ _mm_db = SessionLocal()
+ try:
+ mismatches = detect_mismatches(
+ _mm_db, all_cookies_seen, cookie_doc_for_check,
+ )
+ if mismatches:
+ library_mismatch_html = build_mismatch_block_html(mismatches)
+ logger.info(
+ "P102: %d Cookie-Mismatches gefunden", len(mismatches)
+ )
+ finally:
+ _mm_db.close()
+ except Exception as e:
+ logger.warning("P102 mismatch detection failed: %s", e)
+
full_html = (
critical_html + scope_disclaimer_html + exec_summary_html
+ cookie_arch_html + summary_html + scanned_html + profile_html
+ scorecard_html + redundancy_html
- + providers_html + banner_deep_html + vvt_html + report_html
+ + providers_html + banner_deep_html + library_mismatch_html
+ + vvt_html + report_html
)
# Step 6: Send email — derive site name primarily from entered URL.
diff --git a/backend-compliance/compliance/services/check_replay.py b/backend-compliance/compliance/services/check_replay.py
index e110370d..33dba477 100644
--- a/backend-compliance/compliance/services/check_replay.py
+++ b/backend-compliance/compliance/services/check_replay.py
@@ -116,6 +116,29 @@ def replay_from_snapshot(
except Exception as e:
logger.warning("Replay: vvt failed: %s", e)
+ # P102: Cookie-Klassifikations-Pruefung
+ try:
+ from compliance.services.cookie_library_mismatch import (
+ detect_mismatches, build_mismatch_block_html,
+ )
+ cookies_seen: list[str] = []
+ for ph in (banner_result.get("phases") or {}).values():
+ if isinstance(ph, dict):
+ for ck in (ph.get("cookies") or []):
+ if isinstance(ck, str):
+ cookies_seen.append(ck)
+ elif isinstance(ck, dict) and ck.get("name"):
+ cookies_seen.append(ck["name"])
+ doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or ""
+ if cookies_seen and doc_for_check:
+ mm = detect_mismatches(db, cookies_seen, doc_for_check)
+ if mm:
+ mm_html = build_mismatch_block_html(mm)
+ parts.append(mm_html)
+ section_sizes["library_mismatch"] = len(mm_html)
+ except Exception as e:
+ logger.warning("Replay: mismatch block failed: %s", e)
+
full_html = "".join(parts)
result = {
diff --git a/backend-compliance/compliance/services/cookie_library_mismatch.py b/backend-compliance/compliance/services/cookie_library_mismatch.py
new file mode 100644
index 00000000..6199260e
--- /dev/null
+++ b/backend-compliance/compliance/services/cookie_library_mismatch.py
@@ -0,0 +1,157 @@
+"""
+P102 — Cookie-Library-Mismatch-Detection pro Site.
+
+Vergleicht die in einem Lauf erfassten Cookies (mit deklarierter
+Kategorie aus dem Cookie-Doc-Text) gegen die Library
+(compliance.cookie_library). Liefert Mismatches: deklariert ≠ Library.
+
+Genutzt im Mail-Render als neuer Block "Cookie-Klassifikations-Pruefung".
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+
+from sqlalchemy import text
+from sqlalchemy.orm import Session
+
+logger = logging.getLogger(__name__)
+
+
+# Ordered (compiled regex, category) pairs used to infer which category a
+# policy text declares for a cookie. _category_for() returns the category of
+# the FIRST pattern that matches, so the order below is a priority order:
+# essential > statistics > marketing > social_media.
+# All patterns are case-insensitive (re.I) and anchored with \b on both sides.
+# NOTE(review): because of the trailing \b, inflected German forms such as
+# "notwendige" or "funktionale" do not match "notwendig" / "funktional";
+# confirm whether that is intended.
+_CATEGORY_PATTERNS = [
+ (re.compile(r"\b(?:strictly[-\s]?)?(?:notwendig|essential|funktional|"
+ r"funktionscookie|technisch[- ]?notwendig)\b", re.I),
+ "essential"),
+ (re.compile(r"\b(?:tracking|analytics|analyse|statistik|"
+ r"measurement|performance)\b", re.I),
+ "statistics"),
+ (re.compile(r"\b(?:marketing|werbung|advertising|targeting|"
+ r"drittanbieter[- ]?cookie)\b", re.I),
+ "marketing"),
+ (re.compile(r"\b(?:social[-\s]?media|share|like)\b", re.I),
+ "social_media"),
+]
+
+
+def _category_for(name: str, doc_text: str) -> str | None:
+    """Return the category the policy text declares for cookie *name*.
+
+    The first occurrence of *name* in *doc_text* is located and a window of
+    50 characters before and 400 characters after the start of that
+    occurrence is scanned with ``_CATEGORY_PATTERNS``; the category of the
+    first matching pattern wins.
+
+    Args:
+        name: Cookie name as observed during the scan (case-sensitive).
+        doc_text: Plain text of the cookie policy / privacy policy.
+
+    Returns:
+        One of the category strings from ``_CATEGORY_PATTERNS`` or ``None``
+        when the name does not occur in the text or no pattern matches.
+    """
+    if not doc_text or not name:
+        return None
+    # Match the name only as a complete token. A plain substring search
+    # would anchor a short name such as "id" inside "Widerruf", or "_ga"
+    # inside "_gat", and then read the category of unrelated text.
+    # Look-arounds are used instead of \b so that names which start or end
+    # with a non-word character still match.
+    hit = re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", doc_text)
+    if hit is None:
+        return None
+    idx = hit.start()
+    window = doc_text[max(0, idx - 50):idx + 400]
+    for pat, cat in _CATEGORY_PATTERNS:
+        if pat.search(window):
+            return cat
+    return None
+
+
+def _load_library(db: Session) -> dict[str, dict]:
+    """Read the whole cookie library into a dict keyed by lower-cased name.
+
+    Each value is ``{"category": actual_category, "vendor": vendor_name}``.
+    When several rows share the same lower-cased name, the row returned
+    last by the query wins.
+    """
+    query = text(
+        "SELECT cookie_name, actual_category, vendor_name "
+        "FROM compliance.cookie_library"
+    )
+    library: dict[str, dict] = {}
+    for row in db.execute(query).fetchall():
+        library[row[0].lower()] = {"category": row[1], "vendor": row[2]}
+    return library
+
+
+def detect_mismatches(
+    db: Session,
+    cookie_names_seen: list[str],
+    doc_text: str,
+) -> list[dict]:
+    """Compare declared cookie categories against the cookie library.
+
+    Args:
+        db: Session used to load ``compliance.cookie_library``.
+        cookie_names_seen: Cookie names observed during the scan; blank
+            entries and case-insensitive duplicates are ignored.
+        doc_text: Policy text from which the declared category is inferred.
+
+    Returns:
+        One dict per mismatching cookie with the keys ``cookie``,
+        ``declared_category``, ``library_category``, ``library_vendor`` and
+        ``severity`` ("HIGH" or "MEDIUM"). Cookies without a declared
+        category, without a library entry, or whose library category is
+        ``None`` / "unknown" are skipped.
+    """
+    if not (cookie_names_seen and doc_text):
+        return []
+
+    library = _load_library(db)
+    results: list[dict] = []
+    handled: set[str] = set()
+
+    for raw_name in cookie_names_seen:
+        cookie = (raw_name or "").strip()
+        key = cookie.lower()
+        if not cookie or key in handled:
+            continue
+        handled.add(key)
+
+        declared = _category_for(cookie, doc_text)
+        if not declared:
+            continue
+        entry = library.get(key)
+        if not entry:
+            continue
+        expected = entry["category"]
+        if expected is None or expected == "unknown" or expected == declared:
+            continue
+
+        # HIGH when the library says marketing but the site declares the
+        # cookie as essential/statistics, i.e. advertising processing is
+        # presented as a technical or statistical necessity. MEDIUM
+        # for every other disagreement.
+        hides_marketing = (
+            expected == "marketing"
+            and declared in ("essential", "statistics")
+        )
+        results.append({
+            "cookie": cookie,
+            "declared_category": declared,
+            "library_category": expected,
+            "library_vendor": entry["vendor"],
+            "severity": "HIGH" if hides_marketing else "MEDIUM",
+        })
+
+    return results
+
+
+def build_mismatch_block_html(findings: list[dict]) -> str:
+ """Render the mismatch findings as an HTML block for the report mail.
+
+ Args:
+     findings: Finding dicts; this function reads the keys ``cookie``,
+         ``declared_category``, ``library_category``, ``library_vendor``
+         and ``severity``.
+
+ Returns:
+     The block as one string, or "" when *findings* is empty.
+ """
+ # NOTE(review): the markup inside the string literals below appears to
+ # have been lost in this view of the patch; verify against the real file
+ # before editing any literal.
+ if not findings:
+ return ""
+
+ # Number of HIGH findings; only mentioned in the headline when non-zero.
+ n_high = sum(1 for f in findings if f["severity"] == "HIGH")
+ items: list[str] = []
+ # Only the first 25 findings are rendered as list items, while the
+ # headline further down reports len(findings), i.e. the full count.
+ for f in findings[:25]:
+ # Red for HIGH, amber for everything else.
+ sev_color = "#dc2626" if f["severity"] == "HIGH" else "#d97706"
+ # NOTE(review): cookie name, categories and vendor are interpolated
+ # without any escaping visible here; cookie names originate from the
+ # scanned site, so consider html.escape() before interpolation.
+ items.append(
+ f'
'
+ f''
+ f'{f["cookie"]} '
+ f'— deklariert als '
+ f'{f["declared_category"]}, '
+ f'unsere Bibliothek + verbreitete '
+ f'Vendor-Doku sagen '
+ f'{f["library_category"]} '
+ f'(Vendor: {f["library_vendor"]})'
+ f''
+ )
+
+ # Headline (with plural "s" and optional HIGH count), explanatory
+ # paragraph, the rendered items, and a closing legal-background note.
+ return (
+ ''
+ '
'
+ 'Cookie-Klassifikations-Pruefung
'
+ f'
'
+ f'{len(findings)} Cookie{"s" if len(findings) != 1 else ""}'
+ f' mit abweichender Klassifikation gefunden'
+ f'{f" ({n_high} davon mit erhoehter Bedeutung)" if n_high else ""}'
+ f'
'
+ '
'
+ 'Wir haben die in Ihrer Cookie-Richtlinie deklarierte Kategorie der '
+ 'Cookies mit unserer globalen Bibliothek (~2.300 Cookies aus Open-'
+ 'Cookie-Database + DACH-spezifischen Quellen) und der verbreiteten '
+ 'Vendor-Doku abgeglichen. Bei den folgenden Cookies stimmt die '
+ 'deklarierte Kategorie nicht mit dem typischerweise erwarteten '
+ 'Zweck ueberein. Das ist kein automatischer Verstoss — aber ein '
+ 'Pruefanlass: bei Marketing-Cookies braucht es Einwilligung, bei '
+ 'als "essential" deklarierten nicht. Empfehlung: mit DSB / '
+ 'Marketing-Agentur klaeren ob die Klassifikation korrigiert '
+ 'oder die Einwilligung anders eingeholt werden muss.
'
+ '
'
+ + "".join(items) +
+ '
'
+ '
Hintergrund: Art. 13(1)(c) DSGVO + EDPB 5/2020 '
+ '— der angegebene Verarbeitungszweck muss dem tatsaechlichen '
+ 'entsprechen.
'
+ '
'
+ )
diff --git a/backend-compliance/compliance/services/vendor_llm_extractor.py b/backend-compliance/compliance/services/vendor_llm_extractor.py
index 8a84180c..715579d3 100644
--- a/backend-compliance/compliance/services/vendor_llm_extractor.py
+++ b/backend-compliance/compliance/services/vendor_llm_extractor.py
@@ -49,13 +49,19 @@ _SYSTEM_PROMPT = (
async def extract_vendors_via_llm(
cookie_text: str,
- max_text_chars: int = 12000,
+ max_text_chars: int = 50000,
) -> list[dict]:
- """Run the Qwen → OVH cascade. Returns vendor records (possibly empty)."""
+ """Run the Qwen → OVH cascade. Returns vendor records (possibly empty).
+
+ max_text_chars: VW-Cookie-Richtlinie hat ~60k chars mit ~100 Cookies in
+ der Tabelle. Bei 12k waren wir auf die ersten ~5 Cookies begrenzt und
+ haben nur 1 Vendor extrahiert. 50k deckt VW/BMW/Mercedes komplett ab
+ und passt in Qwen3-30b-a3b (128k Context) sowie OVH 120B.
+ """
if not cookie_text or len(cookie_text) < 500:
return []
excerpt = cookie_text[:max_text_chars]
- user_prompt = f"Cookie-Richtlinie-Text (gekuerzt):\n\n{excerpt}"
+ user_prompt = f"Cookie-Richtlinie-Text:\n\n{excerpt}"
# Stage 1: local Qwen
content = await _call_ollama(user_prompt)
@@ -82,10 +88,13 @@ async def _call_ollama(user_prompt: str) -> str:
{"role": "user", "content": user_prompt},
],
"stream": False, "format": "json",
- "options": {"temperature": 0.05, "num_predict": 6000},
+ # 16k tokens fuer ~80 Vendors mit je 30 Cookies. War vorher 6k →
+ # output wurde mittendrin abgeschnitten, JSON unparseable → 0 Vendors.
+ "options": {"temperature": 0.05, "num_predict": 16000},
}
try:
- async with httpx.AsyncClient(timeout=120.0) as client:
+ # Qwen 30b braucht fuer 16k output ~4-6min auf M4 Pro.
+ async with httpx.AsyncClient(timeout=420.0) as client:
resp = await client.post(f"{base.rstrip('/')}/api/chat", json=payload)
resp.raise_for_status()
return (resp.json().get("message") or {}).get("content", "")
@@ -109,7 +118,7 @@ async def _call_ovh(user_prompt: str) -> str:
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": user_prompt},
],
- "temperature": 0.05, "max_tokens": 6000,
+ "temperature": 0.05, "max_tokens": 16000,
"response_format": {"type": "json_object"},
}
try:
diff --git a/backend-compliance/scripts/cookie_library_enrich.py b/backend-compliance/scripts/cookie_library_enrich.py
new file mode 100644
index 00000000..8374992f
--- /dev/null
+++ b/backend-compliance/scripts/cookie_library_enrich.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+"""P101 — Cookie-Library Auto-Enrich aus Snapshots.
+
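+Walks every row of compliance_check_snapshots and:
+  1. Extracts the cookie names (plus cookie domain / max age where present)
+     from the before_consent / after_reject / after_accept banner phases
+  2. Collects the declared category and purpose text per cookie per site
+  3. Compares against cookie_library (Open-Cookie-Database + DACH)
+  4. Reports: new_cookies, category mismatches, multi-site inconsistencies
+  5. WRITES to the database: unknown cookies with confidence >= 0.75 are
+     inserted into compliance.cookie_library (ON CONFLICT DO NOTHING)
+
+Run inside the container:
+  docker exec bp-compliance-backend python /tmp/enrich.py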
+"""
+from __future__ import annotations
+
+import json
+import re
+import sys
+from collections import defaultdict
+from typing import Any
+
+from database import engine
+from sqlalchemy import text
+
+
+def _category_from_text_context(cookie_name: str, doc_text: str) -> str | None:
+    """Infer the category a policy text declares for *cookie_name*.
+
+    The first occurrence of the name is located and the text from 50
+    characters before to 400 characters after the start of that occurrence
+    is lower-cased and scanned. Patterns are tried in priority order
+    (essential, statistics, marketing, social_media); the first hit wins.
+
+    Args:
+        cookie_name: Cookie name as observed (case-sensitive).
+        doc_text: Plain text of the cookie / privacy policy.
+
+    Returns:
+        The inferred category, or ``None`` when the name is not found or no
+        category keyword occurs in the window.
+    """
+    if not doc_text or not cookie_name:
+        return None
+    # Require the name to stand as a complete token: a plain substring
+    # search would anchor "id" inside "Widerruf" or "_ga" inside "_gat" and
+    # attribute an unrelated category to the cookie.
+    hit = re.search(r"(?<!\w)" + re.escape(cookie_name) + r"(?!\w)", doc_text)
+    if hit is None:
+        return None
+    idx = hit.start()
+    window = doc_text[max(0, idx - 50):idx + 400].lower()
+    category_patterns = [
+        (r"(?:strictly[- ])?(?:notwendig|essential|funktional|funktionscookie|"
+         r"funktional cookie|technisch notwendig)", "essential"),
+        (r"(?:tracking|analytics|analyse|statistik|measurement|performance)",
+         "statistics"),
+        (r"(?:marketing|werbung|advertising|targeting|drittanbieter)",
+         "marketing"),
+        (r"(?:social[- ]?media|share|like|like[- ]?button)", "social_media"),
+    ]
+    for pat, cat in category_patterns:
+        if re.search(pat, window):
+            return cat
+    return None
+
+
+def _purpose_text(cookie_name: str, doc_text: str) -> str | None:
+    """Extract the purpose description that follows *cookie_name*.
+
+    The 400 characters after the first occurrence of the name are split on
+    "." and newlines; the first fragment whose stripped length is strictly
+    between 30 and 300 characters is returned.
+
+    Args:
+        cookie_name: Cookie name as observed (case-sensitive).
+        doc_text: Plain text of the cookie / privacy policy.
+
+    Returns:
+        The stripped fragment, or ``None`` when the name is not found or no
+        fragment has a suitable length.
+    """
+    if not doc_text or not cookie_name:
+        return None
+    # Whole-token match: a substring hit inside a longer word would start
+    # the excerpt in the middle of unrelated text.
+    hit = re.search(r"(?<!\w)" + re.escape(cookie_name) + r"(?!\w)", doc_text)
+    if hit is None:
+        return None
+    after = doc_text[hit.end():hit.end() + 400]
+    for fragment in re.split(r"[.\n]", after):
+        candidate = fragment.strip()
+        if 30 < len(candidate) < 300:
+            return candidate
+    return None
+
+
+def main() -> int:
+    """Audit all stored snapshots against the cookie library and enrich it.
+
+    Loads every row of ``compliance.compliance_check_snapshots``, collects
+    one observation per cookie, phase and snapshot, compares the declared
+    categories with ``compliance.cookie_library``, prints a report to
+    stdout and finally inserts unknown cookies whose confidence is at least
+    0.75 into the library.
+
+    Returns:
+        0; the value is passed to ``sys.exit`` by the script entry point.
+    """
+    with engine.connect() as c:
+        rows = c.execute(text(
+            "SELECT id, site_domain, doc_entries, banner_result "
+            "FROM compliance.compliance_check_snapshots"
+        )).fetchall()
+    print(f"Loaded {len(rows)} snapshots", file=sys.stderr)
+
+    # cookie_name -> list of observations
+    observations: dict[str, list[dict]] = defaultdict(list)
+
+    for row in rows:
+        _snap_id, domain, doc_entries, banner_result = row
+        doc_entries = doc_entries or []
+        banner_result = banner_result or {}
+
+        # Text used for category inference: the LONGEST text among the
+        # "cookie" and "dse" entries of this snapshot.
+        doc_text = ""
+        for e in doc_entries:
+            if e.get("doc_type") in ("cookie", "dse"):
+                t = e.get("text") or e.get("full_text") or e.get("text_preview") or ""
+                if len(t) > len(doc_text):
+                    doc_text = t
+
+        # "or {}" also covers a stored null; .get("phases", {}) would hand
+        # back None in that case and crash on the next .get().
+        phases = banner_result.get("phases") or {}
+        for phase_name in ("before_consent", "after_reject", "after_accept"):
+            phase = phases.get(phase_name) or {}
+            if not isinstance(phase, dict):
+                continue
+            for ck in phase.get("cookies") or []:
+                # Cookies are either dicts (name/domain/max_age) or plain
+                # values that are treated as the cookie name.
+                if isinstance(ck, dict):
+                    cname = (ck.get("name") or "").strip()
+                    cdomain = (ck.get("domain") or "").lstrip(".").lower()
+                    max_age = ck.get("max_age") or ck.get("expires")
+                else:
+                    cname = str(ck).strip()
+                    cdomain = ""
+                    max_age = None
+                if not cname or len(cname) > 80:
+                    continue
+                cat_declared = _category_from_text_context(cname, doc_text)
+                purpose = _purpose_text(cname, doc_text)
+                observations[cname].append({
+                    "site": domain,
+                    "phase": phase_name,
+                    "cookie_domain": cdomain,
+                    "max_age": max_age,
+                    "declared_category": cat_declared,
+                    "declared_purpose": (purpose[:150] if purpose else None),
+                })
+
+    print(f"\nUnique cookies observed: {len(observations)}\n")
+
+    # Lookup vs cookie_library
+    with engine.connect() as c:
+        lib_rows = c.execute(text(
+            "SELECT cookie_name, actual_category, vendor_name "
+            "FROM compliance.cookie_library"
+        )).fetchall()
+    lib_lookup = {r[0].lower(): {"category": r[1], "vendor": r[2]}
+                  for r in lib_rows}
+
+    new_cookies: list[str] = []
+    mismatches: list[dict] = []
+    inconsistencies: list[dict] = []
+
+    for cname, obs_list in observations.items():
+        sites = {o["site"] for o in obs_list}
+        declared_cats = {o["declared_category"] for o in obs_list
+                         if o["declared_category"]}
+
+        # 1) Same cookie declared under different categories
+        if len(declared_cats) > 1:
+            inconsistencies.append({
+                "cookie": cname,
+                "sites": list(sites),
+                "categories": list(declared_cats),
+            })
+
+        # 2) Library lookup
+        lib_entry = lib_lookup.get(cname.lower())
+        if not lib_entry:
+            new_cookies.append(cname)
+            continue
+
+        # 3) Mismatch declared vs library. A library category of None or
+        # "unknown" carries no information and is skipped; a None would
+        # otherwise be reported as a mismatch and break the ":12s"
+        # formatting in the report below.
+        lib_cat = lib_entry["category"]
+        if lib_cat in (None, "unknown"):
+            continue
+        for dc in declared_cats:
+            if dc != lib_cat:
+                mismatches.append({
+                    "cookie": cname,
+                    "declared_by_site": dc,
+                    "library_says": lib_cat,
+                    "library_vendor": lib_entry["vendor"],
+                    "sites": list(sites),
+                })
+                break
+
+    # === Report ===
+    print("=" * 70)
+    print("AUDIT-REPORT: P101 Cookie-Library Auto-Enrich")
+    print(f" Snapshots: {len(rows)}")
+    print(f" Unique cookies observed: {len(observations)}")
+    print(f" In Library (Open-Cookie-DB + DACH): {len(observations) - len(new_cookies)}")
+    print(f" NEW (unbekannt): {len(new_cookies)}")
+    print(f" Mismatches (declared != library): {len(mismatches)}")
+    print(f" Multi-Site Inkonsistenzen: {len(inconsistencies)}")
+    print("=" * 70)
+
+    print("\n--- TOP-20 NEW COOKIES (Kandidaten fuer Library-Enrich) ---")
+    enriched_candidates: list[tuple[str, dict]] = []
+    for cname in new_cookies:
+        obs = observations[cname]
+        cats = [o["declared_category"] for o in obs if o["declared_category"]]
+        primary_cat = cats[0] if cats else None
+        purpose = next((o["declared_purpose"] for o in obs
+                        if o["declared_purpose"]), None)
+        sites = sorted({o["site"] for o in obs})
+        if not primary_cat:
+            continue  # not enrichable without a declared category
+        # 0.7 for one site, +0.1 per further site, capped at 0.95. Rounded
+        # so float noise (0.9000000000000001) is neither printed nor stored.
+        confidence = round(min(0.6 + 0.1 * len(sites), 0.95), 2)
+        enriched_candidates.append((cname, {
+            "category": primary_cat,
+            "purpose": purpose,
+            "sites": sites,
+            "confidence": confidence,
+        }))
+    for cname, info in enriched_candidates[:20]:
+        print(f" {cname:30s} [{info['category']:12s}] conf={info['confidence']} "
+              f"sites={info['sites']}")
+        if info.get("purpose"):
+            print(f" purpose: {info['purpose'][:100]}")
+
+    # Only the first 30 entries of each list are printed.
+    print(f"\n--- ALLE MISMATCHES ({len(mismatches)}) ---")
+    for m in mismatches[:30]:
+        print(f" {m['cookie']:30s} declared={m['declared_by_site']:12s} "
+              f"library={m['library_says']:12s} "
+              f"sites={m['sites']}")
+
+    print(f"\n--- ALLE INKONSISTENZEN ({len(inconsistencies)}) ---")
+    for i in inconsistencies[:30]:
+        print(f" {i['cookie']:30s} cats={i['categories']} sites={i['sites']}")
+
+    # Auto-insert candidates with confidence >= 0.75, i.e. cookies that
+    # were observed on at least two sites.
+    print("\n--- AUTO-INSERTING in cookie_library (confidence>=0.75) ---")
+    inserted = 0
+    with engine.begin() as c:
+        for cname, info in enriched_candidates:
+            if info["confidence"] < 0.75:
+                continue
+            r = c.execute(text("""
+                INSERT INTO compliance.cookie_library
+                (cookie_name, domain_pattern, vendor_name,
+                actual_category, purpose_de,
+                source_name, source_url, source_license, confidence)
+                VALUES (:n, '*', 'Mehrere OEMs (BreakPilot-Snapshot)',
+                :cat, :pd,
+                'BreakPilot-Auto-Enrich', 'https://breakpilot.ai',
+                'CC-BY-eigene-Sammlung', :cf)
+                ON CONFLICT DO NOTHING
+            """), {
+                "n": cname[:200],
+                "cat": info["category"],
+                "pd": info.get("purpose") or f"Beobachtet bei {len(info['sites'])} OEMs",
+                "cf": info["confidence"],
+            })
+            inserted += r.rowcount
+    print(f" inserted: {inserted}")
+    return 0
+
+
+if __name__ == "__main__":
+    # Script entry point: main()'s return value becomes the exit status.
+    raise SystemExit(main())