feat(audit): VW-Cookie-Bug-Fix + P101/P102 Cookie-Library-Mismatch-Findings
CI / loc-budget (push) Failing after 19s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 15s

VW-Bug B1: extract_vendors_via_llm hatte max_text_chars=12000 -> bei
VW-Cookie-Doc (60k chars, 100 Cookies in Tabelle) wurden 80% abgeschnitten,
LLM extrahierte nur 1 Vendor. Fix: max_text_chars=50000, num_predict
6000->16000 fuer mehr Vendor-Output, Ollama-Timeout 120s->420s.

P101 Aggregator-Script (backend-compliance/scripts/cookie_library_enrich.py)
geht alle compliance_check_snapshots durch und extrahiert (cookie_name,
declared_category, observed_sites). Erste Auswertung ueber 8 Snapshots:
101 unique Cookies, 47 in Library, 54 unbekannt, 18 Mismatches.

P102 Cookie-Klassifikations-Pruefung als Mail-Block. Vergleicht
Site-deklarierte Kategorie vs Library + Vendor-Doku. HIGH wenn Library
sagt 'marketing' aber Site als 'essential'/'statistics' deklariert
(faktische Drittland-/Werbe-Verarbeitung versteckt). MEDIUM sonst.
In agent_compliance_check_routes Mail-Komposition + Replay-Pipeline
eingebaut.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-21 15:47:11 +02:00
parent 9c11b5463c
commit 94057b1536
5 changed files with 467 additions and 7 deletions
@@ -1043,11 +1043,45 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
except Exception as e:
logger.warning("Scope-disclaimer block skipped: %s", e)
# P102: Cookie-Klassifikations-Pruefung (deklariert vs Library)
library_mismatch_html = ""
try:
from compliance.services.cookie_library_mismatch import (
detect_mismatches, build_mismatch_block_html,
)
from database import SessionLocal
cookie_doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or ""
all_cookies_seen: list[str] = []
if banner_result:
for ph in (banner_result.get("phases") or {}).values():
if isinstance(ph, dict):
for ck in (ph.get("cookies") or []):
if isinstance(ck, str):
all_cookies_seen.append(ck)
elif isinstance(ck, dict) and ck.get("name"):
all_cookies_seen.append(ck["name"])
if all_cookies_seen and cookie_doc_for_check:
_mm_db = SessionLocal()
try:
mismatches = detect_mismatches(
_mm_db, all_cookies_seen, cookie_doc_for_check,
)
if mismatches:
library_mismatch_html = build_mismatch_block_html(mismatches)
logger.info(
"P102: %d Cookie-Mismatches gefunden", len(mismatches)
)
finally:
_mm_db.close()
except Exception as e:
logger.warning("P102 mismatch detection failed: %s", e)
full_html = (
critical_html + scope_disclaimer_html + exec_summary_html
+ cookie_arch_html + summary_html + scanned_html + profile_html
+ scorecard_html + redundancy_html
+ providers_html + banner_deep_html + vvt_html + report_html
+ providers_html + banner_deep_html + library_mismatch_html
+ vvt_html + report_html
)
# Step 6: Send email — derive site name primarily from entered URL.
@@ -116,6 +116,29 @@ def replay_from_snapshot(
except Exception as e:
logger.warning("Replay: vvt failed: %s", e)
# P102: Cookie-Klassifikations-Pruefung
try:
from compliance.services.cookie_library_mismatch import (
detect_mismatches, build_mismatch_block_html,
)
cookies_seen: list[str] = []
for ph in (banner_result.get("phases") or {}).values():
if isinstance(ph, dict):
for ck in (ph.get("cookies") or []):
if isinstance(ck, str):
cookies_seen.append(ck)
elif isinstance(ck, dict) and ck.get("name"):
cookies_seen.append(ck["name"])
doc_for_check = doc_texts.get("cookie") or doc_texts.get("dse") or ""
if cookies_seen and doc_for_check:
mm = detect_mismatches(db, cookies_seen, doc_for_check)
if mm:
mm_html = build_mismatch_block_html(mm)
parts.append(mm_html)
section_sizes["library_mismatch"] = len(mm_html)
except Exception as e:
logger.warning("Replay: mismatch block failed: %s", e)
full_html = "".join(parts)
result = {
@@ -0,0 +1,157 @@
"""
P102 Cookie-Library-Mismatch-Detection pro Site.
Vergleicht die in einem Lauf erfassten Cookies (mit deklarierter
Kategorie aus dem Cookie-Doc-Text) gegen die Library
(compliance.cookie_library). Liefert Mismatches: deklariert != Library.
Genutzt im Mail-Render als neuer Block "Cookie-Klassifikations-Pruefung".
"""
from __future__ import annotations
import logging
import re
from sqlalchemy import text
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
_CATEGORY_PATTERNS = [
(re.compile(r"\b(?:strictly[-\s]?)?(?:notwendig|essential|funktional|"
r"funktionscookie|technisch[- ]?notwendig)\b", re.I),
"essential"),
(re.compile(r"\b(?:tracking|analytics|analyse|statistik|"
r"measurement|performance)\b", re.I),
"statistics"),
(re.compile(r"\b(?:marketing|werbung|advertising|targeting|"
r"drittanbieter[- ]?cookie)\b", re.I),
"marketing"),
(re.compile(r"\b(?:social[-\s]?media|share|like)\b", re.I),
"social_media"),
]
def _category_for(name: str, doc_text: str) -> str | None:
if not doc_text or not name:
return None
idx = doc_text.find(name)
if idx < 0:
return None
window = doc_text[max(0, idx - 50):idx + 400]
for pat, cat in _CATEGORY_PATTERNS:
if pat.search(window):
return cat
return None
def _load_library(db: Session) -> dict[str, dict]:
    """Load the cookie library into a dict keyed by lower-cased cookie name.

    Reads ``cookie_name``, ``actual_category`` and ``vendor_name`` from
    ``compliance.cookie_library``.

    Args:
        db: Open SQLAlchemy session used to run the query.

    Returns:
        Mapping ``lowercase cookie name -> {"category": ..., "vendor": ...}``.
        If several rows share the same lower-cased name, the last row
        returned by the query wins.
    """
    query = text(
        "SELECT cookie_name, actual_category, vendor_name "
        "FROM compliance.cookie_library"
    )
    library: dict[str, dict] = {}
    for row in db.execute(query).fetchall():
        library[row[0].lower()] = {"category": row[1], "vendor": row[2]}
    return library
def detect_mismatches(
    db: Session,
    cookie_names_seen: list[str],
    doc_text: str,
) -> list[dict]:
    """Compare site-declared cookie categories against the cookie library.

    Each distinct cookie name (compared case-insensitively, first spelling
    wins) is looked up in the policy text via ``_category_for`` and in the
    library via ``_load_library``. A finding is produced only when both
    sources yield a category, the library category is neither ``None`` nor
    ``"unknown"``, and the two categories differ.

    Args:
        db: Open SQLAlchemy session used to load the library.
        cookie_names_seen: Cookie names observed during the scan; empty or
            ``None`` entries and duplicates are ignored.
        doc_text: Plain text of the cookie policy / privacy document.

    Returns:
        List of dicts with the keys ``cookie``, ``declared_category``,
        ``library_category``, ``library_vendor`` and ``severity``
        (``"HIGH"`` or ``"MEDIUM"``). Empty when there is nothing to compare.
    """
    if not (cookie_names_seen and doc_text):
        return []
    library = _load_library(db)
    handled: set[str] = set()
    results: list[dict] = []
    for raw_name in cookie_names_seen:
        cookie = (raw_name or "").strip()
        key = cookie.lower()
        if not cookie or key in handled:
            continue
        handled.add(key)
        declared = _category_for(cookie, doc_text)
        if not declared:
            continue
        entry = library.get(key)
        if not entry:
            continue
        expected = entry["category"]
        if expected in (None, "unknown") or expected == declared:
            continue
        # HIGH: the library says marketing while the site declares the cookie
        # as essential/statistics, i.e. advertising processing presented as a
        # technical or statistical necessity. Everything else is MEDIUM.
        if expected == "marketing" and declared in ("essential", "statistics"):
            severity = "HIGH"
        else:
            severity = "MEDIUM"
        results.append({
            "cookie": cookie,
            "declared_category": declared,
            "library_category": expected,
            "library_vendor": entry["vendor"],
            "severity": severity,
        })
    return results
def build_mismatch_block_html(findings: list[dict]) -> str:
    """Render the mismatch findings as an HTML block for the report mail.

    Cookie names come from scanned third-party sites and the vendor/category
    values from the database, so every interpolated value is HTML-escaped
    before it is placed into the markup.

    Args:
        findings: Finding dicts with the keys ``cookie``,
            ``declared_category``, ``library_category``, ``severity`` and
            (optionally empty) ``library_vendor``.

    Returns:
        The HTML fragment, or ``""`` when ``findings`` is empty. The headline
        counts all findings; at most the first 25 are listed.
    """
    # Function-scope import keeps this fix independent of the module imports.
    from html import escape

    if not findings:
        return ""
    n_high = sum(1 for f in findings if f["severity"] == "HIGH")
    items: list[str] = []
    for finding in findings[:25]:
        sev_color = "#dc2626" if finding["severity"] == "HIGH" else "#d97706"
        cookie = escape(str(finding["cookie"]))
        declared = escape(str(finding["declared_category"]))
        lib_cat = escape(str(finding["library_category"]))
        vendor = finding.get("library_vendor")
        # A missing vendor would otherwise be printed as "(Vendor: None)".
        vendor_html = f'(Vendor: {escape(str(vendor))})' if vendor else ""
        items.append(
            f'<li style="margin-bottom:6px;font-size:11px">'
            f'<code style="background:#f1f5f9;padding:1px 4px;border-radius:2px">'
            f'{cookie}</code> '
            f'<span style="color:#64748b">— deklariert als</span> '
            f'<strong>{declared}</strong>, '
            f'<span style="color:#64748b">unsere Bibliothek + verbreitete '
            f'Vendor-Doku sagen</span> <strong style="color:{sev_color}">'
            f'{lib_cat}</strong> '
            f'{vendor_html}'
            f'</li>'
        )
    plural = "s" if len(findings) != 1 else ""
    high_note = f" ({n_high} davon mit erhoehter Bedeutung)" if n_high else ""
    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:760px;margin:0 auto 16px;padding:14px 18px;'
        'background:#fffbeb;border:1px solid #fde68a;border-radius:8px">'
        '<div style="font-size:11px;color:#92400e;text-transform:uppercase;'
        'letter-spacing:1.2px;margin-bottom:4px;font-weight:600">'
        'Cookie-Klassifikations-Pruefung</div>'
        f'<h3 style="margin:0 0 8px;font-size:14px;color:#1e293b">'
        f'{len(findings)} Cookie{plural}'
        f' mit abweichender Klassifikation gefunden'
        f'{high_note}'
        f'</h3>'
        '<p style="margin:0 0 10px;font-size:11px;color:#475569;line-height:1.5">'
        'Wir haben die in Ihrer Cookie-Richtlinie deklarierte Kategorie der '
        'Cookies mit unserer globalen Bibliothek (~2.300 Cookies aus Open-'
        'Cookie-Database + DACH-spezifischen Quellen) und der verbreiteten '
        'Vendor-Doku abgeglichen. Bei den folgenden Cookies stimmt die '
        'deklarierte Kategorie nicht mit dem typischerweise erwarteten '
        'Zweck ueberein. Das ist kein automatischer Verstoss — aber ein '
        'Pruefanlass: bei Marketing-Cookies braucht es Einwilligung, bei '
        'als "essential" deklarierten nicht. Empfehlung: mit DSB / '
        'Marketing-Agentur klaeren ob die Klassifikation korrigiert '
        'oder die Einwilligung anders eingeholt werden muss.</p>'
        '<ul style="margin:0 0 0 18px;padding:0">'
        + "".join(items) +
        '</ul>'
        '<p style="margin:8px 0 0;font-size:10px;color:#94a3b8;'
        'font-style:italic">Hintergrund: Art. 13(1)(c) DSGVO + EDPB 5/2020 '
        '— der angegebene Verarbeitungszweck muss dem tatsaechlichen '
        'entsprechen.</p>'
        '</div>'
    )
@@ -49,13 +49,19 @@ _SYSTEM_PROMPT = (
async def extract_vendors_via_llm(
cookie_text: str,
max_text_chars: int = 12000,
max_text_chars: int = 50000,
) -> list[dict]:
"""Run the Qwen → OVH cascade. Returns vendor records (possibly empty)."""
"""Run the Qwen → OVH cascade. Returns vendor records (possibly empty).
max_text_chars: VW-Cookie-Richtlinie hat ~60k chars mit ~100 Cookies in
der Tabelle. Bei 12k waren wir auf die ersten ~5 Cookies begrenzt und
haben nur 1 Vendor extrahiert. 50k deckt VW/BMW/Mercedes komplett ab
und passt in Qwen3-30b-a3b (128k Context) sowie OVH 120B.
"""
if not cookie_text or len(cookie_text) < 500:
return []
excerpt = cookie_text[:max_text_chars]
user_prompt = f"Cookie-Richtlinie-Text (gekuerzt):\n\n{excerpt}"
user_prompt = f"Cookie-Richtlinie-Text:\n\n{excerpt}"
# Stage 1: local Qwen
content = await _call_ollama(user_prompt)
@@ -82,10 +88,13 @@ async def _call_ollama(user_prompt: str) -> str:
{"role": "user", "content": user_prompt},
],
"stream": False, "format": "json",
"options": {"temperature": 0.05, "num_predict": 6000},
# 16k tokens fuer ~80 Vendors mit je 30 Cookies. War vorher 6k →
# output wurde mittendrin abgeschnitten, JSON unparseable → 0 Vendors.
"options": {"temperature": 0.05, "num_predict": 16000},
}
try:
async with httpx.AsyncClient(timeout=120.0) as client:
# Qwen 30b braucht fuer 16k output ~4-6min auf M4 Pro.
async with httpx.AsyncClient(timeout=420.0) as client:
resp = await client.post(f"{base.rstrip('/')}/api/chat", json=payload)
resp.raise_for_status()
return (resp.json().get("message") or {}).get("content", "")
@@ -109,7 +118,7 @@ async def _call_ovh(user_prompt: str) -> str:
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": user_prompt},
],
"temperature": 0.05, "max_tokens": 6000,
"temperature": 0.05, "max_tokens": 16000,
"response_format": {"type": "json_object"},
}
try: