94057b1536
CI / loc-budget (push) Failing after 19s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 15s
VW-Bug B1: extract_vendors_via_llm hatte max_text_chars=12000 -> bei VW-Cookie-Doc (60k chars, 100 Cookies in Tabelle) wurden 80% abgeschnitten, LLM extrahierte nur 1 Vendor. Fix: max_text_chars=50000, num_predict 6000->16000 fuer mehr Vendor-Output, Ollama-Timeout 120s->420s. P101 Aggregator-Script (backend-compliance/scripts/cookie_library_enrich.py) geht alle compliance_check_snapshots durch und extrahiert (cookie_name, declared_category, observed_sites). Erste Auswertung ueber 8 Snapshots: 101 unique Cookies, 47 in Library, 54 unbekannt, 18 Mismatches. P102 Cookie-Klassifikations-Pruefung als Mail-Block. Vergleicht Site-deklarierte Kategorie vs Library + Vendor-Doku. HIGH wenn Library sagt 'marketing' aber Site als 'essential'/'statistics' deklariert (faktische Drittland-/Werbe-Verarbeitung versteckt). MEDIUM sonst. In agent_compliance_check_routes Mail-Komposition + Replay-Pipeline eingebaut. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
158 lines
5.8 KiB
Python
158 lines
5.8 KiB
Python
"""
|
|
P102 — Cookie-Library-Mismatch-Detection pro Site.
|
|
|
|
Vergleicht die in einem Lauf erfassten Cookies (mit deklarierter
|
|
Kategorie aus dem Cookie-Doc-Text) gegen die Library
|
|
(compliance.cookie_library). Liefert Mismatches: deklariert ≠ Library.
|
|
|
|
Genutzt im Mail-Render als neuer Block "Cookie-Klassifikations-Pruefung".
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
|
|
from sqlalchemy import text
|
|
from sqlalchemy.orm import Session
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Ordered (compiled regex, category) pairs used to read a declared cookie
# category out of free text. Order matters: consumers take the first pattern
# that matches, so "essential" wins over "statistics", and so on.
# All patterns are compiled case-insensitively.
_CATEGORY_PATTERNS = [
    (re.compile(source, re.IGNORECASE), category)
    for source, category in (
        (
            r"\b(?:strictly[-\s]?)?(?:notwendig|essential|funktional|"
            r"funktionscookie|technisch[- ]?notwendig)\b",
            "essential",
        ),
        (
            r"\b(?:tracking|analytics|analyse|statistik|"
            r"measurement|performance)\b",
            "statistics",
        ),
        (
            r"\b(?:marketing|werbung|advertising|targeting|"
            r"drittanbieter[- ]?cookie)\b",
            "marketing",
        ),
        (
            r"\b(?:social[-\s]?media|share|like)\b",
            "social_media",
        ),
    )
]
|
|
|
|
|
|
def _category_for(name: str, doc_text: str) -> str | None:
    """Guess the category a document declares for a cookie name.

    Looks up the first (case-sensitive) occurrence of ``name`` in
    ``doc_text`` and scans a window from 50 characters before that position
    to 400 characters after it against ``_CATEGORY_PATTERNS``.

    Args:
        name: Cookie name to look for.
        doc_text: Text in which the cookie is expected to be described.

    Returns:
        The category of the first pattern (in table order) that matches
        inside the window, or ``None`` if either argument is empty, the name
        does not occur, or no pattern matches.
    """
    if not (doc_text and name):
        return None

    position = doc_text.find(name)
    if position == -1:
        return None

    # Clamp the start so a hit near the beginning does not wrap around
    # through a negative slice index.
    window_start = max(0, position - 50)
    snippet = doc_text[window_start:position + 400]

    return next(
        (
            category
            for pattern, category in _CATEGORY_PATTERNS
            if pattern.search(snippet)
        ),
        None,
    )
|
|
|
|
|
|
def _load_library(db: Session) -> dict[str, dict]:
    """Load the cookie library into a dict keyed by lower-cased cookie name.

    Args:
        db: Session used to query ``compliance.cookie_library``.

    Returns:
        Mapping ``cookie_name.lower()`` -> ``{"category": actual_category,
        "vendor": vendor_name}``. If several rows share the same lower-cased
        name, the row returned last wins.
    """
    query = text(
        "SELECT cookie_name, actual_category, vendor_name "
        "FROM compliance.cookie_library"
    )
    library: dict[str, dict] = {}
    for row in db.execute(query).fetchall():
        cookie_name, category, vendor = row[0], row[1], row[2]
        library[cookie_name.lower()] = {"category": category, "vendor": vendor}
    return library
|
|
|
|
|
|
def detect_mismatches(
    db: Session,
    cookie_names_seen: list[str],
    doc_text: str,
) -> list[dict]:
    """Compare declared cookie categories against the cookie library.

    For every distinct (case-insensitive) cookie name, the category declared
    in ``doc_text`` is determined via ``_category_for`` and compared with the
    library entry loaded through ``_load_library``.

    Args:
        db: Session used to load the cookie library.
        cookie_names_seen: Cookie names collected for the site; empty or
            ``None`` entries and case-insensitive duplicates are skipped.
        doc_text: Text from which declared categories are derived.

    Returns:
        One dict per mismatch with the keys ``cookie``,
        ``declared_category``, ``library_category``, ``library_vendor`` and
        ``severity``. Cookies without a declared category, without a library
        entry, or whose library category is ``None``/``"unknown"`` are not
        reported. Returns ``[]`` when either input is empty.
    """
    if not (cookie_names_seen and doc_text):
        return []

    library = _load_library(db)
    results: list[dict] = []
    handled: set[str] = set()

    for raw_name in cookie_names_seen:
        cookie = (raw_name or "").strip()
        key = cookie.lower()
        if not cookie or key in handled:
            continue
        handled.add(key)

        declared = _category_for(cookie, doc_text)
        if not declared:
            continue

        entry = library.get(key)
        if not entry:
            continue

        expected = entry["category"]
        if expected in (None, "unknown") or expected == declared:
            continue

        # HIGH when the library says marketing but the site declares the
        # cookie as essential/statistics (actual third-country/advertising
        # processing hidden behind a technical or statistical necessity).
        # MEDIUM otherwise.
        if expected == "marketing" and declared in ("essential", "statistics"):
            severity = "HIGH"
        else:
            severity = "MEDIUM"

        results.append({
            "cookie": cookie,
            "declared_category": declared,
            "library_category": expected,
            "library_vendor": entry["vendor"],
            "severity": severity,
        })

    return results
|
|
|
|
|
|
def build_mismatch_block_html(findings: list[dict]) -> str:
    """Render mismatch findings as an HTML block for the report mail.

    All dynamic values (cookie name, categories, vendor) are HTML-escaped
    before being embedded, so names containing ``&``, ``<`` or quotes cannot
    break or inject markup. The vendor suffix is omitted when the finding
    has no vendor, instead of rendering the literal text "None".

    Args:
        findings: Finding dicts with the keys ``cookie``,
            ``declared_category``, ``library_category``, ``library_vendor``
            and ``severity`` (the shape produced by ``detect_mismatches``).

    Returns:
        An HTML fragment, or ``""`` when ``findings`` is empty. Only the
        first 25 findings are listed; the heading reports the total count
        and, if any, the number of HIGH-severity findings.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not findings:
        return ""

    def _esc(value: object) -> str:
        # Values are expected to be strings; str() guards against other types.
        return escape(str(value))

    n_high = sum(1 for f in findings if f["severity"] == "HIGH")
    items: list[str] = []
    for f in findings[:25]:
        sev_color = "#dc2626" if f["severity"] == "HIGH" else "#d97706"
        vendor = f.get("library_vendor")
        # Leave the vendor suffix out entirely when no vendor is known.
        vendor_html = f" (Vendor: {_esc(vendor)})" if vendor else ""
        items.append(
            '<li style="margin-bottom:6px;font-size:11px">'
            '<code style="background:#f1f5f9;padding:1px 4px;border-radius:2px">'
            f'{_esc(f["cookie"])}</code> '
            '<span style="color:#64748b">— deklariert als</span> '
            f'<strong>{_esc(f["declared_category"])}</strong>, '
            '<span style="color:#64748b">unsere Bibliothek + verbreitete '
            f'Vendor-Doku sagen</span> <strong style="color:{sev_color}">'
            f'{_esc(f["library_category"])}</strong>'
            f'{vendor_html}'
            '</li>'
        )

    total = len(findings)
    plural = "s" if total != 1 else ""
    high_note = f" ({n_high} davon mit erhoehter Bedeutung)" if n_high else ""

    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:760px;margin:0 auto 16px;padding:14px 18px;'
        'background:#fffbeb;border:1px solid #fde68a;border-radius:8px">'
        '<div style="font-size:11px;color:#92400e;text-transform:uppercase;'
        'letter-spacing:1.2px;margin-bottom:4px;font-weight:600">'
        'Cookie-Klassifikations-Pruefung</div>'
        '<h3 style="margin:0 0 8px;font-size:14px;color:#1e293b">'
        f'{total} Cookie{plural}'
        ' mit abweichender Klassifikation gefunden'
        f'{high_note}'
        '</h3>'
        '<p style="margin:0 0 10px;font-size:11px;color:#475569;line-height:1.5">'
        'Wir haben die in Ihrer Cookie-Richtlinie deklarierte Kategorie der '
        'Cookies mit unserer globalen Bibliothek (~2.300 Cookies aus Open-'
        'Cookie-Database + DACH-spezifischen Quellen) und der verbreiteten '
        'Vendor-Doku abgeglichen. Bei den folgenden Cookies stimmt die '
        'deklarierte Kategorie nicht mit dem typischerweise erwarteten '
        'Zweck ueberein. Das ist kein automatischer Verstoss — aber ein '
        'Pruefanlass: bei Marketing-Cookies braucht es Einwilligung, bei '
        'als "essential" deklarierten nicht. Empfehlung: mit DSB / '
        'Marketing-Agentur klaeren ob die Klassifikation korrigiert '
        'oder die Einwilligung anders eingeholt werden muss.</p>'
        '<ul style="margin:0 0 0 18px;padding:0">'
        + "".join(items) +
        '</ul>'
        '<p style="margin:8px 0 0;font-size:10px;color:#94a3b8;'
        'font-style:italic">Hintergrund: Art. 13(1)(c) DSGVO + EDPB 5/2020 '
        '— der angegebene Verarbeitungszweck muss dem tatsaechlichen '
        'entsprechen.</p>'
        '</div>'
    )
|