fix(audit): VW-404-Recovery + P52 LLM-Merge + P51 Banner-UX-Checks
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 17s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 17s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
VW-404-Fix: submitted_types zaehlt jetzt nur Doc-Types mit >= 200 Zeichen echtem Text. Eine eingegebene URL die 404/Mini-Text liefert (VW cookie-richtlinie.html) wird als 'missing' behandelt, sodass Auto-Discovery alternative URLs auf der Homepage probiert. In-place-Update statt Duplicate-Entry, rejected_url wird fuer Audit-Transparenz aufbewahrt. P52 LLM-Cascade Merge: vendor_llm_extractor laeuft jetzt bei < 5 Vendors (nicht nur bei 0), und die Ergebnisse werden MIT existing cmp_vendors gemerged statt zu ueberschreiben. VW-typische Setups (Generic CMP + 0 cmp_payloads) bekommen damit den Text-basierten Vendor-Layer dazu. P51 — banner_consistency_checks erweitert: * check_banner_copyability: scannt banner_html nach user-select:none / oncopy=return false / onselectstart. MEDIUM Finding wenn Banner-Text nicht kopierbar (Art. 7 (2) DSGVO). * check_consent_history: prueft auf 'Meine Einwilligungen' / Consent-Historie / Datenschutz-Cockpit. MEDIUM wenn keine sichtbare Historie (Art. 7 (3) — Widerruf muss so einfach wie Erteilung sein). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -687,24 +687,42 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
|||||||
cmp_vendors = extract_vendors_from_payloads(
|
cmp_vendors = extract_vendors_from_payloads(
|
||||||
cookie_payloads, owner_name=owner_name,
|
cookie_payloads, owner_name=owner_name,
|
||||||
)
|
)
|
||||||
# V3 fallback: no named CMP captured but we have substantive
|
# P52: LLM-Fallback nicht nur wenn 0 Vendors, sondern auch
|
||||||
# cookie text → ask Qwen/OVH to extract vendor list from the text.
|
# wenn die strukturierten Quellen < 5 Vendors lieferten und
|
||||||
# Skip on very short text (likely navigation) to save LLM cost.
|
# der Cookie-Text substantiell ist. So holt sich VW-typische
|
||||||
if not cmp_vendors and cookie_text and len(cookie_text.split()) >= 500:
|
# Setups (Generic CMP, 28 Cookies aber 0 cmp_payloads) noch
|
||||||
|
# ihre echten Vendors aus dem Text.
|
||||||
|
if (len(cmp_vendors) < 5
|
||||||
|
and cookie_text and len(cookie_text.split()) >= 500):
|
||||||
from compliance.services.vendor_llm_extractor import (
|
from compliance.services.vendor_llm_extractor import (
|
||||||
extract_vendors_via_llm,
|
extract_vendors_via_llm,
|
||||||
)
|
)
|
||||||
from compliance.services.vendor_classifier import classify
|
from compliance.services.vendor_classifier import classify
|
||||||
_update(check_id, "Vendor-Liste per LLM extrahieren...", 94)
|
_update(check_id, "Vendor-Liste per LLM extrahieren...", 94)
|
||||||
cmp_vendors = await extract_vendors_via_llm(cookie_text)
|
llm_vendors = await extract_vendors_via_llm(cookie_text)
|
||||||
# LLM path doesn't run through extract_vendors_from_payloads,
|
# P52: classify die LLM-Vendors und MERGE mit existing
|
||||||
# so classify here.
|
# statt zu ueberschreiben.
|
||||||
for v in cmp_vendors:
|
existing_names = {(v.get("name") or "").strip().lower()
|
||||||
|
for v in cmp_vendors}
|
||||||
|
added_llm = 0
|
||||||
|
for v in llm_vendors:
|
||||||
|
nm = (v.get("name") or "").strip()
|
||||||
|
if not nm or nm.lower() in existing_names:
|
||||||
|
continue
|
||||||
v["recipient_type"] = classify(
|
v["recipient_type"] = classify(
|
||||||
vendor_name=v.get("name", ""),
|
vendor_name=nm,
|
||||||
category=v.get("category", ""),
|
category=v.get("category", ""),
|
||||||
owner_name=owner_name,
|
owner_name=owner_name,
|
||||||
)
|
)
|
||||||
|
v.setdefault("source", "llm_cascade")
|
||||||
|
cmp_vendors.append(v)
|
||||||
|
existing_names.add(nm.lower())
|
||||||
|
added_llm += 1
|
||||||
|
if added_llm:
|
||||||
|
logger.info(
|
||||||
|
"P52 LLM-Cascade: +%d Vendors (total: %d)",
|
||||||
|
added_llm, len(cmp_vendors),
|
||||||
|
)
|
||||||
# P57: Phase G vendor_details als zusätzliche Vendor-Quelle.
|
# P57: Phase G vendor_details als zusätzliche Vendor-Quelle.
|
||||||
# Wenn extract_vendors_from_payloads weniger findet als
|
# Wenn extract_vendors_from_payloads weniger findet als
|
||||||
# Phase G's Info-Click-Through (z.B. Mercedes-Settings nicht
|
# Phase G's Info-Click-Through (z.B. Mercedes-Settings nicht
|
||||||
@@ -1543,11 +1561,31 @@ async def _autodiscover_missing(
|
|||||||
"""
|
"""
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
# Submitted doc_types (those the user actually entered URL or text for).
|
# VW-Fix: nur Doc-Types mit substantieller Text-Ausbeute zaehlen
|
||||||
|
# als 'submitted'. Wenn der User eine URL eingegeben hat aber die
|
||||||
|
# 404 liefert (VW cookie-richtlinie.html), oder der Crawler weniger
|
||||||
|
# als 200 Zeichen extrahiert (SPA-Shell), als 'missing' behandeln
|
||||||
|
# damit der Discovery-Pass alternative URLs probiert.
|
||||||
|
_MIN_USEFUL_CHARS = 200
|
||||||
submitted_types = {
|
submitted_types = {
|
||||||
e["doc_type"] for e in doc_entries
|
e["doc_type"] for e in doc_entries
|
||||||
if e.get("text") or (e.get("url") or "").strip()
|
if len((e.get("text") or "").strip()) >= _MIN_USEFUL_CHARS
|
||||||
}
|
}
|
||||||
|
# Markiere die fehlgeschlagenen URL-Submissions damit der Discovery
|
||||||
|
# ihre URL nicht erneut probiert (waere sinnlos).
|
||||||
|
failed_urls: set[str] = {
|
||||||
|
(e.get("url") or "").strip()
|
||||||
|
for e in doc_entries
|
||||||
|
if (e.get("url") or "").strip()
|
||||||
|
and len((e.get("text") or "").strip()) < _MIN_USEFUL_CHARS
|
||||||
|
}
|
||||||
|
if failed_urls:
|
||||||
|
logger.info(
|
||||||
|
"VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery "
|
||||||
|
"soll Alternativen probieren: %s",
|
||||||
|
len(failed_urls), _MIN_USEFUL_CHARS,
|
||||||
|
", ".join(list(failed_urls)[:3]),
|
||||||
|
)
|
||||||
# Map alias types to canonical
|
# Map alias types to canonical
|
||||||
submitted_canon = {
|
submitted_canon = {
|
||||||
"dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
|
"dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
|
||||||
@@ -1657,16 +1695,21 @@ async def _autodiscover_missing(
|
|||||||
if canon and canon in missing and canon not in by_type:
|
if canon and canon in missing and canon not in by_type:
|
||||||
by_type[canon] = d
|
by_type[canon] = d
|
||||||
|
|
||||||
# Append a new entry for every missing canonical type. Auto-discovered
|
# Append/Update entry for every missing canonical type. Auto-discovered
|
||||||
# ones get the text/URL filled; ungratched ones stay empty so the
|
# ones get the text/URL filled; ungratched ones stay empty so the
|
||||||
# padding step renders them as 'Auf der Website nicht gefunden'.
|
# padding step renders them as 'Auf der Website nicht gefunden'.
|
||||||
|
# VW-Fix: wenn schon ein leerer entry existiert (URL gesetzt, aber
|
||||||
|
# fetch hat 0/Mini-Text geliefert), in-place updaten statt duplizieren.
|
||||||
filled = 0
|
filled = 0
|
||||||
for dt in missing:
|
for dt in missing:
|
||||||
new_entry: dict = {
|
existing = next((e for e in doc_entries
|
||||||
|
if e.get("doc_type") == dt), None)
|
||||||
|
new_entry: dict = existing if existing else {
|
||||||
"doc_type": dt, "url": "", "text": "", "word_count": 0,
|
"doc_type": dt, "url": "", "text": "", "word_count": 0,
|
||||||
"auto_discovered": False, "discovery_attempted": True,
|
"auto_discovered": False, "discovery_attempted": True,
|
||||||
"cmp_payloads": [],
|
"cmp_payloads": [],
|
||||||
}
|
}
|
||||||
|
new_entry["discovery_attempted"] = True
|
||||||
d = by_type.get(dt)
|
d = by_type.get(dt)
|
||||||
if d:
|
if d:
|
||||||
full = d.get("full_text") or d.get("text_preview") or ""
|
full = d.get("full_text") or d.get("text_preview") or ""
|
||||||
@@ -1685,21 +1728,24 @@ async def _autodiscover_missing(
|
|||||||
full = cmp_merged
|
full = cmp_merged
|
||||||
if len(full.split()) >= 100:
|
if len(full.split()) >= 100:
|
||||||
new_entry["text"] = full
|
new_entry["text"] = full
|
||||||
|
# Behalte die original URL als "rejected_url" damit Audit
|
||||||
|
# zeigt 'X war 404, wir haben Y gefunden'.
|
||||||
|
if existing and (existing.get("url") or "").strip() in failed_urls:
|
||||||
|
new_entry["rejected_url"] = existing.get("url")
|
||||||
new_entry["url"] = d.get("url", "")
|
new_entry["url"] = d.get("url", "")
|
||||||
new_entry["word_count"] = len(full.split())
|
new_entry["word_count"] = len(full.split())
|
||||||
new_entry["auto_discovered"] = True
|
new_entry["auto_discovered"] = True
|
||||||
# Auto-discovery happens on the HOMEPAGE — any CMP payload
|
|
||||||
# captured at that level likely belongs to the cookie page
|
|
||||||
# (CMP widget loaded site-wide). Attach to 'cookie' entry.
|
|
||||||
if dt == "cookie" and disc_payloads:
|
if dt == "cookie" and disc_payloads:
|
||||||
new_entry["cmp_payloads"] = disc_payloads
|
new_entry["cmp_payloads"] = disc_payloads
|
||||||
doc_texts[dt] = full
|
doc_texts[dt] = full
|
||||||
filled += 1
|
filled += 1
|
||||||
logger.info(
|
logger.info(
|
||||||
"auto-discovered %s on %s: %s (%d words)",
|
"auto-discovered %s on %s: %s (%d words)%s",
|
||||||
dt, base, d.get("url", "")[:80], new_entry["word_count"],
|
dt, base, d.get("url", "")[:80], new_entry["word_count"],
|
||||||
|
" [REPLACED failed URL]" if existing else "",
|
||||||
)
|
)
|
||||||
doc_entries.append(new_entry)
|
if not existing:
|
||||||
|
doc_entries.append(new_entry)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"auto-discovery: filled %d/%d missing types from %s",
|
"auto-discovery: filled %d/%d missing types from %s",
|
||||||
|
|||||||
@@ -303,6 +303,87 @@ def check_banner_vs_cmp_partner_count(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def check_banner_copyability(banner_result: dict) -> dict | None:
|
||||||
|
"""P51a — Banner-Text muss kopierbar sein. CSS user-select:none oder
|
||||||
|
-webkit-user-select:none verhindert das (Article 7(2) DSGVO — verstaendlich
|
||||||
|
und in einer Form, die spaetere Pruefung ermoeglicht).
|
||||||
|
"""
|
||||||
|
if not isinstance(banner_result, dict):
|
||||||
|
return None
|
||||||
|
phases = banner_result.get("phases") or {}
|
||||||
|
initial = phases.get("initial") or phases.get("before_accept") or {}
|
||||||
|
html = (initial.get("banner_html") or "")[:50000].lower()
|
||||||
|
if not html:
|
||||||
|
return None
|
||||||
|
blocked_signals = [
|
||||||
|
"user-select:none", "user-select: none",
|
||||||
|
"-webkit-user-select:none", "-webkit-user-select: none",
|
||||||
|
"-moz-user-select:none", "pointer-events:none",
|
||||||
|
"oncopy=\"return false", "onselectstart=\"return false",
|
||||||
|
]
|
||||||
|
hits = [s for s in blocked_signals if s in html]
|
||||||
|
if not hits:
|
||||||
|
return None
|
||||||
|
return {
|
||||||
|
"severity": "MEDIUM",
|
||||||
|
"code": "banner_not_copyable",
|
||||||
|
"label": "Banner-Text laesst sich nicht kopieren "
|
||||||
|
"(user-select:none / oncopy disabled)",
|
||||||
|
"detail": (
|
||||||
|
f'Im Banner-HTML gefunden: {", ".join(hits[:3])}. Der Nutzer '
|
||||||
|
"kann den Banner-Text nicht in eine Mail / Doku einfuegen, was "
|
||||||
|
"die spaetere Pruefung erschwert. Empfehlung: das CSS entfernen "
|
||||||
|
"oder explizit auf 'auto' setzen."
|
||||||
|
),
|
||||||
|
"legal_basis": "Art. 7 (1)+(2) DSGVO + EDPB 5/2020 — Einwilligungen "
|
||||||
|
"muessen in verstaendlicher und zugaenglicher Form "
|
||||||
|
"erteilt werden; eine spaetere Pruefung darf nicht "
|
||||||
|
"technisch erschwert werden.",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def check_consent_history(banner_result: dict) -> dict | None:
|
||||||
|
"""P51b — Es muss eine Moeglichkeit geben, die eigene Einwilligungs-
|
||||||
|
Historie einzusehen (Art. 7 (3) — Widerruf muss so einfach wie die
|
||||||
|
Erteilung sein; das setzt voraus dass man WEISS was man einwilligt hat).
|
||||||
|
"""
|
||||||
|
if not isinstance(banner_result, dict):
|
||||||
|
return None
|
||||||
|
phases = banner_result.get("phases") or {}
|
||||||
|
blob_parts: list[str] = []
|
||||||
|
for ph in phases.values():
|
||||||
|
if isinstance(ph, dict):
|
||||||
|
blob_parts.append((ph.get("banner_text") or "")[:5000])
|
||||||
|
blob_parts.append((ph.get("banner_html") or "")[:20000])
|
||||||
|
blob = " ".join(blob_parts).lower()
|
||||||
|
if not blob:
|
||||||
|
return None
|
||||||
|
history_signals = [
|
||||||
|
"meine einwilligung", "consent-historie", "consent history",
|
||||||
|
"einwilligungshistorie", "einwilligungs-historie",
|
||||||
|
"ihre einwilligungen", "datenschutz-cockpit",
|
||||||
|
"privacy dashboard", "einwilligungs-protokoll",
|
||||||
|
"consent record", "consent log",
|
||||||
|
]
|
||||||
|
if any(s in blob for s in history_signals):
|
||||||
|
return None
|
||||||
|
return {
|
||||||
|
"severity": "MEDIUM",
|
||||||
|
"code": "consent_history_missing",
|
||||||
|
"label": "Keine sichtbare Consent-Historie / 'Meine Einwilligungen'-Ansicht",
|
||||||
|
"detail": (
|
||||||
|
"Im Banner und in den verlinkten Footer-Bereichen ist keine "
|
||||||
|
"Moeglichkeit erkennbar, die eigene Einwilligungs-Historie "
|
||||||
|
"einzusehen oder zu exportieren. Empfehlung: einen "
|
||||||
|
"'Meine Einwilligungen'-Bereich verlinken (Borlabs / Cookiebot / "
|
||||||
|
"Usercentrics bieten dafuer fertige Komponenten)."
|
||||||
|
),
|
||||||
|
"legal_basis": "Art. 7 (3) DSGVO + EDPB 5/2020 — der Widerruf muss "
|
||||||
|
"ebenso einfach sein wie die Erteilung, was eine "
|
||||||
|
"Sichtbarmachung der eigenen Einwilligungen voraussetzt.",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def run_all(banner_result: dict, cookie_doc_text: str | None = None,
|
def run_all(banner_result: dict, cookie_doc_text: str | None = None,
|
||||||
cmp_vendors: list | None = None,
|
cmp_vendors: list | None = None,
|
||||||
doc_texts: dict[str, str] | None = None) -> list[dict]:
|
doc_texts: dict[str, str] | None = None) -> list[dict]:
|
||||||
@@ -331,6 +412,18 @@ def run_all(banner_result: dict, cookie_doc_text: str | None = None,
|
|||||||
findings.append(f4)
|
findings.append(f4)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("P33 three_source_vendor failed: %s", e)
|
logger.warning("P33 three_source_vendor failed: %s", e)
|
||||||
|
try:
|
||||||
|
f5 = check_banner_copyability(banner_result)
|
||||||
|
if f5:
|
||||||
|
findings.append(f5)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("P51a copyability failed: %s", e)
|
||||||
|
try:
|
||||||
|
f6 = check_consent_history(banner_result)
|
||||||
|
if f6:
|
||||||
|
findings.append(f6)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("P51b consent_history failed: %s", e)
|
||||||
return findings
|
return findings
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user