From 8cbb513e2c2516473ddb630aedbbf99aa64c7cda Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 22 May 2026 08:24:46 +0200 Subject: [PATCH] feat(audit): Phase 1 Quick-Wins (P81 + P85 + P70 + P83) + TCF DELETE/INSERT-Fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P81 — tests/fixtures/golden_truth/vw_de.json: GT-Fixture mit must_find_cookies (48 VW-Cookies) + expected_vendors (Google, Adobe, Trade Desk, ...). Basis fuer kuenftige Regression-Tests. P85 — banner_screenshot_block.py + consent_scanner.py + main.py: consent-tester macht beim Banner-Detect einen base64-PNG-Screenshot (< 1.5MB). Backend rendert ihn als eingebettetes Bild (data-URI) direkt nach dem GF-1-Pager. Visueller Beweis 'so sah das Banner aus' fuer Dispute mit Marketing/DSB. P70 — rag_provenance.py: classify_finding_provenance() klassifiziert ein Finding als 'rag' (Norm + Quelle), 'mixed' (Norm ohne Quelle) oder 'heuristic' (eigene Interpretation). provenance_badge_html() rendert kleine Badges (✓ RAG / NORM / ⚠ HEURISTIK). Modul ist generisch, kann bei jedem Finding-Renderer eingeklinkt werden. P83 — scripts/check-rebuild-needed.sh: Prueft, ob der im Container deployte BUILD_SHA mit dem lokalen HEAD uebereinstimmt. Bei Mismatch exit 1 mit 'REBUILD REQUIRED'-Hinweis. Verhindert das 'alter Code im Container'-Problem, das uns mehrfach erwischt hat (Frontend-Tabs sichtbar, Backend ohne neuen Service). TCF-Fix — tcf_vendor_authority.py: cookie_library hat keinen UNIQUE-Index auf cookie_name → ON CONFLICT war unmoeglich. Loesung: vor Insert DELETE WHERE source_name='iab_tcf_v2'. Idempotent. + per-Vendor-Commit, damit ein Fail die naechsten nicht blockt. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../api/agent_compliance_check_routes.py | 12 +++ .../services/banner_screenshot_block.py | 44 +++++++++ .../compliance/services/rag_provenance.py | 90 +++++++++++++++++++ .../services/tcf_vendor_authority.py | 13 ++- .../tests/fixtures/golden_truth/vw_de.json | 51 +++++++++++ consent-tester/main.py | 2 + consent-tester/services/consent_scanner.py | 15 ++++ scripts/check-rebuild-needed.sh | 49 ++++++++++ 8 files changed, 269 insertions(+), 7 deletions(-) create mode 100644 backend-compliance/compliance/services/banner_screenshot_block.py create mode 100644 backend-compliance/compliance/services/rag_provenance.py create mode 100644 backend-compliance/tests/fixtures/golden_truth/vw_de.json create mode 100755 scripts/check-rebuild-needed.sh diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 47db13a6..d33e28ea 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -1486,6 +1486,17 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): except Exception as e: logger.warning("P71 jc_avv_decision skipped: %s", e) + # P85 — Banner-Screenshot fuer visuellen Beweis (zwischen + # GF-1-Pager und Detail-Bloecken) + banner_shot_html = "" + try: + from compliance.services.banner_screenshot_block import ( + build_banner_screenshot_html, + ) + banner_shot_html = build_banner_screenshot_html(banner_result) + except Exception as e: + logger.warning("P85 banner-screenshot skipped: %s", e) + # P82: GF-1-Pager ganz oben in der Mail — 5-Bullet-Zusammenfassung # damit die GF nicht 124k Char lesen muss. 
gf_one_pager_html = "" @@ -1585,6 +1596,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): + critical_html + scope_disclaimer_html + exec_summary_html + cookie_arch_html + summary_html + scanned_html + profile_html + scorecard_html + redundancy_html + + banner_shot_html + providers_html + banner_deep_html + cookie_audit_html + tcf_authority_html diff --git a/backend-compliance/compliance/services/banner_screenshot_block.py b/backend-compliance/compliance/services/banner_screenshot_block.py new file mode 100644 index 00000000..edda5664 --- /dev/null +++ b/backend-compliance/compliance/services/banner_screenshot_block.py @@ -0,0 +1,44 @@ +""" +P85 — Banner-Screenshot-Block in der Mail. + +Embedded den von consent-tester captured Screenshot des Banners +(banner_result.banner_screenshot_b64) als data-URI in die Mail. +"so sah euer Banner zum Audit-Zeitpunkt aus" — visueller Beweis fuer +Dispute mit Marketing-Team oder DSB. +""" + +from __future__ import annotations + +import logging + +logger = logging.getLogger(__name__) + + +def build_banner_screenshot_html(banner_result: dict | None) -> str: + if not isinstance(banner_result, dict): + return "" + b64 = banner_result.get("banner_screenshot_b64") or "" + if not b64 or len(b64) < 200: + return "" + provider = banner_result.get("banner_provider") or "Generic" + detected = banner_result.get("banner_detected") + return ( + '
' + '
' + 'Screenshot des Cookie-Banners zum Audit-Zeitpunkt
' + f'

' + f'Provider: {provider} · ' + f'erkannt: {"ja" if detected else "nein"}

' + '

' + 'Visueller Beweis wie das Banner zum Zeitpunkt des Audits angezeigt ' + 'wurde. Bei spaeterer Aenderung des Banners bitte mit diesem ' + 'Screenshot abgleichen.' + '

' + f'Cookie-Banner' + '
' + ) diff --git a/backend-compliance/compliance/services/rag_provenance.py b/backend-compliance/compliance/services/rag_provenance.py new file mode 100644 index 00000000..3f5f8433 --- /dev/null +++ b/backend-compliance/compliance/services/rag_provenance.py @@ -0,0 +1,90 @@ +""" +P70 — RAG-Provenance-Marker. + +Wenn ein Finding aus dem RAG-Korpus belegt ist (z.B. Art-Match auf +einen konkreten Gesetzes-Paragrafen aus dem ingestierten DSGVO/TDDDG/ +TMG-Korpus), bekommt es einen ✓-Marker. Wenn es nur aus unserer +Heuristik kommt (Pattern-Match ohne RAG-Belegung), bekommt es ein ⚠ +"Heuristik". + +Dadurch sieht der Nutzer sofort welche Aussagen rechtlich verbindlich +gestuetzt sind vs welche unsere Eigeninterpretation sind. + +Generisch: dataclass-aehnliche Funktion die ein Finding-dict klassifiziert. +""" + +from __future__ import annotations + +import logging +import re + +logger = logging.getLogger(__name__) + + +# Pattern fuer "Belegt aus Korpus": Finding enthaelt expliziten +# Norm-Bezug mit Artikel + Quelle. +_NORM_RE = re.compile( + r"(Art\.?\s*\d+(?:\s*Abs\.?\s*\d+)?(?:\s*lit\.?\s*[a-z])?\s*" + r"(?:DSGVO|GDPR|TDDDG|TMG|BDSG|UWG|TKG|EuGH|EDPB)|" + r"\(?(EU|VO)\s*\d{4}/\d+\)?|" + r"§\s*\d+[a-z]?\s*(TMG|UWG|BDSG|TKG|TDDDG))", + re.I, +) + + +def classify_finding_provenance(finding: dict) -> str: + """Returns 'rag', 'heuristic', or 'mixed'. 
+ + rag — Norm-Bezug + Quellen-URL (verbindlich) + heuristic — Pattern-Match ohne Norm-Bezug (Eigeninterpretation) + mixed — Norm-Bezug aber ohne Quellen-URL (teilweise belegbar) + """ + if not isinstance(finding, dict): + return "heuristic" + legal = (finding.get("legal_basis") or "").strip() + detail = (finding.get("detail") or "").strip() + rag_id = finding.get("rag_chunk_id") + rag_url = finding.get("rag_source_url") + blob = " ".join([legal, detail]) + has_norm = bool(_NORM_RE.search(blob)) + has_source = bool(rag_id or rag_url or + "https://" in legal or "https://" in detail) + if has_norm and has_source: + return "rag" + if has_norm: + return "mixed" + return "heuristic" + + +def provenance_badge_html(provenance: str) -> str: + if provenance == "rag": + return ( + '' + '✓ RAG' + ) + if provenance == "mixed": + return ( + '' + 'NORM' + ) + return ( + '' + '⚠ HEURISTIK' + ) + + +def annotate_findings(findings: list[dict]) -> list[dict]: + """In-place: setzt finding['provenance'] auf jeden Eintrag.""" + for f in (findings or []): + if isinstance(f, dict) and "provenance" not in f: + f["provenance"] = classify_finding_provenance(f) + return findings diff --git a/backend-compliance/compliance/services/tcf_vendor_authority.py b/backend-compliance/compliance/services/tcf_vendor_authority.py index a78a48fa..429341e5 100644 --- a/backend-compliance/compliance/services/tcf_vendor_authority.py +++ b/backend-compliance/compliance/services/tcf_vendor_authority.py @@ -81,6 +81,12 @@ async def fetch_and_ingest_tcf_vendors(db: Session) -> dict: if not vendors: return {"error": "no vendors in TCF response", "n_vendors": 0} + # Erst alte TCF-Eintraege weg (kein UNIQUE-Index auf cookie_name, + # daher kein ON CONFLICT moeglich → idempotent via DELETE+INSERT). 
+ db.execute(sa_text( + "DELETE FROM compliance.cookie_library WHERE source_name='iab_tcf_v2'" + )) + db.commit() inserted = 0 skipped = 0 for vid, v in vendors.items(): @@ -106,13 +112,6 @@ async def fetch_and_ingest_tcf_vendors(db: Session) -> dict: VALUES (:n, :dp, :v, :pu, :cat, :purp, 'iab_tcf_v2', 'https://vendor-list.consensu.org/v3/vendor-list.json', 0.99) - ON CONFLICT (cookie_name) DO UPDATE - SET actual_category = EXCLUDED.actual_category, - vendor_name = EXCLUDED.vendor_name, - vendor_privacy_url = EXCLUDED.vendor_privacy_url, - purpose_en = EXCLUDED.purpose_en, - source_name = EXCLUDED.source_name, - confidence = EXCLUDED.confidence """ ), {"n": marker, "dp": "*", "v": f"[TCF-{vid}] {name}", diff --git a/backend-compliance/tests/fixtures/golden_truth/vw_de.json b/backend-compliance/tests/fixtures/golden_truth/vw_de.json new file mode 100644 index 00000000..7ad1c132 --- /dev/null +++ b/backend-compliance/tests/fixtures/golden_truth/vw_de.json @@ -0,0 +1,51 @@ +{ + "site": "Volkswagen Deutschland", + "site_url": "https://www.volkswagen.de", + "captured_at": "2026-05-22T00:00:00Z", + "source": "User-Copy aus Cookie-Richtlinie (Browser Strg+A → Strg+C)", + "cookie_richtlinie_url": "https://www.volkswagen.de/de/mehr/rechtliches/cookie-richtlinie.html", + "expectations": { + "min_declared_cookies": 90, + "expected_unique_vendors_after_dedup": 18, + "must_find_cookies": [ + "VWD6_ENSIGHTEN_PRIVACY_MODAL_LOADED", + "VWD6_ENSIGHTEN_PRIVACY_MODAL_VIEWED", + "smartSignals2UiD", "smartSignals2sUiD", + "s_ecid", "s_cc", "s_sq", + "AMCV_", "AMCVS_", "demdex", "dextp", + "mbox", "mboxEdgeCluster", + "TDID", "TDCPM", "TTDOptOut", + "DSID", "ANID", "AID", "IDE", "TAID", + "_gcl_au", "_gcl_dc", "_fbc", "_fbp", "fr", + "_pk_uid", + "OptanonConsent", + "everest_g_v2", "everest_session_v2", + "adbCDP", + "liveagent_sid", "liveagent_chatted", + "X-Salesforce-eLB", "sfdc-stream", + "__cfduid", "__cflb", + "FPAU", "FPGCLDC", "FLC", "APC", + "wlfeDoLogin", 
"wlfeRefreshSessionId", "LBCOOKIE", + "CookieConsentPolicy", + "BrowserId", "BrowserId_sec", + "inbenta-km-session-id" + ], + "expected_vendors_present": [ + "Google", + "Adobe Experience Cloud", + "Adobe Analytics", + "The Trade Desk", + "AdForm", + "Meta / Facebook", + "Salesforce", + "Cloudflare", + "Borlabs" + ], + "expected_high_findings_minimum": 1, + "banner_must_be_detected": true, + "expected_doc_types_with_text": [ + "dse", "cookie", "impressum", "nutzungsbedingungen" + ] + }, + "raw_paste": "Name des Cookies\nKategorie\nVerwendungszweck\nSpeicherdauer\nArt des Cookies\nSee tests/fixtures/cookie_gt/vw_cookie_richtlinie.txt for the abbreviated raw form." +} diff --git a/consent-tester/main.py b/consent-tester/main.py index baa5bd7e..a4463be9 100644 --- a/consent-tester/main.py +++ b/consent-tester/main.py @@ -53,6 +53,7 @@ class ScanResponse(BaseModel): cmp_payloads: list[dict] = [] # P48: raw CMP JSON-payloads (Usercentrics/OneTrust/...) captured during scan vendor_details: list[dict] = [] # P50: per-vendor detail-modal-extracts (Beschreibung/Cookies/Opt-Out/Privacy) cookies_detailed: list[dict] = [] # P59b: full cookie details for behavior-validation (name,value,domain,expires,phase,declared_category) + banner_screenshot_b64: str = "" # P85: base64-PNG des Banners (initial-view) @app.get("/health") @@ -133,6 +134,7 @@ async def scan_consent(req: ScanRequest): cmp_payloads=result.cmp_payloads, # P48 vendor_details=result.vendor_details, # P50 cookies_detailed=result.cookies_detailed, # P59b + banner_screenshot_b64=result.banner_screenshot_b64, # P85 ) diff --git a/consent-tester/services/consent_scanner.py b/consent-tester/services/consent_scanner.py index 4cedc42f..9a4596f2 100644 --- a/consent-tester/services/consent_scanner.py +++ b/consent-tester/services/consent_scanner.py @@ -77,6 +77,10 @@ class ConsentTestResult: # for behavior-validation in backend. Implicit declared_category: # before/reject phase = essential (site claims), accept = any. 
cookies_detailed: list = field(default_factory=list) + # P85: base64-PNG-Screenshot des Banners vor dem ersten Klick. + # Backend embedded das als in der Mail — visueller Beweis + # "so sah das Banner zum Audit-Zeitpunkt aus". + banner_screenshot_b64: str = "" async def run_consent_test( @@ -196,6 +200,17 @@ async def run_consent_test( result.banner_text_violations = banner_violations["violations"] result.banner_has_impressum_link = banner_violations["has_impressum"] result.banner_has_dse_link = banner_violations["has_dse"] + # P85 — visueller Beweis fuer die Mail. + try: + import base64 as _b64 + png = await page_a.screenshot( + full_page=False, type="png", timeout=10000, + ) + if png and len(png) < 1_500_000: # < 1.5 MB + result.banner_screenshot_b64 = _b64.b64encode(png).decode("ascii") + logger.info("P85: banner screenshot captured (%d bytes)", len(png)) + except Exception as _se: + logger.warning("P85: banner screenshot failed: %s", _se) await ctx_a.close() diff --git a/scripts/check-rebuild-needed.sh b/scripts/check-rebuild-needed.sh new file mode 100755 index 00000000..7c5a8a95 --- /dev/null +++ b/scripts/check-rebuild-needed.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# P83 — verhindert "alter Code im Container"-Bug. +# +# Vergleicht den im Container deployten git-SHA mit dem aktuellen +# Source-SHA. Wenn abweichend → exit 1 mit Hinweis Build/Recreate. +# +# Aufruf-Beispiele: +# ./scripts/check-rebuild-needed.sh backend-compliance +# ./scripts/check-rebuild-needed.sh admin-compliance +# ./scripts/check-rebuild-needed.sh consent-tester +# +# CI-Verwendung: nach git push, vor dem ersten Health-Check. +# Lokal: claude / dev kann es via pre-merge-hook nutzen. +# +# Voraussetzung: Container hat BUILD_SHA env (gesetzt im Dockerfile via +# ARG BUILD_SHA + ENV BUILD_SHA=$BUILD_SHA). Falls leer → Warnung. 
+
+set -e
+# Container name = "bp-compliance-" + the part of SERVICE before its first "-".
+SERVICE="${1:-backend-compliance}"
+CONTAINER="bp-compliance-${SERVICE%%-*}"  # backend-compliance → bp-compliance-backend
+if [[ "$SERVICE" == "consent-tester" ]]; then
+  CONTAINER="bp-compliance-consent-tester"
+fi
+# The docker binary can be overridden through the DOCKER environment variable.
+DOCKER="${DOCKER:-/usr/local/bin/docker}"
+# "unknown" = BUILD_SHA unset/empty in the container; "container-down" = docker exec failed.
+deployed_sha=$($DOCKER exec "$CONTAINER" sh -c 'echo "${BUILD_SHA:-unknown}"' 2>/dev/null || echo "container-down")
+local_sha=$(git rev-parse --short HEAD)
+# Exit 2: the docker exec call itself failed.
+if [[ "$deployed_sha" == "container-down" ]]; then
+  echo "❌ Container $CONTAINER is not running"
+  exit 2
+fi
+# Exit 0: there is no SHA to compare against, so only warn.
+if [[ "$deployed_sha" == "unknown" ]]; then
+  echo "⚠️ $CONTAINER has no BUILD_SHA env — cannot verify."
+  echo " Add to Dockerfile: ARG BUILD_SHA / ENV BUILD_SHA=\$BUILD_SHA"
+  exit 0
+fi
+# Exit 1: neither SHA is a prefix of the other (tolerates short vs. full SHA).
+if [[ "$deployed_sha" != "$local_sha"* && "$local_sha" != "$deployed_sha"* ]]; then
+  echo "❌ $CONTAINER is on commit $deployed_sha, local is $local_sha"
+  echo " REBUILD REQUIRED:"
+  echo " docker compose build $SERVICE && docker compose up -d --no-deps --force-recreate $SERVICE"
+  exit 1
+fi
+
+echo "✓ $CONTAINER ($deployed_sha) matches local ($local_sha)"