603381a67f
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 38s
CI / test-python-document-crawler (push) Has been skipped
CI / detect-changes (push) Successful in 12s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
P58 Anti-Audit-Detection robuster (script-domain + settings-spezifisch —
war bereits im Code, jetzt sauber als completed dokumentiert).
P59c DACH-Custom-Cookies in compliance.cookie_library: Borlabs,
etracker, Matomo/Piwik, Userlike, Cookiebot/Cookieyes/Usercentrics,
Akamai/Cloudflare/Datadome Bot-Manager + HubSpot. 21 neue Eintraege
(3 von 24 schon via Open-Cookie-Database vorhanden).
Script: backend-compliance/scripts/seed_dach_cookies.py.
P60b Vendor-Pattern-Dedupe mit Fuzzy-Match (Jaccard >= 0.7) statt exakter
Tuple-Equality. Vendors mit teilweise befuellten Feldern (z.B.
Sitzland eingetragen) fallen nicht mehr aus der globalen Notice —
Bug: Amazon/Psyma/Qualtrics hatten zuvor wiederholte per-row Actions.
P61 "Untergeschobene Cookies"-Erkennung — wenn ein deklarierter Vendor
(z.B. Google Tag Manager) automatisch weitere mitbringt (GA + GCL_AU
+ DoubleClick), werden diese als separater Mail-Block (gelb) mit
COOKIE/VENDOR-Badges + Quellen-Doku ausgewiesen. Neuer Service:
compliance.services.vendor_package_cookies (8 Primary-Vendors mit
je 2-4 implicit Cookies/Vendors).
P62 Marketing-Manager-Disclaimer "Was wir sehen / nicht sehen" als
blauer Box-Block direkt unter dem Critical-Findings-Block. Erklaert
Grenzen unseres Audits (Server-Side-Tracking, Vendor-interne
Datenweitergabe, Cross-Page-Banner) und Risiko des Falschvertrauens
in einen 100%-Score. Neuer Renderer: compliance.api.scope_disclaimer.
Architektur: VVT-Tabellen-Renderer aus agent_doc_check_extras.py (552
LOC -> 242 LOC) in compliance.api.vvt_table_renderer ausgelagert, um den
500-LOC-Hardcap einzuhalten.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
243 lines
9.8 KiB
Python
243 lines
9.8 KiB
Python
"""
|
|
Extras for the agent doc-check email report.

Split out from agent_doc_check_report.py to keep both files under the
500-line hard cap. Contains:
- build_scanned_urls_html (list of fetched URLs + cross-domain notice)
- build_provider_list_html (cookie banner + TCF vendor table)
- build_vvt_table_html (re-exported from compliance.api.vvt_table_renderer)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
def build_scanned_urls_html(doc_entries: list[dict]) -> str:
    """Render the list of scanned URLs at the top of the report.

    Transparent for the GF which sources were actually fetched/analysed.
    Skips empty URLs (text-only uploads) and URLs already listed. Adds a
    cross-domain warning when legal texts are distributed across multiple
    domains (e.g. BMW spreads across bmw.de, bmwgroup.com, bmwgroup.jobs).

    Args:
        doc_entries: One dict per document. Keys read here: ``url``,
            ``doc_type``, ``word_count`` and ``auto_discovered``.

    Returns:
        An HTML fragment, or ``""`` when no entry carries a URL.
    """
    from html import escape
    from urllib.parse import urlparse

    rows: list[str] = []
    seen: set[str] = set()
    domains: dict[str, list[str]] = {}  # netloc -> list of doc-type labels
    for entry in doc_entries:
        url = str(entry.get("url") or "").strip()
        if not url or url in seen:
            continue
        seen.add(url)
        label = _doc_type_label(entry.get("doc_type", ""))
        words = entry.get("word_count") or 0
        auto = entry.get("auto_discovered")

        # Best effort: an unparsable URL is still listed, it just does not
        # take part in the cross-domain grouping.
        try:
            netloc = urlparse(url).netloc.lower()
        except ValueError:
            netloc = ""
        # Drop only a literal "www." prefix. str.lstrip("www.") would strip
        # any leading run of the characters "w" and "." ("web.de" -> "eb.de").
        if netloc.startswith("www."):
            netloc = netloc[len("www."):]
        if netloc:
            # Store the raw label; escaping happens where HTML is rendered.
            domains.setdefault(netloc, []).append(label)

        badge = ('<span style="display:inline-block;margin-left:6px;'
                 'background:#dbeafe;color:#1e40af;font-size:10px;'
                 'padding:1px 6px;border-radius:8px;font-family:sans-serif">'
                 'auto-entdeckt</span>') if auto else ""

        # Values come from crawled pages, so escape them before interpolation.
        safe_url = escape(url)
        safe_label = escape(str(label))
        safe_words = escape(str(words))
        rows.append(
            f'<tr>'
            f'<td style="padding:3px 12px 3px 0;color:#475569;font-size:12px">'
            f'{safe_label}{badge}</td>'
            f'<td style="padding:3px 12px 3px 0;font-size:12px;'
            f'font-family:ui-monospace,monospace;color:#1e293b;word-break:break-all">'
            f'<a href="{safe_url}" style="color:#2563eb;text-decoration:none">{safe_url}</a></td>'
            f'<td style="padding:3px 0;color:#94a3b8;font-size:11px;text-align:right;'
            f'white-space:nowrap">{safe_words} Woerter</td>'
            f'</tr>'
        )
    if not rows:
        return ""

    cross_domain_html = _cross_domain_notice(domains) if len(domains) >= 2 else ""

    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
        'background:#fafafa;border:1px solid #e5e7eb;border-radius:8px">'
        '<h3 style="margin:0 0 8px;font-size:14px;color:#334155">'
        f'Gepruefte Quellen ({len(rows)})</h3>'
        '<table style="width:100%;border-collapse:collapse">'
        + "".join(rows)
        + '</table>'
        + cross_domain_html
        + '</div>'
    )
|
|
|
|
|
|
def _cross_domain_notice(domains: dict[str, list[str]]) -> str:
    """Warning box when legal texts are spread across multiple domains.

    Relevant for big corporate groups (BMW Group: bmw.de / bmwgroup.com /
    bmwgroup.jobs). Affects findability for data subjects and may indicate
    incomplete disclosure on the main site.

    Args:
        domains: Maps a host name to the document labels found on it.

    Returns:
        An HTML ``<div>`` with one ``<li>`` per host (hosts sorted, labels
        de-duplicated and sorted).
    """
    from html import escape

    # Host names and labels derive from crawled pages, so escape both.
    items = [
        f'<li style="margin-bottom:2px"><strong>{escape(str(netloc))}</strong> '
        f'<span style="color:#92400e;font-size:11px">→ '
        f'{escape(", ".join(sorted(set(labels))))}</span></li>'
        for netloc, labels in sorted(domains.items())
    ]
    return (
        '<div style="margin-top:12px;padding:10px 12px;background:#fffbeb;'
        'border-left:3px solid #f59e0b;border-radius:4px;font-size:12px;'
        'color:#78350f">'
        '<strong>Hinweis: Rechtstexte verteilt auf '
        f'{len(domains)} Domains.</strong> '
        'Erschwert die Auffindbarkeit fuer Betroffene (Art. 12 Abs. 1 DSGVO — '
        'transparente Information). Pruefen Sie, ob alle Texte auch von der '
        'Hauptdomain aus klar verlinkt sind.'
        '<ul style="margin:6px 0 0 16px;padding-left:0">'
        + "".join(items) +
        '</ul></div>'
    )
|
|
|
|
|
|
def _doc_type_label(doc_type: str) -> str:
    """Return the German display label for a document-type key.

    Lazy resolver — the table lives inside the function to avoid a circular
    import with agent_compliance_check_routes.

    Unknown non-empty keys are returned upper-cased; an empty or missing
    key yields "Dokument".
    """
    # Resolve the fallback first, exactly as an eager ``dict.get`` default.
    if doc_type:
        fallback = doc_type.upper()
    else:
        fallback = "Dokument"

    privacy_policy = "Datenschutzerklaerung"
    known = dict(
        dse=privacy_policy,
        datenschutz=privacy_policy,
        privacy=privacy_policy,
        impressum="Impressum",
        agb="AGB",
        widerruf="Widerrufsbelehrung",
        cookie="Cookie-Richtlinie",
        avv="Auftragsverarbeitung",
        loeschkonzept="Loeschkonzept",
        dsfa="Datenschutz-Folgenabschaetzung",
        social_media="Social Media Datenschutz",
        nutzungsbedingungen="Nutzungsbedingungen",
        dsb="DSB-Kontakt",
    )
    try:
        return known[doc_type]
    except KeyError:
        return fallback
|
|
|
|
|
|
def build_provider_list_html(
    banner_result: dict | None,
    vvt_entries: list[dict] | None,
) -> str:
    """Render the cookie banner result + TCF vendor table for the email.

    Sections:
      1. Banner summary (provider, violations count)
      2. Vendor table: Name | Kategorie | Zweck | Drittland | Rechtsgrundlage

    Args:
        banner_result: Banner scan result. Keys read here:
            ``banner_detected``, ``banner_provider`` and
            ``banner_checks["violations"]`` (a list or a count).
        vvt_entries: Vendor dicts, rendered via ``_render_vendor_row``.
            Only the first 50 are shown; the rest are summarised.

    Returns:
        An HTML fragment, or ``""`` when both arguments are empty.
    """
    from html import escape

    if not banner_result and not vvt_entries:
        return ""

    parts: list[str] = [
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
        'background:#fafafa;border:1px solid #e5e7eb;border-radius:8px">'
        '<h3 style="margin:0 0 10px;font-size:14px;color:#334155">'
        'Cookie-Banner & Verarbeiter</h3>'
    ]

    if banner_result:
        detected = banner_result.get("banner_detected", False)
        # The provider string comes from the scanned site, so escape it.
        provider = escape(str(banner_result.get("banner_provider") or "unbekannt"))

        # "banner_checks" may be present but None, or not a dict at all.
        checks = banner_result.get("banner_checks")
        violations = checks.get("violations", []) if isinstance(checks, dict) else []
        if isinstance(violations, (list, tuple)):
            n_viol = len(violations)
        else:
            # A malformed count must not abort rendering of the whole report.
            try:
                n_viol = int(violations or 0)
            except (TypeError, ValueError):
                n_viol = 0

        # green: banner without findings, amber: banner with findings,
        # grey: no banner detected.
        status_color = "#16a34a" if detected and n_viol == 0 else (
            "#d97706" if detected else "#6b7280"
        )
        parts.append(
            f'<div style="font-size:13px;color:#374151;margin-bottom:10px">'
            f'<span style="display:inline-block;width:8px;height:8px;'
            f'border-radius:50%;background:{status_color};margin-right:8px"></span>'
            f'Banner erkannt: <strong>{"Ja" if detected else "Nein"}</strong>'
            f' · Anbieter: <strong>{provider}</strong>'
            f' · Auffaelligkeiten: <strong>{n_viol}</strong>'
            f'</div>'
        )

    vendors = vvt_entries or []
    if vendors:
        parts.append(
            f'<div style="font-size:12px;color:#475569;margin:8px 0 6px">'
            f'<strong>{len(vendors)} TCF-Verarbeiter ueber das Banner eingebunden:</strong>'
            f'</div>'
            '<table style="width:100%;border-collapse:collapse;font-size:11px">'
            '<thead><tr style="background:#f1f5f9;color:#475569;text-align:left">'
            '<th style="padding:5px 8px">Name</th>'
            '<th style="padding:5px 8px">Kategorie</th>'
            '<th style="padding:5px 8px">Zweck</th>'
            '<th style="padding:5px 8px">Drittland</th>'
            '<th style="padding:5px 8px">Rechtsgrundlage</th>'
            '</tr></thead><tbody>'
        )
        parts.extend(_render_vendor_row(v) for v in vendors[:50])
        parts.append('</tbody></table>')
        if len(vendors) > 50:
            parts.append(
                f'<div style="font-size:11px;color:#94a3b8;margin-top:4px">'
                f'... und {len(vendors) - 50} weitere</div>'
            )
    elif banner_result and banner_result.get("banner_detected"):
        parts.append(
            '<div style="font-size:11px;color:#94a3b8">'
            'Keine TCF-Verarbeiter erkannt (Banner nutzt kein TCF v2 Framework '
            'oder Vendor-Liste konnte nicht ausgelesen werden).</div>'
        )

    parts.append('</div>')
    return "".join(parts)
|
|
|
|
|
|
def _render_vendor_row(v: dict) -> str:
    """Render one vendor dict as a ``<tr>`` of the vendor table.

    Keys read from ``v``: ``name``, ``kategorie``, ``zweck_kurz`` (preferred)
    or ``zweck`` (first two purposes), ``drittland`` (``True`` / ``False`` /
    anything else means unknown), ``land`` and ``rechtsgrundlage``.

    All vendor-supplied text is HTML-escaped before interpolation.

    Returns:
        The ``<tr>...</tr>`` HTML for this vendor.
    """
    from html import escape

    name = escape(str(v.get("name") or "Unbekannt"))
    kategorie = escape(str(_category_label(v.get("kategorie", ""))))

    zweck = v.get("zweck_kurz")
    if not zweck:
        purposes = v.get("zweck") or []
        # A bare string would otherwise be sliced by character ("ab" -> "a, b").
        if not isinstance(purposes, (list, tuple)):
            purposes = [purposes]
        zweck = ", ".join(str(p) for p in purposes[:2])
    zweck = escape(str(zweck))

    drittland = v.get("drittland")
    land = escape(str(v.get("land") or ""))
    if drittland is True:
        drittland_str = (f'<span style="color:#dc2626">Ja ({land})</span>'
                         if land else '<span style="color:#dc2626">Ja</span>')
    elif drittland is False:
        drittland_str = (f'<span style="color:#16a34a">Nein ({land})</span>'
                         if land else '<span style="color:#16a34a">Nein</span>')
    else:
        drittland_str = '<span style="color:#94a3b8">unbekannt</span>'

    # The key may be present with a None value; normalise to a string.
    rg = str(v.get("rechtsgrundlage") or "")
    if "Einwilligung" in rg:
        rg_short = "Einwilligung"
    elif "Berechtigtes" in rg:
        rg_short = "Berechtigtes Interesse"
    else:
        # Truncate first, then escape, so no entity is cut in half.
        rg_short = escape(rg[:40])

    return (
        f'<tr style="border-top:1px solid #e2e8f0">'
        f'<td style="padding:4px 8px;color:#1e293b">{name}</td>'
        f'<td style="padding:4px 8px;color:#475569">{kategorie}</td>'
        f'<td style="padding:4px 8px;color:#475569">{zweck}</td>'
        f'<td style="padding:4px 8px">{drittland_str}</td>'
        f'<td style="padding:4px 8px;color:#475569">{rg_short}</td>'
        f'</tr>'
    )
|
|
|
|
|
|
def _category_label(kat: str) -> str:
    """Translate a consent-category key into its German display name.

    Unknown non-empty keys are returned unchanged; an empty or missing key
    yields an em dash placeholder.
    """
    necessary, marketing = "Notwendig", "Marketing"
    translations = {
        "necessary": necessary,
        "strictlyNecessary": necessary,
        "functional": "Funktional",
        "statistics": "Statistik",
        "marketing": marketing,
        "advertising": marketing,
    }
    try:
        return translations[kat]
    except KeyError:
        return kat or "—"
|
|
|
|
|
|
# The VVT table (grouped + P60/P60b pattern notice) was moved to
# vvt_table_renderer.py so that this file stays under the 500-LOC
# hard cap. Re-exported so that existing callers (e.g.
# agent_compliance_check_routes) keep working unchanged.
|
|
from compliance.api.vvt_table_renderer import build_vvt_table_html # noqa: E402,F401
|
|
|