Files
breakpilot-compliance/backend-compliance/compliance/api/agent_doc_check_extras.py
T
Benjamin Admin e61e9d9e2a feat(agent): progress_pct + 6 BMW-Run Verbesserungen
Backend (agent_compliance_check_routes.py):
- progress_pct (0-100%) im Job-State, ueber alle Phasen verteilt
  (Laden 0-30, Profil 35-40, Pruefen 40-80, Banner 80-92, Report 95-100)
- Status-Texte vereinheitlicht ("Texte laden X/N", "Pruefen X/N")
- Firmenname fuer Email-Subject jetzt aus URL abgeleitet
  (bmw.de -> "BMW", mercedes-benz.de -> "Mercedes-Benz") statt
  unzuverlaessigem extracted_profile.companyName (matchte oft juris.de)
- E-Mail-Report enthaelt jetzt Banner+TCF-Vendor-Liste (build_provider_list_html)

Backend (agent_doc_check_extras.py — neu):
- build_scanned_urls_html: gepruefte URLs als Tabelle oben im Report
  (transparent fuer GF, welche Quellen wirklich gezogen wurden)
- Cross-Domain-Hinweis bei >1 netloc (BMW: bmw.de / bmwgroup.com /
  bmwgroup.jobs — Auffindbarkeit nach Art. 12 DSGVO)
- build_provider_list_html: Banner-Box + TCF-Vendor-Tabelle mit Spalten
  Name | Kategorie | Zweck | Drittland | Rechtsgrundlage

Backend (business_profiler.py):
- §34d-GewO Versicherungsvermittler-Hinweise zaehlen nicht mehr als
  "finance"-Industrie (BMW wurde dadurch falsch als B2B/finance erkannt)
- Neue Industry "automotive" (Fahrzeug/KFZ/Konfigurator/Modellpalette)
- B2B-Keywords: generische Begriffe wie "unternehmen", "beratung",
  "consulting" entfernt (matchten in jedem Konzerntext)
- B2C-Fallback: bei Verbraucher-Signalen ("widerruf", "kunde",
  redaktioneller Inhalt) tendiert auf b2c statt b2b

Frontend (ComplianceCheckTab.tsx):
- Progress-Balken mit Width-% und XX%-Anzeige rechts
- liest data.progress_pct aus Polling-Response

Consent-Tester (dsi_discovery.py):
- Kritischer Fix der Cookie-Policy-Extraktion: wait_for_function wartet, bis
  body.innerText laenger als 500 Zeichen ist (BMW SPA-Rendering brauchte mehr Zeit)
- _extract_text_robust: 3-Strategien-Extraktion (Selektoren -> Body-
  Cleanup -> P/LI/TD-Tags)
- _extract_text_from_iframes: liest OneTrust/Sourcepoint/Usercentrics
  Iframe-Inhalte (manche Cookie-Policies leben dort)

Adressiert alle Findings aus dem BMW-Ground-Truth-Vergleich.
2026-05-16 17:53:14 +02:00

227 lines
9.1 KiB
Python

"""
Extras for the agent doc-check email report.
Split out from agent_doc_check_report.py to keep both files under the
500-line hard cap. Contains:
- build_scanned_urls_html (list of fetched URLs + cross-domain notice)
- build_provider_list_html (cookie banner + TCF vendor table)
"""
from __future__ import annotations
def build_scanned_urls_html(doc_entries: list[dict]) -> str:
    """Render the list of scanned URLs shown at the top of the report.

    Every distinct, non-empty URL becomes one table row (document label,
    link, word count). Entries without a URL (text-only uploads) and
    repeated URLs are skipped. When the URLs span two or more hosts, a
    cross-domain notice is appended below the table.

    Args:
        doc_entries: Dicts read via the keys ``url``, ``doc_type`` and
            ``word_count``. ``None`` is treated like an empty list.

    Returns:
        An HTML fragment, or ``""`` when no entry carries a URL.
    """
    from html import escape
    from urllib.parse import urlparse

    rows: list[str] = []
    seen: set[str] = set()
    domains: dict[str, list[str]] = {}  # host -> labels of the documents found there
    for entry in doc_entries or []:
        url = (entry.get("url") or "").strip()
        if not url or url in seen:
            continue
        seen.add(url)
        label = _doc_type_label(entry.get("doc_type", ""))
        words = entry.get("word_count") or 0
        try:
            netloc = urlparse(url).netloc.lower()
        except ValueError:
            # urlparse rejects e.g. malformed IPv6 literals. The URL is still
            # listed in the table; it just cannot be grouped by host.
            netloc = ""
        # Remove only a literal "www." prefix. str.lstrip("www.") would strip
        # any leading run of the characters 'w' and '.' ("web.de" -> "eb.de").
        if netloc.startswith("www."):
            netloc = netloc[4:]
        if netloc:
            # Stored unescaped: escaping is the job of whoever renders it.
            domains.setdefault(netloc, []).append(label)
        # URLs and labels originate from crawled pages, so escape them before
        # they are placed into attribute values or element content.
        safe_url = escape(url, quote=True)
        safe_label = escape(str(label))
        safe_words = escape(str(words))
        rows.append(
            f'<tr>'
            f'<td style="padding:3px 12px 3px 0;color:#475569;font-size:12px">{safe_label}</td>'
            f'<td style="padding:3px 12px 3px 0;font-size:12px;'
            f'font-family:ui-monospace,monospace;color:#1e293b;word-break:break-all">'
            f'<a href="{safe_url}" style="color:#2563eb;text-decoration:none">{safe_url}</a></td>'
            f'<td style="padding:3px 0;color:#94a3b8;font-size:11px;text-align:right;'
            f'white-space:nowrap">{safe_words} Woerter</td>'
            f'</tr>'
        )
    if not rows:
        return ""
    cross_domain_html = _cross_domain_notice(domains) if len(domains) >= 2 else ""
    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
        'background:#fafafa;border:1px solid #e5e7eb;border-radius:8px">'
        '<h3 style="margin:0 0 8px;font-size:14px;color:#334155">'
        f'Gepruefte Quellen ({len(rows)})</h3>'
        '<table style="width:100%;border-collapse:collapse">'
        + "".join(rows)
        + '</table>'
        + cross_domain_html
        + '</div>'
    )
def _cross_domain_notice(domains: dict[str, list[str]]) -> str:
    """Build the warning box shown when legal texts span several domains.

    Relevant for big corporate groups (BMW Group: bmw.de / bmwgroup.com /
    bmwgroup.jobs). Spreading the texts affects findability for data
    subjects and may indicate incomplete disclosure on the main site.

    Args:
        domains: Maps a host name to the labels of the documents found on
            it. Hosts are listed alphabetically; labels are de-duplicated
            and sorted per host.

    Returns:
        An HTML fragment containing the notice and one list item per host.
    """
    from html import escape

    items = []
    for netloc, labels in sorted(domains.items()):
        labels_str = ", ".join(sorted(set(labels)))
        # Host names and labels are derived from crawled URLs / document
        # types, so they are escaped before being placed into the markup.
        items.append(
            f'<li style="margin-bottom:2px"><strong>{escape(str(netloc))}</strong> '
            f'<span style="color:#92400e;font-size:11px">&rarr; {escape(labels_str)}</span></li>'
        )
    return (
        '<div style="margin-top:12px;padding:10px 12px;background:#fffbeb;'
        'border-left:3px solid #f59e0b;border-radius:4px;font-size:12px;'
        'color:#78350f">'
        '<strong>Hinweis: Rechtstexte verteilt auf '
        f'{len(domains)} Domains.</strong> '
        'Erschwert die Auffindbarkeit fuer Betroffene (Art. 12 Abs. 1 DSGVO &mdash; '
        'transparente Information). Pruefen Sie, ob alle Texte auch von der '
        'Hauptdomain aus klar verlinkt sind.'
        '<ul style="margin:6px 0 0 16px;padding-left:0">'
        + "".join(items) +
        '</ul></div>'
    )
def _doc_type_label(doc_type: str) -> str:
    """Translate a document-type key into its display label.

    Kept local to this module to avoid a circular import with
    agent_compliance_check_routes. Unknown keys are shown upper-cased; an
    empty or missing key yields the generic label "Dokument".
    """
    # Computed up front, exactly like an eagerly evaluated dict.get default.
    fallback = doc_type.upper() if doc_type else "Dokument"

    # The three privacy-policy aliases share one label.
    known = dict.fromkeys(
        ("dse", "datenschutz", "privacy"),
        "Datenschutzerklaerung",
    )
    known.update(
        [
            ("impressum", "Impressum"),
            ("agb", "AGB"),
            ("widerruf", "Widerrufsbelehrung"),
            ("cookie", "Cookie-Richtlinie"),
            ("avv", "Auftragsverarbeitung"),
            ("loeschkonzept", "Loeschkonzept"),
            ("dsfa", "Datenschutz-Folgenabschaetzung"),
            ("social_media", "Social Media Datenschutz"),
            ("nutzungsbedingungen", "Nutzungsbedingungen"),
            ("dsb", "DSB-Kontakt"),
        ]
    )
    return known.get(doc_type, fallback)
def build_provider_list_html(
    banner_result: dict | None,
    vvt_entries: list[dict] | None,
) -> str:
    """Render the cookie banner result + TCF vendor table for the email.

    Sections:
      1. Banner summary (detected yes/no, provider, violations count)
      2. Vendor table: Name | Kategorie | Zweck | Drittland | Rechtsgrundlage
         (at most 50 rows, followed by a "... und N weitere" line)

    Args:
        banner_result: Banner scan result, read via the keys
            ``banner_detected``, ``banner_provider`` and
            ``banner_checks["violations"]`` (a list or a count).
        vvt_entries: Vendor dicts, each rendered by ``_render_vendor_row``.

    Returns:
        An HTML fragment, or ``""`` when both arguments are empty/None.
    """
    from html import escape

    if not banner_result and not vvt_entries:
        return ""
    parts: list[str] = [
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
        'background:#fafafa;border:1px solid #e5e7eb;border-radius:8px">'
        '<h3 style="margin:0 0 10px;font-size:14px;color:#334155">'
        'Cookie-Banner &amp; Verarbeiter</h3>'
    ]
    if banner_result:
        detected = banner_result.get("banner_detected", False)
        # The provider name is read from the scanned site, so escape it.
        provider = escape(str(banner_result.get("banner_provider") or "unbekannt"))
        # "banner_checks" can be present with value None; .get(key, {}) would
        # then hand back None and the chained .get() would raise.
        checks = banner_result.get("banner_checks") or {}
        violations = checks.get("violations", []) if isinstance(checks, dict) else []
        if isinstance(violations, (list, tuple)):
            n_viol = len(violations)
        else:
            try:
                n_viol = int(violations or 0)
            except (TypeError, ValueError):
                # An unparseable count must not abort the whole report.
                n_viol = 0
        # green: banner without findings, amber: banner with findings,
        # grey: no banner detected.
        status_color = "#16a34a" if detected and n_viol == 0 else (
            "#d97706" if detected else "#6b7280"
        )
        parts.append(
            f'<div style="font-size:13px;color:#374151;margin-bottom:10px">'
            f'<span style="display:inline-block;width:8px;height:8px;'
            f'border-radius:50%;background:{status_color};margin-right:8px"></span>'
            f'Banner erkannt: <strong>{"Ja" if detected else "Nein"}</strong>'
            f' &nbsp;&middot;&nbsp; Anbieter: <strong>{provider}</strong>'
            f' &nbsp;&middot;&nbsp; Auffaelligkeiten: <strong>{n_viol}</strong>'
            f'</div>'
        )
    vendors = vvt_entries or []
    if vendors:
        parts.append(
            f'<div style="font-size:12px;color:#475569;margin:8px 0 6px">'
            f'<strong>{len(vendors)} TCF-Verarbeiter ueber das Banner eingebunden:</strong>'
            f'</div>'
            '<table style="width:100%;border-collapse:collapse;font-size:11px">'
            '<thead><tr style="background:#f1f5f9;color:#475569;text-align:left">'
            '<th style="padding:5px 8px">Name</th>'
            '<th style="padding:5px 8px">Kategorie</th>'
            '<th style="padding:5px 8px">Zweck</th>'
            '<th style="padding:5px 8px">Drittland</th>'
            '<th style="padding:5px 8px">Rechtsgrundlage</th>'
            '</tr></thead><tbody>'
        )
        # Cap the table so very long vendor lists do not bloat the email.
        parts.extend(_render_vendor_row(v) for v in vendors[:50])
        parts.append('</tbody></table>')
        if len(vendors) > 50:
            parts.append(
                f'<div style="font-size:11px;color:#94a3b8;margin-top:4px">'
                f'... und {len(vendors) - 50} weitere</div>'
            )
    elif banner_result and banner_result.get("banner_detected"):
        parts.append(
            '<div style="font-size:11px;color:#94a3b8">'
            'Keine TCF-Verarbeiter erkannt (Banner nutzt kein TCF v2 Framework '
            'oder Vendor-Liste konnte nicht ausgelesen werden).</div>'
        )
    parts.append('</div>')
    return "".join(parts)
def _render_vendor_row(v: dict) -> str:
    """Render one vendor dict as a ``<tr>`` of the vendor table.

    Columns: Name | Kategorie | Zweck | Drittland | Rechtsgrundlage.

    Args:
        v: Vendor dict, read via the keys ``name``, ``kategorie``,
            ``zweck_kurz`` (preferred) or ``zweck`` (first two entries),
            ``drittland`` (True / False / anything else = unknown),
            ``land`` and ``rechtsgrundlage``.

    Returns:
        The HTML table row. All vendor-supplied text is HTML-escaped.
    """
    from html import escape

    name = escape(str(v.get("name") or "Unbekannt"))
    kategorie = escape(str(_category_label(v.get("kategorie", ""))))

    zweck = v.get("zweck_kurz")
    if not zweck:
        purposes = v.get("zweck") or []
        if isinstance(purposes, str):
            # A bare string would otherwise be sliced and joined per
            # character ("Marketing" -> "M, a").
            purposes = [purposes]
        zweck = ", ".join(str(p) for p in list(purposes)[:2])
    zweck = escape(str(zweck))

    drittland = v.get("drittland")
    land = escape(str(v.get("land") or ""))
    # Identity checks on purpose: only a real True/False counts as known;
    # None or any other value is rendered as "unbekannt".
    if drittland is True:
        drittland_str = (f'<span style="color:#dc2626">Ja ({land})</span>'
                         if land else '<span style="color:#dc2626">Ja</span>')
    elif drittland is False:
        drittland_str = (f'<span style="color:#16a34a">Nein ({land})</span>'
                         if land else '<span style="color:#16a34a">Nein</span>')
    else:
        drittland_str = '<span style="color:#94a3b8">unbekannt</span>'

    # The key may be present with value None; normalise before the
    # substring tests below.
    rg = str(v.get("rechtsgrundlage") or "")
    if "Einwilligung" in rg:
        rg_short = "Einwilligung"
    elif "Berechtigtes" in rg:
        rg_short = "Berechtigtes Interesse"
    else:
        # Truncate first, then escape, so no entity is cut in half.
        rg_short = escape(rg[:40])

    return (
        f'<tr style="border-top:1px solid #e2e8f0">'
        f'<td style="padding:4px 8px;color:#1e293b">{name}</td>'
        f'<td style="padding:4px 8px;color:#475569">{kategorie}</td>'
        f'<td style="padding:4px 8px;color:#475569">{zweck}</td>'
        f'<td style="padding:4px 8px">{drittland_str}</td>'
        f'<td style="padding:4px 8px;color:#475569">{rg_short}</td>'
        f'</tr>'
    )
def _category_label(kat: str) -> str:
    """Map a consent-category key to its German display label.

    Unknown keys are returned unchanged; an empty or missing key yields "".
    """
    translations = {
        "necessary": "Notwendig",
        "functional": "Funktional",
        "statistics": "Statistik",
        "marketing": "Marketing",
    }
    if kat in translations:
        return translations[kat]
    if kat:
        return kat
    return ""