"""Extras for the agent doc-check email report.

Split out from agent_doc_check_report.py to keep both files under the
500-line hard cap.

Contains:
- build_scanned_urls_html (list of fetched URLs + cross-domain notice)
- build_provider_list_html (cookie banner + TCF vendor table)
"""

from __future__ import annotations

from urllib.parse import urlparse

# NOTE(review): the string literals in this module contain only text and
# whitespace (no tags), although the docstrings describe HTML output, and
# the vendor-table header text is appended after the vendor rows. This looks
# like markup that got lost -- confirm against the rendered email. The
# literal fragments (including the empty ones) are kept exactly as found so
# that their positions stay recognisable.

# Upper bound on vendor rows rendered into the email; the remainder is
# summarised in a single "... und N weitere" line.
_MAX_VENDOR_ROWS = 50

_WWW_PREFIX = "www."

# doc_type key -> human-readable (German) label shown in the report.
_DOC_TYPE_LABELS = {
    "dse": "Datenschutzerklaerung",
    "datenschutz": "Datenschutzerklaerung",
    "privacy": "Datenschutzerklaerung",
    "impressum": "Impressum",
    "agb": "AGB",
    "widerruf": "Widerrufsbelehrung",
    "cookie": "Cookie-Richtlinie",
    "avv": "Auftragsverarbeitung",
    "loeschkonzept": "Loeschkonzept",
    "dsfa": "Datenschutz-Folgenabschaetzung",
    "social_media": "Social Media Datenschutz",
    "nutzungsbedingungen": "Nutzungsbedingungen",
    "dsb": "DSB-Kontakt",
}

# Vendor category key -> German label.
_CATEGORY_LABELS = {
    "necessary": "Notwendig",
    "functional": "Funktional",
    "statistics": "Statistik",
    "marketing": "Marketing",
    "strictlyNecessary": "Notwendig",
    "advertising": "Marketing",
}


def build_scanned_urls_html(doc_entries: list[dict]) -> str:
    """Render the list of scanned URLs at the top of the report.

    Makes transparent which sources were actually fetched/analysed.
    Entries without a URL (text-only uploads) and repeated URLs are skipped.
    When the URLs span two or more domains, a cross-domain notice is
    appended (e.g. legal texts spread across bmw.de, bmwgroup.com,
    bmwgroup.jobs).

    Args:
        doc_entries: Dicts read via the keys ``url``, ``doc_type``,
            ``word_count`` and ``auto_discovered``.

    Returns:
        The rendered section, or ``""`` when no entry carries a URL.
    """
    # NOTE(review): url/label are interpolated unescaped; confirm that the
    # callers sanitise them before they reach the email.
    rows: list[str] = []
    seen: set[str] = set()
    domains: dict[str, list[str]] = {}  # netloc -> labels of docs found there

    for entry in doc_entries:
        url = (entry.get("url") or "").strip()
        if not url or url in seen:
            continue
        seen.add(url)

        label = _doc_type_label(entry.get("doc_type", ""))
        words = entry.get("word_count") or 0
        auto = entry.get("auto_discovered")

        try:
            netloc = urlparse(url).netloc.lower()
        except ValueError:
            # Malformed URL: still list the row, but leave it out of the
            # per-domain grouping (best effort).
            netloc = ""
        # Remove one leading "www." only. str.lstrip("www.") would strip any
        # run of the characters 'w' and '.', turning "web.de" into "eb.de".
        if netloc.startswith(_WWW_PREFIX):
            netloc = netloc[len(_WWW_PREFIX):]
        if netloc:
            domains.setdefault(netloc, []).append(label)

        badge = ('' 'auto-entdeckt') if auto else ""
        rows.append(
            f''
            f''
            f'{label}{badge}'
            f''
            f'{url}'
            f'{words} Woerter'
            f''
        )

    if not rows:
        return ""

    cross_domain_html = (
        _cross_domain_notice(domains) if len(domains) >= 2 else ""
    )
    return (
        '\n\n'
        '\n\n'
        f'Gepruefte Quellen ({len(rows)})\n\n'
        '' + "".join(rows) + '\n\n' + cross_domain_html + '\n\n'
    )


def _cross_domain_notice(domains: dict[str, list[str]]) -> str:
    """Render the warning shown when legal texts span multiple domains.

    Relevant for big corporate groups (BMW Group: bmw.de / bmwgroup.com /
    bmwgroup.jobs). Affects findability for data subjects and may indicate
    incomplete disclosure on the main site.

    Args:
        domains: Mapping of netloc to the document labels found there.

    Returns:
        The notice text followed by one item per domain (sorted by netloc),
        each listing that domain's de-duplicated, sorted labels.
    """
    items = []
    for netloc, labels in sorted(domains.items()):
        labels_str = ", ".join(sorted(set(labels)))
        items.append(
            f'\n\n{netloc} '
            f'→ {labels_str}\n\n'
        )
    return (
        '\n\n'
        'Hinweis: Rechtstexte verteilt auf '
        f'{len(domains)} Domains. '
        'Erschwert die Auffindbarkeit fuer Betroffene (Art. 12 Abs. 1 DSGVO — '
        'transparente Information). Pruefen Sie, ob alle Texte auch von der '
        'Hauptdomain aus klar verlinkt sind.'
        '\n\n' + "".join(items) + '\n\n'
    )


def _doc_type_label(doc_type: str) -> str:
    """Map a doc_type key to its display label.

    Kept local to avoid a circular import with
    agent_compliance_check_routes.

    Returns:
        The known label; otherwise the upper-cased key, or ``"Dokument"``
        when the key is empty.
    """
    fallback = doc_type.upper() if doc_type else "Dokument"
    return _DOC_TYPE_LABELS.get(doc_type, fallback)


def build_provider_list_html(
    banner_result: dict | None,
    vvt_entries: list[dict] | None,
) -> str:
    """Render the cookie banner result + TCF vendor table for the email.

    Sections:
    1. Banner summary (detected yes/no, provider, violations count)
    2. Vendor table: Name | Kategorie | Zweck | Drittland | Rechtsgrundlage

    Args:
        banner_result: Banner scan result, read via ``banner_detected``,
            ``banner_provider`` and ``banner_checks["violations"]``.
        vvt_entries: Vendor dicts, rendered by ``_render_vendor_row``; at
            most ``_MAX_VENDOR_ROWS`` rows are emitted.

    Returns:
        The rendered section, or ``""`` when both inputs are empty.
    """
    if not banner_result and not vvt_entries:
        return ""

    parts: list[str] = [
        '\n\n'
        '\n\n'
        'Cookie-Banner & Verarbeiter\n\n'
    ]

    if banner_result:
        detected = banner_result.get("banner_detected", False)
        provider = banner_result.get("banner_provider") or "unbekannt"
        # "banner_checks" may be present but None; `or {}` keeps .get() safe.
        checks = banner_result.get("banner_checks") or {}
        violations = checks.get("violations", [])
        # Accept either a list of violations or an already-computed count.
        if isinstance(violations, list):
            n_viol = len(violations)
        else:
            n_viol = int(violations or 0)
        # NOTE(review): status_color is not referenced by the literals below;
        # presumably meant for inline styling of the summary -- confirm.
        status_color = "#16a34a" if detected and n_viol == 0 else (
            "#d97706" if detected else "#6b7280"
        )
        parts.append(
            f'\n\n'
            f''
            f'Banner erkannt: {"Ja" if detected else "Nein"}'
            f' · Anbieter: {provider}'
            f' · Auffaelligkeiten: {n_viol}'
            f'\n\n'
        )

    vendors = vvt_entries or []
    if vendors:
        parts.append(
            f'\n\n'
            f'{len(vendors)} TCF-Verarbeiter ueber das Banner eingebunden:'
            f'\n\n'
            ''
            ''
            ''
            ''
            ''
            ''
            ''
            ''
        )
        for v in vendors[:_MAX_VENDOR_ROWS]:
            parts.append(_render_vendor_row(v))
        parts.append(
            '\n\nName\tKategorie\tZweck\tDrittland\tRechtsgrundlage\n\n'
        )
        if len(vendors) > _MAX_VENDOR_ROWS:
            parts.append(
                f'\n\n'
                f'... und {len(vendors) - _MAX_VENDOR_ROWS} weitere\n\n'
            )
    elif banner_result and banner_result.get("banner_detected"):
        parts.append(
            '\n\n'
            'Keine TCF-Verarbeiter erkannt (Banner nutzt kein TCF v2 Framework '
            'oder Vendor-Liste konnte nicht ausgelesen werden).\n\n'
        )

    parts.append('\n\n')
    return "".join(parts)


def _render_vendor_row(v: dict) -> str:
    """Render one vendor entry as a table row.

    Args:
        v: Vendor dict, read via ``name``, ``kategorie``, ``zweck_kurz``
            (falling back to the first two items of ``zweck``),
            ``drittland``, ``land`` and ``rechtsgrundlage``.

    Returns:
        The row text: name, category, purpose, third-country flag and a
        shortened legal basis.
    """
    name = v.get("name") or "Unbekannt"
    kategorie = _category_label(v.get("kategorie", ""))
    zweck = v.get("zweck_kurz") or ", ".join((v.get("zweck") or [])[:2])

    # Only the literal booleans count; anything else is reported as unknown.
    drittland = v.get("drittland")
    land = v.get("land") or ""
    if drittland is True:
        drittland_str = f'Ja ({land})' if land else 'Ja'
    elif drittland is False:
        drittland_str = f'Nein ({land})' if land else 'Nein'
    else:
        drittland_str = 'unbekannt'

    # `or ""` also covers an explicit None value, which would otherwise
    # raise TypeError in the substring checks below.
    rg = v.get("rechtsgrundlage") or ""
    if "Einwilligung" in rg:
        rg_short = "Einwilligung"
    elif "Berechtigtes" in rg:
        rg_short = "Berechtigtes Interesse"
    else:
        rg_short = rg[:40]

    return (
        f''
        f'{name}'
        f'{kategorie}'
        f'{zweck}'
        f'{drittland_str}'
        f'{rg_short}'
        f''
    )


def _category_label(kat: str) -> str:
    """Map a vendor category key to its German label.

    Returns:
        The known label; otherwise the key itself, or ``"—"`` when empty.
    """
    return _CATEGORY_LABELS.get(kat, kat or "—")


# The VVT table (grouped + P60/P60b pattern notice) was moved to
# vvt_table_renderer.py so that this file stays under the 500-LOC hard cap.
# Re-exported so existing callers (e.g. agent_compliance_check_routes) keep
# working unchanged.
from compliance.api.vvt_table_renderer import build_vvt_table_html  # noqa: E402,F401