"""
Extras for the agent doc-check email report.
Split out from agent_doc_check_report.py to keep both files under the
500-line hard cap. Contains:
- build_scanned_urls_html (list of fetched URLs + cross-domain notice)
- build_provider_list_html (cookie banner + TCF vendor table)
"""
from __future__ import annotations
def build_scanned_urls_html(doc_entries: list[dict]) -> str:
"""Render the list of scanned URLs at the top of the report.
Makes it transparent to the GF which sources were actually fetched/analysed.
Skips empty URLs (text-only uploads). Adds a cross-domain warning when
legal texts are distributed across multiple domains (e.g. BMW spreads
across bmw.de, bmwgroup.com, bmwgroup.jobs).
"""
from urllib.parse import urlparse
rows: list[str] = []
seen: set[str] = set()
domains: dict[str, list[str]] = {} # netloc -> list of doc_types
for entry in doc_entries:
url = (entry.get("url") or "").strip()
if not url or url in seen:
continue
seen.add(url)
label = _doc_type_label(entry.get("doc_type", ""))
words = entry.get("word_count") or 0
auto = entry.get("auto_discovered")
try:
netloc = urlparse(url).netloc.lower().lstrip("www.")
if netloc:
domains.setdefault(netloc, []).append(label)
except Exception:
pass
badge = (''
'auto-entdeckt') if auto else ""
rows.append(
f'
'
'
'
'Cookie-Banner & Verarbeiter
'
]
if banner_result:
detected = banner_result.get("banner_detected", False)
provider = banner_result.get("banner_provider") or "unbekannt"
violations = banner_result.get("banner_checks", {}).get("violations", [])
n_viol = len(violations) if isinstance(violations, list) else int(violations or 0)
status_color = "#16a34a" if detected and n_viol == 0 else (
"#d97706" if detected else "#6b7280"
)
parts.append(
f'
'
f''
f'Banner erkannt: {"Ja" if detected else "Nein"}'
f' · Anbieter: {provider}'
f' · Auffaelligkeiten: {n_viol}'
f'
'
)
vendors = vvt_entries or []
if vendors:
parts.append(
f'
'
f'{len(vendors)} TCF-Verarbeiter ueber das Banner eingebunden:'
f'
'
'
'
''
'| Name | '
'Kategorie | '
'Zweck | '
'Drittland | '
'Rechtsgrundlage | '
'
'
)
for v in vendors[:50]:
parts.append(_render_vendor_row(v))
parts.append('
')
if len(vendors) > 50:
parts.append(
f'
'
f'... und {len(vendors) - 50} weitere
'
)
elif banner_result and banner_result.get("banner_detected"):
parts.append(
'
'
'Keine TCF-Verarbeiter erkannt (Banner nutzt kein TCF v2 Framework '
'oder Vendor-Liste konnte nicht ausgelesen werden).
'
)
parts.append('
')
return "".join(parts)
def _render_vendor_row(v: dict) -> str:
name = v.get("name") or "Unbekannt"
kategorie = _category_label(v.get("kategorie", ""))
zweck = v.get("zweck_kurz") or ", ".join((v.get("zweck") or [])[:2])
drittland = v.get("drittland")
land = v.get("land") or ""
if drittland is True:
drittland_str = (f'