# breakpilot-compliance/backend-compliance/compliance/api/agent_doc_check_extras.py

"""
Extras for the agent doc-check email report.

Split out from agent_doc_check_report.py to keep both files under the
500-line hard cap. Contains:
  - build_scanned_urls_html      (list of fetched URLs + cross-domain notice)
  - build_provider_list_html     (cookie banner + TCF vendor table)
"""

from __future__ import annotations


def build_scanned_urls_html(doc_entries: list[dict]) -> str:
    """Render the list of scanned URLs shown at the top of the report.

    Each entry contributes one table row (document label, linked URL, word
    count). Entries without a URL (text-only uploads) and duplicate URLs are
    skipped. When the URLs span two or more domains, a cross-domain notice
    is appended (e.g. BMW spreads its legal texts across bmw.de,
    bmwgroup.com and bmwgroup.jobs).

    Args:
        doc_entries: Dicts read via the keys ``url``, ``doc_type`` and
            ``word_count``. ``None`` is treated like an empty list.

    Returns:
        An HTML fragment, or ``""`` when no entry carries a URL.
    """
    from html import escape
    from urllib.parse import urlparse

    rows: list[str] = []
    seen: set[str] = set()
    domains: dict[str, list[str]] = {}  # netloc -> list of doc-type labels
    for entry in doc_entries or []:
        url = (entry.get("url") or "").strip()
        if not url or url in seen:
            continue
        seen.add(url)
        label = _doc_type_label(entry.get("doc_type", ""))
        words = entry.get("word_count") or 0

        # urlparse raises ValueError for malformed netlocs (e.g. unbalanced
        # IPv6 brackets); such URLs are still listed, just not grouped.
        try:
            netloc = urlparse(url).netloc.lower()
        except ValueError:
            netloc = ""
        # Remove a literal "www." prefix only. str.lstrip("www.") would strip
        # any leading 'w' or '.' characters and turn "web.de" into "eb.de".
        if netloc.startswith("www."):
            netloc = netloc[len("www."):]
        if netloc:
            # Raw (unescaped) label: escaping belongs at the render site.
            domains.setdefault(netloc, []).append(label)

        # URLs and labels come from scanned/uploaded content, so escape them
        # before interpolating into attribute and text positions.
        safe_url = escape(url)
        safe_label = escape(str(label))
        safe_words = escape(str(words))
        rows.append(
            f'<tr>'
            f'<td style="padding:3px 12px 3px 0;color:#475569;font-size:12px">{safe_label}</td>'
            f'<td style="padding:3px 12px 3px 0;font-size:12px;'
            f'font-family:ui-monospace,monospace;color:#1e293b;word-break:break-all">'
            f'<a href="{safe_url}" style="color:#2563eb;text-decoration:none">{safe_url}</a></td>'
            f'<td style="padding:3px 0;color:#94a3b8;font-size:11px;text-align:right;'
            f'white-space:nowrap">{safe_words} Woerter</td>'
            f'</tr>'
        )
    if not rows:
        return ""

    cross_domain_html = _cross_domain_notice(domains) if len(domains) >= 2 else ""

    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
        'background:#fafafa;border:1px solid #e5e7eb;border-radius:8px">'
        '<h3 style="margin:0 0 8px;font-size:14px;color:#334155">'
        f'Gepruefte Quellen ({len(rows)})</h3>'
        '<table style="width:100%;border-collapse:collapse">'
        + "".join(rows)
        + '</table>'
        + cross_domain_html
        + '</div>'
    )


def _cross_domain_notice(domains: dict[str, list[str]]) -> str:
    """Warning box when legal texts are spread across multiple domains.

    Relevant for big corporate groups (BMW Group: bmw.de / bmwgroup.com /
    bmwgroup.jobs). Affects findability for data subjects and may indicate
    incomplete disclosure on the main site.
    """
    items = []
    for netloc, labels in sorted(domains.items()):
        labels_str = ", ".join(sorted(set(labels)))
        items.append(
            f'<li style="margin-bottom:2px"><strong>{netloc}</strong> '
            f'<span style="color:#92400e;font-size:11px">&rarr; {labels_str}</span></li>'
        )
    return (
        '<div style="margin-top:12px;padding:10px 12px;background:#fffbeb;'
        'border-left:3px solid #f59e0b;border-radius:4px;font-size:12px;'
        'color:#78350f">'
        '<strong>Hinweis: Rechtstexte verteilt auf '
        f'{len(domains)} Domains.</strong> '
        'Erschwert die Auffindbarkeit fuer Betroffene (Art. 12 Abs. 1 DSGVO &mdash; '
        'transparente Information). Pruefen Sie, ob alle Texte auch von der '
        'Hauptdomain aus klar verlinkt sind.'
        '<ul style="margin:6px 0 0 16px;padding-left:0">'
        + "".join(items) +
        '</ul></div>'
    )


def _doc_type_label(doc_type: str) -> str:
    """Lazy resolver — avoids circular import with agent_compliance_check_routes."""
    labels = {
        "dse": "Datenschutzerklaerung",
        "datenschutz": "Datenschutzerklaerung",
        "privacy": "Datenschutzerklaerung",
        "impressum": "Impressum",
        "agb": "AGB",
        "widerruf": "Widerrufsbelehrung",
        "cookie": "Cookie-Richtlinie",
        "avv": "Auftragsverarbeitung",
        "loeschkonzept": "Loeschkonzept",
        "dsfa": "Datenschutz-Folgenabschaetzung",
        "social_media": "Social Media Datenschutz",
        "nutzungsbedingungen": "Nutzungsbedingungen",
        "dsb": "DSB-Kontakt",
    }
    return labels.get(doc_type, doc_type.upper() if doc_type else "Dokument")


def build_provider_list_html(
    banner_result: dict | None,
    vvt_entries: list[dict] | None,
) -> str:
    """Render the cookie banner result + TCF vendor table for the email.

    Sections:
      1. Banner summary (provider, violations count)
      2. Vendor table: Name | Kategorie | Zweck | Drittland | Rechtsgrundlage
    """
    if not banner_result and not vvt_entries:
        return ""

    parts: list[str] = [
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
        'background:#fafafa;border:1px solid #e5e7eb;border-radius:8px">'
        '<h3 style="margin:0 0 10px;font-size:14px;color:#334155">'
        'Cookie-Banner &amp; Verarbeiter</h3>'
    ]

    if banner_result:
        detected = banner_result.get("banner_detected", False)
        provider = banner_result.get("banner_provider") or "unbekannt"
        violations = banner_result.get("banner_checks", {}).get("violations", [])
        n_viol = len(violations) if isinstance(violations, list) else int(violations or 0)

        status_color = "#16a34a" if detected and n_viol == 0 else (
            "#d97706" if detected else "#6b7280"
        )
        parts.append(
            f'<div style="font-size:13px;color:#374151;margin-bottom:10px">'
            f'<span style="display:inline-block;width:8px;height:8px;'
            f'border-radius:50%;background:{status_color};margin-right:8px"></span>'
            f'Banner erkannt: <strong>{"Ja" if detected else "Nein"}</strong>'
            f' &nbsp;&middot;&nbsp; Anbieter: <strong>{provider}</strong>'
            f' &nbsp;&middot;&nbsp; Auffaelligkeiten: <strong>{n_viol}</strong>'
            f'</div>'
        )

    vendors = vvt_entries or []
    if vendors:
        parts.append(
            f'<div style="font-size:12px;color:#475569;margin:8px 0 6px">'
            f'<strong>{len(vendors)} TCF-Verarbeiter ueber das Banner eingebunden:</strong>'
            f'</div>'
            '<table style="width:100%;border-collapse:collapse;font-size:11px">'
            '<thead><tr style="background:#f1f5f9;color:#475569;text-align:left">'
            '<th style="padding:5px 8px">Name</th>'
            '<th style="padding:5px 8px">Kategorie</th>'
            '<th style="padding:5px 8px">Zweck</th>'
            '<th style="padding:5px 8px">Drittland</th>'
            '<th style="padding:5px 8px">Rechtsgrundlage</th>'
            '</tr></thead><tbody>'
        )
        for v in vendors[:50]:
            parts.append(_render_vendor_row(v))
        parts.append('</tbody></table>')
        if len(vendors) > 50:
            parts.append(
                f'<div style="font-size:11px;color:#94a3b8;margin-top:4px">'
                f'... und {len(vendors) - 50} weitere</div>'
            )
    elif banner_result and banner_result.get("banner_detected"):
        parts.append(
            '<div style="font-size:11px;color:#94a3b8">'
            'Keine TCF-Verarbeiter erkannt (Banner nutzt kein TCF v2 Framework '
            'oder Vendor-Liste konnte nicht ausgelesen werden).</div>'
        )

    parts.append('</div>')
    return "".join(parts)


def _render_vendor_row(v: dict) -> str:
    """Render one vendor dict as a ``<tr>`` of the vendor table.

    Keys read from ``v``: ``name``, ``kategorie``, ``zweck_kurz`` (preferred)
    or ``zweck`` (first two entries joined), ``drittland`` (True / False /
    anything else = unknown), ``land`` and ``rechtsgrundlage``.

    All vendor-supplied text is HTML-escaped before interpolation.

    Returns:
        A single ``<tr>...</tr>`` string with five cells.
    """
    from html import escape

    name = escape(str(v.get("name") or "Unbekannt"))
    kategorie = escape(str(_category_label(v.get("kategorie", ""))))

    zweck = v.get("zweck_kurz")
    if not zweck:
        purposes = v.get("zweck") or []
        # A plain string must not be sliced/joined character by character.
        if isinstance(purposes, str):
            purposes = [purposes]
        zweck = ", ".join(str(p) for p in purposes[:2])
    zweck = escape(str(zweck))

    drittland = v.get("drittland")
    land = escape(str(v.get("land") or ""))
    # Only the literal booleans count as a known answer; None or any other
    # value renders as "unbekannt".
    if drittland is True:
        drittland_str = (f'<span style="color:#dc2626">Ja ({land})</span>'
                         if land else '<span style="color:#dc2626">Ja</span>')
    elif drittland is False:
        drittland_str = (f'<span style="color:#16a34a">Nein ({land})</span>'
                         if land else '<span style="color:#16a34a">Nein</span>')
    else:
        drittland_str = '<span style="color:#94a3b8">unbekannt</span>'

    # "rechtsgrundlage" may be present but None; normalise to a string so
    # the substring checks below cannot raise TypeError.
    rg = str(v.get("rechtsgrundlage") or "")
    if "Einwilligung" in rg:
        rg_short = "Einwilligung"
    elif "Berechtigtes" in rg:
        rg_short = "Berechtigtes Interesse"
    else:
        rg_short = rg[:40]
    # Truncate first, escape second, so an entity is never cut in half.
    rg_short = escape(rg_short)

    return (
        f'<tr style="border-top:1px solid #e2e8f0">'
        f'<td style="padding:4px 8px;color:#1e293b">{name}</td>'
        f'<td style="padding:4px 8px;color:#475569">{kategorie}</td>'
        f'<td style="padding:4px 8px;color:#475569">{zweck}</td>'
        f'<td style="padding:4px 8px">{drittland_str}</td>'
        f'<td style="padding:4px 8px;color:#475569">{rg_short}</td>'
        f'</tr>'
    )


def _category_label(kat: str) -> str:
    return {
        "necessary": "Notwendig",
        "functional": "Funktional",
        "statistics": "Statistik",
        "marketing": "Marketing",
    }.get(kat, kat or "—")