feat(vvt): per-vendor extraction + opt-out check + VVT table in email (V1)

When a known CMP (ePaaS, OneTrust) renders the cookie policy, we now extract structured vendor records, probe their opt-out + privacy URLs, score each vendor (0-100), and append a 'VVT-Vorschlag' table to the compliance email — one row per vendor, sorted by compliance score.

consent-tester:
- DSIDiscoveryResult.cmp_payloads: surfaces raw CMP JSON to callers
- DSIDiscoveryResponse: new cmp_payloads field
- discover_dsi_documents sets cmp_payloads from cmp_capture
- cmp_library/{epaas,onetrust}.py: new extract_vendors(d) returning list[VendorRecord]

backend:
- _fetch_text() now returns a (text, cmp_payloads) tuple
- doc_entries store cmp_payloads per doc (mostly cookie)
- _autodiscover_missing forwards homepage payloads to the cookie entry
- New module vendor_extractor.py: dispatches ePaaS/OneTrust/generic schemas; dedupes vendors across multiple payloads
- cookie_link_validator.py extended with validate_vendor_urls(vendors) and score_vendors(vendors) — 0-100 score per vendor based on name, purpose, country, opt-out reachable, privacy URL reachable, cookies with names + expiry
- agent_doc_check_extras.build_vvt_table_html: renders the table
- Route appends VVT HTML after the provider list, before the document-by-document report
- Response JSON gains cmp_vendors for future frontend rendering

Example for BMW: ~30 ePaaS providers → table with Name | Kategorie | Sitz | Cookies | Opt-Out (✓/✗) | Privacy (✓/✗) | Score. Sorted by score ascending so the worst-compliant vendors are at the top.
2026-05-17 09:50:11 +02:00
parent c9c0fb5965
commit ea4dbb223f
8 changed files with 592 additions and 16 deletions
@@ -171,12 +171,13 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
            pct = int(1 + (i / n_docs) * 29)
            _update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct)
            text = doc.text
+            cmp_payloads: list[dict] = []
            if not text and doc.url:
                url_key = doc.url.strip().rstrip("/").lower()
                if url_key in url_text_cache:
                    text = url_text_cache[url_key]
                else:
-                    text = await _fetch_text(doc.url, doc_type=doc.doc_type)
+                    text, cmp_payloads = await _fetch_text(doc.url, doc_type=doc.doc_type)
                    if text:
                        url_text_cache[url_key] = text
            if text:
@@ -188,6 +189,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
                "word_count": len(text.split()) if text else 0,
                "auto_discovered": False,
                "discovery_attempted": False,
+                "cmp_payloads": cmp_payloads,
            })

        # Step 1a-bis: AUTO-DISCOVERY. For each canonical doc_type the user
@@ -367,14 +369,42 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
            build_scanned_urls_html,
            build_provider_list_html,
        )
+        from .agent_doc_check_extras import build_vvt_table_html
+
+        # Extract structured vendor records from any CMP payloads captured
+        # for the cookie doc (BMW ePaaS, OneTrust, etc.), validate their
+        # opt-out + privacy URLs concurrently, score each entry.
+        cmp_vendors: list[dict] = []
+        try:
+            from compliance.services.vendor_extractor import (
+                extract_vendors_from_payloads,
+            )
+            from compliance.services.cookie_link_validator import (
+                validate_vendor_urls, score_vendors,
+            )
+            cookie_payloads = []
+            for e in doc_entries:
+                if e.get("doc_type") == "cookie" and e.get("cmp_payloads"):
+                    cookie_payloads.extend(e["cmp_payloads"])
+            if cookie_payloads:
+                cmp_vendors = extract_vendors_from_payloads(cookie_payloads)
+                if cmp_vendors:
+                    logger.info("VVT: %d vendors extracted, validating links",
+                                len(cmp_vendors))
+                    cmp_vendors = await validate_vendor_urls(cmp_vendors)
+                    cmp_vendors = score_vendors(cmp_vendors)
+        except Exception as e:
+            logger.warning("VVT vendor extraction skipped: %s", e)
+
        summary_html = build_management_summary(results)
        scanned_html = build_scanned_urls_html(doc_entries)
        providers_html = build_provider_list_html(banner_result, vvt_entries)
+        vvt_html = build_vvt_table_html(cmp_vendors)
        report_html = build_html_report(results, None)
        profile_html = _build_profile_html(profile)
        full_html = (
            summary_html + scanned_html + profile_html
-            + providers_html + report_html
+            + providers_html + vvt_html + report_html
        )

        # Step 6: Send email — derive site name primarily from entered URL.
@@ -404,6 +434,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
                "tcf_vendor_count": len(tcf_vendors),
            } if banner_result else None,
            "tcf_vendors": vvt_entries if tcf_vendors else [],
+            "cmp_vendors": cmp_vendors,
            "total_documents": len(results),
            "total_findings": total_findings,
            "email_status": email_result.get("status", "failed"),
@@ -428,15 +459,13 @@ def _update(check_id: str, msg: str, pct: int | None = None):
        job["progress_pct"] = max(0, min(100, int(pct)))


-async def _fetch_text(url: str, doc_type: str = "") -> str:
+async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
    """Fetch text from URL via consent-tester, with HTTP fallback.

-    1. Try consent-tester (Playwright) — handles JS-heavy SPAs
-    2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages
-
-    doc_type controls how aggressively we follow sub-links — cookie/dse
-    pages prefer self-extract only (CMP capture is authoritative); legal/
-    imprint pages need to follow sub-pages (Versicherungsvermittler etc).
+    Returns (text, cmp_payloads). cmp_payloads is the raw CMP JSON captured
+    during navigation (ePaaS, OneTrust, …) — empty when no CMP fired or
+    HTTP fallback was used. Backend turns payloads into structured vendor
+    records for the VVT table in the email.
    """
    # 1. Consent-tester (Playwright-based, full JS rendering).
    # max_documents depends on doc_type:
@@ -456,7 +485,9 @@ async def _fetch_text(url: str, doc_type: str = "") -> str:
                timeout=120.0,
            )
            if resp.status_code == 200:
-                docs = resp.json().get("documents", [])
+                payload = resp.json()
+                docs = payload.get("documents", [])
+                cmp_payloads = payload.get("cmp_payloads") or []
                if docs:
                    texts = []
                    for doc in docs:
@@ -468,7 +499,7 @@ async def _fetch_text(url: str, doc_type: str = "") -> str:
                        if len(texts) > 1:
                            logger.info("Merged %d docs from %s (%d words)",
                                        len(texts), url, len(merged.split()))
-                        return merged
+                        return merged, cmp_payloads
    except Exception as e:
        logger.warning("Consent-tester fetch failed for %s: %s", url, e)

@@ -486,11 +517,11 @@ async def _fetch_text(url: str, doc_type: str = "") -> str:
                text = _re.sub(r"\s+", " ", text).strip()
                if len(text.split()) > 100:
                    logger.info("HTTP fallback for %s: %d words", url, len(text.split()))
-                    return text
+                    return text, []
    except Exception as e:
        logger.warning("HTTP fallback failed for %s: %s", url, e)

-    return ""
+    return "", []


 async def _autodiscover_missing(
@@ -559,11 +590,15 @@ async def _autodiscover_missing(
            if resp.status_code != 200:
                logger.warning("auto-discovery: HTTP %d for %s", resp.status_code, base)
                discovered: list[dict] = []
+                disc_payloads: list[dict] = []
            else:
-                discovered = resp.json().get("documents", [])
+                disc_body = resp.json()
+                discovered = disc_body.get("documents", [])
+                disc_payloads = disc_body.get("cmp_payloads") or []
    except Exception as e:
        logger.warning("auto-discovery failed for %s: %s", base, e)
        discovered = []
+        disc_payloads = []

    # Classify each discovered doc into a canonical doc_type
    by_type: dict[str, dict] = {}
@@ -585,6 +620,7 @@ async def _autodiscover_missing(
        new_entry: dict = {
            "doc_type": dt, "url": "", "text": "", "word_count": 0,
            "auto_discovered": False, "discovery_attempted": True,
+            "cmp_payloads": [],
        }
        d = by_type.get(dt)
        if d:
@@ -594,6 +630,11 @@ async def _autodiscover_missing(
                new_entry["url"] = d.get("url", "")
                new_entry["word_count"] = len(full.split())
                new_entry["auto_discovered"] = True
+                # Auto-discovery happens on the HOMEPAGE — any CMP payload
+                # captured at that level likely belongs to the cookie page
+                # (CMP widget loaded site-wide). Attach to 'cookie' entry.
+                if dt == "cookie" and disc_payloads:
+                    new_entry["cmp_payloads"] = disc_payloads
                doc_texts[dt] = full
                filled += 1
                logger.info(
@@ -229,4 +229,105 @@ def _category_label(kat: str) -> str:
        "functional": "Funktional",
        "statistics": "Statistik",
        "marketing": "Marketing",
+        "strictlyNecessary": "Notwendig",
+        "advertising": "Marketing",
    }.get(kat, kat or "—")
+
+
def build_vvt_table_html(vendors: list[dict]) -> str:
    """Render the per-vendor VVT-style table for the email report.

    One row per vendor. Columns: Name | Kategorie | Sitz | Cookies |
    Opt-Out (Status) | Privacy (Status) | Compliance-Score. Rows are sorted
    by compliance score ascending, so the lowest-scoring vendors come first.

    Vendors are expected to come from
    vendor_extractor.extract_vendors_from_payloads and have already been
    scored by cookie_link_validator.score_vendors. Because those records are
    built from CMP payloads, every free-text value (name, category, country,
    flags) is HTML-escaped before it is placed into the markup.

    Args:
        vendors: Vendor dicts. Keys read here: ``name``, ``category``,
            ``country``, ``cookies``, ``compliance_score``,
            ``compliance_flags``, ``opt_out_url``/``opt_out_ok``/
            ``opt_out_status`` and ``privacy_policy_url``/``privacy_ok``/
            ``privacy_status``. All are optional.

    Returns:
        The HTML fragment for the table, or ``""`` when ``vendors`` is empty.
    """
    if not vendors:
        return ""

    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    from html import escape

    ranked = sorted(vendors, key=_vendor_score)
    rows: list[str] = []
    for v in ranked:
        name = escape(str(v.get("name") or "Unbekannt"))
        category = escape(str(_category_label(v.get("category", ""))))
        country = escape(str(v.get("country") or "—"))
        n_cookies = len(v.get("cookies") or [])
        score = _vendor_score(v)
        flags = v.get("compliance_flags") or []

        # The badge helper returns ready-made HTML, so it is not escaped again.
        opt_status = _link_status_badge(
            v.get("opt_out_url"), v.get("opt_out_ok"),
            v.get("opt_out_status"),
        )
        privacy_status = _link_status_badge(
            v.get("privacy_policy_url"), v.get("privacy_ok"),
            v.get("privacy_status"),
        )

        score_color = ("#16a34a" if score >= 80 else
                       "#d97706" if score >= 50 else "#dc2626")
        flag_str = ""
        if flags:
            # Only the first four flags are shown to keep the cell compact.
            shown_flags = ", ".join(escape(str(f)) for f in flags[:4])
            flag_str = (
                f'<div style="font-size:10px;color:#94a3b8;margin-top:2px">'
                f'{shown_flags}</div>'
            )
        rows.append(
            f'<tr style="border-top:1px solid #e2e8f0">'
            f'<td style="padding:6px 8px;color:#1e293b;font-size:11px">'
            f'{name}{flag_str}</td>'
            f'<td style="padding:6px 8px;color:#475569;font-size:11px">{category}</td>'
            f'<td style="padding:6px 8px;color:#475569;font-size:11px">{country}</td>'
            f'<td style="padding:6px 8px;text-align:center;color:#475569;font-size:11px">'
            f'{n_cookies}</td>'
            f'<td style="padding:6px 8px;text-align:center">{opt_status}</td>'
            f'<td style="padding:6px 8px;text-align:center">{privacy_status}</td>'
            f'<td style="padding:6px 8px;text-align:right;font-weight:600;'
            f'color:{score_color};font-size:11px">{score}%</td>'
            f'</tr>'
        )

    n_total = len(ranked)
    n_critical = sum(1 for v in ranked if _vendor_score(v) < 50)
    summary = (
        f"{n_total} Anbieter erfasst"
        + (f", <strong style=\"color:#dc2626\">{n_critical} unter 50%</strong>"
           if n_critical else " — alle ueber 50%")
    )

    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:760px;margin:0 auto 16px;padding:12px 16px;'
        'background:#fafafa;border:1px solid #e5e7eb;border-radius:8px">'
        '<h3 style="margin:0 0 4px;font-size:14px;color:#334155">'
        'VVT-Vorschlag: Drittanbieter aus Cookie-Richtlinie</h3>'
        f'<p style="margin:0 0 10px;font-size:11px;color:#6b7280">{summary}. '
        'Sortiert nach Compliance-Score (niedrig zuerst — diese Eintraege '
        'pruefen).</p>'
        '<table style="width:100%;border-collapse:collapse;font-size:11px">'
        '<thead><tr style="background:#f1f5f9;color:#475569;text-align:left">'
        '<th style="padding:5px 8px">Name</th>'
        '<th style="padding:5px 8px">Kategorie</th>'
        '<th style="padding:5px 8px">Sitz</th>'
        '<th style="padding:5px 8px;text-align:center">Cookies</th>'
        '<th style="padding:5px 8px;text-align:center">Opt-Out</th>'
        '<th style="padding:5px 8px;text-align:center">Privacy</th>'
        '<th style="padding:5px 8px;text-align:right">Score</th>'
        '</tr></thead><tbody>'
        + "".join(rows)
        + '</tbody></table></div>'
    )


def _vendor_score(vendor: dict) -> int:
    """Return the vendor's ``compliance_score`` as an int; 0 if missing or invalid."""
    try:
        return int(vendor.get("compliance_score") or 0)
    except (TypeError, ValueError, OverflowError):
        # Unscored or malformed entries sort to the top as "needs review".
        return 0
+
+
+def _link_status_badge(url: str | None, ok: bool | None, status: int | None) -> str:
+    if not url:
+        return ('<span style="color:#dc2626;font-size:11px" title="Kein Link">'
+                '&#10007;</span>')
+    if ok:
+        return ('<span style="color:#16a34a;font-size:11px" '
+                f'title="HTTP {status}">&#10003;</span>')
+    status_str = str(status) if status else "?"
+    return ('<span style="color:#dc2626;font-size:11px" '
+            f'title="HTTP {status_str}">&#10007; ({status_str})</span>')