feat(vvt): per-vendor extraction + opt-out check + VVT table in email (V1)

When a known CMP (ePaaS, OneTrust) renders the cookie policy, we now
extract structured vendor records, probe their opt-out + privacy URLs,
score each vendor (0-100), and append a 'VVT-Vorschlag' table to the
compliance email — one row per vendor, sortable by compliance score.

consent-tester:
- DSIDiscoveryResult.cmp_payloads: surfaces raw CMP JSON to callers
- DSIDiscoveryResponse: new cmp_payloads field
- discover_dsi_documents sets cmp_payloads from cmp_capture
- cmp_library/{epaas,onetrust}.py: new extract_vendors(d) returning
  list[dict] (one plain-dict vendor record per provider)

backend:
- _fetch_text() now returns (text, cmp_payloads) tuple
- doc_entries store cmp_payloads per doc (mostly cookie)
- _autodiscover_missing forwards homepage payloads to the cookie entry
- New module vendor_extractor.py: dispatches ePaaS/OneTrust/generic
  schemas; dedupes vendors across multiple payloads
- cookie_link_validator.py extended with validate_vendor_urls(vendors)
  and score_vendors(vendors) — 0-100 score per vendor based on name,
  purpose, country, opt-out reachable, privacy URL reachable, cookies
  with names + expiry
- agent_doc_check_extras.build_vvt_table_html: renders the table
- Route appends VVT HTML after the provider list, before the
  document-by-document report
- Response JSON gains cmp_vendors for future frontend rendering

Example for BMW: ~30 ePaaS providers → table with Name | Kategorie |
Sitz | Cookies | Opt-Out (✓/✗) | Privacy (✓/✗) | Score. Sorted by
score ascending so the worst-compliant vendors are at the top.
This commit is contained in:
Benjamin Admin
2026-05-17 09:50:11 +02:00
parent c9c0fb5965
commit ea4dbb223f
8 changed files with 592 additions and 16 deletions
+4
View File
@@ -293,6 +293,9 @@ class DSIDiscoveryResponse(BaseModel):
languages_detected: list[str]
errors: list[str]
scanned_at: str
# Raw CMP payloads captured during navigation (ePaaS, OneTrust, etc.).
# Backend uses these to build the per-vendor compliance table.
cmp_payloads: list[dict] = []
@app.post("/dsi-discovery", response_model=DSIDiscoveryResponse)
@@ -343,6 +346,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest):
languages_detected=result.languages_detected,
errors=result.errors,
scanned_at=datetime.now(timezone.utc).isoformat(),
cmp_payloads=result.cmp_payloads,
)
@@ -67,3 +67,53 @@ def reconstruct(d: dict) -> str:
parts.append(_clean(str(meta["persistencePurposeText"])))
return "\n".join(parts)
def extract_vendors(d: dict) -> list[dict]:
    """Return structured vendor records from ePaaS policy JSON.

    Each returned record has the keys ``name``, ``country``, ``purpose``,
    ``category``, ``opt_out_url``, ``privacy_policy_url``, ``persistence``
    and ``cookies``. Every cookie is a dict with the keys ``name``,
    ``purpose``, ``expiry`` and ``is_third_party``.

    Cookies come from the top-level ``cookies`` array when entries there
    reference the provider (via ``providerId``, ``provider``, ``vendorId``
    or ``vendor``); otherwise the provider's own inline ``cookies`` list is
    used.

    The payload is raw third-party JSON, so malformed parts are tolerated
    instead of raising: a non-dict payload yields ``[]``, non-dict provider
    or cookie entries are skipped, ``null`` values become ``""`` (never the
    literal text ``"None"``), and non-string scalar values are converted to
    text before being stripped.

    Args:
        d: Parsed ePaaS policy JSON.

    Returns:
        One record per entry of ``d["providers"]``, in payload order.
    """
    if not isinstance(d, dict):
        return []

    def text(src: dict, *keys: str) -> str:
        """Return the first truthy value among *keys* as stripped text."""
        for key in keys:
            value = src.get(key)
            if value:
                return str(value).strip()
        return ""

    # ePaaS sometimes stores cookies in a separate 'cookies' array referenced
    # by provider id. If so, group them by that id.
    cookies_by_provider: dict[str, list[dict]] = {}
    for c in d.get("cookies") or []:
        if not isinstance(c, dict):
            continue
        ref = (c.get("providerId") or c.get("provider")
               or c.get("vendorId") or c.get("vendor"))
        if not ref:
            continue
        cookies_by_provider.setdefault(str(ref), []).append({
            "name": c.get("name") or c.get("id") or "",
            "purpose": _clean(text(c, "purpose", "description")),
            "expiry": _clean(text(c, "expiry", "retention", "persistence")),
            "is_third_party": bool(c.get("isThirdParty")
                                   or c.get("third_party")),
        })

    out: list[dict] = []
    for p in d.get("providers") or []:
        if not isinstance(p, dict):
            continue
        pid = str(p.get("id") or p.get("vendorId") or p.get("name") or "")
        # Copy so two providers sharing an id never alias the same list.
        cookies = list(cookies_by_provider.get(pid, []))
        if not cookies:
            # Fall back to cookies listed inline on the provider. These are
            # flagged third-party unconditionally, as before.
            cookies = [
                {
                    "name": c.get("name") or "",
                    "purpose": _clean(text(c, "purpose")),
                    "expiry": _clean(text(c, "expiry", "persistence")),
                    "is_third_party": True,
                }
                for c in p.get("cookies") or []
                if isinstance(c, dict)
            ]
        out.append({
            "name": p.get("name") or pid or "",
            "country": text(p, "country"),
            "purpose": _clean(text(p, "purpose")),
            "category": text(p, "category"),
            "opt_out_url": text(p, "optOutUrl", "optoutUrl", "opt_out_url"),
            "privacy_policy_url": text(p, "policyUrl", "policy_url",
                                       "privacyPolicyUrl"),
            "persistence": _clean(text(p, "persistencePurposeDescription")),
            "cookies": cookies,
        })
    return out
@@ -54,3 +54,61 @@ def reconstruct(d: dict) -> str:
parts.append(line)
return "\n".join(parts)
def extract_vendors(d: dict) -> list[dict]:
    """Return structured vendor records from OneTrust JSON.

    The payload is read as a list of groups (``Groups``/``groups``), each
    holding a list of cookies (``Cookies``/``cookies``). Cookies are merged
    into one vendor record per provider, where the provider is the cookie's
    ``Provider``/``provider`` value or, failing that, its ``Host``/``host``.
    Cookies without either are skipped.

    Each record has the keys ``name``, ``country``, ``purpose``,
    ``category``, ``opt_out_url``, ``privacy_policy_url``, ``persistence``
    and ``cookies``. ``country``, ``opt_out_url`` and ``persistence`` are
    always empty here because this extractor reads no such fields. The
    vendor's ``purpose`` and ``category`` come from the first cookie/group
    in which the provider appears; ``purpose`` falls back to the group's
    ``GroupDescription`` when that cookie has no description.
    ``privacy_policy_url`` is the first non-empty policy URL found on any
    of the provider's cookies.

    Non-dict payloads, groups or cookies are skipped instead of raising.

    Args:
        d: Parsed OneTrust JSON.

    Returns:
        Vendor records in order of first appearance.
    """
    if not isinstance(d, dict):
        return []

    # Keyed by provider name; dicts keep insertion order, so the values are
    # already in first-seen order and no linear search is needed per cookie.
    vendors: dict[str, dict] = {}
    for g in d.get("Groups") or d.get("groups") or []:
        if not isinstance(g, dict):
            continue
        category = g.get("GroupName") or g.get("name") or ""
        for c in g.get("Cookies") or g.get("cookies") or []:
            if not isinstance(c, dict):
                continue
            provider = str(c.get("Provider") or c.get("provider")
                           or c.get("Host") or c.get("host") or "").strip()
            if not provider:
                continue
            description = _clean(str(c.get("description")
                                     or c.get("Description") or ""))
            policy_url = str(c.get("PolicyUrl")
                             or c.get("policyUrl") or "").strip()
            cookie_entry = {
                "name": c.get("Name") or c.get("name") or "",
                "purpose": description,
                "expiry": _clean(str(c.get("Length")
                                     or c.get("expires") or "")),
                "is_third_party": bool(c.get("IsThirdParty")
                                       or c.get("isThirdParty")),
            }
            vendor = vendors.get(provider)
            if vendor is None:
                vendors[provider] = {
                    "name": provider,
                    "country": "",
                    "purpose": description or _clean(
                        str(g.get("GroupDescription") or "")),
                    "category": category,
                    "opt_out_url": "",
                    "privacy_policy_url": policy_url,
                    "persistence": "",
                    "cookies": [cookie_entry],
                }
                continue
            vendor["cookies"].append(cookie_entry)
            if not vendor["privacy_policy_url"]:
                # A later cookie may carry the URL the first one lacked.
                vendor["privacy_policy_url"] = policy_url
    return list(vendors.values())
_TAG_RE = __import__("re").compile(r"<[^>]+>")
_WS_RE = __import__("re").compile(r"\s+")
def _clean(text: str) -> str:
no_tags = _TAG_RE.sub(" ", text or "")
return _WS_RE.sub(" ", no_tags).strip()
+13 -2
View File
@@ -168,6 +168,10 @@ class DSIDiscoveryResult:
total_found: int = 0
languages_detected: list[str] = field(default_factory=list)
errors: list[str] = field(default_factory=list)
# Raw CMP payloads captured during navigation (one per matched JSON).
# Schema: [{"kind": str, "data": dict}, ...] (discover_dsi_documents sets no "url" key)
# Backend uses these to build vendor records + run per-vendor checks.
cmp_payloads: list[dict] = field(default_factory=list)
def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
"""Check if text contains any DSI keyword. Returns (match, language)."""
@@ -270,6 +274,10 @@ async def discover_dsi_documents(
logger.info("PDF redirect detected: %s -> %s", url, final_url)
# Return early — a PDF redirect means no HTML content to scan
result.total_found = len(result.documents)
result.cmp_payloads = [
{"kind": kind, "data": data}
for kind, data in cmp_capture.payloads
]
return result
# Step 1b: Try dismissing cookie consent banners before extraction.
@@ -534,8 +542,11 @@ async def discover_dsi_documents(
result.languages_detected = list(set(
d.language for d in result.documents if d.language
))
logger.info("DSI discovery complete: %d documents found in %s",
result.total_found, result.languages_detected)
result.cmp_payloads = [
{"kind": kind, "data": data} for kind, data in cmp_capture.payloads
]
logger.info("DSI discovery complete: %d documents found in %s, %d CMP payloads",
result.total_found, result.languages_detected, len(result.cmp_payloads))
return result
# Nav elements, not real documents