feat(vvt): per-vendor extraction + opt-out check + VVT table in email (V1)

When a known CMP (ePaaS, OneTrust) renders the cookie policy, we now
extract structured vendor records, probe their opt-out + privacy URLs,
score each vendor (0-100), and append a 'VVT-Vorschlag' table to the
compliance email — one row per vendor, sortable by compliance score.

consent-tester:
- DSIDiscoveryResult.cmp_payloads: surfaces raw CMP JSON to callers
- DSIDiscoveryResponse: new cmp_payloads field
- discover_dsi_documents sets cmp_payloads from cmp_capture
- cmp_library/{epaas,onetrust}.py: new extract_vendors(d) returning
  list[VendorRecord]

backend:
- _fetch_text() now returns (text, cmp_payloads) tuple
- doc_entries store cmp_payloads per doc (mostly cookie)
- _autodiscover_missing forwards homepage payloads to the cookie entry
- New module vendor_extractor.py: dispatches ePaaS/OneTrust/generic
  schemas; dedupes vendors across multiple payloads
- cookie_link_validator.py extended with validate_vendor_urls(vendors)
  and score_vendors(vendors) — 0-100 score per vendor based on name,
  purpose, country, opt-out reachable, privacy URL reachable, cookies
  with names + expiry
- agent_doc_check_extras.build_vvt_table_html: renders the table
- Route appends VVT HTML after the provider list, before the
  document-by-document report
- Response JSON gains cmp_vendors for future frontend rendering

Example for BMW: ~30 ePaaS providers → table with Name | Kategorie |
Sitz | Cookies | Opt-Out (✓/✗) | Privacy (✓/✗) | Score. Sorted by
score ascending so the least-compliant vendors are at the top.
This commit is contained in:
Benjamin Admin
2026-05-17 09:50:11 +02:00
parent c9c0fb5965
commit ea4dbb223f
8 changed files with 592 additions and 16 deletions
@@ -120,6 +120,127 @@ async def validate_links(links: list[LinkCheck]) -> list[LinkCheck]:
return links
# ── Per-vendor link validation ──────────────────────────────────────
async def validate_vendor_urls(vendors: list[dict]) -> list[dict]:
    """Probe the opt-out and privacy-policy URLs of each vendor.

    Each vendor dict is mutated in place. For every URL that is present
    (``opt_out_url`` -> prefix ``opt_out``, ``privacy_policy_url`` ->
    prefix ``privacy``) the following keys are written:

        vendor["<prefix>_status"] = int   HTTP status, 0 = no response
        vendor["<prefix>_ok"]     = bool  True for 2xx/3xx
        vendor["<prefix>_error"]  = str   only when no response was obtained

    Vendors without the corresponding URL are left untouched.

    If the whole batch exceeds ``_BATCH_TIMEOUT``, every probe that has not
    finished is recorded as unreachable (status 0, ok False, error
    "batch timeout") so callers never mistake an unverified link for a
    working one.

    Args:
        vendors: Vendor records as dicts; may be empty.

    Returns:
        The same ``vendors`` list object that was passed in.
    """
    if not vendors:
        return vendors

    # Flatten into one list of (vendor, url, kind) tuples; the vendor dict
    # itself is the back-reference the probe writes its result into.
    probes: list[tuple[dict, str, str]] = []
    for v in vendors:
        if v.get("opt_out_url"):
            probes.append((v, v["opt_out_url"], "opt_out"))
        if v.get("privacy_policy_url"):
            probes.append((v, v["privacy_policy_url"], "privacy"))
    if not probes:
        return vendors

    # Indices of probes that have not recorded a result yet. A probe that is
    # cancelled by the batch timeout never reaches its bookkeeping line, so
    # whatever is left in here afterwards was not verified.
    unfinished: set[int] = set(range(len(probes)))

    sem = asyncio.Semaphore(_MAX_CONCURRENT)
    async with httpx.AsyncClient(
        timeout=_PER_URL_TIMEOUT,
        follow_redirects=True,
        headers={"User-Agent": "BreakPilot-LinkChecker/1.0"},
    ) as client:

        async def probe(idx: int, vendor: dict, url: str, kind: str) -> None:
            async with sem:
                try:
                    resp = await client.head(url)
                    # Some servers reject HEAD outright; retry with GET
                    # before declaring the link broken.
                    if resp.status_code in (405, 403):
                        resp = await client.get(url)
                except Exception as e:
                    # Deliberately broad: any failure (DNS, TLS, invalid
                    # URL, per-URL timeout) just means "unreachable" here.
                    vendor[f"{kind}_status"] = 0
                    vendor[f"{kind}_ok"] = False
                    vendor[f"{kind}_error"] = str(e)[:60]
                else:
                    vendor[f"{kind}_status"] = resp.status_code
                    vendor[f"{kind}_ok"] = 200 <= resp.status_code < 400
                    # Drop an error left over from an earlier validation
                    # run so it cannot contradict the fresh result.
                    vendor.pop(f"{kind}_error", None)
            unfinished.discard(idx)

        try:
            await asyncio.wait_for(
                asyncio.gather(
                    *[probe(i, v, u, k) for i, (v, u, k) in enumerate(probes)]
                ),
                timeout=_BATCH_TIMEOUT,
            )
        except asyncio.TimeoutError:
            logger.warning("vendor-link batch timeout (%d probes)", len(probes))
            # wait_for cancelled the remaining probes; record them as
            # unreachable instead of leaving their result keys unset.
            for idx in sorted(unfinished):
                vendor, _url, kind = probes[idx]
                vendor[f"{kind}_status"] = 0
                vendor[f"{kind}_ok"] = False
                vendor[f"{kind}_error"] = "batch timeout"
    return vendors
def score_vendors(vendors: list[dict]) -> list[dict]:
    """Attach a 0-100 compliance score and a list of flags to each vendor.

    Every vendor dict is mutated in place and receives two keys:
    ``compliance_score`` (int) and ``compliance_flags`` (list of str).

    Weights (100 in total):
        name 20, purpose 15, country 10,
        opt-out URL 25, privacy-policy URL 15, cookie disclosure 15.

    A URL that is present but whose ``*_ok`` value is explicitly ``False``
    earns 5 points; a URL whose ``*_ok`` is anything else (including
    missing) earns the full weight.

    Args:
        vendors: Vendor records as dicts.

    Returns:
        The same ``vendors`` list object that was passed in.
    """
    # (field, weight, flag raised when the field is empty/missing)
    presence_rules = (
        ("name", 20, "no_name"),
        ("purpose", 15, "no_purpose"),
        ("country", 10, "no_country"),  # relevant for 3rd-country transfers
    )
    # (url field, probe-result field, weight, flag if missing, flag if broken)
    link_rules = (
        ("opt_out_url", "opt_out_ok", 25, "no_opt_out_url", "broken_opt_out"),
        (
            "privacy_policy_url",
            "privacy_ok",
            15,
            "no_privacy_url",
            "broken_privacy_url",
        ),
    )
    cookie_weight = 15

    for vendor in vendors:
        earned = 0
        possible = 0
        issues: list[str] = []

        for field, weight, missing_flag in presence_rules:
            possible += weight
            if vendor.get(field):
                earned += weight
            else:
                issues.append(missing_flag)

        for url_field, ok_field, weight, missing_flag, broken_flag in link_rules:
            possible += weight
            if not vendor.get(url_field):
                issues.append(missing_flag)
            elif vendor.get(ok_field) is False:
                issues.append(broken_flag)
                earned += 5  # partial credit: a link was at least provided
            else:
                earned += weight

        # Cookie disclosure: full credit needs at least one named cookie and
        # at least one cookie with an expiry; names alone earn partial credit.
        possible += cookie_weight
        cookie_list = vendor.get("cookies") or []
        if not cookie_list:
            issues.append("no_cookies_listed")
        else:
            named_count = len([c for c in cookie_list if c.get("name")])
            expiry_count = len([c for c in cookie_list if c.get("expiry")])
            if named_count < 1:
                issues.append("cookies_no_names")
            elif expiry_count < 1:
                earned += 8
                issues.append("cookies_no_expiry")
            else:
                earned += cookie_weight

        vendor["compliance_score"] = (
            round(earned / possible * 100) if possible else 0
        )
        vendor["compliance_flags"] = issues
    return vendors
# ── CheckItem rendering ──────────────────────────────────────────────
def build_check_items(validated: list[LinkCheck]) -> list[dict]: