feat(vvt): per-vendor extraction + opt-out check + VVT table in email (V1)
When a known CMP (consent management platform: ePaaS, OneTrust) renders
the cookie policy, we now extract structured vendor records, probe their
opt-out + privacy URLs, score each vendor (0-100), and append a
'VVT-Vorschlag' table to the compliance email — one row per vendor,
sorted by compliance score.
consent-tester:
- DSIDiscoveryResult.cmp_payloads: surfaces raw CMP JSON to callers
- DSIDiscoveryResponse: new cmp_payloads field
- discover_dsi_documents sets cmp_payloads from cmp_capture
- cmp_library/{epaas,onetrust}.py: new extract_vendors(d) returning
list[VendorRecord]
backend:
- _fetch_text() now returns a (text, cmp_payloads) tuple
- doc_entries store cmp_payloads per doc (in practice mostly non-empty
for the cookie doc)
- _autodiscover_missing forwards homepage payloads to the cookie entry
- New module vendor_extractor.py: dispatches ePaaS/OneTrust/generic
schemas; dedupes vendors across multiple payloads
- cookie_link_validator.py extended with validate_vendor_urls(vendors)
and score_vendors(vendors) — 0-100 score per vendor based on name,
purpose, country, opt-out reachable, privacy URL reachable, cookies
with names + expiry
- agent_doc_check_extras.build_vvt_table_html: renders the table
- Route appends VVT HTML after the provider list, before the
document-by-document report
- Response JSON gains cmp_vendors for future frontend rendering
Example for BMW: ~30 ePaaS providers → table with Name | Kategorie |
Sitz | Cookies | Opt-Out (✓/✗) | Privacy (✓/✗) | Score. Sorted by
score ascending so the worst-compliant vendors are at the top.
This commit is contained in:
@@ -171,12 +171,13 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
pct = int(1 + (i / n_docs) * 29)
|
||||
_update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct)
|
||||
text = doc.text
|
||||
cmp_payloads: list[dict] = []
|
||||
if not text and doc.url:
|
||||
url_key = doc.url.strip().rstrip("/").lower()
|
||||
if url_key in url_text_cache:
|
||||
text = url_text_cache[url_key]
|
||||
else:
|
||||
text = await _fetch_text(doc.url, doc_type=doc.doc_type)
|
||||
text, cmp_payloads = await _fetch_text(doc.url, doc_type=doc.doc_type)
|
||||
if text:
|
||||
url_text_cache[url_key] = text
|
||||
if text:
|
||||
@@ -188,6 +189,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
"word_count": len(text.split()) if text else 0,
|
||||
"auto_discovered": False,
|
||||
"discovery_attempted": False,
|
||||
"cmp_payloads": cmp_payloads,
|
||||
})
|
||||
|
||||
# Step 1a-bis: AUTO-DISCOVERY. For each canonical doc_type the user
|
||||
@@ -367,14 +369,42 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
build_scanned_urls_html,
|
||||
build_provider_list_html,
|
||||
)
|
||||
from .agent_doc_check_extras import build_vvt_table_html
|
||||
|
||||
# Extract structured vendor records from any CMP payloads captured
|
||||
# for the cookie doc (BMW ePaaS, OneTrust, etc.), validate their
|
||||
# opt-out + privacy URLs concurrently, score each entry.
|
||||
cmp_vendors: list[dict] = []
|
||||
try:
|
||||
from compliance.services.vendor_extractor import (
|
||||
extract_vendors_from_payloads,
|
||||
)
|
||||
from compliance.services.cookie_link_validator import (
|
||||
validate_vendor_urls, score_vendors,
|
||||
)
|
||||
cookie_payloads = []
|
||||
for e in doc_entries:
|
||||
if e.get("doc_type") == "cookie" and e.get("cmp_payloads"):
|
||||
cookie_payloads.extend(e["cmp_payloads"])
|
||||
if cookie_payloads:
|
||||
cmp_vendors = extract_vendors_from_payloads(cookie_payloads)
|
||||
if cmp_vendors:
|
||||
logger.info("VVT: %d vendors extracted, validating links",
|
||||
len(cmp_vendors))
|
||||
cmp_vendors = await validate_vendor_urls(cmp_vendors)
|
||||
cmp_vendors = score_vendors(cmp_vendors)
|
||||
except Exception as e:
|
||||
logger.warning("VVT vendor extraction skipped: %s", e)
|
||||
|
||||
summary_html = build_management_summary(results)
|
||||
scanned_html = build_scanned_urls_html(doc_entries)
|
||||
providers_html = build_provider_list_html(banner_result, vvt_entries)
|
||||
vvt_html = build_vvt_table_html(cmp_vendors)
|
||||
report_html = build_html_report(results, None)
|
||||
profile_html = _build_profile_html(profile)
|
||||
full_html = (
|
||||
summary_html + scanned_html + profile_html
|
||||
+ providers_html + report_html
|
||||
+ providers_html + vvt_html + report_html
|
||||
)
|
||||
|
||||
# Step 6: Send email — derive site name primarily from entered URL.
|
||||
@@ -404,6 +434,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
"tcf_vendor_count": len(tcf_vendors),
|
||||
} if banner_result else None,
|
||||
"tcf_vendors": vvt_entries if tcf_vendors else [],
|
||||
"cmp_vendors": cmp_vendors,
|
||||
"total_documents": len(results),
|
||||
"total_findings": total_findings,
|
||||
"email_status": email_result.get("status", "failed"),
|
||||
@@ -428,15 +459,13 @@ def _update(check_id: str, msg: str, pct: int | None = None):
|
||||
job["progress_pct"] = max(0, min(100, int(pct)))
|
||||
|
||||
|
||||
async def _fetch_text(url: str, doc_type: str = "") -> str:
|
||||
async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
|
||||
"""Fetch text from URL via consent-tester, with HTTP fallback.
|
||||
|
||||
1. Try consent-tester (Playwright) — handles JS-heavy SPAs
|
||||
2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages
|
||||
|
||||
doc_type controls how aggressively we follow sub-links — cookie/dse
|
||||
pages prefer self-extract only (CMP capture is authoritative); legal/
|
||||
imprint pages need to follow sub-pages (Versicherungsvermittler etc).
|
||||
Returns (text, cmp_payloads). cmp_payloads is the raw CMP JSON captured
|
||||
during navigation (ePaaS, OneTrust, …) — empty when no CMP fired or
|
||||
HTTP fallback was used. Backend turns payloads into structured vendor
|
||||
records for the VVT table in the email.
|
||||
"""
|
||||
# 1. Consent-tester (Playwright-based, full JS rendering).
|
||||
# max_documents depends on doc_type:
|
||||
@@ -456,7 +485,9 @@ async def _fetch_text(url: str, doc_type: str = "") -> str:
|
||||
timeout=120.0,
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
docs = resp.json().get("documents", [])
|
||||
payload = resp.json()
|
||||
docs = payload.get("documents", [])
|
||||
cmp_payloads = payload.get("cmp_payloads") or []
|
||||
if docs:
|
||||
texts = []
|
||||
for doc in docs:
|
||||
@@ -468,7 +499,7 @@ async def _fetch_text(url: str, doc_type: str = "") -> str:
|
||||
if len(texts) > 1:
|
||||
logger.info("Merged %d docs from %s (%d words)",
|
||||
len(texts), url, len(merged.split()))
|
||||
return merged
|
||||
return merged, cmp_payloads
|
||||
except Exception as e:
|
||||
logger.warning("Consent-tester fetch failed for %s: %s", url, e)
|
||||
|
||||
@@ -486,11 +517,11 @@ async def _fetch_text(url: str, doc_type: str = "") -> str:
|
||||
text = _re.sub(r"\s+", " ", text).strip()
|
||||
if len(text.split()) > 100:
|
||||
logger.info("HTTP fallback for %s: %d words", url, len(text.split()))
|
||||
return text
|
||||
return text, []
|
||||
except Exception as e:
|
||||
logger.warning("HTTP fallback failed for %s: %s", url, e)
|
||||
|
||||
return ""
|
||||
return "", []
|
||||
|
||||
|
||||
async def _autodiscover_missing(
|
||||
@@ -559,11 +590,15 @@ async def _autodiscover_missing(
|
||||
if resp.status_code != 200:
|
||||
logger.warning("auto-discovery: HTTP %d for %s", resp.status_code, base)
|
||||
discovered: list[dict] = []
|
||||
disc_payloads: list[dict] = []
|
||||
else:
|
||||
discovered = resp.json().get("documents", [])
|
||||
disc_body = resp.json()
|
||||
discovered = disc_body.get("documents", [])
|
||||
disc_payloads = disc_body.get("cmp_payloads") or []
|
||||
except Exception as e:
|
||||
logger.warning("auto-discovery failed for %s: %s", base, e)
|
||||
discovered = []
|
||||
disc_payloads = []
|
||||
|
||||
# Classify each discovered doc into a canonical doc_type
|
||||
by_type: dict[str, dict] = {}
|
||||
@@ -585,6 +620,7 @@ async def _autodiscover_missing(
|
||||
new_entry: dict = {
|
||||
"doc_type": dt, "url": "", "text": "", "word_count": 0,
|
||||
"auto_discovered": False, "discovery_attempted": True,
|
||||
"cmp_payloads": [],
|
||||
}
|
||||
d = by_type.get(dt)
|
||||
if d:
|
||||
@@ -594,6 +630,11 @@ async def _autodiscover_missing(
|
||||
new_entry["url"] = d.get("url", "")
|
||||
new_entry["word_count"] = len(full.split())
|
||||
new_entry["auto_discovered"] = True
|
||||
# Auto-discovery happens on the HOMEPAGE — any CMP payload
|
||||
# captured at that level likely belongs to the cookie page
|
||||
# (CMP widget loaded site-wide). Attach to 'cookie' entry.
|
||||
if dt == "cookie" and disc_payloads:
|
||||
new_entry["cmp_payloads"] = disc_payloads
|
||||
doc_texts[dt] = full
|
||||
filled += 1
|
||||
logger.info(
|
||||
|
||||
Reference in New Issue
Block a user