feat(vvt): per-vendor extraction + opt-out check + VVT table in email (V1)
When a known CMP (ePaaS, OneTrust) renders the cookie policy, we now
extract structured vendor records, probe their opt-out + privacy URLs,
score each vendor (0-100), and append a 'VVT-Vorschlag' table to the
compliance email — one row per vendor, sortable by compliance score.
consent-tester:
- DSIDiscoveryResult.cmp_payloads: surfaces raw CMP JSON to callers
- DSIDiscoveryResponse: new cmp_payloads field
- discover_dsi_documents sets cmp_payloads from cmp_capture
- cmp_library/{epaas,onetrust}.py: new extract_vendors(d) returning
list[dict] (one vendor record per provider)
backend:
- _fetch_text() now returns (text, cmp_payloads) tuple
- doc_entries store cmp_payloads per doc (mostly cookie)
- _autodiscover_missing forwards homepage payloads to the cookie entry
- New module vendor_extractor.py: dispatches ePaaS/OneTrust/generic
schemas; dedupes vendors across multiple payloads
- cookie_link_validator.py extended with validate_vendor_urls(vendors)
and score_vendors(vendors) — 0-100 score per vendor based on name,
purpose, country, opt-out reachable, privacy URL reachable, cookies
with names + expiry
- agent_doc_check_extras.build_vvt_table_html: renders the table
- Route appends VVT HTML after the provider list, before the
document-by-document report
- Response JSON gains cmp_vendors for future frontend rendering
Example for BMW: ~30 ePaaS providers → table with Name | Kategorie |
Sitz | Cookies | Opt-Out (✓/✗) | Privacy (✓/✗) | Score. Sorted by
score ascending so the worst-compliant vendors are at the top.
This commit is contained in:
@@ -293,6 +293,9 @@ class DSIDiscoveryResponse(BaseModel):
|
||||
languages_detected: list[str]
|
||||
errors: list[str]
|
||||
scanned_at: str
|
||||
# Raw CMP payloads captured during navigation (ePaaS, OneTrust, etc.).
|
||||
# Backend uses these to build the per-vendor compliance table.
|
||||
cmp_payloads: list[dict] = []
|
||||
|
||||
|
||||
@app.post("/dsi-discovery", response_model=DSIDiscoveryResponse)
|
||||
@@ -343,6 +346,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest):
|
||||
languages_detected=result.languages_detected,
|
||||
errors=result.errors,
|
||||
scanned_at=datetime.now(timezone.utc).isoformat(),
|
||||
cmp_payloads=result.cmp_payloads,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -67,3 +67,53 @@ def reconstruct(d: dict) -> str:
|
||||
parts.append(_clean(str(meta["persistencePurposeText"])))
|
||||
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def extract_vendors(d: dict) -> list[dict]:
    """Return structured vendor records from ePaaS policy JSON.

    One record is produced per dict entry of ``d["providers"]``, in payload
    order:

        {name, country, purpose, category, opt_out_url, privacy_policy_url,
         persistence, cookies: [{name, purpose, expiry, is_third_party}]}

    Cookies are taken from the top-level ``cookies`` array when one of its
    entries references the provider (via ``providerId``, ``provider``,
    ``vendorId`` or ``vendor``); otherwise the provider's own inline
    ``cookies`` list is used.

    Args:
        d: Parsed ePaaS policy payload.

    Returns:
        A list of vendor dicts. Missing, null or otherwise falsy fields are
        reported as empty strings; entries that are not dicts are skipped.
    """

    def first(src: dict, *keys: str) -> str:
        # First truthy value among *keys*, stringified. Falsy values (null,
        # "", 0) count as missing so a JSON null never turns into "None".
        for key in keys:
            value = src.get(key)
            if value:
                return str(value)
        return ""

    # ePaaS sometimes stores cookies in a separate 'cookies' array referenced
    # by provider id. If so, group them by that reference.
    cookies_by_provider: dict[str, list[dict]] = {}
    for c in d.get("cookies") or []:
        if not isinstance(c, dict):
            continue
        ref = first(c, "providerId", "provider", "vendorId", "vendor")
        if not ref:
            continue
        cookies_by_provider.setdefault(ref, []).append({
            "name": first(c, "name", "id"),
            "purpose": _clean(first(c, "purpose", "description")),
            "expiry": _clean(first(c, "expiry", "retention", "persistence")),
            "is_third_party": bool(c.get("isThirdParty")
                                   or c.get("third_party")),
        })

    out: list[dict] = []
    for p in d.get("providers") or []:
        if not isinstance(p, dict):
            continue

        # A cookie may reference its provider by id, vendorId or name, so try
        # every identifier the provider carries, not just the first one.
        identifiers = [str(p[k]) for k in ("id", "vendorId", "name") if p.get(k)]
        pid = identifiers[0] if identifiers else ""
        referenced = next(
            (cookies_by_provider[i] for i in identifiers
             if i in cookies_by_provider),
            None,
        )
        if referenced is not None:
            # Copy so two providers resolving to the same key do not share
            # one mutable list.
            cookies = list(referenced)
        else:
            # Inline cookies are always flagged as third-party here.
            cookies = [
                {
                    "name": first(c, "name"),
                    "purpose": _clean(first(c, "purpose")),
                    "expiry": _clean(first(c, "expiry", "persistence")),
                    "is_third_party": True,
                }
                for c in (p.get("cookies") or [])
                if isinstance(c, dict)
            ]

        out.append({
            "name": first(p, "name") or pid,
            "country": first(p, "country").strip(),
            "purpose": _clean(first(p, "purpose")),
            "category": first(p, "category").strip(),
            "opt_out_url": first(
                p, "optOutUrl", "optoutUrl", "opt_out_url").strip(),
            "privacy_policy_url": first(
                p, "policyUrl", "policy_url", "privacyPolicyUrl").strip(),
            "persistence": _clean(first(p, "persistencePurposeDescription")),
            "cookies": cookies,
        })
    return out
|
||||
|
||||
@@ -54,3 +54,61 @@ def reconstruct(d: dict) -> str:
|
||||
parts.append(line)
|
||||
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def extract_vendors(d: dict) -> list[dict]:
    """Return structured vendor records from OneTrust JSON.

    OneTrust groups cookies into 'Groups' (Strictly Necessary, Analytics,
    Marketing, etc). Every cookie is attributed to a vendor named by its
    ``Provider`` (falling back to ``Host``); all cookies sharing that name
    are collected into a single record.

    Args:
        d: Parsed OneTrust payload with a ``Groups`` (or ``groups``) list.

    Returns:
        One dict per vendor, in order of first appearance, with the keys
        ``name, country, purpose, category, opt_out_url, privacy_policy_url,
        persistence, cookies``. ``country``, ``opt_out_url`` and
        ``persistence`` are always empty strings for this schema. ``category``
        is the group in which the vendor was first seen.
    """
    # Insertion-ordered name -> record map: O(1) lookup for repeated
    # providers instead of rescanning the output list for every cookie.
    vendors: dict[str, dict] = {}

    groups = d.get("Groups") or d.get("groups") or []
    for g in groups:
        if not isinstance(g, dict):
            continue
        category = g.get("GroupName") or g.get("name") or ""
        for c in g.get("Cookies") or g.get("cookies") or []:
            if not isinstance(c, dict):
                continue

            # First non-blank candidate wins, so a whitespace-only Provider
            # still falls back to Host.
            provider = next(
                (name for name in (
                    str(c.get(key) or "").strip()
                    for key in ("Provider", "provider", "Host", "host"))
                 if name),
                "",
            )
            if not provider:
                continue

            description = c.get("description") or c.get("Description") or ""
            policy_url = str(
                c.get("PolicyUrl") or c.get("policyUrl") or "").strip()
            cookie_entry = {
                "name": c.get("Name") or c.get("name") or "",
                "purpose": _clean(str(description)),
                "expiry": _clean(str(c.get("Length") or c.get("expires") or "")),
                "is_third_party": bool(c.get("IsThirdParty")
                                       or c.get("isThirdParty")),
            }

            vendor = vendors.get(provider)
            if vendor is None:
                vendors[provider] = {
                    "name": provider,
                    "country": "",
                    # Vendor purpose comes from its first cookie, else from
                    # the enclosing group's description.
                    "purpose": _clean(str(description
                                          or g.get("GroupDescription") or "")),
                    "category": category,
                    "opt_out_url": "",
                    "privacy_policy_url": policy_url,
                    "persistence": "",
                    "cookies": [cookie_entry],
                }
                continue

            vendor["cookies"].append(cookie_entry)
            # Keep the first privacy URL seen for the vendor, even when it
            # only appears on a later cookie.
            if policy_url and not vendor["privacy_policy_url"]:
                vendor["privacy_policy_url"] = policy_url

    return list(vendors.values())
|
||||
|
||||
|
||||
_TAG_RE = __import__("re").compile(r"<[^>]+>")
|
||||
_WS_RE = __import__("re").compile(r"\s+")
|
||||
|
||||
|
||||
def _clean(text: str) -> str:
|
||||
no_tags = _TAG_RE.sub(" ", text or "")
|
||||
return _WS_RE.sub(" ", no_tags).strip()
|
||||
|
||||
@@ -168,6 +168,10 @@ class DSIDiscoveryResult:
|
||||
total_found: int = 0
|
||||
languages_detected: list[str] = field(default_factory=list)
|
||||
errors: list[str] = field(default_factory=list)
|
||||
# Raw CMP payloads captured during navigation (one per matched JSON).
|
||||
# Schema: [{"kind": str, "data": dict}, ...]
|
||||
# Backend uses these to build vendor records + run per-vendor checks.
|
||||
cmp_payloads: list[dict] = field(default_factory=list)
|
||||
|
||||
def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
|
||||
"""Check if text contains any DSI keyword. Returns (match, language)."""
|
||||
@@ -270,6 +274,10 @@ async def discover_dsi_documents(
|
||||
logger.info("PDF redirect detected: %s -> %s", url, final_url)
|
||||
# Return early — a PDF redirect means no HTML content to scan
|
||||
result.total_found = len(result.documents)
|
||||
result.cmp_payloads = [
|
||||
{"kind": kind, "data": data}
|
||||
for kind, data in cmp_capture.payloads
|
||||
]
|
||||
return result
|
||||
|
||||
# Step 1b: Try dismissing cookie consent banners before extraction.
|
||||
@@ -534,8 +542,11 @@ async def discover_dsi_documents(
|
||||
result.languages_detected = list(set(
|
||||
d.language for d in result.documents if d.language
|
||||
))
|
||||
logger.info("DSI discovery complete: %d documents found in %s",
|
||||
result.total_found, result.languages_detected)
|
||||
result.cmp_payloads = [
|
||||
{"kind": kind, "data": data} for kind, data in cmp_capture.payloads
|
||||
]
|
||||
logger.info("DSI discovery complete: %d documents found in %s, %d CMP payloads",
|
||||
result.total_found, result.languages_detected, len(result.cmp_payloads))
|
||||
return result
|
||||
|
||||
# Nav elements, not real documents
|
||||
|
||||
Reference in New Issue
Block a user