feat(vvt): per-vendor extraction + opt-out check + VVT table in email (V1)

When a known CMP (ePaaS, OneTrust) renders the cookie policy, we now
extract structured vendor records, probe their opt-out + privacy URLs,
score each vendor (0-100), and append a 'VVT-Vorschlag' table to the
compliance email — one row per vendor, sortable by compliance score.

consent-tester:
- DSIDiscoveryResult.cmp_payloads: surfaces raw CMP JSON to callers
- DSIDiscoveryResponse: new cmp_payloads field
- discover_dsi_documents sets cmp_payloads from cmp_capture
- cmp_library/{epaas,onetrust}.py: new extract_vendors(d) returning
  list[VendorRecord]

backend:
- _fetch_text() now returns (text, cmp_payloads) tuple
- doc_entries store cmp_payloads per doc (mostly cookie)
- _autodiscover_missing forwards homepage payloads to the cookie entry
- New module vendor_extractor.py: dispatches ePaaS/OneTrust/generic
  schemas; dedupes vendors across multiple payloads
- cookie_link_validator.py extended with validate_vendor_urls(vendors)
  and score_vendors(vendors) — 0-100 score per vendor based on name,
  purpose, country, opt-out reachable, privacy URL reachable, cookies
  with names + expiry
- agent_doc_check_extras.build_vvt_table_html: renders the table
- Route appends VVT HTML after the provider list, before the
  document-by-document report
- Response JSON gains cmp_vendors for future frontend rendering

Example for BMW: ~30 ePaaS providers → table with Name | Kategorie |
Sitz | Cookies | Opt-Out (✓/✗) | Privacy (✓/✗) | Score. Sorted by
score ascending so the worst-compliant vendors are at the top.
This commit is contained in:
Benjamin Admin
2026-05-17 09:50:11 +02:00
parent c9c0fb5965
commit ea4dbb223f
8 changed files with 592 additions and 16 deletions
@@ -171,12 +171,13 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
pct = int(1 + (i / n_docs) * 29)
_update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct)
text = doc.text
cmp_payloads: list[dict] = []
if not text and doc.url:
url_key = doc.url.strip().rstrip("/").lower()
if url_key in url_text_cache:
text = url_text_cache[url_key]
else:
text = await _fetch_text(doc.url, doc_type=doc.doc_type)
text, cmp_payloads = await _fetch_text(doc.url, doc_type=doc.doc_type)
if text:
url_text_cache[url_key] = text
if text:
@@ -188,6 +189,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
"word_count": len(text.split()) if text else 0,
"auto_discovered": False,
"discovery_attempted": False,
"cmp_payloads": cmp_payloads,
})
# Step 1a-bis: AUTO-DISCOVERY. For each canonical doc_type the user
@@ -367,14 +369,42 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
build_scanned_urls_html,
build_provider_list_html,
)
from .agent_doc_check_extras import build_vvt_table_html
# Extract structured vendor records from any CMP payloads captured
# for the cookie doc (BMW ePaaS, OneTrust, etc.), validate their
# opt-out + privacy URLs concurrently, score each entry.
cmp_vendors: list[dict] = []
try:
from compliance.services.vendor_extractor import (
extract_vendors_from_payloads,
)
from compliance.services.cookie_link_validator import (
validate_vendor_urls, score_vendors,
)
cookie_payloads = []
for e in doc_entries:
if e.get("doc_type") == "cookie" and e.get("cmp_payloads"):
cookie_payloads.extend(e["cmp_payloads"])
if cookie_payloads:
cmp_vendors = extract_vendors_from_payloads(cookie_payloads)
if cmp_vendors:
logger.info("VVT: %d vendors extracted, validating links",
len(cmp_vendors))
cmp_vendors = await validate_vendor_urls(cmp_vendors)
cmp_vendors = score_vendors(cmp_vendors)
except Exception as e:
logger.warning("VVT vendor extraction skipped: %s", e)
summary_html = build_management_summary(results)
scanned_html = build_scanned_urls_html(doc_entries)
providers_html = build_provider_list_html(banner_result, vvt_entries)
vvt_html = build_vvt_table_html(cmp_vendors)
report_html = build_html_report(results, None)
profile_html = _build_profile_html(profile)
full_html = (
summary_html + scanned_html + profile_html
+ providers_html + report_html
+ providers_html + vvt_html + report_html
)
# Step 6: Send email — derive site name primarily from entered URL.
@@ -404,6 +434,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
"tcf_vendor_count": len(tcf_vendors),
} if banner_result else None,
"tcf_vendors": vvt_entries if tcf_vendors else [],
"cmp_vendors": cmp_vendors,
"total_documents": len(results),
"total_findings": total_findings,
"email_status": email_result.get("status", "failed"),
@@ -428,15 +459,13 @@ def _update(check_id: str, msg: str, pct: int | None = None):
job["progress_pct"] = max(0, min(100, int(pct)))
async def _fetch_text(url: str, doc_type: str = "") -> str:
async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
"""Fetch text from URL via consent-tester, with HTTP fallback.
1. Try consent-tester (Playwright) — handles JS-heavy SPAs
2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages
doc_type controls how aggressively we follow sub-links — cookie/dse
pages prefer self-extract only (CMP capture is authoritative); legal/
imprint pages need to follow sub-pages (Versicherungsvermittler etc).
Returns (text, cmp_payloads). cmp_payloads is the raw CMP JSON captured
during navigation (ePaaS, OneTrust, …) — empty when no CMP fired or
HTTP fallback was used. Backend turns payloads into structured vendor
records for the VVT table in the email.
"""
# 1. Consent-tester (Playwright-based, full JS rendering).
# max_documents depends on doc_type:
@@ -456,7 +485,9 @@ async def _fetch_text(url: str, doc_type: str = "") -> str:
timeout=120.0,
)
if resp.status_code == 200:
docs = resp.json().get("documents", [])
payload = resp.json()
docs = payload.get("documents", [])
cmp_payloads = payload.get("cmp_payloads") or []
if docs:
texts = []
for doc in docs:
@@ -468,7 +499,7 @@ async def _fetch_text(url: str, doc_type: str = "") -> str:
if len(texts) > 1:
logger.info("Merged %d docs from %s (%d words)",
len(texts), url, len(merged.split()))
return merged
return merged, cmp_payloads
except Exception as e:
logger.warning("Consent-tester fetch failed for %s: %s", url, e)
@@ -486,11 +517,11 @@ async def _fetch_text(url: str, doc_type: str = "") -> str:
text = _re.sub(r"\s+", " ", text).strip()
if len(text.split()) > 100:
logger.info("HTTP fallback for %s: %d words", url, len(text.split()))
return text
return text, []
except Exception as e:
logger.warning("HTTP fallback failed for %s: %s", url, e)
return ""
return "", []
async def _autodiscover_missing(
@@ -559,11 +590,15 @@ async def _autodiscover_missing(
if resp.status_code != 200:
logger.warning("auto-discovery: HTTP %d for %s", resp.status_code, base)
discovered: list[dict] = []
disc_payloads: list[dict] = []
else:
discovered = resp.json().get("documents", [])
disc_body = resp.json()
discovered = disc_body.get("documents", [])
disc_payloads = disc_body.get("cmp_payloads") or []
except Exception as e:
logger.warning("auto-discovery failed for %s: %s", base, e)
discovered = []
disc_payloads = []
# Classify each discovered doc into a canonical doc_type
by_type: dict[str, dict] = {}
@@ -585,6 +620,7 @@ async def _autodiscover_missing(
new_entry: dict = {
"doc_type": dt, "url": "", "text": "", "word_count": 0,
"auto_discovered": False, "discovery_attempted": True,
"cmp_payloads": [],
}
d = by_type.get(dt)
if d:
@@ -594,6 +630,11 @@ async def _autodiscover_missing(
new_entry["url"] = d.get("url", "")
new_entry["word_count"] = len(full.split())
new_entry["auto_discovered"] = True
# Auto-discovery happens on the HOMEPAGE — any CMP payload
# captured at that level likely belongs to the cookie page
# (CMP widget loaded site-wide). Attach to 'cookie' entry.
if dt == "cookie" and disc_payloads:
new_entry["cmp_payloads"] = disc_payloads
doc_texts[dt] = full
filled += 1
logger.info(
@@ -229,4 +229,105 @@ def _category_label(kat: str) -> str:
"functional": "Funktional",
"statistics": "Statistik",
"marketing": "Marketing",
"strictlyNecessary": "Notwendig",
"advertising": "Marketing",
}.get(kat, kat or "")
def build_vvt_table_html(vendors: list[dict]) -> str:
    """Render the per-vendor VVT-style table for the email report.

    One row per vendor. Columns: Name | Kategorie | Sitz | Cookies |
    Opt-Out (Status) | Privacy (Status) | Compliance-Score. Rows are sorted
    by ``compliance_score`` ascending, so the weakest entries come first.

    Vendor fields originate from third-party CMP JSON, so every text value
    is HTML-escaped before it is interpolated into the markup.

    Args:
        vendors: Vendor dicts. Keys read here: ``name``, ``category``,
            ``country``, ``cookies``, ``compliance_score``,
            ``compliance_flags``, ``opt_out_url``/``opt_out_ok``/
            ``opt_out_status`` and ``privacy_policy_url``/``privacy_ok``/
            ``privacy_status``.

    Returns:
        The HTML fragment, or ``""`` when ``vendors`` is empty or ``None``.
    """
    from html import escape

    if not vendors:
        return ""

    def _score_of(vendor: dict) -> int:
        # A missing, None or non-numeric score is treated as 0 instead of
        # crashing the whole report.
        try:
            return int(vendor.get("compliance_score") or 0)
        except (TypeError, ValueError):
            return 0

    vendors = sorted(vendors, key=_score_of)
    rows: list[str] = []
    for v in vendors:
        name = escape(str(v.get("name") or "Unbekannt"))
        category = escape(str(_category_label(v.get("category", "")) or ""))
        country = escape(str(v.get("country") or ""))
        cookies = v.get("cookies") or []
        n_cookies = len(cookies)
        score = _score_of(v)
        flags = v.get("compliance_flags") or []
        opt_status = _link_status_badge(
            v.get("opt_out_url"), v.get("opt_out_ok"),
            v.get("opt_out_status"),
        )
        privacy_status = _link_status_badge(
            v.get("privacy_policy_url"), v.get("privacy_ok"),
            v.get("privacy_status"),
        )
        score_color = ("#16a34a" if score >= 80 else
                       "#d97706" if score >= 50 else "#dc2626")
        flag_str = ""
        if flags:
            # Only the first four flags are shown to keep the row compact.
            shown_flags = escape(", ".join(str(f) for f in flags[:4]))
            flag_str = (
                f'<div style="font-size:10px;color:#94a3b8;margin-top:2px">'
                f'{shown_flags}</div>'
            )
        rows.append(
            f'<tr style="border-top:1px solid #e2e8f0">'
            f'<td style="padding:6px 8px;color:#1e293b;font-size:11px">'
            f'{name}{flag_str}</td>'
            f'<td style="padding:6px 8px;color:#475569;font-size:11px">{category}</td>'
            f'<td style="padding:6px 8px;color:#475569;font-size:11px">{country}</td>'
            f'<td style="padding:6px 8px;text-align:center;color:#475569;font-size:11px">'
            f'{n_cookies}</td>'
            f'<td style="padding:6px 8px;text-align:center">{opt_status}</td>'
            f'<td style="padding:6px 8px;text-align:center">{privacy_status}</td>'
            f'<td style="padding:6px 8px;text-align:right;font-weight:600;'
            f'color:{score_color};font-size:11px">{score}%</td>'
            f'</tr>'
        )

    n_total = len(vendors)
    n_critical = sum(1 for v in vendors if _score_of(v) < 50)
    summary = (
        f"{n_total} Anbieter erfasst"
        + (f", <strong style=\"color:#dc2626\">{n_critical} unter 50%</strong>"
           if n_critical else " — alle ueber 50%")
    )
    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:760px;margin:0 auto 16px;padding:12px 16px;'
        'background:#fafafa;border:1px solid #e5e7eb;border-radius:8px">'
        '<h3 style="margin:0 0 4px;font-size:14px;color:#334155">'
        'VVT-Vorschlag: Drittanbieter aus Cookie-Richtlinie</h3>'
        f'<p style="margin:0 0 10px;font-size:11px;color:#6b7280">{summary}. '
        'Sortiert nach Compliance-Score (niedrig zuerst — diese Eintraege '
        'pruefen).</p>'
        '<table style="width:100%;border-collapse:collapse;font-size:11px">'
        '<thead><tr style="background:#f1f5f9;color:#475569;text-align:left">'
        '<th style="padding:5px 8px">Name</th>'
        '<th style="padding:5px 8px">Kategorie</th>'
        '<th style="padding:5px 8px">Sitz</th>'
        '<th style="padding:5px 8px;text-align:center">Cookies</th>'
        '<th style="padding:5px 8px;text-align:center">Opt-Out</th>'
        '<th style="padding:5px 8px;text-align:center">Privacy</th>'
        '<th style="padding:5px 8px;text-align:right">Score</th>'
        '</tr></thead><tbody>'
        + "".join(rows)
        + '</tbody></table></div>'
    )
def _link_status_badge(url: str | None, ok: bool | None, status: int | None) -> str:
if not url:
return ('<span style="color:#dc2626;font-size:11px" title="Kein Link">'
'&#10007;</span>')
if ok:
return ('<span style="color:#16a34a;font-size:11px" '
f'title="HTTP {status}">&#10003;</span>')
status_str = str(status) if status else "?"
return ('<span style="color:#dc2626;font-size:11px" '
f'title="HTTP {status_str}">&#10007; ({status_str})</span>')
@@ -120,6 +120,127 @@ async def validate_links(links: list[LinkCheck]) -> list[LinkCheck]:
return links
# ── Per-vendor link validation ──────────────────────────────────────
async def validate_vendor_urls(vendors: list[dict]) -> list[dict]:
    """Probe the opt-out and privacy URLs of each vendor concurrently.

    Mutates each vendor that has the corresponding URL:
        vendor["opt_out_status"] = int  (0 = unreachable, 2xx/3xx = ok)
        vendor["opt_out_ok"]     = bool
        vendor["privacy_status"] = int
        vendor["privacy_ok"]     = bool
    On a failed probe ``vendor["<kind>_error"]`` holds a short reason.

    Every URL that was scheduled for probing ends up with a status/ok pair,
    including probes that were still pending when the batch timeout fired.
    Without that, a timed-out vendor would carry no ``*_ok`` key at all and
    be indistinguishable from a vendor whose link was never checked.

    Args:
        vendors: Vendor dicts; ``opt_out_url`` and ``privacy_policy_url``
            are read, the keys listed above are written.

    Returns:
        The same ``vendors`` list object (mutated in place).
    """
    if not vendors:
        return vendors

    # Flatten into one list of (vendor, url, kind) tuples so all links of
    # all vendors share one concurrency limit and one batch timeout.
    probes: list[tuple[dict, str, str]] = []
    for v in vendors:
        if v.get("opt_out_url"):
            probes.append((v, v["opt_out_url"], "opt_out"))
        if v.get("privacy_policy_url"):
            probes.append((v, v["privacy_policy_url"], "privacy"))
    if not probes:
        return vendors

    sem = asyncio.Semaphore(_MAX_CONCURRENT)
    async with httpx.AsyncClient(
        timeout=_PER_URL_TIMEOUT,
        follow_redirects=True,
        headers={"User-Agent": "BreakPilot-LinkChecker/1.0"},
    ) as client:
        async def probe(vendor: dict, url: str, kind: str) -> None:
            async with sem:
                try:
                    resp = await client.head(url)
                    # Retry with GET when the server rejects HEAD outright.
                    if resp.status_code in (405, 403):
                        resp = await client.get(url)
                    vendor[f"{kind}_status"] = resp.status_code
                    vendor[f"{kind}_ok"] = 200 <= resp.status_code < 400
                except Exception as e:
                    vendor[f"{kind}_status"] = 0
                    vendor[f"{kind}_ok"] = False
                    vendor[f"{kind}_error"] = str(e)[:60]

        try:
            await asyncio.wait_for(
                asyncio.gather(*[probe(v, u, k) for v, u, k in probes]),
                timeout=_BATCH_TIMEOUT,
            )
        except asyncio.TimeoutError:
            logger.warning("vendor-link batch timeout (%d probes)", len(probes))
            # wait_for cancelled the unfinished probes; cancellation is not
            # caught by `except Exception` above, so record them here.
            for vendor, _url, kind in probes:
                if f"{kind}_ok" not in vendor:
                    vendor[f"{kind}_status"] = 0
                    vendor[f"{kind}_ok"] = False
                    vendor[f"{kind}_error"] = "batch timeout"
    return vendors
def score_vendors(vendors: list[dict]) -> list[dict]:
    """Attach a 0-100 compliance score and a list of flags to every vendor.

    Weights (sum 100): name 20, purpose 15, country 10, opt-out link 25,
    privacy-policy link 15, cookie disclosure 15. A link that is present
    but whose ``*_ok`` value is exactly ``False`` earns 5 points; a link
    whose ``*_ok`` is absent or ``None`` earns the full weight. Cookies
    earn 15 when at least one has a name and at least one has an expiry,
    8 when only names are present.

    Args:
        vendors: Vendor dicts; each gets ``compliance_score`` and
            ``compliance_flags`` written in place.

    Returns:
        The same ``vendors`` list object.
    """
    # (field, weight, flag raised when the field is empty)
    presence_checks = (
        ("name", 20, "no_name"),
        ("purpose", 15, "no_purpose"),
        ("country", 10, "no_country"),
    )
    # (url field, probe-result field, weight, missing flag, broken flag)
    link_checks = (
        ("opt_out_url", "opt_out_ok", 25,
         "no_opt_out_url", "broken_opt_out"),
        ("privacy_policy_url", "privacy_ok", 15,
         "no_privacy_url", "broken_privacy_url"),
    )
    cookie_weight = 15

    for vendor in vendors:
        earned = 0
        attainable = 0
        issues: list[str] = []

        for field, weight, missing_flag in presence_checks:
            attainable += weight
            if vendor.get(field):
                earned += weight
            else:
                issues.append(missing_flag)

        for url_field, ok_field, weight, missing_flag, broken_flag in link_checks:
            attainable += weight
            if not vendor.get(url_field):
                issues.append(missing_flag)
            elif vendor.get(ok_field) is False:
                issues.append(broken_flag)
                earned += 5  # partial credit: a link was at least provided
            else:
                earned += weight

        attainable += cookie_weight
        listed = vendor.get("cookies") or []
        if not listed:
            issues.append("no_cookies_listed")
        else:
            named = len([c for c in listed if c.get("name")])
            with_expiry = len([c for c in listed if c.get("expiry")])
            if named >= 1 and with_expiry >= 1:
                earned += cookie_weight
            elif named >= 1:
                earned += 8
                issues.append("cookies_no_expiry")
            else:
                issues.append("cookies_no_names")

        vendor["compliance_score"] = (
            round(earned / attainable * 100) if attainable else 0
        )
        vendor["compliance_flags"] = issues
    return vendors
# ── CheckItem rendering ──────────────────────────────────────────────
def build_check_items(validated: list[LinkCheck]) -> list[dict]:
@@ -0,0 +1,190 @@
"""
Vendor record extraction from captured CMP payloads.
Mirrors the per-CMP `extract_vendors()` functions in consent-tester's
cmp_library/ — duplicated here because the backend cannot import the
consent-tester package (different containers). Schemas are stable per CMP
vendor, so this is acceptable. When a new CMP is added in consent-tester,
add the matching extractor here.
Returned vendor record schema:
{
"name": str, # e.g. "Adobe Systems Software Ireland Limited"
"country": str, # ISO 2-letter (DE/US/...) when known
"purpose": str, # short description of what they do
"category": str, # marketing/analytics/functional/necessary
"opt_out_url": str, # link to opt out (Art. 7(3) DSGVO)
"privacy_policy_url": str, # link to vendor's privacy policy
"persistence": str, # human-readable retention text
"cookies": [ # cookies this vendor sets
{"name": str, "purpose": str, "expiry": str, "is_third_party": bool}
],
# Compliance scoring (filled in later by cookie_link_validator.score_vendors())
"compliance_score": int, # 0-100
"compliance_flags": list[str], # e.g. ["no_opt_out_url", "broken_opt_out"]
}
"""
from __future__ import annotations
import logging
import re
logger = logging.getLogger(__name__)
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")
def _clean(s: object) -> str:
text = "" if s is None else str(s)
no_tags = _TAG_RE.sub(" ", text)
return _WS_RE.sub(" ", no_tags).strip()
def extract_vendors_from_payloads(payloads: list[dict]) -> list[dict]:
    """Walk every captured CMP payload and dispatch to the per-CMP extractor.

    Payloads with ``kind`` ``"epaas"`` or ``"onetrust"`` go to their
    dedicated extractor; anything else goes through the generic fallback.
    Vendors are deduplicated across payloads by exact (stripped) name: the
    first record wins, later records only fill fields that are still empty,
    and their cookies are appended unless an identical cookie entry is
    already present (the same payload may be captured more than once).

    Args:
        payloads: Items shaped like ``{"kind": str, "data": dict}``.
            ``None``, non-dict items and items whose ``data`` is not a dict
            are skipped.

    Returns:
        One merged vendor dict per distinct name, in first-seen order.
    """
    all_vendors: dict[str, dict] = {}
    for payload in payloads or []:
        # Payloads are relayed JSON; never let one malformed item abort
        # the whole extraction.
        if not isinstance(payload, dict):
            continue
        kind = payload.get("kind", "")
        data = payload.get("data", {})
        if not isinstance(data, dict):
            continue
        try:
            if kind == "epaas":
                vendors = _extract_epaas(data)
            elif kind == "onetrust":
                vendors = _extract_onetrust(data)
            else:
                # Generic fallback: walk data for vendor-like dicts
                vendors = _extract_generic(data)
        except Exception as e:
            logger.warning("vendor extractor failed for %s: %s", kind, e)
            continue

        for v in vendors:
            name = (v.get("name") or "").strip()
            if not name:
                continue
            existing = all_vendors.get(name)
            if not existing:
                all_vendors[name] = v
                continue

            # Fill scalar fields that are still empty. "cookies" is merged
            # separately below: copying the incoming list here and then
            # extending it would append the list to itself and double
            # every cookie.
            for k, v_val in v.items():
                if k == "cookies":
                    continue
                if not existing.get(k) and v_val:
                    existing[k] = v_val

            merged_cookies = list(existing.get("cookies") or [])
            for cookie in v.get("cookies") or []:
                # Per-vendor cookie lists are short, so a linear membership
                # test is fine and avoids hashing dicts.
                if cookie not in merged_cookies:
                    merged_cookies.append(cookie)
            existing["cookies"] = merged_cookies
    return list(all_vendors.values())
# ── ePaaS (BMW Group) ───────────────────────────────────────────────
def _extract_epaas(d: dict) -> list[dict]:
out: list[dict] = []
providers = d.get("providers", []) or []
cookies_by_provider: dict[str, list[dict]] = {}
for c in d.get("cookies", []) or []:
pid = str(c.get("providerId") or c.get("provider") or c.get("vendor") or "")
if pid:
cookies_by_provider.setdefault(pid, []).append({
"name": c.get("name") or c.get("id") or "",
"purpose": _clean(c.get("purpose") or c.get("description")),
"expiry": _clean(c.get("expiry") or c.get("retention") or c.get("persistence")),
"is_third_party": bool(c.get("isThirdParty") or c.get("third_party")),
})
for p in providers:
pid = str(p.get("id") or p.get("vendorId") or p.get("name") or "")
cookies = cookies_by_provider.get(pid, []) or [{
"name": c.get("name", ""),
"purpose": _clean(c.get("purpose")),
"expiry": _clean(c.get("expiry") or c.get("persistence")),
"is_third_party": True,
} for c in (p.get("cookies", []) or [])]
out.append({
"name": p.get("name") or pid or "",
"country": (p.get("country") or "").strip(),
"purpose": _clean(p.get("purpose")),
"category": (p.get("category") or "").strip(),
"opt_out_url": (p.get("optOutUrl") or p.get("optoutUrl")
or p.get("opt_out_url") or "").strip(),
"privacy_policy_url": (p.get("policyUrl") or p.get("policy_url")
or p.get("privacyPolicyUrl") or "").strip(),
"persistence": _clean(p.get("persistencePurposeDescription")),
"cookies": cookies,
})
return out
# ── OneTrust ────────────────────────────────────────────────────────
def _extract_onetrust(d: dict) -> list[dict]:
out_by_name: dict[str, dict] = {}
for g in d.get("Groups") or d.get("groups") or []:
category = g.get("GroupName") or g.get("name") or ""
for c in g.get("Cookies") or g.get("cookies") or []:
provider = (c.get("Provider") or c.get("provider")
or c.get("Host") or c.get("host") or "").strip()
if not provider:
continue
cookie_entry = {
"name": c.get("Name") or c.get("name") or "",
"purpose": _clean(c.get("description") or c.get("Description")),
"expiry": _clean(c.get("Length") or c.get("expires")),
"is_third_party": bool(c.get("IsThirdParty") or c.get("isThirdParty")),
}
if provider in out_by_name:
out_by_name[provider]["cookies"].append(cookie_entry)
else:
out_by_name[provider] = {
"name": provider,
"country": "",
"purpose": _clean(g.get("GroupDescription") or c.get("description")),
"category": category,
"opt_out_url": "",
"privacy_policy_url": (c.get("PolicyUrl") or c.get("policyUrl") or ""),
"persistence": "",
"cookies": [cookie_entry],
}
return list(out_by_name.values())
# ── Generic fallback (other CMPs / heuristic captures) ──────────────
def _extract_generic(d: dict) -> list[dict]:
"""Best-effort walk for unknown CMP shapes.
Looks for top-level keys named 'vendors' / 'providers' / 'services' and
extracts name/purpose/country fields from each entry.
"""
out: list[dict] = []
for key in ("vendors", "providers", "services", "dataProcessingServices",
"Vendors", "Providers"):
lst = d.get(key)
if not isinstance(lst, list):
continue
for entry in lst:
if not isinstance(entry, dict):
continue
name = (entry.get("name") or entry.get("vendor")
or entry.get("dataProcessor") or "").strip()
if not name:
continue
out.append({
"name": name,
"country": (entry.get("country") or "").strip(),
"purpose": _clean(entry.get("purpose") or entry.get("description")
or entry.get("dataPurpose")),
"category": (entry.get("category") or "").strip(),
"opt_out_url": (entry.get("optOutUrl") or entry.get("opt_out_url")
or "").strip(),
"privacy_policy_url": (entry.get("policyUrl") or entry.get("privacyPolicyUrl")
or entry.get("privacy_policy_url") or "").strip(),
"persistence": _clean(entry.get("retentionPeriodDescription")),
"cookies": [],
})
return out
+4
View File
@@ -293,6 +293,9 @@ class DSIDiscoveryResponse(BaseModel):
languages_detected: list[str]
errors: list[str]
scanned_at: str
# Raw CMP payloads captured during navigation (ePaaS, OneTrust, etc.).
# Backend uses these to build the per-vendor compliance table.
cmp_payloads: list[dict] = []
@app.post("/dsi-discovery", response_model=DSIDiscoveryResponse)
@@ -343,6 +346,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest):
languages_detected=result.languages_detected,
errors=result.errors,
scanned_at=datetime.now(timezone.utc).isoformat(),
cmp_payloads=result.cmp_payloads,
)
@@ -67,3 +67,53 @@ def reconstruct(d: dict) -> str:
parts.append(_clean(str(meta["persistencePurposeText"])))
return "\n".join(parts)
def extract_vendors(d: dict) -> list[dict]:
    """Return structured vendor records from ePaaS policy JSON.

    Schema returned (per vendor):
      {name, country, purpose, category, opt_out_url, privacy_policy_url,
       persistence, cookies: [{name, purpose, expiry, is_third_party}]}

    Cookies come from the top-level ``cookies`` array when entries there
    reference the provider (via ``providerId``, ``provider``, ``vendorId``
    or ``vendor``); otherwise from the provider's inline ``cookies`` list,
    in which case they are marked as third-party.

    Missing or null text fields become ``""`` (never the literal string
    ``"None"``), and entries that are not dicts are skipped so that one
    malformed item does not discard the whole payload.

    Args:
        d: Decoded ePaaS policy JSON.

    Returns:
        One vendor dict per provider, in payload order.
    """
    out: list[dict] = []
    providers = d.get("providers", []) or []

    # ePaaS sometimes stores cookies in a separate 'cookies' array referenced
    # by providerId. If so, group them by provider.
    cookies_by_provider: dict[str, list[dict]] = {}
    for c in d.get("cookies", []) or []:
        if not isinstance(c, dict):
            continue
        pid = (c.get("providerId") or c.get("provider")
               or c.get("vendorId") or c.get("vendor") or "")
        if pid:
            cookies_by_provider.setdefault(str(pid), []).append({
                "name": c.get("name") or c.get("id") or "",
                "purpose": _clean(str(c.get("purpose") or c.get("description") or "")),
                "expiry": _clean(str(c.get("expiry") or c.get("retention")
                                     or c.get("persistence") or "")),
                "is_third_party": bool(c.get("isThirdParty")
                                       or c.get("third_party")),
            })

    for p in providers:
        if not isinstance(p, dict):
            continue
        pid = str(p.get("id") or p.get("vendorId") or p.get("name") or "")
        cookies = cookies_by_provider.get(pid) or [
            {
                "name": c.get("name") or "",
                # `or ""` rather than a .get default: a key that is present
                # with a null value must not turn into the text "None".
                "purpose": _clean(str(c.get("purpose") or "")),
                "expiry": _clean(str(c.get("expiry") or c.get("persistence") or "")),
                "is_third_party": True,
            }
            for c in (p.get("cookies", []) or [])
            if isinstance(c, dict)
        ]
        out.append({
            "name": p.get("name") or pid or "",
            "country": str(p.get("country") or "").strip(),
            "purpose": _clean(str(p.get("purpose") or "")),
            "category": str(p.get("category") or "").strip(),
            "opt_out_url": str(p.get("optOutUrl") or p.get("optoutUrl")
                               or p.get("opt_out_url") or "").strip(),
            "privacy_policy_url": str(p.get("policyUrl") or p.get("policy_url")
                                      or p.get("privacyPolicyUrl") or "").strip(),
            "persistence": _clean(str(p.get("persistencePurposeDescription") or "")),
            "cookies": cookies,
        })
    return out
@@ -54,3 +54,61 @@ def reconstruct(d: dict) -> str:
parts.append(line)
return "\n".join(parts)
def extract_vendors(d: dict) -> list[dict]:
"""Return structured vendor records from OneTrust JSON.
OneTrust groups cookies into 'Groups' (Strictly Necessary, Analytics,
Marketing, etc). Within each group, cookies are listed with Provider,
Host, Length (retention) and optional Privacy/Opt-Out URLs.
"""
out: list[dict] = []
seen: set[str] = set()
groups = d.get("Groups") or d.get("groups") or []
for g in groups:
category = g.get("GroupName") or g.get("name") or ""
for c in g.get("Cookies") or g.get("cookies") or []:
provider = (c.get("Provider") or c.get("provider")
or c.get("Host") or c.get("host") or "").strip()
if not provider:
continue
cookie_entry = {
"name": c.get("Name") or c.get("name") or "",
"purpose": _clean(str(c.get("description")
or c.get("Description") or "")),
"expiry": _clean(str(c.get("Length") or c.get("expires") or "")),
"is_third_party": (c.get("IsThirdParty")
or c.get("isThirdParty") or False),
}
if provider in seen:
# Append cookie to existing vendor
for entry in out:
if entry["name"] == provider:
entry["cookies"].append(cookie_entry)
break
else:
seen.add(provider)
out.append({
"name": provider,
"country": "",
"purpose": _clean(str(c.get("description")
or g.get("GroupDescription") or "")),
"category": category,
"opt_out_url": "",
"privacy_policy_url": (c.get("PolicyUrl")
or c.get("policyUrl") or ""),
"persistence": "",
"cookies": [cookie_entry],
})
return out
_TAG_RE = __import__("re").compile(r"<[^>]+>")
_WS_RE = __import__("re").compile(r"\s+")
def _clean(text: str) -> str:
no_tags = _TAG_RE.sub(" ", text or "")
return _WS_RE.sub(" ", no_tags).strip()
+13 -2
View File
@@ -168,6 +168,10 @@ class DSIDiscoveryResult:
total_found: int = 0
languages_detected: list[str] = field(default_factory=list)
errors: list[str] = field(default_factory=list)
# Raw CMP payloads captured during navigation (one per matched JSON).
# Schema: [{"kind": str, "url": str, "data": dict}, ...]
# Backend uses these to build vendor records + run per-vendor checks.
cmp_payloads: list[dict] = field(default_factory=list)
def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
"""Check if text contains any DSI keyword. Returns (match, language)."""
@@ -270,6 +274,10 @@ async def discover_dsi_documents(
logger.info("PDF redirect detected: %s -> %s", url, final_url)
# Return early — a PDF redirect means no HTML content to scan
result.total_found = len(result.documents)
result.cmp_payloads = [
{"kind": kind, "data": data}
for kind, data in cmp_capture.payloads
]
return result
# Step 1b: Try dismissing cookie consent banners before extraction.
@@ -534,8 +542,11 @@ async def discover_dsi_documents(
result.languages_detected = list(set(
d.language for d in result.documents if d.language
))
logger.info("DSI discovery complete: %d documents found in %s",
result.total_found, result.languages_detected)
result.cmp_payloads = [
{"kind": kind, "data": data} for kind, data in cmp_capture.payloads
]
logger.info("DSI discovery complete: %d documents found in %s, %d CMP payloads",
result.total_found, result.languages_detected, len(result.cmp_payloads))
return result
# Nav elements, not real documents