Files
breakpilot-compliance/consent-tester/services/cmp_library/onetrust.py
T
Benjamin Admin ea4dbb223f feat(vvt): per-vendor extraction + opt-out check + VVT table in email (V1)
When a known CMP (ePaaS, OneTrust) renders the cookie policy, we now
extract structured vendor records, probe their opt-out + privacy URLs,
score each vendor (0-100), and append a 'VVT-Vorschlag' table to the
compliance email — one row per vendor, sortable by compliance score.

consent-tester:
- DSIDiscoveryResult.cmp_payloads: surfaces raw CMP JSON to callers
- DSIDiscoveryResponse: new cmp_payloads field
- discover_dsi_documents sets cmp_payloads from cmp_capture
- cmp_library/{epaas,onetrust}.py: new extract_vendors(d) returning
  list[VendorRecord]

backend:
- _fetch_text() now returns (text, cmp_payloads) tuple
- doc_entries store cmp_payloads per doc (mostly cookie)
- _autodiscover_missing forwards homepage payloads to the cookie entry
- New module vendor_extractor.py: dispatches ePaaS/OneTrust/generic
  schemas; dedupes vendors across multiple payloads
- cookie_link_validator.py extended with validate_vendor_urls(vendors)
  and score_vendors(vendors) — 0-100 score per vendor based on name,
  purpose, country, opt-out reachable, privacy URL reachable, cookies
  with names + expiry
- agent_doc_check_extras.build_vvt_table_html: renders the table
- Route appends VVT HTML after the provider list, before the
  document-by-document report
- Response JSON gains cmp_vendors for future frontend rendering

Example for BMW: ~30 ePaaS providers → table with Name | Kategorie |
Sitz | Cookies | Opt-Out (✓/✗) | Privacy (✓/✗) | Score. Sorted by
score ascending so the worst-compliant vendors are at the top.
2026-05-17 09:50:11 +02:00

115 lines
4.0 KiB
Python

"""OneTrust Cookie Consent.
URL: cdn.cookielaw.org/consent/<id>/<id>.json
OR cdn.cookielaw.org/consent/<id>/<lang>.json
Schema: Groups[] with GroupName, GroupDescription, Cookies[]
"""
import re
# Recognises the URL of a OneTrust consent JSON payload on the cookielaw CDN:
# "cdn.cookielaw.org/consent/<segment>/<file>.json", matched case-insensitively.
MATCHER = re.compile(r"cdn\.cookielaw\.org/consent/[^/]+/[^/]+\.json", re.I)
# Any "<...>" run is treated as an HTML tag; any whitespace run is collapsed.
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")


def _clean(text: str) -> str:
    """Return *text* as single-line plain text.

    HTML tags are replaced by a space, the ``&nbsp;`` and ``&amp;`` entities
    are decoded, and whitespace runs are collapsed to one space and trimmed.

    NOTE(review): this module binds ``_clean`` a second time further down;
    the last binding is the one callers execute at runtime.

    Args:
        text: Raw (possibly HTML) text. ``None`` or an empty string is
            tolerated and yields ``""``.

    Returns:
        The cleaned text.
    """
    # ``text or ""`` keeps a missing value from raising inside ``re.sub``.
    no_tags = _TAG_RE.sub(" ", text or "")
    # ``&nbsp;`` is decoded before ``&amp;`` so "&amp;nbsp;" is unescaped
    # exactly once (to the literal text "&nbsp;").
    no_tags = no_tags.replace("&nbsp;", " ").replace("&amp;", "&")
    return _WS_RE.sub(" ", no_tags).strip()
def reconstruct(d: dict) -> str:
    """Render a OneTrust consent payload as Markdown-style policy text.

    The output starts with a fixed heading, followed by any non-empty
    preamble fields (``Description``, ``PolicyText``, ``PolicyDescription``),
    then one ``##`` section per group with its description and a bullet per
    cookie. At most the first 50 cookies of each group are listed and each
    cookie description is truncated to 120 characters.

    Args:
        d: Parsed OneTrust JSON. Groups are read from ``Groups``/``groups``
            and cookies from ``Cookies``/``cookies``; both PascalCase and
            lowercase field names are accepted.

    Returns:
        The reconstructed text, lines joined with ``"\\n"``.
    """
    parts: list[str] = ["# Cookie-Richtlinie (OneTrust)"]

    # Optional preamble fields
    for key in ("Description", "PolicyText", "PolicyDescription"):
        val = d.get(key)
        if val:
            parts.append("")
            parts.append(_clean(str(val)))

    for g in d.get("Groups") or d.get("groups") or []:
        name = g.get("GroupName") or g.get("name") or ""
        desc = g.get("GroupDescription") or g.get("description") or ""
        parts.append("")
        parts.append(f"## {name}")
        if desc:
            parts.append(_clean(str(desc)))

        cookies = g.get("Cookies") or g.get("cookies") or []
        for c in cookies[:50]:
            cn = c.get("Name") or c.get("name") or ""
            cp = c.get("Provider") or c.get("provider") or ""
            # Coerce to str and strip markup *before* truncating, so the
            # 120-character budget is spent on visible text and a non-string
            # value cannot break the slice.
            cd = _clean(str(c.get("description") or c.get("Description") or ""))
            ce = c.get("Length") or c.get("expires") or ""

            line = f"- {cn}"
            if cp:
                line += f" ({cp})"
            if cd:
                # Separator keeps the description from running straight into
                # the cookie name / provider.
                line += f" — {cd[:120]}"
            if ce:
                line += f" — Speicherdauer: {ce}"
            parts.append(line)

    return "\n".join(parts)
def extract_vendors(d: dict) -> list[dict]:
    """Return structured vendor records from OneTrust JSON.

    Cookies are read from every group (``Groups``/``groups``) and merged into
    one record per provider. The provider is taken from ``Provider``, falling
    back to ``Host``; cookies with neither are skipped.

    Each record has the keys ``name``, ``country``, ``purpose``,
    ``category``, ``opt_out_url``, ``privacy_policy_url``, ``persistence``
    and ``cookies``. ``country``, ``opt_out_url`` and ``persistence`` are
    always left as empty strings by this function. ``category`` and
    ``purpose`` come from the first cookie/group in which the provider
    appears. Every item in ``cookies`` has ``name``, ``purpose``, ``expiry``
    and ``is_third_party``.

    Args:
        d: Parsed OneTrust JSON; PascalCase and lowercase field names are
            both accepted.

    Returns:
        Vendor records in order of first appearance.
    """
    # Provider name -> record. A dict gives O(1) merging of repeated
    # providers and preserves first-seen order for the returned list.
    vendors: dict[str, dict] = {}

    for g in d.get("Groups") or d.get("groups") or []:
        category = g.get("GroupName") or g.get("name") or ""
        group_desc = g.get("GroupDescription") or g.get("description") or ""

        for c in g.get("Cookies") or g.get("cookies") or []:
            provider = str(
                c.get("Provider") or c.get("provider")
                or c.get("Host") or c.get("host") or ""
            ).strip()
            if not provider:
                continue

            # One lookup shared by the cookie entry and the vendor record so
            # both honour the lowercase and the PascalCase spelling.
            description = _clean(
                str(c.get("description") or c.get("Description") or "")
            )
            policy_url = c.get("PolicyUrl") or c.get("policyUrl") or ""

            cookie_entry = {
                "name": c.get("Name") or c.get("name") or "",
                "purpose": description,
                "expiry": _clean(str(c.get("Length") or c.get("expires") or "")),
                "is_third_party": (c.get("IsThirdParty")
                                   or c.get("isThirdParty") or False),
            }

            vendor = vendors.get(provider)
            if vendor is None:
                vendors[provider] = {
                    "name": provider,
                    "country": "",
                    # Fall back to the group text when the cookie has none.
                    "purpose": description or _clean(str(group_desc)),
                    "category": category,
                    "opt_out_url": "",
                    "privacy_policy_url": policy_url,
                    "persistence": "",
                    "cookies": [cookie_entry],
                }
                continue

            # Append cookie to existing vendor
            vendor["cookies"].append(cookie_entry)
            # Keep the first policy URL seen for a vendor, even if it only
            # shows up on a later cookie.
            if policy_url and not vendor["privacy_policy_url"]:
                vendor["privacy_policy_url"] = policy_url

    return list(vendors.values())
# Second binding of the helper regexes and ``_clean``. As far as this file
# shows it comes last, so it is the definition callers execute at runtime and
# must decode the same entities as the earlier definition of ``_clean``.
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")


def _clean(text: str) -> str:
    """Return *text* as single-line plain text.

    HTML tags are replaced by a space, the ``&nbsp;`` and ``&amp;`` entities
    are decoded, and whitespace runs are collapsed to one space and trimmed.

    Args:
        text: Raw (possibly HTML) text. ``None`` or an empty string is
            tolerated and yields ``""``.

    Returns:
        The cleaned text.
    """
    # ``text or ""`` keeps a missing value from raising inside ``re.sub``.
    no_tags = _TAG_RE.sub(" ", text or "")
    # ``&nbsp;`` is decoded before ``&amp;`` so "&amp;nbsp;" is unescaped
    # exactly once (to the literal text "&nbsp;").
    no_tags = no_tags.replace("&nbsp;", " ").replace("&amp;", "&")
    return _WS_RE.sub(" ", no_tags).strip()