ea4dbb223f
When a known CMP (ePaaS, OneTrust) renders the cookie policy, we now
extract structured vendor records, probe their opt-out + privacy URLs,
score each vendor (0-100), and append a 'VVT-Vorschlag' table to the
compliance email — one row per vendor, sortable by compliance score.
consent-tester:
- DSIDiscoveryResult.cmp_payloads: surfaces raw CMP JSON to callers
- DSIDiscoveryResponse: new cmp_payloads field
- discover_dsi_documents sets cmp_payloads from cmp_capture
- cmp_library/{epaas,onetrust}.py: new extract_vendors(d) returning
list[VendorRecord]
backend:
- _fetch_text() now returns (text, cmp_payloads) tuple
- doc_entries store cmp_payloads per doc (mostly cookie)
- _autodiscover_missing forwards homepage payloads to the cookie entry
- New module vendor_extractor.py: dispatches ePaaS/OneTrust/generic
schemas; dedupes vendors across multiple payloads
- cookie_link_validator.py extended with validate_vendor_urls(vendors)
and score_vendors(vendors) — 0-100 score per vendor based on name,
purpose, country, opt-out reachable, privacy URL reachable, cookies
with names + expiry
- agent_doc_check_extras.build_vvt_table_html: renders the table
- Route appends VVT HTML after the provider list, before the
document-by-document report
- Response JSON gains cmp_vendors for future frontend rendering
Example for BMW: ~30 ePaaS providers → table with Name | Kategorie |
Sitz | Cookies | Opt-Out (✓/✗) | Privacy (✓/✗) | Score. Sorted by
score ascending so the worst-compliant vendors are at the top.
115 lines
4.0 KiB
Python
115 lines
4.0 KiB
Python
"""OneTrust Cookie Consent.
|
|
|
|
URL: cdn.cookielaw.org/consent/<id>/<id>.json
|
|
OR cdn.cookielaw.org/consent/<id>/<lang>.json
|
|
Schema: Groups[] with GroupName, GroupDescription, Cookies[]
|
|
"""
|
|
|
|
import re
|
|
|
|
# URL pattern identifying a OneTrust consent payload:
# cdn.cookielaw.org/consent/<segment>/<segment>.json, matched case-insensitively.
MATCHER = re.compile(
    r"cdn\.cookielaw\.org/consent/[^/]+/[^/]+\.json",
    flags=re.IGNORECASE,
)
|
|
|
|
# Any HTML/XML tag, e.g. "<p>" or "</b>".
_TAG_RE = re.compile(r"<[^>]+>")
# Any run of whitespace (spaces, tabs, newlines, non-breaking spaces).
_WS_RE = re.compile(r"\s+")


def _clean(text: str) -> str:
    """Return *text* as single-line plain text.

    HTML tags are replaced by a space, the ``&nbsp;`` and ``&amp;`` entities
    are decoded, runs of whitespace are collapsed to one space and the
    result is stripped. ``None`` or an empty string yields ``""``.
    """
    # Tags become a space (not "") so "a<br>b" does not fuse into "ab".
    no_tags = _TAG_RE.sub(" ", text or "")
    # Decode entities only after tag removal, so an escaped "&lt;b&gt;" in
    # the text is never mistaken for markup.
    no_tags = no_tags.replace("&nbsp;", " ").replace("&amp;", "&")
    return _WS_RE.sub(" ", no_tags).strip()
|
|
|
|
|
|
def reconstruct(d: dict, *, max_cookies: int = 50) -> str:
    """Rebuild a Markdown-like cookie policy text from a OneTrust payload.

    The output starts with a fixed heading, followed by any of the optional
    preamble fields (``Description``, ``PolicyText``, ``PolicyDescription``),
    then one ``##`` section per entry of ``Groups``/``groups`` containing the
    cleaned group description and one bullet per cookie.

    Args:
        d: Parsed OneTrust JSON document.
        max_cookies: Maximum number of cookie entries rendered per group.

    Returns:
        The reconstructed text, lines joined with ``"\\n"``.
    """
    parts: list[str] = ["# Cookie-Richtlinie (OneTrust)"]

    # Optional preamble fields
    for key in ("Description", "PolicyText", "PolicyDescription"):
        val = d.get(key)
        if val:
            parts.append("")
            parts.append(_clean(str(val)))

    groups = d.get("Groups") or d.get("groups") or []
    for g in groups:
        # Malformed payloads may contain null/str entries; skip instead of
        # failing the whole reconstruction.
        if not isinstance(g, dict):
            continue
        name = g.get("GroupName") or g.get("name") or ""
        desc = g.get("GroupDescription") or g.get("description") or ""
        parts.append("")
        parts.append(f"## {name}")
        if desc:
            parts.append(_clean(str(desc)))

        cookies = g.get("Cookies") or g.get("cookies") or []
        if not isinstance(cookies, list):
            cookies = []
        for c in cookies[:max_cookies]:
            if not isinstance(c, dict):
                continue
            cn = c.get("Name") or c.get("name") or ""
            cp = c.get("Provider") or c.get("provider") or ""
            raw_desc = c.get("description") or c.get("Description") or ""
            ce = c.get("Length") or c.get("expires") or ""
            # Clean before truncating so the 120 characters are spent on
            # text rather than markup, and non-string values cannot break
            # the slice.
            cd = _clean(str(raw_desc)) if raw_desc else ""
            line = f"- {cn}"
            if cp:
                line += f" ({cp})"
            if cd:
                line += f" — {cd[:120]}"
            if ce:
                line += f" — Speicherdauer: {ce}"
            parts.append(line)

    return "\n".join(parts)
|
|
|
|
|
|
def extract_vendors(d: dict) -> list[dict]:
    """Return structured vendor records from OneTrust JSON.

    Cookies are read from every entry of ``Groups``/``groups``. Each cookie
    is attributed to a vendor named after its ``Provider`` (falling back to
    ``Host``); cookies without either are skipped. All cookies of the same
    vendor are collected under one record, in first-seen order.

    Each record has the keys ``name``, ``country``, ``purpose``,
    ``category``, ``opt_out_url``, ``privacy_policy_url``, ``persistence``
    and ``cookies``. ``country``, ``opt_out_url`` and ``persistence`` are
    always empty strings here. ``category`` and ``purpose`` come from the
    first cookie seen for the vendor (purpose falls back to the group
    description). ``privacy_policy_url`` is the first non-empty
    ``PolicyUrl``/``policyUrl`` found among the vendor's cookies.
    """
    # provider name -> record; dict insertion order keeps first-seen order
    # and gives O(1) lookup when a vendor re-appears.
    vendors: dict[str, dict] = {}

    groups = d.get("Groups") or d.get("groups") or []
    for g in groups:
        # Skip malformed (non-object) entries rather than crash.
        if not isinstance(g, dict):
            continue
        category = g.get("GroupName") or g.get("name") or ""
        group_desc = g.get("GroupDescription") or g.get("description") or ""
        for c in g.get("Cookies") or g.get("cookies") or []:
            if not isinstance(c, dict):
                continue
            # str() guards against non-string provider values before strip().
            provider = str(
                c.get("Provider") or c.get("provider")
                or c.get("Host") or c.get("host") or ""
            ).strip()
            if not provider:
                continue

            cookie_desc = c.get("description") or c.get("Description") or ""
            policy_url = c.get("PolicyUrl") or c.get("policyUrl") or ""
            cookie_entry = {
                "name": c.get("Name") or c.get("name") or "",
                "purpose": _clean(str(cookie_desc)),
                "expiry": _clean(str(c.get("Length") or c.get("expires") or "")),
                "is_third_party": (c.get("IsThirdParty")
                                   or c.get("isThirdParty") or False),
            }

            vendor = vendors.get(provider)
            if vendor is not None:
                # Known vendor: attach the cookie and pick up a policy URL
                # if the earlier cookies did not carry one.
                vendor["cookies"].append(cookie_entry)
                if policy_url and not vendor["privacy_policy_url"]:
                    vendor["privacy_policy_url"] = policy_url
                continue

            vendors[provider] = {
                "name": provider,
                "country": "",
                "purpose": _clean(str(cookie_desc or group_desc or "")),
                "category": category,
                "opt_out_url": "",
                "privacy_policy_url": policy_url,
                "persistence": "",
                "cookies": [cookie_entry],
            }
    return list(vendors.values())
|
|
|
|
|
|
# NOTE(review): `_TAG_RE`, `_WS_RE` and `_clean` are also defined earlier in
# this module. Being the later definitions, these are the ones in effect at
# call time, so they must cover everything callers rely on.
# Any HTML/XML tag, e.g. "<p>" or "</b>".
_TAG_RE = re.compile(r"<[^>]+>")
# Any run of whitespace (spaces, tabs, newlines, non-breaking spaces).
_WS_RE = re.compile(r"\s+")


def _clean(text: str) -> str:
    """Return *text* as single-line plain text.

    HTML tags are replaced by a space, the ``&nbsp;`` and ``&amp;`` entities
    are decoded, runs of whitespace are collapsed to one space and the
    result is stripped. ``None`` or an empty string yields ``""``.
    """
    # Tags become a space (not "") so "a<br>b" does not fuse into "ab".
    no_tags = _TAG_RE.sub(" ", text or "")
    # Decode entities only after tag removal, so an escaped "&lt;b&gt;" in
    # the text is never mistaken for markup.
    no_tags = no_tags.replace("&nbsp;", " ").replace("&amp;", "&")
    return _WS_RE.sub(" ", no_tags).strip()
|