57c0f940a2
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / nodejs-build (push) Successful in 2m19s
CI / test-go (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
P56 Anti-Auditing-Detection als constructive Compliance-Finding (Audit-API-
Empfehlung statt Anklage, weil Mercedes berechtigt Bots blockiert)
P57 Phase G vendor_details Union mit cmp_vendors -> 42 Anbieter sichtbar
P58 Anti-Audit-Detection robuster (Script-Domain-Check + Settings-spezifisch)
P59 Cookie-Behavior-Validator (4 Layer, 3-Tier-Severity: MEDIUM=Kategorie-
Mismatch / HIGH=Zweck-Mismatch / CRITICAL=beide=Vorsatz-Indiz)
+ Open Cookie Database (CC0) als Library-Seed (2264 Cookies)
P59b Cookie-Behavior in Banner-Check verdrahtet + Mail-Block (BUGFIX:
SessionLocal selbst oeffnen, db war im Background-Task nicht im Scope)
Mail-Polish nach Mercedes-Review:
P63 Banner-Footer-Links auch im wb7-link/role=link erkennen (Shadow-DOM-
Walker label-based statt nur <a href>)
P64 Re-Access-Severity: MEDIUM statt HIGH, wenn Footer "Einstellungen" oder
Mercedes-typisch existiert; OEM-Footer-Detection (wb7-footer)
P65 Text-Truncation: Word-Boundary statt Zeichen-Cut (kein "einfa"-Bruch
mehr in Sofortmassnahmen)
P66 GF-Aktionen: Service-Zweck vs Cookie-Zweck explizit erklaert
(haeufige Verwechslung Marketing/GF: "Akamai-Beschreibung" != Cookie-
Zweck pro DSK-OH 2024)
P67 Stirring-Finding mit "Verlust-Framing"-Erklaerung + Alt-vs-Neutral-
Beispiel, statt nur EDPB-Fachbegriff
Compliance-Advisor FAQ (admin agent-core/soul):
+ CNIL/EDPB Top-Bussgelder (Google 100M, Meta 60M, Amazon 35M)
+ Deutsche Praezedenz (LG Muenchen Google Fonts, EuGH Planet49, BGH I ZR 7/16)
+ 4 Risiko-Pfade (Bussgeld/Abmahnung/Sammelklage/NOYB) + Berechnungs-Methodik
Document-Generator Templates: AGB-DE (142), Impressum (140), Widerrufs-
formular-Anlage (143), DSR-Process-Dedup (139), Cookie-Library (144).
Architektur: doc_action_mappings.py + banner_dom_walkers.py +
cookie_behavior_validator.py + vendor_detail_extractor.py rausgezogen,
um die 500-LOC-Caps in agent_doc_check_report.py und
banner_text_checker.py einzuhalten.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
676 lines
28 KiB
Python
676 lines
28 KiB
Python
"""
Phase D — Per-Vendor Detail Extraction (P50).

After Accept (Phase C) the banner contains every vendor; on most CMPs
(Usercentrics, OneTrust, Cookiebot) each vendor has an Info/Details
icon that opens a modal with Beschreibung, Verarbeitendes Unternehmen,
Zweck, Genutzte Technologien, Cookies, Opt-Out-URL and Privacy-URL.

We open the settings-view of the banner, walk the Shadow-DOM for info
icons, click each one, capture the modal text + the XHR triggered by
the click (which Usercentrics uses to load the detail JSON), and parse
the text into structured fields.
NOTE(review): no XHR capture is visible in this module — confirm whether
it lives elsewhere or is still planned.

Returns: list[VendorDetail] with raw_text + structured fields.
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
|
|
from playwright.async_api import Browser, Page, TimeoutError as PlaywrightTimeout
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# User-Agent string passed to browser.new_context() in
# extract_vendor_details (desktop Chrome 120 on macOS).
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
|
|
|
|
|
|
@dataclass
class VendorDetail:
    """Structured record for one vendor read from a banner detail view.

    Every field has an empty default, so a record can be built from
    whatever subset of fields the text parser managed to recognise.
    """

    # Label of the info button that opened the detail view.
    name: str = ""

    # Single-value fields filled from parse_modal_text().
    description: str = ""
    processing_company: str = ""
    address: str = ""

    # List-valued sections; each instance gets its own fresh list.
    purposes: list[str] = field(default_factory=list)
    technologies: list[str] = field(default_factory=list)
    cookies: list[str] = field(default_factory=list)

    retention: str = ""
    opt_out_url: str = ""
    privacy_url: str = ""

    # Text of the detail view as captured (truncated by the caller).
    raw_text: str = ""
|
|
|
|
|
|
# ── Shadow-DOM helper: find info-buttons in Mercedes/Usercentrics/etc.
|
|
|
|
_FIND_INFO_BUTTONS_JS = r"""
|
|
() => {
|
|
// Walk all shadow roots and collect "info"/"i"-icon clickables.
|
|
// Covers <button>, <div>, <span>, <cmm-icon> — Mercedes uses
|
|
// <div class="consent-item__information"> as info trigger.
|
|
const results = [];
|
|
function walk(root) {
|
|
if (!root || !root.querySelectorAll) return;
|
|
const buttons = root.querySelectorAll(
|
|
'button[aria-label*="info" i], button[aria-label*="details" i], ' +
|
|
'button[aria-label*="weitere" i], button[title*="info" i], ' +
|
|
'button[title*="details" i], button[class*="info" i], ' +
|
|
'button[class*="detail" i], [data-testid*="info"], [data-testid*="detail"], ' +
|
|
'button > i.material-icons, button[aria-label="i"], svg[aria-label*="info" i], ' +
|
|
// P50e: Mercedes uses button.consent-item__icon with
|
|
// data-test="toggle-consent-info-modal", aria-label=vendor name.
|
|
'button.consent-item__icon, [data-test*="toggle-consent-info"], ' +
|
|
'button[class*="info-icon"], button[class*="detail-toggle"]'
|
|
);
|
|
for (const b of buttons) {
|
|
// P50e: priority — aria-label IS vendor name for Mercedes
|
|
let label = (b.getAttribute('aria-label') || '').trim();
|
|
if (!label) {
|
|
// Walk up to find a heading/label/consent-item__name span
|
|
let el = b;
|
|
for (let i = 0; i < 5 && el; i++) {
|
|
const parent = el.parentElement || (el.getRootNode && el.getRootNode().host);
|
|
if (!parent) break;
|
|
const heading = parent.querySelector ? parent.querySelector('.consent-item__name, h1,h2,h3,h4,h5,h6,strong') : null;
|
|
if (heading && heading.textContent && heading.textContent.trim().length > 1) {
|
|
label = heading.textContent.trim().substring(0, 100);
|
|
break;
|
|
}
|
|
el = parent;
|
|
}
|
|
}
|
|
// Mercedes button is visually-hidden (width=0) — still clickable
|
|
results.push({label: label});
|
|
}
|
|
// Recurse into shadow roots
|
|
const all = root.querySelectorAll('*');
|
|
for (const el of all) {
|
|
if (el.shadowRoot) walk(el.shadowRoot);
|
|
}
|
|
}
|
|
walk(document);
|
|
return results;
|
|
}
|
|
"""
|
|
|
|
|
|
_CLICK_INFO_BY_LABEL_JS = r"""
|
|
(label) => {
|
|
// P50e: prefer direct aria-label match (Mercedes uses it).
|
|
function walk(root) {
|
|
if (!root || !root.querySelectorAll) return false;
|
|
try {
|
|
const escaped = label.replace(/"/g, '\\"');
|
|
const direct = root.querySelector('button[aria-label="' + escaped + '"]');
|
|
if (direct) { direct.click(); return true; }
|
|
} catch(e) {}
|
|
const buttons = root.querySelectorAll(
|
|
'button[aria-label*="info" i], button[aria-label*="details" i], ' +
|
|
'button[aria-label*="weitere" i], button[title*="info" i], ' +
|
|
'button[title*="details" i], button[class*="info" i], ' +
|
|
'button[class*="detail" i], [data-testid*="info"], [data-testid*="detail"], ' +
|
|
'button.consent-item__icon, [data-test*="toggle-consent-info"]'
|
|
);
|
|
for (const b of buttons) {
|
|
let el = b;
|
|
for (let i = 0; i < 5 && el; i++) {
|
|
const parent = el.parentElement || (el.getRootNode && el.getRootNode().host);
|
|
if (!parent) break;
|
|
const h = parent.querySelector ? parent.querySelector('h1,h2,h3,h4,h5,h6,label,strong,span') : null;
|
|
if (h && h.textContent && h.textContent.trim().substring(0, 100) === label) {
|
|
b.click();
|
|
return true;
|
|
}
|
|
el = parent;
|
|
}
|
|
}
|
|
const all = root.querySelectorAll('*');
|
|
for (const el of all) {
|
|
if (el.shadowRoot && walk(el.shadowRoot)) return true;
|
|
}
|
|
return false;
|
|
}
|
|
return walk(document);
|
|
}
|
|
"""
|
|
|
|
|
|
_EXTRACT_MODAL_TEXT_JS = r"""
|
|
() => {
|
|
// P50d: Find the detail-info container that opened on click.
|
|
// Mercedes uses an inline detail-view (not <dialog>), recognisable by
|
|
// text markers ("Verarbeitendes Unternehmen", "Beschreibung",
|
|
// "Genutzte Technologien"). Walk all shadow roots, find the SMALLEST
|
|
// element containing all/most markers — that's the detail-box.
|
|
const MARKERS = [
|
|
'Verarbeitendes Unternehmen', 'Beschreibung des Services',
|
|
'Zweck der Daten', 'Genutzte Technologien', 'Gesammelte Daten',
|
|
'Datenschutz-Beauftragter', 'processing company',
|
|
'data purpose', 'technologies used',
|
|
];
|
|
let best = null, bestLen = Infinity;
|
|
function walk(root) {
|
|
if (!root || !root.querySelectorAll) return;
|
|
const all = root.querySelectorAll('*');
|
|
for (const el of all) {
|
|
const txt = (el.textContent || '');
|
|
if (txt.length < 80 || txt.length > 5000) continue;
|
|
const hits = MARKERS.filter(m => txt.includes(m)).length;
|
|
if (hits >= 2 && txt.length < bestLen) {
|
|
best = txt;
|
|
bestLen = txt.length;
|
|
}
|
|
if (el.shadowRoot) walk(el.shadowRoot);
|
|
}
|
|
}
|
|
walk(document);
|
|
if (best) return best;
|
|
|
|
// Fallback: open dialog/modal with reasonable size
|
|
function findDialog(root) {
|
|
if (!root || !root.querySelectorAll) return null;
|
|
const sels = ['[role="dialog"]:not([aria-hidden="true"])',
|
|
'[class*="modal"]:not([class*="closed"])',
|
|
'[class*="dialog"]', '[class*="popup"]',
|
|
'[class*="detail-view"]', '[class*="info-panel"]',
|
|
'[class*="detail-box"]'];
|
|
for (const sel of sels) {
|
|
const els = root.querySelectorAll(sel);
|
|
for (const el of els) {
|
|
const rect = el.getBoundingClientRect();
|
|
if (rect.width > 100 && rect.height > 100) {
|
|
const text = (el.textContent || '').trim();
|
|
if (text.length > 50 && text.length < 8000) return text;
|
|
}
|
|
}
|
|
}
|
|
const all = root.querySelectorAll('*');
|
|
for (const el of all) {
|
|
if (el.shadowRoot) {
|
|
const t = findDialog(el.shadowRoot);
|
|
if (t) return t;
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
return findDialog(document) || '';
|
|
}
|
|
"""
|
|
|
|
|
|
_CLOSE_MODAL_JS = r"""
|
|
() => {
|
|
function walk(root) {
|
|
if (!root || !root.querySelectorAll) return false;
|
|
// Close-button: aria-label, title, X-character, or class
|
|
const closes = root.querySelectorAll(
|
|
'[aria-label*="schlie" i], [aria-label*="close" i], ' +
|
|
'[title*="schlie" i], [title*="close" i], ' +
|
|
'[class*="close" i]:not([disabled])'
|
|
);
|
|
for (const c of closes) {
|
|
if (c.getBoundingClientRect().width > 0) {
|
|
c.click();
|
|
return true;
|
|
}
|
|
}
|
|
const all = root.querySelectorAll('*');
|
|
for (const el of all) {
|
|
if (el.shadowRoot && walk(el.shadowRoot)) return true;
|
|
}
|
|
return false;
|
|
}
|
|
return walk(document);
|
|
}
|
|
"""
|
|
|
|
|
|
# ── Modal-Text parsing ──────────────────────────────────────────────
|
|
|
|
# Single-value fields: (result key, regex whose group 1 is the value).
# The heading-based patterns expect "<heading>\n<value>" and end the value
# at a blank line, a following heading, or the end of the text.
_FIELD_PATTERNS = [
    ("description", r"Beschreibung[\s\S]{0,30}?\n([\s\S]{20,800}?)(?:\n\n|\nVerarbeit|\nZweck|\nGenutzt|\nCookies|\nAdresse|$)"),
    ("processing_company", r"Verarbeitende[s]?\s+Unternehmen[\s\S]{0,30}?\n([\s\S]{5,300}?)(?:\n\n|\nAdresse|\nZweck|$)"),
    ("address", r"(?:Adresse|Anschrift)[\s\S]{0,30}?\n([\s\S]{5,300}?)(?:\n\n|\nZweck|\nGenutzt|$)"),
    # "|$" added: without it a retention section at the very end of the
    # text (no following heading) was silently dropped.
    ("retention", r"Speicherdauer[\s\S]{0,30}?\n([\s\S]{2,200}?)(?:\n\n|\n[A-Z]|$)"),
    ("opt_out_url", r"(?:Opt[\-\s]?[Oo]ut|Widerspruch)[\s\S]{0,80}?(https?://[^\s<>\"']+)"),
    ("privacy_url", r"(?:Datenschutz[\-\s]?(?:erkl|Information)|Privacy)[\s\S]{0,80}?(https?://[^\s<>\"']+)"),
]

# Bullet-list sections:
# (result key, regex whose group 1 is the list body,
#  exclusive upper bound on item length, maximum number of items kept).
# NOTE(review): "purposes" and "technologies" have no end-of-text
# terminator, so they are not recognised when they are the last section.
# Adding "$" would also swallow unknown trailing sections — left as is.
_LIST_PATTERNS = [
    ("purposes", r"Zweck(?:e)?\s+der\s+Daten[\s\S]{0,80}?(?:\n)([\s\S]{20,500}?)(?:\nGenutzt|\nVerarbeit|\nCookies|\n\n[A-Z])", 80, 15),
    ("technologies", r"Genutzte\s+Technologien[\s\S]{0,80}?\n([\s\S]{5,500}?)(?:\nCookies|\nGesammelt|\n\n[A-Z]|\nWeb)", 80, 10),
    ("cookies", r"Cookies?\s*(?:Name)?[\s\S]{0,80}?\n([\s\S]{5,1000}?)(?:\n\n[A-Z]|$)", 100, 30),
]


def _bullet_items(block: str, max_len: int, max_items: int) -> list:
    """Split a list body into cleaned lines, dropping too short/long ones."""
    cleaned = [line.strip(" -•*\t") for line in block.split("\n") if line.strip()]
    return [item for item in cleaned if 2 < len(item) < max_len][:max_items]


def parse_modal_text(text: str) -> dict:
    """Best-effort parse of detail-modal text into structured fields.

    Args:
        text: Plain text of a vendor detail view, one heading/value per line.

    Returns:
        A dict containing only the keys that were recognised.  Keys from
        ``_FIELD_PATTERNS`` map to stripped strings; ``purposes``,
        ``technologies`` and ``cookies`` map to lists of strings.
        Empty input yields an empty dict.
    """
    result: dict = {}
    if not text:
        return result

    for field_name, pattern in _FIELD_PATTERNS:
        match = re.search(pattern, text)
        if match:
            result[field_name] = match.group(1).strip()

    # Purposes / Technologies / Cookies — bullet-list style
    for key, pattern, max_len, max_items in _LIST_PATTERNS:
        match = re.search(pattern, text)
        if match:
            result[key] = _bullet_items(match.group(1), max_len, max_items)

    return result
|
|
|
|
|
|
# Floating "re-open consent" widgets, tried in this order.
_REOPEN_FLOATING_SELECTORS = (
    ".uc-cookie-settings-trigger", "#ot-sdk-btn", "#ot-sdk-btn-floating",
    ".cky-btn-revisit", "[class*='cookie-floating']",
    "[class*='cmplz-cookiebanner-status']",
    "[id*='cookiebot-renew']",
)

# Footer link texts, tried in this order (matched case-insensitively).
_REOPEN_FOOTER_TEXTS = (
    "Cookie-Einstellungen", "Cookie Einstellungen", "Cookie-Richtlinie",
    "Cookies", "Einstellungen", "Privatsphäre",
)

# JavaScript for page.evaluate(): looks for <cmm-cookie-banner> /
# <cookie-consent-banner> hosts (also inside nested shadow roots) and clicks
# the first trigger-like element in their shadow root.  Returns true/false.
_REOPEN_SHADOW_JS = r"""() => {
    function walk(root) {
        if (!root || !root.querySelectorAll) return false;
        // Mercedes uses chip / persistent button inside cmm-cookie-banner
        const tags = ['cmm-cookie-banner', 'cookie-consent-banner'];
        for (const tag of tags) {
            const els = root.querySelectorAll(tag);
            for (const el of els) {
                if (el.shadowRoot) {
                    const trigger = el.shadowRoot.querySelector(
                        '[aria-label*="cookie" i], [class*="trigger"], [class*="chip"]'
                    );
                    if (trigger) { trigger.click(); return true; }
                }
            }
        }
        const all = root.querySelectorAll('*');
        for (const el of all) {
            if (el.shadowRoot && walk(el.shadowRoot)) return true;
        }
        return false;
    }
    return walk(document);
}"""


async def _click_first_match(page: Page, selector: str) -> bool:
    """Click the first element matching *selector*; True if it was clicked.

    Any failure (no match, not clickable within 3 s, ...) is logged at debug
    level and reported as False so the caller can try the next strategy.
    """
    try:
        target = page.locator(selector).first
        if await target.count() > 0:
            await target.click(timeout=3000)
            await page.wait_for_timeout(1500)  # let the banner render
            return True
    except Exception as exc:
        logger.debug("re-open banner: selector %r failed: %s", selector, exc)
    return False


async def _try_reopen_banner(page: Page) -> bool:
    """Try to re-open the banner after Accept — floating icon or footer link.

    Strategies, in order: (1) known floating-icon selectors, (2) a link in
    the ``<footer>`` whose text matches a cookie/settings keyword, (3) a
    trigger inside a banner web component's shadow root.

    Returns:
        True as soon as one strategy clicked something, otherwise False.
        Never raises; failures are logged at debug level.
    """
    # 1. Common floating-icon selectors
    for selector in _REOPEN_FLOATING_SELECTORS:
        if await _click_first_match(page, selector):
            return True

    # 2. Footer link — generic text search
    for text in _REOPEN_FOOTER_TEXTS:
        if await _click_first_match(page, f"footer >> text=/{text}/i"):
            return True

    # 3. Shadow-DOM web-component re-open (Mercedes specific)
    try:
        if await page.evaluate(_REOPEN_SHADOW_JS):
            await page.wait_for_timeout(1500)
            return True
    except Exception as exc:
        logger.debug("re-open banner: shadow-DOM trigger failed: %s", exc)
    return False
|
|
|
|
|
|
# JavaScript for page.evaluate(): clicks every category expander in the
# document and all reachable shadow roots; returns the number of clicks.
_EXPAND_CATEGORIES_JS = r"""() => {
    let clicked = 0;
    function walk(root) {
        if (!root || !root.querySelectorAll) return;
        // Expander triggers: wb7-button / button with "+" or aria-expanded="false"
        const triggers = root.querySelectorAll(
            '[aria-expanded="false"], wb7-button[class*="expand" i], ' +
            'button[class*="expand" i], [class*="accordion"][aria-expanded="false"], ' +
            '[class*="category"] > [role="button"], ' +
            '[class*="category-header"], [class*="category__header"]'
        );
        for (const t of triggers) {
            // The class-based selectors also match sections that are
            // already open; clicking those would collapse them again.
            if (t.getAttribute && t.getAttribute('aria-expanded') === 'true') continue;
            try { t.click(); clicked++; } catch(e) {}
        }
        const all = root.querySelectorAll('*');
        for (const el of all) {
            if (el.shadowRoot) walk(el.shadowRoot);
        }
    }
    walk(document);
    return clicked;
}"""


async def _expand_all_categories(page: Page) -> int:
    """P50d: After settings-view is open, click category expanders so all
    individual vendors with their info-icons become visible.

    Mercedes shows 5 category items by default; each expands to a list
    of vendors with consent-item__information divs.

    Elements that explicitly report ``aria-expanded="true"`` are skipped so
    an already-open category is not toggled shut.

    Returns:
        Number of expanders clicked; 0 if nothing matched or the script
        failed (failures are logged at debug level, never raised).
    """
    try:
        count = await page.evaluate(_EXPAND_CATEGORIES_JS)
        if count:
            logger.info("Detail-Phase: expanded %d category collapsibles", count)
            await page.wait_for_timeout(1500)  # let the vendor lists render
        return count or 0
    except Exception as exc:
        logger.debug("_expand_all_categories failed: %s", exc)
        return 0
|
|
|
|
|
|
async def _open_settings_view(page: Page) -> bool:
    """Click the banner's settings entry so the vendor list becomes visible.

    The script looks at buttons and button-like elements in the document
    and in reachable shadow roots and clicks the first one whose lower-cased
    text is a known settings caption.

    Returns:
        True if such an element was clicked (after a 2.5 s settle delay),
        False if none was found or the script failed.
    """
    # Mercedes / cmm-cookie-banner: click "Einstellungen" wb7-button
    click_settings_js = r"""() => {
        function walk(root) {
            if (!root || !root.querySelectorAll) return false;
            const buttons = root.querySelectorAll(
                'button, [role="button"], wb7-button, cmm-button'
            );
            for (const b of buttons) {
                const txt = (b.textContent || '').trim().toLowerCase();
                if (txt === 'einstellungen' || txt === 'settings' ||
                    txt === 'mehr informationen' || txt === 'individuell' ||
                    txt.includes('cookie-einstellungen') ||
                    txt.includes('details anzeigen')) {
                    b.click();
                    return true;
                }
            }
            const all = root.querySelectorAll('*');
            for (const el of all) {
                if (el.shadowRoot && walk(el.shadowRoot)) return true;
            }
            return false;
        }
        return walk(document);
    }"""
    try:
        was_clicked = await page.evaluate(click_settings_js)
        if not was_clicked:
            return False
        await page.wait_for_timeout(2500)
        return True
    except Exception as err:
        logger.debug("open settings-view failed: %s", err)
        return False
|
|
|
|
|
|
# Domains whose presence among the page's asset hosts is treated as an
# Akamai marker (matched as the host itself or any sub-domain of it).
_AKAMAI_ASSET_DOMAINS = (
    "akamaihd.net", "akamaized.net", "akamai.net",
    "edgekey.net", "edgesuite.net",
)

# JavaScript for page.evaluate(): hostnames of all script/link/img URLs.
_ASSET_HOSTS_JS = r"""() => {
    const out = new Set();
    document.querySelectorAll('script[src], link[href], img[src]').forEach(el => {
        const src = el.src || el.href || '';
        if (src) {
            try { out.add(new URL(src).hostname); } catch(e) {}
        }
    });
    return [...out];
}"""

# JavaScript for page.evaluate(): returns {user_select_none, tdm_meta}.
_USER_SELECT_AND_TDM_JS = r"""() => {
    const result = {user_select_none: false, tdm_meta: ''};
    // P58: prefer SETTINGS-view sub-containers (where vendor-list lives)
    // because Mercedes' banner-body is copy-able, but the settings
    // section with the vendor list is not.
    const settingsSels = [
        'cmm-cookie-settings', '.consent-item', '.consent-label',
        '.consent-item__name', '.consent-item__information',
        '.uc-settings-list', '.uc-vendor', '.ot-vlst-cntr',
    ];
    const bannerSels = ['cmm-cookie-banner', 'cookie-consent-banner',
                        '#usercentrics-root', '#onetrust-banner-sdk',
                        '#CybotCookiebotDialog', '[role="dialog"]'];
    function check(root) {
        if (!root || !root.querySelectorAll) return;
        // 1) Settings sub-containers first — much more meaningful
        for (const sel of settingsSels) {
            const els = root.querySelectorAll(sel);
            for (const el of els) {
                const v = getComputedStyle(el).userSelect;
                if (v === 'none') { result.user_select_none = true; return; }
            }
        }
        // 2) Fall back to banner-body sample
        for (const sel of bannerSels) {
            const els = root.querySelectorAll(sel);
            for (const el of els) {
                const target = el.shadowRoot ? el.shadowRoot : el;
                const samples = target.querySelectorAll('p, span, div, label');
                let noneHits = 0, total = 0;
                for (const s of samples) {
                    total++;
                    if (getComputedStyle(s).userSelect === 'none') noneHits++;
                    if (total >= 20) break;
                }
                // Mark only if MAJORITY of text-elements are user-select:none
                if (total > 0 && noneHits / total >= 0.5) {
                    result.user_select_none = true;
                    return;
                }
            }
        }
        const all = root.querySelectorAll('*');
        for (const e of all) { if (e.shadowRoot) check(e.shadowRoot); }
    }
    check(document);
    // Meta-tag
    const metas = document.querySelectorAll('meta[name="robots"], meta[name="googlebot"]');
    for (const m of metas) {
        const c = (m.getAttribute('content') || '').toLowerCase();
        if (c.includes('noai') || c.includes('notdm')) {
            result.tdm_meta = c.substring(0, 80);
            break;
        }
    }
    return result;
}"""


def _host_in_domain(host: str, domain: str) -> bool:
    """True if *host* is *domain* itself or one of its sub-domains."""
    return host == domain or host.endswith("." + domain)


def _bot_protection_from_cookies(cookie_names: set) -> str:
    """Name the anti-bot product implied by the cookie names, or ""."""
    if any(n.startswith(("ak_bmsc", "bm_sv", "bm_sz", "_abck")) for n in cookie_names):
        return "Akamai Bot Manager"
    if any(n in cookie_names for n in ("__cf_bm", "cf_clearance", "__cfduid")):
        return "Cloudflare Bot Management"
    if "datadome" in cookie_names:
        return "Datadome"
    if any(n.startswith("_px") for n in cookie_names):
        return "PerimeterX"
    return ""


def _bot_protection_from_hosts(hosts: list) -> str:
    """Name the provider implied by the first matching asset host, or "".

    Domain checks compare whole domain suffixes rather than substrings, so
    a look-alike such as ``notakamai.net.example.org`` is not a match.
    """
    for raw in hosts:
        if not isinstance(raw, str):
            continue
        host = raw.lower()
        if any(_host_in_domain(host, d) for d in _AKAMAI_ASSET_DOMAINS):
            return "Akamai (via asset-CDN)"
        if _host_in_domain(host, "cloudflare.com") or _host_in_domain(host, "cloudflare.net"):
            return "Cloudflare (via asset-CDN)"
        if "datadome" in host:
            return "Datadome (via asset)"
    return ""


async def _detect_anti_audit(page: Page) -> dict:
    """P56: Detect anti-auditing measures on the page.

    Returns dict with markers:
      - bot_protection: name of detected anti-bot tool (or "")
      - user_select_none: True if Banner-text has CSS user-select:none
      - tdm_meta: noai/notdm meta-tag content if present
      - click_ignored: always False here — this function never sets it
        (presumably filled in by a caller; TODO confirm)

    Caller decides severity. Bot-protection alone triggers TDM-skip
    (§44b UrhG), user-select:none triggers HIGH Transparency-Finding
    (Art. 5(1)(a) DSGVO).

    Each probe is best-effort: a failing probe is logged at debug level and
    leaves its markers at their defaults; the function itself never raises.
    """
    out = {"bot_protection": "", "user_select_none": False, "tdm_meta": "",
           "click_ignored": False}

    # Probe 1: well-known anti-bot cookies in the browser context.
    try:
        cookies = await page.context.cookies()
        cookie_names = {c.get("name", "") for c in cookies}
        out["bot_protection"] = _bot_protection_from_cookies(cookie_names)
    except Exception as exc:
        logger.debug("anti-audit: cookie probe failed: %s", exc)

    # P58: also detect via script-domain (Mercedes loads Akamai assets even
    # before bot-cookies are set on first visit).
    if not out["bot_protection"]:
        try:
            domains = await page.evaluate(_ASSET_HOSTS_JS)
            if isinstance(domains, list):
                out["bot_protection"] = _bot_protection_from_hosts(domains)
        except Exception as exc:
            logger.debug("anti-audit: asset-domain probe failed: %s", exc)

    # Probe 3: CSS user-select on banner/settings text + robots meta-tag.
    try:
        css_meta = await page.evaluate(_USER_SELECT_AND_TDM_JS)
        if isinstance(css_meta, dict):
            out["user_select_none"] = bool(css_meta.get("user_select_none"))
            out["tdm_meta"] = css_meta.get("tdm_meta", "") or ""
    except Exception as exc:
        logger.debug("anti-audit: CSS/meta probe failed: %s", exc)
    return out
|
|
|
|
|
|
async def _is_tdm_protected(page: Page) -> tuple[bool, str]:
    """Convenience wrapper — TDM if bot_protection or tdm_meta present.

    Returns:
        ``(True, reason)`` when ``_detect_anti_audit`` reports a
        bot-protection product or a noai/notdm meta-tag, otherwise
        ``(False, "")``.  Bot protection takes precedence over the meta-tag.
    """
    markers = await _detect_anti_audit(page)
    bot = markers["bot_protection"]
    if bot:
        # Labels from the asset-domain probe already say "(via ...)";
        # tagging those as "(cookie marker)" as well would be wrong.
        if "(via " in bot:
            return True, bot
        return True, f"{bot} (cookie marker)"
    if markers["tdm_meta"]:
        return True, f"TDM opt-out meta-tag: {markers['tdm_meta']}"
    return False, ""
|
|
|
|
|
|
# Keys of parse_modal_text() that map 1:1 onto string fields of VendorDetail.
_SCALAR_DETAIL_FIELDS = (
    "description", "processing_company", "address",
    "retention", "opt_out_url", "privacy_url",
)


def _unique_labelled_buttons(btn_infos: list) -> list:
    """Keep the first entry per non-empty (stripped) label, in order."""
    seen_labels: set[str] = set()
    unique = []
    for info in btn_infos:
        lbl = info.get("label", "").strip()
        if lbl and lbl not in seen_labels:
            seen_labels.add(lbl)
            unique.append(info)
    return unique


async def _read_vendor_detail(
    page: Page, label: str, timeout: float
) -> tuple[bool, Optional[VendorDetail]]:
    """Click the info trigger for *label* and read the detail view.

    Returns:
        ``(False, None)`` if no trigger was clicked; otherwise
        ``(True, detail)`` where *detail* is None when the captured text
        was too short (<= 50 chars) to be a detail view.
    """
    clicked = await asyncio.wait_for(
        page.evaluate(_CLICK_INFO_BY_LABEL_JS, label),
        timeout=timeout,
    )
    if not clicked:
        return False, None
    await page.wait_for_timeout(1200)  # let the detail view render
    text = await asyncio.wait_for(
        page.evaluate(_EXTRACT_MODAL_TEXT_JS),
        timeout=timeout,
    )
    if not (isinstance(text, str) and len(text) > 50):
        return True, None
    fields = parse_modal_text(text)
    scalars = {k: fields[k] for k in _SCALAR_DETAIL_FIELDS if k in fields}
    detail = VendorDetail(
        name=label,
        raw_text=text[:3000],
        purposes=fields.get("purposes", []),
        technologies=fields.get("technologies", []),
        cookies=fields.get("cookies", []),
        **scalars,
    )
    return True, detail


async def _close_detail_modal(page: Page) -> None:
    """Close the detail view; press Escape if the close script fails."""
    # NOTE(review): Escape is only pressed when the script raises or times
    # out, not when it returns false (no close control found) — confirm
    # that is intended.
    try:
        await asyncio.wait_for(page.evaluate(_CLOSE_MODAL_JS), timeout=2.0)
    except Exception:
        await page.keyboard.press("Escape")
    await page.wait_for_timeout(500)


async def extract_vendor_details(
    browser: Browser,
    url: str,
    accept_selector: Optional[str] = None,
    max_vendors: int = 50,
    per_vendor_timeout: float = 6.0,
) -> list[VendorDetail]:
    """Phase D: open settings-view of banner, click each Info-button, capture modal.

    P50f: respect TDM opt-out (Akamai/Cloudflare/Datadome/PerimeterX) — skip
    the click-through entirely when active anti-bot protection is detected.

    Args:
        browser: Playwright browser; a fresh context is created and closed here.
        url: Page to load.
        accept_selector: Currently unused — the Accept step is skipped and
            the settings-view is opened directly.  Kept for callers.
        max_vendors: Upper bound on info buttons that are clicked.
        per_vendor_timeout: Seconds allowed for each click / extract script.

    Returns:
        One VendorDetail per vendor whose detail text could be captured.
        If navigation fails the list is empty.  If a TDM opt-out is detected
        the list holds a single sentinel entry named ``__TDM_OPTOUT__``.
    """
    details: list[VendorDetail] = []
    ctx = await browser.new_context(
        user_agent=USER_AGENT,
        viewport={"width": 1920, "height": 1080},
        locale="de-DE",
        timezone_id="Europe/Berlin",
    )

    try:
        # Inside the try so the context is closed even if this fails.
        page = await ctx.new_page()

        try:
            await page.goto(url, wait_until="load", timeout=30000)
        except Exception as e:
            logger.warning("Detail-Phase: page.goto failed: %s", e)
            return details
        await page.wait_for_timeout(3500)

        # P50f: Respect TDM opt-out (§44b UrhG). If site uses active
        # anti-bot protection, do NOT attempt click-through scraping.
        tdm_protected, tdm_reason = await _is_tdm_protected(page)
        if tdm_protected:
            logger.info(
                "Detail-Phase: TDM opt-out detected (%s) — skipping vendor "
                "detail-extract to respect §44b UrhG", tdm_reason
            )
            # Emit a sentinel detail entry so caller can flag this in the report
            details.append(VendorDetail(
                name="__TDM_OPTOUT__",
                description=f"Phase G übersprungen — Site nutzt aktive Bot-Detection ({tdm_reason}). TDM-Vorbehalt nach §44b UrhG respektiert.",
            ))
            return details

        # Step 1: Fresh context — banner should already be open. Skip
        # the Accept step and go directly to 'Einstellungen' (avoids
        # closing-then-reopening which Mercedes makes hard).

        # Step 2b (P50b): click 'Einstellungen' to reveal vendor list with
        # info-icons. Without this Mercedes only shows the initial 3 buttons.
        if await _open_settings_view(page):
            logger.info("Detail-Phase: opened settings-view")
        else:
            # If banner is not open, try to re-open it first then settings
            await _try_reopen_banner(page)
            await page.wait_for_timeout(1500)
            await _open_settings_view(page)
            await page.wait_for_timeout(2000)

        # Step 2c (P50d): expand all category accordions so each vendor's
        # info-icon becomes visible. Mercedes collapses categories by default.
        await _expand_all_categories(page)
        await page.wait_for_timeout(1000)

        # Step 3: collect info-button candidates
        btn_infos = await page.evaluate(_FIND_INFO_BUTTONS_JS)
        if not isinstance(btn_infos, list):
            return details
        unique = _unique_labelled_buttons(btn_infos)
        logger.info("Detail-Phase: found %d info-button candidates (deduped from %d)",
                    len(unique), len(btn_infos))

        # Step 4: click each, extract modal, close
        for btn in unique[:max_vendors]:
            label = btn["label"]
            try:
                clicked, detail = await _read_vendor_detail(
                    page, label, per_vendor_timeout
                )
                if not clicked:
                    continue
                if detail is not None:
                    details.append(detail)
                await _close_detail_modal(page)
            except (asyncio.TimeoutError, PlaywrightTimeout):
                logger.warning("Detail-Phase: vendor '%s' timed out", label[:40])
            except Exception as e:
                logger.warning("Detail-Phase: vendor '%s' failed: %s", label[:40], e)

        logger.info("Detail-Phase complete: %d vendors with details", len(details))
    finally:
        await ctx.close()
    return details
|