Files
breakpilot-compliance/consent-tester/services/category_tester.py
T
Benjamin Admin 57c0f940a2
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / nodejs-build (push) Successful in 2m19s
CI / test-go (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
feat(consent+report): P56-P67 Mercedes-Audit-Cycle (Anti-Audit, Phase G Vendors, Cookie-Behavior-Validator + 5 Mail-Polish-Items) [migration-approved]
P56  Anti-Auditing-Detection als constructive Compliance-Finding (Audit-API-
     Empfehlung statt Anklage, weil Mercedes berechtigt Bots blockiert)
P57  Phase G vendor_details Union mit cmp_vendors -> 42 Anbieter sichtbar
P58  Anti-Audit-Detection robuster (Script-Domain-Check + Settings-spezifisch)
P59  Cookie-Behavior-Validator (4 Layer, 3-Tier-Severity: MEDIUM=Kategorie-
     Mismatch / HIGH=Zweck-Mismatch / CRITICAL=beide=Vorsatz-Indiz)
     + Open Cookie Database (CC0) als Library-Seed (2264 Cookies)
P59b Cookie-Behavior in Banner-Check verdrahtet + Mail-Block (BUGFIX:
     SessionLocal selbst oeffnen, db war im Background-Task nicht im Scope)

Mail-Polish nach Mercedes-Review:
P63  Banner-Footer-Links auch im wb7-link/role=link erkennen (Shadow-DOM-
     Walker label-based statt nur <a href>)
P64  Re-Access-Severity: MEDIUM statt HIGH, wenn Footer "Einstellungen" oder
     Mercedes-typisch existiert; OEM-Footer-Detection (wb7-footer)
P65  Text-Truncation: Word-Boundary statt Zeichen-Cut (kein "einfa"-Bruch
     mehr in Sofortmassnahmen)
P66  GF-Aktionen: Service-Zweck vs Cookie-Zweck explizit erklaert
     (haeufige Verwechslung Marketing/GF: "Akamai-Beschreibung" != Cookie-
     Zweck pro DSK-OH 2024)
P67  Stirring-Finding mit "Verlust-Framing"-Erklaerung + Alt-vs-Neutral-
     Beispiel, statt nur EDPB-Fachbegriff

Compliance-Advisor FAQ (admin agent-core/soul):
  + CNIL/EDPB Top-Bussgelder (Google 100M, Meta 60M, Amazon 35M)
  + Deutsche Praezedenz (LG Muenchen Google Fonts, EuGH Planet49, BGH I ZR 7/16)
  + 4 Risiko-Pfade (Bussgeld/Abmahnung/Sammelklage/NOYB) + Berechnungs-Methodik

Document-Generator Templates: AGB-DE (142), Impressum (140), Widerrufs-
formular-Anlage (143), DSR-Process-Dedup (139), Cookie-Library (144).

Architektur: doc_action_mappings.py + banner_dom_walkers.py +
cookie_behavior_validator.py + vendor_detail_extractor.py rausgezogen,
um die 500-LOC-Caps in agent_doc_check_report.py und
banner_text_checker.py einzuhalten.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 06:28:25 +02:00

452 lines
18 KiB
Python

"""
Category Tester — tests individual cookie consent categories.
Tests each category in isolation: only "Statistics" on, only "Marketing" on, etc.
Detects miscategorization: e.g., Facebook Pixel loading when only Statistics is enabled.
"""
import logging
from dataclasses import dataclass, field
from playwright.async_api import Page, BrowserContext
from services.banner_detector import BannerInfo, click_button
from services.script_analyzer import find_tracking_services, Violation
logger = logging.getLogger(__name__)
# Which tracking service belongs to which consent category
SERVICE_CATEGORY_MAP: dict[str, str] = {
    # Flattened from the per-category groups below into
    # {service name: consent category}; insertion order follows the groups.
    service: category
    for category, services in (
        # Statistics / Analytics
        ("statistics", (
            "Google Analytics",
            "Matomo",
            "Plausible Analytics",
            "Hotjar",
            "Microsoft Clarity",
            "etracker",
            "Heap Analytics",
            "Amplitude",
            "Mixpanel",
            "PostHog",
            "Mouseflow",
            "Crazy Egg",
            "Lucky Orange",
            "FullStory",
        )),
        # Marketing / Advertising
        ("marketing", (
            "Meta/Facebook Pixel",
            "Google Ads",
            "Google Ads/DoubleClick",
            "TikTok Pixel",
            "LinkedIn Insight",
            "Pinterest Tag",
            "Criteo",
            "Taboola",
            "Outbrain",
            "Amazon Ads",
            "Bing/Microsoft Ads",
            "Salesforce Pardot",
        )),
        # Functional
        ("functional", (
            "Intercom",
            "Zendesk",
            "Tidio Chat",
            "Crisp Chat",
            "LiveChat",
            "Freshdesk/Freshchat",
            "HelpScout Beacon",
        )),
    )
    for service in services
}
# Display labels (German, user-facing) for the internal category keys.
# Lookups in this file use CATEGORY_LABELS.get(key, key), so a category key
# without an entry here is shown under its raw key.
CATEGORY_LABELS = {
    "statistics": "Statistik / Analytics",
    "marketing": "Marketing / Werbung",
    "functional": "Funktional / Komfort",
    "social_media": "Social Media",
}
# CMP-specific category selectors, keyed by the provider name found in
# BannerInfo.provider. Each entry holds:
#   settings_button  selector that opens the category panel (None = the
#                    banner has no separate settings panel)
#   save_button      selector that stores the current selection
#   categories       {category key: selector of that category's toggle}
# Category selectors may list comma-separated alternatives; the code in this
# file only uses the first alternative when probing/toggling.
CMP_CATEGORY_CONFIG: dict[str, dict] = {
    "Cookiebot": {
        "settings_button": "#CybotCookiebotDialogBodyButtonDetails",
        "save_button": "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowallSelection",
        "categories": {
            "statistics": "#CybotCookiebotDialogBodyLevelButtonStatistics",
            "marketing": "#CybotCookiebotDialogBodyLevelButtonMarketing",
            "preferences": "#CybotCookiebotDialogBodyLevelButtonPreferences",
        },
    },
    "OneTrust": {
        "settings_button": "#onetrust-pc-btn-handler, .ot-sdk-show-settings",
        "save_button": ".save-preference-btn-handler, #onetrust-accept-btn-handler",
        "categories": {
            "statistics": ".ot-switch[data-ot-category='C0002'] input, #ot-group-id-C0002",
            "marketing": ".ot-switch[data-ot-category='C0004'] input, #ot-group-id-C0004",
            "functional": ".ot-switch[data-ot-category='C0003'] input, #ot-group-id-C0003",
        },
    },
    "Usercentrics": {
        "settings_button": "[data-testid='uc-more-information-button'], button:has-text('Mehr Informationen')",
        "save_button": "[data-testid='uc-save-button']",
        "categories": {
            "statistics": "[data-testid='uc-category-statistics'] input",
            "marketing": "[data-testid='uc-category-marketing'] input",
            "functional": "[data-testid='uc-category-functional'] input",
        },
    },
    "Didomi": {
        "settings_button": "#didomi-notice-learn-more-button, .didomi-learn-more-button",
        "save_button": ".didomi-components-button--primary:has-text('Auswahl speichern'), #didomi-notice-agree-button",
        "categories": {
            "statistics": "[data-purpose='analytics_purposes'] input, [data-purpose='measure'] input",
            "marketing": "[data-purpose='advertising_purposes'] input, [data-purpose='ads'] input",
        },
    },
    # P19: TYPO3 dp-cookieconsent (Dirk Persky) — based on osano cookieconsent.
    # The banner shows its checkboxes directly; there is NO settings modal and
    # NO provider details. Detection: checkbox IDs dp--cookie-*. A provider /
    # cookie list is systematically missing -> explicit finding.
    "dp-cookieconsent": {
        "settings_button": None,
        "save_button": "a.cc-allow:not(.cc-allow-all), button:has-text('Speichern')",
        "categories": {
            "statistics": "#dp--cookie-statistics",
            "marketing": "#dp--cookie-marketing",
        },
    },
    "Cookie Consent (Insites)": {  # alias — banner_detector reports dp-cookieconsent under this name
        "settings_button": None,
        "save_button": "a.cc-allow:not(.cc-allow-all), button:has-text('Speichern')",
        "categories": {
            "statistics": "#dp--cookie-statistics, input[id*='statistic' i]",
            "marketing": "#dp--cookie-marketing, input[id*='marketing' i]",
        },
    },
}
# Selectors used to check whether a banner SHOWS provider/cookie details
# after a category has been selected (per-category vendor listing).
# The `i` flag makes each attribute match case-insensitive.
_PROVIDER_DETAIL_SELECTORS = (
    "[class*='cookie-list' i]",
    "[class*='cookielist' i]",
    "[class*='vendor-list' i]",
    "[class*='vendor_list' i]",
    "[class*='provider-list' i]",
    "[class*='cookie-detail' i]",
    "[class*='vendor-detail' i]",
    "[class*='cookie-item' i]",
    "[class*='vendor-item' i]",
    "table[class*='cookie' i]",
    "table[class*='vendor' i]",
    "ul[class*='cookie' i] li",
)
async def _provider_details_visible(page, category_label: str) -> bool:
    """Return True if the page shows provider/cookie detail elements.

    Heuristic: some element matches one of ``_PROVIDER_DETAIL_SELECTORS`` and
    has a bounding box wider than 30px and taller than 10px. The check is
    page-wide; it is not scoped to a particular category.

    Args:
        page: Playwright page to inspect.
        category_label: Label of the category under test; only used to give
            the log message context when the check itself fails.

    Returns:
        True when a sufficiently large matching element exists, False when
        none exists or the in-page evaluation failed.
    """
    try:
        visible = await page.evaluate(
            """(selectors) => {
                for (const sel of selectors) {
                    const els = document.querySelectorAll(sel);
                    for (const el of els) {
                        const r = el.getBoundingClientRect();
                        if (r.width > 30 && r.height > 10) return true;
                    }
                }
                return false;
            }""",
            list(_PROVIDER_DETAIL_SELECTORS),
        )
    except Exception as e:
        # Best-effort check: stay non-fatal, but leave a trace so a failed
        # evaluation can be told apart from a banner without details.
        logger.warning(
            "Provider-detail check for '%s' failed: %s", category_label, e,
        )
        return False
    return bool(visible)
# Generic category keywords for fallback detection.
# Keywords are lower-case and matched as substrings of the lower-cased toggle
# label; categories are tried in the order listed and the first match wins.
# NOTE(review): short/generic entries such as "social" or "einstellungen"
# may also hit unrelated labels — confirm against real banners.
CATEGORY_KEYWORDS = {
    "statistics": ["statistik", "analytics", "analyse", "statistics", "messung", "reichweite"],
    "marketing": ["marketing", "werbung", "advertising", "targeting", "remarketing", "anzeigen"],
    "functional": ["funktional", "functional", "preferences", "praeferenz", "komfort", "einstellungen"],
    "social_media": ["social media", "soziale medien", "social", "teilen"],
}
@dataclass
class CategoryInfo:
    """A consent category detected in a cookie banner."""

    # Internal category key, e.g. "statistics" or "marketing".
    name: str
    # Label shown for the category (from CATEGORY_LABELS or the banner text).
    label: str
    # CSS selector of the category toggle, or a "shadow-toggle:<label>"
    # marker for toggles found inside shadow roots.
    selector: str
@dataclass
class CategoryTestResult:
    """Outcome of testing one consent category in isolation."""

    # Internal category key and its display label.
    category: str
    category_label: str
    # Request URLs recorded during the test, de-duplicated and capped.
    scripts_loaded: list[str] = field(default_factory=list)
    # Names of the cookies present in the browser context after the test.
    cookies_set: list[str] = field(default_factory=list)
    # Tracking services recognised in scripts_loaded.
    tracking_services: list[str] = field(default_factory=list)
    # Findings as dicts with the keys "service", "severity", "text",
    # "expected_category", "actual_category" (and optionally "legal_ref").
    violations: list[dict] = field(default_factory=list)
    # P19: per-category transparency in the banner
    provider_details_visible: bool = False
# JavaScript evaluated in the page: walks shadow roots recursively (bounded
# by a depth guard) and returns [{label, host}] for every checkbox/switch
# that has a non-empty <label> text or aria-label.
_SHADOW_CATEGORY_SCAN_JS = """
() => {
    const out = [];
    function walk(root, depth) {
        if (depth > 6) return;
        for (const el of root.querySelectorAll('*')) {
            if (el.shadowRoot) {
                const sr = el.shadowRoot;
                const inputs = sr.querySelectorAll('input[type=checkbox], [role=switch], [role=checkbox]');
                for (const i of inputs) {
                    const lbl = (i.closest('label')?.textContent || i.getAttribute('aria-label') || '').trim();
                    if (lbl.length > 0) {
                        out.push({label: lbl.slice(0,60), host: el.tagName.toLowerCase()});
                    }
                }
                walk(sr, depth + 1);
            }
        }
    }
    walk(document, 0);
    return out;
}
"""

# JavaScript evaluated in the page: returns [{text, id, selector}] for
# light-DOM checkboxes/switches/toggles that carry a non-empty label text.
# The id is CSS-escaped so ids with special characters still yield a valid
# "#id" selector.
_GENERIC_TOGGLE_SCAN_JS = """
() => {
    const elements = document.querySelectorAll(
        'input[type="checkbox"], [role="switch"], [class*="toggle"], [class*="switch"]'
    );
    return [...elements].map(el => ({
        text: (el.closest('label')?.textContent || el.getAttribute('aria-label') || '').trim(),
        id: el.id || '',
        selector: el.id ? '#' + CSS.escape(el.id) : '',
    })).filter(e => e.text.length > 0);
}
"""


def _css_string_escape(value: str) -> str:
    """Escape *value* for use inside a double-quoted CSS string."""
    return (
        value.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\a ")
    )


def _category_for_label(text: str) -> "str | None":
    """Return the first category whose keyword occurs in *text*, else None."""
    lowered = text.lower()
    for cat_name, keywords in CATEGORY_KEYWORDS.items():
        if any(kw in lowered for kw in keywords):
            return cat_name
    return None


def _add_unique(found: list, info) -> None:
    """Append *info* unless an entry with the same name and selector exists."""
    if any(c.name == info.name and c.selector == info.selector for c in found):
        return
    found.append(info)


async def _detect_configured_categories(page: Page, provider) -> list[CategoryInfo]:
    """Detect categories via the provider's CMP_CATEGORY_CONFIG entry."""
    config = CMP_CATEGORY_CONFIG.get(provider)
    if not config:
        return []

    # Open the settings panel first so the category toggles are in the DOM.
    settings_button = config.get("settings_button")
    if settings_button:
        try:
            await click_button(page, settings_button, timeout=3000)
            await page.wait_for_timeout(1000)
        except Exception as e:
            # Keep going: the toggles may already be present, and the
            # fallback strategies must still get their chance.
            logger.warning("Could not open settings panel for %s: %s", provider, e)

    found: list[CategoryInfo] = []
    for cat_name, selector in config.get("categories", {}).items():
        try:
            # Only the first comma-separated alternative is probed.
            if await page.locator(selector.split(",")[0].strip()).count() > 0:
                found.append(CategoryInfo(
                    name=cat_name,
                    label=CATEGORY_LABELS.get(cat_name, cat_name),
                    selector=selector,
                ))
        except Exception as e:
            logger.debug("Probing category %s failed: %s", cat_name, e)
            continue
    return found


async def _detect_shadow_categories(page: Page) -> list[CategoryInfo]:
    """P22: detect category toggles inside shadow roots (web-component CMPs)."""
    found: list[CategoryInfo] = []
    try:
        shadow_cats = await page.evaluate(_SHADOW_CATEGORY_SCAN_JS)
        for sc in (shadow_cats or []):
            cat_name = _category_for_label(sc["label"])
            if cat_name is None:
                continue
            label = sc["label"][:50]
            # Marker selector: the toggle is later located by its label text.
            _add_unique(found, CategoryInfo(
                name=cat_name,
                label=label,
                selector=f"shadow-toggle:{label}",
            ))
        if found:
            logger.info("P22: %d shadow-DOM categories detected", len(found))
    except Exception as e:
        logger.warning("Shadow-DOM category detection failed: %s", e)
    return found


async def _detect_generic_categories(page: Page) -> list[CategoryInfo]:
    """Detect categories from labelled toggles/checkboxes in the light DOM."""
    found: list[CategoryInfo] = []
    try:
        toggles = await page.evaluate(_GENERIC_TOGGLE_SCAN_JS)
        for toggle in (toggles or []):
            cat_name = _category_for_label(toggle["text"])
            if cat_name is None:
                continue
            # Without an id, fall back to an aria-label substring selector;
            # the label text is escaped so quotes cannot break the selector.
            sel = toggle["selector"] or (
                f'[aria-label*="{_css_string_escape(toggle["text"][:20])}"]'
            )
            _add_unique(found, CategoryInfo(
                name=cat_name,
                label=toggle["text"][:50],
                selector=sel,
            ))
    except Exception as e:
        logger.warning("Generic category detection failed: %s", e)
    return found


async def detect_categories(page: Page, banner: BannerInfo) -> list[CategoryInfo]:
    """Detect the cookie categories offered by the CMP on *page*.

    Three strategies are tried in order; the first one that yields at least
    one category wins:

    1. CMP-specific selectors from ``CMP_CATEGORY_CONFIG`` (opens the
       settings panel first when the config names a settings button).
    2. P22 shadow-DOM scan for labelled checkboxes/switches.
    3. Generic scan of labelled toggles/checkboxes in the light DOM.

    In strategies 2 and 3 the label text is mapped to a category via
    ``CATEGORY_KEYWORDS``. Exact duplicates (same category and selector) are
    reported once.

    Args:
        page: Playwright page showing the banner.
        banner: Detected banner; ``banner.provider`` selects the CMP config.

    Returns:
        The detected categories, possibly empty.
    """
    provider = banner.provider

    categories = await _detect_configured_categories(page, provider)
    if not categories:
        categories = await _detect_shadow_categories(page)
    if not categories:
        categories = await _detect_generic_categories(page)

    logger.info("Detected %d categories for %s", len(categories), provider)
    return categories
# JavaScript evaluated in the page: finds the first checkbox/switch inside a
# shadow root whose label contains the given text and switches it on.
# A toggle counts as "on" when its `checked` property is true or its
# aria-checked attribute is "true" (ARIA switches have no `checked` property).
# Returns true when a toggle was clicked.
_SHADOW_TOGGLE_JS = """(pat) => {
    const lbl = pat.toLowerCase();
    function walk(root) {
        for (const el of root.querySelectorAll('*')) {
            if (el.shadowRoot) {
                const inputs = el.shadowRoot.querySelectorAll(
                    'input[type=checkbox], [role=switch], [role=checkbox]');
                for (const i of inputs) {
                    const t = (i.closest('label')?.textContent || i.getAttribute('aria-label') || '').toLowerCase();
                    const on = i.checked === true || i.getAttribute('aria-checked') === 'true';
                    if (t.includes(lbl) && !on) { i.click(); return true; }
                }
                if (walk(el.shadowRoot)) return true;
            }
        }
        return false;
    }
    return walk(document);
}"""

# JavaScript evaluated in the page: clicks the first button inside a shadow
# root whose text looks like "save"/"confirm" (German or English; both the
# umlaut and the transliterated spelling of "bestaetigen" are accepted).
_SHADOW_SAVE_JS = """() => {
    const SAVE = /speichern|save|best(?:ae|ä)tigen|confirm/i;
    function walk(root) {
        for (const el of root.querySelectorAll('*')) {
            if (el.shadowRoot) {
                for (const b of el.shadowRoot.querySelectorAll('button, [role=button]')) {
                    if (SAVE.test(b.textContent || '')) { b.click(); return true; }
                }
                if (walk(el.shadowRoot)) return true;
            }
        }
        return false;
    }
    return walk(document);
}"""


async def _apply_shadow_category(page, category, wait_ms: int) -> None:
    """P22: enable *category* inside a shadow-DOM banner and save the choice."""
    label_pat = category.selector[len("shadow-toggle:"):]
    try:
        toggled = await page.evaluate(_SHADOW_TOGGLE_JS, label_pat)
        if not toggled:
            logger.debug(
                "No switched-off shadow toggle matched %r", label_pat,
            )
        await page.wait_for_timeout(500)
        # Save via a "Speichern" / "Save" style button inside a shadow root.
        await page.evaluate(_SHADOW_SAVE_JS)
        await page.wait_for_timeout(wait_ms)
    except Exception as e:
        logger.warning("Shadow-toggle for %s failed: %s", category.name, e)


async def _apply_configured_category(page, config: dict, category, wait_ms: int) -> None:
    """Enable only *category* using the CMP-specific selectors in *config*."""
    # Open settings
    settings_button = config.get("settings_button")
    if settings_button:
        await click_button(page, settings_button, timeout=3000)
        await page.wait_for_timeout(1000)

    # Disable ALL categories first
    for cat_sel in config.get("categories", {}).values():
        try:
            el = page.locator(cat_sel.split(",")[0].strip()).first
            # Skip toggles this banner does not have; is_checked() would
            # otherwise wait for the element until the locator timeout.
            if await el.count() == 0:
                continue
            if await el.is_checked():
                await el.click()
        except Exception:
            continue

    # Enable ONLY the target category
    try:
        el = page.locator(category.selector.split(",")[0].strip()).first
        if not await el.is_checked():
            await el.click()
    except Exception:
        logger.warning("Could not toggle category %s", category.name)

    # Save selection
    save_button = config.get("save_button")
    if save_button:
        await click_button(page, save_button, timeout=3000)

    # Give consent-gated scripts time to load.
    await page.wait_for_timeout(wait_ms)


async def test_single_category(
    context: BrowserContext,
    url: str,
    category: CategoryInfo,
    banner: BannerInfo,
    wait_ms: int = 5000,
) -> CategoryTestResult:
    """Test a single category in isolation: enable only this one, disable others.

    Opens *url* in a new page of *context*, applies the consent choice,
    records the requests and cookies that follow and derives findings:
    a missing provider/cookie listing in the banner (P19) and tracking
    services that load although they belong to a different category.

    Args:
        context: Browser context in which the page is opened; its cookies
            are read for the result.
        url: Page to load.
        category: Category to enable. A selector starting with
            ``shadow-toggle:`` is handled by the shadow-DOM path, anything
            else by the CMP config of ``banner.provider``.
        banner: Detected banner; ``banner.provider`` selects the CMP config.
        wait_ms: Time to wait after saving so consent-gated scripts can load.

    Returns:
        The result for this category. Errors are logged instead of raised;
        in that case the result holds whatever was collected up to the error.
    """
    result = CategoryTestResult(
        category=category.name,
        category_label=category.label,
    )
    page = None
    try:
        page = await context.new_page()
        scripts: list[str] = []
        page.on("request", lambda req: _collect(req, scripts))
        try:
            await page.goto(url, wait_until="networkidle", timeout=20000)
        except Exception:
            # Pages that never go network-idle: retry with the plain load event.
            await page.goto(url, wait_until="load", timeout=20000)
        await page.wait_for_timeout(2000)

        if category.selector.startswith("shadow-toggle:"):
            # P22: shadow-DOM toggle for web-component CMPs (Mercedes etc.).
            # The marker is not a CSS selector, so the config path below
            # must not run for it.
            await _apply_shadow_category(page, category, wait_ms)
        else:
            config = CMP_CATEGORY_CONFIG.get(banner.provider)
            if config:
                await _apply_configured_category(page, config, category, wait_ms)
            # NOTE(review): without a CMP config nothing is toggled here, so
            # categories from the generic fallback are measured without any
            # consent interaction — confirm whether that is intended.

        # Collect results
        result.scripts_loaded = _dedup_scripts(scripts)
        result.cookies_set = [c.get("name", "") for c in await context.cookies()]
        result.tracking_services = find_tracking_services(result.scripts_loaded)

        # P19: check whether the banner makes provider/cookie details visible
        # for this category — with dp-cookieconsent (Safetykon) always False
        # -> critical violation (Art. 7 GDPR: no informed consent without a
        # per-category detail listing).
        result.provider_details_visible = await _provider_details_visible(
            page, category.label,
        )
        if not result.provider_details_visible:
            result.violations.append({
                "service": "Cookie-Banner",
                "severity": "HIGH",
                "text": (f"Kategorie '{category.label}' zeigt keine "
                         f"Provider-/Cookie-Details im Banner — Nutzer "
                         f"kann nicht informiert einwilligen "
                         f"(Art. 7 Abs. 1 DSGVO)."),
                "legal_ref": "Art. 7 Abs. 1 DSGVO, EDPB Guidelines 2/2023, "
                             "DSK-OH Telemedien 2024",
                "expected_category": category.name,
                "actual_category": category.name,
            })

        # Find violations: services that don't belong to this category
        for service in result.tracking_services:
            expected_cat = SERVICE_CATEGORY_MAP.get(service)
            if expected_cat and expected_cat != category.name:
                result.violations.append({
                    "service": service,
                    "severity": "HIGH",
                    "text": f"{service} laedt bei '{category.label}' — gehoert aber zu '{CATEGORY_LABELS.get(expected_cat, expected_cat)}'",
                    "expected_category": expected_cat,
                    "actual_category": category.name,
                })
    except Exception as e:
        logger.error("Category test failed for %s: %s", category.name, e)
    finally:
        # Always release the page, also when the test failed midway.
        if page is not None:
            try:
                await page.close()
            except Exception as e:
                logger.debug("Closing page for %s failed: %s", category.name, e)
    return result
def _collect(request, scripts: list[str]) -> None:
    """Record the URL of *request* in *scripts* if its type is of interest.

    Only script, image, xhr and fetch requests are recorded; every other
    resource type is ignored.
    """
    kind = request.resource_type
    if kind not in ("script", "image", "xhr", "fetch"):
        return
    scripts.append(request.url)
def _dedup_scripts(scripts: list[str], limit: int = 30) -> list[str]:
    """Return the first URL seen per host, capped at *limit* entries.

    The host is taken from the URL's network location. URLs without one
    (scheme-less or otherwise unparsable strings) are de-duplicated by their
    full text, so unrelated relative URLs are not collapsed into one.

    Args:
        scripts: Request URLs in the order they were recorded.
        limit: Maximum number of URLs to return (default 30).

    Returns:
        At most *limit* URLs, one per host, in first-seen order.
    """
    from urllib.parse import urlsplit

    seen: set[str] = set()
    unique: list[str] = []
    for url in scripts:
        try:
            host = urlsplit(url).netloc
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets).
            host = ""
        key = host or url
        if key in seen:
            continue
        seen.add(key)
        unique.append(url)
    # max() guards against a negative limit slicing from the end.
    return unique[:max(limit, 0)]