Files
breakpilot-compliance/consent-tester/services/dsi_helpers.py
T
Benjamin Admin 57c0f940a2
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / nodejs-build (push) Successful in 2m19s
CI / test-go (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
feat(consent+report): P56-P67 Mercedes-Audit-Cycle (Anti-Audit, Phase G Vendors, Cookie-Behavior-Validator + 5 Mail-Polish-Items) [migration-approved]
P56  Anti-Auditing-Detection als constructive Compliance-Finding (Audit-API-
     Empfehlung statt Anklage, weil Mercedes berechtigt Bots blockiert)
P57  Phase G vendor_details Union mit cmp_vendors -> 42 Anbieter sichtbar
P58  Anti-Audit-Detection robuster (Script-Domain-Check + Settings-spezifisch)
P59  Cookie-Behavior-Validator (4 Layer, 3-Tier-Severity: MEDIUM=Kategorie-
     Mismatch / HIGH=Zweck-Mismatch / CRITICAL=beide=Vorsatz-Indiz)
     + Open Cookie Database (CC0) als Library-Seed (2264 Cookies)
P59b Cookie-Behavior in Banner-Check verdrahtet + Mail-Block (BUGFIX:
     SessionLocal selbst oeffnen, db war im Background-Task nicht im Scope)

Mail-Polish nach Mercedes-Review:
P63  Banner-Footer-Links auch im wb7-link/role=link erkennen (Shadow-DOM-
     Walker label-based statt nur <a href>)
P64  Re-Access-Severity: MEDIUM statt HIGH, wenn Footer "Einstellungen" oder
     Mercedes-typisch existiert; OEM-Footer-Detection (wb7-footer)
P65  Text-Truncation: Word-Boundary statt Zeichen-Cut (kein "einfa"-Bruch
     mehr in Sofortmassnahmen)
P66  GF-Aktionen: Service-Zweck vs Cookie-Zweck explizit erklaert
     (haeufige Verwechslung Marketing/GF: "Akamai-Beschreibung" != Cookie-
     Zweck pro DSK-OH 2024)
P67  Stirring-Finding mit "Verlust-Framing"-Erklaerung + Alt-vs-Neutral-
     Beispiel, statt nur EDPB-Fachbegriff

Compliance-Advisor FAQ (admin agent-core/soul):
  + CNIL/EDPB Top-Bussgelder (Google 100M, Meta 60M, Amazon 35M)
  + Deutsche Praezedenz (LG Muenchen Google Fonts, EuGH Planet49, BGH I ZR 7/16)
  + 4 Risiko-Pfade (Bussgeld/Abmahnung/Sammelklage/NOYB) + Berechnungs-Methodik

Document-Generator Templates: AGB-DE (142), Impressum (140), Widerrufs-
formular-Anlage (143), DSR-Process-Dedup (139), Cookie-Library (144).

Architektur: doc_action_mappings.py + banner_dom_walkers.py +
cookie_behavior_validator.py + vendor_detail_extractor.py rausgezogen,
um die 500-LOC-Caps in agent_doc_check_report.py und
banner_text_checker.py einzuhalten.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 06:28:25 +02:00

164 lines
6.7 KiB
Python

"""
DSI Discovery Helpers — resilient navigation, consent dismissal, PDF redirect detection.
Extracted from dsi_discovery.py to keep modules under 500 LOC.
"""
import logging
from playwright.async_api import Page, TimeoutError as PlaywrightTimeout
logger = logging.getLogger(__name__)
async def goto_resilient(
    page: Page,
    url: str,
    timeout: int = 60000,
    *,
    networkidle_cap_ms: int = 15000,
    render_wait_ms: int = 8000,
) -> None:
    """Navigate to ``url``, preferring ``networkidle`` without depending on it.

    SPAs like Zalando, BMW, Daimler never reach networkidle because of
    continuous background requests (analytics, lazy-loaded assets, polling),
    so waiting the full ``timeout`` for networkidle is wasted time on those
    sites. The first attempt is therefore capped at ``networkidle_cap_ms``.
    If it times out, the URL is loaded again with
    ``wait_until="domcontentloaded"`` using the full ``timeout``, followed by
    a fixed render wait of ``render_wait_ms``.

    Args:
        page: Playwright page to navigate.
        url: Target URL.
        timeout: Overall navigation timeout in milliseconds; also the upper
            bound for the networkidle attempt.
        networkidle_cap_ms: Maximum time in milliseconds spent waiting for
            networkidle before falling back.
        render_wait_ms: Pause in milliseconds after the domcontentloaded
            fallback so late-rendering components can appear.

    Raises:
        Only a ``PlaywrightTimeout`` from the first attempt is handled here.
        Any other error from that attempt, and any error from the fallback
        ``page.goto`` or the render wait, propagates to the caller.
    """
    networkidle_timeout = min(timeout, networkidle_cap_ms)
    try:
        await page.goto(url, wait_until="networkidle", timeout=networkidle_timeout)
    except PlaywrightTimeout:
        logger.debug("networkidle timeout for %s, falling back to domcontentloaded", url)
        await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
        # P23: web-component footers (Mercedes wbx, BMW similar) need longer
        # to render; the wait was raised from 5s to 8s so that Vue /
        # web-component footer links become visible.
        # NOTE(review): the render wait is applied on the fallback path only,
        # matching the behaviour described for this function — confirm.
        await page.wait_for_timeout(render_wait_ms)
async def try_dismiss_consent_banner(page: Page) -> bool:
    """Try to dismiss a cookie consent banner that blocks page content.

    Strategies are attempted in this order and the first success wins:

    1. Usercentrics (shadow DOM below ``#usercentrics-root``).
    2. Known accept-button selectors of regular-DOM CMPs
       (OneTrust, Cookiebot, Didomi, Borlabs, etc.).
    3. Sourcepoint and other iframe-based CMPs.
    4. Selectors provided by ``services.banner_detector``.
    5. Exact-text match on buttons in the main document.

    Every strategy is best effort: exceptions derived from ``Exception`` are
    logged at debug level and the next strategy is tried.

    Args:
        page: Playwright page that may be showing a consent banner.

    Returns:
        True if a banner was dismissed, otherwise False.
    """
    strategies = (
        _dismiss_usercentrics,
        _dismiss_known_cmp_selectors,
        _dismiss_iframe_cmp,
        _dismiss_via_banner_detector,
        _dismiss_by_button_text,
    )
    for strategy in strategies:
        if await strategy(page):
            return True
    return False


async def _dismiss_usercentrics(page: Page) -> bool:
    """Click an accept button inside the Usercentrics shadow root, if present."""
    try:
        if not await page.query_selector("#usercentrics-root"):
            return False
        # The buttons live in a shadow root, so they are searched and clicked
        # from inside the page rather than through a regular selector.
        clicked = await page.evaluate("""() => {
            const root = document.querySelector('#usercentrics-root');
            if (!root || !root.shadowRoot) return false;
            const buttons = root.shadowRoot.querySelectorAll('button');
            for (const btn of buttons) {
                const t = btn.textContent.trim().toLowerCase();
                if (t.includes('akzeptieren') || t.includes('accept')
                    || t.includes('zustimmen') || t.includes('agree')) {
                    btn.click();
                    return true;
                }
            }
            return false;
        }""")
        if not clicked:
            return False
        logger.info("Dismissed Usercentrics consent banner (shadow DOM)")
        await page.wait_for_timeout(2000)
        return True
    except Exception as e:
        logger.debug("Usercentrics consent dismiss: %s", e)
        return False


async def _dismiss_known_cmp_selectors(page: Page) -> bool:
    """Click the first visible accept button matching a known CMP selector."""
    accept_selectors = [
        "#onetrust-accept-btn-handler",
        "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
        "#didomi-notice-agree-button",
        "#BorlabsCookieBox .cookie-accept, [data-cookie-accept]",
        ".cmpboxbtn.cmpboxbtnyes",
        ".klaro .cm-btn-accept",
        ".cky-btn-accept",
        "[class*='qc-cmp2-summary-buttons'] button:first-child",
        "#tarteaucitronPersonalize2",
    ]
    for sel in accept_selectors:
        try:
            btn = page.locator(sel).first
            if await btn.count() > 0 and await btn.is_visible():
                await btn.click(timeout=3000)
                logger.info("Dismissed consent banner via %s", sel)
                await page.wait_for_timeout(2000)
                return True
        except Exception as e:
            # A failure on one selector must not stop the remaining ones.
            logger.debug("Consent selector %s failed: %s", sel, e)
    return False


async def _dismiss_iframe_cmp(page: Page) -> bool:
    """Search every child frame for a Sourcepoint or generic accept button."""
    try:
        # All iframes are searched because Sourcepoint generates dynamic IDs.
        for frame in page.frames:
            if frame == page.main_frame:
                continue
            try:
                # Sourcepoint accept button
                sp_btn = frame.locator(".sp_choice_type_11").first
                if await sp_btn.count() > 0 and await sp_btn.is_visible():
                    await sp_btn.click(timeout=5000)
                    logger.info("Dismissed Sourcepoint consent (iframe: %s)", frame.url[:80])
                    await page.wait_for_timeout(3000)
                    return True
                # Generic accept text in iframe
                for text in ["Akzeptieren", "Zustimmen", "Accept all", "Alle akzeptieren"]:
                    btn = frame.locator(f'button:has-text("{text}")').first
                    if await btn.count() > 0 and await btn.is_visible():
                        await btn.click(timeout=3000)
                        logger.info("Dismissed iframe consent via '%s'", text)
                        await page.wait_for_timeout(3000)
                        return True
            except Exception as e:
                # A failure in one frame must not stop the remaining frames.
                logger.debug("Iframe consent dismiss (frame skipped): %s", e)
                continue
    except Exception as e:
        logger.debug("Iframe consent dismiss: %s", e)
    return False


async def _dismiss_via_banner_detector(page: Page) -> bool:
    """Fall back to the accept selector reported by ``services.banner_detector``."""
    try:
        # The import stays inside the try so that an import failure is treated
        # like any other soft failure of this strategy.
        from services.banner_detector import detect_banner, click_button
        banner = await detect_banner(page)
        if banner and banner.accept_selector:
            clicked = await click_button(page, banner.accept_selector)
            if clicked:
                logger.info("Dismissed %s banner via banner_detector", banner.provider)
                await page.wait_for_timeout(2000)
                return True
    except Exception as e:
        logger.debug("Banner detector dismiss: %s", e)
    return False


async def _dismiss_by_button_text(page: Page) -> bool:
    """Click a main-document button whose trimmed text exactly matches an accept label."""
    accept_texts = [
        "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren",
        "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen",
        "Einverstanden", "Ich stimme zu", "Zustimmen und weiter",
    ]
    try:
        clicked = await page.evaluate("""(texts) => {
            // Check main document
            for (const btn of document.querySelectorAll('button, a[role="button"]')) {
                const t = (btn.textContent || '').trim();
                for (const target of texts) {
                    if (t === target) { btn.click(); return true; }
                }
            }
            return false;
        }""", accept_texts)
        if clicked:
            logger.info("Dismissed consent banner via generic text match")
            await page.wait_for_timeout(2000)
            return True
    except Exception as e:
        logger.debug("Generic text consent dismiss: %s", e)
    return False
def is_pdf_redirect(original_url: str, final_url: str) -> bool:
    """Check if the page redirected to a PDF or external storage.

    The decision is based on ``final_url`` alone. It is treated as a PDF or
    storage redirect when it contains one of the known object-storage host
    names, or when it points at a ``.pdf`` resource. The ``.pdf`` check is
    done on the raw URL and on its parsed path, so URLs with a query string
    or fragment (``.../dsi.pdf?download=1``, ``.../dsi.pdf#page=2``) are
    recognised as well. Matching is case-insensitive.

    Args:
        original_url: URL that was originally requested. Currently unused;
            kept so existing callers keep working.
        final_url: URL the browser ended up on.

    Returns:
        True if ``final_url`` looks like a PDF or an object-storage download.
    """
    from urllib.parse import urlsplit

    final_lower = final_url.lower()

    storage_hosts = (
        "storage.googleapis.com",
        "blob.core.windows.net",
        "s3.amazonaws.com",
    )
    if any(host in final_lower for host in storage_hosts):
        return True

    if final_lower.endswith(".pdf"):
        return True

    # A query string or fragment after the file name hides the extension from
    # a plain endswith() on the full URL, so inspect the path component too.
    try:
        path = urlsplit(final_lower).path
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. a broken IPv6 host).
        return False
    return path.endswith(".pdf")