c867478791
CI / loc-budget (push) Failing after 16s
Build + Deploy / build-admin-compliance (push) Successful in 14s
Build + Deploy / build-backend-compliance (push) Successful in 16s
Build + Deploy / build-ai-sdk (push) Successful in 20s
Build + Deploy / build-developer-portal (push) Successful in 12s
Build + Deploy / build-tts (push) Successful in 15s
Build + Deploy / build-document-crawler (push) Successful in 13s
Build + Deploy / build-dsms-gateway (push) Successful in 13s
Build + Deploy / build-dsms-node (push) Successful in 12s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / test-python-document-crawler (push) Successful in 26s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m49s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Successful in 45s
CI / test-python-backend (push) Successful in 38s
CI / test-python-dsms-gateway (push) Successful in 23s
CI / validate-canonical-controls (push) Successful in 15s
Build + Deploy / trigger-orca (push) Successful in 2m23s
Phase 1-2 of the closed quality loop: - GVL cache (consent-tester/services/gvl_cache.py): downloads and caches IAB Global Vendor List with 24h TTL, resolves vendor IDs to names, purposes, policy URLs, retention, country - Vendor extraction (consent_interceptor.py): extract_tcf_vendors() reads __tcfapi after accept phase, resolves via GVL - Scan response: tcf_vendors field added to /scan endpoint - VVT mapper (vendor_vvt_mapper.py): maps TCF vendors to VVT format with purpose labels, Rechtsgrundlage, Drittland detection - Vendor cross-check (banner_cookie_cross_check.py): checks all TCF vendors against DSI text — missing vendors, undocumented transfers - Compliance check integrates Step 3d: TCF vendors vs DSI Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
336 lines
14 KiB
Python
336 lines
14 KiB
Python
"""
Consent Scanner — Playwright-based 3-phase cookie consent test.

Phase A: Before consent (first visit)
Phase B: After rejecting consent
Phase C: After accepting consent

When a consent banner is detected, per-category tests (Phase D-F) run
after Phase C.
"""
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
|
|
from playwright.async_api import async_playwright, Page, BrowserContext
|
|
|
|
try:
|
|
from playwright_stealth import stealth_async
|
|
HAS_STEALTH = True
|
|
except ImportError:
|
|
HAS_STEALTH = False
|
|
|
|
from services.banner_detector import detect_banner, click_button, BannerInfo
|
|
from services.script_analyzer import (
|
|
classify_scripts, find_tracking_services,
|
|
find_violations_before_consent, find_violations_after_reject, Violation,
|
|
)
|
|
from services.banner_text_checker import check_banner_text as _check_banner_text
|
|
from services.consent_interceptor import (
|
|
INIT_SCRIPT as _INTERCEPTOR_INIT,
|
|
collect_intercepted_data as _collect_intercepted,
|
|
get_consent_state as _get_consent_state,
|
|
analyze_phase_data as _analyze_phase,
|
|
)
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# User-agent string handed to every browser context created by the scanner
# (desktop Chrome 120 on macOS).
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
|
|
|
|
|
|
@dataclass
class ConsentTestResult:
    """Aggregated findings of one consent scan.

    An instance starts out empty and is filled in phase by phase by
    ``run_consent_test``; fields belonging to a phase that did not run keep
    their empty defaults.
    """

    # Banner detection outcome, copied from the detect_banner() result.
    banner_detected: bool = False
    banner_provider: str = ""
    # Phase A: Before consent
    # Request URLs seen on first visit, as returned by _get_page_scripts().
    before_scripts: list[str] = field(default_factory=list)
    # Sorted, unique cookie names present in the Phase A browser context.
    before_cookies: list[str] = field(default_factory=list)
    # Output of find_tracking_services() for before_scripts.
    before_tracking: list[str] = field(default_factory=list)
    # Output of find_violations_before_consent() for before_scripts.
    before_violations: list[Violation] = field(default_factory=list)
    # Phase B: After reject
    reject_scripts: list[str] = field(default_factory=list)
    reject_cookies: list[str] = field(default_factory=list)
    # Tracking services found after reject that were not in before_tracking.
    reject_new_tracking: list[str] = field(default_factory=list)
    # Output of find_violations_after_reject(before_scripts, reject_scripts).
    reject_violations: list[Violation] = field(default_factory=list)
    # Phase C: After accept
    accept_scripts: list[str] = field(default_factory=list)
    accept_cookies: list[str] = field(default_factory=list)
    # Tracking services found after accept that were not in before_tracking.
    accept_new_tracking: list[str] = field(default_factory=list)
    # NOTE(review): not assigned by run_consent_test in this module —
    # presumably populated by a caller; confirm.
    accept_undocumented: list[str] = field(default_factory=list)
    # Phase D-F: Per-category tests
    # One entry per tested category, appended from test_single_category().
    category_tests: list = field(default_factory=list)  # list[CategoryTestResult]
    # Banner text checks
    # Taken from the "violations", "has_impressum" and "has_dse" keys of the
    # banner text checker's result.
    banner_text_violations: list[Violation] = field(default_factory=list)
    banner_has_impressum_link: bool = False
    banner_has_dse_link: bool = False
    # Deep verification (per-phase intercepted data)
    # Keyed by "before_consent" / "after_reject" / "after_accept"; each value
    # is a dict with "intercepted", "consent_state" and "violations".
    deep_verification: dict = field(default_factory=dict)
    # TCF vendors (resolved via GVL after accept phase)
    tcf_vendors: list = field(default_factory=list)
|
|
|
|
|
|
async def _new_scan_context(browser):
    """Create a fresh browser context with the scanner's fixed settings."""
    return await browser.new_context(
        user_agent=USER_AGENT,
        viewport={"width": 1920, "height": 1080},
        locale="de-DE",
        timezone_id="Europe/Berlin",
    )


async def _open_instrumented_page(browser, url: str, scripts: list[str]):
    """Open *url* in a new context with the interceptor and request log attached.

    Request URLs accepted by ``_collect_script`` are appended to *scripts*.
    Returns the ``(context, page)`` pair; the caller closes the context.
    """
    ctx = await _new_scan_context(browser)
    page = await ctx.new_page()
    # The init script must be registered before navigation so it runs in the
    # page ahead of the site's own scripts.
    await page.add_init_script(_INTERCEPTOR_INIT)
    if HAS_STEALTH:
        await stealth_async(page)
    page.on("request", lambda req: _collect_script(req, scripts))
    await page.goto(url, wait_until="networkidle", timeout=30000)
    return ctx, page


async def _record_deep_verification(result, page, phase_key: str, phase_label: str) -> None:
    """Store intercepted data for one phase in ``result.deep_verification``.

    Best effort: any failure is logged as a warning and the phase entry is
    simply left out.
    """
    try:
        intercepted = await _collect_intercepted(page)
        consent_state = await _get_consent_state(page)
        violations = _analyze_phase(phase_key, intercepted, consent_state)
        result.deep_verification[phase_key] = {
            "intercepted": intercepted,
            "consent_state": consent_state,
            "violations": violations,
        }
    except Exception as exc:
        logger.warning("Phase %s deep verification failed: %s", phase_label, exc)


async def _click_consent_button(page, selector, action: str, wait_secs: int, wait_ms: int) -> None:
    """Click the banner button for *action* ("reject"/"accept") and wait if it worked."""
    if await click_button(page, selector):
        logger.info("%s button clicked, waiting %ds", action.capitalize(), wait_secs)
        await page.wait_for_timeout(wait_ms)
    else:
        logger.warning("Could not click %s button", action)


async def _run_category_tests(browser, url: str, banner, filter_cats: list[str], wait_ms: int, result) -> None:
    """Run the per-category tests (Phase D-F) and append them to *result*.

    Non-blocking: every failure, including a failed import of the category
    tester, is logged as a warning and swallowed.
    """
    try:
        from services.category_tester import detect_categories, test_single_category

        ctx_cat = await _new_scan_context(browser)
        page_cat = await ctx_cat.new_page()
        if HAS_STEALTH:
            await stealth_async(page_cat)
        await page_cat.goto(url, wait_until="networkidle", timeout=20000)
        await page_cat.wait_for_timeout(2000)

        detected_cats = await detect_categories(page_cat, banner)
        await page_cat.close()

        # Filter to requested categories if specified
        if filter_cats and detected_cats:
            detected_cats = [c for c in detected_cats if c.name in filter_cats]
            logger.info(
                "Filtered to %d categories (requested: %s)",
                len(detected_cats), filter_cats,
            )

        if detected_cats:
            logger.info("Testing %d categories individually", len(detected_cats))
            for cat in detected_cats:
                # Each category gets its own context so state from one test
                # cannot carry over into the next.
                cat_ctx = await _new_scan_context(browser)
                cat_result = await test_single_category(cat_ctx, url, cat, banner, wait_ms)
                result.category_tests.append(cat_result)
                await cat_ctx.close()
        else:
            logger.info("No categories detected — skipping per-category tests")

        await ctx_cat.close()
    except Exception as cat_err:
        logger.warning("Category tests failed (non-blocking): %s", cat_err)


async def run_consent_test(
    url: str, wait_secs: int = 10, categories: list[str] | None = None,
) -> ConsentTestResult:
    """Run 3-phase consent test on a URL.

    Each phase loads the page in its own fresh browser context: Phase A
    without interaction, Phase B after clicking the banner's reject button,
    Phase C after clicking its accept button. If a banner is found, the
    per-category tests (Phase D-F) follow. Without a banner, only Phase A
    runs and the result is returned early.

    Args:
        url: Website URL to test.
        wait_secs: Seconds to wait per phase.
        categories: Optional list of category names to test (empty = test all).

    Returns:
        The collected ``ConsentTestResult``. Errors raised during the scan are
        logged rather than propagated, so the result may be partially filled.
    """
    result = ConsentTestResult()
    wait_ms = wait_secs * 1000
    filter_cats = categories or []

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-blink-features=AutomationControlled",
                "--window-size=1920,1080",
            ],
        )

        try:
            # ── Phase A: Before consent ──────────────────────────
            logger.info("Phase A: First visit (no interaction)")
            scripts_a: list[str] = []
            ctx_a, page_a = await _open_instrumented_page(browser, url, scripts_a)
            await page_a.wait_for_timeout(wait_ms)

            await _record_deep_verification(result, page_a, "before_consent", "A")

            result.before_scripts = _get_page_scripts(scripts_a)
            result.before_cookies = _get_cookie_names(await ctx_a.cookies())
            result.before_tracking = find_tracking_services(result.before_scripts)
            result.before_violations = find_violations_before_consent(result.before_scripts)

            # Detect banner
            banner = await detect_banner(page_a)
            result.banner_detected = banner.detected
            result.banner_provider = banner.provider

            # Check banner text for legal issues
            if banner.detected:
                banner_violations = await _check_banner_text(page_a)
                result.banner_text_violations = banner_violations["violations"]
                result.banner_has_impressum_link = banner_violations["has_impressum"]
                result.banner_has_dse_link = banner_violations["has_dse"]

            await ctx_a.close()

            if not banner.detected:
                logger.info("No consent banner detected — skipping Phase B/C")
                # The ``finally`` clause below closes the browser; no explicit
                # close is needed here.
                return result

            # ── Phase B: After rejecting ─────────────────────────
            logger.info("Phase B: Reject consent (%s)", banner.provider)
            scripts_b: list[str] = []
            ctx_b, page_b = await _open_instrumented_page(browser, url, scripts_b)
            await page_b.wait_for_timeout(3000)

            await _click_consent_button(page_b, banner.reject_selector, "reject", wait_secs, wait_ms)
            await _record_deep_verification(result, page_b, "after_reject", "B")

            result.reject_scripts = _get_page_scripts(scripts_b)
            result.reject_cookies = _get_cookie_names(await ctx_b.cookies())
            reject_tracking = find_tracking_services(result.reject_scripts)
            result.reject_new_tracking = [t for t in reject_tracking if t not in result.before_tracking]
            result.reject_violations = find_violations_after_reject(
                result.before_scripts, result.reject_scripts,
            )

            await ctx_b.close()

            # ── Phase C: After accepting ─────────────────────────
            logger.info("Phase C: Accept consent (%s)", banner.provider)
            scripts_c: list[str] = []
            ctx_c, page_c = await _open_instrumented_page(browser, url, scripts_c)
            await page_c.wait_for_timeout(3000)

            await _click_consent_button(page_c, banner.accept_selector, "accept", wait_secs, wait_ms)
            await _record_deep_verification(result, page_c, "after_accept", "C")

            result.accept_scripts = _get_page_scripts(scripts_c)
            result.accept_cookies = _get_cookie_names(await ctx_c.cookies())
            accept_tracking = find_tracking_services(result.accept_scripts)
            result.accept_new_tracking = [t for t in accept_tracking if t not in result.before_tracking]

            # TCF vendor extraction (after accept, while page is still open)
            try:
                from services.consent_interceptor import extract_tcf_vendors
                result.tcf_vendors = await extract_tcf_vendors(page_c)
            except Exception as exc:
                logger.warning("TCF vendor extraction failed: %s", exc)

            await ctx_c.close()

            # ── Phase D-F: Per-category tests ────────────────────────
            await _run_category_tests(browser, url, banner, filter_cats, wait_ms, result)

        except Exception as e:
            # Best effort: return whatever was collected so far, but keep the
            # traceback in the log so the failure can be diagnosed.
            logger.error("Consent test failed: %s", e, exc_info=True)
        finally:
            await browser.close()

    logger.info(
        "Consent test complete: banner=%s, violations_before=%d, violations_reject=%d, categories=%d",
        result.banner_provider, len(result.before_violations), len(result.reject_violations),
        len(result.category_tests),
    )
    return result
|
|
|
|
|
|
def _collect_script(request, scripts: list[str]):
|
|
"""Collect script request URLs."""
|
|
if request.resource_type in ("script", "image", "xhr", "fetch"):
|
|
scripts.append(request.url)
|
|
|
|
|
|
def _get_page_scripts(collected: list[str]) -> list[str]:
|
|
"""Deduplicate and filter script URLs."""
|
|
seen = set()
|
|
result = []
|
|
for url in collected:
|
|
domain = url.split("/")[2] if "/" in url and len(url.split("/")) > 2 else url
|
|
if domain not in seen:
|
|
seen.add(domain)
|
|
result.append(url)
|
|
return result[:50] # Cap at 50
|
|
|
|
|
|
def _get_cookie_names(cookies: list[dict]) -> list[str]:
|
|
"""Extract cookie names from Playwright cookie list."""
|
|
return sorted(set(c.get("name", "") for c in cookies if c.get("name")))
|
|
|