feat: 4 remaining tasks — EU institutions, banner integration, JS-sites, Caritas fixes

1. EU Institution Checks (Verordnung 2018/1725):
   - New doc_type "eu_institution" with 9 L1 + 15 L2 checks
   - Both German + English patterns (EU institutions are multilingual)
   - Auto-detection via "2018/1725", "EDSB", "EDPS" keywords
   - Correct article references (Art. 15 instead of 13, Art. 5 instead of 6)

2. Banner Check Integration:
   - banner_runner.py maps scan results to 36 L1/L2 structured checks
   - BannerCheckTab shows hierarchical ChecklistView with hints
   - 3-phase summary (cookies/scripts before/after consent)
   - /scan endpoint now includes structured_checks in response

3. JS-heavy Website Fixes (dm, Zalando, HWK):
   - dsi_helpers.py: goto_resilient (networkidle→domcontentloaded fallback)
   - try_dismiss_consent_banner before text extraction
   - PDF redirect detection (dm.de redirects to GCS PDF)

4. Caritas False Positive Fixes:
   - Phone regex allows parentheses: +49 (0)761 → now matches
   - "Recht auf Widerspruch" (3 words) + §23 KDG → matches Art. 21
   - Church authorities: "Katholisches Datenschutzzentrum" recognized

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-08 01:10:10 +02:00
parent 89af88ef7d
commit 686834cea0
11 changed files with 1039 additions and 171 deletions
@@ -0,0 +1,175 @@
+"""
+Banner Runner — maps scan results to the L1/L2 check hierarchy.
+
+Takes the raw ScanResponse dict and produces a structured_checks list
+compatible with ChecklistView (same format as document checks).
+"""
+
+from checks.banner_checks import BANNER_CHECKLIST
+
+
+def map_scan_to_checks(scan_result: dict) -> dict:
+    """Map a /scan response to the L1/L2 banner check hierarchy.
+
+    Every definition in BANNER_CHECKLIST becomes one check item, in
+    checklist order. A check fails when its check_key was reported as a
+    violation, passes when an explicit pass signal exists for it, and
+    otherwise inherits the scan's "banner_detected" flag: without a
+    detected banner, every check that has no explicit signal fails.
+    A level-2 check whose parent item did not pass is marked as skipped
+    and counts as not passed.
+
+    Args:
+        scan_result: Raw scan dict. "banner_detected" is read here; the
+            dict is also handed to the violation/pass collectors.
+
+    Returns:
+        dict with:
+          - structured_checks: list of check item dicts
+          - completeness_pct: L1 pass rate (0-100)
+          - correctness_pct: pass rate of the non-skipped L2 checks (0-100)
+        Both percentages are 0 when there is nothing to rate.
+    """
+    violation_codes = _collect_violation_codes(scan_result)
+    # Some checks produce boolean pass signals rather than violations.
+    pass_codes = _collect_pass_codes(scan_result)
+    banner_detected = bool(scan_result.get("banner_detected", False))
+
+    checks: list[dict] = []
+    # First item seen per id, so a parent lookup is a dict access instead
+    # of a scan over everything built so far.
+    items_by_id: dict = {}
+
+    for defn in BANNER_CHECKLIST:
+        key = defn["check_key"]
+        level = defn["level"]
+        parent = defn.get("parent")
+
+        if key in violation_codes:
+            # NOTE(review): the collected violation text is not surfaced,
+            # because matched_text is only emitted for passed checks.
+            # Confirm whether failed checks should display it.
+            passed = False
+            matched_text = ""
+        elif key in pass_codes:
+            passed = True
+            matched_text = pass_codes[key]
+        else:
+            # No finding either way: the check can only count as passed
+            # if a banner was actually there to be examined.
+            passed = banner_detected
+            matched_text = ""
+
+        # An L2 check is skipped when its parent item did not pass. The
+        # parent must appear earlier in the checklist to be found.
+        parent_item = items_by_id.get(parent) if level == 2 and parent else None
+        skipped = parent_item is not None and not parent_item["passed"]
+
+        item = {
+            "id": defn["id"],
+            "label": defn["label"],
+            "passed": passed and not skipped,
+            "severity": defn["severity"],
+            "level": level,
+            "parent": parent,
+            "skipped": skipped,
+            "hint": defn.get("hint", ""),
+            "matched_text": matched_text if passed else "",
+        }
+        checks.append(item)
+        items_by_id.setdefault(item["id"], item)
+
+    # Completeness rates all L1 checks; correctness rates only the L2
+    # checks that were actually evaluated (not skipped).
+    l1_checks = [c for c in checks if c["level"] == 1]
+    l1_total = len(l1_checks)
+    l1_passed = sum(1 for c in l1_checks if c["passed"])
+    completeness_pct = round(l1_passed / l1_total * 100) if l1_total else 0
+
+    l2_active = [c for c in checks if c["level"] == 2 and not c["skipped"]]
+    l2_passed = sum(1 for c in l2_active if c["passed"])
+    correctness_pct = round(l2_passed / len(l2_active) * 100) if l2_active else 0
+
+    return {
+        "structured_checks": checks,
+        "completeness_pct": completeness_pct,
+        "correctness_pct": correctness_pct,
+    }
+
+
+def _collect_violation_codes(scan: dict) -> dict[str, str]:
+    """Build a check_key → violation text mapping from a scan dict.
+
+    Explicit violations are read from the banner text checks, then from
+    the "before_consent" phase, then from the "after_reject" phase; a
+    later source overwrites an earlier one for the same code, and each
+    text is cut to 120 characters. Entries without a code are ignored.
+
+    Three codes are additionally derived from raw phase data, but only
+    when no explicit violation already claimed the code; each lists at
+    most five names, comma-separated.
+    """
+    before = scan.get("phases", {}).get("before_consent", {})
+    after_reject = scan.get("phases", {}).get("after_reject", {})
+    found: dict[str, str] = {}
+
+    # Explicit violations, in override order.
+    for source in (scan.get("banner_checks", {}), before, after_reject):
+        for violation in source.get("violations", []):
+            code = violation.get("code", "")
+            if not code:
+                continue
+            found[code] = violation.get("text", "")[:120]
+
+    # Derived signals. Cookie entries are presumably plain cookie names
+    # (they are passed to _is_tracking_cookie and joined as strings) —
+    # verify against the scanner output.
+    derived = (
+        ("tracking_before_consent", before.get("tracking_services", [])),
+        (
+            "cookies_before_consent",
+            [c for c in before.get("cookies", []) if _is_tracking_cookie(c)],
+        ),
+        ("tracking_after_reject", after_reject.get("new_tracking", [])),
+    )
+    for code, names in derived:
+        if names and code not in found:
+            found[code] = ", ".join(names[:5])
+
+    return found
+
+
+def _collect_pass_codes(scan: dict) -> dict[str, str]:
+    """Collect explicit pass signals from scan results.
+
+    Returns a check_key → evidence text mapping with up to four entries:
+      - "banner_detected": the provider name, or "detected" when the
+        provider is missing or empty
+      - "banner_provider_named": the provider name, if there is one
+      - "impressum_link" / "dse_link": set when the banner checks report
+        the corresponding link
+    """
+    passes: dict[str, str] = {}
+
+    provider = scan.get("banner_provider") or ""
+
+    # The key is usually present but may be empty, so a .get() default
+    # would never apply; fall back explicitly instead.
+    if scan.get("banner_detected"):
+        passes["banner_detected"] = provider or "detected"
+
+    if provider:
+        passes["banner_provider_named"] = provider
+
+    # Tolerate a missing or None "banner_checks" entry.
+    bc = scan.get("banner_checks") or {}
+    if bc.get("has_impressum_link"):
+        passes["impressum_link"] = "Impressum-Link gefunden"
+    if bc.get("has_dse_link"):
+        passes["dse_link"] = "DSE-Link gefunden"
+
+    return passes
+
+
+# Cookie-name prefixes of known tracking cookies (the name may carry a
+# suffix, e.g. "_ga_<container id>").
+_TRACKING_COOKIE_PREFIXES = (
+    "_ga", "_gid", "_fbp", "_fbc", "_gcl", "_pin",
+    "_tt_", "li_sugr", "_hj", "mp_", "ajs_", "_clck", "_clsk",
+)
+
+# Tracking cookies identified by their exact name. Treating these short
+# names as prefixes would flag unrelated first-party cookies such as
+# "frontend" or "IDENTITY".
+_TRACKING_COOKIE_NAMES = frozenset({"IDE", "fr"})
+
+
+def _is_tracking_cookie(name: str) -> bool:
+    """Return True if *name* is a known tracking cookie.
+
+    A name matches when it equals one of the exact tracking cookie names
+    or starts with one of the known tracking prefixes. Matching is
+    case-sensitive.
+    """
+    return name in _TRACKING_COOKIE_NAMES or name.startswith(_TRACKING_COOKIE_PREFIXES)
@@ -16,6 +16,7 @@ from services.consent_scanner import run_consent_test, ConsentTestResult
 from services.authenticated_scanner import run_authenticated_test, AuthTestResult
 from services.playwright_scanner import scan_website_playwright
 from services.dsi_discovery import discover_dsi_documents, DSIDiscoveryResult
+from checks.banner_runner import map_scan_to_checks

 logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s: %(message)s")
 logger = logging.getLogger(__name__)
@@ -44,6 +45,9 @@ class ScanResponse(BaseModel):
    scanned_at: str
    category_tests: list = []
    banner_checks: dict = {}
+    structured_checks: list = []
+    completeness_pct: int = 0
+    correctness_pct: int = 0


@app.get("/health")
@@ -57,30 +61,47 @@ async def scan_consent(req: ScanRequest):
    logger.info("Starting consent test for %s", req.url)
    result = await run_consent_test(req.url, req.timeout_per_phase)

+    # Build raw response dict for structured check mapping
+    phases = {
+        "before_consent": {
+            "scripts": result.before_scripts,
+            "cookies": result.before_cookies,
+            "tracking_services": result.before_tracking,
+            "violations": [v.__dict__ for v in result.before_violations],
+        },
+        "after_reject": {
+            "scripts": result.reject_scripts,
+            "cookies": result.reject_cookies,
+            "new_tracking": result.reject_new_tracking,
+            "violations": [v.__dict__ for v in result.reject_violations],
+        },
+        "after_accept": {
+            "scripts": result.accept_scripts,
+            "cookies": result.accept_cookies,
+            "new_tracking": result.accept_new_tracking,
+            "undocumented": result.accept_undocumented,
+        },
+    }
+    banner_checks_data = {
+        "has_impressum_link": result.banner_has_impressum_link,
+        "has_dse_link": result.banner_has_dse_link,
+        "violations": [v.__dict__ for v in result.banner_text_violations],
+    }
+
+    # Map to L1/L2 hierarchy
+    raw_for_mapping = {
+        "banner_detected": result.banner_detected,
+        "banner_provider": result.banner_provider,
+        "phases": phases,
+        "banner_checks": banner_checks_data,
+    }
+    check_result = map_scan_to_checks(raw_for_mapping)
+
    return ScanResponse(
        url=req.url,
        banner_detected=result.banner_detected,
        banner_provider=result.banner_provider,
-        phases={
-            "before_consent": {
-                "scripts": result.before_scripts,
-                "cookies": result.before_cookies,
-                "tracking_services": result.before_tracking,
-                "violations": [v.__dict__ for v in result.before_violations],
-            },
-            "after_reject": {
-                "scripts": result.reject_scripts,
-                "cookies": result.reject_cookies,
-                "new_tracking": result.reject_new_tracking,
-                "violations": [v.__dict__ for v in result.reject_violations],
-            },
-            "after_accept": {
-                "scripts": result.accept_scripts,
-                "cookies": result.accept_cookies,
-                "new_tracking": result.accept_new_tracking,
-                "undocumented": result.accept_undocumented,
-            },
-        },
+        phases=phases,
        summary={
            "critical": sum(1 for v in result.reject_violations if v.severity == "CRITICAL"),
            "high": len(result.before_violations) + sum(1 for v in result.banner_text_violations if v.severity == "HIGH"),
@@ -90,11 +111,10 @@ async def scan_consent(req: ScanRequest):
            "categories_tested": len(result.category_tests),
            "banner_text_issues": len(result.banner_text_violations),
        },
-        banner_checks={
-            "has_impressum_link": result.banner_has_impressum_link,
-            "has_dse_link": result.banner_has_dse_link,
-            "violations": [v.__dict__ for v in result.banner_text_violations],
-        },
+        banner_checks=banner_checks_data,
+        structured_checks=check_result["structured_checks"],
+        completeness_pct=check_result["completeness_pct"],
+        correctness_pct=check_result["correctness_pct"],
        scanned_at=datetime.now(timezone.utc).isoformat(),
        category_tests=[{
            "category": ct.category,
@@ -23,6 +23,8 @@ from urllib.parse import urlparse, urljoin

 from playwright.async_api import Page

+from services.dsi_helpers import goto_resilient, try_dismiss_consent_banner, is_pdf_redirect
+
 logger = logging.getLogger(__name__)

 # Legal document keywords in all EU/EEA official languages.
@@ -216,11 +218,36 @@ async def discover_dsi_documents(
    seen_titles: set[str] = set()

    try:
-        # Step 1: Load the page
-        await page.goto(url, wait_until="networkidle", timeout=60000)
+        # Step 1: Load the page (with networkidle → domcontentloaded fallback)
+        await goto_resilient(page, url, timeout=60000)
        await page.wait_for_timeout(2000)

-        # Step 1b: Self-extraction — if the URL itself is a DSI page,
+        # Step 1a: Detect PDF redirects (e.g. dm.de redirects to GCS PDF)
+        final_url = page.url
+        if is_pdf_redirect(url, final_url):
+            is_dsi_url, dsi_lang = _matches_dsi_keyword(urlparse(url).path.lower())
+            if is_dsi_url:
+                result.documents.append(DiscoveredDSI(
+                    title=urlparse(url).path.split("/")[-1] or "Datenschutzerklaerung",
+                    url=final_url,
+                    source_url=url,
+                    language=dsi_lang or "de",
+                    doc_type="pdf",
+                    text="[PDF — Textextraktion erforderlich]",
+                ))
+                seen_urls.add(url)
+                seen_urls.add(final_url)
+                logger.info("PDF redirect detected: %s -> %s", url, final_url)
+            # Return early — a PDF redirect means no HTML content to scan
+            result.total_found = len(result.documents)
+            return result
+
+        # Step 1b: Try dismissing cookie consent banners before extraction.
+        # Many German sites (dm.de, Zalando, etc.) block page content behind
+        # a consent wall. Dismissing it reveals the actual DSI text.
+        await try_dismiss_consent_banner(page)
+
+        # Step 1c: Self-extraction — if the URL itself is a DSI page,
        # extract its full text as the first document. This handles the
        # case where the user provides the DSE URL directly (e.g.
        # example.com/datenschutz) instead of the homepage.
@@ -251,6 +278,8 @@ async def discover_dsi_documents(
                    ))
                    seen_urls.add(url)
                    logger.info("Self-extracted %d words from %s", self_wc, url)
+                else:
+                    logger.info("Self-extraction too short (%d words) for %s", self_wc, url)
            except Exception as e:
                logger.warning("Self-extraction failed for %s: %s", url, e)

@@ -323,58 +352,69 @@ async def discover_dsi_documents(
                if is_anchor:
                    continue

-                # Navigate to page — wait for JS to load content
-                resp = await page.goto(href, wait_until="networkidle", timeout=45000)
-                if resp and resp.status < 400:
-                    await page.wait_for_timeout(2000)
-                    await _expand_all_interactive(page)
-                    await page.wait_for_timeout(500)
+                # Navigate to page — with networkidle/domcontentloaded fallback
+                await goto_resilient(page, href, timeout=45000)
+                resp_url = page.url

-                    # Extract text — try specific content areas, fall back to full body
-                    text = await page.evaluate("""
-                        () => {
-                            // Try progressively broader content selectors
-                            const selectors = [
-                                '.article-content', '.page-content', '.entry-content',
-                                '[class*="content-area"]', '[class*="main-content"]',
-                                'main article', 'main', 'article',
-                                '[role="main"]', '.content', '#content',
-                            ];
-                            for (const sel of selectors) {
-                                const el = document.querySelector(sel);
-                                if (el && el.textContent.trim().length > 200) {
-                                    return el.textContent.trim();
-                                }
+                # Check for PDF redirect on followed links
+                if is_pdf_redirect(href, resp_url):
+                    result.documents.append(DiscoveredDSI(
+                        title=title, url=resp_url, source_url=url,
+                        language=lang, doc_type="pdf",
+                        text="[PDF — Textextraktion erforderlich]",
+                    ))
+                    await goto_resilient(page, url, timeout=45000)
+                    continue
+
+                await try_dismiss_consent_banner(page)
+                await _expand_all_interactive(page)
+                await page.wait_for_timeout(500)
+
+                # Extract text — try specific content areas, fall back to full body
+                text = await page.evaluate("""
+                    () => {
+                        // Try progressively broader content selectors
+                        const selectors = [
+                            '.article-content', '.page-content', '.entry-content',
+                            '[class*="content-area"]', '[class*="main-content"]',
+                            'main article', 'main', 'article',
+                            '[role="main"]', '.content', '#content',
+                        ];
+                        for (const sel of selectors) {
+                            const el = document.querySelector(sel);
+                            if (el && el.textContent.trim().length > 200) {
+                                return el.textContent.trim();
                            }
-                            // Fallback: full body minus nav/header/footer
-                            const body = document.body.cloneNode(true);
-                            body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
-                            return body.textContent?.trim() || '';
                        }
-                    """)
-                    if text and len(text) > 50:
-                        result.documents.append(DiscoveredDSI(
-                            title=title, url=href, source_url=url,
-                            language=lang,
-                            doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
-                            text=text[:50000], word_count=len(text.split()),
-                        ))
+                        // Fallback: full body minus nav/header/footer
+                        const body = document.body.cloneNode(true);
+                        body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
+                        return body.textContent?.trim() || '';
+                    }
+                """)
+                if text and len(text) > 50:
+                    result.documents.append(DiscoveredDSI(
+                        title=title, url=href, source_url=url,
+                        language=lang,
+                        doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
+                        text=text[:50000], word_count=len(text.split()),
+                    ))

-                    # Recursive: search THIS page for more DSI links
-                    new_links = await _find_dsi_links(page, base_domain)
-                    for nl in new_links:
-                        if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
-                            pending_links.append(nl)
+                # Recursive: search THIS page for more DSI links
+                new_links = await _find_dsi_links(page, base_domain)
+                for nl in new_links:
+                    if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
+                        pending_links.append(nl)

                # Navigate back for next link
-                await page.goto(url, wait_until="networkidle", timeout=45000)
+                await goto_resilient(page, url, timeout=45000)
                await page.wait_for_timeout(500)
                await _expand_all_interactive(page)

            except Exception as e:
                result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
                try:
-                    await page.goto(url, wait_until="networkidle", timeout=45000)
+                    await goto_resilient(page, url, timeout=45000)
                except Exception:
                    pass

@@ -0,0 +1,118 @@
+"""
+DSI Discovery Helpers — resilient navigation, consent dismissal, PDF redirect detection.
+
+Extracted from dsi_discovery.py to keep modules under 500 LOC.
+"""
+
+import logging
+from typing import Optional
+from urllib.parse import urlparse
+
+from playwright.async_api import Page, Response, TimeoutError as PlaywrightTimeout
+
+logger = logging.getLogger(__name__)
+
+
+async def goto_resilient(page: Page, url: str, timeout: int = 60000) -> Optional[Response]:
+    """Navigate to *url*, trying networkidle first, then domcontentloaded.
+
+    SPAs like Zalando never reach networkidle because of continuous background
+    requests. Falling back to domcontentloaded + a short wait gives JS time to
+    render the main content without waiting for every network request to finish.
+
+    Args:
+        page: Playwright page to navigate.
+        url: Target URL.
+        timeout: Timeout in milliseconds, applied to each navigation
+            attempt separately (the fallback gets the full budget again).
+
+    Returns:
+        Whatever ``page.goto`` returned for the attempt that succeeded, so
+        callers can inspect the HTTP status. May be None.
+
+    Raises:
+        PlaywrightTimeout: if the domcontentloaded fallback times out too.
+        Other navigation errors from ``page.goto`` propagate unchanged.
+    """
+    try:
+        return await page.goto(url, wait_until="networkidle", timeout=timeout)
+    except PlaywrightTimeout:
+        logger.info("networkidle timeout for %s, falling back to domcontentloaded", url)
+        response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
+        await page.wait_for_timeout(5000)  # extra wait for JS rendering
+        return response
+
+
+async def try_dismiss_consent_banner(page: Page) -> bool:
+    """Try to dismiss a cookie consent banner that blocks page content.
+
+    Three strategies are tried in order, stopping at the first click:
+      1. Usercentrics: an accept-like button inside the shadow root of
+         ``#usercentrics-root``.
+      2. Known accept-button selectors of common consent tools in the
+         regular DOM (first visible match is clicked).
+      3. Any ``button`` / ``a[role="button"]`` whose trimmed text equals
+         one of a fixed list of accept phrases (exact, case-sensitive).
+
+    This is best-effort: a failing strategy is logged at debug level and
+    the next one is tried; nothing is raised to the caller. After a
+    successful click the function waits 2 seconds before returning.
+
+    Note that dismissing means *accepting* — the clicked buttons are the
+    "accept" variants.
+
+    Returns:
+        True if a button was clicked, False otherwise.
+    """
+    # 1) Usercentrics shadow DOM — most common for German sites
+    try:
+        uc_root = await page.query_selector("#usercentrics-root")
+        if uc_root:
+            clicked = await page.evaluate("""() => {
+                const root = document.querySelector('#usercentrics-root');
+                if (!root || !root.shadowRoot) return false;
+                const buttons = root.shadowRoot.querySelectorAll('button');
+                for (const btn of buttons) {
+                    const t = btn.textContent.trim().toLowerCase();
+                    if (t.includes('akzeptieren') || t.includes('accept')
+                        || t.includes('zustimmen') || t.includes('agree')) {
+                        btn.click();
+                        return true;
+                    }
+                }
+                return false;
+            }""")
+            if clicked:
+                logger.info("Dismissed Usercentrics consent banner (shadow DOM)")
+                await page.wait_for_timeout(2000)
+                return True
+    except Exception as exc:
+        # Best-effort: record why this strategy failed, then move on.
+        logger.debug("Usercentrics consent dismissal failed: %s", exc)
+
+    # 2) Standard DOM banners — OneTrust, Cookiebot, Didomi, Borlabs, etc.
+    accept_selectors = [
+        "#onetrust-accept-btn-handler",
+        "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
+        "#didomi-notice-agree-button",
+        "#BorlabsCookieBox .cookie-accept, [data-cookie-accept]",
+        ".cmpboxbtn.cmpboxbtnyes",
+        ".klaro .cm-btn-accept",
+        ".cky-btn-accept",
+        "[class*='qc-cmp2-summary-buttons'] button:first-child",
+        "#tarteaucitronPersonalize2",
+    ]
+    for sel in accept_selectors:
+        try:
+            btn = page.locator(sel).first
+            if await btn.count() > 0 and await btn.is_visible():
+                await btn.click(timeout=3000)
+                logger.info("Dismissed consent banner via %s", sel)
+                await page.wait_for_timeout(2000)
+                return True
+        except Exception as exc:
+            # One broken selector or failed click must not block the rest.
+            logger.debug("Consent selector %s failed: %s", sel, exc)
+            continue
+
+    # 3) Generic text-based button search
+    accept_texts = [
+        "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren",
+        "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen",
+        "Einverstanden", "Ich stimme zu",
+    ]
+    try:
+        clicked = await page.evaluate("""(texts) => {
+            for (const btn of document.querySelectorAll('button, a[role="button"]')) {
+                const t = (btn.textContent || '').trim();
+                for (const target of texts) {
+                    if (t === target) { btn.click(); return true; }
+                }
+            }
+            return false;
+        }""", accept_texts)
+        if clicked:
+            logger.info("Dismissed consent banner via generic text match")
+            await page.wait_for_timeout(2000)
+            return True
+    except Exception as exc:
+        logger.debug("Generic consent text match failed: %s", exc)
+
+    return False
+
+
+def is_pdf_redirect(original_url: str, final_url: str) -> bool:
+    """Check whether the final URL points to a PDF or to external object storage.
+
+    The decision is based on *final_url* alone:
+      - its path ends in ".pdf" (case-insensitive; query string and
+        fragment are ignored, so "doc.pdf?download=1" counts), or
+      - its host is, or is a subdomain of, a known object-storage domain
+        (Google Cloud Storage, Azure Blob Storage, Amazon S3).
+
+    Only the host is compared against the storage domains, so a storage
+    domain that merely appears in a query parameter does not match.
+
+    Args:
+        original_url: The URL that was requested. Currently not used for
+            the decision; kept for the callers' signature.
+        final_url: The URL the page ended up on.
+
+    Returns:
+        True if *final_url* looks like a PDF / storage object.
+    """
+    storage_domains = (
+        "storage.googleapis.com",
+        "blob.core.windows.net",
+        "s3.amazonaws.com",
+    )
+    try:
+        parsed = urlparse(final_url)
+        host = parsed.hostname or ""
+    except ValueError:
+        # Malformed URL (e.g. a broken IPv6 literal): fall back to the
+        # plain suffix test rather than raising.
+        return final_url.lower().endswith(".pdf")
+
+    if parsed.path.lower().endswith(".pdf"):
+        return True
+    return any(
+        host == domain or host.endswith("." + domain)
+        for domain in storage_domains
+    )