feat: 4 remaining tasks — EU institutions, banner integration, JS-sites, Caritas fixes

1. EU Institution Checks (Regulation (EU) 2018/1725):
   - New doc_type "eu_institution" with 9 L1 + 15 L2 checks
   - Both German + English patterns (EU institutions are multilingual)
   - Auto-detection via "2018/1725", "EDSB", "EDPS" keywords (detection sketch after this list)
   - Correct article references (the Regulation renumbers the GDPR articles: Art. 15 instead of Art. 13, Art. 5 instead of Art. 6)

2. Banner Check Integration:
   - banner_runner.py maps scan results to 36 structured L1/L2 checks (mapping sketch after this list)
   - BannerCheckTab shows hierarchical ChecklistView with hints
   - 3-phase summary (cookies/scripts before/after consent)
   - /scan endpoint now includes structured_checks in response

3. JS-heavy Website Fixes (dm, Zalando, HWK):
   - dsi_helpers.py: goto_resilient (networkidle → domcontentloaded fallback; sketch after this list)
   - try_dismiss_consent_banner before text extraction
   - PDF redirect detection (dm.de redirects to a GCS/Google Cloud Storage PDF)

4. Caritas False Positive Fixes:
   - Phone regex allows parentheses: +49 (0)761 → now matches (regex sketch after this list)
   - "Recht auf Widerspruch" ("right to object", 3 words) + §23 KDG → matches Art. 21
   - Church supervisory authorities: "Katholisches Datenschutzzentrum" is now recognized
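
For item 1, a minimal sketch of the keyword auto-detection. Only the three
markers come from this commit; the function name and the surrounding checks
engine are hypothetical:

    import re

    # Markers taken from the commit message; everything else is an assumption.
    EU_INSTITUTION_MARKERS = [r"2018/1725", r"\bEDSB\b", r"\bEDPS\b"]

    def looks_like_eu_institution(text: str) -> bool:
        """True if the document appears to fall under Regulation (EU) 2018/1725."""
        return any(re.search(m, text) for m in EU_INSTITUTION_MARKERS)

    # doc_type = "eu_institution" if looks_like_eu_institution(dsi_text) else ...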
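
For item 2, a sketch of the mapping idea in banner_runner.py. The check shape
(check_id, level, passed, hint) is an assumption rather than the real data
model; only the L1/L2 split and the pre-consent phase come from the commit:

    from dataclasses import dataclass

    @dataclass
    class StructuredCheck:          # hypothetical shape, not the real schema
        check_id: str
        level: str                  # "L1" or "L2"
        passed: bool
        hint: str = ""

    def map_scan_to_checks(scan: dict) -> list[StructuredCheck]:
        """Turn raw banner-scan results into pass/fail checks (one rule shown)."""
        pre = scan.get("cookies_before_consent", [])
        return [StructuredCheck(
            check_id="banner.no_pre_consent_tracking",
            level="L1",
            passed=not any(c.get("category") == "tracking" for c in pre),
            hint=f"{len(pre)} cookie(s) observed before consent",
        )]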
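
For item 3, the fallback can be sketched as follows. The signature matches the
call sites in the diff below, but the body is an assumption about what
dsi_helpers.py actually does:

    from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError

    async def goto_resilient(page: Page, url: str, timeout: int = 60000):
        """Prefer wait_until='networkidle'; on timeout, retry with the weaker
        'domcontentloaded' so JS-heavy sites (dm, Zalando, HWK) still load."""
        try:
            return await page.goto(url, wait_until="networkidle", timeout=timeout)
        except PlaywrightTimeoutError:
            return await page.goto(url, wait_until="domcontentloaded", timeout=timeout)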
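
For item 4, a hypothetical version of the relaxed phone pattern. The real regex
in the checks engine is likely more involved; this only demonstrates the
parentheses fix:

    import re

    # Sketch only: accepts an optional "(0)" trunk prefix after the country code.
    PHONE_RE = re.compile(r"\+?\d{1,3}\s*(?:\(0\)\s*)?\d[\d\s/()-]{5,}")

    assert PHONE_RE.search("+49 (0)761 200-0")   # parenthesized form now matches
    assert PHONE_RE.search("+49 761 200 0")      # plain form still matches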

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Benjamin Admin
2026-05-08 01:10:10 +02:00
parent 89af88ef7d
commit 686834cea0
11 changed files with 1039 additions and 171 deletions
@@ -23,6 +23,8 @@ from urllib.parse import urlparse, urljoin
 from playwright.async_api import Page
+from services.dsi_helpers import goto_resilient, try_dismiss_consent_banner, is_pdf_redirect
+
 logger = logging.getLogger(__name__)
 # Legal document keywords in all EU/EEA official languages.
@@ -216,11 +218,36 @@ async def discover_dsi_documents(
     seen_titles: set[str] = set()
     try:
-        # Step 1: Load the page
-        await page.goto(url, wait_until="networkidle", timeout=60000)
+        # Step 1: Load the page (with networkidle → domcontentloaded fallback)
+        await goto_resilient(page, url, timeout=60000)
         await page.wait_for_timeout(2000)
-        # Step 1b: Self-extraction — if the URL itself is a DSI page,
+        # Step 1a: Detect PDF redirects (e.g. dm.de redirects to GCS PDF)
+        final_url = page.url
+        if is_pdf_redirect(url, final_url):
+            is_dsi_url, dsi_lang = _matches_dsi_keyword(urlparse(url).path.lower())
+            if is_dsi_url:
+                result.documents.append(DiscoveredDSI(
+                    title=urlparse(url).path.split("/")[-1] or "Datenschutzerklaerung",
+                    url=final_url,
+                    source_url=url,
+                    language=dsi_lang or "de",
+                    doc_type="pdf",
+                    text="[PDF — Textextraktion erforderlich]",
+                ))
+                seen_urls.add(url)
+                seen_urls.add(final_url)
+                logger.info("PDF redirect detected: %s -> %s", url, final_url)
+            # Return early — a PDF redirect means no HTML content to scan
+            result.total_found = len(result.documents)
+            return result
+
+        # Step 1b: Try dismissing cookie consent banners before extraction.
+        # Many German sites (dm.de, Zalando, etc.) block page content behind
+        # a consent wall. Dismissing it reveals the actual DSI text.
+        await try_dismiss_consent_banner(page)
+
+        # Step 1c: Self-extraction — if the URL itself is a DSI page,
         # extract its full text as the first document. This handles the
         # case where the user provides the DSE URL directly (e.g.
         # example.com/datenschutz) instead of the homepage.
@@ -251,6 +278,8 @@ async def discover_dsi_documents(
                 ))
                 seen_urls.add(url)
                 logger.info("Self-extracted %d words from %s", self_wc, url)
+            else:
+                logger.info("Self-extraction too short (%d words) for %s", self_wc, url)
         except Exception as e:
             logger.warning("Self-extraction failed for %s: %s", url, e)
@@ -323,58 +352,69 @@ async def discover_dsi_documents(
                 if is_anchor:
                     continue
-                # Navigate to page — wait for JS to load content
-                resp = await page.goto(href, wait_until="networkidle", timeout=45000)
-                if resp and resp.status < 400:
-                    await page.wait_for_timeout(2000)
-                    await _expand_all_interactive(page)
-                    await page.wait_for_timeout(500)
+                # Navigate to page — with networkidle/domcontentloaded fallback
+                await goto_resilient(page, href, timeout=45000)
+                resp_url = page.url
-                    # Extract text — try specific content areas, fall back to full body
-                    text = await page.evaluate("""
-                        () => {
-                            // Try progressively broader content selectors
-                            const selectors = [
-                                '.article-content', '.page-content', '.entry-content',
-                                '[class*="content-area"]', '[class*="main-content"]',
-                                'main article', 'main', 'article',
-                                '[role="main"]', '.content', '#content',
-                            ];
-                            for (const sel of selectors) {
-                                const el = document.querySelector(sel);
-                                if (el && el.textContent.trim().length > 200) {
-                                    return el.textContent.trim();
-                                }
-                            }
+                # Check for PDF redirect on followed links
+                if is_pdf_redirect(href, resp_url):
+                    result.documents.append(DiscoveredDSI(
+                        title=title, url=resp_url, source_url=url,
+                        language=lang, doc_type="pdf",
+                        text="[PDF — Textextraktion erforderlich]",
+                    ))
+                    await goto_resilient(page, url, timeout=45000)
+                    continue
+                await try_dismiss_consent_banner(page)
+                await _expand_all_interactive(page)
+                await page.wait_for_timeout(500)
+                # Extract text — try specific content areas, fall back to full body
+                text = await page.evaluate("""
+                    () => {
+                        // Try progressively broader content selectors
+                        const selectors = [
+                            '.article-content', '.page-content', '.entry-content',
+                            '[class*="content-area"]', '[class*="main-content"]',
+                            'main article', 'main', 'article',
+                            '[role="main"]', '.content', '#content',
+                        ];
+                        for (const sel of selectors) {
+                            const el = document.querySelector(sel);
+                            if (el && el.textContent.trim().length > 200) {
+                                return el.textContent.trim();
+                            }
+                        }
-                            // Fallback: full body minus nav/header/footer
-                            const body = document.body.cloneNode(true);
-                            body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
-                            return body.textContent?.trim() || '';
-                        }
-                    """)
-                    if text and len(text) > 50:
-                        result.documents.append(DiscoveredDSI(
-                            title=title, url=href, source_url=url,
-                            language=lang,
-                            doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
-                            text=text[:50000], word_count=len(text.split()),
-                        ))
+                        // Fallback: full body minus nav/header/footer
+                        const body = document.body.cloneNode(true);
+                        body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
+                        return body.textContent?.trim() || '';
+                    }
+                """)
+                if text and len(text) > 50:
+                    result.documents.append(DiscoveredDSI(
+                        title=title, url=href, source_url=url,
+                        language=lang,
+                        doc_type="cross_domain" if not _is_allowed_domain(href, base_domain) else "html_page",
+                        text=text[:50000], word_count=len(text.split()),
+                    ))
-                        # Recursive: search THIS page for more DSI links
-                        new_links = await _find_dsi_links(page, base_domain)
-                        for nl in new_links:
-                            if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
-                                pending_links.append(nl)
+                # Recursive: search THIS page for more DSI links
+                new_links = await _find_dsi_links(page, base_domain)
+                for nl in new_links:
+                    if nl["href"] not in seen_urls and nl["href"] not in [p["href"] for p in pending_links]:
+                        pending_links.append(nl)
                 # Navigate back for next link
-                await page.goto(url, wait_until="networkidle", timeout=45000)
+                await goto_resilient(page, url, timeout=45000)
                 await page.wait_for_timeout(500)
                 await _expand_all_interactive(page)
             except Exception as e:
                 result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
                 try:
-                    await page.goto(url, wait_until="networkidle", timeout=45000)
+                    await goto_resilient(page, url, timeout=45000)
                 except Exception:
                     pass
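
The two remaining dsi_helpers used above can be sketched as follows. The
signatures match the call sites in the diff; the bodies, including the
consent-button selector list, are illustrative assumptions:

    from playwright.async_api import Page

    def is_pdf_redirect(requested_url: str, final_url: str) -> bool:
        """True if navigation landed on a different URL that looks like a PDF."""
        return final_url != requested_url and ".pdf" in final_url.lower()

    async def try_dismiss_consent_banner(page: Page) -> bool:
        """Click the first visible accept button of common consent banners."""
        selectors = [
            "#onetrust-accept-btn-handler",          # OneTrust (example)
            "button:has-text('Alle akzeptieren')",   # generic German accept button
        ]
        for sel in selectors:
            try:
                btn = page.locator(sel).first
                if await btn.is_visible():
                    await btn.click()
                    await page.wait_for_timeout(500)
                    return True
            except Exception:
                continue
        return False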