fix: DSI self-extraction + banner L1/L2 check definitions
Build + Deploy / build-developer-portal (push) Successful in 1m26s
Build + Deploy / build-tts (push) Successful in 1m38s
Build + Deploy / build-document-crawler (push) Successful in 37s
Build + Deploy / build-dsms-gateway (push) Successful in 26s
Build + Deploy / build-dsms-node (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 18s
CI / secret-scan (push) Has been skipped
CI / nodejs-build (push) Successful in 3m7s
CI / dep-audit (push) Has been skipped
Build + Deploy / build-admin-compliance (push) Successful in 2m22s
Build + Deploy / build-backend-compliance (push) Successful in 3m20s
Build + Deploy / build-ai-sdk (push) Successful in 54s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go (push) Failing after 46s
CI / test-python-backend (push) Successful in 45s
CI / test-python-document-crawler (push) Successful in 30s
CI / test-python-dsms-gateway (push) Successful in 27s
CI / validate-canonical-controls (push) Successful in 17s
Build + Deploy / trigger-orca (push) Successful in 3m37s
CI / sbom-scan (push) Has been skipped
1. DSI Discovery fix for direct-URL use case (e.g. example.com/datenschutz):
- Self-extraction: if the URL itself is a DSE page, extract its text
directly from the page body (main/article/content element)
- Remove "datenschutz" from NOISE_TITLES — it's a legitimate doc title
- Fixes safetykon.de/datenschutz returning 0 documents
2. Banner check definitions (36 checks: 6 L1 + 30 L2):
- consent-tester/checks/banner_checks.py with expert-level hints
- EDPB 3/2022, CNIL rulings, EuGH C-673/17, §25 TDDDG references
- check_key maps to existing consent_scanner check codes
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
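The banner check definitions described in item 2 are not shown in the diff below. As an illustration only, a check table of the kind the commit message describes (a `check_key` mapping to an existing consent_scanner code, plus an expert-level hint and legal references) might look like the following sketch; the entry names, keys, and helper are assumptions, not the actual contents of `consent-tester/checks/banner_checks.py`:

```python
# Hypothetical shape of a banner check definition. The commit describes
# 36 such checks (6 for banner layer 1, 30 for layer 2); the two entries
# below are illustrative, not taken from the real file.
BANNER_CHECKS = [
    {
        "check_key": "reject_all_first_layer",  # assumed consent_scanner code
        "level": "L1",
        "hint": "A 'Reject all' option must be as easy to reach as 'Accept all'.",
        "references": ["EDPB Guidelines 3/2022", "CNIL cookie rulings"],
    },
    {
        "check_key": "no_preticked_boxes",  # assumed consent_scanner code
        "level": "L2",
        "hint": "Pre-ticked checkboxes do not constitute valid consent.",
        "references": ["EuGH C-673/17 (Planet49)", "§25 TDDDG"],
    },
]

def checks_by_level(level: str) -> list[dict]:
    """Filter check definitions by banner layer ("L1" or "L2")."""
    return [c for c in BANNER_CHECKS if c["level"] == level]
```

Keeping the definitions as plain data keyed by `check_key` lets the scanner look up hints and references without coupling the legal text to the scanning logic.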
@@ -220,6 +220,40 @@ async def discover_dsi_documents(
         await page.goto(url, wait_until="networkidle", timeout=60000)
         await page.wait_for_timeout(2000)
 
+        # Step 1b: Self-extraction — if the URL itself is a DSI page,
+        # extract its full text as the first document. This handles the
+        # case where the user provides the DSE URL directly (e.g.
+        # example.com/datenschutz) instead of the homepage.
+        current_url_path = urlparse(url).path.lower()
+        is_self_dsi, self_lang = _matches_dsi_keyword(current_url_path)
+        if not is_self_dsi:
+            # Also check the page title
+            page_title = await page.title() or ""
+            is_self_dsi, self_lang = _matches_dsi_keyword(page_title)
+        if is_self_dsi:
+            try:
+                self_text = await page.evaluate("""() => {
+                    const main = document.querySelector('main, article, [role="main"], .content, #content, .bodytext')
+                        || document.body;
+                    return main ? main.innerText : document.body.innerText;
+                }""")
+                self_wc = len(self_text.split()) if self_text else 0
+                if self_wc >= 100:
+                    page_title = await page.title() or url
+                    result.documents.append(DiscoveredDSI(
+                        title=page_title.strip(),
+                        url=url,
+                        source_url=url,
+                        language=self_lang or "de",
+                        doc_type="html_full_page",
+                        text=self_text.strip(),
+                        word_count=self_wc,
+                    ))
+                    seen_urls.add(url)
+                    logger.info("Self-extracted %d words from %s", self_wc, url)
+            except Exception as e:
+                logger.warning("Self-extraction failed for %s: %s", url, e)
+
         # Step 2: Find DSI links in current page
         links = await _find_dsi_links(page, base_domain)
         logger.info("Found %d DSI links on %s", len(links), url)
@@ -360,8 +394,9 @@ async def discover_dsi_documents(
     return result
 
 # Nav elements, not real documents
-NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
-                "kontakt", "contact", "suche", "search", "menü", "menu", "home", "datenschutz"}
+# NOTE: "datenschutz" was removed — it's a legitimate document title
+NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
+                "kontakt", "contact", "suche", "search", "menü", "menu", "home"}
 
 def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
     """Remove duplicate and noise documents."""