From d547e63663f0aebf7ddaca23915051bf0f6e8726 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Tue, 5 May 2026 12:26:42 +0200
Subject: [PATCH] fix: DSI dedup prefers 'Datenschutzinformation*' titles + better JS content extraction

Bug 1 fix: When merging documents with identical word_count, prefer titles
starting with 'Datenschutzinformation' over generic section headings like
'Zweck und Rechtsgrundlage'. This restores the main 'Datenschutzinformationen
zum Internetangebot' document.

Bug 2 fix: After navigating to a document page, wait 3s (was 2s) for JS
content loading, then try 10+ content selectors before falling back to body
text (with nav/header/footer removed). Handles IHK-style JS navigation where
content loads after page.goto() completes.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 consent-tester/services/dsi_discovery.py | 43 +++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py
index c67a644..9f71b62 100644
--- a/consent-tester/services/dsi_discovery.py
+++ b/consent-tester/services/dsi_discovery.py
@@ -289,18 +289,33 @@ async def discover_dsi_documents(
                 if is_anchor:
                     continue
 
-            # Navigate to page
+            # Navigate to page — wait for JS to load content
             resp = await page.goto(href, wait_until="networkidle", timeout=20000)
             if resp and resp.status < 400:
-                await page.wait_for_timeout(2000)
+                await page.wait_for_timeout(3000)  # Extra wait for JS content loading
                 await _expand_all_interactive(page)
-                await page.wait_for_timeout(500)
+                await page.wait_for_timeout(1000)
 
-                # Extract text
+                # Extract text — try specific content areas, fall back to full body
                 text = await page.evaluate("""
                     () => {
-                        const main = document.querySelector('main, article, [role="main"], .content, #content');
-                        return (main || document.body).textContent?.trim() || '';
+                        // Try progressively broader content selectors
+                        const selectors = [
+                            '.article-content', '.page-content', '.entry-content',
+                            '[class*="content-area"]', '[class*="main-content"]',
+                            'main article', 'main', 'article',
+                            '[role="main"]', '.content', '#content',
+                        ];
+                        for (const sel of selectors) {
+                            const el = document.querySelector(sel);
+                            if (el && el.textContent.trim().length > 200) {
+                                return el.textContent.trim();
+                            }
+                        }
+                        // Fallback: full body minus nav/header/footer
+                        const body = document.body.cloneNode(true);
+                        body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
+                        return body.textContent?.trim() || '';
                     }
                 """)
                 if text and len(text) > 50:
@@ -344,12 +359,9 @@ async def discover_dsi_documents(
             result.total_found, result.languages_detected)
     return result
 
-# Titles that are navigation elements, not actual documents
-NOISE_TITLES = {
-    "drucken", "print", "nach oben", "back to top", "teilen", "share",
-    "kontakt", "contact", "suche", "search", "menü", "menu", "home",
-    "datenschutz",  # too generic (just the word, not a doc title)
-}
+# Nav elements, not real documents
+NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
+                "kontakt", "contact", "suche", "search", "menü", "menu", "home", "datenschutz"}
 
 def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
     """Remove duplicate and noise documents."""
@@ -374,10 +386,11 @@ def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
     for d in filtered:
         if d.word_count > 200:  # Only dedup substantial docs
             if d.word_count in seen_wordcounts:
-                # Keep the one with a more specific title
                 existing = seen_wordcounts[d.word_count]
-                if len(d.title) > len(existing.title):
-                    # Replace with more descriptive title
+                # Prefer "Datenschutzinformation*" titles over section headings
+                d_is_dsi = d.title.lower().startswith("datenschutzinformation")
+                ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
+                if d_is_dsi and not ex_is_dsi:
                     unique = [x for x in unique if x is not existing]
                     unique.append(d)
                     seen_wordcounts[d.word_count] = d
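
A minimal sketch of the Bug 1 tie-break in isolation, for illustration only and not
part of the patch. Doc and prefer are hypothetical stand-ins for DiscoveredDSI and
the in-loop dedup logic; the sketch assumes only the title and word_count fields
that appear in the diff.

    from dataclasses import dataclass

    @dataclass
    class Doc:
        # Stand-in for DiscoveredDSI; only the fields the tie-break needs.
        title: str
        word_count: int

    def prefer(existing: Doc, candidate: Doc) -> Doc:
        """Return the document to keep when word counts collide."""
        cand_is_dsi = candidate.title.lower().startswith("datenschutzinformation")
        ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
        return candidate if cand_is_dsi and not ex_is_dsi else existing

    # The generic section heading loses to the main DSI document, and an
    # already-kept DSI title is never displaced by a section heading.
    section = Doc("Zweck und Rechtsgrundlage", 4211)
    main_doc = Doc("Datenschutzinformationen zum Internetangebot", 4211)
    assert prefer(section, main_doc) is main_doc
    assert prefer(main_doc, section) is main_doc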