fix: DSI dedup prefers 'Datenschutzinformation*' titles + better JS content extraction

Bug 1 fix: When merging documents with identical word_count, prefer titles starting with 'Datenschutzinformation' over generic section headings such as 'Zweck und Rechtsgrundlage'. This restores the main 'Datenschutzinformationen zum Internetangebot' document.

Bug 2 fix: After navigating to a document page, wait 3s (was 2s) for JS content to load, then try 10+ content selectors before falling back to the body text (with nav/header/footer removed). This handles IHK-style JS navigation, where content loads after page.goto() completes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-05 12:26:42 +02:00
parent b4f90ed113
commit d547e63663
1 changed file with 28 additions and 15 deletions
@@ -289,18 +289,33 @@ async def discover_dsi_documents(
                if is_anchor:
                    continue
-                # Navigate to page
+                # Navigate to page — wait for JS to load content
                resp = await page.goto(href, wait_until="networkidle", timeout=20000)
                if resp and resp.status < 400:
-                    await page.wait_for_timeout(2000)
+                    await page.wait_for_timeout(3000)  # Extra wait for JS content loading
                    await _expand_all_interactive(page)
-                    await page.wait_for_timeout(500)
+                    await page.wait_for_timeout(1000)
-                    # Extract text
+                    # Extract text — try specific content areas, fall back to full body
                    text = await page.evaluate("""
                        () => {
-                            const main = document.querySelector('main, article, [role="main"], .content, #content');
+                            // Try progressively broader content selectors
-                            return (main || document.body).textContent?.trim() || '';
+                            const selectors = [
                                '.article-content', '.page-content', '.entry-content',
                                '[class*="content-area"]', '[class*="main-content"]',
                                'main article', 'main', 'article',
                                '[role="main"]', '.content', '#content',
                            ];
                            for (const sel of selectors) {
                                const el = document.querySelector(sel);
                                if (el && el.textContent.trim().length > 200) {
                                    return el.textContent.trim();
                                }
                            }
                            // Fallback: full body minus nav/header/footer
                            const body = document.body.cloneNode(true);
                            body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
                            return body.textContent?.trim() || '';
                        }
                    """)
                    if text and len(text) > 50:
@@ -344,12 +359,9 @@ async def discover_dsi_documents(
                result.total_found, result.languages_detected)
    return result
-# Titles that are navigation elements, not actual documents
+# Nav elements, not real documents
-NOISE_TITLES = {
+NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
-    "drucken", "print", "nach oben", "back to top", "teilen", "share",
+    "kontakt", "contact", "suche", "search", "menü", "menu", "home", "datenschutz"}
    "kontakt", "contact", "suche", "search", "menü", "menu", "home",
    "datenschutz",  # too generic (just the word, not a doc title)
 }
 def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
    """Remove duplicate and noise documents."""
@@ -374,10 +386,11 @@ def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
    for d in filtered:
        if d.word_count > 200:  # Only dedup substantial docs
            if d.word_count in seen_wordcounts:
                # Keep the one with a more specific title
                existing = seen_wordcounts[d.word_count]
-                if len(d.title) > len(existing.title):
+                # Prefer "Datenschutzinformation*" titles over section headings
-                    # Replace with more descriptive title
+                d_is_dsi = d.title.lower().startswith("datenschutzinformation")
                ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
                if d_is_dsi and not ex_is_dsi:
                    unique = [x for x in unique if x is not existing]
                    unique.append(d)
                    seen_wordcounts[d.word_count] = d