From d547e63663f0aebf7ddaca23915051bf0f6e8726 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Tue, 5 May 2026 12:26:42 +0200
Subject: [PATCH] fix: DSI dedup prefers 'Datenschutzinformation*' titles + better JS content extraction

Bug 1 fix: When merging documents with identical word_count, prefer titles
starting with 'Datenschutzinformation' over generic section headings like
'Zweck und Rechtsgrundlage'. This restores the main 'Datenschutzinformationen
zum Internetangebot' document.

Bug 2 fix: After navigating to a document page, wait 3s (was 2s) for JS
content loading, then try 10+ content selectors before falling back to body
text (with nav/header/footer removed). Handles IHK-style JS navigation where
content loads after page.goto() completes.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 consent-tester/services/dsi_discovery.py | 43 +++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py
index c67a644..9f71b62 100644
--- a/consent-tester/services/dsi_discovery.py
+++ b/consent-tester/services/dsi_discovery.py
@@ -289,18 +289,33 @@ async def discover_dsi_documents(
                 if is_anchor:
                     continue
 
-            # Navigate to page
+            # Navigate to page — wait for JS to load content
             resp = await page.goto(href, wait_until="networkidle", timeout=20000)
             if resp and resp.status < 400:
-                await page.wait_for_timeout(2000)
+                await page.wait_for_timeout(3000)  # Extra wait for JS content loading
                 await _expand_all_interactive(page)
-                await page.wait_for_timeout(500)
+                await page.wait_for_timeout(1000)
 
-                # Extract text
+                # Extract text — try specific content areas, fall back to full body
                 text = await page.evaluate("""
                     () => {
-                        const main = document.querySelector('main, article, [role="main"], .content, #content');
-                        return (main || document.body).textContent?.trim() || '';
+                        // Try progressively broader content selectors
+                        const selectors = [
+                            '.article-content', '.page-content', '.entry-content',
+                            '[class*="content-area"]', '[class*="main-content"]',
+                            'main article', 'main', 'article',
+                            '[role="main"]', '.content', '#content',
+                        ];
+                        for (const sel of selectors) {
+                            const el = document.querySelector(sel);
+                            if (el && el.textContent.trim().length > 200) {
+                                return el.textContent.trim();
+                            }
+                        }
+                        // Fallback: full body minus nav/header/footer
+                        const body = document.body.cloneNode(true);
+                        body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
+                        return body.textContent?.trim() || '';
                     }
                 """)
                 if text and len(text) > 50:
@@ -344,12 +359,9 @@ async def discover_dsi_documents(
             result.total_found, result.languages_detected)
     return result
 
-# Titles that are navigation elements, not actual documents
-NOISE_TITLES = {
-    "drucken", "print", "nach oben", "back to top", "teilen", "share",
-    "kontakt", "contact", "suche", "search", "menü", "menu", "home",
-    "datenschutz",  # too generic (just the word, not a doc title)
-}
+# Nav elements, not real documents
+NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
+                "kontakt", "contact", "suche", "search", "menü", "menu", "home", "datenschutz"}
 
 def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
     """Remove duplicate and noise documents."""
@@ -374,10 +386,11 @@ def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
     for d in filtered:
         if d.word_count > 200:  # Only dedup substantial docs
             if d.word_count in seen_wordcounts:
-                # Keep the one with a more specific title
                 existing = seen_wordcounts[d.word_count]
-                if len(d.title) > len(existing.title):
-                    # Replace with more descriptive title
+                # Prefer "Datenschutzinformation*" titles over section headings
+                d_is_dsi = d.title.lower().startswith("datenschutzinformation")
+                ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
+                if d_is_dsi and not ex_is_dsi:
                     unique = [x for x in unique if x is not existing]
                     unique.append(d)
                     seen_wordcounts[d.word_count] = d
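
A minimal sketch of the Bug 1 tie-break in isolation, for illustration only and not
part of the patch. Doc and prefer are hypothetical stand-ins for DiscoveredDSI and
the in-loop dedup logic; the sketch assumes only the title and word_count fields
that appear in the diff.

    from dataclasses import dataclass

    @dataclass
    class Doc:
        # Stand-in for DiscoveredDSI; only the fields the tie-break needs.
        title: str
        word_count: int

    def prefer(existing: Doc, candidate: Doc) -> Doc:
        """Return the document to keep when word counts collide."""
        cand_is_dsi = candidate.title.lower().startswith("datenschutzinformation")
        ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
        return candidate if cand_is_dsi and not ex_is_dsi else existing

    # The generic section heading loses to the main DSI document, and an
    # already-kept DSI title is never displaced by a section heading.
    section = Doc("Zweck und Rechtsgrundlage", 4211)
    main_doc = Doc("Datenschutzinformationen zum Internetangebot", 4211)
    assert prefer(section, main_doc) is main_doc
    assert prefer(main_doc, section) is main_doc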