diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py
index c67a644..9f71b62 100644
--- a/consent-tester/services/dsi_discovery.py
+++ b/consent-tester/services/dsi_discovery.py
@@ -289,18 +289,33 @@ async def discover_dsi_documents(
         if is_anchor:
             continue
 
-        # Navigate to page
+        # Navigate to page — wait for JS to load content
         resp = await page.goto(href, wait_until="networkidle", timeout=20000)
         if resp and resp.status < 400:
-            await page.wait_for_timeout(2000)
+            await page.wait_for_timeout(3000)  # Extra wait for JS content loading
            await _expand_all_interactive(page)
-            await page.wait_for_timeout(500)
+            await page.wait_for_timeout(1000)
 
-            # Extract text
+            # Extract text — try specific content areas, fall back to full body
             text = await page.evaluate("""
                 () => {
-                    const main = document.querySelector('main, article, [role="main"], .content, #content');
-                    return (main || document.body).textContent?.trim() || '';
+                    // Try progressively broader content selectors
+                    const selectors = [
+                        '.article-content', '.page-content', '.entry-content',
+                        '[class*="content-area"]', '[class*="main-content"]',
+                        'main article', 'main', 'article',
+                        '[role="main"]', '.content', '#content',
+                    ];
+                    for (const sel of selectors) {
+                        const el = document.querySelector(sel);
+                        if (el && el.textContent.trim().length > 200) {
+                            return el.textContent.trim();
+                        }
+                    }
+                    // Fallback: full body minus nav/header/footer
+                    const body = document.body.cloneNode(true);
+                    body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
+                    return body.textContent?.trim() || '';
                 }
             """)
             if text and len(text) > 50:
@@ -344,12 +359,9 @@ async def discover_dsi_documents(
                 result.total_found, result.languages_detected)
     return result
 
-# Titles that are navigation elements, not actual documents
-NOISE_TITLES = {
-    "drucken", "print", "nach oben", "back to top", "teilen", "share",
-    "kontakt", "contact", "suche", "search", "menü", "menu", "home",
-    "datenschutz",  # too generic (just the word, not a doc title)
-}
+# Nav elements, not real documents
+NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
+                "kontakt", "contact", "suche", "search", "menü", "menu", "home", "datenschutz"}
 
 def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
     """Remove duplicate and noise documents."""
@@ -374,10 +386,11 @@ def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
     for d in filtered:
         if d.word_count > 200:  # Only dedup substantial docs
             if d.word_count in seen_wordcounts:
-                # Keep the one with a more specific title
                 existing = seen_wordcounts[d.word_count]
-                if len(d.title) > len(existing.title):
-                    # Replace with more descriptive title
+                # Prefer "Datenschutzinformation*" titles over section headings
+                d_is_dsi = d.title.lower().startswith("datenschutzinformation")
+                ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
+                if d_is_dsi and not ex_is_dsi:
                     unique = [x for x in unique if x is not existing]
                     unique.append(d)
             seen_wordcounts[d.word_count] = d