fix: DSI dedup prefers 'Datenschutzinformation*' titles + better JS content extraction

Bug 1 fix: When merging documents with identical word_count, prefer
titles starting with 'Datenschutzinformation' over generic section
headings like 'Zweck und Rechtsgrundlage'. This restores the main
'Datenschutzinformationen zum Internetangebot' document.

Bug 2 fix: After navigating to a document page, wait 3s (was 2s) for
JS content loading, then try 11 content selectors (from most to least
specific) before falling back to body text (with nav/header/footer
removed). Handles IHK-style JS navigation where content loads after
page.goto() completes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-05 12:26:42 +02:00
parent b4f90ed113
commit d547e63663
+28 -15
View File
@@ -289,18 +289,33 @@ async def discover_dsi_documents(
if is_anchor:
continue
# Navigate to page
# Navigate to page — wait for JS to load content
resp = await page.goto(href, wait_until="networkidle", timeout=20000)
if resp and resp.status < 400:
await page.wait_for_timeout(2000)
await page.wait_for_timeout(3000) # Extra wait for JS content loading
await _expand_all_interactive(page)
await page.wait_for_timeout(500)
await page.wait_for_timeout(1000)
# Extract text
# Extract text — try specific content areas, fall back to full body
text = await page.evaluate("""
() => {
const main = document.querySelector('main, article, [role="main"], .content, #content');
return (main || document.body).textContent?.trim() || '';
// Try progressively broader content selectors
const selectors = [
'.article-content', '.page-content', '.entry-content',
'[class*="content-area"]', '[class*="main-content"]',
'main article', 'main', 'article',
'[role="main"]', '.content', '#content',
];
for (const sel of selectors) {
const el = document.querySelector(sel);
if (el && el.textContent.trim().length > 200) {
return el.textContent.trim();
}
}
// Fallback: full body minus nav/header/footer
const body = document.body.cloneNode(true);
body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
return body.textContent?.trim() || '';
}
""")
if text and len(text) > 50:
@@ -344,12 +359,9 @@ async def discover_dsi_documents(
result.total_found, result.languages_detected)
return result
# Titles that are navigation elements, not actual documents
NOISE_TITLES = {
"drucken", "print", "nach oben", "back to top", "teilen", "share",
"kontakt", "contact", "suche", "search", "menü", "menu", "home",
"datenschutz", # too generic (just the word, not a doc title)
}
# Nav elements, not real documents
NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
"kontakt", "contact", "suche", "search", "menü", "menu", "home", "datenschutz"}
def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
"""Remove duplicate and noise documents."""
@@ -374,10 +386,11 @@ def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
for d in filtered:
if d.word_count > 200: # Only dedup substantial docs
if d.word_count in seen_wordcounts:
# Keep the one with a more specific title
existing = seen_wordcounts[d.word_count]
if len(d.title) > len(existing.title):
# Replace with more descriptive title
# Prefer "Datenschutzinformation*" titles over section headings
d_is_dsi = d.title.lower().startswith("datenschutzinformation")
ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
if d_is_dsi and not ex_is_dsi:
unique = [x for x in unique if x is not existing]
unique.append(d)
seen_wordcounts[d.word_count] = d