fix: DSI dedup prefers 'Datenschutzinformation*' titles + better JS content extraction
Bug 1 fix: when merging documents with identical word_count, prefer titles that start with 'Datenschutzinformation' over generic section headings such as 'Zweck und Rechtsgrundlage'. This restores the main 'Datenschutzinformationen zum Internetangebot' document. Bug 2 fix: after navigating to a document page, wait 3 s (previously 2 s) for JavaScript-rendered content to load, then try more than ten content selectors before falling back to the full body text (with nav/header/footer elements removed). This handles IHK-style JS navigation where the content loads only after page.goto() completes. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -289,18 +289,33 @@ async def discover_dsi_documents(
|
||||
if is_anchor:
|
||||
continue
|
||||
|
||||
# Navigate to page
|
||||
# Navigate to page — wait for JS to load content
|
||||
resp = await page.goto(href, wait_until="networkidle", timeout=20000)
|
||||
if resp and resp.status < 400:
|
||||
await page.wait_for_timeout(2000)
|
||||
await page.wait_for_timeout(3000) # Extra wait for JS content loading
|
||||
await _expand_all_interactive(page)
|
||||
await page.wait_for_timeout(500)
|
||||
await page.wait_for_timeout(1000)
|
||||
|
||||
# Extract text
|
||||
# Extract text — try specific content areas, fall back to full body
|
||||
text = await page.evaluate("""
|
||||
() => {
|
||||
const main = document.querySelector('main, article, [role="main"], .content, #content');
|
||||
return (main || document.body).textContent?.trim() || '';
|
||||
// Try progressively broader content selectors
|
||||
const selectors = [
|
||||
'.article-content', '.page-content', '.entry-content',
|
||||
'[class*="content-area"]', '[class*="main-content"]',
|
||||
'main article', 'main', 'article',
|
||||
'[role="main"]', '.content', '#content',
|
||||
];
|
||||
for (const sel of selectors) {
|
||||
const el = document.querySelector(sel);
|
||||
if (el && el.textContent.trim().length > 200) {
|
||||
return el.textContent.trim();
|
||||
}
|
||||
}
|
||||
// Fallback: full body minus nav/header/footer
|
||||
const body = document.body.cloneNode(true);
|
||||
body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
|
||||
return body.textContent?.trim() || '';
|
||||
}
|
||||
""")
|
||||
if text and len(text) > 50:
|
||||
@@ -344,12 +359,9 @@ async def discover_dsi_documents(
|
||||
result.total_found, result.languages_detected)
|
||||
return result
|
||||
|
||||
# Titles that are navigation elements, not actual documents
|
||||
NOISE_TITLES = {
|
||||
"drucken", "print", "nach oben", "back to top", "teilen", "share",
|
||||
"kontakt", "contact", "suche", "search", "menü", "menu", "home",
|
||||
"datenschutz", # too generic (just the word, not a doc title)
|
||||
}
|
||||
# Nav elements, not real documents
|
||||
NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
|
||||
"kontakt", "contact", "suche", "search", "menü", "menu", "home", "datenschutz"}
|
||||
|
||||
def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
|
||||
"""Remove duplicate and noise documents."""
|
||||
@@ -374,10 +386,11 @@ def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
|
||||
for d in filtered:
|
||||
if d.word_count > 200: # Only dedup substantial docs
|
||||
if d.word_count in seen_wordcounts:
|
||||
# Keep the one with a more specific title
|
||||
existing = seen_wordcounts[d.word_count]
|
||||
if len(d.title) > len(existing.title):
|
||||
# Replace with more descriptive title
|
||||
# Prefer "Datenschutzinformation*" titles over section headings
|
||||
d_is_dsi = d.title.lower().startswith("datenschutzinformation")
|
||||
ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
|
||||
if d_is_dsi and not ex_is_dsi:
|
||||
unique = [x for x in unique if x is not existing]
|
||||
unique.append(d)
|
||||
seen_wordcounts[d.word_count] = d
|
||||
|
||||
Reference in New Issue
Block a user