fix: DSI dedup prefers 'Datenschutzinformation*' titles + better JS content extraction
Bug 1 fix: When deduplicating documents that have an identical word_count, prefer titles starting with 'Datenschutzinformation' over generic section headings such as 'Zweck und Rechtsgrundlage'. This restores the main 'Datenschutzinformationen zum Internetangebot' document.

Bug 2 fix: After navigating to a document page, wait 3s (was 2s) for JS content to load, and 1s (was 0.5s) after expanding interactive elements. Then try 10+ content selectors, accepting the first one whose text is longer than 200 characters, before falling back to the body text (with nav/header/footer removed). This handles IHK-style JS navigation, where content loads after page.goto() completes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -289,18 +289,33 @@ async def discover_dsi_documents(
|
|||||||
if is_anchor:
|
if is_anchor:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Navigate to page
|
# Navigate to page — wait for JS to load content
|
||||||
resp = await page.goto(href, wait_until="networkidle", timeout=20000)
|
resp = await page.goto(href, wait_until="networkidle", timeout=20000)
|
||||||
if resp and resp.status < 400:
|
if resp and resp.status < 400:
|
||||||
await page.wait_for_timeout(2000)
|
await page.wait_for_timeout(3000) # Extra wait for JS content loading
|
||||||
await _expand_all_interactive(page)
|
await _expand_all_interactive(page)
|
||||||
await page.wait_for_timeout(500)
|
await page.wait_for_timeout(1000)
|
||||||
|
|
||||||
# Extract text
|
# Extract text — try specific content areas, fall back to full body
|
||||||
text = await page.evaluate("""
|
text = await page.evaluate("""
|
||||||
() => {
|
() => {
|
||||||
const main = document.querySelector('main, article, [role="main"], .content, #content');
|
// Try progressively broader content selectors
|
||||||
return (main || document.body).textContent?.trim() || '';
|
const selectors = [
|
||||||
|
'.article-content', '.page-content', '.entry-content',
|
||||||
|
'[class*="content-area"]', '[class*="main-content"]',
|
||||||
|
'main article', 'main', 'article',
|
||||||
|
'[role="main"]', '.content', '#content',
|
||||||
|
];
|
||||||
|
for (const sel of selectors) {
|
||||||
|
const el = document.querySelector(sel);
|
||||||
|
if (el && el.textContent.trim().length > 200) {
|
||||||
|
return el.textContent.trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fallback: full body minus nav/header/footer
|
||||||
|
const body = document.body.cloneNode(true);
|
||||||
|
body.querySelectorAll('nav, header, footer, script, style, [class*="nav"], [class*="sidebar"]').forEach(e => e.remove());
|
||||||
|
return body.textContent?.trim() || '';
|
||||||
}
|
}
|
||||||
""")
|
""")
|
||||||
if text and len(text) > 50:
|
if text and len(text) > 50:
|
||||||
@@ -344,12 +359,9 @@ async def discover_dsi_documents(
|
|||||||
result.total_found, result.languages_detected)
|
result.total_found, result.languages_detected)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# Titles that are navigation elements, not actual documents
|
# Nav elements, not real documents
|
||||||
NOISE_TITLES = {
|
NOISE_TITLES = {"drucken", "print", "nach oben", "back to top", "teilen", "share",
|
||||||
"drucken", "print", "nach oben", "back to top", "teilen", "share",
|
"kontakt", "contact", "suche", "search", "menü", "menu", "home", "datenschutz"}
|
||||||
"kontakt", "contact", "suche", "search", "menü", "menu", "home",
|
|
||||||
"datenschutz", # too generic (just the word, not a doc title)
|
|
||||||
}
|
|
||||||
|
|
||||||
def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
|
def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
|
||||||
"""Remove duplicate and noise documents."""
|
"""Remove duplicate and noise documents."""
|
||||||
@@ -374,10 +386,11 @@ def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
|
|||||||
for d in filtered:
|
for d in filtered:
|
||||||
if d.word_count > 200: # Only dedup substantial docs
|
if d.word_count > 200: # Only dedup substantial docs
|
||||||
if d.word_count in seen_wordcounts:
|
if d.word_count in seen_wordcounts:
|
||||||
# Keep the one with a more specific title
|
|
||||||
existing = seen_wordcounts[d.word_count]
|
existing = seen_wordcounts[d.word_count]
|
||||||
if len(d.title) > len(existing.title):
|
# Prefer "Datenschutzinformation*" titles over section headings
|
||||||
# Replace with more descriptive title
|
d_is_dsi = d.title.lower().startswith("datenschutzinformation")
|
||||||
|
ex_is_dsi = existing.title.lower().startswith("datenschutzinformation")
|
||||||
|
if d_is_dsi and not ex_is_dsi:
|
||||||
unique = [x for x in unique if x is not existing]
|
unique = [x for x in unique if x is not existing]
|
||||||
unique.append(d)
|
unique.append(d)
|
||||||
seen_wordcounts[d.word_count] = d
|
seen_wordcounts[d.word_count] = d
|
||||||
|
|||||||
Reference in New Issue
Block a user