fix: accordion close bug + merge multi-page DSIs (BMW fix)

1. _expand_all_interactive(): Only click aria-expanded="false" buttons.
   Before: it clicked ALL accordion buttons, including already-open ones, so
   BMW's pre-expanded accordions got CLOSED, reducing the extracted text from
   1151 to 361 words.

2. _fetch_text() + /extract-text: merge the documents found on a page
   (up to 10, via max_documents=10 instead of 1). BMW splits its DSI across
   5 sub-pages that the discovery returns as separate documents; these are
   now merged into one text.

3. Tab panels: unhide hidden tabpanels instead of clicking tabs
   (clicking tabs can hide the currently visible panel).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-15 13:32:04 +02:00
parent 70af018da5
commit fca67c1f43
2 changed files with 64 additions and 19 deletions
@@ -64,12 +64,15 @@ class ComplianceCheckStatusResponse(BaseModel):
@router.post("/extract-text") @router.post("/extract-text")
async def extract_text(req: ExtractTextRequest): async def extract_text(req: ExtractTextRequest):
"""Extract text from a URL via consent-tester DSI discovery.""" """Extract text from a URL via consent-tester DSI discovery.
Merges all documents found on the page (sub-pages, accordions, etc.)
"""
try: try:
async with httpx.AsyncClient(timeout=90.0) as client: async with httpx.AsyncClient(timeout=90.0) as client:
resp = await client.post( resp = await client.post(
f"{CONSENT_TESTER_URL}/dsi-discovery", f"{CONSENT_TESTER_URL}/dsi-discovery",
json={"url": req.url, "max_documents": 1}, json={"url": req.url, "max_documents": 10},
) )
if resp.status_code != 200: if resp.status_code != 200:
return { return {
@@ -86,10 +89,15 @@ async def extract_text(req: ExtractTextRequest):
"error": "Kein Text extrahierbar", "error": "Kein Text extrahierbar",
} }
doc = docs[0] # Merge all documents (handles multi-page DSIs like BMW)
text = doc.get("full_text", "") or doc.get("text_preview", "") or doc.get("text", "") texts = []
title = doc.get("title", "") or doc.get("doc_type", "") for doc in docs:
word_count = doc.get("word_count", 0) or len(text.split()) t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
if t and len(t) > 50:
texts.append(t)
text = "\n\n".join(texts) if texts else ""
title = docs[0].get("title", "") or docs[0].get("doc_type", "")
word_count = len(text.split())
return { return {
"text": text, "text": text,
@@ -371,20 +379,33 @@ def _update(check_id: str, msg: str):
async def _fetch_text(url: str) -> str: async def _fetch_text(url: str) -> str:
"""Fetch text from URL via consent-tester.""" """Fetch text from URL via consent-tester.
Merges ALL documents found on the page (handles sites like BMW
that split DSI across multiple sub-pages/accordions).
"""
try: try:
async with httpx.AsyncClient(timeout=90.0) as client: async with httpx.AsyncClient(timeout=90.0) as client:
resp = await client.post( resp = await client.post(
f"{CONSENT_TESTER_URL}/dsi-discovery", f"{CONSENT_TESTER_URL}/dsi-discovery",
json={"url": url, "max_documents": 1}, json={"url": url, "max_documents": 10},
) )
if resp.status_code != 200: if resp.status_code != 200:
return "" return ""
docs = resp.json().get("documents", []) docs = resp.json().get("documents", [])
if not docs: if not docs:
return "" return ""
doc = docs[0] # Merge all documents found on the page
return doc.get("full_text", "") or doc.get("text_preview", "") or "" texts = []
for doc in docs:
t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
if t and len(t) > 50:
texts.append(t)
merged = "\n\n".join(texts)
if len(texts) > 1:
logger.info("Merged %d documents from %s (%d words)",
len(texts), url, len(merged.split()))
return merged
except Exception as e: except Exception as e:
logger.warning("Text fetch failed for %s: %s", url, e) logger.warning("Text fetch failed for %s: %s", url, e)
return "" return ""
+33 -9
View File
@@ -532,19 +532,43 @@ async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
return [] return []
async def _expand_all_interactive(page: Page) -> None: async def _expand_all_interactive(page: Page) -> None:
"""Expand all accordions, tabs, details, dropdowns on the page.""" """Expand all accordions, tabs, details, dropdowns on the page.
IMPORTANT: Only expand CLOSED elements. Never click elements that
are already expanded (aria-expanded="true") — that would close them.
BMW, for example, has accordions open by default.
"""
try: try:
await page.evaluate("""() => { await page.evaluate("""() => {
// 1. Open all <details> that are closed
document.querySelectorAll('details:not([open])').forEach(d => d.open = true); document.querySelectorAll('details:not([open])').forEach(d => d.open = true);
const sels = ['button[aria-expanded="false"]','[data-toggle="collapse"]',
'[data-bs-toggle="collapse"]','[class*="accordion"] > button', // 2. Click buttons that are explicitly CLOSED (aria-expanded="false")
'[class*="collapse"] > button','.panel-heading a']; document.querySelectorAll('button[aria-expanded="false"]').forEach(b => {
sels.forEach(s => document.querySelectorAll(s).forEach(e => { try{e.click()}catch{} })); try { b.click(); } catch {}
document.querySelectorAll('button,a').forEach(b => { });
if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test((b.textContent||'').trim()))
try{b.click()}catch{} // 3. Bootstrap/jQuery collapse triggers (only closed ones)
document.querySelectorAll('[data-toggle="collapse"].collapsed').forEach(e => {
try { e.click(); } catch {}
});
document.querySelectorAll('[data-bs-toggle="collapse"].collapsed').forEach(e => {
try { e.click(); } catch {}
});
// 4. "Show more" / "Mehr anzeigen" buttons
document.querySelectorAll('button,a').forEach(b => {
const t = (b.textContent || '').trim();
if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test(t))
try { b.click(); } catch {}
});
// 5. Tabs — click each to make content visible, then go back
// (don't click, just make tab panels visible)
document.querySelectorAll('[role="tabpanel"][hidden]').forEach(p => {
p.removeAttribute('hidden');
p.style.display = '';
}); });
document.querySelectorAll('[role="tab"]').forEach(t => { try{t.click()}catch{} });
}""") }""")
except Exception: except Exception:
pass pass