fix: accordion close bug + merge multi-page DSIs (BMW fix)

1. _expand_all_interactive(): Only click aria-expanded="false" buttons.
   Before: clicked ALL accordion buttons, including already-open ones, so
   BMW's pre-expanded accordions got CLOSED, reducing the extracted text
   from 1151 to 361 words.

2. _fetch_text() + /extract-text: merge the documents found on a page
   (up to max_documents=10 instead of only the first one). BMW splits its
   DSI across 5 sub-pages that the discovery reports as separate
   documents; these are now merged into a single text.

3. Tab panels: unhide hidden tabpanels instead of clicking tabs
   (clicking tabs can hide the currently visible panel).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-15 13:32:04 +02:00
parent 70af018da5
commit fca67c1f43
2 changed files with 64 additions and 19 deletions
@@ -64,12 +64,15 @@ class ComplianceCheckStatusResponse(BaseModel):
@router.post("/extract-text")
async def extract_text(req: ExtractTextRequest):
"""Extract text from a URL via consent-tester DSI discovery."""
"""Extract text from a URL via consent-tester DSI discovery.
Merges all documents found on the page (sub-pages, accordions, etc.)
"""
try:
async with httpx.AsyncClient(timeout=90.0) as client:
resp = await client.post(
f"{CONSENT_TESTER_URL}/dsi-discovery",
json={"url": req.url, "max_documents": 1},
json={"url": req.url, "max_documents": 10},
)
if resp.status_code != 200:
return {
@@ -86,10 +89,15 @@ async def extract_text(req: ExtractTextRequest):
"error": "Kein Text extrahierbar",
}
doc = docs[0]
text = doc.get("full_text", "") or doc.get("text_preview", "") or doc.get("text", "")
title = doc.get("title", "") or doc.get("doc_type", "")
word_count = doc.get("word_count", 0) or len(text.split())
# Merge all documents (handles multi-page DSIs like BMW)
texts = []
for doc in docs:
t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
if t and len(t) > 50:
texts.append(t)
text = "\n\n".join(texts) if texts else ""
title = docs[0].get("title", "") or docs[0].get("doc_type", "")
word_count = len(text.split())
return {
"text": text,
@@ -371,20 +379,33 @@ def _update(check_id: str, msg: str):
async def _fetch_text(url: str) -> str:
"""Fetch text from URL via consent-tester."""
"""Fetch text from URL via consent-tester.
Merges ALL documents found on the page (handles sites like BMW
that split DSI across multiple sub-pages/accordions).
"""
try:
async with httpx.AsyncClient(timeout=90.0) as client:
resp = await client.post(
f"{CONSENT_TESTER_URL}/dsi-discovery",
json={"url": url, "max_documents": 1},
json={"url": url, "max_documents": 10},
)
if resp.status_code != 200:
return ""
docs = resp.json().get("documents", [])
if not docs:
return ""
doc = docs[0]
return doc.get("full_text", "") or doc.get("text_preview", "") or ""
# Merge all documents found on the page
texts = []
for doc in docs:
t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
if t and len(t) > 50:
texts.append(t)
merged = "\n\n".join(texts)
if len(texts) > 1:
logger.info("Merged %d documents from %s (%d words)",
len(texts), url, len(merged.split()))
return merged
except Exception as e:
logger.warning("Text fetch failed for %s: %s", url, e)
return ""