fix: accordion close bug + merge multi-page DSIs (BMW fix)

1. _expand_all_interactive(): Only click buttons with aria-expanded="false".
   Before: ALL accordion buttons were clicked, including already-open ones →
   BMW's pre-expanded accordions got CLOSED, reducing the extracted text from
   1151 to 361 words.

2. _fetch_text() + /extract-text: Merge ALL documents found on a page
   (max_documents=10 instead of 1). BMW splits its DSI across 5 sub-pages,
   which the discovery reports as separate documents — these are now merged.

3. Tab panels: Unhide hidden tabpanels instead of clicking tabs (clicking a
   tab can hide the currently visible panel).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-15 13:32:04 +02:00
parent 70af018da5
commit fca67c1f43
2 changed files with 64 additions and 19 deletions
@@ -64,12 +64,15 @@ class ComplianceCheckStatusResponse(BaseModel):

@router.post("/extract-text")
 async def extract_text(req: ExtractTextRequest):
-    """Extract text from a URL via consent-tester DSI discovery."""
+    """Extract text from a URL via consent-tester DSI discovery.
+
+    Merges all documents found on the page (sub-pages, accordions, etc.)
+    """
    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
-                json={"url": req.url, "max_documents": 1},
+                json={"url": req.url, "max_documents": 10},
            )
            if resp.status_code != 200:
                return {
@@ -86,10 +89,15 @@ async def extract_text(req: ExtractTextRequest):
                    "error": "Kein Text extrahierbar",
                }

-            doc = docs[0]
-            text = doc.get("full_text", "") or doc.get("text_preview", "") or doc.get("text", "")
-            title = doc.get("title", "") or doc.get("doc_type", "")
-            word_count = doc.get("word_count", 0) or len(text.split())
+            # Merge all documents (handles multi-page DSIs like BMW)
+            texts = []
+            for doc in docs:
+                t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
+                if t and len(t) > 50:
+                    texts.append(t)
+            text = "\n\n".join(texts) if texts else ""
+            title = docs[0].get("title", "") or docs[0].get("doc_type", "")
+            word_count = len(text.split())

            return {
                "text": text,
@@ -371,20 +379,33 @@ def _update(check_id: str, msg: str):


 async def _fetch_text(url: str) -> str:
-    """Fetch text from URL via consent-tester."""
+    """Fetch text from URL via consent-tester.
+
+    Merges ALL documents found on the page (handles sites like BMW
+    that split DSI across multiple sub-pages/accordions).
+    """
    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
-                json={"url": url, "max_documents": 1},
+                json={"url": url, "max_documents": 10},
            )
            if resp.status_code != 200:
                return ""
            docs = resp.json().get("documents", [])
            if not docs:
                return ""
-            doc = docs[0]
-            return doc.get("full_text", "") or doc.get("text_preview", "") or ""
+            # Merge all documents found on the page
+            texts = []
+            for doc in docs:
+                t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
+                if t and len(t) > 50:
+                    texts.append(t)
+            merged = "\n\n".join(texts)
+            if len(texts) > 1:
+                logger.info("Merged %d documents from %s (%d words)",
+                            len(texts), url, len(merged.split()))
+            return merged
    except Exception as e:
        logger.warning("Text fetch failed for %s: %s", url, e)
        return ""