diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 8efdf3d6..18be3336 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -64,12 +64,15 @@ class ComplianceCheckStatusResponse(BaseModel): @router.post("/extract-text") async def extract_text(req: ExtractTextRequest): - """Extract text from a URL via consent-tester DSI discovery.""" + """Extract text from a URL via consent-tester DSI discovery. + + Merges all documents found on the page (sub-pages, accordions, etc.) + """ try: async with httpx.AsyncClient(timeout=90.0) as client: resp = await client.post( f"{CONSENT_TESTER_URL}/dsi-discovery", - json={"url": req.url, "max_documents": 1}, + json={"url": req.url, "max_documents": 10}, ) if resp.status_code != 200: return { @@ -86,10 +89,15 @@ async def extract_text(req: ExtractTextRequest): "error": "Kein Text extrahierbar", } - doc = docs[0] - text = doc.get("full_text", "") or doc.get("text_preview", "") or doc.get("text", "") - title = doc.get("title", "") or doc.get("doc_type", "") - word_count = doc.get("word_count", 0) or len(text.split()) + # Merge all documents (handles multi-page DSIs like BMW) + texts = [] + for doc in docs: + t = doc.get("full_text", "") or doc.get("text_preview", "") or "" + if t and len(t) > 50: + texts.append(t) + text = "\n\n".join(texts) if texts else "" + title = docs[0].get("title", "") or docs[0].get("doc_type", "") + word_count = len(text.split()) return { "text": text, @@ -371,20 +379,33 @@ def _update(check_id: str, msg: str): async def _fetch_text(url: str) -> str: - """Fetch text from URL via consent-tester.""" + """Fetch text from URL via consent-tester. + + Merges ALL documents found on the page (handles sites like BMW + that split DSI across multiple sub-pages/accordions). 
+ """ try: async with httpx.AsyncClient(timeout=90.0) as client: resp = await client.post( f"{CONSENT_TESTER_URL}/dsi-discovery", - json={"url": url, "max_documents": 1}, + json={"url": url, "max_documents": 10}, ) if resp.status_code != 200: return "" docs = resp.json().get("documents", []) if not docs: return "" - doc = docs[0] - return doc.get("full_text", "") or doc.get("text_preview", "") or "" + # Merge all documents found on the page + texts = [] + for doc in docs: + t = doc.get("full_text", "") or doc.get("text_preview", "") or "" + if t and len(t) > 50: + texts.append(t) + merged = "\n\n".join(texts) + if len(texts) > 1: + logger.info("Merged %d documents from %s (%d words)", + len(texts), url, len(merged.split())) + return merged except Exception as e: logger.warning("Text fetch failed for %s: %s", url, e) return "" diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py index 03d7228b..20d07ba4 100644 --- a/consent-tester/services/dsi_discovery.py +++ b/consent-tester/services/dsi_discovery.py @@ -532,19 +532,43 @@ async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]: return [] async def _expand_all_interactive(page: Page) -> None: - """Expand all accordions, tabs, details, dropdowns on the page.""" + """Expand all accordions, tabs, details, dropdowns on the page. + + IMPORTANT: Only expand CLOSED elements. Never click elements that + are already expanded (aria-expanded="true") — that would close them. + BMW, for example, has accordions open by default. + """ try: await page.evaluate("""() => { + // 1. Open all
<details> that are closed document.querySelectorAll('details:not([open])').forEach(d => d.open = true); - const sels = ['button[aria-expanded="false"]','[data-toggle="collapse"]', - '[data-bs-toggle="collapse"]','[class*="accordion"] > button', - '[class*="collapse"] > button','.panel-heading a']; - sels.forEach(s => document.querySelectorAll(s).forEach(e => { try{e.click()}catch{} })); - document.querySelectorAll('button,a').forEach(b => { - if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test((b.textContent||'').trim())) - try{b.click()}catch{} + + // 2. Click buttons that are explicitly CLOSED (aria-expanded="false") + document.querySelectorAll('button[aria-expanded="false"]').forEach(b => { + try { b.click(); } catch {} + }); + + // 3. Bootstrap/jQuery collapse triggers (only closed ones) + document.querySelectorAll('[data-toggle="collapse"].collapsed').forEach(e => { + try { e.click(); } catch {} + }); + document.querySelectorAll('[data-bs-toggle="collapse"].collapsed').forEach(e => { + try { e.click(); } catch {} + }); + + // 4. "Show more" / "Mehr anzeigen" buttons + document.querySelectorAll('button,a').forEach(b => { + const t = (b.textContent || '').trim(); + if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test(t)) + try { b.click(); } catch {} + }); + + // 5. Tabs — click each to make content visible, then go back + // (don't click, just make tab panels visible) + document.querySelectorAll('[role="tabpanel"][hidden]').forEach(p => { + p.removeAttribute('hidden'); + p.style.display = ''; }); - document.querySelectorAll('[role="tab"]').forEach(t => { try{t.click()}catch{} }); }""") except Exception: pass