def _html_to_text(markup: str) -> str:
    """Reduce an HTML document to whitespace-normalized plain text.

    Args:
        markup: Raw HTML source.

    Returns:
        The visible text with script/style bodies, comments and tags removed,
        character references decoded, and whitespace runs collapsed to one
        space. Empty string if nothing remains.
    """
    # Imported locally so this block does not depend on the file's import list.
    import html as _html
    import re as _re

    flags = _re.DOTALL | _re.IGNORECASE
    # Remove <script>/<style> elements together with their bodies; otherwise
    # JS/CSS source would be counted as page text.
    text = _re.sub(r"<script[^>]*>.*?</script\s*>", " ", markup, flags=flags)
    text = _re.sub(r"<style[^>]*>.*?</style\s*>", " ", text, flags=flags)
    # Comments may contain '>' and would survive the generic tag pattern.
    text = _re.sub(r"<!--.*?-->", " ", text, flags=_re.DOTALL)
    text = _re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after tags are gone, so an escaped "&lt;b&gt;" in
    # the page text is not mistaken for markup.
    text = _html.unescape(text)
    return _re.sub(r"\s+", " ", text).strip()


async def _fetch_text(url: str) -> str:
    """Fetch text from URL via consent-tester, with HTTP fallback.

    1. Try consent-tester (Playwright) — handles JS-heavy SPAs
    2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages

    Args:
        url: Address of the page whose text should be extracted.

    Returns:
        The extracted text, or "" when neither strategy yields more than
        100 words. Errors from either strategy are logged as warnings and
        never propagate to the caller.
    """
    min_words = 100

    # 1. Consent-tester (Playwright-based, full JS rendering)
    try:
        async with httpx.AsyncClient(timeout=180.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": url, "max_documents": 5},
                timeout=180.0,
            )
            if resp.status_code == 200:
                docs = resp.json().get("documents", [])
                # Merge every document found on the page; very short
                # fragments (<= 50 chars) are ignored.
                texts = []
                for doc in docs:
                    t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
                    if t and len(t) > 50:
                        texts.append(t)
                merged = "\n\n".join(texts)
                word_count = len(merged.split())
                # Too little text falls through to the HTTP fallback.
                if word_count > min_words:
                    if len(texts) > 1:
                        logger.info("Merged %d docs from %s (%d words)",
                                    len(texts), url, word_count)
                    return merged
    except Exception as e:
        # Best effort: any failure here just triggers the fallback below.
        logger.warning("Consent-tester fetch failed for %s: %s", url, e)

    # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW)
    try:
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            resp = await client.get(url)
            content_type = resp.headers.get("content-type", "").lower()
            if resp.status_code == 200 and "text/html" in content_type:
                text = _html_to_text(resp.text)
                word_count = len(text.split())
                if word_count > min_words:
                    logger.info("HTTP fallback for %s: %d words", url, word_count)
                    return text
    except Exception as e:
        logger.warning("HTTP fallback failed for %s: %s", url, e)

    return ""