diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py
index b883cd03..21634bad 100644
--- a/backend-compliance/compliance/api/agent_compliance_check_routes.py
+++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py
@@ -380,37 +380,55 @@ def _update(check_id: str, msg: str):
async def _fetch_text(url: str) -> str:
- """Fetch text from URL via consent-tester.
+ """Fetch text from URL via consent-tester, with HTTP fallback.
- Merges ALL documents found on the page (handles sites like BMW
- that split DSI across multiple sub-pages/accordions).
+ 1. Try consent-tester (Playwright) — handles JS-heavy SPAs
+ 2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages
"""
+ # 1. Consent-tester (Playwright-based, full JS rendering)
try:
- async with httpx.AsyncClient(timeout=300.0) as client:
+ async with httpx.AsyncClient(timeout=180.0) as client:
resp = await client.post(
f"{CONSENT_TESTER_URL}/dsi-discovery",
json={"url": url, "max_documents": 5},
- timeout=300.0,
+ timeout=180.0,
)
- if resp.status_code != 200:
- return ""
- docs = resp.json().get("documents", [])
- if not docs:
- return ""
- # Merge all documents found on the page
- texts = []
- for doc in docs:
- t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
- if t and len(t) > 50:
- texts.append(t)
- merged = "\n\n".join(texts)
- if len(texts) > 1:
- logger.info("Merged %d documents from %s (%d words)",
- len(texts), url, len(merged.split()))
- return merged
+ if resp.status_code == 200:
+ docs = resp.json().get("documents", [])
+ if docs:
+ texts = []
+ for doc in docs:
+ t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
+ if t and len(t) > 50:
+ texts.append(t)
+ merged = "\n\n".join(texts)
+ if merged and len(merged.split()) > 100:
+ if len(texts) > 1:
+ logger.info("Merged %d docs from %s (%d words)",
+ len(texts), url, len(merged.split()))
+ return merged
except Exception as e:
- logger.warning("Text fetch failed for %s: %s", url, e)
- return ""
+ logger.warning("Consent-tester fetch failed for %s: %s", url, e)
+
+ # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW)
+ try:
+ import re as _re
+ async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
+ resp = await client.get(url)
+ if resp.status_code == 200 and "text/html" in resp.headers.get("content-type", ""):
+ html = resp.text
+ # Strip HTML tags and collapse whitespace (HTML entities are left undecoded)
+ text = _re.sub(r"<script[^>]*>.*?</script>", " ", html, flags=_re.DOTALL | _re.IGNORECASE)
+ text = _re.sub(r"<style[^>]*>.*?</style>", " ", text, flags=_re.DOTALL | _re.IGNORECASE)
+ text = _re.sub(r"<[^>]+>", " ", text)
+ text = _re.sub(r"\s+", " ", text).strip()
+ if len(text.split()) > 100:
+ logger.info("HTTP fallback for %s: %d words", url, len(text.split()))
+ return text
+ except Exception as e:
+ logger.warning("HTTP fallback failed for %s: %s", url, e)
+
+ return ""
async def _check_single(