diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 27b2cedd..d2f6a019 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -409,16 +409,17 @@ async def _fetch_text(url: str) -> str: 2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages """ # 1. Consent-tester (Playwright-based, full JS rendering). - # Timeout 180s: a single dsi-discovery does self-extraction + follows up - # to 3 sub-links + waits for CMP JSON payloads. 60s was tight enough that - # cookie-policy pages on big SPAs (BMW, Daimler) timed out and fell back - # to the raw HTTP fetch, which returned site navigation as garbage text. + # max_documents=1: for a *specific* user-entered URL (cookie, impressum, + # privacy) we only want the self-extracted text of THAT page. Following + # sub-links was triggering 4x networkidle timeouts (~240s) and made the + # backend httpx call time out, dropping us to the raw HTTP fallback + # which returned site navigation as garbage text. try: - async with httpx.AsyncClient(timeout=180.0) as client: + async with httpx.AsyncClient(timeout=120.0) as client: resp = await client.post( f"{CONSENT_TESTER_URL}/dsi-discovery", - json={"url": url, "max_documents": 3}, - timeout=180.0, + json={"url": url, "max_documents": 1}, + timeout=120.0, ) if resp.status_code == 200: docs = resp.json().get("documents", []) diff --git a/consent-tester/services/dsi_helpers.py b/consent-tester/services/dsi_helpers.py index 9065db95..8e34e118 100644 --- a/consent-tester/services/dsi_helpers.py +++ b/consent-tester/services/dsi_helpers.py @@ -14,14 +14,16 @@ logger = logging.getLogger(__name__) async def goto_resilient(page: Page, url: str, timeout: int = 60000) -> None: """Navigate to URL with fallback: try networkidle first, then domcontentloaded. - SPAs like Zalando never reach networkidle because of continuous background - requests. Falling back to domcontentloaded + a short wait gives JS time to - render the main content without waiting for every network request to finish. + SPAs like Zalando, BMW, Daimler never reach networkidle because of continuous + background requests (analytics, lazy-loaded assets, polling). The 60s wait + for networkidle is essentially always a 60s waste on those. We try briefly + (15s) and fall through to domcontentloaded + a 5s render-wait. """ + networkidle_timeout = min(timeout, 15000) try: - await page.goto(url, wait_until="networkidle", timeout=timeout) + await page.goto(url, wait_until="networkidle", timeout=networkidle_timeout) except PlaywrightTimeout: - logger.info("networkidle timeout for %s, falling back to domcontentloaded", url) + logger.debug("networkidle timeout for %s, falling back to domcontentloaded", url) await page.goto(url, wait_until="domcontentloaded", timeout=timeout) await page.wait_for_timeout(5000) # extra wait for JS rendering