From 6689b37f955bc17f27c7161d1d5f7aa55b8bb81c Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Sat, 16 May 2026 22:00:42 +0200
Subject: [PATCH] fix(agent): bump _fetch_text timeout 60s->180s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dsi-discovery in consent-tester does self-extraction + follows up to
3 sub-links + waits for CMP JSON payloads. On big SPAs (BMW, Daimler)
this routinely exceeds 60s. When it timed out, the HTTP fallback
returned the SSR shell as text — for the BMW cookie page that's 603
words of site navigation, which then registered as
'Cookie-Richtlinie nicht im eingereichten Text' (33%). With 180s the
consent-tester finishes cleanly and we get the CMP-captured 1824 words
of real policy.
---
 .../compliance/api/agent_compliance_check_routes.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py
index 7ed3edf2..27b2cedd 100644
--- a/backend-compliance/compliance/api/agent_compliance_check_routes.py
+++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py
@@ -408,13 +408,17 @@ async def _fetch_text(url: str) -> str:
     1. Try consent-tester (Playwright) — handles JS-heavy SPAs
     2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages
     """
-    # 1. Consent-tester (Playwright-based, full JS rendering)
+    # 1. Consent-tester (Playwright-based, full JS rendering).
+    # Timeout 180s: a single dsi-discovery does self-extraction + follows up
+    # to 3 sub-links + waits for CMP JSON payloads. 60s was tight enough that
+    # cookie-policy pages on big SPAs (BMW, Daimler) timed out and fell back
+    # to the raw HTTP fetch, which returned site navigation as garbage text.
     try:
-        async with httpx.AsyncClient(timeout=60.0) as client:
+        async with httpx.AsyncClient(timeout=180.0) as client:
             resp = await client.post(
                 f"{CONSENT_TESTER_URL}/dsi-discovery",
                 json={"url": url, "max_documents": 3},
-                timeout=60.0,
+                timeout=180.0,
             )
             if resp.status_code == 200:
                 docs = resp.json().get("documents", [])