feat: HTTP fallback for text extraction when Playwright times out

BMW Impressum/Cookie pages time out in Playwright (>180s) because the SPA has many sub-links to follow, even though the HTML source already contains the text (SSR). New fallback: direct HTTP GET + HTML tag stripping. Order: 1. Consent-tester (Playwright, 180s) → 2. HTTP GET (30s). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-15 23:16:10 +02:00
parent 77308b783f
commit 3dbf3aa34a
1 changed files with 41 additions and 23 deletions
@@ -380,37 +380,55 @@ def _update(check_id: str, msg: str):


 async def _fetch_text(url: str) -> str:
-    """Fetch text from URL via consent-tester.
+    """Fetch text from URL via consent-tester, with HTTP fallback.

-    Merges ALL documents found on the page (handles sites like BMW
-    that split DSI across multiple sub-pages/accordions).
+    1. Try consent-tester (Playwright) — handles JS-heavy SPAs
+    2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages
    """
+    # 1. Consent-tester (Playwright-based, full JS rendering)
    try:
-        async with httpx.AsyncClient(timeout=300.0) as client:
+        async with httpx.AsyncClient(timeout=180.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": url, "max_documents": 5},
-                timeout=300.0,
+                timeout=180.0,
            )
-            if resp.status_code != 200:
-                return ""
-            docs = resp.json().get("documents", [])
-            if not docs:
-                return ""
-            # Merge all documents found on the page
-            texts = []
-            for doc in docs:
-                t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
-                if t and len(t) > 50:
-                    texts.append(t)
-            merged = "\n\n".join(texts)
-            if len(texts) > 1:
-                logger.info("Merged %d documents from %s (%d words)",
-                            len(texts), url, len(merged.split()))
-            return merged
+            if resp.status_code == 200:
+                docs = resp.json().get("documents", [])
+                if docs:
+                    texts = []
+                    for doc in docs:
+                        t = doc.get("full_text", "") or doc.get("text_preview", "") or ""
+                        if t and len(t) > 50:
+                            texts.append(t)
+                    merged = "\n\n".join(texts)
+                    if merged and len(merged.split()) > 100:
+                        if len(texts) > 1:
+                            logger.info("Merged %d docs from %s (%d words)",
+                                        len(texts), url, len(merged.split()))
+                        return merged
    except Exception as e:
-        logger.warning("Text fetch failed for %s: %s", url, e)
-        return ""
+        logger.warning("Consent-tester fetch failed for %s: %s", url, e)
+
+    # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW)
+    try:
+        import re as _re
+        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
+            resp = await client.get(url)
+            if resp.status_code == 200 and "text/html" in resp.headers.get("content-type", ""):
+                html = resp.text
+                # Strip HTML tags, decode entities
+                text = _re.sub(r"<script[^>]*>.*?</script>", " ", html, flags=_re.DOTALL | _re.IGNORECASE)
+                text = _re.sub(r"<style[^>]*>.*?</style>", " ", text, flags=_re.DOTALL | _re.IGNORECASE)
+                text = _re.sub(r"<[^>]+>", " ", text)
+                text = _re.sub(r"\s+", " ", text).strip()
+                if len(text.split()) > 100:
+                    logger.info("HTTP fallback for %s: %d words", url, len(text.split()))
+                    return text
+    except Exception as e:
+        logger.warning("HTTP fallback failed for %s: %s", url, e)
+
+    return ""


 async def _check_single(