feat: HTTP fallback for text extraction when Playwright times out
BMW Impressum/Cookie pages time out in Playwright (>180s) because the SPA has many sub-links to follow. However, the HTML source already contains the text, because the pages are server-side rendered (SSR). New fallback: direct HTTP GET + HTML tag stripping. Order: 1. Consent-tester (Playwright, 180s) → 2. HTTP GET (30s) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -380,36 +380,54 @@ def _update(check_id: str, msg: str):
|
|||||||
|
|
||||||
|
|
||||||
def _html_to_text(markup: str) -> str:
    """Reduce an HTML document to whitespace-normalised plain text."""
    # Function-scope imports keep this block self-contained; the file's
    # top-level import section is not visible from here.
    import html as _html
    import re as _re

    flags = _re.DOTALL | _re.IGNORECASE
    # Remove <script>/<style> elements together with their content, which
    # is never user-visible text.
    text = _re.sub(r"<script[^>]*>.*?</script>", " ", markup, flags=flags)
    text = _re.sub(r"<style[^>]*>.*?</style>", " ", text, flags=flags)
    # Comments may contain '>' and would otherwise be stripped only partly
    # by the generic tag pattern below.
    text = _re.sub(r"<!--.*?-->", " ", text, flags=_re.DOTALL)
    text = _re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after tag removal so that an escaped "&lt;b&gt;"
    # stays literal text instead of being treated as markup.
    text = _html.unescape(text)
    # unescape() yields U+00A0 for &nbsp;, which \s matches on str patterns,
    # so whitespace is collapsed last.
    return _re.sub(r"\s+", " ", text).strip()


async def _fetch_text(url: str) -> str:
    """Fetch text from URL via consent-tester, with HTTP fallback.

    1. Try consent-tester (Playwright) - handles JS-heavy SPAs. All
       documents the service reports for the page are merged, so content
       split across several sub-pages/accordions ends up in one text.
    2. Fallback: direct HTTP fetch + HTML strip - fast, works for
       server-side-rendered pages.

    Args:
        url: Address of the page whose text should be extracted.

    Returns:
        The extracted text, or "" when neither strategy yields more than
        100 words. Failures of either strategy are logged as warnings and
        never raised to the caller.
    """
    # 1. Consent-tester (Playwright-based, full JS rendering)
    try:
        async with httpx.AsyncClient(timeout=180.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": url, "max_documents": 5},
            )
        if resp.status_code == 200:
            # "documents" may be missing or null; treat both as "nothing found".
            docs = resp.json().get("documents") or []
            candidates = (
                doc.get("full_text", "") or doc.get("text_preview", "") or ""
                for doc in docs
            )
            # Skip fragments of 50 characters or fewer.
            texts = [t for t in candidates if len(t) > 50]
            merged = "\n\n".join(texts)
            word_count = len(merged.split())
            if word_count > 100:
                if len(texts) > 1:
                    logger.info("Merged %d docs from %s (%d words)",
                                len(texts), url, word_count)
                return merged
    except Exception as e:
        # Best-effort: any failure here just means the fallback is tried.
        logger.warning("Consent-tester fetch failed for %s: %s", url, e)

    # 2. Fallback: direct HTTP fetch (works for SSR pages like BMW)
    try:
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            resp = await client.get(url)
        # Media types are case-insensitive, so compare in lower case.
        content_type = resp.headers.get("content-type", "").lower()
        if resp.status_code == 200 and "text/html" in content_type:
            text = _html_to_text(resp.text)
            word_count = len(text.split())
            if word_count > 100:
                logger.info("HTTP fallback for %s: %d words", url, word_count)
                return text
    except Exception as e:
        logger.warning("HTTP fallback failed for %s: %s", url, e)

    return ""
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user