9814b56f2f
Root cause of the recurring 603-word BMW result: - DSI discovery for cookie-policy URL was hitting 4x networkidle timeouts (60s each = ~240s total). - Backend httpx timeout (180s after the previous fix) gave up before the consent-tester finished, falling through to the raw HTTP fetch which returned BMW's SSR navigation chrome (603 words) as the 'cookie policy'. Two orthogonal fixes: 1. _fetch_text now passes max_documents=1 for user-specified URLs. We only want self-extraction of THAT page; link-following is unnecessary noise. 2. networkidle wait_until window dropped 60s -> 15s. SPAs like BMW/Daimler never reach networkidle anyway; the 60s wait was pure latency. Falls through to domcontentloaded+5s render-wait, same as before.
162 lines
6.5 KiB
Python
162 lines
6.5 KiB
Python
"""
DSI Discovery Helpers — resilient navigation, consent dismissal, PDF redirect detection.

Extracted from dsi_discovery.py to keep modules under 500 LOC.
"""
|
|
|
|
import logging
|
|
|
|
from playwright.async_api import Page, TimeoutError as PlaywrightTimeout
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def goto_resilient(page: Page, url: str, timeout: int = 60000) -> None:
    """Navigate to ``url``: try networkidle briefly, then fall back to domcontentloaded.

    SPAs like Zalando, BMW, Daimler never reach networkidle because of continuous
    background requests (analytics, lazy-loaded assets, polling), so a long
    networkidle wait is pure latency on those. The networkidle attempt is
    therefore capped at 15s; when it times out we navigate again with
    ``wait_until="domcontentloaded"`` and then wait a fixed 5s for JS rendering.

    Args:
        page: Playwright page to navigate.
        url: Target URL.
        timeout: Timeout in milliseconds handed to the domcontentloaded
            fallback, and upper bound for the networkidle attempt. Playwright
            treats ``0`` as "no timeout"; the networkidle attempt is still
            capped at 15s in that case so it cannot block indefinitely.

    Raises:
        PlaywrightTimeout: If the domcontentloaded fallback times out as well.
        Any non-timeout error raised by ``page.goto`` propagates unchanged.
    """
    networkidle_cap_ms = 15000
    if timeout > 0:
        networkidle_timeout = min(timeout, networkidle_cap_ms)
    else:
        # min(0, cap) would be 0, i.e. "wait forever" for a state that SPAs
        # never reach. Always bound the optimistic attempt.
        networkidle_timeout = networkidle_cap_ms

    try:
        await page.goto(url, wait_until="networkidle", timeout=networkidle_timeout)
    except PlaywrightTimeout:
        logger.debug("networkidle timeout for %s, falling back to domcontentloaded", url)
        await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
        await page.wait_for_timeout(5000)  # extra wait for JS rendering
|
|
|
|
|
|
async def try_dismiss_consent_banner(page: Page) -> bool:
    """Try to dismiss cookie consent banners that block page content.

    Handles shadow DOM (Usercentrics), iframes (Sourcepoint), and regular
    DOM banners (OneTrust, Cookiebot, Didomi, etc.). The strategies are tried
    in a fixed order and the first one that clicks something wins:

    1. Usercentrics shadow DOM
    2. Known CMP accept-button selectors in the main document
    3. Sourcepoint / generic accept buttons inside iframes
    4. ``services.banner_detector`` selectors
    5. Exact-text match on buttons in the main document

    This is best-effort: errors inside a strategy are logged at debug level
    and the next strategy is tried.

    Args:
        page: Playwright page that may be showing a consent banner.

    Returns:
        True if a banner was dismissed, False otherwise.
    """
    strategies = (
        _dismiss_usercentrics,
        _dismiss_known_cmp_selectors,
        _dismiss_iframe_cmp,
        _dismiss_via_banner_detector,
        _dismiss_by_button_text,
    )
    for strategy in strategies:
        if await strategy(page):
            return True
    return False


async def _dismiss_usercentrics(page: Page) -> bool:
    """Click an accept-like button inside the Usercentrics shadow root, if present."""
    # Usercentrics shadow DOM — most common for German sites
    try:
        uc_root = await page.query_selector("#usercentrics-root")
        if not uc_root:
            return False
        clicked = await page.evaluate("""() => {
            const root = document.querySelector('#usercentrics-root');
            if (!root || !root.shadowRoot) return false;
            const buttons = root.shadowRoot.querySelectorAll('button');
            for (const btn of buttons) {
                const t = btn.textContent.trim().toLowerCase();
                if (t.includes('akzeptieren') || t.includes('accept')
                    || t.includes('zustimmen') || t.includes('agree')) {
                    btn.click();
                    return true;
                }
            }
            return false;
        }""")
        if clicked:
            logger.info("Dismissed Usercentrics consent banner (shadow DOM)")
            await page.wait_for_timeout(2000)
            return True
    except Exception as e:
        logger.debug("Usercentrics consent dismiss: %s", e)
    return False


async def _dismiss_known_cmp_selectors(page: Page) -> bool:
    """Click the first visible accept button matching a known CMP selector."""
    # Standard DOM banners — OneTrust, Cookiebot, Didomi, Borlabs, etc.
    accept_selectors = [
        "#onetrust-accept-btn-handler",
        "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
        "#didomi-notice-agree-button",
        "#BorlabsCookieBox .cookie-accept, [data-cookie-accept]",
        ".cmpboxbtn.cmpboxbtnyes",
        ".klaro .cm-btn-accept",
        ".cky-btn-accept",
        "[class*='qc-cmp2-summary-buttons'] button:first-child",
        "#tarteaucitronPersonalize2",
    ]
    for sel in accept_selectors:
        try:
            btn = page.locator(sel).first
            if await btn.count() > 0 and await btn.is_visible():
                await btn.click(timeout=3000)
                logger.info("Dismissed consent banner via %s", sel)
                await page.wait_for_timeout(2000)
                return True
        except Exception as e:
            # A failing selector must not stop us from trying the next one.
            logger.debug("Consent selector %s failed: %s", sel, e)
    return False


async def _dismiss_iframe_cmp(page: Page) -> bool:
    """Click a Sourcepoint or generic accept button inside any child iframe."""
    # Sourcepoint / iframe-based CMPs (Spiegel, Zeit, etc.)
    # Search ALL iframes for consent buttons — Sourcepoint generates dynamic IDs
    try:
        for frame in page.frames:
            if frame == page.main_frame:
                continue
            try:
                # Sourcepoint accept button
                sp_btn = frame.locator(".sp_choice_type_11").first
                if await sp_btn.count() > 0 and await sp_btn.is_visible():
                    await sp_btn.click(timeout=5000)
                    logger.info("Dismissed Sourcepoint consent (iframe: %s)", frame.url[:80])
                    await page.wait_for_timeout(3000)
                    return True
                # Generic accept text in iframe
                for text in ["Akzeptieren", "Zustimmen", "Accept all", "Alle akzeptieren"]:
                    btn = frame.locator(f'button:has-text("{text}")').first
                    if await btn.count() > 0 and await btn.is_visible():
                        await btn.click(timeout=3000)
                        logger.info("Dismissed iframe consent via '%s'", text)
                        await page.wait_for_timeout(3000)
                        return True
            except Exception as e:
                # One broken or detached frame must not abort the scan.
                logger.debug("Iframe consent dismiss (frame skipped): %s", e)
                continue
    except Exception as e:
        logger.debug("Iframe consent dismiss: %s", e)
    return False


async def _dismiss_via_banner_detector(page: Page) -> bool:
    """Use services.banner_detector to locate and click an accept button."""
    try:
        # Imported lazily, as in the original code; an import failure is
        # handled like any other failure of this strategy.
        from services.banner_detector import detect_banner, click_button

        banner = await detect_banner(page)
        if banner and banner.accept_selector:
            clicked = await click_button(page, banner.accept_selector)
            if clicked:
                logger.info("Dismissed %s banner via banner_detector", banner.provider)
                await page.wait_for_timeout(2000)
                return True
    except Exception as e:
        logger.debug("Banner detector dismiss: %s", e)
    return False


async def _dismiss_by_button_text(page: Page) -> bool:
    """Click a main-document button whose trimmed text exactly equals an accept phrase."""
    accept_texts = [
        "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren",
        "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen",
        "Einverstanden", "Ich stimme zu", "Zustimmen und weiter",
    ]
    try:
        clicked = await page.evaluate("""(texts) => {
            // Check main document
            for (const btn of document.querySelectorAll('button, a[role="button"]')) {
                const t = (btn.textContent || '').trim();
                for (const target of texts) {
                    if (t === target) { btn.click(); return true; }
                }
            }
            return false;
        }""", accept_texts)
        if clicked:
            logger.info("Dismissed consent banner via generic text match")
            await page.wait_for_timeout(2000)
            return True
    except Exception as e:
        logger.debug("Generic text consent dismiss: %s", e)
    return False
|
|
|
|
|
|
def is_pdf_redirect(original_url: str, final_url: str) -> bool:
    """Check if the page redirected to a PDF or external storage.

    Args:
        original_url: URL that was requested. Not used by the check; kept so
            the signature stays unchanged for existing callers.
        final_url: URL the browser ended up on after redirects.

    Returns:
        True if ``final_url`` points at a ``.pdf`` resource (query string and
        fragment are ignored, the match is case-insensitive) or contains one
        of the known object-storage hosts (Google Cloud Storage, Azure Blob
        Storage, Amazon S3). False otherwise, including for an empty URL.
    """
    if not final_url:
        return False

    final_lower = final_url.lower()
    # Strip fragment first, then query, so "/doc.pdf?v=2" and "/doc.pdf#page=3"
    # are recognised as PDFs too.
    without_query = final_lower.partition("#")[0].partition("?")[0]
    storage_hosts = (
        "storage.googleapis.com",
        "blob.core.windows.net",
        "s3.amazonaws.com",
    )
    return (
        # Kept so every URL that matched before still matches.
        final_lower.endswith(".pdf")
        or without_query.endswith(".pdf")
        or any(host in final_lower for host in storage_hosts)
    )
|