""" DSI Discovery Helpers — resilient navigation, consent dismissal, PDF redirect detection. Extracted from dsi_discovery.py to keep modules under 500 LOC. """ import logging from playwright.async_api import Page, TimeoutError as PlaywrightTimeout logger = logging.getLogger(__name__) async def goto_resilient(page: Page, url: str, timeout: int = 60000) -> None: """Navigate to URL with fallback: try networkidle first, then domcontentloaded. SPAs like Zalando never reach networkidle because of continuous background requests. Falling back to domcontentloaded + a short wait gives JS time to render the main content without waiting for every network request to finish. """ try: await page.goto(url, wait_until="networkidle", timeout=timeout) except PlaywrightTimeout: logger.info("networkidle timeout for %s, falling back to domcontentloaded", url) await page.goto(url, wait_until="domcontentloaded", timeout=timeout) await page.wait_for_timeout(5000) # extra wait for JS rendering async def try_dismiss_consent_banner(page: Page) -> bool: """Try to dismiss cookie consent banners that block page content. Handles shadow DOM (Usercentrics), iframes (Sourcepoint), and regular DOM banners (OneTrust, Cookiebot, Didomi, etc.). Returns True if a banner was dismissed. """ # 1) Usercentrics shadow DOM — most common for German sites try: uc_root = await page.query_selector("#usercentrics-root") if uc_root: clicked = await page.evaluate("""() => { const root = document.querySelector('#usercentrics-root'); if (!root || !root.shadowRoot) return false; const buttons = root.shadowRoot.querySelectorAll('button'); for (const btn of buttons) { const t = btn.textContent.trim().toLowerCase(); if (t.includes('akzeptieren') || t.includes('accept') || t.includes('zustimmen') || t.includes('agree')) { btn.click(); return true; } } return false; }""") if clicked: logger.info("Dismissed Usercentrics consent banner (shadow DOM)") await page.wait_for_timeout(2000) return True except Exception: pass # 2) Standard DOM banners — OneTrust, Cookiebot, Didomi, Borlabs, etc. accept_selectors = [ "#onetrust-accept-btn-handler", "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll", "#didomi-notice-agree-button", "#BorlabsCookieBox .cookie-accept, [data-cookie-accept]", ".cmpboxbtn.cmpboxbtnyes", ".klaro .cm-btn-accept", ".cky-btn-accept", "[class*='qc-cmp2-summary-buttons'] button:first-child", "#tarteaucitronPersonalize2", ] for sel in accept_selectors: try: btn = page.locator(sel).first if await btn.count() > 0 and await btn.is_visible(): await btn.click(timeout=3000) logger.info("Dismissed consent banner via %s", sel) await page.wait_for_timeout(2000) return True except Exception: continue # 3) Generic text-based button search accept_texts = [ "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren", "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen", "Einverstanden", "Ich stimme zu", ] try: clicked = await page.evaluate("""(texts) => { for (const btn of document.querySelectorAll('button, a[role="button"]')) { const t = (btn.textContent || '').trim(); for (const target of texts) { if (t === target) { btn.click(); return true; } } } return false; }""", accept_texts) if clicked: logger.info("Dismissed consent banner via generic text match") await page.wait_for_timeout(2000) return True except Exception: pass return False def is_pdf_redirect(original_url: str, final_url: str) -> bool: """Check if the page redirected to a PDF or external storage.""" final_lower = final_url.lower() return ( final_lower.endswith(".pdf") or "storage.googleapis.com" in final_lower or "blob.core.windows.net" in final_lower or "s3.amazonaws.com" in final_lower )