diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py index e76b58d..e536f8f 100644 --- a/consent-tester/services/dsi_discovery.py +++ b/consent-tester/services/dsi_discovery.py @@ -249,7 +249,17 @@ async def discover_dsi_documents( # Step 1b: Try dismissing cookie consent banners before extraction. # Many German sites (dm.de, Zalando, etc.) block page content behind # a consent wall. Dismissing it reveals the actual DSI text. - await try_dismiss_consent_banner(page) + banner_dismissed = await try_dismiss_consent_banner(page) + if banner_dismissed: + # After consent, page may reload or reveal hidden content + await page.wait_for_timeout(2000) + # Re-navigate if the page redirected after consent + try: + if page.url != url: + await goto_resilient(page, url, timeout=30000) + await page.wait_for_timeout(2000) + except Exception: + pass # Step 1c: Self-extraction — if the URL itself is a DSI page, # extract its full text as the first document. This handles the diff --git a/consent-tester/services/dsi_helpers.py b/consent-tester/services/dsi_helpers.py index b5ad847..bd80958 100644 --- a/consent-tester/services/dsi_helpers.py +++ b/consent-tester/services/dsi_helpers.py @@ -81,14 +81,43 @@ async def try_dismiss_consent_banner(page: Page) -> bool: except Exception: continue - # 3) Generic text-based button search + # 3) Sourcepoint (iframe-based CMP, used by Spiegel, Zeit, etc.) + try: + sp_div = await page.query_selector("div[id^='sp_message']") + if sp_div: + # Sourcepoint renders in an iframe inside sp_message_container + sp_iframe = page.frame_locator("iframe[id^='sp_message']") + accept_btn = sp_iframe.locator(".sp_choice_type_11").first + if await accept_btn.count() > 0: + await accept_btn.click(timeout=5000) + logger.info("Dismissed Sourcepoint consent banner (iframe)") + await page.wait_for_timeout(3000) + return True + except Exception as e: + logger.debug("Sourcepoint dismiss attempt: %s", e) + + # 4) Use banner_detector CMP selectors as fallback + try: + from services.banner_detector import detect_banner, click_button + banner = await detect_banner(page) + if banner and banner.accept_selector: + clicked = await click_button(page, banner.accept_selector) + if clicked: + logger.info("Dismissed %s banner via banner_detector", banner.provider) + await page.wait_for_timeout(2000) + return True + except Exception as e: + logger.debug("Banner detector dismiss: %s", e) + + # 5) Generic text-based button search accept_texts = [ "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren", "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen", - "Einverstanden", "Ich stimme zu", + "Einverstanden", "Ich stimme zu", "Zustimmen und weiter", ] try: clicked = await page.evaluate("""(texts) => { + // Check main document for (const btn of document.querySelectorAll('button, a[role="button"]')) { const t = (btn.textContent || '').trim(); for (const target of texts) {