From b2c1f0ae84ce25c5735caf3e491ce7b02395d318 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 13 May 2026 10:12:50 +0200 Subject: [PATCH] fix(consent): add Sourcepoint iframe handler + banner_detector fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: Spiegel DSI text was truncated because Sourcepoint consent wall was not dismissed — dsi_helpers.py had no Sourcepoint handler. Fixes: 1. Add Sourcepoint iframe click (frame_locator + .sp_choice_type_11) 2. Add banner_detector fallback (reuses 30 CMP selectors from scanner) 3. After banner dismiss, wait and re-navigate if page redirected 4. Add "Zustimmen und weiter" to generic text button list Co-Authored-By: Claude Opus 4.6 (1M context) --- consent-tester/services/dsi_discovery.py | 12 ++++++++- consent-tester/services/dsi_helpers.py | 33 ++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py index e76b58d..e536f8f 100644 --- a/consent-tester/services/dsi_discovery.py +++ b/consent-tester/services/dsi_discovery.py @@ -249,7 +249,17 @@ async def discover_dsi_documents( # Step 1b: Try dismissing cookie consent banners before extraction. # Many German sites (dm.de, Zalando, etc.) block page content behind # a consent wall. Dismissing it reveals the actual DSI text. - await try_dismiss_consent_banner(page) + banner_dismissed = await try_dismiss_consent_banner(page) + if banner_dismissed: + # After consent, page may reload or reveal hidden content + await page.wait_for_timeout(2000) + # Re-navigate if the page redirected after consent + try: + if page.url != url: + await goto_resilient(page, url, timeout=30000) + await page.wait_for_timeout(2000) + except Exception: + pass # Step 1c: Self-extraction — if the URL itself is a DSI page, # extract its full text as the first document. This handles the diff --git a/consent-tester/services/dsi_helpers.py b/consent-tester/services/dsi_helpers.py index b5ad847..bd80958 100644 --- a/consent-tester/services/dsi_helpers.py +++ b/consent-tester/services/dsi_helpers.py @@ -81,14 +81,43 @@ async def try_dismiss_consent_banner(page: Page) -> bool: except Exception: continue - # 3) Generic text-based button search + # 3) Sourcepoint (iframe-based CMP, used by Spiegel, Zeit, etc.) + try: + sp_div = await page.query_selector("div[id^='sp_message']") + if sp_div: + # Sourcepoint renders in an iframe inside sp_message_container + sp_iframe = page.frame_locator("iframe[id^='sp_message']") + accept_btn = sp_iframe.locator(".sp_choice_type_11").first + if await accept_btn.count() > 0: + await accept_btn.click(timeout=5000) + logger.info("Dismissed Sourcepoint consent banner (iframe)") + await page.wait_for_timeout(3000) + return True + except Exception as e: + logger.debug("Sourcepoint dismiss attempt: %s", e) + + # 4) Use banner_detector CMP selectors as fallback + try: + from services.banner_detector import detect_banner, click_button + banner = await detect_banner(page) + if banner and banner.accept_selector: + clicked = await click_button(page, banner.accept_selector) + if clicked: + logger.info("Dismissed %s banner via banner_detector", banner.provider) + await page.wait_for_timeout(2000) + return True + except Exception as e: + logger.debug("Banner detector dismiss: %s", e) + + # 5) Generic text-based button search accept_texts = [ "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren", "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen", - "Einverstanden", "Ich stimme zu", + "Einverstanden", "Ich stimme zu", "Zustimmen und weiter", ] try: clicked = await page.evaluate("""(texts) => { + // Check main document for (const btn of document.querySelectorAll('button, a[role="button"]')) { const t = (btn.textContent || '').trim(); for (const target of texts) {