fix(consent): add Sourcepoint iframe handler + banner_detector fallback
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 18s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m1s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Successful in 57s
CI / test-python-backend (push) Successful in 41s
CI / test-python-document-crawler (push) Successful in 28s
CI / test-python-dsms-gateway (push) Successful in 25s
CI / validate-canonical-controls (push) Successful in 15s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 18s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 3m1s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Successful in 57s
CI / test-python-backend (push) Successful in 41s
CI / test-python-document-crawler (push) Successful in 28s
CI / test-python-dsms-gateway (push) Successful in 25s
CI / validate-canonical-controls (push) Successful in 15s
Root cause: the Spiegel DSI text was truncated because the Sourcepoint consent wall was not dismissed — dsi_helpers.py had no Sourcepoint handler.

Fixes:
1. Add a Sourcepoint iframe click (frame_locator + .sp_choice_type_11).
2. Add a banner_detector fallback (reuses the 30 CMP selectors from the scanner).
3. After the banner is dismissed, wait and re-navigate if the page redirected.
4. Add "Zustimmen und weiter" to the generic text button list.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -249,7 +249,17 @@ async def discover_dsi_documents(
|
|||||||
# Step 1b: Try dismissing cookie consent banners before extraction.
|
# Step 1b: Try dismissing cookie consent banners before extraction.
|
||||||
# Many German sites (dm.de, Zalando, etc.) block page content behind
|
# Many German sites (dm.de, Zalando, etc.) block page content behind
|
||||||
# a consent wall. Dismissing it reveals the actual DSI text.
|
# a consent wall. Dismissing it reveals the actual DSI text.
|
||||||
await try_dismiss_consent_banner(page)
|
banner_dismissed = await try_dismiss_consent_banner(page)
|
||||||
|
if banner_dismissed:
|
||||||
|
# After consent, page may reload or reveal hidden content
|
||||||
|
await page.wait_for_timeout(2000)
|
||||||
|
# Re-navigate if the page redirected after consent
|
||||||
|
try:
|
||||||
|
if page.url != url:
|
||||||
|
await goto_resilient(page, url, timeout=30000)
|
||||||
|
await page.wait_for_timeout(2000)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Step 1c: Self-extraction — if the URL itself is a DSI page,
|
# Step 1c: Self-extraction — if the URL itself is a DSI page,
|
||||||
# extract its full text as the first document. This handles the
|
# extract its full text as the first document. This handles the
|
||||||
|
|||||||
@@ -81,14 +81,43 @@ async def try_dismiss_consent_banner(page: Page) -> bool:
|
|||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 3) Generic text-based button search
|
# 3) Sourcepoint (iframe-based CMP, used by Spiegel, Zeit, etc.)
|
||||||
|
try:
|
||||||
|
sp_div = await page.query_selector("div[id^='sp_message']")
|
||||||
|
if sp_div:
|
||||||
|
# Sourcepoint renders in an iframe inside sp_message_container
|
||||||
|
sp_iframe = page.frame_locator("iframe[id^='sp_message']")
|
||||||
|
accept_btn = sp_iframe.locator(".sp_choice_type_11").first
|
||||||
|
if await accept_btn.count() > 0:
|
||||||
|
await accept_btn.click(timeout=5000)
|
||||||
|
logger.info("Dismissed Sourcepoint consent banner (iframe)")
|
||||||
|
await page.wait_for_timeout(3000)
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("Sourcepoint dismiss attempt: %s", e)
|
||||||
|
|
||||||
|
# 4) Use banner_detector CMP selectors as fallback
|
||||||
|
try:
|
||||||
|
from services.banner_detector import detect_banner, click_button
|
||||||
|
banner = await detect_banner(page)
|
||||||
|
if banner and banner.accept_selector:
|
||||||
|
clicked = await click_button(page, banner.accept_selector)
|
||||||
|
if clicked:
|
||||||
|
logger.info("Dismissed %s banner via banner_detector", banner.provider)
|
||||||
|
await page.wait_for_timeout(2000)
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("Banner detector dismiss: %s", e)
|
||||||
|
|
||||||
|
# 5) Generic text-based button search
|
||||||
accept_texts = [
|
accept_texts = [
|
||||||
"Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren",
|
"Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren",
|
||||||
"Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen",
|
"Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen",
|
||||||
"Einverstanden", "Ich stimme zu",
|
"Einverstanden", "Ich stimme zu", "Zustimmen und weiter",
|
||||||
]
|
]
|
||||||
try:
|
try:
|
||||||
clicked = await page.evaluate("""(texts) => {
|
clicked = await page.evaluate("""(texts) => {
|
||||||
|
// Check main document
|
||||||
for (const btn of document.querySelectorAll('button, a[role="button"]')) {
|
for (const btn of document.querySelectorAll('button, a[role="button"]')) {
|
||||||
const t = (btn.textContent || '').trim();
|
const t = (btn.textContent || '').trim();
|
||||||
for (const target of texts) {
|
for (const target of texts) {
|
||||||
|
|||||||
Reference in New Issue
Block a user