feat: 4 remaining tasks — EU institutions, banner integration, JS-sites, Caritas fixes
Build + Deploy / build-ai-sdk (push) Failing after 36s
Build + Deploy / build-developer-portal (push) Successful in 8s
Build + Deploy / build-tts (push) Successful in 7s
Build + Deploy / build-document-crawler (push) Successful in 7s
Build + Deploy / build-admin-compliance (push) Successful in 8s
Build + Deploy / build-backend-compliance (push) Successful in 8s
CI / nodejs-build (push) Successful in 3m14s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 46s
CI / test-python-backend (push) Successful in 43s
CI / test-python-document-crawler (push) Successful in 29s
CI / test-python-dsms-gateway (push) Successful in 30s
CI / validate-canonical-controls (push) Successful in 16s
Build + Deploy / build-dsms-gateway (push) Successful in 8s
Build + Deploy / build-dsms-node (push) Successful in 8s
CI / branch-name (push) Has been skipped
Build + Deploy / trigger-orca (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 17s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
Build + Deploy / build-ai-sdk (push) Failing after 36s
Build + Deploy / build-developer-portal (push) Successful in 8s
Build + Deploy / build-tts (push) Successful in 7s
Build + Deploy / build-document-crawler (push) Successful in 7s
Build + Deploy / build-admin-compliance (push) Successful in 8s
Build + Deploy / build-backend-compliance (push) Successful in 8s
CI / nodejs-build (push) Successful in 3m14s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 46s
CI / test-python-backend (push) Successful in 43s
CI / test-python-document-crawler (push) Successful in 29s
CI / test-python-dsms-gateway (push) Successful in 30s
CI / validate-canonical-controls (push) Successful in 16s
Build + Deploy / build-dsms-gateway (push) Successful in 8s
Build + Deploy / build-dsms-node (push) Successful in 8s
CI / branch-name (push) Has been skipped
Build + Deploy / trigger-orca (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 17s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
1. EU Institution Checks (Verordnung 2018/1725):
   - New doc_type "eu_institution" with 9 L1 + 15 L2 checks
   - Both German + English patterns (EU institutions are multilingual)
   - Auto-detection via "2018/1725", "EDSB", "EDPS" keywords
   - Correct article references (Art. 15 instead of 13, Art. 5 instead of 6)

2. Banner Check Integration:
   - banner_runner.py maps scan results to 36 L1/L2 structured checks
   - BannerCheckTab shows hierarchical ChecklistView with hints
   - 3-phase summary (cookies/scripts before/after consent)
   - /scan endpoint now includes structured_checks in response

3. JS-heavy Website Fixes (dm, Zalando, HWK):
   - dsi_helpers.py: goto_resilient (networkidle→domcontentloaded fallback)
   - try_dismiss_consent_banner before text extraction
   - PDF redirect detection (dm.de redirects to GCS PDF)

4. Caritas False Positive Fixes:
   - Phone regex allows parentheses: +49 (0)761 → now matches
   - "Recht auf Widerspruch" (3 words) + §23 KDG → matches Art. 21
   - Church authorities: "Katholisches Datenschutzzentrum" recognized

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,118 @@
|
||||
"""
|
||||
DSI Discovery Helpers — resilient navigation, consent dismissal, PDF redirect detection.
|
||||
|
||||
Extracted from dsi_discovery.py to keep modules under 500 LOC.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from playwright.async_api import Page, TimeoutError as PlaywrightTimeout
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def goto_resilient(
    page: Page,
    url: str,
    timeout: int = 60000,
    *,
    settle_ms: int = 5000,
) -> None:
    """Navigate to ``url``, degrading from ``networkidle`` to ``domcontentloaded``.

    SPAs like Zalando never reach ``networkidle`` because of continuous
    background requests. When the first attempt times out, the page is loaded
    again waiting only for ``domcontentloaded``, followed by a fixed pause so
    client-side JavaScript can render the main content.

    Args:
        page: Playwright page to navigate.
        url: Target URL.
        timeout: Navigation timeout in milliseconds, applied to *each*
            attempt. Because the fallback is a second navigation, the worst
            case is roughly ``2 * timeout + settle_ms``.
        settle_ms: Pause in milliseconds after the fallback navigation.
            Values ``<= 0`` skip the pause.

    Raises:
        PlaywrightTimeout: If the ``domcontentloaded`` fallback times out too.
    """
    try:
        await page.goto(url, wait_until="networkidle", timeout=timeout)
    except PlaywrightTimeout:
        logger.info("networkidle timeout for %s, falling back to domcontentloaded", url)
        await page.goto(url, wait_until="domcontentloaded", timeout=timeout)
        # domcontentloaded fires before client-side rendering has finished,
        # so give the page's scripts a moment before callers read the DOM.
        if settle_ms > 0:
            await page.wait_for_timeout(settle_ms)
|
||||
|
||||
|
||||
async def try_dismiss_consent_banner(page: Page) -> bool:
    """Try to dismiss a cookie consent banner that blocks page content.

    Three strategies are attempted in order, stopping at the first success:

    1. Usercentrics, whose buttons live inside a shadow root.
    2. Well-known accept-button selectors of common consent platforms
       (OneTrust, Cookiebot, Didomi, Borlabs, ...).
    3. Any button whose visible text exactly matches a known accept label.

    The whole routine is best-effort: failures inside a strategy are logged
    at debug level and never propagate to the caller.

    Args:
        page: Playwright page that has already been navigated.

    Returns:
        True if a banner button was clicked, False otherwise.
    """
    strategies = (
        _dismiss_usercentrics_shadow_dom,
        _dismiss_known_cmp_button,
        _dismiss_by_button_text,
    )
    for strategy in strategies:
        if await strategy(page):
            return True
    return False


async def _dismiss_usercentrics_shadow_dom(page: Page) -> bool:
    """Click the accept button inside the Usercentrics shadow root, if present."""
    try:
        # The buttons are inside a shadow root, so regular selectors cannot
        # reach them; the lookup and the click both happen in page context.
        clicked = await page.evaluate("""() => {
            const root = document.querySelector('#usercentrics-root');
            if (!root || !root.shadowRoot) return false;
            const buttons = root.shadowRoot.querySelectorAll('button');
            for (const btn of buttons) {
                const t = btn.textContent.trim().toLowerCase();
                if (t.includes('akzeptieren') || t.includes('accept')
                    || t.includes('zustimmen') || t.includes('agree')) {
                    btn.click();
                    return true;
                }
            }
            return false;
        }""")
        if clicked:
            logger.info("Dismissed Usercentrics consent banner (shadow DOM)")
            await page.wait_for_timeout(2000)  # let the overlay close
            return True
    except Exception:
        # Best-effort: keep going with the next strategy, but leave a trace.
        logger.debug("Usercentrics consent dismissal failed", exc_info=True)
    return False


async def _dismiss_known_cmp_button(page: Page) -> bool:
    """Click the first visible accept button of a known consent platform."""
    accept_selectors = (
        "#onetrust-accept-btn-handler",
        "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
        "#didomi-notice-agree-button",
        "#BorlabsCookieBox .cookie-accept, [data-cookie-accept]",
        ".cmpboxbtn.cmpboxbtnyes",
        ".klaro .cm-btn-accept",
        ".cky-btn-accept",
        "[class*='qc-cmp2-summary-buttons'] button:first-child",
        "#tarteaucitronPersonalize2",
    )
    for sel in accept_selectors:
        try:
            btn = page.locator(sel).first
            if await btn.count() > 0 and await btn.is_visible():
                await btn.click(timeout=3000)
                logger.info("Dismissed consent banner via %s", sel)
                await page.wait_for_timeout(2000)  # let the overlay close
                return True
        except Exception:
            # A failing selector must not prevent trying the remaining ones.
            logger.debug("Consent selector %s failed", sel, exc_info=True)
    return False


async def _dismiss_by_button_text(page: Page) -> bool:
    """Click the first button whose trimmed text equals a known accept label."""
    accept_texts = [
        "Alle akzeptieren", "Alles akzeptieren", "Alle Cookies akzeptieren",
        "Accept all", "Accept All Cookies", "Akzeptieren", "Zustimmen",
        "Einverstanden", "Ich stimme zu",
    ]
    try:
        # Exact (case-sensitive) match on purpose: substring matching would
        # also hit unrelated buttons that merely contain one of the labels.
        clicked = await page.evaluate("""(texts) => {
            for (const btn of document.querySelectorAll('button, a[role="button"]')) {
                const t = (btn.textContent || '').trim();
                for (const target of texts) {
                    if (t === target) { btn.click(); return true; }
                }
            }
            return false;
        }""", accept_texts)
        if clicked:
            logger.info("Dismissed consent banner via generic text match")
            await page.wait_for_timeout(2000)  # let the overlay close
            return True
    except Exception:
        logger.debug("Generic consent text match failed", exc_info=True)
    return False
|
||||
|
||||
|
||||
def is_pdf_redirect(original_url: str, final_url: str) -> bool:
    """Check whether navigation ended on a PDF or on external object storage.

    Args:
        original_url: URL that was requested. Currently unused; kept so the
            signature stays stable for existing callers.
        final_url: URL the browser ended up on after following redirects.

    Returns:
        True if ``final_url`` points at a ``.pdf`` resource (query string and
        fragment are ignored) or contains a known object-storage host
        (Google Cloud Storage, Azure Blob Storage, Amazon S3). False
        otherwise, including when ``final_url`` is empty.
    """
    from urllib.parse import urlsplit

    if not final_url:
        return False

    final_lower = final_url.lower()

    # Look at the path component only, so "file.pdf?download=1" and
    # "file.pdf#page=2" are recognised as PDFs as well.
    try:
        path = urlsplit(final_lower).path
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): fall back to raw text.
        path = final_lower

    # The raw-string suffix check is kept for backward compatibility.
    if path.endswith(".pdf") or final_lower.endswith(".pdf"):
        return True

    storage_hosts = (
        "storage.googleapis.com",
        "blob.core.windows.net",
        "s3.amazonaws.com",
    )
    return any(host in final_lower for host in storage_hosts)
|
||||
Reference in New Issue
Block a user