fix: 4 bugs from IHK Konstanz scan validation
1. DSE-Matcher: Google/YouTube false match — the provider-name fallback now requires a match on 2+ words, so "Google" alone no longer matches the YouTube section.
2. AGB/Widerrufsbelehrung: new only_ecommerce flag — these checks are skipped for non-shop websites (shop detection uses payment providers and cart keywords).
3. DSE-internal link following — the scanner now discovers links WITHIN the privacy policy and scans those pages too (this finds regional DSE sub-pages).
4. Expanded keyword synonyms for the DSE mandatory checks:
   - "Zweck und Rechtsgrundlage" now also matches "zwecke"
   - "behoerdlichen datenschutzbeauftragt" now matches the DSB check
   - "aufsichtsbehörde" (with umlaut) now matches

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -58,9 +58,11 @@ AI_TEXT_PATTERNS = [
|
||||
|
||||
FOOTER_LINK_PATTERNS = [
|
||||
(r'href="([^"]*(?:impressum|imprint|legal-notice)[^"]*)"', "impressum"),
|
||||
(r'href="([^"]*(?:datenschutz|privacy|dsgvo)[^"]*)"', "datenschutz"),
|
||||
(r'href="([^"]*(?:datenschutz|privacy|dsgvo|hinweise.?zum.?datenschutz)[^"]*)"', "datenschutz"),
|
||||
(r'href="([^"]*(?:agb|terms|nutzungsbedingungen)[^"]*)"', "agb"),
|
||||
(r'href="([^"]*(?:cookie)[^"]*)"', "cookies"),
|
||||
# Also find deep DSE links (regional pages, sub-pages)
|
||||
(r'href="([^"]*(?:datenschutzinformation|datenschutzerklaerung|datenschutzerkl)[^"]*)"', "datenschutz_deep"),
|
||||
]
|
||||
|
||||
|
||||
@@ -97,11 +99,31 @@ async def scan_website(base_url: str) -> ScanResult:
|
||||
_detect_services(start_html, origin, result)
|
||||
_detect_ai_mentions(start_html, origin, result)
|
||||
|
||||
# Process other pages
|
||||
# Process other pages + discover DSE-internal links
|
||||
dse_internal_urls = set()
|
||||
for url, html in zip(other_urls, other_htmls):
|
||||
if isinstance(html, str) and html:
|
||||
_detect_services(html, url, result)
|
||||
_detect_ai_mentions(html, url, result)
|
||||
# If this is a DSE page, find links within it
|
||||
if re.search(r"datenschutz|privacy|dsgvo", url, re.IGNORECASE):
|
||||
for pattern, _ in FOOTER_LINK_PATTERNS:
|
||||
for match in re.finditer(pattern, html, re.IGNORECASE):
|
||||
href = match.group(1)
|
||||
if href.startswith("/"):
|
||||
href = urljoin(origin, href)
|
||||
if href.startswith("http") and href not in page_urls:
|
||||
dse_internal_urls.add(href)
|
||||
|
||||
# 4. Follow DSE-internal links (additional pages linked from privacy policy)
|
||||
if dse_internal_urls:
|
||||
extra_urls = [u for u in list(dse_internal_urls)[:5] if u not in page_urls]
|
||||
if extra_urls:
|
||||
extra_tasks = [_fetch_page(client, u, result) for u in extra_urls]
|
||||
extra_htmls = await asyncio.gather(*extra_tasks, return_exceptions=True)
|
||||
for url, html in zip(extra_urls, extra_htmls):
|
||||
if isinstance(html, str) and html:
|
||||
_detect_services(html, url, result)
|
||||
|
||||
# Deduplicate services
|
||||
seen = set()
|
||||
|
||||
Reference in New Issue
Block a user