fix: 4 bugs from IHK Konstanz scan validation

1. DSE-Matcher: fixed Google/YouTube false match — the provider-name
   fallback now requires a match on 2+ words, so "Google" alone no longer
   matches the YouTube section
2. AGB/Widerrufsbelehrung: new only_ecommerce flag — these checks are
   skipped for non-shop websites (shops are detected via payment
   providers and cart keywords)
3. DSE-internal link following — scanner now discovers links WITHIN the
   privacy policy and scans those too (finds regional DSE sub-pages)
4. Expanded keyword synonyms for DSE mandatory checks:
   - "Zweck und Rechtsgrundlage" now matches "zwecke"
   - "behoerdlichen datenschutzbeauftragt" matches DSB
   - "aufsichtsbehörde" with umlaut matches

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-29 17:57:19 +02:00
parent 0f3ba9c207
commit fff47cc52e
3 changed files with 70 additions and 7 deletions
@@ -58,9 +58,11 @@ AI_TEXT_PATTERNS = [
# (regex, label) pairs. Each regex captures, in group 1, the value of a
# double-quoted href attribute whose URL contains one of the listed keywords;
# the label names the kind of legal page the link points to.
FOOTER_LINK_PATTERNS = [
(r'href="([^"]*(?:impressum|imprint|legal-notice)[^"]*)"', "impressum"),
# NOTE(review): every alternative of this entry is also present in the next
# entry, which carries the same label — this looks like the pre-change line of
# the diff and is redundant if both are live; confirm against the real file.
(r'href="([^"]*(?:datenschutz|privacy|dsgvo)[^"]*)"', "datenschutz"),
# NOTE(review): any href matching `hinweise.?zum.?datenschutz` already contains
# "datenschutz", so the added alternative does not widen what this entry matches.
(r'href="([^"]*(?:datenschutz|privacy|dsgvo|hinweise.?zum.?datenschutz)[^"]*)"', "datenschutz"),
(r'href="([^"]*(?:agb|terms|nutzungsbedingungen)[^"]*)"', "agb"),
(r'href="([^"]*(?:cookie)[^"]*)"', "cookies"),
# Also find deep DSE links (regional pages, sub-pages)
# NOTE(review): all three keywords below start with "datenschutz", so every href
# matched here is also matched by the "datenschutz" entries above; only the
# label differs. Verify that consumers actually distinguish "datenschutz_deep".
(r'href="([^"]*(?:datenschutzinformation|datenschutzerklaerung|datenschutzerkl)[^"]*)"', "datenschutz_deep"),
]
@@ -97,11 +99,31 @@ async def scan_website(base_url: str) -> ScanResult:
_detect_services(start_html, origin, result)
_detect_ai_mentions(start_html, origin, result)
# Process other pages
# Process other pages + discover DSE-internal links
dse_internal_urls = set()
for url, html in zip(other_urls, other_htmls):
if isinstance(html, str) and html:
_detect_services(html, url, result)
_detect_ai_mentions(html, url, result)
# If this is a DSE page, find links within it
if re.search(r"datenschutz|privacy|dsgvo", url, re.IGNORECASE):
for pattern, _ in FOOTER_LINK_PATTERNS:
for match in re.finditer(pattern, html, re.IGNORECASE):
href = match.group(1)
if href.startswith("/"):
href = urljoin(origin, href)
if href.startswith("http") and href not in page_urls:
dse_internal_urls.add(href)
# 4. Follow DSE-internal links (additional pages linked from privacy policy)
if dse_internal_urls:
extra_urls = [u for u in list(dse_internal_urls)[:5] if u not in page_urls]
if extra_urls:
extra_tasks = [_fetch_page(client, u, result) for u in extra_urls]
extra_htmls = await asyncio.gather(*extra_tasks, return_exceptions=True)
for url, html in zip(extra_urls, extra_htmls):
if isinstance(html, str) and html:
_detect_services(html, url, result)
# Deduplicate services
seen = set()