fix: 4 bugs from IHK Konstanz scan validation

1. DSE-Matcher: Google/YouTube false match — the provider-name fallback now requires a 2+ word match, so "Google" alone no longer matches the YouTube section.
2. AGB/Widerrufsbelehrung: only_ecommerce flag — these checks are skipped for non-shop websites (detected via payment providers, cart keywords).
3. DSE-internal link following — the scanner now discovers links WITHIN the privacy policy and scans those too (finds regional DSE sub-pages).
4. Expanded keyword synonyms for DSE mandatory checks:
   - "Zweck und Rechtsgrundlage" now matches "zwecke"
   - "behoerdlichen datenschutzbeauftragt" matches DSB
   - "aufsichtsbehörde" with umlaut matches

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-29 17:57:19 +02:00
parent 0f3ba9c207
commit fff47cc52e
3 changed files with 70 additions and 7 deletions
@@ -58,9 +58,11 @@ AI_TEXT_PATTERNS = [

 FOOTER_LINK_PATTERNS = [
    (r'href="([^"]*(?:impressum|imprint|legal-notice)[^"]*)"', "impressum"),
-    (r'href="([^"]*(?:datenschutz|privacy|dsgvo)[^"]*)"', "datenschutz"),
+    (r'href="([^"]*(?:datenschutz|privacy|dsgvo|hinweise.?zum.?datenschutz)[^"]*)"', "datenschutz"),
    (r'href="([^"]*(?:agb|terms|nutzungsbedingungen)[^"]*)"', "agb"),
    (r'href="([^"]*(?:cookie)[^"]*)"', "cookies"),
+    # Also find deep DSE links (regional pages, sub-pages)
+    (r'href="([^"]*(?:datenschutzinformation|datenschutzerklaerung|datenschutzerkl)[^"]*)"', "datenschutz_deep"),
 ]


@@ -97,11 +99,31 @@ async def scan_website(base_url: str) -> ScanResult:
        _detect_services(start_html, origin, result)
        _detect_ai_mentions(start_html, origin, result)

-        # Process other pages
+        # Process other pages + discover DSE-internal links
+        dse_internal_urls = set()
        for url, html in zip(other_urls, other_htmls):
            if isinstance(html, str) and html:
                _detect_services(html, url, result)
                _detect_ai_mentions(html, url, result)
+                # If this is a DSE page, find links within it
+                if re.search(r"datenschutz|privacy|dsgvo", url, re.IGNORECASE):
+                    for pattern, _ in FOOTER_LINK_PATTERNS:
+                        for match in re.finditer(pattern, html, re.IGNORECASE):
+                            href = match.group(1)
+                            if href.startswith("/"):
+                                href = urljoin(origin, href)
+                            if href.startswith("http") and href not in page_urls:
+                                dse_internal_urls.add(href)
+
+        # 4. Follow DSE-internal links (additional pages linked from privacy policy)
+        if dse_internal_urls:
+            extra_urls = [u for u in list(dse_internal_urls)[:5] if u not in page_urls]
+            if extra_urls:
+                extra_tasks = [_fetch_page(client, u, result) for u in extra_urls]
+                extra_htmls = await asyncio.gather(*extra_tasks, return_exceptions=True)
+                for url, html in zip(extra_urls, extra_htmls):
+                    if isinstance(html, str) and html:
+                        _detect_services(html, url, result)

    # Deduplicate services
    seen = set()