fix: 5 issues from IHK scan — false positives + missing etracker

1. GA regex: G-\w{5,} matched CSS classes (g-7031048). Now requires
   G-[A-Z0-9]{8,12} (uppercase letters/digits after G-, 8-12 chars = real
   GA4 ID). UA-\d+ is likewise tightened to UA-\d{4,}.
2. External page scanning: DSE-internal links now SAME DOMAIN only.
   Previously followed links to etracker.com, google.de/policies etc.
   and detected services on THOSE sites as IHK services.
3. Added etracker to service registry (DE, ePrivacy-certified)
4. Static assets (CSS/JS/image files, plus PDF/ZIP downloads) excluded from page scanning
5. Added navigation-pattern links (ueber-uns / servicemarken / kontakt paths
   containing "datenschutz") to reach deeper DSE sub-pages

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-29 19:08:07 +02:00
parent 891fc5bea0
commit 5eeef3a9c3
3 changed files with 15 additions and 6 deletions
@@ -16,7 +16,7 @@ SERVICE_REGISTRY: dict[str, dict] = {
# ═══════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════
# TRACKING & ANALYTICS # TRACKING & ANALYTICS
# ═══════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════
r"google.?analytics|gtag\(|UA-\d+|G-\w{5,}": { r"google.?analytics|gtag\(|UA-\d{4,}|G-[A-Z0-9]{8,12}": {
"id": "google_analytics", "name": "Google Analytics", "category": "tracking", "id": "google_analytics", "name": "Google Analytics", "category": "tracking",
"provider": "Google LLC", "country": "US", "eu_adequate": False, "provider": "Google LLC", "country": "US", "eu_adequate": False,
"requires_consent": True, "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", "requires_consent": True, "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO",
@@ -51,6 +51,11 @@ SERVICE_REGISTRY: dict[str, dict] = {
"provider": "Plausible Insights", "country": "EE", "eu_adequate": True, "provider": "Plausible Insights", "country": "EE", "eu_adequate": True,
"requires_consent": False, "legal_ref": "EU-Anbieter, cookieless", "requires_consent": False, "legal_ref": "EU-Anbieter, cookieless",
}, },
r"etracker\.com|etracker\.de|etrackerCookieless": {
"id": "etracker", "name": "etracker", "category": "tracking",
"provider": "etracker GmbH", "country": "DE", "eu_adequate": True,
"requires_consent": True, "legal_ref": "§25 TDDDG, DE-Anbieter mit ePrivacy-Siegel",
},
r"pirsch\.io": { r"pirsch\.io": {
"id": "pirsch", "name": "Pirsch Analytics", "category": "tracking", "id": "pirsch", "name": "Pirsch Analytics", "category": "tracking",
"provider": "Pirsch GmbH", "country": "DE", "eu_adequate": True, "provider": "Pirsch GmbH", "country": "DE", "eu_adequate": True,
@@ -61,8 +61,10 @@ FOOTER_LINK_PATTERNS = [
(r'href="([^"]*(?:datenschutz|privacy|dsgvo|hinweise.?zum.?datenschutz)[^"]*)"', "datenschutz"), (r'href="([^"]*(?:datenschutz|privacy|dsgvo|hinweise.?zum.?datenschutz)[^"]*)"', "datenschutz"),
(r'href="([^"]*(?:agb|terms|nutzungsbedingungen)[^"]*)"', "agb"), (r'href="([^"]*(?:agb|terms|nutzungsbedingungen)[^"]*)"', "agb"),
(r'href="([^"]*(?:cookie)[^"]*)"', "cookies"), (r'href="([^"]*(?:cookie)[^"]*)"', "cookies"),
# Also find deep DSE links (regional pages, sub-pages) # Deep DSE links (regional pages, sub-pages, service marks)
(r'href="([^"]*(?:datenschutzinformation|datenschutzerklaerung|datenschutzerkl)[^"]*)"', "datenschutz_deep"), (r'href="([^"]*(?:datenschutzinformation|datenschutzerklaerung|datenschutzerkl)[^"]*)"', "datenschutz_deep"),
# Navigation links often contain DSB/privacy sub-pages
(r'href="([^"]*(?:ueber.?uns.*datenschutz|servicemarken.*datenschutz|kontakt.*datenschutz)[^"]*)"', "datenschutz_nav"),
] ]
@@ -86,7 +88,7 @@ async def scan_website(base_url: str) -> ScanResult:
href = match.group(1) href = match.group(1)
if href.startswith("/"): if href.startswith("/"):
href = urljoin(origin, href) href = urljoin(origin, href)
if href.startswith(origin): if href.startswith(origin) and not re.search(r"\.(css|js|png|jpg|gif|svg|pdf|zip)(\?|$)", href):
page_urls.add(href) page_urls.add(href)
# 3. Scan all pages in PARALLEL (max 10) # 3. Scan all pages in PARALLEL (max 10)
@@ -105,14 +107,16 @@ async def scan_website(base_url: str) -> ScanResult:
if isinstance(html, str) and html: if isinstance(html, str) and html:
_detect_services(html, url, result) _detect_services(html, url, result)
_detect_ai_mentions(html, url, result) _detect_ai_mentions(html, url, result)
# If this is a DSE page, find links within it # If this is a DSE page, find links within it (SAME DOMAIN only)
if re.search(r"datenschutz|privacy|dsgvo", url, re.IGNORECASE): if re.search(r"datenschutz|privacy|dsgvo", url, re.IGNORECASE):
for pattern, _ in FOOTER_LINK_PATTERNS: for pattern, _ in FOOTER_LINK_PATTERNS:
for match in re.finditer(pattern, html, re.IGNORECASE): for match in re.finditer(pattern, html, re.IGNORECASE):
href = match.group(1) href = match.group(1)
if href.startswith("/"): if href.startswith("/"):
href = urljoin(origin, href) href = urljoin(origin, href)
if href.startswith("http") and href not in page_urls: # IMPORTANT: Only follow links on the SAME domain
# External links (etracker.com, google.de) must NOT be scanned
if href.startswith(origin) and href not in page_urls:
dse_internal_urls.add(href) dse_internal_urls.add(href)
# 4. Follow DSE-internal links (additional pages linked from privacy policy) # 4. Follow DSE-internal links (additional pages linked from privacy policy)
+1 -1
View File
@@ -6,7 +6,7 @@ import re
from dataclasses import dataclass from dataclasses import dataclass
SERVICE_PATTERNS: dict[str, dict] = { SERVICE_PATTERNS: dict[str, dict] = {
r"google.?analytics|gtag|UA-\d|G-\w{5}": { r"google.?analytics|gtag\(|UA-\d{4,}|G-[A-Z0-9]{8,12}": {
"name": "Google Analytics", "requires_consent": True, "name": "Google Analytics", "requires_consent": True,
"legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO",
}, },