From 5eeef3a9c3974108591499763f9cd8f8c0bd59bd Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 29 Apr 2026 19:08:07 +0200 Subject: [PATCH] =?UTF-8?q?fix:=204=20bugs=20from=20IHK=20scan=20=E2=80=94?= =?UTF-8?q?=20false=20positives=20+=20missing=20etracker?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. GA regex: G-\w{5,} matched CSS classes (g-7031048). Now requires G-[A-Z0-9]{8,12} (uppercase after G-, 8-12 chars = real GA4 ID) 2. External page scanning: DSE-internal links now SAME DOMAIN only. Previously followed links to etracker.com, google.de/policies etc. and detected services on THOSE sites as IHK services. 3. Added etracker to service registry (DE, ePrivacy-certified) 4. CSS/JS/image files excluded from page scanning 5. Navigation-pattern links for deeper DSE sub-pages Co-Authored-By: Claude Opus 4.6 (1M context) --- .../compliance/services/service_registry.py | 7 ++++++- .../compliance/services/website_scanner.py | 12 ++++++++---- consent-tester/services/script_analyzer.py | 2 +- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/backend-compliance/compliance/services/service_registry.py b/backend-compliance/compliance/services/service_registry.py index 2db8e22..0503b9a 100644 --- a/backend-compliance/compliance/services/service_registry.py +++ b/backend-compliance/compliance/services/service_registry.py @@ -16,7 +16,7 @@ SERVICE_REGISTRY: dict[str, dict] = { # ═══════════════════════════════════════════════════════════════ # TRACKING & ANALYTICS # ═══════════════════════════════════════════════════════════════ - r"google.?analytics|gtag\(|UA-\d+|G-\w{5,}": { + r"google.?analytics|gtag\(|UA-\d{4,}|G-[A-Z0-9]{8,12}": { "id": "google_analytics", "name": "Google Analytics", "category": "tracking", "provider": "Google LLC", "country": "US", "eu_adequate": False, "requires_consent": True, "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", @@ -51,6 +51,11 @@ SERVICE_REGISTRY: dict[str, dict] = { "provider": "Plausible Insights", "country": "EE", "eu_adequate": True, "requires_consent": False, "legal_ref": "EU-Anbieter, cookieless", }, + r"etracker\.com|etracker\.de|etrackerCookieless": { + "id": "etracker", "name": "etracker", "category": "tracking", + "provider": "etracker GmbH", "country": "DE", "eu_adequate": True, + "requires_consent": True, "legal_ref": "§25 TDDDG, DE-Anbieter mit ePrivacy-Siegel", + }, r"pirsch\.io": { "id": "pirsch", "name": "Pirsch Analytics", "category": "tracking", "provider": "Pirsch GmbH", "country": "DE", "eu_adequate": True, diff --git a/backend-compliance/compliance/services/website_scanner.py b/backend-compliance/compliance/services/website_scanner.py index 6d9213d..6538a4d 100644 --- a/backend-compliance/compliance/services/website_scanner.py +++ b/backend-compliance/compliance/services/website_scanner.py @@ -61,8 +61,10 @@ FOOTER_LINK_PATTERNS = [ (r'href="([^"]*(?:datenschutz|privacy|dsgvo|hinweise.?zum.?datenschutz)[^"]*)"', "datenschutz"), (r'href="([^"]*(?:agb|terms|nutzungsbedingungen)[^"]*)"', "agb"), (r'href="([^"]*(?:cookie)[^"]*)"', "cookies"), - # Also find deep DSE links (regional pages, sub-pages) + # Deep DSE links (regional pages, sub-pages, service marks) (r'href="([^"]*(?:datenschutzinformation|datenschutzerklaerung|datenschutzerkl)[^"]*)"', "datenschutz_deep"), + # Navigation links often contain DSB/privacy sub-pages + (r'href="([^"]*(?:ueber.?uns.*datenschutz|servicemarken.*datenschutz|kontakt.*datenschutz)[^"]*)"', "datenschutz_nav"), ] @@ -86,7 +88,7 @@ async def scan_website(base_url: str) -> ScanResult: href = match.group(1) if href.startswith("/"): href = urljoin(origin, href) - if href.startswith(origin): + if href.startswith(origin) and not re.search(r"\.(css|js|png|jpg|gif|svg|pdf|zip)(\?|$)", href): page_urls.add(href) # 3. Scan all pages in PARALLEL (max 10) @@ -105,14 +107,16 @@ async def scan_website(base_url: str) -> ScanResult: if isinstance(html, str) and html: _detect_services(html, url, result) _detect_ai_mentions(html, url, result) - # If this is a DSE page, find links within it + # If this is a DSE page, find links within it (SAME DOMAIN only) if re.search(r"datenschutz|privacy|dsgvo", url, re.IGNORECASE): for pattern, _ in FOOTER_LINK_PATTERNS: for match in re.finditer(pattern, html, re.IGNORECASE): href = match.group(1) if href.startswith("/"): href = urljoin(origin, href) - if href.startswith("http") and href not in page_urls: + # IMPORTANT: Only follow links on the SAME domain + # External links (etracker.com, google.de) must NOT be scanned + if href.startswith(origin) and href not in page_urls: dse_internal_urls.add(href) # 4. Follow DSE-internal links (additional pages linked from privacy policy) diff --git a/consent-tester/services/script_analyzer.py b/consent-tester/services/script_analyzer.py index 4079362..aeb36d4 100644 --- a/consent-tester/services/script_analyzer.py +++ b/consent-tester/services/script_analyzer.py @@ -6,7 +6,7 @@ import re from dataclasses import dataclass SERVICE_PATTERNS: dict[str, dict] = { - r"google.?analytics|gtag|UA-\d|G-\w{5}": { + r"google.?analytics|gtag\(|UA-\d{4,}|G-[A-Z0-9]{8,12}": { "name": "Google Analytics", "requires_consent": True, "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", },