diff --git a/backend-compliance/compliance/services/service_registry.py b/backend-compliance/compliance/services/service_registry.py index 2db8e22..0503b9a 100644 --- a/backend-compliance/compliance/services/service_registry.py +++ b/backend-compliance/compliance/services/service_registry.py @@ -16,7 +16,7 @@ SERVICE_REGISTRY: dict[str, dict] = { # ═══════════════════════════════════════════════════════════════ # TRACKING & ANALYTICS # ═══════════════════════════════════════════════════════════════ - r"google.?analytics|gtag\(|UA-\d+|G-\w{5,}": { + r"google.?analytics|gtag\(|UA-\d{4,}|G-[A-Z0-9]{8,12}": { "id": "google_analytics", "name": "Google Analytics", "category": "tracking", "provider": "Google LLC", "country": "US", "eu_adequate": False, "requires_consent": True, "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", @@ -51,6 +51,11 @@ SERVICE_REGISTRY: dict[str, dict] = { "provider": "Plausible Insights", "country": "EE", "eu_adequate": True, "requires_consent": False, "legal_ref": "EU-Anbieter, cookieless", }, + r"etracker\.com|etracker\.de|etrackerCookieless": { + "id": "etracker", "name": "etracker", "category": "tracking", + "provider": "etracker GmbH", "country": "DE", "eu_adequate": True, + "requires_consent": True, "legal_ref": "§25 TDDDG, DE-Anbieter mit ePrivacy-Siegel", + }, r"pirsch\.io": { "id": "pirsch", "name": "Pirsch Analytics", "category": "tracking", "provider": "Pirsch GmbH", "country": "DE", "eu_adequate": True, diff --git a/backend-compliance/compliance/services/website_scanner.py b/backend-compliance/compliance/services/website_scanner.py index 6d9213d..6538a4d 100644 --- a/backend-compliance/compliance/services/website_scanner.py +++ b/backend-compliance/compliance/services/website_scanner.py @@ -61,8 +61,10 @@ FOOTER_LINK_PATTERNS = [ (r'href="([^"]*(?:datenschutz|privacy|dsgvo|hinweise.?zum.?datenschutz)[^"]*)"', "datenschutz"), (r'href="([^"]*(?:agb|terms|nutzungsbedingungen)[^"]*)"', "agb"), (r'href="([^"]*(?:cookie)[^"]*)"', "cookies"), - # Also find deep DSE links (regional pages, sub-pages) + # Deep DSE links (regional pages, sub-pages, service marks) (r'href="([^"]*(?:datenschutzinformation|datenschutzerklaerung|datenschutzerkl)[^"]*)"', "datenschutz_deep"), + # Navigation links often contain DSB/privacy sub-pages + (r'href="([^"]*(?:ueber.?uns.*datenschutz|servicemarken.*datenschutz|kontakt.*datenschutz)[^"]*)"', "datenschutz_nav"), ] @@ -86,7 +88,7 @@ async def scan_website(base_url: str) -> ScanResult: href = match.group(1) if href.startswith("/"): href = urljoin(origin, href) - if href.startswith(origin): + if href.startswith(origin) and not re.search(r"\.(css|js|png|jpg|gif|svg|pdf|zip)(\?|$)", href): page_urls.add(href) # 3. Scan all pages in PARALLEL (max 10) @@ -105,14 +107,16 @@ async def scan_website(base_url: str) -> ScanResult: if isinstance(html, str) and html: _detect_services(html, url, result) _detect_ai_mentions(html, url, result) - # If this is a DSE page, find links within it + # If this is a DSE page, find links within it (SAME DOMAIN only) if re.search(r"datenschutz|privacy|dsgvo", url, re.IGNORECASE): for pattern, _ in FOOTER_LINK_PATTERNS: for match in re.finditer(pattern, html, re.IGNORECASE): href = match.group(1) if href.startswith("/"): href = urljoin(origin, href) - if href.startswith("http") and href not in page_urls: + # IMPORTANT: Only follow links on the SAME domain + # External links (etracker.com, google.de) must NOT be scanned + if href.startswith(origin) and href not in page_urls: dse_internal_urls.add(href) # 4. Follow DSE-internal links (additional pages linked from privacy policy) diff --git a/consent-tester/services/script_analyzer.py b/consent-tester/services/script_analyzer.py index 4079362..aeb36d4 100644 --- a/consent-tester/services/script_analyzer.py +++ b/consent-tester/services/script_analyzer.py @@ -6,7 +6,7 @@ import re from dataclasses import dataclass SERVICE_PATTERNS: dict[str, dict] = { - r"google.?analytics|gtag|UA-\d|G-\w{5}": { + r"google.?analytics|gtag\(|UA-\d{4,}|G-[A-Z0-9]{8,12}": { "name": "Google Analytics", "requires_consent": True, "legal_ref": "§25 TDDDG, Art. 44-49 DSGVO", },