fix: 5 issues from IHK scan — false positives + missing etracker
1. GA regex: G-\w{5,} matched CSS classes (g-7031048). Now requires
G-[A-Z0-9]{8,12} (uppercase letters/digits after G-, 8-12 chars = real GA4 ID)
2. External page scanning: links found inside DSE (privacy policy) pages are
now followed on the SAME DOMAIN only. Previously the scanner followed links to
etracker.com, google.de/policies etc. and reported services detected on THOSE
sites as IHK services.
3. Added etracker to service registry (DE, ePrivacy-certified)
4. Static asset and download URLs (CSS/JS/images/PDF/ZIP) excluded from page scanning
5. Navigation-pattern links for deeper DSE sub-pages
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -61,8 +61,10 @@ FOOTER_LINK_PATTERNS = [
|
||||
(r'href="([^"]*(?:datenschutz|privacy|dsgvo|hinweise.?zum.?datenschutz)[^"]*)"', "datenschutz"),
|
||||
(r'href="([^"]*(?:agb|terms|nutzungsbedingungen)[^"]*)"', "agb"),
|
||||
(r'href="([^"]*(?:cookie)[^"]*)"', "cookies"),
|
||||
# Also find deep DSE links (regional pages, sub-pages)
|
||||
# Deep DSE links (regional pages, sub-pages, service marks)
|
||||
(r'href="([^"]*(?:datenschutzinformation|datenschutzerklaerung|datenschutzerkl)[^"]*)"', "datenschutz_deep"),
|
||||
# Navigation links often contain DSB/privacy sub-pages
|
||||
(r'href="([^"]*(?:ueber.?uns.*datenschutz|servicemarken.*datenschutz|kontakt.*datenschutz)[^"]*)"', "datenschutz_nav"),
|
||||
]
|
||||
|
||||
|
||||
@@ -86,7 +88,7 @@ async def scan_website(base_url: str) -> ScanResult:
|
||||
href = match.group(1)
|
||||
if href.startswith("/"):
|
||||
href = urljoin(origin, href)
|
||||
if href.startswith(origin):
|
||||
if href.startswith(origin) and not re.search(r"\.(css|js|png|jpg|gif|svg|pdf|zip)(\?|$)", href):
|
||||
page_urls.add(href)
|
||||
|
||||
# 3. Scan all pages in PARALLEL (max 10)
|
||||
@@ -105,14 +107,16 @@ async def scan_website(base_url: str) -> ScanResult:
|
||||
if isinstance(html, str) and html:
|
||||
_detect_services(html, url, result)
|
||||
_detect_ai_mentions(html, url, result)
|
||||
# If this is a DSE page, find links within it
|
||||
# If this is a DSE page, find links within it (SAME DOMAIN only)
|
||||
if re.search(r"datenschutz|privacy|dsgvo", url, re.IGNORECASE):
|
||||
for pattern, _ in FOOTER_LINK_PATTERNS:
|
||||
for match in re.finditer(pattern, html, re.IGNORECASE):
|
||||
href = match.group(1)
|
||||
if href.startswith("/"):
|
||||
href = urljoin(origin, href)
|
||||
if href.startswith("http") and href not in page_urls:
|
||||
# IMPORTANT: Only follow links on the SAME domain
|
||||
# External links (etracker.com, google.de) must NOT be scanned
|
||||
if href.startswith(origin) and href not in page_urls:
|
||||
dse_internal_urls.add(href)
|
||||
|
||||
# 4. Follow DSE-internal links (additional pages linked from privacy policy)
|
||||
|
||||
Reference in New Issue
Block a user