fix: Eliminate GA false positive + handle short DSI documents

Service detection:
- Only search script tags + src/href attributes for service patterns
- Prevents false positives from privacy-policy (DSE, Datenschutzerklärung) text
  that merely mentions a service (e.g. the IHK DSE describes etracker but also
  contains 'google analytics' in its text)
- Technical patterns (with regex chars) still checked in full HTML

Short documents:
- Documents with < 200 words are flagged as 'Kurzhinweis' instead of
  'MANGELHAFT' — they are too short for an Art. 13 completeness check
- Prevents 96-word navigation pages from showing 8 missing fields

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-05 18:21:37 +02:00
parent 8d6959e8b2
commit 8fb2061e9b
2 changed files with 35 additions and 5 deletions
@@ -228,12 +228,25 @@ async def _fetch_page(
def _detect_services(html: str, url: str, result: ScanResult) -> None:
    """Detect third-party services referenced by a page's HTML.

    Each pattern in ``SERVICE_REGISTRY`` is first searched only in the
    page's "technical context": the ``<script>`` elements plus the values of
    ``src``/``href``/``data-src``/``action`` attributes. This avoids false
    positives from visible text that merely mentions a service
    (e.g. 'wir nutzen nicht Google Analytics').

    Patterns that contain regex/URL punctuation are additionally searched in
    the full HTML when the technical context does not match.

    Args:
        html: Raw HTML of the fetched page.
        url: URL the HTML was fetched from; stored as ``found_on``.
        result: Scan result whose ``detected_services`` list is appended to
            in place (at most one entry per registry pattern).
    """
    # Whole <script>...</script> elements (opening tag included).
    script_blocks = re.findall(
        r"<script[^>]*>.*?</script>", html, re.DOTALL | re.IGNORECASE
    )
    # Values of attributes that typically carry URLs.
    attr_values = re.findall(
        r'(?:src|href|data-src|action)=["\']([^"\']+)["\']', html, re.IGNORECASE
    )
    technical_context = " ".join(script_blocks) + " " + " ".join(attr_values)

    # Characters that mark a pattern as "technical" (regex escapes, groups,
    # quantifier braces, URL punctuation). The previous check iterated over the
    # raw string r"\(\.\/\d{" character by character, which also treated the
    # plain letter "d" as a marker; "\d" is still covered by the backslash.
    technical_markers = ("\\", "(", ".", "/", "{")

    for pattern, meta in SERVICE_REGISTRY.items():
        # Preferred: match inside scripts/URLs only.
        found = re.search(pattern, technical_context, re.IGNORECASE) is not None

        # Fallback: technical-looking patterns are unlikely to appear in prose,
        # so they may also be matched against the full HTML.
        if not found and any(marker in pattern for marker in technical_markers):
            found = re.search(pattern, html, re.IGNORECASE) is not None

        if found:
            result.detected_services.append(DetectedService(found_on=url, **meta))
def _detect_ai_mentions(html: str, url: str, result: ScanResult) -> None: