fix: Eliminate GA false positive + handle short DSI documents
Service detection:
- Only search script tags + src/href attributes for service patterns
- Prevents false positives from DSE text mentioning services (e.g. the IHK DSE describes etracker, or 'google analytics' appears in the text)
- Technical patterns (with regex chars) are still checked in the full HTML

Short documents:
- Documents with < 200 words are flagged as 'Kurzhinweis' instead of 'MANGELHAFT' — too short for an Art. 13 completeness check
- Prevents 96-word navigation pages from showing 8 missing fields

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -188,6 +188,23 @@ def check_document_completeness(
|
|||||||
})
|
})
|
||||||
return findings
|
return findings
|
||||||
|
|
||||||
|
# Short documents (< 200 words) are likely navigation snippets or
|
||||||
|
# introductory pages, not full Art. 13 documents — flag but don't check
|
||||||
|
word_count = len(text.split())
|
||||||
|
if word_count < 200 and doc_type == "dse":
|
||||||
|
findings.append({
|
||||||
|
"code": f"DSI-SCORE-{doc_type.upper()}",
|
||||||
|
"severity": "LOW",
|
||||||
|
"text": (
|
||||||
|
f"'{doc_title}': Kurzhinweis ({word_count} Woerter) — zu kurz fuer "
|
||||||
|
f"eine vollstaendige Art. 13 DSGVO Pruefung. Kein eigenstaendiges DSI-Dokument."
|
||||||
|
),
|
||||||
|
"doc_title": doc_title,
|
||||||
|
"doc_url": doc_url,
|
||||||
|
"doc_type": doc_type,
|
||||||
|
})
|
||||||
|
return findings
|
||||||
|
|
||||||
# Select checklist based on document type
|
# Select checklist based on document type
|
||||||
if doc_type in ("dse", "datenschutz", "privacy"):
|
if doc_type in ("dse", "datenschutz", "privacy"):
|
||||||
checklist = ART13_CHECKLIST
|
checklist = ART13_CHECKLIST
|
||||||
|
|||||||
@@ -228,12 +228,25 @@ async def _fetch_page(
|
|||||||
|
|
||||||
|
|
||||||
def _detect_services(html: str, url: str, result: ScanResult) -> None:
    """Detect third-party services in HTML and record them on ``result``.

    Each key of ``SERVICE_REGISTRY`` is used as a case-insensitive regex.
    It is first searched in a "technical context" made of the page's
    ``<script>`` elements plus the values of quoted ``src``/``href``/
    ``data-src``/``action`` attributes. Restricting the search this way avoids
    false positives from privacy-policy text that merely mentions a service
    (e.g. 'wir nutzen nicht Google Analytics' -- "we do not use Google
    Analytics").

    Patterns that contain regex/URL metacharacters are additionally searched
    in the full HTML when the technical context yields no match.

    Args:
        html: Raw HTML of the fetched page.
        url: URL the HTML was fetched from; stored as ``found_on``.
        result: Scan result whose ``detected_services`` list is appended to.
    """
    # Extract script content + all src/href attributes for matching.
    scripts = " ".join(
        re.findall(r"<script[^>]*>.*?</script>", html, re.DOTALL | re.IGNORECASE)
    )
    attrs = " ".join(
        re.findall(
            r'(?:src|href|data-src|action)=["\']([^"\']+)["\']',
            html,
            re.IGNORECASE,
        )
    )
    technical_context = scripts + " " + attrs

    # Characters that mark a pattern as "technical". The previous check
    # iterated over the characters of r"\(\.\/\d{", which also contained the
    # plain letter "d" -- so any pattern with a "d" in it was treated as
    # technical and fell back to the full-HTML search, re-introducing the
    # text-mention false positives. A "\d" token is still covered here
    # because it contains a backslash.
    technical_chars = "\\(./{"

    for pattern, meta in SERVICE_REGISTRY.items():
        # First try the technical context (scripts + URLs).
        if re.search(pattern, technical_context, re.IGNORECASE):
            result.detected_services.append(DetectedService(found_on=url, **meta))
            continue

        # Patterns containing special characters are also checked against the
        # full HTML, since they are unlikely to match ordinary prose.
        is_technical = any(char in pattern for char in technical_chars)
        if is_technical and re.search(pattern, html, re.IGNORECASE):
            result.detected_services.append(DetectedService(found_on=url, **meta))
|
||||||
|
|
||||||
|
|
||||||
def _detect_ai_mentions(html: str, url: str, result: ScanResult) -> None:
|
def _detect_ai_mentions(html: str, url: str, result: ScanResult) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user