diff --git a/backend-compliance/compliance/services/dsi_document_checker.py b/backend-compliance/compliance/services/dsi_document_checker.py
index 4d96544..3ec52ed 100644
--- a/backend-compliance/compliance/services/dsi_document_checker.py
+++ b/backend-compliance/compliance/services/dsi_document_checker.py
@@ -188,6 +188,23 @@ def check_document_completeness(
         })
         return findings
 
+    # Short documents (< 200 words) are likely navigation snippets or
+    # introductory pages, not full Art. 13 documents — flag but don't check
+    word_count = len(text.split())
+    if word_count < 200 and doc_type == "dse":
+        findings.append({
+            "code": f"DSI-SCORE-{doc_type.upper()}",
+            "severity": "LOW",
+            "text": (
+                f"'{doc_title}': Kurzhinweis ({word_count} Woerter) — zu kurz fuer "
+                f"eine vollstaendige Art. 13 DSGVO Pruefung. Kein eigenstaendiges DSI-Dokument."
+            ),
+            "doc_title": doc_title,
+            "doc_url": doc_url,
+            "doc_type": doc_type,
+        })
+        return findings
+
     # Select checklist based on document type
     if doc_type in ("dse", "datenschutz", "privacy"):
         checklist = ART13_CHECKLIST
diff --git a/backend-compliance/compliance/services/website_scanner.py b/backend-compliance/compliance/services/website_scanner.py
index 18256a7..1327b84 100644
--- a/backend-compliance/compliance/services/website_scanner.py
+++ b/backend-compliance/compliance/services/website_scanner.py
@@ -228,12 +228,25 @@ async def _fetch_page(
 
 
 def _detect_services(html: str, url: str, result: ScanResult) -> None:
-    """Detect third-party services in HTML."""
+    """Detect third-party services in HTML.
+
+    Searches script tags + src/href attributes to avoid false positives
+    from DSE text mentioning services (e.g. 'wir nutzen nicht Google Analytics').
+    """
+    # Extract script content + all src/href attributes for matching
+    scripts = " ".join(re.findall(r"<script[^>]*>.*?</script>", html, re.DOTALL | re.IGNORECASE))
+    attrs = " ".join(re.findall(r'(?:src|href|data-src|action)=["\']([^"\']+)["\']', html, re.IGNORECASE))
+    technical_context = scripts + " " + attrs
+
     for pattern, meta in SERVICE_REGISTRY.items():
-        if re.search(pattern, html, re.IGNORECASE):
-            result.detected_services.append(DetectedService(
-                found_on=url, **meta,
-            ))
+        # First try in technical context (scripts + URLs) — no false positives
+        if re.search(pattern, technical_context, re.IGNORECASE):
+            result.detected_services.append(DetectedService(found_on=url, **meta))
+            continue
+        # For patterns that are purely technical (contain special chars), also check full HTML
+        is_technical = any(c in pattern for c in r"\(\.\/\d{")
+        if is_technical and re.search(pattern, html, re.IGNORECASE):
+            result.detected_services.append(DetectedService(found_on=url, **meta))
 
 
 def _detect_ai_mentions(html: str, url: str, result: ScanResult) -> None: