diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 0104e7d4..0a8afe8c 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -890,13 +890,19 @@ _DOC_TYPE_LABELS = { "dsb": "DSB-Kontakt", } -# Canonical 8 doc types in the same order as the frontend ComplianceCheckTab. +# Canonical doc types in the same order as the frontend ComplianceCheckTab. # The route pads `results` to always contain an entry for each — even if # the user did not submit a URL — so the email + frontend always show # the complete checklist (missing rows marked as 'Nicht eingereicht'). +# +# DSB-Kontakt is intentionally NOT canonical: per GDPR practice the DSB is +# named *inside* the DSI/datenschutz document (email or contact block), not +# as a separate page. We check 'DSB benannt' as a sub-check of the DSE +# instead. If a tenant insists on a separate DSB document, they can still +# submit one — it just won't appear as a missing checklist row. _ALL_DOC_TYPES = [ "dse", "impressum", "social_media", "cookie", - "agb", "nutzungsbedingungen", "widerruf", "dsb", + "agb", "nutzungsbedingungen", "widerruf", ] diff --git a/backend-compliance/compliance/services/section_splitter.py b/backend-compliance/compliance/services/section_splitter.py index c9326a58..ad7b91d4 100644 --- a/backend-compliance/compliance/services/section_splitter.py +++ b/backend-compliance/compliance/services/section_splitter.py @@ -199,6 +199,12 @@ def auto_fill_from_dsi(doc_entries: list[dict]) -> None: for entry in doc_entries: if entry.get("text") or entry.get("url"): continue # Already has content + # Auto-discovery already tried + decided: skip. Don't override its + # 'NICHT GEFUNDEN' verdict with a pseudo-match from DSI sections + # (which produces false MANGELHAFT findings for genuinely missing + # docs like BMW's AGB). + if entry.get("discovery_attempted") and not entry.get("auto_discovered"): + continue doc_type = entry["doc_type"] section_text = _find_section_for_type(sections, doc_type) @@ -267,8 +273,10 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]: return findings # For each entry, check if: - # a) It's empty → search other texts - # b) It has text but the text doesn't match the doc_type → search other texts + # a) It has text but the text doesn't match the doc_type → search other texts + # (Empty entries from auto-discovery 'not found' are NOT pseudo-filled + # from other docs — that would silently revive a 'NICHT GEFUNDEN' verdict + # as a misleading MANGELHAFT row.) for entry in doc_entries: target_type = entry["doc_type"] keywords = _DOC_TYPE_KEYWORDS.get(target_type, []) @@ -278,13 +286,15 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]: has_text = entry.get("text") and len(entry["text"].split()) > 50 text_matches = False if has_text: - # Check if the current text actually contains this doc_type's content entry_lower = entry["text"].lower() match_score = sum(1 for kw in keywords if kw in entry_lower) text_matches = match_score >= 2 if has_text and text_matches: continue # Text present AND matches doc_type → skip + # Skip empty entries the auto-discovery has already ruled on. + if not has_text and entry.get("discovery_attempted") and not entry.get("auto_discovered"): + continue # Search all other texts for this doc_type's keywords best_match: dict | None = None