From bd2d6976d693aced30b2f7120af851ea336d2bee Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Fri, 15 May 2026 00:19:40 +0200
Subject: [PATCH] fix(cross-doc): also check entries with wrong text, not just
 empty ones

Cross-search now validates whether an entry's existing text matches the
expected doc_type using keyword scoring (at least two keyword hits).
If text is present but doesn't match (e.g. Nutzungsbedingungen in the
Widerruf row), the other submitted texts are searched; a finding
explaining the mismatch is also created when the expected content is
found in none of them.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../compliance/services/section_splitter.py   | 42 ++++++++++++++++---
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/backend-compliance/compliance/services/section_splitter.py b/backend-compliance/compliance/services/section_splitter.py
index b7d31cdf..c9326a58 100644
--- a/backend-compliance/compliance/services/section_splitter.py
+++ b/backend-compliance/compliance/services/section_splitter.py
@@ -266,23 +266,33 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
     if not all_texts:
         return findings
 
-    # For each empty or short entry, search all other texts
+    # For each entry, check if:
+    # a) It's empty or short (<= 50 words) → search other texts
+    # b) It has text but the text doesn't match the doc_type → search other texts
     for entry in doc_entries:
-        if entry.get("text") and len(entry["text"].split()) > 50:
-            continue  # Already has content
-
         target_type = entry["doc_type"]
         keywords = _DOC_TYPE_KEYWORDS.get(target_type, [])
         if not keywords:
             continue
 
+        has_text = entry.get("text") and len(entry["text"].split()) > 50
+        text_matches = False
+        if has_text:
+            # Check if the current text actually contains this doc_type's content
+            entry_lower = entry["text"].lower()
+            match_score = sum(1 for kw in keywords if kw in entry_lower)
+            text_matches = match_score >= 2
+
+        if has_text and text_matches:
+            continue  # Text present AND matches doc_type → skip
+
         # Search all other texts for this doc_type's keywords
         best_match: dict | None = None
         best_score = 0
 
         for source_type, source_url, source_text in all_texts:
             if source_type == target_type:
-                continue  # Don't search in the same doc_type
+                continue
 
             text_lower = source_text.lower()
             score = sum(1 for kw in keywords if kw in text_lower)
@@ -333,6 +343,28 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
                 best_match["keyword_hits"],
                 entry["word_count"],
             )
+        elif has_text and not text_matches:
+            # Text present but doesn't match — wrong text assigned
+            findings.append({
+                "id": f"wrong-text-{target_type}",
+                "label": f"{_type_label(target_type)} nicht im eingereichten Text",
+                "passed": False,
+                "severity": "HIGH",
+                "level": 1,
+                "parent": None,
+                "skipped": False,
+                "matched_text": "",
+                "hint": (
+                    f"Der eingereichte Text enthaelt keine "
+                    f"{_type_label(target_type)}. Moeglicherweise wurde "
+                    f"die falsche URL eingegeben. Das System konnte die "
+                    f"{_type_label(target_type)} auch in keinem anderen "
+                    f"eingereichten Dokument finden."
+                ),
+                "source": "cross_document_search",
+                "doc_type": target_type,
+            })
+            logger.info("Cross-doc: %s text doesn't match doc_type, not found elsewhere", target_type)
 
     return findings