From bd2d6976d693aced30b2f7120af851ea336d2bee Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 15 May 2026 00:19:40 +0200 Subject: [PATCH] fix(cross-doc): also check entries with wrong text, not just empty ones Cross-search now validates if existing text matches the expected doc_type using keyword scoring. If text is present but doesn't match (e.g. Nutzungsbedingungen in Widerruf row), searches other texts and creates a finding explaining the mismatch. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../compliance/services/section_splitter.py | 42 ++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/backend-compliance/compliance/services/section_splitter.py b/backend-compliance/compliance/services/section_splitter.py index b7d31cdf..c9326a58 100644 --- a/backend-compliance/compliance/services/section_splitter.py +++ b/backend-compliance/compliance/services/section_splitter.py @@ -266,23 +266,33 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]: if not all_texts: return findings - # For each empty or short entry, search all other texts + # For each entry, check if: + # a) It's empty → search other texts + # b) It has text but the text doesn't match the doc_type → search other texts for entry in doc_entries: - if entry.get("text") and len(entry["text"].split()) > 50: - continue # Already has content - target_type = entry["doc_type"] keywords = _DOC_TYPE_KEYWORDS.get(target_type, []) if not keywords: continue + has_text = entry.get("text") and len(entry["text"].split()) > 50 + text_matches = False + if has_text: + # Check if the current text actually contains this doc_type's content + entry_lower = entry["text"].lower() + match_score = sum(1 for kw in keywords if kw in entry_lower) + text_matches = match_score >= 2 + + if has_text and text_matches: + continue # Text present AND matches doc_type → skip + # Search all other texts for this doc_type's keywords best_match: dict | None = None best_score = 0 for source_type, source_url, source_text in all_texts: if source_type == target_type: - continue # Don't search in the same doc_type + continue text_lower = source_text.lower() score = sum(1 for kw in keywords if kw in text_lower) @@ -333,6 +343,28 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]: best_match["keyword_hits"], entry["word_count"], ) + elif has_text and not text_matches: + # Text present but doesn't match — wrong text assigned + findings.append({ + "id": f"wrong-text-{target_type}", + "label": f"{_type_label(target_type)} nicht im eingereichten Text", + "passed": False, + "severity": "HIGH", + "level": 1, + "parent": None, + "skipped": False, + "matched_text": "", + "hint": ( + f"Der eingereichte Text enthaelt keine " + f"{_type_label(target_type)}. Moeglicherweise wurde " + f"die falsche URL eingegeben. Das System konnte die " + f"{_type_label(target_type)} auch in keinem anderen " + f"eingereichten Dokument finden." + ), + "source": "cross_document_search", + "doc_type": target_type, + }) + logger.info("Cross-doc: %s text doesn't match doc_type, not found elsewhere", target_type) return findings