fix(cross-doc): also check entries with wrong text, not just empty ones

Cross-search now validates whether an entry's existing text matches the
expected doc_type using keyword scoring. If text is present but doesn't
match (e.g. Nutzungsbedingungen in the Widerruf row), it searches the other
texts and creates a finding explaining the mismatch.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-15 00:19:40 +02:00
parent a5d1814605
commit bd2d6976d6
@@ -266,23 +266,33 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
if not all_texts:
return findings
# For each empty or short entry, search all other texts
# For each entry, check if:
# a) It's empty → search other texts
# b) It has text but the text doesn't match the doc_type → search other texts
for entry in doc_entries:
if entry.get("text") and len(entry["text"].split()) > 50:
continue # Already has content
target_type = entry["doc_type"]
keywords = _DOC_TYPE_KEYWORDS.get(target_type, [])
if not keywords:
continue
has_text = entry.get("text") and len(entry["text"].split()) > 50
text_matches = False
if has_text:
# Check if the current text actually contains this doc_type's content
entry_lower = entry["text"].lower()
match_score = sum(1 for kw in keywords if kw in entry_lower)
text_matches = match_score >= 2
if has_text and text_matches:
continue # Text present AND matches doc_type → skip
# Search all other texts for this doc_type's keywords
best_match: dict | None = None
best_score = 0
for source_type, source_url, source_text in all_texts:
if source_type == target_type:
continue # Don't search in the same doc_type
continue
text_lower = source_text.lower()
score = sum(1 for kw in keywords if kw in text_lower)
@@ -333,6 +343,28 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
best_match["keyword_hits"],
entry["word_count"],
)
elif has_text and not text_matches:
# Text present but doesn't match — wrong text assigned
findings.append({
"id": f"wrong-text-{target_type}",
"label": f"{_type_label(target_type)} nicht im eingereichten Text",
"passed": False,
"severity": "HIGH",
"level": 1,
"parent": None,
"skipped": False,
"matched_text": "",
"hint": (
f"Der eingereichte Text enthaelt keine "
f"{_type_label(target_type)}. Moeglicherweise wurde "
f"die falsche URL eingegeben. Das System konnte die "
f"{_type_label(target_type)} auch in keinem anderen "
f"eingereichten Dokument finden."
),
"source": "cross_document_search",
"doc_type": target_type,
})
logger.info("Cross-doc: %s text doesn't match doc_type, not found elsewhere", target_type)
return findings