fix(cross-doc): also check entries with wrong text, not just empty ones
Cross-search now validates whether an entry's existing text matches the expected doc_type, using keyword scoring. If text is present but doesn't match (e.g. Nutzungsbedingungen in the Widerruf row), it searches the other texts and creates a finding explaining the mismatch.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -266,23 +266,33 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
|
||||
if not all_texts:
|
||||
return findings
|
||||
|
||||
# For each empty or short entry, search all other texts
|
||||
# For each entry, check if:
|
||||
# a) It's empty → search other texts
|
||||
# b) It has text but the text doesn't match the doc_type → search other texts
|
||||
for entry in doc_entries:
|
||||
if entry.get("text") and len(entry["text"].split()) > 50:
|
||||
continue # Already has content
|
||||
|
||||
target_type = entry["doc_type"]
|
||||
keywords = _DOC_TYPE_KEYWORDS.get(target_type, [])
|
||||
if not keywords:
|
||||
continue
|
||||
|
||||
has_text = entry.get("text") and len(entry["text"].split()) > 50
|
||||
text_matches = False
|
||||
if has_text:
|
||||
# Check if the current text actually contains this doc_type's content
|
||||
entry_lower = entry["text"].lower()
|
||||
match_score = sum(1 for kw in keywords if kw in entry_lower)
|
||||
text_matches = match_score >= 2
|
||||
|
||||
if has_text and text_matches:
|
||||
continue # Text present AND matches doc_type → skip
|
||||
|
||||
# Search all other texts for this doc_type's keywords
|
||||
best_match: dict | None = None
|
||||
best_score = 0
|
||||
|
||||
for source_type, source_url, source_text in all_texts:
|
||||
if source_type == target_type:
|
||||
continue # Don't search in the same doc_type
|
||||
continue
|
||||
|
||||
text_lower = source_text.lower()
|
||||
score = sum(1 for kw in keywords if kw in text_lower)
|
||||
@@ -333,6 +343,28 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
|
||||
best_match["keyword_hits"],
|
||||
entry["word_count"],
|
||||
)
|
||||
elif has_text and not text_matches:
|
||||
# Text present but doesn't match — wrong text assigned
|
||||
findings.append({
|
||||
"id": f"wrong-text-{target_type}",
|
||||
"label": f"{_type_label(target_type)} nicht im eingereichten Text",
|
||||
"passed": False,
|
||||
"severity": "HIGH",
|
||||
"level": 1,
|
||||
"parent": None,
|
||||
"skipped": False,
|
||||
"matched_text": "",
|
||||
"hint": (
|
||||
f"Der eingereichte Text enthaelt keine "
|
||||
f"{_type_label(target_type)}. Moeglicherweise wurde "
|
||||
f"die falsche URL eingegeben. Das System konnte die "
|
||||
f"{_type_label(target_type)} auch in keinem anderen "
|
||||
f"eingereichten Dokument finden."
|
||||
),
|
||||
"source": "cross_document_search",
|
||||
"doc_type": target_type,
|
||||
})
|
||||
logger.info("Cross-doc: %s text doesn't match doc_type, not found elsewhere", target_type)
|
||||
|
||||
return findings
|
||||
|
||||
|
||||
Reference in New Issue
Block a user