fix(cross-doc): also check entries with wrong text, not just empty ones
Cross-search now validates whether existing text matches the expected doc_type using keyword scoring. If text is present but doesn't match (e.g. Nutzungsbedingungen in the Widerruf row), it searches the other texts and creates a finding explaining the mismatch.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -266,23 +266,33 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
|
|||||||
if not all_texts:
|
if not all_texts:
|
||||||
return findings
|
return findings
|
||||||
|
|
||||||
# For each empty or short entry, search all other texts
|
# For each entry, check if:
|
||||||
|
# a) It's empty or too short (50 words or fewer) → search other texts
|
||||||
|
# b) It has text but the text doesn't match the doc_type → search other texts
|
||||||
for entry in doc_entries:
|
for entry in doc_entries:
|
||||||
if entry.get("text") and len(entry["text"].split()) > 50:
|
|
||||||
continue # Already has content
|
|
||||||
|
|
||||||
target_type = entry["doc_type"]
|
target_type = entry["doc_type"]
|
||||||
keywords = _DOC_TYPE_KEYWORDS.get(target_type, [])
|
keywords = _DOC_TYPE_KEYWORDS.get(target_type, [])
|
||||||
if not keywords:
|
if not keywords:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
has_text = entry.get("text") and len(entry["text"].split()) > 50
|
||||||
|
text_matches = False
|
||||||
|
if has_text:
|
||||||
|
# Check if the current text actually contains this doc_type's content
|
||||||
|
entry_lower = entry["text"].lower()
|
||||||
|
match_score = sum(1 for kw in keywords if kw in entry_lower)
|
||||||
|
text_matches = match_score >= 2
|
||||||
|
|
||||||
|
if has_text and text_matches:
|
||||||
|
continue # Text present AND matches doc_type → skip
|
||||||
|
|
||||||
# Search all other texts for this doc_type's keywords
|
# Search all other texts for this doc_type's keywords
|
||||||
best_match: dict | None = None
|
best_match: dict | None = None
|
||||||
best_score = 0
|
best_score = 0
|
||||||
|
|
||||||
for source_type, source_url, source_text in all_texts:
|
for source_type, source_url, source_text in all_texts:
|
||||||
if source_type == target_type:
|
if source_type == target_type:
|
||||||
continue # Don't search in the same doc_type
|
continue
|
||||||
|
|
||||||
text_lower = source_text.lower()
|
text_lower = source_text.lower()
|
||||||
score = sum(1 for kw in keywords if kw in text_lower)
|
score = sum(1 for kw in keywords if kw in text_lower)
|
||||||
@@ -333,6 +343,28 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
|
|||||||
best_match["keyword_hits"],
|
best_match["keyword_hits"],
|
||||||
entry["word_count"],
|
entry["word_count"],
|
||||||
)
|
)
|
||||||
|
elif has_text and not text_matches:
|
||||||
|
# Text present but doesn't match — wrong text assigned
|
||||||
|
findings.append({
|
||||||
|
"id": f"wrong-text-{target_type}",
|
||||||
|
"label": f"{_type_label(target_type)} nicht im eingereichten Text",
|
||||||
|
"passed": False,
|
||||||
|
"severity": "HIGH",
|
||||||
|
"level": 1,
|
||||||
|
"parent": None,
|
||||||
|
"skipped": False,
|
||||||
|
"matched_text": "",
|
||||||
|
"hint": (
|
||||||
|
f"Der eingereichte Text enthaelt keine "
|
||||||
|
f"{_type_label(target_type)}. Moeglicherweise wurde "
|
||||||
|
f"die falsche URL eingegeben. Das System konnte die "
|
||||||
|
f"{_type_label(target_type)} auch in keinem anderen "
|
||||||
|
f"eingereichten Dokument finden."
|
||||||
|
),
|
||||||
|
"source": "cross_document_search",
|
||||||
|
"doc_type": target_type,
|
||||||
|
})
|
||||||
|
logger.info("Cross-doc: %s text doesn't match doc_type, not found elsewhere", target_type)
|
||||||
|
|
||||||
return findings
|
return findings
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user