fix(cross-doc): also check entries with wrong text, not just empty ones

Cross-search now validates whether the existing text matches the expected doc_type using keyword scoring. If text is present but doesn't match (e.g. Nutzungsbedingungen in the Widerruf row), it searches the other texts and creates a finding explaining the mismatch.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-15 00:19:40 +02:00
parent a5d1814605
commit bd2d6976d6
1 changed file with 37 additions and 5 deletions
@@ -266,23 +266,33 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
    if not all_texts:
        return findings
-    # For each empty or short entry, search all other texts
+    # For each entry, check if:
    # a) It's empty → search other texts
    # b) It has text but the text doesn't match the doc_type → search other texts
    for entry in doc_entries:
        if entry.get("text") and len(entry["text"].split()) > 50:
            continue  # Already has content
        target_type = entry["doc_type"]
        keywords = _DOC_TYPE_KEYWORDS.get(target_type, [])
        if not keywords:
            continue
        has_text = entry.get("text") and len(entry["text"].split()) > 50
        text_matches = False
        if has_text:
            # Check if the current text actually contains this doc_type's content
            entry_lower = entry["text"].lower()
            match_score = sum(1 for kw in keywords if kw in entry_lower)
            text_matches = match_score >= 2
        if has_text and text_matches:
            continue  # Text present AND matches doc_type → skip
        # Search all other texts for this doc_type's keywords
        best_match: dict | None = None
        best_score = 0
        for source_type, source_url, source_text in all_texts:
            if source_type == target_type:
-                continue  # Don't search in the same doc_type
+                continue
            text_lower = source_text.lower()
            score = sum(1 for kw in keywords if kw in text_lower)
@@ -333,6 +343,28 @@ def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
                best_match["keyword_hits"],
                entry["word_count"],
            )
        elif has_text and not text_matches:
            # Text present but doesn't match — wrong text assigned
            findings.append({
                "id": f"wrong-text-{target_type}",
                "label": f"{_type_label(target_type)} nicht im eingereichten Text",
                "passed": False,
                "severity": "HIGH",
                "level": 1,
                "parent": None,
                "skipped": False,
                "matched_text": "",
                "hint": (
                    f"Der eingereichte Text enthaelt keine "
                    f"{_type_label(target_type)}. Moeglicherweise wurde "
                    f"die falsche URL eingegeben. Das System konnte die "
                    f"{_type_label(target_type)} auch in keinem anderen "
                    f"eingereichten Dokument finden."
                ),
                "source": "cross_document_search",
                "doc_type": target_type,
            })
            logger.info("Cross-doc: %s text doesn't match doc_type, not found elsewhere", target_type)
    return findings