feat(cross-doc): search all texts for all doc_types + misplacement finding

Cross-Document Intelligence: When a doc_type row is empty, the pipeline searches ALL other loaded documents for that content. If the content is found (e.g. Widerruf in AGB), it extracts the section, runs the check, AND creates a finding: "Widerrufsbelehrung in falschem Dokument gefunden — schwer auffindbar". Keywords are defined for: widerruf, cookie, social_media, impressum, agb, dsb. Integrated as Step 1c in the compliance check pipeline. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-14 23:19:39 +02:00
parent 29fbd03c79
commit 4e9043f26d
2 changed files with 176 additions and 2 deletions
@@ -178,11 +178,16 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
        # 1. Same URL used for multiple doc_types → split by heading
        # 2. DSI text contains Cookie/Social-Media sections → auto-fill empty rows
        from compliance.services.section_splitter import (
-            split_shared_texts, auto_fill_from_dsi,
+            split_shared_texts, auto_fill_from_dsi, cross_search_documents,
        )
        split_shared_texts(doc_entries, url_text_cache)
        auto_fill_from_dsi(doc_entries)
-        # Refresh doc_texts after splitting
+
+        # Step 1c: Cross-document search — find doc_types in wrong documents
+        _update(check_id, "Dokumente werden uebergreifend durchsucht...")
+        placement_findings = cross_search_documents(doc_entries)
+
+        # Refresh doc_texts after all splitting/searching
        for entry in doc_entries:
            if entry.get("text"):
                doc_texts[entry["doc_type"]] = entry["text"]
@@ -232,6 +237,13 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
            # Apply profile context filter
            result = _apply_profile_filter(result, profile, doc_type)

+            # Add placement findings (doc found in wrong location)
+            for pf in placement_findings:
+                if pf.get("doc_type") == doc_type:
+                    result.checks.insert(0, CheckItem(**{
+                        k: v for k, v in pf.items() if k != "doc_type"
+                    }))
+
            results.append(result)
            total_findings += result.findings_count

@@ -213,3 +213,165 @@ def auto_fill_from_dsi(doc_entries: list[dict]) -> None:
            "Auto-filled %d empty rows from DSI sections: %s",
            len(filled), ", ".join(filled),
        )
+
+
+# ── Cross-Document Search ────────────────────────────────────────────
+
+# Keywords that indicate a doc_type is present in a text.
+#
+# Matching is done with plain substring tests against the lower-cased source
+# text, so every keyword here MUST be lower-case. German terms are listed in
+# both spellings (umlaut and ae/oe/ue transliteration) and paragraph
+# references with and without a space after "§", because scraped pages use
+# all of these variants.
+_DOC_TYPE_KEYWORDS: dict[str, list[str]] = {
+    "widerruf": [
+        "widerrufsrecht", "widerrufsbelehrung", "widerrufsfrist",
+        "binnen 14 tagen", "widerruf erklaeren", "widerruf erklären",
+        "muster-widerrufsformular",
+    ],
+    "cookie": [
+        "cookie-richtlinie", "cookie-tabelle", "cookiebot", "consent-tool",
+        "arten der cookies", "session-cookie", "tracking-cookie",
+    ],
+    "social_media": [
+        "gemeinsam verantwortlich", "art. 26 dsgvo", "fanpage",
+        "social media plugin", "facebook-seite", "instagram-profil",
+    ],
+    "impressum": [
+        "angaben gemaess", "angaben gemäß", "§ 5 tmg", "§5 tmg",
+        "telemediengesetz", "impressum",
+        # § 5 DDG replaced § 5 TMG in May 2024; newer imprints cite the DDG.
+        "§ 5 ddg", "§5 ddg", "digitale-dienste-gesetz",
+    ],
+    "agb": [
+        "allgemeine geschaeftsbedingungen", "allgemeine geschäftsbedingungen",
+        "geltungsbereich", "vertragsschluss", "§305 bgb", "§ 305 bgb",
+    ],
+    "dsb": [
+        "datenschutzbeauftragte", "dsb@", "dpo@",
+        "datenschutzbeauftragten",
+    ],
+}
+
+
+def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
+    """Fill empty doc_type rows from other documents and report misplacement.
+
+    For every entry whose text is missing or has at most 50 words, all other
+    entries' texts are scanned for the keywords registered for that entry's
+    doc_type in ``_DOC_TYPE_KEYWORDS``. The source with the most distinct
+    keyword hits (at least 2) from which a section of at least 30 words can
+    be extracted wins.
+
+    Side effects: a matching entry is mutated in place — ``text``,
+    ``word_count`` and ``url`` are overwritten with the extracted section,
+    its word count, and a "(gefunden in <SOURCE>)" marker.
+
+    Args:
+        doc_entries: Rows with at least a ``doc_type`` key and optional
+            ``text`` / ``url`` keys.
+
+    Returns:
+        One finding dict per filled row (empty list if nothing was filled).
+        Each finding carries a ``doc_type`` key naming the row it belongs to.
+    """
+    findings: list[dict] = []
+
+    # Snapshot of usable sources, taken before any row is filled, so that a
+    # section extracted during this pass is never used as a source itself.
+    all_texts: list[tuple[str, str, str]] = [
+        (entry["doc_type"], entry.get("url", ""), entry["text"])
+        for entry in doc_entries
+        if entry.get("text") and len(entry["text"]) > 100
+    ]
+    if not all_texts:
+        return findings
+
+    for entry in doc_entries:
+        if entry.get("text") and len(entry["text"].split()) > 50:
+            continue  # Already has content
+
+        target_type = entry["doc_type"]
+        keywords = _DOC_TYPE_KEYWORDS.get(target_type, [])
+        if not keywords:
+            continue
+
+        best_match: dict | None = None
+        best_score = 0
+
+        for source_type, source_url, source_text in all_texts:
+            if source_type == target_type:
+                continue  # Don't search in the same doc_type
+
+            text_lower = source_text.lower()
+            score = sum(1 for kw in keywords if kw in text_lower)
+            if score < 2 or score <= best_score:
+                continue
+
+            section = _extract_section_by_keywords(source_text, keywords)
+            if not section or len(section.split()) < 30:
+                continue
+
+            # Raise the bar only once a usable section exists; otherwise a
+            # high-scoring source without an extractable section would hide
+            # later sources that do contain the content.
+            best_score = score
+            best_match = {
+                "source_type": source_type,
+                "source_url": source_url,
+                "section_text": section,
+                "keyword_hits": score,
+            }
+
+        if best_match is None:
+            continue
+
+        entry["text"] = best_match["section_text"]
+        entry["word_count"] = len(best_match["section_text"].split())
+        source_label = best_match["source_type"].upper()
+        entry["url"] = f"(gefunden in {source_label})"
+
+        findings.append({
+            "id": f"placement-{target_type}",
+            "label": f"{_type_label(target_type)} in falschem Dokument",
+            "passed": False,
+            "severity": "MEDIUM",
+            "level": 1,
+            "parent": None,
+            "skipped": False,
+            "matched_text": "",
+            "hint": _placement_hint(
+                target_type, source_label, best_match["source_url"],
+            ),
+            "source": "cross_document_search",
+            "doc_type": target_type,
+        })
+
+        logger.info(
+            "Cross-doc: Found %s in %s (%d keywords, %d words)",
+            target_type, best_match["source_type"],
+            best_match["keyword_hits"],
+            entry["word_count"],
+        )
+
+    return findings
+
+
+def _placement_hint(
+    target_type: str, source_label: str, source_url: str,
+) -> str:
+    """Build the German user-facing hint text for a misplacement finding."""
+    label = _type_label(target_type)
+    if target_type == "widerruf":
+        # Art. 246a EGBGB / §312d BGB concern the withdrawal information
+        # only, so they are cited for this doc_type and no other.
+        requirement = (
+            f"Gemaess Art. 246a EGBGB / "
+            f"§312d BGB muss die {label} leicht "
+            f"auffindbar und klar erkennbar sein. "
+        )
+    else:
+        # NOTE(review): no statute is cited for the other doc_types here;
+        # add the correct legal basis per type once confirmed by legal.
+        requirement = (
+            f"Die {label} sollte leicht "
+            f"auffindbar und klar erkennbar sein. "
+        )
+    return (
+        f"Die {label} wurde nicht als eigenes "
+        f"Dokument gefunden, sondern in der/den {source_label} "
+        f"({source_url}). {requirement}Empfehlung: Als "
+        f"eigenen Link im Footer oder als separates Dokument "
+        f"bereitstellen."
+    )
+
+
+def _extract_section_by_keywords(
+    text: str, keywords: list[str],
+) -> str | None:
+    """Extract the part of *text* that spans all keyword matches.
+
+    The text is split on newlines. The section runs from the first to the
+    last line containing any keyword (case-insensitive substring match),
+    widened by 5 lines of context before and 10 lines after.
+
+    Args:
+        text: Full document text.
+        keywords: Keywords to look for; empty strings are ignored.
+
+    Returns:
+        The extracted section, or ``None`` if no line matches or the section
+        has fewer than 30 whitespace-separated words.
+    """
+    needles = [kw.lower() for kw in keywords if kw]
+    if not needles:
+        return None
+
+    lines = text.split("\n")
+    hit_lines: list[int] = []
+    for i, line in enumerate(lines):
+        line_lower = line.lower()
+        if any(kw in line_lower for kw in needles):
+            hit_lines.append(i)
+
+    # A single matching line is a valid hit: scraped pages often deliver a
+    # whole paragraph (or section) as one line.
+    if not hit_lines:
+        return None
+    first_line, last_line = hit_lines[0], hit_lines[-1]
+
+    # Expand to include context (5 lines before first, 10 after last).
+    # The slice end is exclusive, hence the extra +1 to keep the last hit
+    # line itself plus 10 full lines after it.
+    start = max(0, first_line - 5)
+    end = min(len(lines), last_line + 1 + 10)
+
+    section = "\n".join(lines[start:end])
+    return section if len(section.split()) >= 30 else None
+
+
+def _type_label(doc_type: str) -> str:
+    """Return the German display name for *doc_type*.
+
+    A doc_type without a registered display name is returned unchanged.
+    """
+    known_labels = dict(
+        widerruf="Widerrufsbelehrung",
+        cookie="Cookie-Richtlinie",
+        social_media="Social-Media-Datenschutz",
+        impressum="Impressum",
+        agb="AGB",
+        dsb="DSB-Kontakt",
+        dse="Datenschutzerklaerung",
+    )
+    try:
+        return known_labels[doc_type]
+    except KeyError:
+        # Unknown types fall back to their raw identifier.
+        return doc_type