feat(cross-doc): search all texts for all doc_types + misplacement finding
Cross-Document Intelligence: When a doc_type row is empty, the pipeline searches ALL other loaded documents for that content. If it is found (e.g. Widerruf in AGB), it extracts the section, runs the check, AND creates a finding: "Widerrufsbelehrung in falschem Dokument gefunden — schwer auffindbar". Keywords are defined for: widerruf, cookie, social_media, impressum, agb, dsb. Integrated as Step 1c in the compliance check pipeline. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -178,11 +178,16 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
|||||||
# 1. Same URL used for multiple doc_types → split by heading
|
# 1. Same URL used for multiple doc_types → split by heading
|
||||||
# 2. DSI text contains Cookie/Social-Media sections → auto-fill empty rows
|
# 2. DSI text contains Cookie/Social-Media sections → auto-fill empty rows
|
||||||
from compliance.services.section_splitter import (
|
from compliance.services.section_splitter import (
|
||||||
split_shared_texts, auto_fill_from_dsi,
|
split_shared_texts, auto_fill_from_dsi, cross_search_documents,
|
||||||
)
|
)
|
||||||
split_shared_texts(doc_entries, url_text_cache)
|
split_shared_texts(doc_entries, url_text_cache)
|
||||||
auto_fill_from_dsi(doc_entries)
|
auto_fill_from_dsi(doc_entries)
|
||||||
# Refresh doc_texts after splitting
|
|
||||||
|
# Step 1c: Cross-document search — find doc_types in wrong documents
|
||||||
|
_update(check_id, "Dokumente werden uebergreifend durchsucht...")
|
||||||
|
placement_findings = cross_search_documents(doc_entries)
|
||||||
|
|
||||||
|
# Refresh doc_texts after all splitting/searching
|
||||||
for entry in doc_entries:
|
for entry in doc_entries:
|
||||||
if entry.get("text"):
|
if entry.get("text"):
|
||||||
doc_texts[entry["doc_type"]] = entry["text"]
|
doc_texts[entry["doc_type"]] = entry["text"]
|
||||||
@@ -232,6 +237,13 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
|||||||
# Apply profile context filter
|
# Apply profile context filter
|
||||||
result = _apply_profile_filter(result, profile, doc_type)
|
result = _apply_profile_filter(result, profile, doc_type)
|
||||||
|
|
||||||
|
# Add placement findings (doc found in wrong location)
|
||||||
|
for pf in placement_findings:
|
||||||
|
if pf.get("doc_type") == doc_type:
|
||||||
|
result.checks.insert(0, CheckItem(**{
|
||||||
|
k: v for k, v in pf.items() if k != "doc_type"
|
||||||
|
}))
|
||||||
|
|
||||||
results.append(result)
|
results.append(result)
|
||||||
total_findings += result.findings_count
|
total_findings += result.findings_count
|
||||||
|
|
||||||
|
|||||||
@@ -213,3 +213,165 @@ def auto_fill_from_dsi(doc_entries: list[dict]) -> None:
|
|||||||
"Auto-filled %d empty rows from DSI sections: %s",
|
"Auto-filled %d empty rows from DSI sections: %s",
|
||||||
len(filled), ", ".join(filled),
|
len(filled), ", ".join(filled),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Cross-Document Search ────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Keywords that indicate a doc_type is present in text (case-insensitive)
|
||||||
|
_DOC_TYPE_KEYWORDS = {
|
||||||
|
"widerruf": [
|
||||||
|
"widerrufsrecht", "widerrufsbelehrung", "widerrufsfrist",
|
||||||
|
"binnen 14 tagen", "widerruf erklaeren", "muster-widerrufsformular",
|
||||||
|
],
|
||||||
|
"cookie": [
|
||||||
|
"cookie-richtlinie", "cookie-tabelle", "cookiebot", "consent-tool",
|
||||||
|
"arten der cookies", "session-cookie", "tracking-cookie",
|
||||||
|
],
|
||||||
|
"social_media": [
|
||||||
|
"gemeinsam verantwortlich", "art. 26 dsgvo", "fanpage",
|
||||||
|
"social media plugin", "facebook-seite", "instagram-profil",
|
||||||
|
],
|
||||||
|
"impressum": [
|
||||||
|
"angaben gemaess", "angaben gemäß", "§ 5 tmg", "§5 tmg",
|
||||||
|
"telemediengesetz", "impressum",
|
||||||
|
],
|
||||||
|
"agb": [
|
||||||
|
"allgemeine geschaeftsbedingungen", "allgemeine geschäftsbedingungen",
|
||||||
|
"geltungsbereich", "vertragsschluss", "§305 bgb",
|
||||||
|
],
|
||||||
|
"dsb": [
|
||||||
|
"datenschutzbeauftragte", "dsb@", "dpo@",
|
||||||
|
"datenschutzbeauftragten",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def cross_search_documents(doc_entries: list[dict]) -> list[dict]:
    """Fill empty doc_type rows from other documents and report misplacement.

    Every entry whose text is missing or has at most 50 words is treated as
    a search target. All entries with more than 100 characters of text are
    searched for the target's keywords (see ``_DOC_TYPE_KEYWORDS``); a source
    qualifies with at least two distinct keyword hits. The qualifying source
    with the most hits whose section can actually be extracted wins.

    Args:
        doc_entries: Document rows with at least a ``"doc_type"`` key and
            optionally ``"text"`` / ``"url"``. Matched targets are mutated
            in place: ``"text"``, ``"word_count"`` and ``"url"`` are
            overwritten.

    Returns:
        One finding dict per filled target (misplacement warning). Each
        finding carries the ``"doc_type"`` of the target it belongs to.
    """
    findings: list[dict] = []

    # Snapshot the searchable sources up front so that sections assigned to
    # targets below are never used as sources themselves. The lowercased
    # text is computed once per source instead of once per target/source pair.
    sources: list[tuple[str, str, str, str]] = [
        (
            entry["doc_type"],
            entry.get("url", ""),
            entry["text"],
            entry["text"].lower(),
        )
        for entry in doc_entries
        if entry.get("text") and len(entry["text"]) > 100
    ]
    if not sources:
        return findings

    for entry in doc_entries:
        if entry.get("text") and len(entry["text"].split()) > 50:
            continue  # Already has content

        target_type = entry["doc_type"]
        keywords = _DOC_TYPE_KEYWORDS.get(target_type, [])
        if not keywords:
            continue

        best_match: dict | None = None
        best_score = 0

        for source_type, source_url, source_text, source_lower in sources:
            if source_type == target_type:
                continue  # Don't search in the same doc_type

            score = sum(1 for kw in keywords if kw in source_lower)
            if score < 2 or score <= best_score:
                continue

            section = _extract_section_by_keywords(source_text, keywords)
            if not section or len(section.split()) < 30:
                # Unusable candidate: do NOT raise best_score here, otherwise
                # a later source with fewer hits but an extractable section
                # would be rejected although nothing better was found.
                continue

            best_score = score
            best_match = {
                "source_type": source_type,
                "source_url": source_url,
                "section_text": section,
                "keyword_hits": score,
            }

        if not best_match:
            continue

        section_text = best_match["section_text"]
        entry["text"] = section_text
        entry["word_count"] = len(section_text.split())
        source_label = best_match["source_type"].upper()
        entry["url"] = f"(gefunden in {source_label})"

        label = _type_label(target_type)
        # NOTE(review): the hint cites Art. 246a EGBGB / §312d BGB for every
        # doc_type; confirm that this legal basis is appropriate for targets
        # other than "widerruf".
        findings.append({
            "id": f"placement-{target_type}",
            "label": f"{label} in falschem Dokument",
            "passed": False,
            "severity": "MEDIUM",
            "level": 1,
            "parent": None,
            "skipped": False,
            "matched_text": "",
            "hint": (
                f"Die {label} wurde nicht als eigenes "
                f"Dokument gefunden, sondern in der/den {source_label} "
                f"({best_match['source_url']}). Gemaess Art. 246a EGBGB / "
                f"§312d BGB muss die {label} leicht "
                f"auffindbar und klar erkennbar sein. Empfehlung: Als "
                f"eigenen Link im Footer oder als separates Dokument "
                f"bereitstellen."
            ),
            "source": "cross_document_search",
            "doc_type": target_type,
        })

        logger.info(
            "Cross-doc: Found %s in %s (%d keywords, %d words)",
            target_type, best_match["source_type"],
            best_match["keyword_hits"],
            entry["word_count"],
        )

    return findings
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_section_by_keywords(
|
||||||
|
text: str, keywords: list[str],
|
||||||
|
) -> str | None:
|
||||||
|
"""Extract the section of text around the keyword matches."""
|
||||||
|
text_lower = text.lower()
|
||||||
|
lines = text.split("\n")
|
||||||
|
|
||||||
|
# Find first and last line containing any keyword
|
||||||
|
first_line = len(lines)
|
||||||
|
last_line = 0
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
line_lower = line.lower()
|
||||||
|
if any(kw in line_lower for kw in keywords):
|
||||||
|
first_line = min(first_line, i)
|
||||||
|
last_line = max(last_line, i)
|
||||||
|
|
||||||
|
if first_line >= last_line:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Expand to include context (5 lines before first, 10 after last)
|
||||||
|
start = max(0, first_line - 5)
|
||||||
|
end = min(len(lines), last_line + 10)
|
||||||
|
|
||||||
|
section = "\n".join(lines[start:end])
|
||||||
|
return section if len(section.split()) >= 30 else None
|
||||||
|
|
||||||
|
|
||||||
|
def _type_label(doc_type: str) -> str:
|
||||||
|
labels = {
|
||||||
|
"widerruf": "Widerrufsbelehrung",
|
||||||
|
"cookie": "Cookie-Richtlinie",
|
||||||
|
"social_media": "Social-Media-Datenschutz",
|
||||||
|
"impressum": "Impressum",
|
||||||
|
"agb": "AGB",
|
||||||
|
"dsb": "DSB-Kontakt",
|
||||||
|
"dse": "Datenschutzerklaerung",
|
||||||
|
}
|
||||||
|
return labels.get(doc_type, doc_type)
|
||||||
|
|||||||
Reference in New Issue
Block a user