feat(qa): recital detection, review split, duplicate comparison

Add _detect_recital() to the QA pipeline. It flags controls whose source_original_text contains recital (Erwägungsgrund) markers instead of article text (28% of controls with source text are affected).

- Recital detection via regex + phrase matching in QA validation
- 10 new tests (TestRecitalDetection), 81 total
- ReviewCompare component for side-by-side duplicate comparison
- Review mode split: Duplikat-Verdacht vs Rule-3-ohne-Anchor tabs
- MkDocs: recital detection documentation
- Detection script for bulk analysis (scripts/find_recital_controls.py)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 08:20:02 +01:00
parent a9e0869205
commit 148c7ba3af
7 changed files with 657 additions and 28 deletions
@@ -321,6 +321,62 @@ VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys())
 VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
                 "AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}

+# ---------------------------------------------------------------------------
+# Recital (Erwägungsgrund) detection in source text
+# ---------------------------------------------------------------------------
+
+# Recital number like "(125)" directly before a line break (not anchored to the line start)
+_RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')
+
+# Lowercase phrasing typical of recitals (German EU law Erwägungsgründe), matched as substrings
+_RECITAL_PHRASES = [
+    "in erwägung nachstehender gründe",
+    "erwägungsgrund",
+    "in anbetracht",
+    "daher sollte",
+    "aus diesem grund",
+    "es ist daher",
+    "folglich sollte",
+    "es sollte daher",
+    "in diesem zusammenhang",
+]
+
+
+def _detect_recital(text: str) -> Optional[dict]:
+    """Detect whether source text is a recital (Erwägungsgrund), not article text.
+
+    A text is a suspect when it contains at least one recital number marker
+    (``_RECITAL_RE``) or at least two distinct entries of ``_RECITAL_PHRASES``.
+    A single phrase hit on its own is treated as too weak and is ignored.
+
+    Args:
+        text: Source text to inspect. Empty or ``None`` yields ``None``.
+
+    Returns:
+        ``None`` if the text is not a suspect, otherwise a dict with the keys
+        ``recital_suspect`` (always ``True``), ``recital_numbers`` (first 10
+        regex captures, as strings), ``recital_phrases`` (first 5 matched
+        phrases) and ``detection_method`` (``"regex+phrases"``, ``"regex"``
+        or ``"phrases"``).
+    """
+    # Function-scope import so the block does not depend on the module header.
+    import unicodedata
+
+    if not text:
+        return None
+
+    # Check 1: recital number markers such as "(126)" before a line break.
+    recital_numbers = _RECITAL_RE.findall(text)
+
+    # Check 2: recital phrasing. Compare in NFC on both sides: extracted text
+    # may carry decomposed umlauts ("a" + U+0308), which would otherwise never
+    # match phrases such as "erwägungsgrund".
+    haystack = unicodedata.normalize("NFC", text).lower()
+    phrase_hits = [
+        phrase
+        for phrase in _RECITAL_PHRASES
+        if unicodedata.normalize("NFC", phrase) in haystack
+    ]
+
+    # Require recital numbers OR >=2 phrase hits; this single guard also
+    # covers the "no evidence at all" case.
+    if not recital_numbers and len(phrase_hits) < 2:
+        return None
+
+    if recital_numbers and phrase_hits:
+        method = "regex+phrases"
+    elif recital_numbers:
+        method = "regex"
+    else:
+        method = "phrases"
+
+    return {
+        "recital_suspect": True,
+        "recital_numbers": recital_numbers[:10],
+        "recital_phrases": phrase_hits[:5],
+        "detection_method": method,
+    }
+
 CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))

 VERIFICATION_KEYWORDS = {
@@ -1520,9 +1576,23 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
    ) -> tuple[GeneratedControl, bool]:
        """Cross-validate category/domain using keyword detection + local LLM.

+        Also checks for recital (Erwägungsgrund) contamination in source text.
        Returns (control, was_fixed). Only triggers Ollama QA when the LLM
        classification disagrees with keyword detection — keeps it fast.
        """
+        # ── Recital detection ──────────────────────────────────────────
+        source_text = control.source_original_text or ""
+        recital_info = _detect_recital(source_text)
+        if recital_info:
+            control.generation_metadata["recital_suspect"] = True
+            control.generation_metadata["recital_detection"] = recital_info
+            control.release_state = "needs_review"
+            logger.warning(
+                "Recital suspect: '%s' — recitals %s detected in source text",
+                control.title[:40],
+                recital_info.get("recital_numbers", []),
+            )
+
        kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
        kw_domain = _detect_domain(chunk_text)
        llm_domain = control.generation_metadata.get("_effective_domain", "")