feat(qa): recital detection, review split, duplicate comparison

Add _detect_recital() to the QA pipeline. It flags controls whose source_original_text contains Erwägungsgrund (recital) markers instead of article text (28% of controls with source text are affected).

- Recital detection via regex + phrase matching in QA validation
- 10 new tests (TestRecitalDetection), 81 total
- ReviewCompare component for side-by-side duplicate comparison
- Review mode split: Duplikat-Verdacht vs Rule-3-ohne-Anchor tabs
- MkDocs: recital detection documentation
- Detection script for bulk analysis (scripts/find_recital_controls.py)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 08:20:02 +01:00
parent a9e0869205
commit 148c7ba3af
7 changed files with 657 additions and 28 deletions
@@ -321,6 +321,62 @@ VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys())
 VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
                 "AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}

+# ---------------------------------------------------------------------------
+# Recital (Erwägungsgrund) detection in source text
+# ---------------------------------------------------------------------------
+
+# Matches a parenthesised 1-3 digit number followed by optional whitespace and
+# a line break, e.g. "(125)" on a line of its own; the digits are captured as
+# group 1.
+# NOTE(review): the pattern is not anchored to the start of a line, so an
+# inline reference that happens to end a line (e.g. "... Absatz (3)" before a
+# line break) matches as well — confirm whether that is intended.
+_RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')
+
+# Lowercase phrases typical of recitals (Erwägungsgründe) in German EU law.
+# All entries are lowercase so they can be compared against lowercased text.
+_RECITAL_PHRASES = [
+    "in erwägung nachstehender gründe",
+    "erwägungsgrund",
+    "in anbetracht",
+    "daher sollte",
+    "aus diesem grund",
+    "es ist daher",
+    "folglich sollte",
+    "es sollte daher",
+    "in diesem zusammenhang",
+]
+
+
+def _detect_recital(text: str) -> Optional[dict]:
+    """Detect if source text is a recital (Erwägungsgrund) rather than an article.
+
+    Returns a dict with detection details if recital markers are found,
+    or None if the text appears to be genuine article text.
+
+    Detection criteria:
+    1. Standalone recital numbers like (126)\\n in the text
+    2. Recital-typical phrasing ("daher sollte", "erwägungsgrund", etc.)
+    """
+    if not text:
+        return None
+
+    # Check 1: Recital number markers
+    recital_matches = _RECITAL_RE.findall(text)
+
+    # Check 2: Recital phrasing
+    text_lower = text.lower()
+    phrase_hits = [p for p in _RECITAL_PHRASES if p in text_lower]
+
+    if not recital_matches and not phrase_hits:
+        return None
+
+    # Require at least recital numbers OR >=2 phrase hits to be a suspect
+    if not recital_matches and len(phrase_hits) < 2:
+        return None
+
+    return {
+        "recital_suspect": True,
+        "recital_numbers": recital_matches[:10],
+        "recital_phrases": phrase_hits[:5],
+        "detection_method": "regex+phrases" if recital_matches and phrase_hits
+                           else "regex" if recital_matches else "phrases",
+    }
+
 CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))

 VERIFICATION_KEYWORDS = {
@@ -1520,9 +1576,23 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
    ) -> tuple[GeneratedControl, bool]:
        """Cross-validate category/domain using keyword detection + local LLM.

+        Also checks for recital (Erwägungsgrund) contamination in source text.
        Returns (control, was_fixed). Only triggers Ollama QA when the LLM
        classification disagrees with keyword detection — keeps it fast.
        """
+        # ── Recital detection ──────────────────────────────────────────
+        source_text = control.source_original_text or ""
+        recital_info = _detect_recital(source_text)
+        if recital_info:
+            control.generation_metadata["recital_suspect"] = True
+            control.generation_metadata["recital_detection"] = recital_info
+            control.release_state = "needs_review"
+            logger.warning(
+                "Recital suspect: '%s' — recitals %s detected in source text",
+                control.title[:40],
+                recital_info.get("recital_numbers", []),
+            )
+
        kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
        kw_domain = _detect_domain(chunk_text)
        llm_domain = control.generation_metadata.get("_effective_domain", "")
@@ -7,6 +7,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
 from compliance.services.control_generator import (
    _classify_regulation,
    _detect_domain,
+    _detect_recital,
    _parse_llm_json,
    _parse_llm_json_array,
    GeneratorConfig,
@@ -1306,3 +1307,92 @@ class TestPipelineVersion:
        assert controls[0] is not None
        assert controls[1] is None  # Null entry from LLM
        assert controls[2] is not None
+
+
+# =============================================================================
+# Recital (Erwägungsgrund) Detection Tests
+# =============================================================================
+
+class TestRecitalDetection:
+    """Tests for _detect_recital — identifying Erwägungsgrund text in source."""
+
+    def test_recital_number_detected(self):
+        """Text with (126)\\n pattern is flagged as recital suspect."""
+        text = "Daher ist es wichtig...\n(126)\nDie Konformitätsbewertung sollte..."
+        result = _detect_recital(text)
+        assert result is not None
+        assert result["recital_suspect"] is True
+        assert "126" in result["recital_numbers"]
+
+    def test_multiple_recital_numbers(self):
+        """Multiple recital markers are all captured."""
+        text = "(124)\nErster Punkt.\n(125)\nZweiter Punkt.\n(126)\nDritter Punkt."
+        result = _detect_recital(text)
+        assert result is not None
+        assert "124" in result["recital_numbers"]
+        assert "125" in result["recital_numbers"]
+        assert "126" in result["recital_numbers"]
+
+    def test_article_text_not_flagged(self):
+        """Normal article text without recital markers returns None."""
+        text = ("Der Anbieter eines Hochrisiko-KI-Systems muss sicherstellen, "
+                "dass die technische Dokumentation erstellt wird.")
+        result = _detect_recital(text)
+        assert result is None
+
+    def test_empty_text_returns_none(self):
+        result = _detect_recital("")
+        assert result is None
+
+    def test_none_text_returns_none(self):
+        result = _detect_recital(None)
+        assert result is None
+
+    def test_recital_phrases_detected(self):
+        """Text with multiple recital-typical phrases is flagged."""
+        text = ("In Erwägung nachstehender Gründe wurde beschlossen, "
+                "daher sollte der Anbieter folgende Maßnahmen ergreifen. "
+                "Es ist daher notwendig, die Konformität sicherzustellen.")
+        result = _detect_recital(text)
+        assert result is not None
+        assert result["detection_method"] == "phrases"
+
+    def test_single_phrase_not_enough(self):
+        """A single recital phrase alone is not sufficient for detection."""
+        text = "Daher sollte das System regelmäßig geprüft werden."
+        result = _detect_recital(text)
+        assert result is None
+
+    def test_combined_regex_and_phrases(self):
+        """Both recital numbers and phrases → detection_method is regex+phrases."""
+        text = "(42)\nIn Erwägung nachstehender Gründe wurde entschieden..."
+        result = _detect_recital(text)
+        assert result is not None
+        assert result["detection_method"] == "regex+phrases"
+        assert "42" in result["recital_numbers"]
+
+    def test_parenthesized_number_without_newline_ignored(self):
+        """Numbers in parentheses without trailing newline are not recital markers.
+        e.g. 'gemäß Absatz (3) des Artikels' should not be flagged."""
+        text = "Gemäß Absatz (3) des Artikels 52 muss der Anbieter sicherstellen..."
+        result = _detect_recital(text)
+        assert result is None
+
+    def test_real_world_recital_text(self):
+        """Real-world example: AI Act Erwägungsgrund (126) about conformity assessment."""
+        text = (
+            "(126)\n"
+            "Um den Verwaltungsaufwand zu verringern und die Konformitätsbewertung "
+            "zu vereinfachen, sollten bestimmte Hochrisiko-KI-Systeme, die von "
+            "Anbietern zertifiziert oder für die eine Konformitätserklärung "
+            "ausgestellt wurde, automatisch als konform mit den Anforderungen "
+            "dieser Verordnung gelten, sofern sie den harmonisierten Normen oder "
+            "gemeinsamen Spezifikationen entsprechen.\n"
+            "(127)\n"
+            "Es ist daher angezeigt, dass der Anbieter das entsprechende "
+            "Konformitätsbewertungsverfahren anwendet."
+        )
+        result = _detect_recital(text)
+        assert result is not None
+        assert "126" in result["recital_numbers"]
+        assert "127" in result["recital_numbers"]