diff --git a/admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx b/admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx
new file mode 100644
index 0000000..5d4c92e
--- /dev/null
+++ b/admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx
@@ -0,0 +1,264 @@
+'use client'
+
+import { useState, useEffect } from 'react'
+import {
+ ArrowLeft, CheckCircle2, Trash2, Pencil, SkipForward,
+ ChevronLeft, Scale, BookOpen, ExternalLink, AlertTriangle,
+ FileText, Clock,
+} from 'lucide-react'
+import {
+ CanonicalControl, BACKEND_URL,
+ SeverityBadge, StateBadge, LicenseRuleBadge, CategoryBadge, TargetAudienceBadge,
+} from './helpers'
+
+// =============================================================================
+// Compact Control Panel (used on both sides of the comparison)
+// =============================================================================
+
+function ControlPanel({ ctrl, label, highlight }: { ctrl: CanonicalControl; label: string; highlight?: boolean }) {
+ return (
+
+ {/* Panel Header */}
+
+
{label}
+
+ {ctrl.control_id}
+
+
+
+
+
+
+
{ctrl.title}
+
+
+ {/* Panel Content */}
+
+ {/* Objective */}
+
+ Ziel
+ {ctrl.objective}
+
+
+ {/* Rationale */}
+ {ctrl.rationale && (
+
+ Begruendung
+ {ctrl.rationale}
+
+ )}
+
+ {/* Source Citation (Rule 1+2) */}
+ {ctrl.source_citation && (
+
+
+
+ Gesetzliche Grundlage
+
+ {ctrl.source_citation.source && (
+
+ {ctrl.source_citation.source}
+ {ctrl.source_citation.article && ` — ${ctrl.source_citation.article}`}
+ {ctrl.source_citation.paragraph && ` ${ctrl.source_citation.paragraph}`}
+
+ )}
+
+ )}
+
+ {/* Requirements */}
+ {ctrl.requirements.length > 0 && (
+
+ Anforderungen
+
+ {ctrl.requirements.map((r, i) => (
+ - {r}
+ ))}
+
+
+ )}
+
+ {/* Test Procedure */}
+ {ctrl.test_procedure.length > 0 && (
+
+ Pruefverfahren
+
+ {ctrl.test_procedure.map((s, i) => (
+ - {s}
+ ))}
+
+
+ )}
+
+ {/* Open Anchors */}
+ {ctrl.open_anchors.length > 0 && (
+
+
+
+ Referenzen ({ctrl.open_anchors.length})
+
+
+ {ctrl.open_anchors.map((a, i) => (
+
+
+ {a.framework}
+ {a.ref}
+
+ ))}
+
+
+ )}
+
+ {/* Tags */}
+ {ctrl.tags.length > 0 && (
+
+ {ctrl.tags.map(t => (
+ {t}
+ ))}
+
+ )}
+
+
+ )
+}
+
+// =============================================================================
+// ReviewCompare — Side-by-Side Duplicate Comparison
+// =============================================================================
+
+interface ReviewCompareProps {
+ ctrl: CanonicalControl
+ onBack: () => void
+ onReview: (controlId: string, action: string) => void
+ onEdit: () => void
+ reviewIndex: number
+ reviewTotal: number
+ onReviewPrev: () => void
+ onReviewNext: () => void
+}
+
+export function ReviewCompare({
+ ctrl,
+ onBack,
+ onReview,
+ onEdit,
+ reviewIndex,
+ reviewTotal,
+ onReviewPrev,
+ onReviewNext,
+}: ReviewCompareProps) {
+ const [suspectedDuplicate, setSuspectedDuplicate] = useState(null)
+ const [loading, setLoading] = useState(false)
+ const [similarity, setSimilarity] = useState(null)
+
+ // Load the suspected duplicate from generation_metadata.similar_controls
+ useEffect(() => {
+ const loadDuplicate = async () => {
+ const similarControls = ctrl.generation_metadata?.similar_controls as Array<{ control_id: string; title: string; similarity: number }> | undefined
+ if (!similarControls || similarControls.length === 0) {
+ setSuspectedDuplicate(null)
+ setSimilarity(null)
+ return
+ }
+
+ const suspect = similarControls[0]
+ setSimilarity(suspect.similarity)
+ setLoading(true)
+
+ try {
+ const res = await fetch(`${BACKEND_URL}?endpoint=control&id=${encodeURIComponent(suspect.control_id)}`)
+ if (res.ok) {
+ const data = await res.json()
+ setSuspectedDuplicate(data)
+ } else {
+ setSuspectedDuplicate(null)
+ }
+ } catch {
+ setSuspectedDuplicate(null)
+ } finally {
+ setLoading(false)
+ }
+ }
+
+ loadDuplicate()
+ }, [ctrl.control_id, ctrl.generation_metadata])
+
+ return (
+
+ {/* Header */}
+
+
+
+
+
+
+
Duplikat-Vergleich
+ {similarity !== null && (
+
+ {(similarity * 100).toFixed(1)}% Aehnlichkeit
+
+ )}
+
+
+
+
+
+ {/* Navigation */}
+
+
+ {reviewIndex + 1} / {reviewTotal}
+
+
+
+ {/* Actions */}
+
+
+
+
+
+
+ {/* Side-by-Side Panels */}
+
+ {/* Left: Control to review */}
+
+
+
+
+ {/* Right: Suspected duplicate */}
+
+ {loading ? (
+
+ ) : suspectedDuplicate ? (
+
+ ) : (
+
+ Kein Duplikat-Kandidat gefunden
+
+ )}
+
+
+
+ )
+}
diff --git a/admin-compliance/app/sdk/control-library/page.tsx b/admin-compliance/app/sdk/control-library/page.tsx
index 4cbaedd..f67f80e 100644
--- a/admin-compliance/app/sdk/control-library/page.tsx
+++ b/admin-compliance/app/sdk/control-library/page.tsx
@@ -14,6 +14,7 @@ import {
} from './components/helpers'
import { ControlForm } from './components/ControlForm'
import { ControlDetail } from './components/ControlDetail'
+import { ReviewCompare } from './components/ReviewCompare'
import { GeneratorModal } from './components/GeneratorModal'
// =============================================================================
@@ -71,6 +72,9 @@ export default function ControlLibraryPage() {
const [reviewIndex, setReviewIndex] = useState(0)
const [reviewItems, setReviewItems] = useState([])
const [reviewCount, setReviewCount] = useState(0)
+ const [reviewTab, setReviewTab] = useState<'duplicates' | 'rule3'>('duplicates')
+ const [reviewDuplicates, setReviewDuplicates] = useState([])
+ const [reviewRule3, setReviewRule3] = useState([])
// Debounce search
const searchTimer = useRef | null>(null)
@@ -303,20 +307,47 @@ export default function ControlLibraryPage() {
const enterReviewMode = async () => {
// Load review items from backend
try {
- const res = await fetch(`${BACKEND_URL}?endpoint=controls&release_state=needs_review&limit=200`)
+ const res = await fetch(`${BACKEND_URL}?endpoint=controls&release_state=needs_review&limit=1000`)
if (res.ok) {
- const items = await res.json()
+ const items: CanonicalControl[] = await res.json()
if (items.length > 0) {
- setReviewItems(items)
+ // Split into duplicate suspects vs rule 3 without anchor
+ const dupes = items.filter(c =>
+ c.generation_metadata?.similar_controls &&
+ Array.isArray(c.generation_metadata.similar_controls) &&
+ (c.generation_metadata.similar_controls as unknown[]).length > 0
+ )
+ const rule3 = items.filter(c =>
+ !c.generation_metadata?.similar_controls ||
+ !Array.isArray(c.generation_metadata.similar_controls) ||
+ (c.generation_metadata.similar_controls as unknown[]).length === 0
+ )
+ setReviewDuplicates(dupes)
+ setReviewRule3(rule3)
+ // Start with duplicates tab if any, otherwise rule3
+ const startTab = dupes.length > 0 ? 'duplicates' : 'rule3'
+ const startItems = startTab === 'duplicates' ? dupes : rule3
+ setReviewTab(startTab)
+ setReviewItems(startItems)
setReviewMode(true)
setReviewIndex(0)
- setSelectedControl(items[0])
+ setSelectedControl(startItems[0])
setMode('detail')
}
}
} catch { /* ignore */ }
}
+ const switchReviewTab = (tab: 'duplicates' | 'rule3') => {
+ const items = tab === 'duplicates' ? reviewDuplicates : reviewRule3
+ setReviewTab(tab)
+ setReviewItems(items)
+ setReviewIndex(0)
+ if (items.length > 0) {
+ setSelectedControl(items[0])
+ }
+ }
+
// Loading
if (loading && controls.length === 0) {
return (
@@ -363,28 +394,89 @@ export default function ControlLibraryPage() {
// DETAIL MODE
if (mode === 'detail' && selectedControl) {
+ const isDuplicateReview = reviewMode && reviewTab === 'duplicates'
+
+ // Review tab bar (shown above the detail/compare view in review mode)
+ const reviewTabBar = reviewMode ? (
+
+
+
+
+ ) : null
+
+ if (isDuplicateReview) {
+ return (
+
+ {reviewTabBar}
+
+ { setMode('list'); setSelectedControl(null); setReviewMode(false) }}
+ onReview={handleReview}
+ onEdit={() => setMode('edit')}
+ reviewIndex={reviewIndex}
+ reviewTotal={reviewItems.length}
+ onReviewPrev={() => {
+ const idx = Math.max(0, reviewIndex - 1)
+ setReviewIndex(idx)
+ setSelectedControl(reviewItems[idx])
+ }}
+ onReviewNext={() => {
+ const idx = Math.min(reviewItems.length - 1, reviewIndex + 1)
+ setReviewIndex(idx)
+ setSelectedControl(reviewItems[idx])
+ }}
+ />
+
+
+ )
+ }
+
return (
- { setMode('list'); setSelectedControl(null); setReviewMode(false) }}
- onEdit={() => setMode('edit')}
- onDelete={handleDelete}
- onReview={handleReview}
- onRefresh={fullReload}
- reviewMode={reviewMode}
- reviewIndex={reviewIndex}
- reviewTotal={reviewItems.length}
- onReviewPrev={() => {
- const idx = Math.max(0, reviewIndex - 1)
- setReviewIndex(idx)
- setSelectedControl(reviewItems[idx])
- }}
- onReviewNext={() => {
- const idx = Math.min(reviewItems.length - 1, reviewIndex + 1)
- setReviewIndex(idx)
- setSelectedControl(reviewItems[idx])
- }}
- />
+
+ {reviewTabBar}
+
+ { setMode('list'); setSelectedControl(null); setReviewMode(false) }}
+ onEdit={() => setMode('edit')}
+ onDelete={handleDelete}
+ onReview={handleReview}
+ onRefresh={fullReload}
+ reviewMode={reviewMode}
+ reviewIndex={reviewIndex}
+ reviewTotal={reviewItems.length}
+ onReviewPrev={() => {
+ const idx = Math.max(0, reviewIndex - 1)
+ setReviewIndex(idx)
+ setSelectedControl(reviewItems[idx])
+ }}
+ onReviewNext={() => {
+ const idx = Math.min(reviewItems.length - 1, reviewIndex + 1)
+ setReviewIndex(idx)
+ setSelectedControl(reviewItems[idx])
+ }}
+ />
+
+
)
}
diff --git a/backend-compliance/compliance/services/control_generator.py b/backend-compliance/compliance/services/control_generator.py
index fce4316..1de79a4 100644
--- a/backend-compliance/compliance/services/control_generator.py
+++ b/backend-compliance/compliance/services/control_generator.py
@@ -321,6 +321,62 @@ VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys())
VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
"AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
+# ---------------------------------------------------------------------------
+# Recital (Erwägungsgrund) detection in source text
+# ---------------------------------------------------------------------------
+
+# Pattern: a recital number such as (125) standing ALONE on its own line.
+# Anchored with ^/$ (MULTILINE) so inline references such as "Absatz (3)"
+# that happen to precede a line break are not mistaken for recital markers.
+_RECITAL_RE = re.compile(r'^\((\d{1,3})\)\s*$', re.MULTILINE)
+
+# Recital-typical phrasing (German EU law Erwägungsgründe)
+_RECITAL_PHRASES = [
+    "in erwägung nachstehender gründe",
+    "erwägungsgrund",
+    "in anbetracht",
+    "daher sollte",
+    "aus diesem grund",
+    "es ist daher",
+    "folglich sollte",
+    "es sollte daher",
+    "in diesem zusammenhang",
+]
+
+
+def _detect_recital(text: str) -> Optional[dict]:
+    """Detect if source text is a recital (Erwägungsgrund) rather than an article.
+
+    Returns a dict with detection details if recital markers are found,
+    or None if the text appears to be genuine article text.
+
+    Detection criteria:
+    1. Recital numbers such as (126) standing alone on their own line
+    2. Recital-typical phrasing ("daher sollte", "erwägungsgrund", etc.)
+    """
+    if not text:
+        return None
+
+    # Check 1: full-line recital number markers
+    recital_matches = _RECITAL_RE.findall(text)
+
+    # Check 2: recital-typical phrasing (case-insensitive substring match)
+    text_lower = text.lower()
+    phrase_hits = [p for p in _RECITAL_PHRASES if p in text_lower]
+
+    # A suspect needs at least one recital number OR >=2 phrase hits;
+    # a single phrase alone is too common in genuine article text.
+    if not recital_matches and len(phrase_hits) < 2:
+        return None
+
+    return {
+        "recital_suspect": True,
+        "recital_numbers": recital_matches[:10],
+        "recital_phrases": phrase_hits[:5],
+        "detection_method": "regex+phrases" if recital_matches and phrase_hits
+        else "regex" if recital_matches else "phrases",
+    }
+
CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))
VERIFICATION_KEYWORDS = {
@@ -1520,9 +1576,23 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
) -> tuple[GeneratedControl, bool]:
"""Cross-validate category/domain using keyword detection + local LLM.
+ Also checks for recital (Erwägungsgrund) contamination in source text.
Returns (control, was_fixed). Only triggers Ollama QA when the LLM
classification disagrees with keyword detection — keeps it fast.
"""
+ # ── Recital detection ──────────────────────────────────────────
+ source_text = control.source_original_text or ""
+ recital_info = _detect_recital(source_text)
+ if recital_info:
+ control.generation_metadata["recital_suspect"] = True
+ control.generation_metadata["recital_detection"] = recital_info
+ control.release_state = "needs_review"
+ logger.warning(
+ "Recital suspect: '%s' — recitals %s detected in source text",
+ control.title[:40],
+ recital_info.get("recital_numbers", []),
+ )
+
kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
kw_domain = _detect_domain(chunk_text)
llm_domain = control.generation_metadata.get("_effective_domain", "")
diff --git a/backend-compliance/tests/test_control_generator.py b/backend-compliance/tests/test_control_generator.py
index 29c025d..fc812f0 100644
--- a/backend-compliance/tests/test_control_generator.py
+++ b/backend-compliance/tests/test_control_generator.py
@@ -7,6 +7,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
from compliance.services.control_generator import (
_classify_regulation,
_detect_domain,
+ _detect_recital,
_parse_llm_json,
_parse_llm_json_array,
GeneratorConfig,
@@ -1306,3 +1307,92 @@ class TestPipelineVersion:
assert controls[0] is not None
assert controls[1] is None # Null entry from LLM
assert controls[2] is not None
+
+
+# =============================================================================
+# Recital (Erwägungsgrund) Detection Tests
+# =============================================================================
+
+class TestRecitalDetection:
+ """Tests for _detect_recital — identifying Erwägungsgrund text in source."""
+
+ def test_recital_number_detected(self):
+ """Text with (126)\\n pattern is flagged as recital suspect."""
+ text = "Daher ist es wichtig...\n(126)\nDie Konformitätsbewertung sollte..."
+ result = _detect_recital(text)
+ assert result is not None
+ assert result["recital_suspect"] is True
+ assert "126" in result["recital_numbers"]
+
+ def test_multiple_recital_numbers(self):
+ """Multiple recital markers are all captured."""
+ text = "(124)\nErster Punkt.\n(125)\nZweiter Punkt.\n(126)\nDritter Punkt."
+ result = _detect_recital(text)
+ assert result is not None
+ assert "124" in result["recital_numbers"]
+ assert "125" in result["recital_numbers"]
+ assert "126" in result["recital_numbers"]
+
+ def test_article_text_not_flagged(self):
+ """Normal article text without recital markers returns None."""
+ text = ("Der Anbieter eines Hochrisiko-KI-Systems muss sicherstellen, "
+ "dass die technische Dokumentation erstellt wird.")
+ result = _detect_recital(text)
+ assert result is None
+
+ def test_empty_text_returns_none(self):
+ result = _detect_recital("")
+ assert result is None
+
+ def test_none_text_returns_none(self):
+ result = _detect_recital(None)
+ assert result is None
+
+ def test_recital_phrases_detected(self):
+ """Text with multiple recital-typical phrases is flagged."""
+ text = ("In Erwägung nachstehender Gründe wurde beschlossen, "
+ "daher sollte der Anbieter folgende Maßnahmen ergreifen. "
+ "Es ist daher notwendig, die Konformität sicherzustellen.")
+ result = _detect_recital(text)
+ assert result is not None
+ assert result["detection_method"] == "phrases"
+
+ def test_single_phrase_not_enough(self):
+ """A single recital phrase alone is not sufficient for detection."""
+ text = "Daher sollte das System regelmäßig geprüft werden."
+ result = _detect_recital(text)
+ assert result is None
+
+ def test_combined_regex_and_phrases(self):
+ """Both recital numbers and phrases → detection_method is regex+phrases."""
+ text = "(42)\nIn Erwägung nachstehender Gründe wurde entschieden..."
+ result = _detect_recital(text)
+ assert result is not None
+ assert result["detection_method"] == "regex+phrases"
+ assert "42" in result["recital_numbers"]
+
+ def test_parenthesized_number_without_newline_ignored(self):
+ """Numbers in parentheses without trailing newline are not recital markers.
+ e.g. 'gemäß Absatz (3) des Artikels' should not be flagged."""
+ text = "Gemäß Absatz (3) des Artikels 52 muss der Anbieter sicherstellen..."
+ result = _detect_recital(text)
+ assert result is None
+
+ def test_real_world_recital_text(self):
+ """Real-world example: AI Act Erwägungsgrund (126) about conformity assessment."""
+ text = (
+ "(126)\n"
+ "Um den Verwaltungsaufwand zu verringern und die Konformitätsbewertung "
+ "zu vereinfachen, sollten bestimmte Hochrisiko-KI-Systeme, die von "
+ "Anbietern zertifiziert oder für die eine Konformitätserklärung "
+ "ausgestellt wurde, automatisch als konform mit den Anforderungen "
+ "dieser Verordnung gelten, sofern sie den harmonisierten Normen oder "
+ "gemeinsamen Spezifikationen entsprechen.\n"
+ "(127)\n"
+ "Es ist daher angezeigt, dass der Anbieter das entsprechende "
+ "Konformitätsbewertungsverfahren anwendet."
+ )
+ result = _detect_recital(text)
+ assert result is not None
+ assert "126" in result["recital_numbers"]
+ assert "127" in result["recital_numbers"]
diff --git a/docs-src/development/testing.md b/docs-src/development/testing.md
index f6aab2c..ea7dc6b 100644
--- a/docs-src/development/testing.md
+++ b/docs-src/development/testing.md
@@ -214,13 +214,13 @@ Wenn du z.B. eine neue `GetUserStats()` Funktion im Go Service hinzufuegst:
## Modul-spezifische Tests
-### Canonical Control Generator (71+ Tests)
+### Canonical Control Generator (81+ Tests)
Die Control Library hat eine umfangreiche Test-Suite ueber 6 Dateien.
Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) und [Control Generator Pipeline](../services/sdk-modules/control-generator-pipeline.md) fuer Details.
```bash
-# Alle Generator-Tests (71 Tests in 10 Klassen)
+# Alle Generator-Tests (81 Tests in 11 Klassen)
cd backend-compliance && pytest -v tests/test_control_generator.py
# Similarity Detector Tests
@@ -253,3 +253,4 @@ cd backend-compliance && pytest -v tests/test_validate_controls.py
| `TestBatchProcessingLoop` | 10 | Batch-Verarbeitung (Rule-Split, Mixed-Rules, Too-Close, Null-Handling) |
| `TestRegulationFilter` | 5 | regulation_filter Prefix-Matching, leere regulation_codes |
| `TestPipelineVersion` | 5 | pipeline_version=2 in DB-Writes, null-Handling in Structure/Reform |
+| `TestRecitalDetection` | 10 | Erwaegungsgrund-Erkennung in Quelltexten (Regex, Phrasen, Kombiniert) |
diff --git a/docs-src/services/sdk-modules/control-generator-pipeline.md b/docs-src/services/sdk-modules/control-generator-pipeline.md
index 309a516..8fc70a2 100644
--- a/docs-src/services/sdk-modules/control-generator-pipeline.md
+++ b/docs-src/services/sdk-modules/control-generator-pipeline.md
@@ -500,6 +500,39 @@ Die QA-Metriken werden in `generation_metadata` gespeichert:
}
```
+### Recital-Erkennung (Erwägungsgrund-Detektion)
+
+Die QA-Stufe prueft zusaetzlich, ob der `source_original_text` eines Controls tatsaechlich aus einem Gesetzesartikel stammt — oder aus einem Erwaegungsgrund (Recital). Erwaegungsgruende enthalten keine normativen Pflichten und fuehren zu falsch zugeordneten Controls.
+
+**Erkennungsmethoden:**
+
+| Methode | Pattern | Beispiel |
+|---------|---------|----------|
+| **Regex** | `\((\d{1,3})\)\s*\n` — Erwaegungsgrund-Nummern | `(126)\nUm den Verwaltungsaufwand...` |
+| **Phrasen** | Typische Recital-Formulierungen (≥2 Treffer) | "daher sollte", "in Erwägung nachstehender Gründe" |
+
+**Ergebnis bei Verdacht:**
+
+- `release_state` wird auf `needs_review` gesetzt
+- `generation_metadata.recital_suspect = true`
+- `generation_metadata.recital_detection` enthaelt Details:
+
+```json
+{
+ "recital_suspect": true,
+ "recital_detection": {
+ "recital_suspect": true,
+ "recital_numbers": ["126", "127"],
+ "recital_phrases": ["daher sollte"],
+ "detection_method": "regex+phrases"
+ }
+}
+```
+
+**Funktion:** `_detect_recital(text)` in `control_generator.py`
+
+**Hintergrund:** Bei der Analyse von ~5.500 Controls mit Quelltext wurden 1.555 (28%) als Erwaegungsgrund-Verdacht identifiziert. Der Document Crawler unterschied nicht zwischen Artikeltext und Erwaegungsgruenden, was zu falschen `article`/`paragraph`-Zuordnungen fuehrte.
+
### QA-Reklassifizierung bestehender Controls
```bash
@@ -530,7 +563,7 @@ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/
| `backend-compliance/migrations/046_control_generator.sql` | Job-Tracking, Chunk-Tracking Tabellen |
| `backend-compliance/migrations/048_processing_path_expand.sql` | Erweiterte Processing-Path-Werte |
| `backend-compliance/migrations/062_pipeline_version.sql` | `pipeline_version` Spalte |
-| `backend-compliance/tests/test_control_generator.py` | 15 Tests (Lizenz, Domain, Batch, Pipeline) |
+| `backend-compliance/tests/test_control_generator.py` | 81+ Tests (Lizenz, Domain, Batch, Pipeline, Recital) |
---
diff --git a/scripts/find_recital_controls.py b/scripts/find_recital_controls.py
new file mode 100644
index 0000000..f661dc3
--- /dev/null
+++ b/scripts/find_recital_controls.py
@@ -0,0 +1,70 @@
+"""Find controls where source_original_text contains Erwägungsgrund (recital) markers
+instead of actual article text — indicates wrong article tagging in RAG chunks."""
+
+import json
+import os
+import re
+import sys
+
+import sqlalchemy
+
+url = os.environ.get("DATABASE_URL", "")
+if not url:
+    print("DATABASE_URL not set")
+    sys.exit(1)
+
+engine = sqlalchemy.create_engine(url)
+
+with engine.connect() as conn:
+    conn.execute(sqlalchemy.text("SET search_path TO compliance,public"))
+
+    r = conn.execute(sqlalchemy.text("""
+        SELECT control_id, title,
+               source_citation::text,
+               source_original_text,
+               pipeline_version, release_state,
+               generation_metadata::text
+        FROM canonical_controls
+        WHERE source_original_text IS NOT NULL
+          AND source_original_text != ''
+          AND source_citation IS NOT NULL
+        ORDER BY control_id
+    """)).fetchall()
+
+    # Recital number standing ALONE on its own line, e.g. "(126)" — anchored
+    # so inline references like "Absatz (3)" before a newline are not matched.
+    recital_re = re.compile(r'^\((\d{1,3})\)\s*$', re.MULTILINE)
+
+    suspects_recital = []
+
+    for row in r:
+        cid, title, citation_json, orig, pv, state, meta_json = row
+        if not orig:
+            continue
+
+        citation = json.loads(citation_json) if citation_json else {}
+
+        # Recital markers in source text → article tagging is suspect
+        recital_matches = recital_re.findall(orig)
+        if recital_matches:
+            suspects_recital.append({
+                "control_id": cid,
+                "title": title[:80],
+                "claimed_article": citation.get("article", ""),
+                "claimed_paragraph": citation.get("paragraph", ""),
+                "recitals_found": recital_matches[:5],
+                "v": pv,
+                "state": state,
+            })
+
+    print("=== Ergebnis ===")
+    print(f"Geprueft: {len(r)} Controls mit source_original_text")
+    print(f"Erwaegungsgrund-Verdacht: {len(suspects_recital)}")
+    print()
+
+    if suspects_recital:
+        print(f"{'Control':<12} {'Behauptet':<18} {'Recitals':<20} {'v':>2} {'State':<15} Titel")
+        print("-" * 120)
+        for s in suspects_recital:
+            recitals = ",".join(s["recitals_found"])
+            behauptet = f"{s['claimed_article']} {s['claimed_paragraph']}".strip()
+            print(f"{s['control_id']:<12} {behauptet:<18} ({recitals}){'':<{max(0, 17 - len(recitals))}} v{s['v']} {s['state']:<15} {s['title']}")