From 148c7ba3af10e26fa27fcbba7f8abe2c6942c49d Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 08:20:02 +0100 Subject: [PATCH] feat(qa): recital detection, review split, duplicate comparison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add _detect_recital() to QA pipeline — flags controls where source_original_text contains Erwägungsgrund markers instead of article text (28% of controls with source text affected). - Recital detection via regex + phrase matching in QA validation - 10 new tests (TestRecitalDetection), 81 total - ReviewCompare component for side-by-side duplicate comparison - Review mode split: Duplikat-Verdacht vs Rule-3-ohne-Anchor tabs - MkDocs: recital detection documentation - Detection script for bulk analysis (scripts/find_recital_controls.py) Co-Authored-By: Claude Opus 4.6 --- .../components/ReviewCompare.tsx | 264 ++++++++++++++++++ .../app/sdk/control-library/page.tsx | 142 ++++++++-- .../compliance/services/control_generator.py | 70 +++++ .../tests/test_control_generator.py | 90 ++++++ docs-src/development/testing.md | 5 +- .../sdk-modules/control-generator-pipeline.md | 35 ++- scripts/find_recital_controls.py | 79 ++++++ 7 files changed, 657 insertions(+), 28 deletions(-) create mode 100644 admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx create mode 100644 scripts/find_recital_controls.py diff --git a/admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx b/admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx new file mode 100644 index 0000000..5d4c92e --- /dev/null +++ b/admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx @@ -0,0 +1,264 @@ +'use client' + +import { useState, useEffect } from 'react' +import { + ArrowLeft, CheckCircle2, Trash2, Pencil, SkipForward, + ChevronLeft, Scale, BookOpen, ExternalLink, AlertTriangle, + FileText, Clock, +} from 'lucide-react' +import { + CanonicalControl, BACKEND_URL, + SeverityBadge, StateBadge, LicenseRuleBadge, CategoryBadge, TargetAudienceBadge, +} from './helpers' + +// ============================================================================= +// Compact Control Panel (used on both sides of the comparison) +// ============================================================================= + +function ControlPanel({ ctrl, label, highlight }: { ctrl: CanonicalControl; label: string; highlight?: boolean }) { + return ( +
+ {/* Panel Header */} +
+
{label}
+
+ {ctrl.control_id} + + + + + +
+

{ctrl.title}

+
+ + {/* Panel Content */} +
+ {/* Objective */} +
+

Ziel

+

{ctrl.objective}

+
+ + {/* Rationale */} + {ctrl.rationale && ( +
+

Begruendung

+

{ctrl.rationale}

+
+ )} + + {/* Source Citation (Rule 1+2) */} + {ctrl.source_citation && ( +
+
+ + Gesetzliche Grundlage +
+ {ctrl.source_citation.source && ( +

+ {ctrl.source_citation.source} + {ctrl.source_citation.article && ` — ${ctrl.source_citation.article}`} + {ctrl.source_citation.paragraph && ` ${ctrl.source_citation.paragraph}`} +

+ )} +
+ )} + + {/* Requirements */} + {ctrl.requirements.length > 0 && ( +
+

Anforderungen

+
    + {ctrl.requirements.map((r, i) => ( +
  1. {r}
  2. + ))} +
+
+ )} + + {/* Test Procedure */} + {ctrl.test_procedure.length > 0 && ( +
+

Pruefverfahren

+
    + {ctrl.test_procedure.map((s, i) => ( +
  1. {s}
  2. + ))} +
+
+ )} + + {/* Open Anchors */} + {ctrl.open_anchors.length > 0 && ( +
+
+ + Referenzen ({ctrl.open_anchors.length}) +
+
+ {ctrl.open_anchors.map((a, i) => ( +
+ + {a.framework} + {a.ref} +
+ ))} +
+
+ )} + + {/* Tags */} + {ctrl.tags.length > 0 && ( +
+ {ctrl.tags.map(t => ( + {t} + ))} +
+ )} +
+
+ ) +} + +// ============================================================================= +// ReviewCompare — Side-by-Side Duplicate Comparison +// ============================================================================= + +interface ReviewCompareProps { + ctrl: CanonicalControl + onBack: () => void + onReview: (controlId: string, action: string) => void + onEdit: () => void + reviewIndex: number + reviewTotal: number + onReviewPrev: () => void + onReviewNext: () => void +} + +export function ReviewCompare({ + ctrl, + onBack, + onReview, + onEdit, + reviewIndex, + reviewTotal, + onReviewPrev, + onReviewNext, +}: ReviewCompareProps) { + const [suspectedDuplicate, setSuspectedDuplicate] = useState(null) + const [loading, setLoading] = useState(false) + const [similarity, setSimilarity] = useState(null) + + // Load the suspected duplicate from generation_metadata.similar_controls + useEffect(() => { + const loadDuplicate = async () => { + const similarControls = ctrl.generation_metadata?.similar_controls as Array<{ control_id: string; title: string; similarity: number }> | undefined + if (!similarControls || similarControls.length === 0) { + setSuspectedDuplicate(null) + setSimilarity(null) + return + } + + const suspect = similarControls[0] + setSimilarity(suspect.similarity) + setLoading(true) + + try { + const res = await fetch(`${BACKEND_URL}?endpoint=control&id=${encodeURIComponent(suspect.control_id)}`) + if (res.ok) { + const data = await res.json() + setSuspectedDuplicate(data) + } else { + setSuspectedDuplicate(null) + } + } catch { + setSuspectedDuplicate(null) + } finally { + setLoading(false) + } + } + + loadDuplicate() + }, [ctrl.control_id, ctrl.generation_metadata]) + + return ( +
+ {/* Header */} +
+
+ +
+
+ + Duplikat-Vergleich + {similarity !== null && ( + + {(similarity * 100).toFixed(1)}% Aehnlichkeit + + )} +
+
+
+ +
+ {/* Navigation */} +
+ + {reviewIndex + 1} / {reviewTotal} + +
+ + {/* Actions */} + + + +
+
+ + {/* Side-by-Side Panels */} +
+ {/* Left: Control to review */} +
+ +
+ + {/* Right: Suspected duplicate */} +
+ {loading ? ( +
+
+
+ ) : suspectedDuplicate ? ( + + ) : ( +
+ Kein Duplikat-Kandidat gefunden +
+ )} +
+
+
+ ) +} diff --git a/admin-compliance/app/sdk/control-library/page.tsx b/admin-compliance/app/sdk/control-library/page.tsx index 4cbaedd..f67f80e 100644 --- a/admin-compliance/app/sdk/control-library/page.tsx +++ b/admin-compliance/app/sdk/control-library/page.tsx @@ -14,6 +14,7 @@ import { } from './components/helpers' import { ControlForm } from './components/ControlForm' import { ControlDetail } from './components/ControlDetail' +import { ReviewCompare } from './components/ReviewCompare' import { GeneratorModal } from './components/GeneratorModal' // ============================================================================= @@ -71,6 +72,9 @@ export default function ControlLibraryPage() { const [reviewIndex, setReviewIndex] = useState(0) const [reviewItems, setReviewItems] = useState([]) const [reviewCount, setReviewCount] = useState(0) + const [reviewTab, setReviewTab] = useState<'duplicates' | 'rule3'>('duplicates') + const [reviewDuplicates, setReviewDuplicates] = useState([]) + const [reviewRule3, setReviewRule3] = useState([]) // Debounce search const searchTimer = useRef | null>(null) @@ -303,20 +307,47 @@ export default function ControlLibraryPage() { const enterReviewMode = async () => { // Load review items from backend try { - const res = await fetch(`${BACKEND_URL}?endpoint=controls&release_state=needs_review&limit=200`) + const res = await fetch(`${BACKEND_URL}?endpoint=controls&release_state=needs_review&limit=1000`) if (res.ok) { - const items = await res.json() + const items: CanonicalControl[] = await res.json() if (items.length > 0) { - setReviewItems(items) + // Split into duplicate suspects vs rule 3 without anchor + const dupes = items.filter(c => + c.generation_metadata?.similar_controls && + Array.isArray(c.generation_metadata.similar_controls) && + (c.generation_metadata.similar_controls as unknown[]).length > 0 + ) + const rule3 = items.filter(c => + !c.generation_metadata?.similar_controls || + !Array.isArray(c.generation_metadata.similar_controls) || + (c.generation_metadata.similar_controls as unknown[]).length === 0 + ) + setReviewDuplicates(dupes) + setReviewRule3(rule3) + // Start with duplicates tab if any, otherwise rule3 + const startTab = dupes.length > 0 ? 'duplicates' : 'rule3' + const startItems = startTab === 'duplicates' ? dupes : rule3 + setReviewTab(startTab) + setReviewItems(startItems) setReviewMode(true) setReviewIndex(0) - setSelectedControl(items[0]) + setSelectedControl(startItems[0]) setMode('detail') } } } catch { /* ignore */ } } + const switchReviewTab = (tab: 'duplicates' | 'rule3') => { + const items = tab === 'duplicates' ? reviewDuplicates : reviewRule3 + setReviewTab(tab) + setReviewItems(items) + setReviewIndex(0) + if (items.length > 0) { + setSelectedControl(items[0]) + } + } + // Loading if (loading && controls.length === 0) { return ( @@ -363,28 +394,89 @@ export default function ControlLibraryPage() { // DETAIL MODE if (mode === 'detail' && selectedControl) { + const isDuplicateReview = reviewMode && reviewTab === 'duplicates' + + // Review tab bar (shown above the detail/compare view in review mode) + const reviewTabBar = reviewMode ? ( +
+ + +
+ ) : null + + if (isDuplicateReview) { + return ( +
+ {reviewTabBar} +
+ { setMode('list'); setSelectedControl(null); setReviewMode(false) }} + onReview={handleReview} + onEdit={() => setMode('edit')} + reviewIndex={reviewIndex} + reviewTotal={reviewItems.length} + onReviewPrev={() => { + const idx = Math.max(0, reviewIndex - 1) + setReviewIndex(idx) + setSelectedControl(reviewItems[idx]) + }} + onReviewNext={() => { + const idx = Math.min(reviewItems.length - 1, reviewIndex + 1) + setReviewIndex(idx) + setSelectedControl(reviewItems[idx]) + }} + /> +
+
+ ) + } + return ( - { setMode('list'); setSelectedControl(null); setReviewMode(false) }} - onEdit={() => setMode('edit')} - onDelete={handleDelete} - onReview={handleReview} - onRefresh={fullReload} - reviewMode={reviewMode} - reviewIndex={reviewIndex} - reviewTotal={reviewItems.length} - onReviewPrev={() => { - const idx = Math.max(0, reviewIndex - 1) - setReviewIndex(idx) - setSelectedControl(reviewItems[idx]) - }} - onReviewNext={() => { - const idx = Math.min(reviewItems.length - 1, reviewIndex + 1) - setReviewIndex(idx) - setSelectedControl(reviewItems[idx]) - }} - /> +
+ {reviewTabBar} +
+ { setMode('list'); setSelectedControl(null); setReviewMode(false) }} + onEdit={() => setMode('edit')} + onDelete={handleDelete} + onReview={handleReview} + onRefresh={fullReload} + reviewMode={reviewMode} + reviewIndex={reviewIndex} + reviewTotal={reviewItems.length} + onReviewPrev={() => { + const idx = Math.max(0, reviewIndex - 1) + setReviewIndex(idx) + setSelectedControl(reviewItems[idx]) + }} + onReviewNext={() => { + const idx = Math.min(reviewItems.length - 1, reviewIndex + 1) + setReviewIndex(idx) + setSelectedControl(reviewItems[idx]) + }} + /> +
+
) } diff --git a/backend-compliance/compliance/services/control_generator.py b/backend-compliance/compliance/services/control_generator.py index fce4316..1de79a4 100644 --- a/backend-compliance/compliance/services/control_generator.py +++ b/backend-compliance/compliance/services/control_generator.py @@ -321,6 +321,62 @@ VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys()) VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC", "AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"} +# --------------------------------------------------------------------------- +# Recital (Erwägungsgrund) detection in source text +# --------------------------------------------------------------------------- + +# Pattern: standalone recital number like (125)\n or (126) at line start +_RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n') + +# Recital-typical phrasing (German EU law Erwägungsgründe) +_RECITAL_PHRASES = [ + "in erwägung nachstehender gründe", + "erwägungsgrund", + "in anbetracht", + "daher sollte", + "aus diesem grund", + "es ist daher", + "folglich sollte", + "es sollte daher", + "in diesem zusammenhang", +] + + +def _detect_recital(text: str) -> Optional[dict]: + """Detect if source text is a recital (Erwägungsgrund) rather than an article. + + Returns a dict with detection details if recital markers are found, + or None if the text appears to be genuine article text. + + Detection criteria: + 1. Standalone recital numbers like (126)\\n in the text + 2. Recital-typical phrasing ("daher sollte", "erwägungsgrund", etc.) + """ + if not text: + return None + + # Check 1: Recital number markers + recital_matches = _RECITAL_RE.findall(text) + + # Check 2: Recital phrasing + text_lower = text.lower() + phrase_hits = [p for p in _RECITAL_PHRASES if p in text_lower] + + if not recital_matches and not phrase_hits: + return None + + # Require at least recital numbers OR >=2 phrase hits to be a suspect + if not recital_matches and len(phrase_hits) < 2: + return None + + return { + "recital_suspect": True, + "recital_numbers": recital_matches[:10], + "recital_phrases": phrase_hits[:5], + "detection_method": "regex+phrases" if recital_matches and phrase_hits + else "regex" if recital_matches else "phrases", + } + CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES)) VERIFICATION_KEYWORDS = { @@ -1520,9 +1576,23 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne ) -> tuple[GeneratedControl, bool]: """Cross-validate category/domain using keyword detection + local LLM. + Also checks for recital (Erwägungsgrund) contamination in source text. Returns (control, was_fixed). Only triggers Ollama QA when the LLM classification disagrees with keyword detection — keeps it fast. """ + # ── Recital detection ────────────────────────────────────────── + source_text = control.source_original_text or "" + recital_info = _detect_recital(source_text) + if recital_info: + control.generation_metadata["recital_suspect"] = True + control.generation_metadata["recital_detection"] = recital_info + control.release_state = "needs_review" + logger.warning( + "Recital suspect: '%s' — recitals %s detected in source text", + control.title[:40], + recital_info.get("recital_numbers", []), + ) + kw_category = _detect_category(chunk_text) or _detect_category(control.objective) kw_domain = _detect_domain(chunk_text) llm_domain = control.generation_metadata.get("_effective_domain", "") diff --git a/backend-compliance/tests/test_control_generator.py b/backend-compliance/tests/test_control_generator.py index 29c025d..fc812f0 100644 --- a/backend-compliance/tests/test_control_generator.py +++ b/backend-compliance/tests/test_control_generator.py @@ -7,6 +7,7 @@ from unittest.mock import AsyncMock, MagicMock, patch from compliance.services.control_generator import ( _classify_regulation, _detect_domain, + _detect_recital, _parse_llm_json, _parse_llm_json_array, GeneratorConfig, @@ -1306,3 +1307,92 @@ class TestPipelineVersion: assert controls[0] is not None assert controls[1] is None # Null entry from LLM assert controls[2] is not None + + +# ============================================================================= +# Recital (Erwägungsgrund) Detection Tests +# ============================================================================= + +class TestRecitalDetection: + """Tests for _detect_recital — identifying Erwägungsgrund text in source.""" + + def test_recital_number_detected(self): + """Text with (126)\\n pattern is flagged as recital suspect.""" + text = "Daher ist es wichtig...\n(126)\nDie Konformitätsbewertung sollte..." + result = _detect_recital(text) + assert result is not None + assert result["recital_suspect"] is True + assert "126" in result["recital_numbers"] + + def test_multiple_recital_numbers(self): + """Multiple recital markers are all captured.""" + text = "(124)\nErster Punkt.\n(125)\nZweiter Punkt.\n(126)\nDritter Punkt." + result = _detect_recital(text) + assert result is not None + assert "124" in result["recital_numbers"] + assert "125" in result["recital_numbers"] + assert "126" in result["recital_numbers"] + + def test_article_text_not_flagged(self): + """Normal article text without recital markers returns None.""" + text = ("Der Anbieter eines Hochrisiko-KI-Systems muss sicherstellen, " + "dass die technische Dokumentation erstellt wird.") + result = _detect_recital(text) + assert result is None + + def test_empty_text_returns_none(self): + result = _detect_recital("") + assert result is None + + def test_none_text_returns_none(self): + result = _detect_recital(None) + assert result is None + + def test_recital_phrases_detected(self): + """Text with multiple recital-typical phrases is flagged.""" + text = ("In Erwägung nachstehender Gründe wurde beschlossen, " + "daher sollte der Anbieter folgende Maßnahmen ergreifen. " + "Es ist daher notwendig, die Konformität sicherzustellen.") + result = _detect_recital(text) + assert result is not None + assert result["detection_method"] == "phrases" + + def test_single_phrase_not_enough(self): + """A single recital phrase alone is not sufficient for detection.""" + text = "Daher sollte das System regelmäßig geprüft werden." + result = _detect_recital(text) + assert result is None + + def test_combined_regex_and_phrases(self): + """Both recital numbers and phrases → detection_method is regex+phrases.""" + text = "(42)\nIn Erwägung nachstehender Gründe wurde entschieden..." + result = _detect_recital(text) + assert result is not None + assert result["detection_method"] == "regex+phrases" + assert "42" in result["recital_numbers"] + + def test_parenthesized_number_without_newline_ignored(self): + """Numbers in parentheses without trailing newline are not recital markers. + e.g. 'gemäß Absatz (3) des Artikels' should not be flagged.""" + text = "Gemäß Absatz (3) des Artikels 52 muss der Anbieter sicherstellen..." + result = _detect_recital(text) + assert result is None + + def test_real_world_recital_text(self): + """Real-world example: AI Act Erwägungsgrund (126) about conformity assessment.""" + text = ( + "(126)\n" + "Um den Verwaltungsaufwand zu verringern und die Konformitätsbewertung " + "zu vereinfachen, sollten bestimmte Hochrisiko-KI-Systeme, die von " + "Anbietern zertifiziert oder für die eine Konformitätserklärung " + "ausgestellt wurde, automatisch als konform mit den Anforderungen " + "dieser Verordnung gelten, sofern sie den harmonisierten Normen oder " + "gemeinsamen Spezifikationen entsprechen.\n" + "(127)\n" + "Es ist daher angezeigt, dass der Anbieter das entsprechende " + "Konformitätsbewertungsverfahren anwendet." + ) + result = _detect_recital(text) + assert result is not None + assert "126" in result["recital_numbers"] + assert "127" in result["recital_numbers"] diff --git a/docs-src/development/testing.md b/docs-src/development/testing.md index f6aab2c..ea7dc6b 100644 --- a/docs-src/development/testing.md +++ b/docs-src/development/testing.md @@ -214,13 +214,13 @@ Wenn du z.B. eine neue `GetUserStats()` Funktion im Go Service hinzufuegst: ## Modul-spezifische Tests -### Canonical Control Generator (71+ Tests) +### Canonical Control Generator (81+ Tests) Die Control Library hat eine umfangreiche Test-Suite ueber 6 Dateien. Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) und [Control Generator Pipeline](../services/sdk-modules/control-generator-pipeline.md) fuer Details. ```bash -# Alle Generator-Tests (71 Tests in 10 Klassen) +# Alle Generator-Tests (81 Tests in 12 Klassen) cd backend-compliance && pytest -v tests/test_control_generator.py # Similarity Detector Tests @@ -253,3 +253,4 @@ cd backend-compliance && pytest -v tests/test_validate_controls.py | `TestBatchProcessingLoop` | 10 | Batch-Verarbeitung (Rule-Split, Mixed-Rules, Too-Close, Null-Handling) | | `TestRegulationFilter` | 5 | regulation_filter Prefix-Matching, leere regulation_codes | | `TestPipelineVersion` | 5 | pipeline_version=2 in DB-Writes, null-Handling in Structure/Reform | +| `TestRecitalDetection` | 10 | Erwaegungsgrund-Erkennung in Quelltexten (Regex, Phrasen, Kombiniert) | diff --git a/docs-src/services/sdk-modules/control-generator-pipeline.md b/docs-src/services/sdk-modules/control-generator-pipeline.md index 309a516..8fc70a2 100644 --- a/docs-src/services/sdk-modules/control-generator-pipeline.md +++ b/docs-src/services/sdk-modules/control-generator-pipeline.md @@ -500,6 +500,39 @@ Die QA-Metriken werden in `generation_metadata` gespeichert: } ``` +### Recital-Erkennung (Erwägungsgrund-Detektion) + +Die QA-Stufe prueft zusaetzlich, ob der `source_original_text` eines Controls tatsaechlich aus einem Gesetzesartikel stammt — oder aus einem Erwaegungsgrund (Recital). Erwaegungsgruende enthalten keine normativen Pflichten und fuehren zu falsch zugeordneten Controls. + +**Erkennungsmethoden:** + +| Methode | Pattern | Beispiel | +|---------|---------|----------| +| **Regex** | `\((\d{1,3})\)\s*\n` — Erwaegungsgrund-Nummern | `(126)\nUm den Verwaltungsaufwand...` | +| **Phrasen** | Typische Recital-Formulierungen (≥2 Treffer) | "daher sollte", "in Erwägung nachstehender Gründe" | + +**Ergebnis bei Verdacht:** + +- `release_state` wird auf `needs_review` gesetzt +- `generation_metadata.recital_suspect = true` +- `generation_metadata.recital_detection` enthaelt Details: + +```json +{ + "recital_suspect": true, + "recital_detection": { + "recital_suspect": true, + "recital_numbers": ["126", "127"], + "recital_phrases": ["daher sollte"], + "detection_method": "regex+phrases" + } +} +``` + +**Funktion:** `_detect_recital(text)` in `control_generator.py` + +**Hintergrund:** Bei der Analyse von ~5.500 Controls mit Quelltext wurden 1.555 (28%) als Erwaegungsgrund-Verdacht identifiziert. Der Document Crawler unterschied nicht zwischen Artikeltext und Erwaegungsgruenden, was zu falschen `article`/`paragraph`-Zuordnungen fuehrte. + ### QA-Reklassifizierung bestehender Controls ```bash @@ -530,7 +563,7 @@ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/ | `backend-compliance/migrations/046_control_generator.sql` | Job-Tracking, Chunk-Tracking Tabellen | | `backend-compliance/migrations/048_processing_path_expand.sql` | Erweiterte Processing-Path-Werte | | `backend-compliance/migrations/062_pipeline_version.sql` | `pipeline_version` Spalte | -| `backend-compliance/tests/test_control_generator.py` | 15 Tests (Lizenz, Domain, Batch, Pipeline) | +| `backend-compliance/tests/test_control_generator.py` | 81+ Tests (Lizenz, Domain, Batch, Pipeline, Recital) | --- diff --git a/scripts/find_recital_controls.py b/scripts/find_recital_controls.py new file mode 100644 index 0000000..f661dc3 --- /dev/null +++ b/scripts/find_recital_controls.py @@ -0,0 +1,79 @@ +"""Find controls where source_original_text contains Erwägungsgrund (recital) markers +instead of actual article text — indicates wrong article tagging in RAG chunks.""" + +import sqlalchemy +import os +import json +import re + +url = os.environ.get("DATABASE_URL", "") +if not url: + print("DATABASE_URL not set") + exit(1) + +engine = sqlalchemy.create_engine(url) + +with engine.connect() as conn: + conn.execute(sqlalchemy.text("SET search_path TO compliance,public")) + + r = conn.execute(sqlalchemy.text(""" + SELECT control_id, title, + source_citation::text, + source_original_text, + pipeline_version, release_state, + generation_metadata::text + FROM canonical_controls + WHERE source_original_text IS NOT NULL + AND source_original_text != '' + AND source_citation IS NOT NULL + ORDER BY control_id + """)).fetchall() + + # Pattern: standalone recital number like (125)\n or (126) at line start + recital_re = re.compile(r'\((\d{1,3})\)\s*\n') + + # Pattern: article reference like "Artikel 43" in the text + artikel_re = re.compile(r'Artikel\s+(\d+)', re.IGNORECASE) + + suspects_recital = [] + suspects_mismatch = [] + + for row in r: + cid, title, citation_json, orig, pv, state, meta_json = row + if not orig: + continue + + citation = json.loads(citation_json) if citation_json else {} + claimed_article = citation.get("article", "") + + # Check 1: Recital markers in source text + recital_matches = recital_re.findall(orig) + has_recital = len(recital_matches) > 0 + + # Check 2: Text mentions a different article than claimed + artikel_matches = artikel_re.findall(orig) + claimed_num = re.search(r'\d+', claimed_article).group() if re.search(r'\d+', claimed_article) else "" + different_articles = [a for a in artikel_matches if a != claimed_num] if claimed_num else [] + + if has_recital: + suspects_recital.append({ + "control_id": cid, + "title": title[:80], + "claimed_article": claimed_article, + "claimed_paragraph": citation.get("paragraph", ""), + "recitals_found": recital_matches[:5], + "v": pv, + "state": state, + }) + + print(f"=== Ergebnis ===") + print(f"Geprueft: {len(r)} Controls mit source_original_text") + print(f"Erwaegungsgrund-Verdacht: {len(suspects_recital)}") + print() + + if suspects_recital: + print(f"{'Control':<12} {'Behauptet':<18} {'Recitals':<20} {'v':>2} {'State':<15} Titel") + print("-" * 120) + for s in suspects_recital: + recitals = ",".join(s["recitals_found"]) + print(f"{s['control_id']:<12} {s['claimed_article']:<10} {s['claimed_paragraph']:<7} ({recitals}){'':<{max(0,15-len(recitals))}} v{s['v']} {s['state']:<15} {s['title']}")