feat(qa): recital detection, review split, duplicate comparison
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped

Add _detect_recital() to QA pipeline — flags controls where
source_original_text contains Erwägungsgrund markers instead of
article text (28% of controls with source text affected).

- Recital detection via regex + phrase matching in QA validation
- 10 new tests (TestRecitalDetection), 81 total
- ReviewCompare component for side-by-side duplicate comparison
- Review mode split: Duplikat-Verdacht vs Rule-3-ohne-Anchor tabs
- MkDocs: recital detection documentation
- Detection script for bulk analysis (scripts/find_recital_controls.py)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-18 08:20:02 +01:00
parent a9e0869205
commit 148c7ba3af
7 changed files with 657 additions and 28 deletions

View File

@@ -0,0 +1,264 @@
'use client'
import { useState, useEffect } from 'react'
import {
ArrowLeft, CheckCircle2, Trash2, Pencil, SkipForward,
ChevronLeft, Scale, BookOpen, ExternalLink, AlertTriangle,
FileText, Clock,
} from 'lucide-react'
import {
CanonicalControl, BACKEND_URL,
SeverityBadge, StateBadge, LicenseRuleBadge, CategoryBadge, TargetAudienceBadge,
} from './helpers'
// =============================================================================
// Compact Control Panel (used on both sides of the comparison)
// =============================================================================
/**
 * Compact, read-only rendering of a single control, used for each side of the
 * duplicate comparison.
 *
 * Shows a sticky header (label, control id, badges, title) followed by the
 * objective and — only when present / non-empty — rationale, source citation,
 * requirements, test procedure, open anchors and tags.
 *
 * @param ctrl      The control to render.
 * @param label     Caption shown above the control id in the panel header.
 * @param highlight When true, the panel uses the yellow background variant.
 */
function ControlPanel({ ctrl, label, highlight }: { ctrl: CanonicalControl; label: string; highlight?: boolean }) {
  return (
    <div className={`flex flex-col h-full overflow-y-auto ${highlight ? 'bg-yellow-50' : 'bg-white'}`}>
      {/* Panel Header */}
      <div className={`sticky top-0 z-10 px-4 py-3 border-b ${highlight ? 'bg-yellow-100 border-yellow-200' : 'bg-gray-50 border-gray-200'}`}>
        <div className="text-xs font-semibold uppercase tracking-wide text-gray-500 mb-1">{label}</div>
        <div className="flex items-center gap-2 flex-wrap">
          <span className="text-sm font-mono text-purple-600 bg-purple-50 px-2 py-0.5 rounded">{ctrl.control_id}</span>
          <SeverityBadge severity={ctrl.severity} />
          <StateBadge state={ctrl.release_state} />
          <LicenseRuleBadge rule={ctrl.license_rule} />
          <CategoryBadge category={ctrl.category} />
          <TargetAudienceBadge audience={ctrl.target_audience} />
        </div>
        <h3 className="text-sm font-semibold text-gray-900 mt-1 leading-snug">{ctrl.title}</h3>
      </div>

      {/* Panel Content */}
      <div className="p-4 space-y-4 text-sm">
        {/* Objective */}
        <section>
          <h4 className="text-xs font-semibold text-gray-500 uppercase tracking-wide mb-1">Ziel</h4>
          <p className="text-gray-700 leading-relaxed">{ctrl.objective}</p>
        </section>

        {/* Rationale */}
        {ctrl.rationale && (
          <section>
            <h4 className="text-xs font-semibold text-gray-500 uppercase tracking-wide mb-1">Begruendung</h4>
            <p className="text-gray-700 leading-relaxed">{ctrl.rationale}</p>
          </section>
        )}

        {/* Source Citation (Rule 1+2) */}
        {ctrl.source_citation && (
          <section className="bg-blue-50 border border-blue-200 rounded-lg p-3">
            <div className="flex items-center gap-1.5 mb-1">
              <Scale className="w-3.5 h-3.5 text-blue-600" />
              <span className="text-xs font-semibold text-blue-900">Gesetzliche Grundlage</span>
            </div>
            {ctrl.source_citation.source && (
              <p className="text-xs text-blue-800">
                {ctrl.source_citation.source}
                {/* JSX drops the line break between these expressions, so each
                    optional part must carry its own leading separator —
                    otherwise source and article are rendered glued together. */}
                {ctrl.source_citation.article && ` ${ctrl.source_citation.article}`}
                {ctrl.source_citation.paragraph && ` ${ctrl.source_citation.paragraph}`}
              </p>
            )}
          </section>
        )}

        {/* Requirements */}
        {ctrl.requirements.length > 0 && (
          <section>
            <h4 className="text-xs font-semibold text-gray-500 uppercase tracking-wide mb-1">Anforderungen</h4>
            <ol className="list-decimal list-inside space-y-1">
              {ctrl.requirements.map((r, i) => (
                <li key={i} className="text-gray-700 text-xs leading-relaxed">{r}</li>
              ))}
            </ol>
          </section>
        )}

        {/* Test Procedure */}
        {ctrl.test_procedure.length > 0 && (
          <section>
            <h4 className="text-xs font-semibold text-gray-500 uppercase tracking-wide mb-1">Pruefverfahren</h4>
            <ol className="list-decimal list-inside space-y-1">
              {ctrl.test_procedure.map((s, i) => (
                <li key={i} className="text-gray-700 text-xs leading-relaxed">{s}</li>
              ))}
            </ol>
          </section>
        )}

        {/* Open Anchors */}
        {ctrl.open_anchors.length > 0 && (
          <section className="bg-green-50 border border-green-200 rounded-lg p-3">
            <div className="flex items-center gap-1.5 mb-2">
              <BookOpen className="w-3.5 h-3.5 text-green-700" />
              <span className="text-xs font-semibold text-green-900">Referenzen ({ctrl.open_anchors.length})</span>
            </div>
            <div className="space-y-1">
              {ctrl.open_anchors.map((a, i) => (
                <div key={i} className="flex items-center gap-1.5 text-xs">
                  <ExternalLink className="w-3 h-3 text-green-600 flex-shrink-0" />
                  <span className="font-medium text-green-800">{a.framework}</span>
                  <span className="text-green-700">{a.ref}</span>
                </div>
              ))}
            </div>
          </section>
        )}

        {/* Tags */}
        {ctrl.tags.length > 0 && (
          <div className="flex items-center gap-1 flex-wrap">
            {ctrl.tags.map(t => (
              <span key={t} className="px-2 py-0.5 bg-gray-100 text-gray-600 rounded text-xs">{t}</span>
            ))}
          </div>
        )}
      </div>
    </div>
  )
}
// =============================================================================
// ReviewCompare — Side-by-Side Duplicate Comparison
// =============================================================================
/** Props accepted by the ReviewCompare side-by-side duplicate comparison view. */
interface ReviewCompareProps {
/** Control under review; rendered in the left panel and used to look up the suspected duplicate. */
ctrl: CanonicalControl
/** Invoked when the back arrow in the header is clicked. */
onBack: () => void
/** Invoked with the reviewed control's id and the action string ('approve' or 'reject') from the header buttons. */
onReview: (controlId: string, action: string) => void
/** Invoked when the "Bearbeiten" button is clicked. */
onEdit: () => void
/** Zero-based position in the review queue; displayed as reviewIndex + 1. */
reviewIndex: number
/** Number of items in the review queue; shown as the counter's denominator. */
reviewTotal: number
/** Invoked by the "previous" button (disabled while reviewIndex is 0). */
onReviewPrev: () => void
/** Invoked by the "next" button (disabled once reviewIndex reaches reviewTotal - 1). */
onReviewNext: () => void
}
/**
 * Side-by-side duplicate comparison for review mode.
 *
 * The left panel shows the control under review. The right panel shows the
 * first entry of `ctrl.generation_metadata.similar_controls`, fetched from the
 * backend by control id. While the fetch is running a spinner is shown; if no
 * candidate exists or the fetch fails, a placeholder message is shown instead.
 *
 * The header offers queue navigation (prev/next) and the review actions
 * "Behalten" (`onReview(id, 'approve')`), "Duplikat" (`onReview(id, 'reject')`)
 * and "Bearbeiten" (`onEdit`).
 */
export function ReviewCompare({
  ctrl,
  onBack,
  onReview,
  onEdit,
  reviewIndex,
  reviewTotal,
  onReviewPrev,
  onReviewNext,
}: ReviewCompareProps) {
  const [suspectedDuplicate, setSuspectedDuplicate] = useState<CanonicalControl | null>(null)
  const [loading, setLoading] = useState(false)
  const [similarity, setSimilarity] = useState<number | null>(null)

  // Load the suspected duplicate from generation_metadata.similar_controls
  useEffect(() => {
    const rawSimilar: unknown = ctrl.generation_metadata?.similar_controls
    const similarControls = Array.isArray(rawSimilar)
      ? (rawSimilar as Array<{ control_id: string; title: string; similarity: number }>)
      : []
    const suspect = similarControls[0]

    if (!suspect) {
      setSuspectedDuplicate(null)
      setSimilarity(null)
      // A request for the previously shown control may have been cancelled
      // mid-flight by the cleanup below; make sure the spinner does not stick.
      setLoading(false)
      return
    }

    // Guards against a slow response for a previous control overwriting the
    // panel of the control that is currently displayed (fast prev/next paging).
    let stale = false
    const controller = new AbortController()

    // Only keep a numeric score; otherwise the badge would render "NaN%".
    setSimilarity(Number.isFinite(suspect.similarity) ? suspect.similarity : null)
    setSuspectedDuplicate(null)
    setLoading(true)

    const loadDuplicate = async () => {
      try {
        const res = await fetch(
          `${BACKEND_URL}?endpoint=control&id=${encodeURIComponent(suspect.control_id)}`,
          { signal: controller.signal },
        )
        const data: CanonicalControl | null = res.ok ? await res.json() : null
        if (!stale) setSuspectedDuplicate(data)
      } catch {
        // Network error or abort: fall back to the "no candidate" placeholder.
        if (!stale) setSuspectedDuplicate(null)
      } finally {
        if (!stale) setLoading(false)
      }
    }
    void loadDuplicate()

    return () => {
      stale = true
      controller.abort()
    }
  }, [ctrl.control_id, ctrl.generation_metadata])

  return (
    <div className="flex flex-col h-full">
      {/* Header */}
      <div className="border-b border-gray-200 bg-white px-6 py-3 flex items-center justify-between">
        <div className="flex items-center gap-3">
          <button onClick={onBack} className="text-gray-400 hover:text-gray-600">
            <ArrowLeft className="w-5 h-5" />
          </button>
          <div>
            <div className="flex items-center gap-2">
              <AlertTriangle className="w-4 h-4 text-amber-500" />
              <span className="text-sm font-semibold text-gray-900">Duplikat-Vergleich</span>
              {similarity !== null && (
                <span className="text-xs font-medium text-amber-600 bg-amber-50 px-2 py-0.5 rounded-full">
                  {(similarity * 100).toFixed(1)}% Aehnlichkeit
                </span>
              )}
            </div>
          </div>
        </div>
        <div className="flex items-center gap-2">
          {/* Navigation */}
          <div className="flex items-center gap-1 mr-3">
            <button onClick={onReviewPrev} disabled={reviewIndex === 0} className="p-1 text-gray-400 hover:text-gray-600 disabled:opacity-30">
              <ChevronLeft className="w-4 h-4" />
            </button>
            <span className="text-xs text-gray-500 font-medium">{reviewIndex + 1} / {reviewTotal}</span>
            <button onClick={onReviewNext} disabled={reviewIndex >= reviewTotal - 1} className="p-1 text-gray-400 hover:text-gray-600 disabled:opacity-30">
              <SkipForward className="w-4 h-4" />
            </button>
          </div>
          {/* Actions */}
          <button
            onClick={() => onReview(ctrl.control_id, 'approve')}
            className="px-3 py-1.5 text-sm text-white bg-green-600 rounded-lg hover:bg-green-700"
          >
            <CheckCircle2 className="w-3.5 h-3.5 inline mr-1" />Behalten
          </button>
          <button
            onClick={() => onReview(ctrl.control_id, 'reject')}
            className="px-3 py-1.5 text-sm text-white bg-red-600 rounded-lg hover:bg-red-700"
          >
            <Trash2 className="w-3.5 h-3.5 inline mr-1" />Duplikat
          </button>
          <button
            onClick={onEdit}
            className="px-3 py-1.5 text-sm text-gray-600 border border-gray-300 rounded-lg hover:bg-gray-50"
          >
            <Pencil className="w-3.5 h-3.5 inline mr-1" />Bearbeiten
          </button>
        </div>
      </div>

      {/* Side-by-Side Panels */}
      <div className="flex-1 flex overflow-hidden">
        {/* Left: Control to review */}
        <div className="w-1/2 border-r border-gray-200 overflow-y-auto">
          <ControlPanel ctrl={ctrl} label="Zu pruefen" highlight />
        </div>
        {/* Right: Suspected duplicate */}
        <div className="w-1/2 overflow-y-auto">
          {loading ? (
            <div className="flex items-center justify-center h-full">
              <div className="animate-spin rounded-full h-6 w-6 border-2 border-purple-600 border-t-transparent" />
            </div>
          ) : suspectedDuplicate ? (
            <ControlPanel ctrl={suspectedDuplicate} label="Bestehendes Control (Verdacht)" />
          ) : (
            <div className="flex items-center justify-center h-full text-gray-400 text-sm">
              Kein Duplikat-Kandidat gefunden
            </div>
          )}
        </div>
      </div>
    </div>
  )
}

View File

@@ -14,6 +14,7 @@ import {
} from './components/helpers'
import { ControlForm } from './components/ControlForm'
import { ControlDetail } from './components/ControlDetail'
import { ReviewCompare } from './components/ReviewCompare'
import { GeneratorModal } from './components/GeneratorModal'
// =============================================================================
@@ -71,6 +72,9 @@ export default function ControlLibraryPage() {
const [reviewIndex, setReviewIndex] = useState(0)
const [reviewItems, setReviewItems] = useState<CanonicalControl[]>([])
const [reviewCount, setReviewCount] = useState(0)
const [reviewTab, setReviewTab] = useState<'duplicates' | 'rule3'>('duplicates')
const [reviewDuplicates, setReviewDuplicates] = useState<CanonicalControl[]>([])
const [reviewRule3, setReviewRule3] = useState<CanonicalControl[]>([])
// Debounce search
const searchTimer = useRef<ReturnType<typeof setTimeout> | null>(null)
@@ -303,20 +307,47 @@ export default function ControlLibraryPage() {
// Enters review mode: fetches all controls in release_state=needs_review,
// splits them into "duplicate suspects" (generation_metadata.similar_controls
// is a non-empty array) and the remainder ("rule 3 without anchor"), stores
// both lists, opens the first non-empty tab and selects its first control in
// detail mode. Nothing happens when the response is not ok or the list is empty.
// NOTE(review): the paired `const res` / `const items` declarations and the two
// setReviewItems / setSelectedControl calls look like before/after lines of a
// rendered diff rather than live code — confirm which variant is current.
const enterReviewMode = async () => {
// Load review items from backend
try {
const res = await fetch(`${BACKEND_URL}?endpoint=controls&release_state=needs_review&limit=200`)
const res = await fetch(`${BACKEND_URL}?endpoint=controls&release_state=needs_review&limit=1000`)
if (res.ok) {
const items = await res.json()
const items: CanonicalControl[] = await res.json()
if (items.length > 0) {
setReviewItems(items)
// Split into duplicate suspects vs rule 3 without anchor
// (the two filters below are exact complements, so every item lands in exactly one list)
const dupes = items.filter(c =>
c.generation_metadata?.similar_controls &&
Array.isArray(c.generation_metadata.similar_controls) &&
(c.generation_metadata.similar_controls as unknown[]).length > 0
)
const rule3 = items.filter(c =>
!c.generation_metadata?.similar_controls ||
!Array.isArray(c.generation_metadata.similar_controls) ||
(c.generation_metadata.similar_controls as unknown[]).length === 0
)
setReviewDuplicates(dupes)
setReviewRule3(rule3)
// Start with duplicates tab if any, otherwise rule3
// (items is non-empty here, so the chosen list always has a first element)
const startTab = dupes.length > 0 ? 'duplicates' : 'rule3'
const startItems = startTab === 'duplicates' ? dupes : rule3
setReviewTab(startTab)
setReviewItems(startItems)
setReviewMode(true)
setReviewIndex(0)
setSelectedControl(items[0])
setSelectedControl(startItems[0])
setMode('detail')
}
}
// Fetch/parse failures are swallowed: the UI simply stays in its current mode.
} catch { /* ignore */ }
}
// Switches the review queue to the given tab and selects its first control.
// Switching to an empty tab is a no-op: otherwise the tab and queue would
// change while the previously selected control stays on screen (for
// 'duplicates' that means a compare view for a control without duplicate
// candidates and a "1 / 0" position counter).
const switchReviewTab = (tab: 'duplicates' | 'rule3') => {
  const items = tab === 'duplicates' ? reviewDuplicates : reviewRule3
  if (items.length === 0) {
    return
  }
  setReviewTab(tab)
  setReviewItems(items)
  setReviewIndex(0)
  setSelectedControl(items[0])
}
// Loading
if (loading && controls.length === 0) {
return (
@@ -363,7 +394,66 @@ export default function ControlLibraryPage() {
// DETAIL MODE
if (mode === 'detail' && selectedControl) {
const isDuplicateReview = reviewMode && reviewTab === 'duplicates'
// Review tab bar (shown above the detail/compare view in review mode)
const reviewTabBar = reviewMode ? (
<div className="border-b border-gray-200 bg-white px-6 py-2 flex items-center gap-4">
<button
onClick={() => switchReviewTab('duplicates')}
className={`px-3 py-1.5 text-sm rounded-lg font-medium ${
reviewTab === 'duplicates'
? 'bg-amber-100 text-amber-800 border border-amber-300'
: 'text-gray-500 hover:text-gray-700 hover:bg-gray-100'
}`}
>
Duplikat-Verdacht ({reviewDuplicates.length})
</button>
<button
onClick={() => switchReviewTab('rule3')}
className={`px-3 py-1.5 text-sm rounded-lg font-medium ${
reviewTab === 'rule3'
? 'bg-purple-100 text-purple-800 border border-purple-300'
: 'text-gray-500 hover:text-gray-700 hover:bg-gray-100'
}`}
>
Rule 3 ohne Anchor ({reviewRule3.length})
</button>
</div>
) : null
if (isDuplicateReview) {
return (
<div className="flex flex-col h-full">
{reviewTabBar}
<div className="flex-1 overflow-hidden">
<ReviewCompare
ctrl={selectedControl}
onBack={() => { setMode('list'); setSelectedControl(null); setReviewMode(false) }}
onReview={handleReview}
onEdit={() => setMode('edit')}
reviewIndex={reviewIndex}
reviewTotal={reviewItems.length}
onReviewPrev={() => {
const idx = Math.max(0, reviewIndex - 1)
setReviewIndex(idx)
setSelectedControl(reviewItems[idx])
}}
onReviewNext={() => {
const idx = Math.min(reviewItems.length - 1, reviewIndex + 1)
setReviewIndex(idx)
setSelectedControl(reviewItems[idx])
}}
/>
</div>
</div>
)
}
return (
<div className="flex flex-col h-full">
{reviewTabBar}
<div className="flex-1 overflow-hidden">
<ControlDetail
ctrl={selectedControl}
onBack={() => { setMode('list'); setSelectedControl(null); setReviewMode(false) }}
@@ -385,6 +475,8 @@ export default function ControlLibraryPage() {
setSelectedControl(reviewItems[idx])
}}
/>
</div>
</div>
)
}

View File

@@ -321,6 +321,62 @@ VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys())
VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
"AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
# ---------------------------------------------------------------------------
# Recital (Erwägungsgrund) detection in source text
# ---------------------------------------------------------------------------
# Pattern: standalone recital number like (125)\n or (126) at line start
_RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')
# Recital-typical phrasing (German EU law Erwägungsgründe)
_RECITAL_PHRASES = [
"in erwägung nachstehender gründe",
"erwägungsgrund",
"in anbetracht",
"daher sollte",
"aus diesem grund",
"es ist daher",
"folglich sollte",
"es sollte daher",
"in diesem zusammenhang",
]
def _detect_recital(text: str) -> Optional[dict]:
"""Detect if source text is a recital (Erwägungsgrund) rather than an article.
Returns a dict with detection details if recital markers are found,
or None if the text appears to be genuine article text.
Detection criteria:
1. Standalone recital numbers like (126)\\n in the text
2. Recital-typical phrasing ("daher sollte", "erwägungsgrund", etc.)
"""
if not text:
return None
# Check 1: Recital number markers
recital_matches = _RECITAL_RE.findall(text)
# Check 2: Recital phrasing
text_lower = text.lower()
phrase_hits = [p for p in _RECITAL_PHRASES if p in text_lower]
if not recital_matches and not phrase_hits:
return None
# Require at least recital numbers OR >=2 phrase hits to be a suspect
if not recital_matches and len(phrase_hits) < 2:
return None
return {
"recital_suspect": True,
"recital_numbers": recital_matches[:10],
"recital_phrases": phrase_hits[:5],
"detection_method": "regex+phrases" if recital_matches and phrase_hits
else "regex" if recital_matches else "phrases",
}
CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))
VERIFICATION_KEYWORDS = {
@@ -1520,9 +1576,23 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
) -> tuple[GeneratedControl, bool]:
"""Cross-validate category/domain using keyword detection + local LLM.
Also checks for recital (Erwägungsgrund) contamination in source text.
Returns (control, was_fixed). Only triggers Ollama QA when the LLM
classification disagrees with keyword detection — keeps it fast.
"""
# ── Recital detection ──────────────────────────────────────────
source_text = control.source_original_text or ""
recital_info = _detect_recital(source_text)
if recital_info:
control.generation_metadata["recital_suspect"] = True
control.generation_metadata["recital_detection"] = recital_info
control.release_state = "needs_review"
logger.warning(
"Recital suspect: '%s' — recitals %s detected in source text",
control.title[:40],
recital_info.get("recital_numbers", []),
)
kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
kw_domain = _detect_domain(chunk_text)
llm_domain = control.generation_metadata.get("_effective_domain", "")

View File

@@ -7,6 +7,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
from compliance.services.control_generator import (
_classify_regulation,
_detect_domain,
_detect_recital,
_parse_llm_json,
_parse_llm_json_array,
GeneratorConfig,
@@ -1306,3 +1307,92 @@ class TestPipelineVersion:
assert controls[0] is not None
assert controls[1] is None # Null entry from LLM
assert controls[2] is not None
# =============================================================================
# Recital (Erwägungsgrund) Detection Tests
# =============================================================================
class TestRecitalDetection:
    """Behavioural checks for _detect_recital (recital vs. article source text)."""

    def test_recital_number_detected(self):
        """A "(126)" marker followed by a newline marks the text as a suspect."""
        source = "Daher ist es wichtig...\n(126)\nDie Konformitätsbewertung sollte..."
        detection = _detect_recital(source)
        assert detection is not None
        assert detection["recital_suspect"] is True
        assert "126" in detection["recital_numbers"]

    def test_multiple_recital_numbers(self):
        """Every marker in the text shows up in recital_numbers."""
        source = "(124)\nErster Punkt.\n(125)\nZweiter Punkt.\n(126)\nDritter Punkt."
        detection = _detect_recital(source)
        assert detection is not None
        for expected in ("124", "125", "126"):
            assert expected in detection["recital_numbers"]

    def test_article_text_not_flagged(self):
        """Plain article wording without markers or phrases yields None."""
        source = ("Der Anbieter eines Hochrisiko-KI-Systems muss sicherstellen, "
                  "dass die technische Dokumentation erstellt wird.")
        assert _detect_recital(source) is None

    def test_empty_text_returns_none(self):
        """The empty string is never a suspect."""
        assert _detect_recital("") is None

    def test_none_text_returns_none(self):
        """None is tolerated and never a suspect."""
        assert _detect_recital(None) is None

    def test_recital_phrases_detected(self):
        """Several recital-typical phrases without a marker are enough."""
        source = ("In Erwägung nachstehender Gründe wurde beschlossen, "
                  "daher sollte der Anbieter folgende Maßnahmen ergreifen. "
                  "Es ist daher notwendig, die Konformität sicherzustellen.")
        detection = _detect_recital(source)
        assert detection is not None
        assert detection["detection_method"] == "phrases"

    def test_single_phrase_not_enough(self):
        """One recital phrase on its own does not trigger detection."""
        source = "Daher sollte das System regelmäßig geprüft werden."
        assert _detect_recital(source) is None

    def test_combined_regex_and_phrases(self):
        """Marker plus phrase reports the combined detection method."""
        source = "(42)\nIn Erwägung nachstehender Gründe wurde entschieden..."
        detection = _detect_recital(source)
        assert detection is not None
        assert detection["detection_method"] == "regex+phrases"
        assert "42" in detection["recital_numbers"]

    def test_parenthesized_number_without_newline_ignored(self):
        """An inline "(3)" without a trailing newline is a paragraph reference,
        not a recital marker, and must not be flagged."""
        source = "Gemäß Absatz (3) des Artikels 52 muss der Anbieter sicherstellen..."
        assert _detect_recital(source) is None

    def test_real_world_recital_text(self):
        """AI Act recitals (126) and (127) on conformity assessment are both found."""
        source = (
            "(126)\n"
            "Um den Verwaltungsaufwand zu verringern und die Konformitätsbewertung "
            "zu vereinfachen, sollten bestimmte Hochrisiko-KI-Systeme, die von "
            "Anbietern zertifiziert oder für die eine Konformitätserklärung "
            "ausgestellt wurde, automatisch als konform mit den Anforderungen "
            "dieser Verordnung gelten, sofern sie den harmonisierten Normen oder "
            "gemeinsamen Spezifikationen entsprechen.\n"
            "(127)\n"
            "Es ist daher angezeigt, dass der Anbieter das entsprechende "
            "Konformitätsbewertungsverfahren anwendet."
        )
        detection = _detect_recital(source)
        assert detection is not None
        assert "126" in detection["recital_numbers"]
        assert "127" in detection["recital_numbers"]

View File

@@ -214,13 +214,13 @@ Wenn du z.B. eine neue `GetUserStats()` Funktion im Go Service hinzufuegst:
## Modul-spezifische Tests
### Canonical Control Generator (71+ Tests)
### Canonical Control Generator (81+ Tests)
Die Control Library hat eine umfangreiche Test-Suite ueber 6 Dateien.
Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) und [Control Generator Pipeline](../services/sdk-modules/control-generator-pipeline.md) fuer Details.
```bash
# Alle Generator-Tests (71 Tests in 10 Klassen)
# Alle Generator-Tests (81 Tests in 12 Klassen)
cd backend-compliance && pytest -v tests/test_control_generator.py
# Similarity Detector Tests
@@ -253,3 +253,4 @@ cd backend-compliance && pytest -v tests/test_validate_controls.py
| `TestBatchProcessingLoop` | 10 | Batch-Verarbeitung (Rule-Split, Mixed-Rules, Too-Close, Null-Handling) |
| `TestRegulationFilter` | 5 | regulation_filter Prefix-Matching, leere regulation_codes |
| `TestPipelineVersion` | 5 | pipeline_version=2 in DB-Writes, null-Handling in Structure/Reform |
| `TestRecitalDetection` | 10 | Erwaegungsgrund-Erkennung in Quelltexten (Regex, Phrasen, Kombiniert) |

View File

@@ -500,6 +500,39 @@ Die QA-Metriken werden in `generation_metadata` gespeichert:
}
```
### Recital-Erkennung (Erwägungsgrund-Detektion)
Die QA-Stufe prueft zusaetzlich, ob der `source_original_text` eines Controls tatsaechlich aus einem Gesetzesartikel stammt — oder aus einem Erwaegungsgrund (Recital). Erwaegungsgruende enthalten keine normativen Pflichten und fuehren zu falsch zugeordneten Controls.
**Erkennungsmethoden:**
| Methode | Pattern | Beispiel |
|---------|---------|----------|
| **Regex** | `\((\d{1,3})\)\s*\n` — Erwaegungsgrund-Nummern | `(126)\nUm den Verwaltungsaufwand...` |
| **Phrasen** | Typische Recital-Formulierungen (≥2 Treffer) | "daher sollte", "in Erwägung nachstehender Gründe" |
**Ergebnis bei Verdacht:**
- `release_state` wird auf `needs_review` gesetzt
- `generation_metadata.recital_suspect = true`
- `generation_metadata.recital_detection` enthaelt Details:
```json
{
"recital_suspect": true,
"recital_detection": {
"recital_suspect": true,
"recital_numbers": ["126", "127"],
"recital_phrases": ["daher sollte"],
"detection_method": "regex+phrases"
}
}
```
**Funktion:** `_detect_recital(text)` in `control_generator.py`
**Hintergrund:** Bei der Analyse von ~5.500 Controls mit Quelltext wurden 1.555 (28%) als Erwaegungsgrund-Verdacht identifiziert. Der Document Crawler unterschied nicht zwischen Artikeltext und Erwaegungsgruenden, was zu falschen `article`/`paragraph`-Zuordnungen fuehrte.
### QA-Reklassifizierung bestehender Controls
```bash
@@ -530,7 +563,7 @@ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/
| `backend-compliance/migrations/046_control_generator.sql` | Job-Tracking, Chunk-Tracking Tabellen |
| `backend-compliance/migrations/048_processing_path_expand.sql` | Erweiterte Processing-Path-Werte |
| `backend-compliance/migrations/062_pipeline_version.sql` | `pipeline_version` Spalte |
| `backend-compliance/tests/test_control_generator.py` | 15 Tests (Lizenz, Domain, Batch, Pipeline) |
| `backend-compliance/tests/test_control_generator.py` | 81+ Tests (Lizenz, Domain, Batch, Pipeline, Recital) |
---

View File

@@ -0,0 +1,79 @@
"""Find controls where source_original_text contains Erwägungsgrund (recital) markers
instead of actual article text — indicates wrong article tagging in RAG chunks."""
import json
import os
import re
import sys

# Pattern: standalone recital number on its own line, e.g. "(125)\n".
# Anchored to the line start (MULTILINE) so a paragraph reference that merely
# ends a wrapped line ("... Absatz (3)\n") is not counted as a recital marker.
RECITAL_RE = re.compile(r'^[ \t]*\((\d{1,3})\)\s*\n', re.MULTILINE)

CONTROLS_QUERY = """
SELECT control_id, title,
source_citation::text,
source_original_text,
pipeline_version, release_state,
generation_metadata::text
FROM canonical_controls
WHERE source_original_text IS NOT NULL
AND source_original_text != ''
AND source_citation IS NOT NULL
ORDER BY control_id
"""


def find_recital_numbers(text):
    """Return all standalone recital numbers (as strings) found in ``text``."""
    return RECITAL_RE.findall(text or "")


def parse_citation(citation_json):
    """Decode the source_citation JSON text; return {} for null, invalid or non-object values."""
    if not citation_json:
        return {}
    try:
        parsed = json.loads(citation_json)
    except (TypeError, ValueError):
        return {}
    return parsed if isinstance(parsed, dict) else {}


def build_suspect(row):
    """Turn one result row into a suspect record, or None if it has no recital marker.

    ``row`` is the 7-tuple selected by CONTROLS_QUERY. NULL title / article /
    paragraph / state values are normalised to "" so the record can always be
    formatted.
    """
    cid, title, citation_json, orig, pv, state, _meta_json = row
    recital_matches = find_recital_numbers(orig)
    if not recital_matches:
        return None
    citation = parse_citation(citation_json)
    return {
        "control_id": cid,
        "title": (title or "")[:80],
        "claimed_article": str(citation.get("article") or ""),
        "claimed_paragraph": str(citation.get("paragraph") or ""),
        "recitals_found": recital_matches[:5],
        "v": pv,
        "state": state or "",
    }


def format_suspect(s):
    """Format one suspect record as a fixed-width report line."""
    recitals = ",".join(s["recitals_found"])
    padding = " " * max(0, 15 - len(recitals))
    return (
        f"{s['control_id']:<12} {s['claimed_article']:<10} {s['claimed_paragraph']:<7} "
        f"({recitals}){padding} v{s['v']} {s['state']:<15} {s['title']}"
    )


def fetch_rows(url):
    """Load all controls that have both source text and a source citation."""
    # Imported here so the pure helpers above stay usable without a DB driver.
    import sqlalchemy

    engine = sqlalchemy.create_engine(url)
    try:
        with engine.connect() as conn:
            conn.execute(sqlalchemy.text("SET search_path TO compliance,public"))
            return conn.execute(sqlalchemy.text(CONTROLS_QUERY)).fetchall()
    finally:
        engine.dispose()


def main():
    """Scan the database and print the recital-suspect report; returns the exit code."""
    url = os.environ.get("DATABASE_URL", "")
    if not url:
        print("DATABASE_URL not set")
        return 1

    rows = fetch_rows(url)
    suspects_recital = [s for s in (build_suspect(row) for row in rows) if s is not None]

    print("=== Ergebnis ===")
    print(f"Geprueft: {len(rows)} Controls mit source_original_text")
    print(f"Erwaegungsgrund-Verdacht: {len(suspects_recital)}")
    print()
    if suspects_recital:
        print(f"{'Control':<12} {'Behauptet':<18} {'Recitals':<20} {'v':>2} {'State':<15} Titel")
        print("-" * 120)
        for s in suspects_recital:
            print(format_suspect(s))
    return 0


if __name__ == "__main__":
    sys.exit(main())