feat(qa): recital detection, review split, duplicate comparison
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Add _detect_recital() to QA pipeline — flags controls where source_original_text contains Erwägungsgrund markers instead of article text (28% of controls with source text affected). - Recital detection via regex + phrase matching in QA validation - 10 new tests (TestRecitalDetection), 81 total - ReviewCompare component for side-by-side duplicate comparison - Review mode split: Duplikat-Verdacht vs Rule-3-ohne-Anchor tabs - MkDocs: recital detection documentation - Detection script for bulk analysis (scripts/find_recital_controls.py) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,264 @@
|
||||
'use client'
|
||||
|
||||
import { useState, useEffect } from 'react'
|
||||
import {
|
||||
ArrowLeft, CheckCircle2, Trash2, Pencil, SkipForward,
|
||||
ChevronLeft, Scale, BookOpen, ExternalLink, AlertTriangle,
|
||||
FileText, Clock,
|
||||
} from 'lucide-react'
|
||||
import {
|
||||
CanonicalControl, BACKEND_URL,
|
||||
SeverityBadge, StateBadge, LicenseRuleBadge, CategoryBadge, TargetAudienceBadge,
|
||||
} from './helpers'
|
||||
|
||||
// =============================================================================
|
||||
// Compact Control Panel (used on both sides of the comparison)
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Compact rendering of a single control: header with id, badges and title,
 * followed by objective, rationale, legal basis, requirements, test procedure,
 * reference anchors and tags. Optional sections are only rendered when the
 * control carries the corresponding data.
 *
 * `highlight` switches the panel and its sticky header to the yellow scheme.
 */
function ControlPanel({ ctrl, label, highlight }: { ctrl: CanonicalControl; label: string; highlight?: boolean }) {
  const citation = ctrl.source_citation

  // Colour scheme depends only on the highlight flag.
  const panelTone = highlight ? 'bg-yellow-50' : 'bg-white'
  const headerTone = highlight ? 'bg-yellow-100 border-yellow-200' : 'bg-gray-50 border-gray-200'

  // Class lists shared by several sections.
  const headingClass = 'text-xs font-semibold text-gray-500 uppercase tracking-wide mb-1'
  const paragraphClass = 'text-gray-700 leading-relaxed'
  const stepClass = 'text-gray-700 text-xs leading-relaxed'
  const stepListClass = 'list-decimal list-inside space-y-1'

  return (
    <div className={`flex flex-col h-full overflow-y-auto ${panelTone}`}>
      {/* Sticky header: label, id, badges, title */}
      <div className={`sticky top-0 z-10 px-4 py-3 border-b ${headerTone}`}>
        <div className="text-xs font-semibold uppercase tracking-wide text-gray-500 mb-1">{label}</div>
        <div className="flex items-center gap-2 flex-wrap">
          <span className="text-sm font-mono text-purple-600 bg-purple-50 px-2 py-0.5 rounded">{ctrl.control_id}</span>
          <SeverityBadge severity={ctrl.severity} />
          <StateBadge state={ctrl.release_state} />
          <LicenseRuleBadge rule={ctrl.license_rule} />
          <CategoryBadge category={ctrl.category} />
          <TargetAudienceBadge audience={ctrl.target_audience} />
        </div>
        <h3 className="text-sm font-semibold text-gray-900 mt-1 leading-snug">{ctrl.title}</h3>
      </div>

      {/* Body */}
      <div className="p-4 space-y-4 text-sm">
        {/* Objective (always shown) */}
        <section>
          <h4 className={headingClass}>Ziel</h4>
          <p className={paragraphClass}>{ctrl.objective}</p>
        </section>

        {/* Rationale */}
        {ctrl.rationale && (
          <section>
            <h4 className={headingClass}>Begruendung</h4>
            <p className={paragraphClass}>{ctrl.rationale}</p>
          </section>
        )}

        {/* Legal basis (source citation) */}
        {citation && (
          <section className="bg-blue-50 border border-blue-200 rounded-lg p-3">
            <div className="flex items-center gap-1.5 mb-1">
              <Scale className="w-3.5 h-3.5 text-blue-600" />
              <span className="text-xs font-semibold text-blue-900">Gesetzliche Grundlage</span>
            </div>
            {citation.source && (
              <p className="text-xs text-blue-800">
                {citation.source}
                {citation.article && ` — ${citation.article}`}
                {citation.paragraph && ` ${citation.paragraph}`}
              </p>
            )}
          </section>
        )}

        {/* Requirements */}
        {ctrl.requirements.length > 0 && (
          <section>
            <h4 className={headingClass}>Anforderungen</h4>
            <ol className={stepListClass}>
              {ctrl.requirements.map((requirement, idx) => (
                <li key={idx} className={stepClass}>{requirement}</li>
              ))}
            </ol>
          </section>
        )}

        {/* Test procedure */}
        {ctrl.test_procedure.length > 0 && (
          <section>
            <h4 className={headingClass}>Pruefverfahren</h4>
            <ol className={stepListClass}>
              {ctrl.test_procedure.map((step, idx) => (
                <li key={idx} className={stepClass}>{step}</li>
              ))}
            </ol>
          </section>
        )}

        {/* Open anchors (framework references) */}
        {ctrl.open_anchors.length > 0 && (
          <section className="bg-green-50 border border-green-200 rounded-lg p-3">
            <div className="flex items-center gap-1.5 mb-2">
              <BookOpen className="w-3.5 h-3.5 text-green-700" />
              <span className="text-xs font-semibold text-green-900">Referenzen ({ctrl.open_anchors.length})</span>
            </div>
            <div className="space-y-1">
              {ctrl.open_anchors.map((anchor, idx) => (
                <div key={idx} className="flex items-center gap-1.5 text-xs">
                  <ExternalLink className="w-3 h-3 text-green-600 flex-shrink-0" />
                  <span className="font-medium text-green-800">{anchor.framework}</span>
                  <span className="text-green-700">{anchor.ref}</span>
                </div>
              ))}
            </div>
          </section>
        )}

        {/* Tags */}
        {ctrl.tags.length > 0 && (
          <div className="flex items-center gap-1 flex-wrap">
            {ctrl.tags.map(tag => (
              <span key={tag} className="px-2 py-0.5 bg-gray-100 text-gray-600 rounded text-xs">{tag}</span>
            ))}
          </div>
        )}
      </div>
    </div>
  )
}
|
||||
|
||||
// =============================================================================
|
||||
// ReviewCompare — Side-by-Side Duplicate Comparison
|
||||
// =============================================================================
|
||||
|
||||
/** Props of the side-by-side duplicate review view. */
interface ReviewCompareProps {
  /** Control currently under review (rendered in the left panel). */
  ctrl: CanonicalControl
  /** Invoked by the back arrow in the header. */
  onBack: () => void
  /** Invoked with the reviewed control id and 'approve' (Behalten) or 'reject' (Duplikat). */
  onReview: (controlId: string, action: string) => void
  /** Invoked by the "Bearbeiten" button. */
  onEdit: () => void
  /** Zero-based position of `ctrl` in the review list. */
  reviewIndex: number
  /** Number of controls in the review list. */
  reviewTotal: number
  /** Navigate to the previous control in the review list. */
  onReviewPrev: () => void
  /** Navigate to the next control in the review list. */
  onReviewNext: () => void
}

/**
 * Side-by-side comparison of a control under review (left) with the first
 * entry of its `generation_metadata.similar_controls` (right), which is loaded
 * from the backend whenever the reviewed control changes.
 */
export function ReviewCompare({
  ctrl,
  onBack,
  onReview,
  onEdit,
  reviewIndex,
  reviewTotal,
  onReviewPrev,
  onReviewNext,
}: ReviewCompareProps) {
  const [suspectedDuplicate, setSuspectedDuplicate] = useState<CanonicalControl | null>(null)
  const [loading, setLoading] = useState(false)
  const [similarity, setSimilarity] = useState<number | null>(null)

  // Load the suspected duplicate from generation_metadata.similar_controls
  useEffect(() => {
    const similarControls = ctrl.generation_metadata?.similar_controls as Array<{ control_id: string; title: string; similarity: number }> | undefined

    if (!similarControls || similarControls.length === 0) {
      setSuspectedDuplicate(null)
      setSimilarity(null)
      // A request for the previous control may have been aborted mid-flight
      // without resetting the spinner, so clear it explicitly.
      setLoading(false)
      return
    }

    const suspect = similarControls[0]
    // Guard against a missing/non-numeric score so the badge never shows "NaN%".
    setSimilarity(typeof suspect.similarity === 'number' ? suspect.similarity : null)
    setLoading(true)

    // Aborted in the cleanup below: when the reviewer navigates before the
    // response arrives, the outdated request must not overwrite the state
    // that belongs to the newly selected control.
    const controller = new AbortController()

    const loadDuplicate = async () => {
      try {
        const res = await fetch(
          `${BACKEND_URL}?endpoint=control&id=${encodeURIComponent(suspect.control_id)}`,
          { signal: controller.signal },
        )
        const data = res.ok ? await res.json() : null
        if (!controller.signal.aborted) {
          setSuspectedDuplicate(data)
        }
      } catch {
        // Network error or abort: only a still-current request may clear the panel.
        if (!controller.signal.aborted) {
          setSuspectedDuplicate(null)
        }
      } finally {
        if (!controller.signal.aborted) {
          setLoading(false)
        }
      }
    }

    void loadDuplicate()

    return () => controller.abort()
  }, [ctrl.control_id, ctrl.generation_metadata])

  return (
    <div className="flex flex-col h-full">
      {/* Header */}
      <div className="border-b border-gray-200 bg-white px-6 py-3 flex items-center justify-between">
        <div className="flex items-center gap-3">
          <button onClick={onBack} aria-label="Zurueck" className="text-gray-400 hover:text-gray-600">
            <ArrowLeft className="w-5 h-5" />
          </button>
          <div>
            <div className="flex items-center gap-2">
              <AlertTriangle className="w-4 h-4 text-amber-500" />
              <span className="text-sm font-semibold text-gray-900">Duplikat-Vergleich</span>
              {similarity !== null && (
                <span className="text-xs font-medium text-amber-600 bg-amber-50 px-2 py-0.5 rounded-full">
                  {(similarity * 100).toFixed(1)}% Aehnlichkeit
                </span>
              )}
            </div>
          </div>
        </div>

        <div className="flex items-center gap-2">
          {/* Navigation */}
          <div className="flex items-center gap-1 mr-3">
            <button onClick={onReviewPrev} disabled={reviewIndex === 0} aria-label="Vorheriges Control" className="p-1 text-gray-400 hover:text-gray-600 disabled:opacity-30">
              <ChevronLeft className="w-4 h-4" />
            </button>
            <span className="text-xs text-gray-500 font-medium">{reviewIndex + 1} / {reviewTotal}</span>
            <button onClick={onReviewNext} disabled={reviewIndex >= reviewTotal - 1} aria-label="Naechstes Control" className="p-1 text-gray-400 hover:text-gray-600 disabled:opacity-30">
              <SkipForward className="w-4 h-4" />
            </button>
          </div>

          {/* Actions */}
          <button
            onClick={() => onReview(ctrl.control_id, 'approve')}
            className="px-3 py-1.5 text-sm text-white bg-green-600 rounded-lg hover:bg-green-700"
          >
            <CheckCircle2 className="w-3.5 h-3.5 inline mr-1" />Behalten
          </button>
          <button
            onClick={() => onReview(ctrl.control_id, 'reject')}
            className="px-3 py-1.5 text-sm text-white bg-red-600 rounded-lg hover:bg-red-700"
          >
            <Trash2 className="w-3.5 h-3.5 inline mr-1" />Duplikat
          </button>
          <button
            onClick={onEdit}
            className="px-3 py-1.5 text-sm text-gray-600 border border-gray-300 rounded-lg hover:bg-gray-50"
          >
            <Pencil className="w-3.5 h-3.5 inline mr-1" />Bearbeiten
          </button>
        </div>
      </div>

      {/* Side-by-Side Panels */}
      <div className="flex-1 flex overflow-hidden">
        {/* Left: Control to review */}
        <div className="w-1/2 border-r border-gray-200 overflow-y-auto">
          <ControlPanel ctrl={ctrl} label="Zu pruefen" highlight />
        </div>

        {/* Right: Suspected duplicate (spinner while loading, placeholder when none) */}
        <div className="w-1/2 overflow-y-auto">
          {loading ? (
            <div className="flex items-center justify-center h-full">
              <div className="animate-spin rounded-full h-6 w-6 border-2 border-purple-600 border-t-transparent" />
            </div>
          ) : suspectedDuplicate ? (
            <ControlPanel ctrl={suspectedDuplicate} label="Bestehendes Control (Verdacht)" />
          ) : (
            <div className="flex items-center justify-center h-full text-gray-400 text-sm">
              Kein Duplikat-Kandidat gefunden
            </div>
          )}
        </div>
      </div>
    </div>
  )
}
|
||||
@@ -14,6 +14,7 @@ import {
|
||||
} from './components/helpers'
|
||||
import { ControlForm } from './components/ControlForm'
|
||||
import { ControlDetail } from './components/ControlDetail'
|
||||
import { ReviewCompare } from './components/ReviewCompare'
|
||||
import { GeneratorModal } from './components/GeneratorModal'
|
||||
|
||||
// =============================================================================
|
||||
@@ -71,6 +72,9 @@ export default function ControlLibraryPage() {
|
||||
const [reviewIndex, setReviewIndex] = useState(0)
|
||||
const [reviewItems, setReviewItems] = useState<CanonicalControl[]>([])
|
||||
const [reviewCount, setReviewCount] = useState(0)
|
||||
const [reviewTab, setReviewTab] = useState<'duplicates' | 'rule3'>('duplicates')
|
||||
const [reviewDuplicates, setReviewDuplicates] = useState<CanonicalControl[]>([])
|
||||
const [reviewRule3, setReviewRule3] = useState<CanonicalControl[]>([])
|
||||
|
||||
// Debounce search
|
||||
const searchTimer = useRef<ReturnType<typeof setTimeout> | null>(null)
|
||||
@@ -303,20 +307,47 @@ export default function ControlLibraryPage() {
|
||||
// Loads every control in state `needs_review`, splits the result into
// duplicate suspects (non-empty generation_metadata.similar_controls) and the
// remaining controls, and opens the detail view on the first non-empty group.
// Does nothing when the request fails or returns no items.
const enterReviewMode = async () => {
  // Load review items from backend
  try {
    const res = await fetch(`${BACKEND_URL}?endpoint=controls&release_state=needs_review&limit=1000`)
    if (!res.ok) return

    const items: CanonicalControl[] = await res.json()
    if (!Array.isArray(items) || items.length === 0) return

    // Split into duplicate suspects vs rule 3 without anchor. The two groups
    // are complementary, so one pass is enough.
    const dupes: CanonicalControl[] = []
    const rule3: CanonicalControl[] = []
    for (const c of items) {
      const similar = c.generation_metadata?.similar_controls
      if (Array.isArray(similar) && similar.length > 0) {
        dupes.push(c)
      } else {
        rule3.push(c)
      }
    }
    setReviewDuplicates(dupes)
    setReviewRule3(rule3)

    // Start with duplicates tab if any, otherwise rule3 (which then holds all
    // items and is therefore non-empty).
    const startTab = dupes.length > 0 ? 'duplicates' : 'rule3'
    const startItems = startTab === 'duplicates' ? dupes : rule3
    setReviewTab(startTab)
    setReviewItems(startItems)
    setReviewMode(true)
    setReviewIndex(0)
    setSelectedControl(startItems[0])
    setMode('detail')
  } catch (err) {
    // Best effort: stay in the current view, but leave a trace for debugging.
    console.error('Failed to load review items', err)
  }
}
|
||||
|
||||
// Switches the review list to the given tab and selects its first control.
// Switching to a tab without items is ignored: otherwise the tab and list
// would change while the control of the other tab stays selected, and the
// view would show that control under the wrong tab.
const switchReviewTab = (tab: 'duplicates' | 'rule3') => {
  const items = tab === 'duplicates' ? reviewDuplicates : reviewRule3
  if (items.length === 0) return

  setReviewTab(tab)
  setReviewItems(items)
  setReviewIndex(0)
  setSelectedControl(items[0])
}
|
||||
|
||||
// Loading
|
||||
if (loading && controls.length === 0) {
|
||||
return (
|
||||
@@ -363,7 +394,66 @@ export default function ControlLibraryPage() {
|
||||
|
||||
// DETAIL MODE
|
||||
if (mode === 'detail' && selectedControl) {
|
||||
const isDuplicateReview = reviewMode && reviewTab === 'duplicates'
|
||||
|
||||
// Review tab bar (shown above the detail/compare view in review mode)
|
||||
const reviewTabBar = reviewMode ? (
|
||||
<div className="border-b border-gray-200 bg-white px-6 py-2 flex items-center gap-4">
|
||||
<button
|
||||
onClick={() => switchReviewTab('duplicates')}
|
||||
className={`px-3 py-1.5 text-sm rounded-lg font-medium ${
|
||||
reviewTab === 'duplicates'
|
||||
? 'bg-amber-100 text-amber-800 border border-amber-300'
|
||||
: 'text-gray-500 hover:text-gray-700 hover:bg-gray-100'
|
||||
}`}
|
||||
>
|
||||
Duplikat-Verdacht ({reviewDuplicates.length})
|
||||
</button>
|
||||
<button
|
||||
onClick={() => switchReviewTab('rule3')}
|
||||
className={`px-3 py-1.5 text-sm rounded-lg font-medium ${
|
||||
reviewTab === 'rule3'
|
||||
? 'bg-purple-100 text-purple-800 border border-purple-300'
|
||||
: 'text-gray-500 hover:text-gray-700 hover:bg-gray-100'
|
||||
}`}
|
||||
>
|
||||
Rule 3 ohne Anchor ({reviewRule3.length})
|
||||
</button>
|
||||
</div>
|
||||
) : null
|
||||
|
||||
if (isDuplicateReview) {
|
||||
return (
|
||||
<div className="flex flex-col h-full">
|
||||
{reviewTabBar}
|
||||
<div className="flex-1 overflow-hidden">
|
||||
<ReviewCompare
|
||||
ctrl={selectedControl}
|
||||
onBack={() => { setMode('list'); setSelectedControl(null); setReviewMode(false) }}
|
||||
onReview={handleReview}
|
||||
onEdit={() => setMode('edit')}
|
||||
reviewIndex={reviewIndex}
|
||||
reviewTotal={reviewItems.length}
|
||||
onReviewPrev={() => {
|
||||
const idx = Math.max(0, reviewIndex - 1)
|
||||
setReviewIndex(idx)
|
||||
setSelectedControl(reviewItems[idx])
|
||||
}}
|
||||
onReviewNext={() => {
|
||||
const idx = Math.min(reviewItems.length - 1, reviewIndex + 1)
|
||||
setReviewIndex(idx)
|
||||
setSelectedControl(reviewItems[idx])
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="flex flex-col h-full">
|
||||
{reviewTabBar}
|
||||
<div className="flex-1 overflow-hidden">
|
||||
<ControlDetail
|
||||
ctrl={selectedControl}
|
||||
onBack={() => { setMode('list'); setSelectedControl(null); setReviewMode(false) }}
|
||||
@@ -385,6 +475,8 @@ export default function ControlLibraryPage() {
|
||||
setSelectedControl(reviewItems[idx])
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -321,6 +321,62 @@ VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys())
|
||||
VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
|
||||
"AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Recital (Erwägungsgrund) detection in source text
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Pattern: standalone recital number like (125)\n or (126) at line start
|
||||
_RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')
|
||||
|
||||
# Recital-typical phrasing (German EU law Erwägungsgründe)
|
||||
_RECITAL_PHRASES = [
|
||||
"in erwägung nachstehender gründe",
|
||||
"erwägungsgrund",
|
||||
"in anbetracht",
|
||||
"daher sollte",
|
||||
"aus diesem grund",
|
||||
"es ist daher",
|
||||
"folglich sollte",
|
||||
"es sollte daher",
|
||||
"in diesem zusammenhang",
|
||||
]
|
||||
|
||||
|
||||
def _detect_recital(text: str) -> Optional[dict]:
|
||||
"""Detect if source text is a recital (Erwägungsgrund) rather than an article.
|
||||
|
||||
Returns a dict with detection details if recital markers are found,
|
||||
or None if the text appears to be genuine article text.
|
||||
|
||||
Detection criteria:
|
||||
1. Standalone recital numbers like (126)\\n in the text
|
||||
2. Recital-typical phrasing ("daher sollte", "erwägungsgrund", etc.)
|
||||
"""
|
||||
if not text:
|
||||
return None
|
||||
|
||||
# Check 1: Recital number markers
|
||||
recital_matches = _RECITAL_RE.findall(text)
|
||||
|
||||
# Check 2: Recital phrasing
|
||||
text_lower = text.lower()
|
||||
phrase_hits = [p for p in _RECITAL_PHRASES if p in text_lower]
|
||||
|
||||
if not recital_matches and not phrase_hits:
|
||||
return None
|
||||
|
||||
# Require at least recital numbers OR >=2 phrase hits to be a suspect
|
||||
if not recital_matches and len(phrase_hits) < 2:
|
||||
return None
|
||||
|
||||
return {
|
||||
"recital_suspect": True,
|
||||
"recital_numbers": recital_matches[:10],
|
||||
"recital_phrases": phrase_hits[:5],
|
||||
"detection_method": "regex+phrases" if recital_matches and phrase_hits
|
||||
else "regex" if recital_matches else "phrases",
|
||||
}
|
||||
|
||||
CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))
|
||||
|
||||
VERIFICATION_KEYWORDS = {
|
||||
@@ -1520,9 +1576,23 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
|
||||
) -> tuple[GeneratedControl, bool]:
|
||||
"""Cross-validate category/domain using keyword detection + local LLM.
|
||||
|
||||
Also checks for recital (Erwägungsgrund) contamination in source text.
|
||||
Returns (control, was_fixed). Only triggers Ollama QA when the LLM
|
||||
classification disagrees with keyword detection — keeps it fast.
|
||||
"""
|
||||
# ── Recital detection ──────────────────────────────────────────
|
||||
source_text = control.source_original_text or ""
|
||||
recital_info = _detect_recital(source_text)
|
||||
if recital_info:
|
||||
control.generation_metadata["recital_suspect"] = True
|
||||
control.generation_metadata["recital_detection"] = recital_info
|
||||
control.release_state = "needs_review"
|
||||
logger.warning(
|
||||
"Recital suspect: '%s' — recitals %s detected in source text",
|
||||
control.title[:40],
|
||||
recital_info.get("recital_numbers", []),
|
||||
)
|
||||
|
||||
kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
|
||||
kw_domain = _detect_domain(chunk_text)
|
||||
llm_domain = control.generation_metadata.get("_effective_domain", "")
|
||||
|
||||
@@ -7,6 +7,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
|
||||
from compliance.services.control_generator import (
|
||||
_classify_regulation,
|
||||
_detect_domain,
|
||||
_detect_recital,
|
||||
_parse_llm_json,
|
||||
_parse_llm_json_array,
|
||||
GeneratorConfig,
|
||||
@@ -1306,3 +1307,92 @@ class TestPipelineVersion:
|
||||
assert controls[0] is not None
|
||||
assert controls[1] is None # Null entry from LLM
|
||||
assert controls[2] is not None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Recital (Erwägungsgrund) Detection Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestRecitalDetection:
    """Behavioural checks for _detect_recital (Erwägungsgrund markers in source text)."""

    def test_recital_number_detected(self):
        """A "(126)" followed by a newline marks the text as a recital suspect."""
        detection = _detect_recital(
            "Daher ist es wichtig...\n(126)\nDie Konformitätsbewertung sollte..."
        )
        assert detection is not None
        assert detection["recital_suspect"] is True
        assert "126" in detection["recital_numbers"]

    def test_multiple_recital_numbers(self):
        """Every recital marker in the text is reported."""
        detection = _detect_recital(
            "(124)\nErster Punkt.\n(125)\nZweiter Punkt.\n(126)\nDritter Punkt."
        )
        assert detection is not None
        for number in ("124", "125", "126"):
            assert number in detection["recital_numbers"]

    def test_article_text_not_flagged(self):
        """Plain article text without any marker yields None."""
        article = ("Der Anbieter eines Hochrisiko-KI-Systems muss sicherstellen, "
                   "dass die technische Dokumentation erstellt wird.")
        assert _detect_recital(article) is None

    def test_empty_text_returns_none(self):
        """An empty string is never a suspect."""
        assert _detect_recital("") is None

    def test_none_text_returns_none(self):
        """None is accepted and yields None."""
        assert _detect_recital(None) is None

    def test_recital_phrases_detected(self):
        """Several recital-typical phrases without a number are flagged via phrases."""
        source = ("In Erwägung nachstehender Gründe wurde beschlossen, "
                  "daher sollte der Anbieter folgende Maßnahmen ergreifen. "
                  "Es ist daher notwendig, die Konformität sicherzustellen.")
        detection = _detect_recital(source)
        assert detection is not None
        assert detection["detection_method"] == "phrases"

    def test_single_phrase_not_enough(self):
        """One recital phrase on its own does not trigger detection."""
        assert _detect_recital("Daher sollte das System regelmäßig geprüft werden.") is None

    def test_combined_regex_and_phrases(self):
        """Number marker plus phrase is reported as regex+phrases."""
        detection = _detect_recital(
            "(42)\nIn Erwägung nachstehender Gründe wurde entschieden..."
        )
        assert detection is not None
        assert detection["detection_method"] == "regex+phrases"
        assert "42" in detection["recital_numbers"]

    def test_parenthesized_number_without_newline_ignored(self):
        """A parenthesised number that is not followed by a newline is no marker.

        e.g. 'gemäß Absatz (3) des Artikels' should not be flagged.
        """
        inline_reference = "Gemäß Absatz (3) des Artikels 52 muss der Anbieter sicherstellen..."
        assert _detect_recital(inline_reference) is None

    def test_real_world_recital_text(self):
        """Real-world example: AI Act Erwägungsgrund (126) about conformity assessment."""
        source = (
            "(126)\n"
            "Um den Verwaltungsaufwand zu verringern und die Konformitätsbewertung "
            "zu vereinfachen, sollten bestimmte Hochrisiko-KI-Systeme, die von "
            "Anbietern zertifiziert oder für die eine Konformitätserklärung "
            "ausgestellt wurde, automatisch als konform mit den Anforderungen "
            "dieser Verordnung gelten, sofern sie den harmonisierten Normen oder "
            "gemeinsamen Spezifikationen entsprechen.\n"
            "(127)\n"
            "Es ist daher angezeigt, dass der Anbieter das entsprechende "
            "Konformitätsbewertungsverfahren anwendet."
        )
        detection = _detect_recital(source)
        assert detection is not None
        for number in ("126", "127"):
            assert number in detection["recital_numbers"]
|
||||
|
||||
@@ -214,13 +214,13 @@ Wenn du z.B. eine neue `GetUserStats()` Funktion im Go Service hinzufuegst:
|
||||
|
||||
## Modul-spezifische Tests
|
||||
|
||||
### Canonical Control Generator (71+ Tests)
|
||||
### Canonical Control Generator (81+ Tests)
|
||||
|
||||
Die Control Library hat eine umfangreiche Test-Suite ueber 6 Dateien.
|
||||
Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) und [Control Generator Pipeline](../services/sdk-modules/control-generator-pipeline.md) fuer Details.
|
||||
|
||||
```bash
|
||||
# Alle Generator-Tests (71 Tests in 10 Klassen)
|
||||
# Alle Generator-Tests (81 Tests in 12 Klassen)
|
||||
cd backend-compliance && pytest -v tests/test_control_generator.py
|
||||
|
||||
# Similarity Detector Tests
|
||||
@@ -253,3 +253,4 @@ cd backend-compliance && pytest -v tests/test_validate_controls.py
|
||||
| `TestBatchProcessingLoop` | 10 | Batch-Verarbeitung (Rule-Split, Mixed-Rules, Too-Close, Null-Handling) |
|
||||
| `TestRegulationFilter` | 5 | regulation_filter Prefix-Matching, leere regulation_codes |
|
||||
| `TestPipelineVersion` | 5 | pipeline_version=2 in DB-Writes, null-Handling in Structure/Reform |
|
||||
| `TestRecitalDetection` | 10 | Erwaegungsgrund-Erkennung in Quelltexten (Regex, Phrasen, Kombiniert) |
|
||||
|
||||
@@ -500,6 +500,39 @@ Die QA-Metriken werden in `generation_metadata` gespeichert:
|
||||
}
|
||||
```
|
||||
|
||||
### Recital-Erkennung (Erwägungsgrund-Detektion)
|
||||
|
||||
Die QA-Stufe prueft zusaetzlich, ob der `source_original_text` eines Controls tatsaechlich aus einem Gesetzesartikel stammt — oder aus einem Erwaegungsgrund (Recital). Erwaegungsgruende enthalten keine normativen Pflichten und fuehren zu falsch zugeordneten Controls.
|
||||
|
||||
**Erkennungsmethoden:**
|
||||
|
||||
| Methode | Pattern | Beispiel |
|
||||
|---------|---------|----------|
|
||||
| **Regex** | `\((\d{1,3})\)\s*\n` — Erwaegungsgrund-Nummern | `(126)\nUm den Verwaltungsaufwand...` |
|
||||
| **Phrasen** | Typische Recital-Formulierungen (≥2 Treffer) | "daher sollte", "in Erwägung nachstehender Gründe" |
|
||||
|
||||
**Ergebnis bei Verdacht:**
|
||||
|
||||
- `release_state` wird auf `needs_review` gesetzt
|
||||
- `generation_metadata.recital_suspect = true`
|
||||
- `generation_metadata.recital_detection` enthaelt Details:
|
||||
|
||||
```json
|
||||
{
|
||||
"recital_suspect": true,
|
||||
"recital_detection": {
|
||||
"recital_suspect": true,
|
||||
"recital_numbers": ["126", "127"],
|
||||
"recital_phrases": ["daher sollte"],
|
||||
"detection_method": "regex+phrases"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Funktion:** `_detect_recital(text)` in `control_generator.py`
|
||||
|
||||
**Hintergrund:** Bei der Analyse von ~5.500 Controls mit Quelltext wurden 1.555 (28%) als Erwaegungsgrund-Verdacht identifiziert. Der Document Crawler unterschied nicht zwischen Artikeltext und Erwaegungsgruenden, was zu falschen `article`/`paragraph`-Zuordnungen fuehrte.
|
||||
|
||||
### QA-Reklassifizierung bestehender Controls
|
||||
|
||||
```bash
|
||||
@@ -530,7 +563,7 @@ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/
|
||||
| `backend-compliance/migrations/046_control_generator.sql` | Job-Tracking, Chunk-Tracking Tabellen |
|
||||
| `backend-compliance/migrations/048_processing_path_expand.sql` | Erweiterte Processing-Path-Werte |
|
||||
| `backend-compliance/migrations/062_pipeline_version.sql` | `pipeline_version` Spalte |
|
||||
| `backend-compliance/tests/test_control_generator.py` | 15 Tests (Lizenz, Domain, Batch, Pipeline) |
|
||||
| `backend-compliance/tests/test_control_generator.py` | 81+ Tests (Lizenz, Domain, Batch, Pipeline, Recital) |
|
||||
|
||||
---
|
||||
|
||||
|
||||
79
scripts/find_recital_controls.py
Normal file
79
scripts/find_recital_controls.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""Find controls where source_original_text contains Erwägungsgrund (recital) markers
instead of actual article text — indicates wrong article tagging in RAG chunks.

Reads the database connection string from the DATABASE_URL environment
variable, scans all canonical controls that have both a source text and a
source citation, and prints a table of the controls whose source text
contains recital number markers such as "(126)" followed by a newline.
"""

import json
import os
import re
import sys

import sqlalchemy

# Pattern: a parenthesised 1-3 digit number followed (after optional
# whitespace) by a newline, e.g. "(125)\n".
RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')

CONTROLS_QUERY = """
    SELECT control_id, title,
           source_citation::text,
           source_original_text,
           pipeline_version, release_state,
           generation_metadata::text
    FROM canonical_controls
    WHERE source_original_text IS NOT NULL
      AND source_original_text != ''
      AND source_citation IS NOT NULL
    ORDER BY control_id
"""


def find_recital_suspects(rows):
    """Return one summary dict per row whose source text has recital markers.

    Each row is a 7-tuple in the column order of CONTROLS_QUERY. NULL values
    for title, state or the citation fields are reported as empty strings so
    that the table output below cannot fail on them.
    """
    suspects = []
    for cid, title, citation_json, orig, pv, state, _meta_json in rows:
        if not orig:
            continue

        recital_matches = RECITAL_RE.findall(orig)
        if not recital_matches:
            continue

        citation = json.loads(citation_json) if citation_json else {}
        if not isinstance(citation, dict):
            citation = {}

        suspects.append({
            "control_id": cid,
            "title": (title or "")[:80],
            "claimed_article": citation.get("article") or "",
            "claimed_paragraph": citation.get("paragraph") or "",
            "recitals_found": recital_matches[:5],
            "v": pv,
            "state": state or "",
        })
    return suspects


def main() -> int:
    """Run the scan and print the report; returns the process exit code."""
    url = os.environ.get("DATABASE_URL", "")
    if not url:
        print("DATABASE_URL not set")
        return 1

    engine = sqlalchemy.create_engine(url)
    with engine.connect() as conn:
        conn.execute(sqlalchemy.text("SET search_path TO compliance,public"))
        rows = conn.execute(sqlalchemy.text(CONTROLS_QUERY)).fetchall()

    suspects_recital = find_recital_suspects(rows)

    print("=== Ergebnis ===")
    print(f"Geprueft: {len(rows)} Controls mit source_original_text")
    print(f"Erwaegungsgrund-Verdacht: {len(suspects_recital)}")
    print()

    if suspects_recital:
        print(f"{'Control':<12} {'Behauptet':<18} {'Recitals':<20} {'v':>2} {'State':<15} Titel")
        print("-" * 120)
        for s in suspects_recital:
            recitals = ",".join(s["recitals_found"])
            print(f"{s['control_id']:<12} {s['claimed_article']:<10} {s['claimed_paragraph']:<7} ({recitals}){'':<{max(0,15-len(recitals))}} v{s['v']} {s['state']:<15} {s['title']}")

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user