feat(qa): recital detection, review split, duplicate comparison
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Add _detect_recital() to QA pipeline — flags controls where source_original_text contains Erwägungsgrund markers instead of article text (28% of controls with source text affected). - Recital detection via regex + phrase matching in QA validation - 10 new tests (TestRecitalDetection), 81 total - ReviewCompare component for side-by-side duplicate comparison - Review mode split: Duplikat-Verdacht vs Rule-3-ohne-Anchor tabs - MkDocs: recital detection documentation - Detection script for bulk analysis (scripts/find_recital_controls.py) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,264 @@
|
|||||||
|
'use client'
|
||||||
|
|
||||||
|
import { useState, useEffect } from 'react'
|
||||||
|
import {
|
||||||
|
ArrowLeft, CheckCircle2, Trash2, Pencil, SkipForward,
|
||||||
|
ChevronLeft, Scale, BookOpen, ExternalLink, AlertTriangle,
|
||||||
|
FileText, Clock,
|
||||||
|
} from 'lucide-react'
|
||||||
|
import {
|
||||||
|
CanonicalControl, BACKEND_URL,
|
||||||
|
SeverityBadge, StateBadge, LicenseRuleBadge, CategoryBadge, TargetAudienceBadge,
|
||||||
|
} from './helpers'
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// Compact Control Panel (used on both sides of the comparison)
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
/**
 * Compact, read-only rendering of a single control.
 *
 * The panel shows a sticky header (label, control id, badges, title) followed
 * by the objective and — only when present / non-empty — rationale, source
 * citation, requirements, test procedure, open anchors and tags.
 *
 * @param ctrl      The control to display.
 * @param label     Caption shown above the control id.
 * @param highlight When true, the panel uses the yellow colour scheme instead of white/gray.
 */
function ControlPanel({ ctrl, label, highlight }: { ctrl: CanonicalControl; label: string; highlight?: boolean }) {
  // Class strings shared by several elements, hoisted to keep the markup readable.
  const sectionTitle = 'text-xs font-semibold text-gray-500 uppercase tracking-wide mb-1'
  const bodyText = 'text-gray-700 leading-relaxed'
  const listEntry = 'text-gray-700 text-xs leading-relaxed'

  // Colour scheme depends on whether this side of the comparison is highlighted.
  const surface = highlight ? 'bg-yellow-50' : 'bg-white'
  const headerSurface = highlight ? 'bg-yellow-100 border-yellow-200' : 'bg-gray-50 border-gray-200'

  const citation = ctrl.source_citation
  const { requirements, test_procedure: testSteps, open_anchors: anchors, tags } = ctrl

  return (
    <div className={`flex flex-col h-full overflow-y-auto ${surface}`}>
      {/* Sticky header: label, id, badges, title */}
      <div className={`sticky top-0 z-10 px-4 py-3 border-b ${headerSurface}`}>
        <div className="text-xs font-semibold uppercase tracking-wide text-gray-500 mb-1">{label}</div>
        <div className="flex items-center gap-2 flex-wrap">
          <span className="text-sm font-mono text-purple-600 bg-purple-50 px-2 py-0.5 rounded">{ctrl.control_id}</span>
          <SeverityBadge severity={ctrl.severity} />
          <StateBadge state={ctrl.release_state} />
          <LicenseRuleBadge rule={ctrl.license_rule} />
          <CategoryBadge category={ctrl.category} />
          <TargetAudienceBadge audience={ctrl.target_audience} />
        </div>
        <h3 className="text-sm font-semibold text-gray-900 mt-1 leading-snug">{ctrl.title}</h3>
      </div>

      {/* Scrollable body */}
      <div className="p-4 space-y-4 text-sm">
        {/* Objective is always rendered */}
        <section>
          <h4 className={sectionTitle}>Ziel</h4>
          <p className={bodyText}>{ctrl.objective}</p>
        </section>

        {/* Rationale (optional) */}
        {ctrl.rationale && (
          <section>
            <h4 className={sectionTitle}>Begruendung</h4>
            <p className={bodyText}>{ctrl.rationale}</p>
          </section>
        )}

        {/* Legal basis: source, then optional article and paragraph */}
        {citation && (
          <section className="bg-blue-50 border border-blue-200 rounded-lg p-3">
            <div className="flex items-center gap-1.5 mb-1">
              <Scale className="w-3.5 h-3.5 text-blue-600" />
              <span className="text-xs font-semibold text-blue-900">Gesetzliche Grundlage</span>
            </div>
            {citation.source && (
              <p className="text-xs text-blue-800">
                {citation.source}
                {citation.article && ` — ${citation.article}`}
                {citation.paragraph && ` ${citation.paragraph}`}
              </p>
            )}
          </section>
        )}

        {/* Requirements as a numbered list */}
        {requirements.length > 0 && (
          <section>
            <h4 className={sectionTitle}>Anforderungen</h4>
            <ol className="list-decimal list-inside space-y-1">
              {requirements.map((requirement, position) => (
                <li key={position} className={listEntry}>{requirement}</li>
              ))}
            </ol>
          </section>
        )}

        {/* Test procedure as a numbered list */}
        {testSteps.length > 0 && (
          <section>
            <h4 className={sectionTitle}>Pruefverfahren</h4>
            <ol className="list-decimal list-inside space-y-1">
              {testSteps.map((step, position) => (
                <li key={position} className={listEntry}>{step}</li>
              ))}
            </ol>
          </section>
        )}

        {/* Open anchors: framework + reference per row */}
        {anchors.length > 0 && (
          <section className="bg-green-50 border border-green-200 rounded-lg p-3">
            <div className="flex items-center gap-1.5 mb-2">
              <BookOpen className="w-3.5 h-3.5 text-green-700" />
              <span className="text-xs font-semibold text-green-900">Referenzen ({anchors.length})</span>
            </div>
            <div className="space-y-1">
              {anchors.map((anchor, position) => (
                <div key={position} className="flex items-center gap-1.5 text-xs">
                  <ExternalLink className="w-3 h-3 text-green-600 flex-shrink-0" />
                  <span className="font-medium text-green-800">{anchor.framework}</span>
                  <span className="text-green-700">{anchor.ref}</span>
                </div>
              ))}
            </div>
          </section>
        )}

        {/* Tags as chips, keyed by the tag text itself */}
        {tags.length > 0 && (
          <div className="flex items-center gap-1 flex-wrap">
            {tags.map(tag => (
              <span key={tag} className="px-2 py-0.5 bg-gray-100 text-gray-600 rounded text-xs">{tag}</span>
            ))}
          </div>
        )}
      </div>
    </div>
  )
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// ReviewCompare — Side-by-Side Duplicate Comparison
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
/** Props accepted by {@link ReviewCompare}. */
interface ReviewCompareProps {
  /** The control currently under review (rendered on the left side). */
  ctrl: CanonicalControl

  /** Invoked when the back arrow in the header is clicked. */
  onBack: () => void

  /**
   * Invoked with the reviewed control's id and the chosen action.
   * This component passes 'approve' or 'reject'.
   */
  onReview: (controlId: string, action: string) => void

  /** Invoked when the edit button is clicked. */
  onEdit: () => void

  /** Zero-based position of `ctrl` in the review list; displayed as `reviewIndex + 1`. */
  reviewIndex: number

  /** Total number of items in the review list. */
  reviewTotal: number

  /** Invoked by the "previous" navigation button. */
  onReviewPrev: () => void

  /** Invoked by the "next" navigation button. */
  onReviewNext: () => void
}
|
||||||
|
|
||||||
|
/**
 * Side-by-side duplicate comparison for review mode.
 *
 * The left panel shows the control under review. The right panel shows the
 * first entry of `ctrl.generation_metadata.similar_controls`, fetched from the
 * backend by control id. While that fetch is running a spinner is shown; if
 * there is no candidate or the fetch fails, a placeholder text is shown.
 *
 * The header offers back navigation, previous/next navigation through the
 * review list, and the actions keep ('approve'), duplicate ('reject') and edit.
 */
export function ReviewCompare({
  ctrl,
  onBack,
  onReview,
  onEdit,
  reviewIndex,
  reviewTotal,
  onReviewPrev,
  onReviewNext,
}: ReviewCompareProps) {
  const [suspectedDuplicate, setSuspectedDuplicate] = useState<CanonicalControl | null>(null)
  const [loading, setLoading] = useState(false)
  const [similarity, setSimilarity] = useState<number | null>(null)

  // Load the suspected duplicate from generation_metadata.similar_controls
  useEffect(() => {
    const similarControls = ctrl.generation_metadata?.similar_controls as
      | Array<{ control_id: string; title: string; similarity: number }>
      | undefined
    // The metadata is untyped at runtime, so verify the shape before indexing.
    const suspect = Array.isArray(similarControls) ? similarControls[0] : undefined

    // Never keep showing the candidate that belonged to the previous control.
    setSuspectedDuplicate(null)

    if (!suspect) {
      setSimilarity(null)
      // A previous (now cancelled) request may have left the spinner on.
      setLoading(false)
      return
    }

    // Guard against a non-numeric value; the header calls toFixed() on it.
    setSimilarity(typeof suspect.similarity === 'number' ? suspect.similarity : null)
    setLoading(true)

    // When the reviewer navigates quickly, an older request can resolve after a
    // newer one. The flag (plus aborting the request) ensures a stale response
    // can neither replace the current candidate nor switch off the spinner.
    let cancelled = false
    const controller = new AbortController()

    const loadDuplicate = async () => {
      try {
        const res = await fetch(
          `${BACKEND_URL}?endpoint=control&id=${encodeURIComponent(suspect.control_id)}`,
          { signal: controller.signal },
        )
        const data: CanonicalControl | null = res.ok ? await res.json() : null
        if (!cancelled) {
          setSuspectedDuplicate(data)
        }
      } catch {
        // Network failure or abort: fall back to the "no candidate" placeholder.
        if (!cancelled) {
          setSuspectedDuplicate(null)
        }
      } finally {
        if (!cancelled) {
          setLoading(false)
        }
      }
    }

    void loadDuplicate()

    return () => {
      cancelled = true
      controller.abort()
    }
  }, [ctrl.control_id, ctrl.generation_metadata])

  return (
    <div className="flex flex-col h-full">
      {/* Header */}
      <div className="border-b border-gray-200 bg-white px-6 py-3 flex items-center justify-between">
        <div className="flex items-center gap-3">
          <button onClick={onBack} className="text-gray-400 hover:text-gray-600">
            <ArrowLeft className="w-5 h-5" />
          </button>
          <div>
            <div className="flex items-center gap-2">
              <AlertTriangle className="w-4 h-4 text-amber-500" />
              <span className="text-sm font-semibold text-gray-900">Duplikat-Vergleich</span>
              {similarity !== null && (
                <span className="text-xs font-medium text-amber-600 bg-amber-50 px-2 py-0.5 rounded-full">
                  {(similarity * 100).toFixed(1)}% Aehnlichkeit
                </span>
              )}
            </div>
          </div>
        </div>

        <div className="flex items-center gap-2">
          {/* Navigation */}
          <div className="flex items-center gap-1 mr-3">
            <button onClick={onReviewPrev} disabled={reviewIndex === 0} className="p-1 text-gray-400 hover:text-gray-600 disabled:opacity-30">
              <ChevronLeft className="w-4 h-4" />
            </button>
            <span className="text-xs text-gray-500 font-medium">{reviewIndex + 1} / {reviewTotal}</span>
            <button onClick={onReviewNext} disabled={reviewIndex >= reviewTotal - 1} className="p-1 text-gray-400 hover:text-gray-600 disabled:opacity-30">
              <SkipForward className="w-4 h-4" />
            </button>
          </div>

          {/* Actions */}
          <button
            onClick={() => onReview(ctrl.control_id, 'approve')}
            className="px-3 py-1.5 text-sm text-white bg-green-600 rounded-lg hover:bg-green-700"
          >
            <CheckCircle2 className="w-3.5 h-3.5 inline mr-1" />Behalten
          </button>
          <button
            onClick={() => onReview(ctrl.control_id, 'reject')}
            className="px-3 py-1.5 text-sm text-white bg-red-600 rounded-lg hover:bg-red-700"
          >
            <Trash2 className="w-3.5 h-3.5 inline mr-1" />Duplikat
          </button>
          <button
            onClick={onEdit}
            className="px-3 py-1.5 text-sm text-gray-600 border border-gray-300 rounded-lg hover:bg-gray-50"
          >
            <Pencil className="w-3.5 h-3.5 inline mr-1" />Bearbeiten
          </button>
        </div>
      </div>

      {/* Side-by-Side Panels */}
      <div className="flex-1 flex overflow-hidden">
        {/* Left: Control to review */}
        <div className="w-1/2 border-r border-gray-200 overflow-y-auto">
          <ControlPanel ctrl={ctrl} label="Zu pruefen" highlight />
        </div>

        {/* Right: Suspected duplicate */}
        <div className="w-1/2 overflow-y-auto">
          {loading ? (
            <div className="flex items-center justify-center h-full">
              <div className="animate-spin rounded-full h-6 w-6 border-2 border-purple-600 border-t-transparent" />
            </div>
          ) : suspectedDuplicate ? (
            <ControlPanel ctrl={suspectedDuplicate} label="Bestehendes Control (Verdacht)" />
          ) : (
            <div className="flex items-center justify-center h-full text-gray-400 text-sm">
              Kein Duplikat-Kandidat gefunden
            </div>
          )}
        </div>
      </div>
    </div>
  )
}
|
||||||
@@ -14,6 +14,7 @@ import {
|
|||||||
} from './components/helpers'
|
} from './components/helpers'
|
||||||
import { ControlForm } from './components/ControlForm'
|
import { ControlForm } from './components/ControlForm'
|
||||||
import { ControlDetail } from './components/ControlDetail'
|
import { ControlDetail } from './components/ControlDetail'
|
||||||
|
import { ReviewCompare } from './components/ReviewCompare'
|
||||||
import { GeneratorModal } from './components/GeneratorModal'
|
import { GeneratorModal } from './components/GeneratorModal'
|
||||||
|
|
||||||
// =============================================================================
|
// =============================================================================
|
||||||
@@ -71,6 +72,9 @@ export default function ControlLibraryPage() {
|
|||||||
const [reviewIndex, setReviewIndex] = useState(0)
|
const [reviewIndex, setReviewIndex] = useState(0)
|
||||||
const [reviewItems, setReviewItems] = useState<CanonicalControl[]>([])
|
const [reviewItems, setReviewItems] = useState<CanonicalControl[]>([])
|
||||||
const [reviewCount, setReviewCount] = useState(0)
|
const [reviewCount, setReviewCount] = useState(0)
|
||||||
|
const [reviewTab, setReviewTab] = useState<'duplicates' | 'rule3'>('duplicates')
|
||||||
|
const [reviewDuplicates, setReviewDuplicates] = useState<CanonicalControl[]>([])
|
||||||
|
const [reviewRule3, setReviewRule3] = useState<CanonicalControl[]>([])
|
||||||
|
|
||||||
// Debounce search
|
// Debounce search
|
||||||
const searchTimer = useRef<ReturnType<typeof setTimeout> | null>(null)
|
const searchTimer = useRef<ReturnType<typeof setTimeout> | null>(null)
|
||||||
@@ -303,20 +307,47 @@ export default function ControlLibraryPage() {
|
|||||||
const enterReviewMode = async () => {
|
const enterReviewMode = async () => {
|
||||||
// Load review items from backend
|
// Load review items from backend
|
||||||
try {
|
try {
|
||||||
const res = await fetch(`${BACKEND_URL}?endpoint=controls&release_state=needs_review&limit=200`)
|
const res = await fetch(`${BACKEND_URL}?endpoint=controls&release_state=needs_review&limit=1000`)
|
||||||
if (res.ok) {
|
if (res.ok) {
|
||||||
const items = await res.json()
|
const items: CanonicalControl[] = await res.json()
|
||||||
if (items.length > 0) {
|
if (items.length > 0) {
|
||||||
setReviewItems(items)
|
// Split into duplicate suspects vs rule 3 without anchor
|
||||||
|
const dupes = items.filter(c =>
|
||||||
|
c.generation_metadata?.similar_controls &&
|
||||||
|
Array.isArray(c.generation_metadata.similar_controls) &&
|
||||||
|
(c.generation_metadata.similar_controls as unknown[]).length > 0
|
||||||
|
)
|
||||||
|
const rule3 = items.filter(c =>
|
||||||
|
!c.generation_metadata?.similar_controls ||
|
||||||
|
!Array.isArray(c.generation_metadata.similar_controls) ||
|
||||||
|
(c.generation_metadata.similar_controls as unknown[]).length === 0
|
||||||
|
)
|
||||||
|
setReviewDuplicates(dupes)
|
||||||
|
setReviewRule3(rule3)
|
||||||
|
// Start with duplicates tab if any, otherwise rule3
|
||||||
|
const startTab = dupes.length > 0 ? 'duplicates' : 'rule3'
|
||||||
|
const startItems = startTab === 'duplicates' ? dupes : rule3
|
||||||
|
setReviewTab(startTab)
|
||||||
|
setReviewItems(startItems)
|
||||||
setReviewMode(true)
|
setReviewMode(true)
|
||||||
setReviewIndex(0)
|
setReviewIndex(0)
|
||||||
setSelectedControl(items[0])
|
setSelectedControl(startItems[0])
|
||||||
setMode('detail')
|
setMode('detail')
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch { /* ignore */ }
|
} catch { /* ignore */ }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Switches the review tab and restarts navigation at the first item of that tab.
// A tab without items is ignored: switching to it would change the active tab
// and empty the review list while the previously selected control (which
// belongs to the other tab) stays on screen, rendered in the wrong view with
// a "1 / 0" counter.
const switchReviewTab = (tab: 'duplicates' | 'rule3') => {
  const items = tab === 'duplicates' ? reviewDuplicates : reviewRule3
  const first = items[0]
  if (first === undefined) {
    return
  }
  setReviewTab(tab)
  setReviewItems(items)
  setReviewIndex(0)
  setSelectedControl(first)
}
|
||||||
|
|
||||||
// Loading
|
// Loading
|
||||||
if (loading && controls.length === 0) {
|
if (loading && controls.length === 0) {
|
||||||
return (
|
return (
|
||||||
@@ -363,28 +394,89 @@ export default function ControlLibraryPage() {
|
|||||||
|
|
||||||
// DETAIL MODE
|
// DETAIL MODE
|
||||||
if (mode === 'detail' && selectedControl) {
|
if (mode === 'detail' && selectedControl) {
|
||||||
|
const isDuplicateReview = reviewMode && reviewTab === 'duplicates'
|
||||||
|
|
||||||
|
// Review tab bar (shown above the detail/compare view in review mode)
|
||||||
|
const reviewTabBar = reviewMode ? (
|
||||||
|
<div className="border-b border-gray-200 bg-white px-6 py-2 flex items-center gap-4">
|
||||||
|
<button
|
||||||
|
onClick={() => switchReviewTab('duplicates')}
|
||||||
|
className={`px-3 py-1.5 text-sm rounded-lg font-medium ${
|
||||||
|
reviewTab === 'duplicates'
|
||||||
|
? 'bg-amber-100 text-amber-800 border border-amber-300'
|
||||||
|
: 'text-gray-500 hover:text-gray-700 hover:bg-gray-100'
|
||||||
|
}`}
|
||||||
|
>
|
||||||
|
Duplikat-Verdacht ({reviewDuplicates.length})
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
onClick={() => switchReviewTab('rule3')}
|
||||||
|
className={`px-3 py-1.5 text-sm rounded-lg font-medium ${
|
||||||
|
reviewTab === 'rule3'
|
||||||
|
? 'bg-purple-100 text-purple-800 border border-purple-300'
|
||||||
|
: 'text-gray-500 hover:text-gray-700 hover:bg-gray-100'
|
||||||
|
}`}
|
||||||
|
>
|
||||||
|
Rule 3 ohne Anchor ({reviewRule3.length})
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
) : null
|
||||||
|
|
||||||
|
if (isDuplicateReview) {
|
||||||
|
return (
|
||||||
|
<div className="flex flex-col h-full">
|
||||||
|
{reviewTabBar}
|
||||||
|
<div className="flex-1 overflow-hidden">
|
||||||
|
<ReviewCompare
|
||||||
|
ctrl={selectedControl}
|
||||||
|
onBack={() => { setMode('list'); setSelectedControl(null); setReviewMode(false) }}
|
||||||
|
onReview={handleReview}
|
||||||
|
onEdit={() => setMode('edit')}
|
||||||
|
reviewIndex={reviewIndex}
|
||||||
|
reviewTotal={reviewItems.length}
|
||||||
|
onReviewPrev={() => {
|
||||||
|
const idx = Math.max(0, reviewIndex - 1)
|
||||||
|
setReviewIndex(idx)
|
||||||
|
setSelectedControl(reviewItems[idx])
|
||||||
|
}}
|
||||||
|
onReviewNext={() => {
|
||||||
|
const idx = Math.min(reviewItems.length - 1, reviewIndex + 1)
|
||||||
|
setReviewIndex(idx)
|
||||||
|
setSelectedControl(reviewItems[idx])
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<ControlDetail
|
<div className="flex flex-col h-full">
|
||||||
ctrl={selectedControl}
|
{reviewTabBar}
|
||||||
onBack={() => { setMode('list'); setSelectedControl(null); setReviewMode(false) }}
|
<div className="flex-1 overflow-hidden">
|
||||||
onEdit={() => setMode('edit')}
|
<ControlDetail
|
||||||
onDelete={handleDelete}
|
ctrl={selectedControl}
|
||||||
onReview={handleReview}
|
onBack={() => { setMode('list'); setSelectedControl(null); setReviewMode(false) }}
|
||||||
onRefresh={fullReload}
|
onEdit={() => setMode('edit')}
|
||||||
reviewMode={reviewMode}
|
onDelete={handleDelete}
|
||||||
reviewIndex={reviewIndex}
|
onReview={handleReview}
|
||||||
reviewTotal={reviewItems.length}
|
onRefresh={fullReload}
|
||||||
onReviewPrev={() => {
|
reviewMode={reviewMode}
|
||||||
const idx = Math.max(0, reviewIndex - 1)
|
reviewIndex={reviewIndex}
|
||||||
setReviewIndex(idx)
|
reviewTotal={reviewItems.length}
|
||||||
setSelectedControl(reviewItems[idx])
|
onReviewPrev={() => {
|
||||||
}}
|
const idx = Math.max(0, reviewIndex - 1)
|
||||||
onReviewNext={() => {
|
setReviewIndex(idx)
|
||||||
const idx = Math.min(reviewItems.length - 1, reviewIndex + 1)
|
setSelectedControl(reviewItems[idx])
|
||||||
setReviewIndex(idx)
|
}}
|
||||||
setSelectedControl(reviewItems[idx])
|
onReviewNext={() => {
|
||||||
}}
|
const idx = Math.min(reviewItems.length - 1, reviewIndex + 1)
|
||||||
/>
|
setReviewIndex(idx)
|
||||||
|
setSelectedControl(reviewItems[idx])
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -321,6 +321,62 @@ VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys())
|
|||||||
VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
|
VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
|
||||||
"AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
|
"AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Recital (Erwägungsgrund) detection in source text
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Pattern: standalone recital number like (125)\n or (126) at line start
|
||||||
|
_RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')
|
||||||
|
|
||||||
|
# Recital-typical phrasing (German EU law Erwägungsgründe)
|
||||||
|
_RECITAL_PHRASES = [
|
||||||
|
"in erwägung nachstehender gründe",
|
||||||
|
"erwägungsgrund",
|
||||||
|
"in anbetracht",
|
||||||
|
"daher sollte",
|
||||||
|
"aus diesem grund",
|
||||||
|
"es ist daher",
|
||||||
|
"folglich sollte",
|
||||||
|
"es sollte daher",
|
||||||
|
"in diesem zusammenhang",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_recital(text: str) -> Optional[dict]:
|
||||||
|
"""Detect if source text is a recital (Erwägungsgrund) rather than an article.
|
||||||
|
|
||||||
|
Returns a dict with detection details if recital markers are found,
|
||||||
|
or None if the text appears to be genuine article text.
|
||||||
|
|
||||||
|
Detection criteria:
|
||||||
|
1. Standalone recital numbers like (126)\\n in the text
|
||||||
|
2. Recital-typical phrasing ("daher sollte", "erwägungsgrund", etc.)
|
||||||
|
"""
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Check 1: Recital number markers
|
||||||
|
recital_matches = _RECITAL_RE.findall(text)
|
||||||
|
|
||||||
|
# Check 2: Recital phrasing
|
||||||
|
text_lower = text.lower()
|
||||||
|
phrase_hits = [p for p in _RECITAL_PHRASES if p in text_lower]
|
||||||
|
|
||||||
|
if not recital_matches and not phrase_hits:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Require at least recital numbers OR >=2 phrase hits to be a suspect
|
||||||
|
if not recital_matches and len(phrase_hits) < 2:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return {
|
||||||
|
"recital_suspect": True,
|
||||||
|
"recital_numbers": recital_matches[:10],
|
||||||
|
"recital_phrases": phrase_hits[:5],
|
||||||
|
"detection_method": "regex+phrases" if recital_matches and phrase_hits
|
||||||
|
else "regex" if recital_matches else "phrases",
|
||||||
|
}
|
||||||
|
|
||||||
CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))
|
CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))
|
||||||
|
|
||||||
VERIFICATION_KEYWORDS = {
|
VERIFICATION_KEYWORDS = {
|
||||||
@@ -1520,9 +1576,23 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
|
|||||||
) -> tuple[GeneratedControl, bool]:
|
) -> tuple[GeneratedControl, bool]:
|
||||||
"""Cross-validate category/domain using keyword detection + local LLM.
|
"""Cross-validate category/domain using keyword detection + local LLM.
|
||||||
|
|
||||||
|
Also checks for recital (Erwägungsgrund) contamination in source text.
|
||||||
Returns (control, was_fixed). Only triggers Ollama QA when the LLM
|
Returns (control, was_fixed). Only triggers Ollama QA when the LLM
|
||||||
classification disagrees with keyword detection — keeps it fast.
|
classification disagrees with keyword detection — keeps it fast.
|
||||||
"""
|
"""
|
||||||
|
# ── Recital detection ──────────────────────────────────────────
|
||||||
|
source_text = control.source_original_text or ""
|
||||||
|
recital_info = _detect_recital(source_text)
|
||||||
|
if recital_info:
|
||||||
|
control.generation_metadata["recital_suspect"] = True
|
||||||
|
control.generation_metadata["recital_detection"] = recital_info
|
||||||
|
control.release_state = "needs_review"
|
||||||
|
logger.warning(
|
||||||
|
"Recital suspect: '%s' — recitals %s detected in source text",
|
||||||
|
control.title[:40],
|
||||||
|
recital_info.get("recital_numbers", []),
|
||||||
|
)
|
||||||
|
|
||||||
kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
|
kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
|
||||||
kw_domain = _detect_domain(chunk_text)
|
kw_domain = _detect_domain(chunk_text)
|
||||||
llm_domain = control.generation_metadata.get("_effective_domain", "")
|
llm_domain = control.generation_metadata.get("_effective_domain", "")
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
|
|||||||
from compliance.services.control_generator import (
|
from compliance.services.control_generator import (
|
||||||
_classify_regulation,
|
_classify_regulation,
|
||||||
_detect_domain,
|
_detect_domain,
|
||||||
|
_detect_recital,
|
||||||
_parse_llm_json,
|
_parse_llm_json,
|
||||||
_parse_llm_json_array,
|
_parse_llm_json_array,
|
||||||
GeneratorConfig,
|
GeneratorConfig,
|
||||||
@@ -1306,3 +1307,92 @@ class TestPipelineVersion:
|
|||||||
assert controls[0] is not None
|
assert controls[0] is not None
|
||||||
assert controls[1] is None # Null entry from LLM
|
assert controls[1] is None # Null entry from LLM
|
||||||
assert controls[2] is not None
|
assert controls[2] is not None
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Recital (Erwägungsgrund) Detection Tests
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class TestRecitalDetection:
    """Unit tests for ``_detect_recital``: spotting recital text in source passages."""

    def test_recital_number_detected(self):
        """A ``(126)`` marker followed by a newline flags the text as a suspect."""
        detection = _detect_recital(
            "Daher ist es wichtig...\n(126)\nDie Konformitätsbewertung sollte..."
        )
        assert detection is not None
        assert detection["recital_suspect"] is True
        assert "126" in detection["recital_numbers"]

    def test_multiple_recital_numbers(self):
        """Every marker in the text shows up in ``recital_numbers``."""
        detection = _detect_recital(
            "(124)\nErster Punkt.\n(125)\nZweiter Punkt.\n(126)\nDritter Punkt."
        )
        assert detection is not None
        for number in ("124", "125", "126"):
            assert number in detection["recital_numbers"]

    def test_article_text_not_flagged(self):
        """Plain article wording without markers or phrases yields ``None``."""
        article = (
            "Der Anbieter eines Hochrisiko-KI-Systems muss sicherstellen, "
            "dass die technische Dokumentation erstellt wird."
        )
        assert _detect_recital(article) is None

    def test_empty_text_returns_none(self):
        """An empty string is never a suspect."""
        assert _detect_recital("") is None

    def test_none_text_returns_none(self):
        """``None`` is tolerated and yields ``None``."""
        assert _detect_recital(None) is None

    def test_recital_phrases_detected(self):
        """Several recital-typical phrases without a number marker are enough."""
        passage = (
            "In Erwägung nachstehender Gründe wurde beschlossen, "
            "daher sollte der Anbieter folgende Maßnahmen ergreifen. "
            "Es ist daher notwendig, die Konformität sicherzustellen."
        )
        detection = _detect_recital(passage)
        assert detection is not None
        assert detection["detection_method"] == "phrases"

    def test_single_phrase_not_enough(self):
        """One recital phrase on its own does not trigger detection."""
        assert _detect_recital("Daher sollte das System regelmäßig geprüft werden.") is None

    def test_combined_regex_and_phrases(self):
        """Marker plus phrase reports the combined detection method."""
        detection = _detect_recital(
            "(42)\nIn Erwägung nachstehender Gründe wurde entschieden..."
        )
        assert detection is not None
        assert detection["detection_method"] == "regex+phrases"
        assert "42" in detection["recital_numbers"]

    def test_parenthesized_number_without_newline_ignored(self):
        """A parenthesised number inside a sentence is not a recital marker.

        For example, 'gemäß Absatz (3) des Artikels' must not be flagged.
        """
        sentence = "Gemäß Absatz (3) des Artikels 52 muss der Anbieter sicherstellen..."
        assert _detect_recital(sentence) is None

    def test_real_world_recital_text(self):
        """Realistic sample: AI Act recitals (126) and (127) on conformity assessment."""
        sample = (
            "(126)\n"
            "Um den Verwaltungsaufwand zu verringern und die Konformitätsbewertung "
            "zu vereinfachen, sollten bestimmte Hochrisiko-KI-Systeme, die von "
            "Anbietern zertifiziert oder für die eine Konformitätserklärung "
            "ausgestellt wurde, automatisch als konform mit den Anforderungen "
            "dieser Verordnung gelten, sofern sie den harmonisierten Normen oder "
            "gemeinsamen Spezifikationen entsprechen.\n"
            "(127)\n"
            "Es ist daher angezeigt, dass der Anbieter das entsprechende "
            "Konformitätsbewertungsverfahren anwendet."
        )
        detection = _detect_recital(sample)
        assert detection is not None
        assert "126" in detection["recital_numbers"]
        assert "127" in detection["recital_numbers"]
|
||||||
|
|||||||
@@ -214,13 +214,13 @@ Wenn du z.B. eine neue `GetUserStats()` Funktion im Go Service hinzufuegst:
|
|||||||
|
|
||||||
## Modul-spezifische Tests
|
## Modul-spezifische Tests
|
||||||
|
|
||||||
### Canonical Control Generator (71+ Tests)
|
### Canonical Control Generator (81+ Tests)
|
||||||
|
|
||||||
Die Control Library hat eine umfangreiche Test-Suite ueber 6 Dateien.
|
Die Control Library hat eine umfangreiche Test-Suite ueber 6 Dateien.
|
||||||
Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) und [Control Generator Pipeline](../services/sdk-modules/control-generator-pipeline.md) fuer Details.
|
Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) und [Control Generator Pipeline](../services/sdk-modules/control-generator-pipeline.md) fuer Details.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Alle Generator-Tests (71 Tests in 10 Klassen)
|
# Alle Generator-Tests (81 Tests in 12 Klassen)
|
||||||
cd backend-compliance && pytest -v tests/test_control_generator.py
|
cd backend-compliance && pytest -v tests/test_control_generator.py
|
||||||
|
|
||||||
# Similarity Detector Tests
|
# Similarity Detector Tests
|
||||||
@@ -253,3 +253,4 @@ cd backend-compliance && pytest -v tests/test_validate_controls.py
|
|||||||
| `TestBatchProcessingLoop` | 10 | Batch-Verarbeitung (Rule-Split, Mixed-Rules, Too-Close, Null-Handling) |
|
| `TestBatchProcessingLoop` | 10 | Batch-Verarbeitung (Rule-Split, Mixed-Rules, Too-Close, Null-Handling) |
|
||||||
| `TestRegulationFilter` | 5 | regulation_filter Prefix-Matching, leere regulation_codes |
|
| `TestRegulationFilter` | 5 | regulation_filter Prefix-Matching, leere regulation_codes |
|
||||||
| `TestPipelineVersion` | 5 | pipeline_version=2 in DB-Writes, null-Handling in Structure/Reform |
|
| `TestPipelineVersion` | 5 | pipeline_version=2 in DB-Writes, null-Handling in Structure/Reform |
|
||||||
|
| `TestRecitalDetection` | 10 | Erwaegungsgrund-Erkennung in Quelltexten (Regex, Phrasen, Kombiniert) |
|
||||||
|
|||||||
@@ -500,6 +500,39 @@ Die QA-Metriken werden in `generation_metadata` gespeichert:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Recital-Erkennung (Erwägungsgrund-Detektion)
|
||||||
|
|
||||||
|
Die QA-Stufe prueft zusaetzlich, ob der `source_original_text` eines Controls tatsaechlich aus einem Gesetzesartikel stammt — oder aus einem Erwaegungsgrund (Recital). Erwaegungsgruende enthalten keine normativen Pflichten und fuehren zu falsch zugeordneten Controls.
|
||||||
|
|
||||||
|
**Erkennungsmethoden:**
|
||||||
|
|
||||||
|
| Methode | Pattern | Beispiel |
|
||||||
|
|---------|---------|----------|
|
||||||
|
| **Regex** | `\((\d{1,3})\)\s*\n` — Erwaegungsgrund-Nummern | `(126)\nUm den Verwaltungsaufwand...` |
|
||||||
|
| **Phrasen** | Typische Recital-Formulierungen (≥2 Treffer) | "daher sollte", "in Erwägung nachstehender Gründe" |
|
||||||
|
|
||||||
|
**Ergebnis bei Verdacht:**
|
||||||
|
|
||||||
|
- `release_state` wird auf `needs_review` gesetzt
|
||||||
|
- `generation_metadata.recital_suspect = true`
|
||||||
|
- `generation_metadata.recital_detection` enthaelt Details:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"recital_suspect": true,
|
||||||
|
"recital_detection": {
|
||||||
|
"recital_suspect": true,
|
||||||
|
"recital_numbers": ["126", "127"],
|
||||||
|
"recital_phrases": ["daher sollte"],
|
||||||
|
"detection_method": "regex+phrases"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Funktion:** `_detect_recital(text)` in `control_generator.py`
|
||||||
|
|
||||||
|
**Hintergrund:** Bei der Analyse von ~5.500 Controls mit Quelltext wurden 1.555 (28%) als Erwaegungsgrund-Verdacht identifiziert. Der Document Crawler unterschied nicht zwischen Artikeltext und Erwaegungsgruenden, was zu falschen `article`/`paragraph`-Zuordnungen fuehrte.
|
||||||
|
|
||||||
### QA-Reklassifizierung bestehender Controls
|
### QA-Reklassifizierung bestehender Controls
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -530,7 +563,7 @@ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/
|
|||||||
| `backend-compliance/migrations/046_control_generator.sql` | Job-Tracking, Chunk-Tracking Tabellen |
|
| `backend-compliance/migrations/046_control_generator.sql` | Job-Tracking, Chunk-Tracking Tabellen |
|
||||||
| `backend-compliance/migrations/048_processing_path_expand.sql` | Erweiterte Processing-Path-Werte |
|
| `backend-compliance/migrations/048_processing_path_expand.sql` | Erweiterte Processing-Path-Werte |
|
||||||
| `backend-compliance/migrations/062_pipeline_version.sql` | `pipeline_version` Spalte |
|
| `backend-compliance/migrations/062_pipeline_version.sql` | `pipeline_version` Spalte |
|
||||||
| `backend-compliance/tests/test_control_generator.py` | 15 Tests (Lizenz, Domain, Batch, Pipeline) |
|
| `backend-compliance/tests/test_control_generator.py` | 81+ Tests (Lizenz, Domain, Batch, Pipeline, Recital) |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
79
scripts/find_recital_controls.py
Normal file
79
scripts/find_recital_controls.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
"""Find controls where source_original_text contains Erwägungsgrund (recital) markers
|
||||||
|
instead of actual article text — indicates wrong article tagging in RAG chunks."""
|
||||||
|
|
||||||
|
import json
import os
import re
import sys

# Recital marker: a 1-3 digit number in parentheses followed by optional
# whitespace and a line break, e.g. "(125)\n". The pattern is NOT anchored to
# the start of a line, so a marker at the end of a line matches as well.
RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')

# Every control that has both a non-empty source text and a citation.
SUSPECT_QUERY = """
    SELECT control_id, title,
           source_citation::text,
           source_original_text,
           pipeline_version, release_state,
           generation_metadata::text
    FROM canonical_controls
    WHERE source_original_text IS NOT NULL
      AND source_original_text != ''
      AND source_citation IS NOT NULL
    ORDER BY control_id
"""


def _text(value):
    """Return ``value`` as a string, mapping ``None`` (SQL NULL / JSON null) to ""."""
    return "" if value is None else str(value)


def find_recital_numbers(text):
    """Return all recital numbers (as strings) whose marker occurs in ``text``.

    Returns an empty list for empty or missing text.
    """
    if not text:
        return []
    return RECITAL_RE.findall(text)


def parse_citation(citation_json):
    """Decode the citation JSON text into a dict.

    Returns ``{}`` for empty input and for JSON values that are not objects
    (the callers only ever look up keys on the result).
    """
    if not citation_json:
        return {}
    parsed = json.loads(citation_json)
    return parsed if isinstance(parsed, dict) else {}


def build_suspect(control_id, title, citation_json, source_text, pipeline_version, release_state):
    """Build the report entry for one control, or ``None`` if it is not suspect.

    A control is suspect when its source text contains at least one recital
    marker. NULL columns and JSON nulls are normalised to empty strings so the
    entry can always be formatted.
    """
    recitals = find_recital_numbers(source_text)
    if not recitals:
        return None

    citation = parse_citation(citation_json)
    return {
        "control_id": _text(control_id),
        "title": _text(title)[:80],
        "claimed_article": _text(citation.get("article")),
        "claimed_paragraph": _text(citation.get("paragraph")),
        "recitals_found": recitals[:5],  # cap the list to keep the report row short
        "v": pipeline_version,
        "state": _text(release_state),
    }


def format_suspect(s):
    """Render one suspect entry as a fixed-width report row."""
    recitals = ",".join(s["recitals_found"])
    # Pad the recital column to 15 characters; longer lists simply overflow.
    padding = " " * max(0, 15 - len(recitals))
    return (
        f"{s['control_id']:<12} {s['claimed_article']:<10} {s['claimed_paragraph']:<7} "
        f"({recitals}){padding} v{s['v']} {s['state']:<15} {s['title']}"
    )


def main():
    """Query all controls with source text and print the recital suspects."""
    # Imported lazily so the pure helpers above stay importable (e.g. from
    # tests) on machines without the database driver installed.
    import sqlalchemy

    url = os.environ.get("DATABASE_URL", "")
    if not url:
        print("DATABASE_URL not set")
        sys.exit(1)

    engine = sqlalchemy.create_engine(url)

    # Fetch everything first; the connection is released before processing.
    with engine.connect() as conn:
        conn.execute(sqlalchemy.text("SET search_path TO compliance,public"))
        rows = conn.execute(sqlalchemy.text(SUSPECT_QUERY)).fetchall()

    suspects_recital = []
    for cid, title, citation_json, orig, pv, state, _meta_json in rows:
        suspect = build_suspect(cid, title, citation_json, orig, pv, state)
        if suspect is not None:
            suspects_recital.append(suspect)

    print("=== Ergebnis ===")
    print(f"Geprueft: {len(rows)} Controls mit source_original_text")
    print(f"Erwaegungsgrund-Verdacht: {len(suspects_recital)}")
    print()

    if suspects_recital:
        print(f"{'Control':<12} {'Behauptet':<18} {'Recitals':<20} {'v':>2} {'State':<15} Titel")
        print("-" * 120)
        for s in suspects_recital:
            print(format_suspect(s))


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user