From db7c207464bc8222874f86e68fb0eb35de77cdc7 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 26 Mar 2026 10:32:08 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20V1=20Control=20Enrichment=20=E2=80=94?= =?UTF-8?q?=20Eigenentwicklung-Label,=20regulatorisches=20Matching=20&=20V?= =?UTF-8?q?ergleichsansicht?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 863 v1-Controls (manuell geschrieben, ohne Rechtsgrundlage) werden als "Eigenentwicklung" gekennzeichnet und automatisch mit regulatorischen Controls (DSGVO, NIS2, OWASP etc.) per Embedding-Similarity abgeglichen. Backend: - Migration 080: v1_control_matches Tabelle (Cross-Reference) - v1_enrichment.py: Batch-Matching via BGE-M3 + Qdrant (Threshold 0.75) - 3 neue API-Endpoints: enrich-v1-matches, v1-matches, v1-enrichment-stats - 6 Tests (dry-run, execution, matches, pagination, detection) Frontend: - Orange "Eigenentwicklung"-Badge statt grauem "v1" (wenn kein Source) - "Regulatorische Abdeckung"-Sektion im ControlDetail mit Match-Karten - Side-by-Side V1CompareView (Eigenentwicklung vs. regulatorisch gedeckt) - Prev/Next Navigation durch alle Matches Co-Authored-By: Claude Opus 4.6 --- .../app/api/sdk/v1/canonical/route.ts | 18 ++ .../app/sdk/atomic-controls/page.tsx | 2 +- .../components/ControlDetail.tsx | 104 +++++- .../components/ReviewCompare.tsx | 2 +- .../components/V1CompareView.tsx | 155 +++++++++ .../control-library/components/helpers.tsx | 25 +- .../app/sdk/control-library/page.tsx | 40 ++- .../api/canonical_control_routes.py | 60 ++++ .../compliance/services/v1_enrichment.py | 301 ++++++++++++++++++ .../migrations/080_v1_control_matches.sql | 18 ++ .../tests/test_v1_enrichment.py | 220 +++++++++++++ 11 files changed, 939 insertions(+), 6 deletions(-) create mode 100644 admin-compliance/app/sdk/control-library/components/V1CompareView.tsx create mode 100644 backend-compliance/compliance/services/v1_enrichment.py create mode 100644 backend-compliance/migrations/080_v1_control_matches.sql create mode 100644 backend-compliance/tests/test_v1_enrichment.py diff --git a/admin-compliance/app/api/sdk/v1/canonical/route.ts b/admin-compliance/app/api/sdk/v1/canonical/route.ts index 06c87c7..d7ee46c 100644 --- a/admin-compliance/app/api/sdk/v1/canonical/route.ts +++ b/admin-compliance/app/api/sdk/v1/canonical/route.ts @@ -135,6 +135,19 @@ export async function GET(request: NextRequest) { backendPath = '/api/compliance/v1/canonical/blocked-sources' break + case 'v1-matches': { + const matchId = searchParams.get('id') + if (!matchId) { + return NextResponse.json({ error: 'Missing control id' }, { status: 400 }) + } + backendPath = `/api/compliance/v1/canonical/controls/${encodeURIComponent(matchId)}/v1-matches` + break + } + + case 'v1-enrichment-stats': + backendPath = '/api/compliance/v1/canonical/controls/v1-enrichment-stats' + break + case 'controls-customer': { const custSeverity = searchParams.get('severity') const custDomain = searchParams.get('domain') @@ -201,6 +214,11 @@ export async function POST(request: NextRequest) { backendPath = '/api/compliance/v1/canonical/generate/bulk-review' } else if (endpoint === 'blocked-sources-cleanup') { backendPath = '/api/compliance/v1/canonical/blocked-sources/cleanup' + } else if (endpoint === 'enrich-v1-matches') { + const dryRun = searchParams.get('dry_run') ?? 'true' + const batchSize = searchParams.get('batch_size') ?? '100' + const enrichOffset = searchParams.get('offset') ?? '0' + backendPath = `/api/compliance/v1/canonical/controls/enrich-v1-matches?dry_run=${dryRun}&batch_size=${batchSize}&offset=${enrichOffset}` } else if (endpoint === 'similarity-check') { const controlId = searchParams.get('id') if (!controlId) { diff --git a/admin-compliance/app/sdk/atomic-controls/page.tsx b/admin-compliance/app/sdk/atomic-controls/page.tsx index 1fdb7c8..7e21afc 100644 --- a/admin-compliance/app/sdk/atomic-controls/page.tsx +++ b/admin-compliance/app/sdk/atomic-controls/page.tsx @@ -308,7 +308,7 @@ export default function AtomicControlsPage() { - +

{ctrl.title}

diff --git a/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx b/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx index b0386c6..1d8a09a 100644 --- a/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx +++ b/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx @@ -9,7 +9,7 @@ import { import { CanonicalControl, EFFORT_LABELS, BACKEND_URL, SeverityBadge, StateBadge, LicenseRuleBadge, VerificationMethodBadge, CategoryBadge, EvidenceTypeBadge, TargetAudienceBadge, - ObligationTypeBadge, GenerationStrategyBadge, + ObligationTypeBadge, GenerationStrategyBadge, isEigenentwicklung, ExtractionMethodBadge, RegulationCountBadge, VERIFICATION_METHODS, CATEGORY_OPTIONS, EVIDENCE_TYPE_OPTIONS, ObligationInfo, DocumentReference, MergedDuplicate, RegulationSummary, @@ -65,6 +65,20 @@ interface TraceabilityData { regulations_summary?: RegulationSummary[] } +interface V1Match { + matched_control_id: string + matched_title: string + matched_objective: string + matched_severity: string + matched_category: string + matched_source: string | null + matched_article: string | null + matched_source_citation: Record | null + similarity_score: number + match_rank: number + match_method: string +} + interface ControlDetailProps { ctrl: CanonicalControl onBack: () => void @@ -73,6 +87,7 @@ interface ControlDetailProps { onReview: (controlId: string, action: string) => void onRefresh?: () => void onNavigateToControl?: (controlId: string) => void + onCompare?: (ctrl: CanonicalControl, matches: V1Match[]) => void // Review mode navigation reviewMode?: boolean reviewIndex?: number @@ -89,6 +104,7 @@ export function ControlDetail({ onReview, onRefresh, onNavigateToControl, + onCompare, reviewMode, reviewIndex = 0, reviewTotal = 0, @@ -101,6 +117,9 @@ export function ControlDetail({ const [merging, setMerging] = useState(false) const [traceability, setTraceability] = useState(null) const [loadingTrace, setLoadingTrace] = useState(false) + const [v1Matches, setV1Matches] = useState([]) + const [loadingV1, setLoadingV1] = useState(false) + const eigenentwicklung = isEigenentwicklung(ctrl) const loadTraceability = useCallback(async () => { setLoadingTrace(true) @@ -117,9 +136,21 @@ export function ControlDetail({ finally { setLoadingTrace(false) } }, [ctrl.control_id]) + const loadV1Matches = useCallback(async () => { + if (!eigenentwicklung) { setV1Matches([]); return } + setLoadingV1(true) + try { + const res = await fetch(`${BACKEND_URL}?endpoint=v1-matches&id=${ctrl.control_id}`) + if (res.ok) setV1Matches(await res.json()) + else setV1Matches([]) + } catch { setV1Matches([]) } + finally { setLoadingV1(false) } + }, [ctrl.control_id, eigenentwicklung]) + useEffect(() => { loadSimilarControls() loadTraceability() + loadV1Matches() setSelectedDuplicates(new Set()) // eslint-disable-next-line react-hooks/exhaustive-deps }, [ctrl.control_id]) @@ -187,7 +218,7 @@ export function ControlDetail({ - +

{ctrl.title}

@@ -303,6 +334,75 @@ export function ControlDetail({ )} + {/* Regulatorische Abdeckung (Eigenentwicklung) */} + {eigenentwicklung && ( +
+
+ +

+ Regulatorische Abdeckung +

+ {loadingV1 && Laden...} +
+ {v1Matches.length > 0 ? ( +
+ {v1Matches.map((match, i) => ( +
+
+
+
+ {match.matched_source && ( + + {match.matched_source} + + )} + {match.matched_article && ( + + {match.matched_article} + + )} + = 0.85 ? 'bg-green-100 text-green-700' : + match.similarity_score >= 0.80 ? 'bg-yellow-100 text-yellow-700' : + 'bg-gray-100 text-gray-600' + }`}> + {(match.similarity_score * 100).toFixed(0)}% + +
+

+ {onNavigateToControl ? ( + + ) : ( + + {match.matched_control_id} + + )} + {match.matched_title} +

+
+ {onCompare && ( + + )} +
+
+ ))} +
+ ) : !loadingV1 ? ( +

Keine regulatorische Abdeckung gefunden. Dieses Control ist eine reine Eigenentwicklung.

+ ) : null} +
+ )} + {/* Rechtsgrundlagen / Traceability (atomic controls) */} {traceability && traceability.parent_links.length > 0 && (
diff --git a/admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx b/admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx index 5d4c92e..5d54cc3 100644 --- a/admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx +++ b/admin-compliance/app/sdk/control-library/components/ReviewCompare.tsx @@ -15,7 +15,7 @@ import { // Compact Control Panel (used on both sides of the comparison) // ============================================================================= -function ControlPanel({ ctrl, label, highlight }: { ctrl: CanonicalControl; label: string; highlight?: boolean }) { +export function ControlPanel({ ctrl, label, highlight }: { ctrl: CanonicalControl; label: string; highlight?: boolean }) { return (
{/* Panel Header */} diff --git a/admin-compliance/app/sdk/control-library/components/V1CompareView.tsx b/admin-compliance/app/sdk/control-library/components/V1CompareView.tsx new file mode 100644 index 0000000..9ed1d56 --- /dev/null +++ b/admin-compliance/app/sdk/control-library/components/V1CompareView.tsx @@ -0,0 +1,155 @@ +'use client' + +import { useState, useEffect } from 'react' +import { + ArrowLeft, ChevronLeft, SkipForward, Scale, +} from 'lucide-react' +import { CanonicalControl, BACKEND_URL } from './helpers' +import { ControlPanel } from './ReviewCompare' + +interface V1Match { + matched_control_id: string + matched_title: string + matched_objective: string + matched_severity: string + matched_category: string + matched_source: string | null + matched_article: string | null + matched_source_citation: Record | null + similarity_score: number + match_rank: number + match_method: string +} + +interface V1CompareViewProps { + v1Control: CanonicalControl + matches: V1Match[] + onBack: () => void + onNavigateToControl?: (controlId: string) => void +} + +export function V1CompareView({ v1Control, matches, onBack, onNavigateToControl }: V1CompareViewProps) { + const [currentMatchIndex, setCurrentMatchIndex] = useState(0) + const [matchedControl, setMatchedControl] = useState(null) + const [loading, setLoading] = useState(false) + + const currentMatch = matches[currentMatchIndex] + + // Load the full matched control when index changes + useEffect(() => { + if (!currentMatch) return + const load = async () => { + setLoading(true) + try { + const res = await fetch(`${BACKEND_URL}?endpoint=control&id=${encodeURIComponent(currentMatch.matched_control_id)}`) + if (res.ok) { + setMatchedControl(await res.json()) + } else { + setMatchedControl(null) + } + } catch { + setMatchedControl(null) + } finally { + setLoading(false) + } + } + load() + }, [currentMatch]) + + return ( +
+ {/* Header */} +
+
+ +
+
+ + V1-Vergleich + {currentMatch && ( + = 0.85 ? 'bg-green-100 text-green-700' : + currentMatch.similarity_score >= 0.80 ? 'bg-yellow-100 text-yellow-700' : + 'bg-gray-100 text-gray-600' + }`}> + {(currentMatch.similarity_score * 100).toFixed(1)}% Aehnlichkeit + + )} +
+
+
+ +
+ {/* Navigation */} +
+ + + {currentMatchIndex + 1} / {matches.length} + + +
+ + {/* Navigate to matched control */} + {onNavigateToControl && matchedControl && ( + + )} +
+
+ + {/* Source info bar */} + {currentMatch && (currentMatch.matched_source || currentMatch.matched_article) && ( +
+ + {currentMatch.matched_source && ( + {currentMatch.matched_source} + )} + {currentMatch.matched_article && ( + {currentMatch.matched_article} + )} +
+ )} + + {/* Side-by-Side Panels */} +
+ {/* Left: V1 Eigenentwicklung */} +
+ +
+ + {/* Right: Regulatory match */} +
+ {loading ? ( +
+
+
+ ) : matchedControl ? ( + + ) : ( +
+ Control konnte nicht geladen werden +
+ )} +
+
+
+ ) +} diff --git a/admin-compliance/app/sdk/control-library/components/helpers.tsx b/admin-compliance/app/sdk/control-library/components/helpers.tsx index dd50c58..2495a2d 100644 --- a/admin-compliance/app/sdk/control-library/components/helpers.tsx +++ b/admin-compliance/app/sdk/control-library/components/helpers.tsx @@ -52,6 +52,7 @@ export interface CanonicalControl { parent_control_id?: string | null parent_control_title?: string | null decomposition_method?: string | null + pipeline_version?: number | string | null created_at: string updated_at: string } @@ -293,7 +294,29 @@ export function TargetAudienceBadge({ audience }: { audience: string | string[] ) } -export function GenerationStrategyBadge({ strategy }: { strategy: string | null | undefined }) { +export interface CanonicalControlPipelineInfo { + pipeline_version?: number | string | null + source_citation?: Record | null + parent_control_uuid?: string | null +} + +export function isEigenentwicklung(ctrl: CanonicalControlPipelineInfo & { generation_strategy?: string | null }): boolean { + return ( + (!ctrl.generation_strategy || ctrl.generation_strategy === 'ungrouped') && + (!ctrl.pipeline_version || String(ctrl.pipeline_version) === '1') && + !ctrl.source_citation && + !ctrl.parent_control_uuid + ) +} + +export function GenerationStrategyBadge({ strategy, pipelineInfo }: { + strategy: string | null | undefined + pipelineInfo?: CanonicalControlPipelineInfo & { generation_strategy?: string | null } +}) { + // Eigenentwicklung detection: v1 + no source + no parent + if (pipelineInfo && isEigenentwicklung(pipelineInfo)) { + return Eigenentwicklung + } if (!strategy || strategy === 'ungrouped') { return v1 } diff --git a/admin-compliance/app/sdk/control-library/page.tsx b/admin-compliance/app/sdk/control-library/page.tsx index 3fe398d..e473b29 100644 --- a/admin-compliance/app/sdk/control-library/page.tsx +++ b/admin-compliance/app/sdk/control-library/page.tsx @@ -15,6 +15,7 @@ import { import { ControlForm } from './components/ControlForm' import { ControlDetail } from './components/ControlDetail' import { ReviewCompare } from './components/ReviewCompare' +import { V1CompareView } from './components/V1CompareView' import { GeneratorModal } from './components/GeneratorModal' // ============================================================================= @@ -79,6 +80,17 @@ export default function ControlLibraryPage() { const [reviewDuplicates, setReviewDuplicates] = useState([]) const [reviewRule3, setReviewRule3] = useState([]) + // V1 Compare mode + const [compareMode, setCompareMode] = useState(false) + const [compareV1Control, setCompareV1Control] = useState(null) + const [compareMatches, setCompareMatches] = useState | null + similarity_score: number; match_rank: number; match_method: string + }>>([]) + // Debounce search const searchTimer = useRef | null>(null) useEffect(() => { @@ -398,6 +410,27 @@ export default function ControlLibraryPage() { ) } + // V1 COMPARE MODE + if (compareMode && compareV1Control) { + return ( + { setCompareMode(false) }} + onNavigateToControl={async (controlId: string) => { + try { + const res = await fetch(`${BACKEND_URL}?endpoint=control&id=${controlId}`) + if (res.ok) { + setCompareMode(false) + setSelectedControl(await res.json()) + setMode('detail') + } + } catch { /* ignore */ } + }} + /> + ) + } + // DETAIL MODE if (mode === 'detail' && selectedControl) { const isDuplicateReview = reviewMode && reviewTab === 'duplicates' @@ -467,6 +500,11 @@ export default function ControlLibraryPage() { onDelete={handleDelete} onReview={handleReview} onRefresh={fullReload} + onCompare={(ctrl, matches) => { + setCompareV1Control(ctrl) + setCompareMatches(matches) + setCompareMode(true) + }} onNavigateToControl={async (controlId: string) => { try { const res = await fetch(`${BACKEND_URL}?endpoint=control&id=${controlId}`) @@ -806,7 +844,7 @@ export default function ControlLibraryPage() { - + {ctrl.risk_score !== null && ( Score: {ctrl.risk_score} diff --git a/backend-compliance/compliance/api/canonical_control_routes.py b/backend-compliance/compliance/api/canonical_control_routes.py index bf89cd5..de13988 100644 --- a/backend-compliance/compliance/api/canonical_control_routes.py +++ b/backend-compliance/compliance/api/canonical_control_routes.py @@ -547,6 +547,15 @@ async def atomic_stats(): } +@router.get("/controls/v1-enrichment-stats") +async def v1_enrichment_stats_endpoint(): + """ + Uebersicht: Wie viele v1 Controls haben regulatorische Abdeckung? + """ + from compliance.services.v1_enrichment import get_v1_enrichment_stats + return await get_v1_enrichment_stats() + + @router.get("/controls/{control_id}") async def get_control(control_id: str): """Get a single canonical control by its control_id (e.g. AUTH-001).""" @@ -1567,6 +1576,57 @@ async def list_licenses(): return get_license_matrix(db) +# ============================================================================= +# V1 ENRICHMENT (Eigenentwicklung → Regulatorische Abdeckung) +# ============================================================================= + +@router.post("/controls/enrich-v1-matches") +async def enrich_v1_matches_endpoint( + dry_run: bool = Query(True, description="Nur zaehlen, nicht schreiben"), + batch_size: int = Query(100, description="Controls pro Durchlauf"), + offset: int = Query(0, description="Offset fuer Paginierung"), +): + """ + Findet regulatorische Abdeckung fuer v1 Eigenentwicklung Controls. + + Eigenentwicklung = generation_strategy='ungrouped', pipeline_version=1, + source_citation IS NULL, parent_control_uuid IS NULL. + + Workflow: + 1. dry_run=true → Statistiken anzeigen + 2. dry_run=false&batch_size=100&offset=0 → Erste 100 verarbeiten + 3. Wiederholen mit next_offset bis fertig + """ + from compliance.services.v1_enrichment import enrich_v1_matches + return await enrich_v1_matches( + dry_run=dry_run, + batch_size=batch_size, + offset=offset, + ) + + +@router.get("/controls/{control_id}/v1-matches") +async def get_v1_matches_endpoint(control_id: str): + """ + Gibt regulatorische Matches fuer ein v1 Control zurueck. + + Returns: + Liste von Matches mit Control-Details, Source, Score. + """ + from compliance.services.v1_enrichment import get_v1_matches + + # Resolve control_id to UUID + with SessionLocal() as db: + row = db.execute(text(""" + SELECT id FROM canonical_controls WHERE control_id = :cid + """), {"cid": control_id}).fetchone() + + if not row: + raise HTTPException(status_code=404, detail=f"Control {control_id} not found") + + return await get_v1_matches(str(row.id)) + + # ============================================================================= # INTERNAL HELPERS # ============================================================================= diff --git a/backend-compliance/compliance/services/v1_enrichment.py b/backend-compliance/compliance/services/v1_enrichment.py new file mode 100644 index 0000000..39f0f34 --- /dev/null +++ b/backend-compliance/compliance/services/v1_enrichment.py @@ -0,0 +1,301 @@ +"""V1 Control Enrichment Service — Match Eigenentwicklung controls to regulations. + +Finds regulatory coverage for v1 controls (generation_strategy='ungrouped', +pipeline_version=1, no source_citation) by embedding similarity search. + +Reuses embedding + Qdrant helpers from control_dedup.py. +""" + +import logging +from typing import Optional + +from sqlalchemy import text + +from database import SessionLocal +from compliance.services.control_dedup import ( + get_embedding, + qdrant_search_cross_regulation, +) + +logger = logging.getLogger(__name__) + +# Similarity threshold — lower than dedup (0.85) since we want informational matches +V1_MATCH_THRESHOLD = 0.75 +V1_MAX_MATCHES = 5 + + +def _is_eigenentwicklung_query() -> str: + """SQL WHERE clause identifying v1 Eigenentwicklung controls.""" + return """ + generation_strategy = 'ungrouped' + AND (pipeline_version = '1' OR pipeline_version IS NULL) + AND source_citation IS NULL + AND parent_control_uuid IS NULL + AND release_state NOT IN ('rejected', 'merged', 'deprecated') + """ + + +async def count_v1_controls() -> int: + """Count how many v1 Eigenentwicklung controls exist.""" + with SessionLocal() as db: + row = db.execute(text(f""" + SELECT COUNT(*) AS cnt + FROM canonical_controls + WHERE {_is_eigenentwicklung_query()} + """)).fetchone() + return row.cnt if row else 0 + + +async def enrich_v1_matches( + dry_run: bool = True, + batch_size: int = 100, + offset: int = 0, +) -> dict: + """Find regulatory matches for v1 Eigenentwicklung controls. + + Args: + dry_run: If True, only count — don't write matches. + batch_size: Number of v1 controls to process per call. + offset: Pagination offset (v1 control index). + + Returns: + Stats dict with counts, sample matches, and pagination info. + """ + with SessionLocal() as db: + # 1. Load v1 controls (paginated) + v1_controls = db.execute(text(f""" + SELECT id, control_id, title, objective, category + FROM canonical_controls + WHERE {_is_eigenentwicklung_query()} + ORDER BY control_id + LIMIT :limit OFFSET :offset + """), {"limit": batch_size, "offset": offset}).fetchall() + + # Count total for pagination + total_row = db.execute(text(f""" + SELECT COUNT(*) AS cnt + FROM canonical_controls + WHERE {_is_eigenentwicklung_query()} + """)).fetchone() + total_v1 = total_row.cnt if total_row else 0 + + if not v1_controls: + return { + "dry_run": dry_run, + "processed": 0, + "total_v1": total_v1, + "message": "Kein weiterer Batch — alle v1 Controls verarbeitet.", + } + + if dry_run: + return { + "dry_run": True, + "total_v1": total_v1, + "offset": offset, + "batch_size": batch_size, + "sample_controls": [ + { + "control_id": r.control_id, + "title": r.title, + "category": r.category, + } + for r in v1_controls[:20] + ], + } + + # 2. Process each v1 control + processed = 0 + matches_inserted = 0 + errors = [] + sample_matches = [] + + for v1 in v1_controls: + try: + # Build search text + search_text = f"{v1.title} — {v1.objective}" + + # Get embedding + embedding = await get_embedding(search_text) + if not embedding: + errors.append({ + "control_id": v1.control_id, + "error": "Embedding fehlgeschlagen", + }) + continue + + # Search Qdrant (cross-regulation, no pattern filter) + results = await qdrant_search_cross_regulation( + embedding, top_k=10, + ) + + # Filter: only regulatory controls (with source_citation) + # and above threshold + rank = 0 + for hit in results: + score = hit.get("score", 0) + if score < V1_MATCH_THRESHOLD: + continue + + payload = hit.get("payload", {}) + matched_uuid = payload.get("control_uuid") + if not matched_uuid or matched_uuid == str(v1.id): + continue + + # Check if matched control has source_citation + matched_row = db.execute(text(""" + SELECT id, control_id, title, source_citation, severity, category + FROM canonical_controls + WHERE id = CAST(:uuid AS uuid) + AND source_citation IS NOT NULL + """), {"uuid": matched_uuid}).fetchone() + + if not matched_row: + continue + + rank += 1 + if rank > V1_MAX_MATCHES: + break + + # Extract source info + source_citation = matched_row.source_citation or {} + matched_source = source_citation.get("source") if isinstance(source_citation, dict) else None + matched_article = source_citation.get("article") if isinstance(source_citation, dict) else None + + # Insert match (ON CONFLICT skip) + db.execute(text(""" + INSERT INTO v1_control_matches + (v1_control_uuid, matched_control_uuid, similarity_score, + match_rank, matched_source, matched_article, match_method) + VALUES + (CAST(:v1_uuid AS uuid), CAST(:matched_uuid AS uuid), :score, + :rank, :source, :article, 'embedding') + ON CONFLICT (v1_control_uuid, matched_control_uuid) DO UPDATE + SET similarity_score = EXCLUDED.similarity_score, + match_rank = EXCLUDED.match_rank + """), { + "v1_uuid": str(v1.id), + "matched_uuid": str(matched_row.id), + "score": round(score, 3), + "rank": rank, + "source": matched_source, + "article": matched_article, + }) + matches_inserted += 1 + + # Collect sample + if len(sample_matches) < 20: + sample_matches.append({ + "v1_control_id": v1.control_id, + "v1_title": v1.title, + "matched_control_id": matched_row.control_id, + "matched_title": matched_row.title, + "matched_source": matched_source, + "matched_article": matched_article, + "similarity_score": round(score, 3), + "match_rank": rank, + }) + + processed += 1 + + except Exception as e: + logger.warning("V1 enrichment error for %s: %s", v1.control_id, e) + errors.append({ + "control_id": v1.control_id, + "error": str(e), + }) + + db.commit() + + # Pagination + next_offset = offset + batch_size if len(v1_controls) == batch_size else None + + return { + "dry_run": False, + "offset": offset, + "batch_size": batch_size, + "next_offset": next_offset, + "total_v1": total_v1, + "processed": processed, + "matches_inserted": matches_inserted, + "errors": errors[:10], + "sample_matches": sample_matches, + } + + +async def get_v1_matches(control_uuid: str) -> list[dict]: + """Get all regulatory matches for a specific v1 control. + + Args: + control_uuid: The UUID of the v1 control. + + Returns: + List of match dicts with control details. + """ + with SessionLocal() as db: + rows = db.execute(text(""" + SELECT + m.similarity_score, + m.match_rank, + m.matched_source, + m.matched_article, + m.match_method, + c.control_id AS matched_control_id, + c.title AS matched_title, + c.objective AS matched_objective, + c.severity AS matched_severity, + c.category AS matched_category, + c.source_citation AS matched_source_citation + FROM v1_control_matches m + JOIN canonical_controls c ON c.id = m.matched_control_uuid + WHERE m.v1_control_uuid = CAST(:uuid AS uuid) + ORDER BY m.match_rank + """), {"uuid": control_uuid}).fetchall() + + return [ + { + "matched_control_id": r.matched_control_id, + "matched_title": r.matched_title, + "matched_objective": r.matched_objective, + "matched_severity": r.matched_severity, + "matched_category": r.matched_category, + "matched_source": r.matched_source, + "matched_article": r.matched_article, + "matched_source_citation": r.matched_source_citation, + "similarity_score": float(r.similarity_score), + "match_rank": r.match_rank, + "match_method": r.match_method, + } + for r in rows + ] + + +async def get_v1_enrichment_stats() -> dict: + """Get overview stats for v1 enrichment.""" + with SessionLocal() as db: + total_v1 = db.execute(text(f""" + SELECT COUNT(*) AS cnt FROM canonical_controls + WHERE {_is_eigenentwicklung_query()} + """)).fetchone() + + matched_v1 = db.execute(text(f""" + SELECT COUNT(DISTINCT m.v1_control_uuid) AS cnt + FROM v1_control_matches m + JOIN canonical_controls c ON c.id = m.v1_control_uuid + WHERE {_is_eigenentwicklung_query().replace('release_state', 'c.release_state').replace('generation_strategy', 'c.generation_strategy').replace('pipeline_version', 'c.pipeline_version').replace('source_citation', 'c.source_citation').replace('parent_control_uuid', 'c.parent_control_uuid')} + """)).fetchone() + + total_matches = db.execute(text(""" + SELECT COUNT(*) AS cnt FROM v1_control_matches + """)).fetchone() + + avg_score = db.execute(text(""" + SELECT AVG(similarity_score) AS avg_score FROM v1_control_matches + """)).fetchone() + + return { + "total_v1_controls": total_v1.cnt if total_v1 else 0, + "v1_with_matches": matched_v1.cnt if matched_v1 else 0, + "v1_without_matches": (total_v1.cnt if total_v1 else 0) - (matched_v1.cnt if matched_v1 else 0), + "total_matches": total_matches.cnt if total_matches else 0, + "avg_similarity_score": round(float(avg_score.avg_score), 3) if avg_score and avg_score.avg_score else None, + } diff --git a/backend-compliance/migrations/080_v1_control_matches.sql b/backend-compliance/migrations/080_v1_control_matches.sql new file mode 100644 index 0000000..653ec43 --- /dev/null +++ b/backend-compliance/migrations/080_v1_control_matches.sql @@ -0,0 +1,18 @@ +-- V1 Control Enrichment: Cross-reference table for matching +-- Eigenentwicklung (v1, ungrouped, no source) → regulatorische Controls + +CREATE TABLE IF NOT EXISTS v1_control_matches ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + v1_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE, + matched_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE, + similarity_score NUMERIC(4,3) NOT NULL, + match_rank SMALLINT NOT NULL DEFAULT 1, + matched_source TEXT, -- e.g. "DSGVO (EU) 2016/679" + matched_article TEXT, -- e.g. "Art. 32" + match_method VARCHAR(30) NOT NULL DEFAULT 'embedding', + created_at TIMESTAMPTZ DEFAULT NOW(), + CONSTRAINT uq_v1_match UNIQUE (v1_control_uuid, matched_control_uuid) +); + +CREATE INDEX IF NOT EXISTS idx_v1m_v1 ON v1_control_matches(v1_control_uuid); +CREATE INDEX IF NOT EXISTS idx_v1m_matched ON v1_control_matches(matched_control_uuid); diff --git a/backend-compliance/tests/test_v1_enrichment.py b/backend-compliance/tests/test_v1_enrichment.py new file mode 100644 index 0000000..cc95fe8 --- /dev/null +++ b/backend-compliance/tests/test_v1_enrichment.py @@ -0,0 +1,220 @@ +"""Tests for V1 Control Enrichment (Eigenentwicklung matching).""" +import sys +sys.path.insert(0, ".") + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from compliance.services.v1_enrichment import ( + enrich_v1_matches, + get_v1_matches, + count_v1_controls, +) + + +class TestV1EnrichmentDryRun: + """Dry-run mode should return statistics without touching DB.""" + + @pytest.mark.asyncio + async def test_dry_run_returns_stats(self): + mock_v1 = [ + MagicMock( + id="uuid-v1-1", + control_id="ACC-013", + title="Zugriffskontrolle", + objective="Zugriff einschraenken", + category="access", + ), + MagicMock( + id="uuid-v1-2", + control_id="SEC-005", + title="Verschluesselung", + objective="Daten verschluesseln", + category="encryption", + ), + ] + + mock_count = MagicMock(cnt=863) + + with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session: + db = MagicMock() + mock_session.return_value.__enter__ = MagicMock(return_value=db) + mock_session.return_value.__exit__ = MagicMock(return_value=False) + # First call: v1 controls, second call: count + db.execute.return_value.fetchall.return_value = mock_v1 + db.execute.return_value.fetchone.return_value = mock_count + + result = await enrich_v1_matches(dry_run=True, batch_size=100, offset=0) + + assert result["dry_run"] is True + assert result["total_v1"] == 863 + assert len(result["sample_controls"]) == 2 + assert result["sample_controls"][0]["control_id"] == "ACC-013" + + +class TestV1EnrichmentExecution: + """Execution mode should find matches and insert them.""" + + @pytest.mark.asyncio + async def test_processes_and_inserts_matches(self): + mock_v1 = [ + MagicMock( + id="uuid-v1-1", + control_id="ACC-013", + title="Zugriffskontrolle", + objective="Zugriff auf Systeme einschraenken", + category="access", + ), + ] + + mock_count = MagicMock(cnt=1) + mock_matched_row = MagicMock( + id="uuid-reg-1", + control_id="SEC-042", + title="Verschluesselung personenbezogener Daten", + source_citation={"source": "DSGVO (EU) 2016/679", "article": "Art. 32"}, + severity="high", + category="encryption", + ) + + mock_qdrant_results = [ + { + "score": 0.89, + "payload": { + "control_uuid": "uuid-reg-1", + "control_id": "SEC-042", + "title": "Verschluesselung", + }, + }, + { + "score": 0.65, # Below threshold + "payload": { + "control_uuid": "uuid-reg-2", + "control_id": "SEC-100", + }, + }, + ] + + with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session: + db = MagicMock() + mock_session.return_value.__enter__ = MagicMock(return_value=db) + mock_session.return_value.__exit__ = MagicMock(return_value=False) + + # Multiple execute calls: v1 list, count, matched_row lookup, insert + call_count = [0] + def side_effect_execute(query, params=None): + call_count[0] += 1 + result = MagicMock() + # fetchall for v1 controls list + result.fetchall.return_value = mock_v1 + # fetchone for count and matched row + if "COUNT" in str(query): + result.fetchone.return_value = mock_count + elif "source_citation IS NOT NULL" in str(query): + result.fetchone.return_value = mock_matched_row + else: + result.fetchone.return_value = mock_count + return result + + db.execute.side_effect = side_effect_execute + + with patch("compliance.services.v1_enrichment.get_embedding") as mock_embed, \ + patch("compliance.services.v1_enrichment.qdrant_search_cross_regulation") as mock_qdrant: + mock_embed.return_value = [0.1] * 1024 + mock_qdrant.return_value = mock_qdrant_results + + result = await enrich_v1_matches(dry_run=False, batch_size=100, offset=0) + + assert result["dry_run"] is False + assert result["processed"] == 1 + assert result["matches_inserted"] == 1 + assert len(result["sample_matches"]) == 1 + assert result["sample_matches"][0]["matched_control_id"] == "SEC-042" + assert result["sample_matches"][0]["similarity_score"] == 0.89 + + @pytest.mark.asyncio + async def test_empty_batch_returns_done(self): + mock_count = MagicMock(cnt=863) + + with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session: + db = MagicMock() + mock_session.return_value.__enter__ = MagicMock(return_value=db) + mock_session.return_value.__exit__ = MagicMock(return_value=False) + db.execute.return_value.fetchall.return_value = [] + db.execute.return_value.fetchone.return_value = mock_count + + result = await enrich_v1_matches(dry_run=False, batch_size=100, offset=9999) + + assert result["processed"] == 0 + assert "alle v1 Controls verarbeitet" in result["message"] + + +class TestV1MatchesEndpoint: + """Test the matches retrieval.""" + + @pytest.mark.asyncio + async def test_returns_matches(self): + mock_rows = [ + MagicMock( + matched_control_id="SEC-042", + matched_title="Verschluesselung", + matched_objective="Daten verschluesseln", + matched_severity="high", + matched_category="encryption", + matched_source="DSGVO (EU) 2016/679", + matched_article="Art. 32", + matched_source_citation={"source": "DSGVO (EU) 2016/679"}, + similarity_score=0.89, + match_rank=1, + match_method="embedding", + ), + ] + + with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session: + db = MagicMock() + mock_session.return_value.__enter__ = MagicMock(return_value=db) + mock_session.return_value.__exit__ = MagicMock(return_value=False) + db.execute.return_value.fetchall.return_value = mock_rows + + result = await get_v1_matches("uuid-v1-1") + + assert len(result) == 1 + assert result[0]["matched_control_id"] == "SEC-042" + assert result[0]["similarity_score"] == 0.89 + assert result[0]["matched_source"] == "DSGVO (EU) 2016/679" + + @pytest.mark.asyncio + async def test_empty_matches(self): + with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session: + db = MagicMock() + mock_session.return_value.__enter__ = MagicMock(return_value=db) + mock_session.return_value.__exit__ = MagicMock(return_value=False) + db.execute.return_value.fetchall.return_value = [] + + result = await get_v1_matches("uuid-nonexistent") + + assert result == [] + + +class TestEigenentwicklungDetection: + """Verify the Eigenentwicklung detection query.""" + + @pytest.mark.asyncio + async def test_count_v1_controls(self): + mock_count = MagicMock(cnt=863) + + with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session: + db = MagicMock() + mock_session.return_value.__enter__ = MagicMock(return_value=db) + mock_session.return_value.__exit__ = MagicMock(return_value=False) + db.execute.return_value.fetchone.return_value = mock_count + + result = await count_v1_controls() + + assert result == 863 + # Verify the query includes all conditions + call_args = db.execute.call_args[0][0] + query_str = str(call_args) + assert "generation_strategy = 'ungrouped'" in query_str + assert "source_citation IS NULL" in query_str + assert "parent_control_uuid IS NULL" in query_str