feat: V1 Control Enrichment — Eigenentwicklung-Label, regulatorisches Matching & Vergleichsansicht
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 39s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 4s

863 v1-Controls (manuell geschrieben, ohne Rechtsgrundlage) werden als
"Eigenentwicklung" gekennzeichnet und automatisch mit regulatorischen
Controls (DSGVO, NIS2, OWASP etc.) per Embedding-Similarity abgeglichen.

Backend:
- Migration 080: v1_control_matches Tabelle (Cross-Reference)
- v1_enrichment.py: Batch-Matching via BGE-M3 + Qdrant (Threshold 0.75)
- 3 neue API-Endpoints: enrich-v1-matches, v1-matches, v1-enrichment-stats
- 6 Tests (dry-run, execution, matches, pagination, detection)

Frontend:
- Orange "Eigenentwicklung"-Badge statt grauem "v1" (wenn kein Source)
- "Regulatorische Abdeckung"-Sektion im ControlDetail mit Match-Karten
- Side-by-Side V1CompareView (Eigenentwicklung vs. regulatorisch gedeckt)
- Prev/Next Navigation durch alle Matches

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-26 10:32:08 +01:00
parent cb034b8009
commit db7c207464
11 changed files with 939 additions and 6 deletions

View File

@@ -547,6 +547,15 @@ async def atomic_stats():
}
@router.get("/controls/v1-enrichment-stats")
async def v1_enrichment_stats_endpoint():
    """Return an overview of how many v1 controls have regulatory coverage.

    The numbers are computed by ``get_v1_enrichment_stats`` in the
    v1 enrichment service; this handler only forwards its result.
    """
    # Service import stays local to the handler, as in the other v1 endpoints.
    from compliance.services.v1_enrichment import get_v1_enrichment_stats

    stats = await get_v1_enrichment_stats()
    return stats
@router.get("/controls/{control_id}")
async def get_control(control_id: str):
"""Get a single canonical control by its control_id (e.g. AUTH-001)."""
@@ -1567,6 +1576,57 @@ async def list_licenses():
return get_license_matrix(db)
# =============================================================================
# V1 ENRICHMENT (Eigenentwicklung → Regulatorische Abdeckung)
# =============================================================================
@router.post("/controls/enrich-v1-matches")
async def enrich_v1_matches_endpoint(
    dry_run: bool = Query(True, description="Nur zaehlen, nicht schreiben"),
    batch_size: int = Query(100, ge=1, description="Controls pro Durchlauf"),
    offset: int = Query(0, ge=0, description="Offset fuer Paginierung"),
):
    """Find regulatory coverage for v1 "Eigenentwicklung" controls.

    A control counts as Eigenentwicklung when generation_strategy is
    'ungrouped', pipeline_version is '1' or NULL, source_citation IS NULL
    and parent_control_uuid IS NULL.

    Workflow:
        1. dry_run=true -> show statistics only.
        2. dry_run=false&batch_size=100&offset=0 -> process the first 100.
        3. Repeat with the returned ``next_offset`` until it is null.

    ``batch_size`` must be >= 1 and ``offset`` >= 0; both end up in the SQL
    LIMIT/OFFSET clause, so invalid values are rejected during request
    validation instead of failing inside the database.
    """
    from compliance.services.v1_enrichment import enrich_v1_matches

    return await enrich_v1_matches(
        dry_run=dry_run,
        batch_size=batch_size,
        offset=offset,
    )
@router.get("/controls/{control_id}/v1-matches")
async def get_v1_matches_endpoint(control_id: str):
    """Return the regulatory matches stored for one v1 control.

    The human-readable ``control_id`` is first resolved to the control's
    UUID; a 404 is raised when no such control exists.

    Returns:
        List of matches with control details, source and score.
    """
    from compliance.services.v1_enrichment import get_v1_matches

    lookup_sql = text("""
        SELECT id FROM canonical_controls WHERE control_id = :cid
    """)

    # Resolve control_id to its UUID.
    with SessionLocal() as db:
        control_row = db.execute(lookup_sql, {"cid": control_id}).fetchone()

    if not control_row:
        raise HTTPException(status_code=404, detail=f"Control {control_id} not found")

    control_uuid = str(control_row.id)
    return await get_v1_matches(control_uuid)
# =============================================================================
# INTERNAL HELPERS
# =============================================================================

View File

@@ -0,0 +1,301 @@
"""V1 Control Enrichment Service — Match Eigenentwicklung controls to regulations.
Finds regulatory coverage for v1 controls (generation_strategy='ungrouped',
pipeline_version=1, no source_citation) by embedding similarity search.
Reuses embedding + Qdrant helpers from control_dedup.py.
"""
import logging
from typing import Optional
from sqlalchemy import text
from database import SessionLocal
from compliance.services.control_dedup import (
get_embedding,
qdrant_search_cross_regulation,
)
logger = logging.getLogger(__name__)
# Minimum similarity score a search hit needs to be stored as a match.
# Deliberately lower than the dedup threshold (0.85) because these matches
# are informational only.
V1_MATCH_THRESHOLD = 0.75
# Upper bound on the number of regulatory matches stored per v1 control.
V1_MAX_MATCHES = 5
def _is_eigenentwicklung_query() -> str:
"""SQL WHERE clause identifying v1 Eigenentwicklung controls."""
return """
generation_strategy = 'ungrouped'
AND (pipeline_version = '1' OR pipeline_version IS NULL)
AND source_citation IS NULL
AND parent_control_uuid IS NULL
AND release_state NOT IN ('rejected', 'merged', 'deprecated')
"""
async def count_v1_controls() -> int:
    """Return the number of v1 Eigenentwicklung controls in the database."""
    count_sql = text(f"""
        SELECT COUNT(*) AS cnt
        FROM canonical_controls
        WHERE {_is_eigenentwicklung_query()}
    """)

    with SessionLocal() as db:
        count_row = db.execute(count_sql).fetchone()

    if count_row:
        return count_row.cnt
    return 0
async def _match_v1_control(db, v1) -> Optional[list]:
    """Find and upsert the regulatory matches for a single v1 control.

    Args:
        db: Open database session; the caller owns commit/rollback.
        v1: Row with ``id``, ``control_id``, ``title`` and ``objective``.

    Returns:
        A list of match summary dicts (possibly empty), or ``None`` when no
        embedding could be produced for the control.
    """
    # Join title and objective with a separator so the words at the seam do
    # not fuse into one token, and skip missing parts instead of embedding
    # the literal string "None".
    search_text = " ".join(part for part in (v1.title, v1.objective) if part)

    embedding = await get_embedding(search_text)
    if not embedding:
        return None

    # Search Qdrant (cross-regulation, no pattern filter).
    hits = await qdrant_search_cross_regulation(
        embedding, top_k=10,
    )

    own_uuid = str(v1.id)
    matches = []
    for hit in hits:
        if len(matches) >= V1_MAX_MATCHES:
            break

        score = hit.get("score", 0)
        if score < V1_MATCH_THRESHOLD:
            continue

        payload = hit.get("payload") or {}
        matched_uuid = payload.get("control_uuid")
        if not matched_uuid or matched_uuid == own_uuid:
            continue

        # Only regulatory controls (those with a source_citation) qualify.
        matched_row = db.execute(text("""
            SELECT id, control_id, title, source_citation, severity, category
            FROM canonical_controls
            WHERE id = CAST(:uuid AS uuid)
            AND source_citation IS NOT NULL
        """), {"uuid": matched_uuid}).fetchone()
        if not matched_row:
            continue

        rank = len(matches) + 1

        # Extract source info; anything that is not a dict yields no source.
        source_citation = matched_row.source_citation or {}
        if isinstance(source_citation, dict):
            matched_source = source_citation.get("source")
            matched_article = source_citation.get("article")
        else:
            matched_source = None
            matched_article = None

        # Upsert: on re-runs refresh score, rank and the denormalised
        # source/article so they cannot go stale.
        db.execute(text("""
            INSERT INTO v1_control_matches
            (v1_control_uuid, matched_control_uuid, similarity_score,
            match_rank, matched_source, matched_article, match_method)
            VALUES
            (CAST(:v1_uuid AS uuid), CAST(:matched_uuid AS uuid), :score,
            :rank, :source, :article, 'embedding')
            ON CONFLICT (v1_control_uuid, matched_control_uuid) DO UPDATE
            SET similarity_score = EXCLUDED.similarity_score,
            match_rank = EXCLUDED.match_rank,
            matched_source = EXCLUDED.matched_source,
            matched_article = EXCLUDED.matched_article
        """), {
            "v1_uuid": own_uuid,
            "matched_uuid": str(matched_row.id),
            "score": round(score, 3),
            "rank": rank,
            "source": matched_source,
            "article": matched_article,
        })

        matches.append({
            "v1_control_id": v1.control_id,
            "v1_title": v1.title,
            "matched_control_id": matched_row.control_id,
            "matched_title": matched_row.title,
            "matched_source": matched_source,
            "matched_article": matched_article,
            "similarity_score": round(score, 3),
            "match_rank": rank,
        })

    return matches


async def enrich_v1_matches(
    dry_run: bool = True,
    batch_size: int = 100,
    offset: int = 0,
) -> dict:
    """Find regulatory matches for v1 Eigenentwicklung controls.

    Args:
        dry_run: If True, only count and list sample controls; write nothing.
        batch_size: Number of v1 controls to process per call.
        offset: Pagination offset (v1 control index, ordered by control_id).

    Returns:
        Stats dict with counts, sample matches, and pagination info.
        ``errors`` holds at most 10 entries; ``error_count`` is the total.
    """
    with SessionLocal() as db:
        # 1. Load v1 controls (paginated).
        v1_controls = db.execute(text(f"""
            SELECT id, control_id, title, objective, category
            FROM canonical_controls
            WHERE {_is_eigenentwicklung_query()}
            ORDER BY control_id
            LIMIT :limit OFFSET :offset
        """), {"limit": batch_size, "offset": offset}).fetchall()

        # Count total for pagination.
        total_row = db.execute(text(f"""
            SELECT COUNT(*) AS cnt
            FROM canonical_controls
            WHERE {_is_eigenentwicklung_query()}
        """)).fetchone()
        total_v1 = total_row.cnt if total_row else 0

        if not v1_controls:
            return {
                "dry_run": dry_run,
                "processed": 0,
                "total_v1": total_v1,
                "message": "Kein weiterer Batch — alle v1 Controls verarbeitet.",
            }

        if dry_run:
            return {
                "dry_run": True,
                "total_v1": total_v1,
                "offset": offset,
                "batch_size": batch_size,
                "sample_controls": [
                    {
                        "control_id": r.control_id,
                        "title": r.title,
                        "category": r.category,
                    }
                    for r in v1_controls[:20]
                ],
            }

        # 2. Process each v1 control.
        processed = 0
        matches_inserted = 0
        errors = []
        sample_matches = []

        for v1 in v1_controls:
            try:
                # One SAVEPOINT per control: a failing statement rolls back
                # only this control's writes instead of aborting the whole
                # batch transaction.
                with db.begin_nested():
                    matches = await _match_v1_control(db, v1)
            except Exception as e:
                # Best-effort batch: record the failure and carry on.
                logger.warning("V1 enrichment error for %s: %s", v1.control_id, e)
                errors.append({
                    "control_id": v1.control_id,
                    "error": str(e),
                })
                continue

            if matches is None:
                errors.append({
                    "control_id": v1.control_id,
                    "error": "Embedding fehlgeschlagen",
                })
                continue

            # Counters only move once the control's writes are known good.
            processed += 1
            matches_inserted += len(matches)
            free_slots = 20 - len(sample_matches)
            if free_slots > 0:
                sample_matches.extend(matches[:free_slots])

        db.commit()

        # A full batch means there may be more controls to process.
        next_offset = offset + batch_size if len(v1_controls) == batch_size else None

        return {
            "dry_run": False,
            "offset": offset,
            "batch_size": batch_size,
            "next_offset": next_offset,
            "total_v1": total_v1,
            "processed": processed,
            "matches_inserted": matches_inserted,
            "error_count": len(errors),
            "errors": errors[:10],
            "sample_matches": sample_matches,
        }
async def get_v1_matches(control_uuid: str) -> list[dict]:
    """Fetch the stored regulatory matches of one v1 control.

    Args:
        control_uuid: UUID of the v1 control whose matches are wanted.

    Returns:
        Match dicts ordered by ``match_rank``, each carrying the matched
        control's details plus similarity score, rank and match method.
    """
    matches_sql = text("""
        SELECT
        m.similarity_score,
        m.match_rank,
        m.matched_source,
        m.matched_article,
        m.match_method,
        c.control_id AS matched_control_id,
        c.title AS matched_title,
        c.objective AS matched_objective,
        c.severity AS matched_severity,
        c.category AS matched_category,
        c.source_citation AS matched_source_citation
        FROM v1_control_matches m
        JOIN canonical_controls c ON c.id = m.matched_control_uuid
        WHERE m.v1_control_uuid = CAST(:uuid AS uuid)
        ORDER BY m.match_rank
    """)

    # Columns copied into the response unchanged, in output order.
    passthrough_fields = (
        "matched_control_id",
        "matched_title",
        "matched_objective",
        "matched_severity",
        "matched_category",
        "matched_source",
        "matched_article",
        "matched_source_citation",
    )

    with SessionLocal() as db:
        match_rows = db.execute(matches_sql, {"uuid": control_uuid}).fetchall()

    matches = []
    for match_row in match_rows:
        entry = {field: getattr(match_row, field) for field in passthrough_fields}
        # The score is converted to a plain float for the response.
        entry["similarity_score"] = float(match_row.similarity_score)
        entry["match_rank"] = match_row.match_rank
        entry["match_method"] = match_row.match_method
        matches.append(entry)
    return matches
async def get_v1_enrichment_stats() -> dict:
    """Return overview statistics for the v1 enrichment.

    Returns:
        Dict with the number of v1 Eigenentwicklung controls, how many of
        them have / lack at least one stored match, the total number of
        stored matches, and the average similarity score (``None`` when no
        matches exist).
    """
    v1_predicate = _is_eigenentwicklung_query()

    with SessionLocal() as db:
        total_row = db.execute(text(f"""
            SELECT COUNT(*) AS cnt FROM canonical_controls
            WHERE {v1_predicate}
        """)).fetchone()

        # The predicate uses unqualified column names, so it is applied
        # inside a subquery on canonical_controls rather than string-patched
        # with a table alias.
        matched_row = db.execute(text(f"""
            SELECT COUNT(DISTINCT m.v1_control_uuid) AS cnt
            FROM v1_control_matches m
            WHERE m.v1_control_uuid IN (
                SELECT c.id FROM canonical_controls c
                WHERE {v1_predicate}
            )
        """)).fetchone()

        # Both table-wide aggregates come from one scan.
        matches_row = db.execute(text("""
            SELECT COUNT(*) AS cnt, AVG(similarity_score) AS avg_score
            FROM v1_control_matches
        """)).fetchone()

    total_v1 = total_row.cnt if total_row else 0
    v1_with_matches = matched_row.cnt if matched_row else 0
    total_matches = matches_row.cnt if matches_row else 0
    # AVG yields NULL on an empty table; test for None explicitly so a
    # legitimate 0 average is not reported as missing.
    avg_score = matches_row.avg_score if matches_row else None

    return {
        "total_v1_controls": total_v1,
        "v1_with_matches": v1_with_matches,
        "v1_without_matches": total_v1 - v1_with_matches,
        "total_matches": total_matches,
        "avg_similarity_score": round(float(avg_score), 3) if avg_score is not None else None,
    }

View File

@@ -0,0 +1,18 @@
-- V1 Control Enrichment: cross-reference table for matching
-- Eigenentwicklung (v1, ungrouped, no source) → regulatory controls.
-- One row per (v1 control, matched regulatory control) pair.
CREATE TABLE IF NOT EXISTS v1_control_matches (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- The v1 control being enriched; its match rows are deleted with it.
    v1_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
    -- The matched control; match rows are deleted with it as well.
    matched_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
    -- Similarity score with three decimal places.
    similarity_score NUMERIC(4,3) NOT NULL,
    -- Position of this match among the v1 control's matches (1 = first).
    match_rank SMALLINT NOT NULL DEFAULT 1,
    matched_source TEXT, -- e.g. "DSGVO (EU) 2016/679"
    matched_article TEXT, -- e.g. "Art. 32"
    -- How the match was found; defaults to 'embedding'.
    match_method VARCHAR(30) NOT NULL DEFAULT 'embedding',
    created_at TIMESTAMPTZ DEFAULT NOW(),
    -- A given pair may be stored only once.
    CONSTRAINT uq_v1_match UNIQUE (v1_control_uuid, matched_control_uuid)
);
-- Lookup indexes for each side of the cross-reference.
CREATE INDEX IF NOT EXISTS idx_v1m_v1 ON v1_control_matches(v1_control_uuid);
CREATE INDEX IF NOT EXISTS idx_v1m_matched ON v1_control_matches(matched_control_uuid);

View File

@@ -0,0 +1,220 @@
"""Tests for V1 Control Enrichment (Eigenentwicklung matching)."""
import sys
sys.path.insert(0, ".")
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from compliance.services.v1_enrichment import (
enrich_v1_matches,
get_v1_matches,
count_v1_controls,
)
class TestV1EnrichmentDryRun:
    """Dry-run mode reports totals and sample controls without writing matches."""

    @pytest.mark.asyncio
    async def test_dry_run_returns_stats(self):
        access_control = MagicMock(
            id="uuid-v1-1",
            control_id="ACC-013",
            title="Zugriffskontrolle",
            objective="Zugriff einschraenken",
            category="access",
        )
        encryption_control = MagicMock(
            id="uuid-v1-2",
            control_id="SEC-005",
            title="Verschluesselung",
            objective="Daten verschluesseln",
            category="encryption",
        )
        total_row = MagicMock(cnt=863)

        with patch("compliance.services.v1_enrichment.SessionLocal") as session_factory:
            db = MagicMock()
            session_factory.return_value.__enter__.return_value = db

            # Every execute() shares one result object: fetchall() serves the
            # control list, fetchone() serves the total count.
            query_result = db.execute.return_value
            query_result.fetchall.return_value = [access_control, encryption_control]
            query_result.fetchone.return_value = total_row

            result = await enrich_v1_matches(dry_run=True, batch_size=100, offset=0)

        assert result["dry_run"] is True
        assert result["total_v1"] == 863
        samples = result["sample_controls"]
        assert len(samples) == 2
        assert samples[0]["control_id"] == "ACC-013"
class TestV1EnrichmentExecution:
    """Execution mode should find matches and insert them."""

    @pytest.mark.asyncio
    async def test_processes_and_inserts_matches(self):
        mock_v1 = [
            MagicMock(
                id="uuid-v1-1",
                control_id="ACC-013",
                title="Zugriffskontrolle",
                objective="Zugriff auf Systeme einschraenken",
                category="access",
            ),
        ]
        mock_count = MagicMock(cnt=1)
        mock_matched_row = MagicMock(
            id="uuid-reg-1",
            control_id="SEC-042",
            title="Verschluesselung personenbezogener Daten",
            source_citation={"source": "DSGVO (EU) 2016/679", "article": "Art. 32"},
            severity="high",
            category="encryption",
        )
        mock_qdrant_results = [
            {
                "score": 0.89,
                "payload": {
                    "control_uuid": "uuid-reg-1",
                    "control_id": "SEC-042",
                    "title": "Verschluesselung",
                },
            },
            {
                "score": 0.65,  # Below threshold, must not be inserted
                "payload": {
                    "control_uuid": "uuid-reg-2",
                    "control_id": "SEC-100",
                },
            },
        ]

        with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session:
            db = MagicMock()
            mock_session.return_value.__enter__ = MagicMock(return_value=db)
            mock_session.return_value.__exit__ = MagicMock(return_value=False)

            # Record every statement so the test can check what was written.
            executed_sql = []

            def side_effect_execute(query, params=None):
                sql = str(query)
                executed_sql.append(sql)
                result = MagicMock()
                # fetchall serves the v1 controls list
                result.fetchall.return_value = mock_v1
                # fetchone serves the count and the matched-row lookup
                if "COUNT" in sql:
                    result.fetchone.return_value = mock_count
                elif "source_citation IS NOT NULL" in sql:
                    result.fetchone.return_value = mock_matched_row
                else:
                    result.fetchone.return_value = mock_count
                return result

            db.execute.side_effect = side_effect_execute

            # Both helpers are awaited by the service, so pin AsyncMock
            # explicitly instead of relying on patch() autodetection.
            with patch(
                "compliance.services.v1_enrichment.get_embedding",
                new_callable=AsyncMock,
            ) as mock_embed, patch(
                "compliance.services.v1_enrichment.qdrant_search_cross_regulation",
                new_callable=AsyncMock,
            ) as mock_qdrant:
                mock_embed.return_value = [0.1] * 1024
                mock_qdrant.return_value = mock_qdrant_results

                result = await enrich_v1_matches(dry_run=False, batch_size=100, offset=0)

        assert result["dry_run"] is False
        assert result["processed"] == 1
        assert result["matches_inserted"] == 1
        assert len(result["sample_matches"]) == 1
        assert result["sample_matches"][0]["matched_control_id"] == "SEC-042"
        assert result["sample_matches"][0]["similarity_score"] == 0.89

        # Only the hit above the threshold is written, and the batch is committed.
        inserts = [sql for sql in executed_sql if "INSERT INTO v1_control_matches" in sql]
        assert len(inserts) == 1
        db.commit.assert_called_once()

    @pytest.mark.asyncio
    async def test_empty_batch_returns_done(self):
        mock_count = MagicMock(cnt=863)

        with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session:
            db = MagicMock()
            mock_session.return_value.__enter__ = MagicMock(return_value=db)
            mock_session.return_value.__exit__ = MagicMock(return_value=False)
            db.execute.return_value.fetchall.return_value = []
            db.execute.return_value.fetchone.return_value = mock_count

            result = await enrich_v1_matches(dry_run=False, batch_size=100, offset=9999)

        assert result["processed"] == 0
        assert "alle v1 Controls verarbeitet" in result["message"]
class TestV1MatchesEndpoint:
    """Retrieval of stored matches through ``get_v1_matches``."""

    @pytest.mark.asyncio
    async def test_returns_matches(self):
        stored_match = MagicMock(
            matched_control_id="SEC-042",
            matched_title="Verschluesselung",
            matched_objective="Daten verschluesseln",
            matched_severity="high",
            matched_category="encryption",
            matched_source="DSGVO (EU) 2016/679",
            matched_article="Art. 32",
            matched_source_citation={"source": "DSGVO (EU) 2016/679"},
            similarity_score=0.89,
            match_rank=1,
            match_method="embedding",
        )

        with patch("compliance.services.v1_enrichment.SessionLocal") as session_factory:
            db = MagicMock()
            session_factory.return_value.__enter__.return_value = db
            db.execute.return_value.fetchall.return_value = [stored_match]

            result = await get_v1_matches("uuid-v1-1")

        assert len(result) == 1
        first = result[0]
        assert first["matched_control_id"] == "SEC-042"
        assert first["similarity_score"] == 0.89
        assert first["matched_source"] == "DSGVO (EU) 2016/679"

    @pytest.mark.asyncio
    async def test_empty_matches(self):
        with patch("compliance.services.v1_enrichment.SessionLocal") as session_factory:
            db = MagicMock()
            session_factory.return_value.__enter__.return_value = db
            db.execute.return_value.fetchall.return_value = []

            result = await get_v1_matches("uuid-nonexistent")

        assert result == []
class TestEigenentwicklungDetection:
    """Checks the SQL used to detect Eigenentwicklung controls."""

    @pytest.mark.asyncio
    async def test_count_v1_controls(self):
        count_row = MagicMock(cnt=863)

        with patch("compliance.services.v1_enrichment.SessionLocal") as session_factory:
            db = MagicMock()
            session_factory.return_value.__enter__.return_value = db
            db.execute.return_value.fetchone.return_value = count_row

            result = await count_v1_controls()

        assert result == 863

        # The statement passed to execute() must carry all detection conditions.
        executed_statement = db.execute.call_args[0][0]
        query_str = str(executed_statement)
        expected_conditions = (
            "generation_strategy = 'ungrouped'",
            "source_citation IS NULL",
            "parent_control_uuid IS NULL",
        )
        for condition in expected_conditions:
            assert condition in query_str