feat: V1 Control Enrichment — Eigenentwicklung-Label, regulatorisches Matching & Vergleichsansicht

863 v1-Controls (manuell geschrieben, ohne Rechtsgrundlage) werden als "Eigenentwicklung" gekennzeichnet und automatisch mit regulatorischen Controls (DSGVO, NIS2, OWASP etc.) per Embedding-Similarity abgeglichen.

Backend:
- Migration 080: v1_control_matches Tabelle (Cross-Reference)
- v1_enrichment.py: Batch-Matching via BGE-M3 + Qdrant (Threshold 0.75)
- 3 neue API-Endpoints: enrich-v1-matches, v1-matches, v1-enrichment-stats
- 6 Tests (dry-run, execution, matches, pagination, detection)

Frontend:
- Orange "Eigenentwicklung"-Badge statt grauem "v1" (wenn kein Source)
- "Regulatorische Abdeckung"-Sektion im ControlDetail mit Match-Karten
- Side-by-Side V1CompareView (Eigenentwicklung vs. regulatorisch gedeckt)
- Prev/Next Navigation durch alle Matches

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 10:32:08 +01:00
parent cb034b8009
commit db7c207464
11 changed files with 939 additions and 6 deletions
@@ -547,6 +547,15 @@ async def atomic_stats():
    }


+@router.get("/controls/v1-enrichment-stats")
+async def v1_enrichment_stats_endpoint():
+    """Report how many v1 controls currently have regulatory coverage.
+
+    Delegates to ``get_v1_enrichment_stats`` from the v1 enrichment service
+    and returns its result unchanged.
+    """
+    # Function-scope import, like the other v1 enrichment endpoints here.
+    from compliance.services.v1_enrichment import get_v1_enrichment_stats
+
+    stats = await get_v1_enrichment_stats()
+    return stats
+
+
@router.get("/controls/{control_id}")
 async def get_control(control_id: str):
    """Get a single canonical control by its control_id (e.g. AUTH-001)."""
@@ -1567,6 +1576,57 @@ async def list_licenses():
        return get_license_matrix(db)


+# =============================================================================
+# V1 ENRICHMENT (Eigenentwicklung → Regulatorische Abdeckung)
+# =============================================================================
+
+@router.post("/controls/enrich-v1-matches")
+async def enrich_v1_matches_endpoint(
+    dry_run: bool = Query(True, description="Nur zaehlen, nicht schreiben"),
+    batch_size: int = Query(100, ge=1, description="Controls pro Durchlauf"),
+    offset: int = Query(0, ge=0, description="Offset fuer Paginierung"),
+):
+    """Find regulatory coverage for v1 "Eigenentwicklung" controls.
+
+    A control counts as Eigenentwicklung when generation_strategy='ungrouped',
+    pipeline_version is '1' or NULL, source_citation IS NULL,
+    parent_control_uuid IS NULL and release_state is not one of
+    rejected / merged / deprecated.
+
+    Workflow:
+      1. dry_run=true -> show statistics and a sample, nothing is written
+      2. dry_run=false&batch_size=100&offset=0 -> process the first 100
+      3. Repeat with the returned next_offset until it is null
+
+    Args:
+        dry_run: When true, only report; do not write matches.
+        batch_size: Number of controls per call. Must be >= 1, because an
+            empty page would be indistinguishable from "all processed".
+        offset: Pagination offset into the ordered control list. Must be
+            >= 0, because it is passed to SQL OFFSET.
+
+    Returns:
+        The stats dict produced by ``enrich_v1_matches``.
+    """
+    from compliance.services.v1_enrichment import enrich_v1_matches
+    return await enrich_v1_matches(
+        dry_run=dry_run,
+        batch_size=batch_size,
+        offset=offset,
+    )
+
+
+@router.get("/controls/{control_id}/v1-matches")
+async def get_v1_matches_endpoint(control_id: str):
+    """Return the stored regulatory matches for one v1 control.
+
+    The control is looked up by its ``control_id``. If no such control
+    exists the request fails with HTTP 404; otherwise the result of
+    ``get_v1_matches`` for the control's UUID is returned.
+    """
+    from compliance.services.v1_enrichment import get_v1_matches
+
+    # Translate the public control_id into the internal UUID.
+    with SessionLocal() as session:
+        lookup = session.execute(text("""
+            SELECT id FROM canonical_controls WHERE control_id = :cid
+        """), {"cid": control_id})
+        control_row = lookup.fetchone()
+
+    if control_row:
+        control_uuid = str(control_row.id)
+        return await get_v1_matches(control_uuid)
+
+    raise HTTPException(status_code=404, detail=f"Control {control_id} not found")
+
+
 # =============================================================================
 # INTERNAL HELPERS
 # =============================================================================
@@ -0,0 +1,301 @@
+"""V1 Control Enrichment Service — Match Eigenentwicklung controls to regulations.
+
+Finds regulatory coverage for v1 controls (generation_strategy='ungrouped',
+pipeline_version=1, no source_citation) by embedding similarity search.
+
+Reuses embedding + Qdrant helpers from control_dedup.py.
+"""
+
+import logging
+from typing import Optional
+
+from sqlalchemy import text
+
+from database import SessionLocal
+from compliance.services.control_dedup import (
+    get_embedding,
+    qdrant_search_cross_regulation,
+)
+
+logger = logging.getLogger(__name__)
+
+# Similarity threshold — lower than dedup (0.85) since we want informational matches.
+# Search hits scoring below this value are skipped during enrichment.
+V1_MATCH_THRESHOLD = 0.75
+# Upper bound on the number of matches stored per v1 control in one enrichment run.
+V1_MAX_MATCHES = 5
+
+
+def _is_eigenentwicklung_query() -> str:
+    """Return the SQL predicate that selects v1 "Eigenentwicklung" controls.
+
+    The fragment is meant to follow ``WHERE`` in a query against
+    ``canonical_controls``. It uses unqualified column names and contains
+    no bind parameters.
+    """
+    where_clause = """
+        generation_strategy = 'ungrouped'
+        AND (pipeline_version = '1' OR pipeline_version IS NULL)
+        AND source_citation IS NULL
+        AND parent_control_uuid IS NULL
+        AND release_state NOT IN ('rejected', 'merged', 'deprecated')
+    """
+    return where_clause
+
+
+async def count_v1_controls() -> int:
+    """Return the number of v1 "Eigenentwicklung" controls.
+
+    Counts the rows of ``canonical_controls`` that satisfy the predicate
+    from ``_is_eigenentwicklung_query``; yields 0 when no row comes back.
+    """
+    with SessionLocal() as session:
+        count_sql = text(f"""
+            SELECT COUNT(*) AS cnt
+            FROM canonical_controls
+            WHERE {_is_eigenentwicklung_query()}
+        """)
+        result_row = session.execute(count_sql).fetchone()
+        if not result_row:
+            return 0
+        return result_row.cnt
+
+
+def _store_matches_for_control(db, v1, hits) -> list[dict]:
+    """Upsert the regulatory matches found among *hits* for one v1 control.
+
+    Skipped are: hits scoring below ``V1_MATCH_THRESHOLD``, hits without a
+    ``control_uuid`` in their payload, self-matches, and hits whose control
+    has no ``source_citation``. At most ``V1_MAX_MATCHES`` rows are written;
+    ranks start at 1 and follow the order in which the hits arrive.
+
+    Args:
+        db: Open database session; the caller owns commit / rollback.
+        v1: Row of the v1 control (``id``, ``control_id``, ``title`` are read).
+        hits: Search results, each a dict with ``score`` and ``payload``.
+
+    Returns:
+        One summary dict per upserted match, in rank order.
+    """
+    stored: list[dict] = []
+    v1_uuid = str(v1.id)
+
+    for hit in hits:
+        # Stop before doing any further lookups once the cap is reached.
+        if len(stored) >= V1_MAX_MATCHES:
+            break
+
+        score = hit.get("score", 0)
+        if score < V1_MATCH_THRESHOLD:
+            continue
+
+        # "or {}" also covers a payload key that is present but None.
+        payload = hit.get("payload") or {}
+        matched_uuid = payload.get("control_uuid")
+        if not matched_uuid or matched_uuid == v1_uuid:
+            continue
+
+        # Only controls that carry a source citation count as regulatory.
+        matched_row = db.execute(text("""
+            SELECT id, control_id, title, source_citation, severity, category
+            FROM canonical_controls
+            WHERE id = CAST(:uuid AS uuid)
+              AND source_citation IS NOT NULL
+        """), {"uuid": matched_uuid}).fetchone()
+        if not matched_row:
+            continue
+
+        rank = len(stored) + 1
+        rounded_score = round(score, 3)
+
+        citation = matched_row.source_citation
+        if isinstance(citation, dict):
+            matched_source = citation.get("source")
+            matched_article = citation.get("article")
+        else:
+            matched_source = matched_article = None
+
+        # Upsert: a re-run refreshes score, rank and the denormalised
+        # source/article columns of an already known pair.
+        db.execute(text("""
+            INSERT INTO v1_control_matches
+                (v1_control_uuid, matched_control_uuid, similarity_score,
+                 match_rank, matched_source, matched_article, match_method)
+            VALUES
+                (CAST(:v1_uuid AS uuid), CAST(:matched_uuid AS uuid), :score,
+                 :rank, :source, :article, 'embedding')
+            ON CONFLICT (v1_control_uuid, matched_control_uuid) DO UPDATE
+            SET similarity_score = EXCLUDED.similarity_score,
+                match_rank = EXCLUDED.match_rank,
+                matched_source = EXCLUDED.matched_source,
+                matched_article = EXCLUDED.matched_article
+        """), {
+            "v1_uuid": v1_uuid,
+            "matched_uuid": str(matched_row.id),
+            "score": rounded_score,
+            "rank": rank,
+            "source": matched_source,
+            "article": matched_article,
+        })
+
+        stored.append({
+            "v1_control_id": v1.control_id,
+            "v1_title": v1.title,
+            "matched_control_id": matched_row.control_id,
+            "matched_title": matched_row.title,
+            "matched_source": matched_source,
+            "matched_article": matched_article,
+            "similarity_score": rounded_score,
+            "match_rank": rank,
+        })
+
+    return stored
+
+
+async def enrich_v1_matches(
+    dry_run: bool = True,
+    batch_size: int = 100,
+    offset: int = 0,
+) -> dict:
+    """Find regulatory matches for v1 Eigenentwicklung controls.
+
+    Loads one page of v1 controls, embeds "title — objective" for each,
+    searches Qdrant across regulations and stores the qualifying hits in
+    ``v1_control_matches``.
+
+    Each control is written inside its own SAVEPOINT. A database error for
+    one control therefore rolls back only that control's rows and is
+    reported in ``errors``; the rest of the batch is still committed.
+
+    Args:
+        dry_run: If True, only count — don't write matches.
+        batch_size: Number of v1 controls to process per call.
+        offset: Pagination offset (v1 control index).
+
+    Returns:
+        Stats dict with counts, sample matches, and pagination info.
+        ``next_offset`` is None when the returned page was not full.
+    """
+    with SessionLocal() as db:
+        # 1. Load v1 controls (paginated)
+        v1_controls = db.execute(text(f"""
+            SELECT id, control_id, title, objective, category
+            FROM canonical_controls
+            WHERE {_is_eigenentwicklung_query()}
+            ORDER BY control_id
+            LIMIT :limit OFFSET :offset
+        """), {"limit": batch_size, "offset": offset}).fetchall()
+
+        # Count total for pagination
+        total_row = db.execute(text(f"""
+            SELECT COUNT(*) AS cnt
+            FROM canonical_controls
+            WHERE {_is_eigenentwicklung_query()}
+        """)).fetchone()
+        total_v1 = total_row.cnt if total_row else 0
+
+        if not v1_controls:
+            return {
+                "dry_run": dry_run,
+                "processed": 0,
+                "total_v1": total_v1,
+                "message": "Kein weiterer Batch — alle v1 Controls verarbeitet.",
+            }
+
+        if dry_run:
+            return {
+                "dry_run": True,
+                "total_v1": total_v1,
+                "offset": offset,
+                "batch_size": batch_size,
+                "sample_controls": [
+                    {
+                        "control_id": r.control_id,
+                        "title": r.title,
+                        "category": r.category,
+                    }
+                    for r in v1_controls[:20]
+                ],
+            }
+
+        # 2. Process each v1 control
+        processed = 0
+        matches_inserted = 0
+        errors = []
+        sample_matches = []
+
+        for v1 in v1_controls:
+            try:
+                search_text = f"{v1.title} — {v1.objective}"
+
+                embedding = await get_embedding(search_text)
+                if not embedding:
+                    errors.append({
+                        "control_id": v1.control_id,
+                        "error": "Embedding fehlgeschlagen",
+                    })
+                    continue
+
+                # Search Qdrant (cross-regulation, no pattern filter)
+                results = await qdrant_search_cross_regulation(
+                    embedding, top_k=10,
+                )
+
+                # SAVEPOINT: without it, one failed statement would leave the
+                # whole transaction aborted and break every later control.
+                with db.begin_nested():
+                    stored = _store_matches_for_control(db, v1, results)
+
+            except Exception as e:
+                logger.warning("V1 enrichment error for %s: %s", v1.control_id, e)
+                errors.append({
+                    "control_id": v1.control_id,
+                    "error": str(e),
+                })
+                continue
+
+            # Counters are updated only after the control's rows survived,
+            # so rolled-back matches are never reported.
+            processed += 1
+            matches_inserted += len(stored)
+            remaining_samples = 20 - len(sample_matches)
+            if remaining_samples > 0:
+                sample_matches.extend(stored[:remaining_samples])
+
+        db.commit()
+
+    # Pagination: a short page means there is nothing left to fetch.
+    next_offset = offset + batch_size if len(v1_controls) == batch_size else None
+
+    return {
+        "dry_run": False,
+        "offset": offset,
+        "batch_size": batch_size,
+        "next_offset": next_offset,
+        "total_v1": total_v1,
+        "processed": processed,
+        "matches_inserted": matches_inserted,
+        "errors": errors[:10],
+        "sample_matches": sample_matches,
+    }
+
+
+async def get_v1_matches(control_uuid: str) -> list[dict]:
+    """Fetch the stored regulatory matches of one v1 control.
+
+    Args:
+        control_uuid: UUID (as text) of the v1 control.
+
+    Returns:
+        One dict per match row, ordered by ``match_rank``; an empty list
+        when the control has no matches.
+    """
+
+    def _as_match_dict(row) -> dict:
+        # similarity_score is converted to a plain float for the response.
+        return {
+            "matched_control_id": row.matched_control_id,
+            "matched_title": row.matched_title,
+            "matched_objective": row.matched_objective,
+            "matched_severity": row.matched_severity,
+            "matched_category": row.matched_category,
+            "matched_source": row.matched_source,
+            "matched_article": row.matched_article,
+            "matched_source_citation": row.matched_source_citation,
+            "similarity_score": float(row.similarity_score),
+            "match_rank": row.match_rank,
+            "match_method": row.match_method,
+        }
+
+    with SessionLocal() as session:
+        match_query = text("""
+            SELECT
+                m.similarity_score,
+                m.match_rank,
+                m.matched_source,
+                m.matched_article,
+                m.match_method,
+                c.control_id AS matched_control_id,
+                c.title AS matched_title,
+                c.objective AS matched_objective,
+                c.severity AS matched_severity,
+                c.category AS matched_category,
+                c.source_citation AS matched_source_citation
+            FROM v1_control_matches m
+            JOIN canonical_controls c ON c.id = m.matched_control_uuid
+            WHERE m.v1_control_uuid = CAST(:uuid AS uuid)
+            ORDER BY m.match_rank
+        """)
+        match_rows = session.execute(match_query, {"uuid": control_uuid}).fetchall()
+
+        return list(map(_as_match_dict, match_rows))
+
+
+async def get_v1_enrichment_stats() -> dict:
+    """Get overview stats for v1 enrichment.
+
+    Returns:
+        Dict with ``total_v1_controls``, ``v1_with_matches``,
+        ``v1_without_matches``, ``total_matches`` and
+        ``avg_similarity_score`` (None when no match rows exist).
+    """
+    v1_filter = _is_eigenentwicklung_query()
+
+    with SessionLocal() as db:
+        total_row = db.execute(text(f"""
+            SELECT COUNT(*) AS cnt FROM canonical_controls
+            WHERE {v1_filter}
+        """)).fetchone()
+
+        # The predicate uses unqualified column names, so it is applied in a
+        # subquery on canonical_controls instead of being string-rewritten
+        # with a table alias.
+        matched_row = db.execute(text(f"""
+            SELECT COUNT(DISTINCT m.v1_control_uuid) AS cnt
+            FROM v1_control_matches m
+            WHERE m.v1_control_uuid IN (
+                SELECT id FROM canonical_controls
+                WHERE {v1_filter}
+            )
+        """)).fetchone()
+
+        # Both aggregates over the match table in a single round trip.
+        match_row = db.execute(text("""
+            SELECT COUNT(*) AS cnt, AVG(similarity_score) AS avg_score
+            FROM v1_control_matches
+        """)).fetchone()
+
+    total_v1 = total_row.cnt if total_row else 0
+    v1_with_matches = matched_row.cnt if matched_row else 0
+    total_matches = match_row.cnt if match_row else 0
+    avg_score = match_row.avg_score if match_row else None
+
+    return {
+        "total_v1_controls": total_v1,
+        "v1_with_matches": v1_with_matches,
+        "v1_without_matches": total_v1 - v1_with_matches,
+        "total_matches": total_matches,
+        # AVG() is NULL on an empty table; compare against None so that a
+        # legitimate 0 average is not reported as missing.
+        "avg_similarity_score": (
+            round(float(avg_score), 3) if avg_score is not None else None
+        ),
+    }
@@ -0,0 +1,18 @@
+-- V1 Control Enrichment: Cross-reference table for matching
+-- Eigenentwicklung (v1, ungrouped, no source) → regulatory controls
+
+CREATE TABLE IF NOT EXISTS v1_control_matches (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    -- The v1 control being enriched; its match rows are deleted with it.
+    v1_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
+    -- The matched control; match rows are deleted with it as well.
+    matched_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
+    -- Similarity of the pair, stored with three decimal places.
+    similarity_score NUMERIC(4,3) NOT NULL,
+    -- Position of this match among the v1 control's matches.
+    match_rank SMALLINT NOT NULL DEFAULT 1,
+    matched_source TEXT,           -- e.g. "DSGVO (EU) 2016/679"
+    matched_article TEXT,          -- e.g. "Art. 32"
+    match_method VARCHAR(30) NOT NULL DEFAULT 'embedding',
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    -- At most one row per (v1 control, matched control) pair.
+    CONSTRAINT uq_v1_match UNIQUE (v1_control_uuid, matched_control_uuid)
+);
+
+-- Lookup indexes for both directions of the cross-reference.
+-- NOTE(review): uq_v1_match already leads with v1_control_uuid, so
+-- idx_v1m_v1 may be redundant — confirm before keeping both.
+CREATE INDEX IF NOT EXISTS idx_v1m_v1 ON v1_control_matches(v1_control_uuid);
+CREATE INDEX IF NOT EXISTS idx_v1m_matched ON v1_control_matches(matched_control_uuid);
@@ -0,0 +1,220 @@
+"""Tests for V1 Control Enrichment (Eigenentwicklung matching)."""
+import sys
+sys.path.insert(0, ".")
+
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch
+
+from compliance.services.v1_enrichment import (
+    enrich_v1_matches,
+    get_v1_matches,
+    count_v1_controls,
+)
+
+
+class TestV1EnrichmentDryRun:
+    """Dry-run mode should return statistics without touching DB."""
+
+    @pytest.mark.asyncio
+    async def test_dry_run_returns_stats(self):
+        control_specs = [
+            ("uuid-v1-1", "ACC-013", "Zugriffskontrolle", "Zugriff einschraenken", "access"),
+            ("uuid-v1-2", "SEC-005", "Verschluesselung", "Daten verschluesseln", "encryption"),
+        ]
+        v1_rows = [
+            MagicMock(
+                id=uuid,
+                control_id=control_id,
+                title=title,
+                objective=objective,
+                category=category,
+            )
+            for uuid, control_id, title, objective, category in control_specs
+        ]
+        total_row = MagicMock(cnt=863)
+
+        with patch("compliance.services.v1_enrichment.SessionLocal") as session_factory:
+            session = MagicMock()
+            session_factory.return_value.__enter__ = MagicMock(return_value=session)
+            session_factory.return_value.__exit__ = MagicMock(return_value=False)
+            # Every execute() call shares one result mock: fetchall() serves
+            # the control page, fetchone() serves the total count.
+            shared_result = session.execute.return_value
+            shared_result.fetchall.return_value = v1_rows
+            shared_result.fetchone.return_value = total_row
+
+            outcome = await enrich_v1_matches(dry_run=True, batch_size=100, offset=0)
+
+        assert outcome["dry_run"] is True
+        assert outcome["total_v1"] == 863
+        samples = outcome["sample_controls"]
+        assert len(samples) == 2
+        assert samples[0]["control_id"] == "ACC-013"
+
+
+class TestV1EnrichmentExecution:
+    """Execution mode should find matches and insert them."""
+
+    @pytest.mark.asyncio
+    async def test_processes_and_inserts_matches(self):
+        mock_v1 = [
+            MagicMock(
+                id="uuid-v1-1",
+                control_id="ACC-013",
+                title="Zugriffskontrolle",
+                objective="Zugriff auf Systeme einschraenken",
+                category="access",
+            ),
+        ]
+
+        mock_count = MagicMock(cnt=1)
+        mock_matched_row = MagicMock(
+            id="uuid-reg-1",
+            control_id="SEC-042",
+            title="Verschluesselung personenbezogener Daten",
+            source_citation={"source": "DSGVO (EU) 2016/679", "article": "Art. 32"},
+            severity="high",
+            category="encryption",
+        )
+
+        mock_qdrant_results = [
+            {
+                "score": 0.89,
+                "payload": {
+                    "control_uuid": "uuid-reg-1",
+                    "control_id": "SEC-042",
+                    "title": "Verschluesselung",
+                },
+            },
+            {
+                "score": 0.65,  # Below threshold
+                "payload": {
+                    "control_uuid": "uuid-reg-2",
+                    "control_id": "SEC-100",
+                },
+            },
+        ]
+
+        with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session:
+            db = MagicMock()
+            mock_session.return_value.__enter__ = MagicMock(return_value=db)
+            mock_session.return_value.__exit__ = MagicMock(return_value=False)
+
+            # Multiple execute calls: v1 list, count, matched_row lookup, insert
+            def side_effect_execute(query, params=None):
+                result = MagicMock()
+                # fetchall serves the v1 controls list
+                result.fetchall.return_value = mock_v1
+                # fetchone serves the matched-row lookup, otherwise the count
+                if "source_citation IS NOT NULL" in str(query):
+                    result.fetchone.return_value = mock_matched_row
+                else:
+                    result.fetchone.return_value = mock_count
+                return result
+
+            db.execute.side_effect = side_effect_execute
+
+            # Both helpers are awaited by the service, so patch them with an
+            # explicit AsyncMock instead of relying on auto-detection.
+            with patch(
+                "compliance.services.v1_enrichment.get_embedding",
+                new_callable=AsyncMock,
+            ) as mock_embed, patch(
+                "compliance.services.v1_enrichment.qdrant_search_cross_regulation",
+                new_callable=AsyncMock,
+            ) as mock_qdrant:
+                mock_embed.return_value = [0.1] * 1024
+                mock_qdrant.return_value = mock_qdrant_results
+
+                result = await enrich_v1_matches(dry_run=False, batch_size=100, offset=0)
+
+        assert result["dry_run"] is False
+        assert result["processed"] == 1
+        assert result["matches_inserted"] == 1
+        assert len(result["sample_matches"]) == 1
+        assert result["sample_matches"][0]["matched_control_id"] == "SEC-042"
+        assert result["sample_matches"][0]["similarity_score"] == 0.89
+
+        # Exactly one row was written (the 0.65 hit is below the threshold)
+        # and the batch was committed.
+        insert_calls = [
+            c for c in db.execute.call_args_list
+            if "INSERT INTO v1_control_matches" in str(c[0][0])
+        ]
+        assert len(insert_calls) == 1
+        assert insert_calls[0][0][1]["rank"] == 1
+        db.commit.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_empty_batch_returns_done(self):
+        mock_count = MagicMock(cnt=863)
+
+        with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session:
+            db = MagicMock()
+            mock_session.return_value.__enter__ = MagicMock(return_value=db)
+            mock_session.return_value.__exit__ = MagicMock(return_value=False)
+            db.execute.return_value.fetchall.return_value = []
+            db.execute.return_value.fetchone.return_value = mock_count
+
+            result = await enrich_v1_matches(dry_run=False, batch_size=100, offset=9999)
+
+        assert result["processed"] == 0
+        assert "alle v1 Controls verarbeitet" in result["message"]
+        # Nothing was processed, so nothing may be committed.
+        db.commit.assert_not_called()
+
+
+class TestV1MatchesEndpoint:
+    """Test the matches retrieval."""
+
+    @pytest.mark.asyncio
+    async def test_returns_matches(self):
+        stored_match = MagicMock(
+            matched_control_id="SEC-042",
+            matched_title="Verschluesselung",
+            matched_objective="Daten verschluesseln",
+            matched_severity="high",
+            matched_category="encryption",
+            matched_source="DSGVO (EU) 2016/679",
+            matched_article="Art. 32",
+            matched_source_citation={"source": "DSGVO (EU) 2016/679"},
+            similarity_score=0.89,
+            match_rank=1,
+            match_method="embedding",
+        )
+
+        with patch("compliance.services.v1_enrichment.SessionLocal") as session_factory:
+            session = MagicMock()
+            session_factory.return_value.__enter__ = MagicMock(return_value=session)
+            session_factory.return_value.__exit__ = MagicMock(return_value=False)
+            session.execute.return_value.fetchall.return_value = [stored_match]
+
+            matches = await get_v1_matches("uuid-v1-1")
+
+        assert len(matches) == 1
+        first = matches[0]
+        assert first["matched_control_id"] == "SEC-042"
+        assert first["similarity_score"] == 0.89
+        assert first["matched_source"] == "DSGVO (EU) 2016/679"
+
+    @pytest.mark.asyncio
+    async def test_empty_matches(self):
+        with patch("compliance.services.v1_enrichment.SessionLocal") as session_factory:
+            session = MagicMock()
+            session_factory.return_value.__enter__ = MagicMock(return_value=session)
+            session_factory.return_value.__exit__ = MagicMock(return_value=False)
+            # No rows stored for this control.
+            session.execute.return_value.fetchall.return_value = []
+
+            matches = await get_v1_matches("uuid-nonexistent")
+
+        assert matches == []
+
+
+class TestEigenentwicklungDetection:
+    """Verify the Eigenentwicklung detection query."""
+
+    @pytest.mark.asyncio
+    async def test_count_v1_controls(self):
+        mock_count = MagicMock(cnt=863)
+
+        with patch("compliance.services.v1_enrichment.SessionLocal") as mock_session:
+            db = MagicMock()
+            mock_session.return_value.__enter__ = MagicMock(return_value=db)
+            mock_session.return_value.__exit__ = MagicMock(return_value=False)
+            db.execute.return_value.fetchone.return_value = mock_count
+
+            result = await count_v1_controls()
+
+        assert result == 863
+        # Verify the query includes all five conditions of the predicate
+        call_args = db.execute.call_args[0][0]
+        query_str = str(call_args)
+        assert "generation_strategy = 'ungrouped'" in query_str
+        assert "pipeline_version = '1' OR pipeline_version IS NULL" in query_str
+        assert "source_citation IS NULL" in query_str
+        assert "parent_control_uuid IS NULL" in query_str
+        assert "release_state NOT IN ('rejected', 'merged', 'deprecated')" in query_str