Files
breakpilot-core/control-pipeline/services/v1_enrichment.py
Benjamin Admin e3ab428b91 feat: control-pipeline Service aus Compliance-Repo migriert
Control-Pipeline (Pass 0a/0b, BatchDedup, Generator) als eigenstaendiger
Service in Core, damit Compliance-Repo unabhaengig refakturiert werden kann.
Schreibt weiterhin ins compliance-Schema der shared PostgreSQL.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 14:40:47 +02:00

332 lines
13 KiB
Python

"""V1 Control Enrichment Service — Match Eigenentwicklung controls to regulations.
Finds regulatory coverage for v1 controls (generation_strategy='ungrouped',
pipeline_version=1, no source_citation) by embedding similarity search.
Reuses embedding + Qdrant helpers from control_dedup.py.
"""
import logging
from typing import Optional
from sqlalchemy import text
from db.session import SessionLocal
from services.control_dedup import (
get_embedding,
qdrant_search_cross_regulation,
)
logger = logging.getLogger(__name__)
# Similarity threshold — lower than dedup (0.85) since we want informational matches
# Typical top scores for v1 controls are 0.70-0.77
V1_MATCH_THRESHOLD = 0.70
V1_MAX_MATCHES = 5
def _is_eigenentwicklung_query() -> str:
"""SQL WHERE clause identifying v1 Eigenentwicklung controls."""
return """
generation_strategy = 'ungrouped'
AND (pipeline_version = '1' OR pipeline_version IS NULL)
AND source_citation IS NULL
AND parent_control_uuid IS NULL
AND release_state NOT IN ('rejected', 'merged', 'deprecated')
"""
async def count_v1_controls() -> int:
"""Count how many v1 Eigenentwicklung controls exist."""
with SessionLocal() as db:
row = db.execute(text(f"""
SELECT COUNT(*) AS cnt
FROM canonical_controls
WHERE {_is_eigenentwicklung_query()}
""")).fetchone()
return row.cnt if row else 0
async def enrich_v1_matches(
dry_run: bool = True,
batch_size: int = 100,
offset: int = 0,
) -> dict:
"""Find regulatory matches for v1 Eigenentwicklung controls.
Args:
dry_run: If True, only count — don't write matches.
batch_size: Number of v1 controls to process per call.
offset: Pagination offset (v1 control index).
Returns:
Stats dict with counts, sample matches, and pagination info.
"""
with SessionLocal() as db:
# 1. Load v1 controls (paginated)
v1_controls = db.execute(text(f"""
SELECT id, control_id, title, objective, category
FROM canonical_controls
WHERE {_is_eigenentwicklung_query()}
ORDER BY control_id
LIMIT :limit OFFSET :offset
"""), {"limit": batch_size, "offset": offset}).fetchall()
# Count total for pagination
total_row = db.execute(text(f"""
SELECT COUNT(*) AS cnt
FROM canonical_controls
WHERE {_is_eigenentwicklung_query()}
""")).fetchone()
total_v1 = total_row.cnt if total_row else 0
if not v1_controls:
return {
"dry_run": dry_run,
"processed": 0,
"total_v1": total_v1,
"message": "Kein weiterer Batch — alle v1 Controls verarbeitet.",
}
if dry_run:
return {
"dry_run": True,
"total_v1": total_v1,
"offset": offset,
"batch_size": batch_size,
"sample_controls": [
{
"control_id": r.control_id,
"title": r.title,
"category": r.category,
}
for r in v1_controls[:20]
],
}
# 2. Process each v1 control
processed = 0
matches_inserted = 0
errors = []
sample_matches = []
for v1 in v1_controls:
try:
# Build search text
search_text = f"{v1.title}{v1.objective}"
# Get embedding
embedding = await get_embedding(search_text)
if not embedding:
errors.append({
"control_id": v1.control_id,
"error": "Embedding fehlgeschlagen",
})
continue
# Search Qdrant (cross-regulation, no pattern filter)
# Collection is atomic_controls_dedup (contains ~51k atomare Controls)
results = await qdrant_search_cross_regulation(
embedding, top_k=20,
collection="atomic_controls_dedup",
)
# For each hit: resolve to a regulatory parent with source_citation.
# Atomic controls in Qdrant usually have parent_control_uuid → parent
# has the source_citation. We deduplicate by parent to avoid
# listing the same regulation multiple times.
rank = 0
seen_parents: set[str] = set()
for hit in results:
score = hit.get("score", 0)
if score < V1_MATCH_THRESHOLD:
continue
payload = hit.get("payload", {})
matched_uuid = payload.get("control_uuid")
if not matched_uuid or matched_uuid == str(v1.id):
continue
# Try the matched control itself first, then its parent
matched_row = db.execute(text("""
SELECT c.id, c.control_id, c.title, c.source_citation,
c.severity, c.category, c.parent_control_uuid
FROM canonical_controls c
WHERE c.id = CAST(:uuid AS uuid)
"""), {"uuid": matched_uuid}).fetchone()
if not matched_row:
continue
# Resolve to regulatory control (one with source_citation)
reg_row = matched_row
if not reg_row.source_citation and reg_row.parent_control_uuid:
# Look up parent — the parent has the source_citation
parent_row = db.execute(text("""
SELECT id, control_id, title, source_citation,
severity, category, parent_control_uuid
FROM canonical_controls
WHERE id = CAST(:uuid AS uuid)
AND source_citation IS NOT NULL
"""), {"uuid": str(reg_row.parent_control_uuid)}).fetchone()
if parent_row:
reg_row = parent_row
if not reg_row.source_citation:
continue
# Deduplicate by parent UUID
parent_key = str(reg_row.id)
if parent_key in seen_parents:
continue
seen_parents.add(parent_key)
rank += 1
if rank > V1_MAX_MATCHES:
break
# Extract source info
source_citation = reg_row.source_citation or {}
matched_source = source_citation.get("source") if isinstance(source_citation, dict) else None
matched_article = source_citation.get("article") if isinstance(source_citation, dict) else None
# Insert match — link to the regulatory parent (not the atomic child)
db.execute(text("""
INSERT INTO v1_control_matches
(v1_control_uuid, matched_control_uuid, similarity_score,
match_rank, matched_source, matched_article, match_method)
VALUES
(CAST(:v1_uuid AS uuid), CAST(:matched_uuid AS uuid), :score,
:rank, :source, :article, 'embedding')
ON CONFLICT (v1_control_uuid, matched_control_uuid) DO UPDATE
SET similarity_score = EXCLUDED.similarity_score,
match_rank = EXCLUDED.match_rank
"""), {
"v1_uuid": str(v1.id),
"matched_uuid": str(reg_row.id),
"score": round(score, 3),
"rank": rank,
"source": matched_source,
"article": matched_article,
})
matches_inserted += 1
# Collect sample
if len(sample_matches) < 20:
sample_matches.append({
"v1_control_id": v1.control_id,
"v1_title": v1.title,
"matched_control_id": reg_row.control_id,
"matched_title": reg_row.title,
"matched_source": matched_source,
"matched_article": matched_article,
"similarity_score": round(score, 3),
"match_rank": rank,
})
processed += 1
except Exception as e:
logger.warning("V1 enrichment error for %s: %s", v1.control_id, e)
errors.append({
"control_id": v1.control_id,
"error": str(e),
})
db.commit()
# Pagination
next_offset = offset + batch_size if len(v1_controls) == batch_size else None
return {
"dry_run": False,
"offset": offset,
"batch_size": batch_size,
"next_offset": next_offset,
"total_v1": total_v1,
"processed": processed,
"matches_inserted": matches_inserted,
"errors": errors[:10],
"sample_matches": sample_matches,
}
async def get_v1_matches(control_uuid: str) -> list[dict]:
"""Get all regulatory matches for a specific v1 control.
Args:
control_uuid: The UUID of the v1 control.
Returns:
List of match dicts with control details.
"""
with SessionLocal() as db:
rows = db.execute(text("""
SELECT
m.similarity_score,
m.match_rank,
m.matched_source,
m.matched_article,
m.match_method,
c.control_id AS matched_control_id,
c.title AS matched_title,
c.objective AS matched_objective,
c.severity AS matched_severity,
c.category AS matched_category,
c.source_citation AS matched_source_citation
FROM v1_control_matches m
JOIN canonical_controls c ON c.id = m.matched_control_uuid
WHERE m.v1_control_uuid = CAST(:uuid AS uuid)
ORDER BY m.match_rank
"""), {"uuid": control_uuid}).fetchall()
return [
{
"matched_control_id": r.matched_control_id,
"matched_title": r.matched_title,
"matched_objective": r.matched_objective,
"matched_severity": r.matched_severity,
"matched_category": r.matched_category,
"matched_source": r.matched_source,
"matched_article": r.matched_article,
"matched_source_citation": r.matched_source_citation,
"similarity_score": float(r.similarity_score),
"match_rank": r.match_rank,
"match_method": r.match_method,
}
for r in rows
]
async def get_v1_enrichment_stats() -> dict:
"""Get overview stats for v1 enrichment."""
with SessionLocal() as db:
total_v1 = db.execute(text(f"""
SELECT COUNT(*) AS cnt FROM canonical_controls
WHERE {_is_eigenentwicklung_query()}
""")).fetchone()
matched_v1 = db.execute(text(f"""
SELECT COUNT(DISTINCT m.v1_control_uuid) AS cnt
FROM v1_control_matches m
JOIN canonical_controls c ON c.id = m.v1_control_uuid
WHERE {_is_eigenentwicklung_query().replace('release_state', 'c.release_state').replace('generation_strategy', 'c.generation_strategy').replace('pipeline_version', 'c.pipeline_version').replace('source_citation', 'c.source_citation').replace('parent_control_uuid', 'c.parent_control_uuid')}
""")).fetchone()
total_matches = db.execute(text("""
SELECT COUNT(*) AS cnt FROM v1_control_matches
""")).fetchone()
avg_score = db.execute(text("""
SELECT AVG(similarity_score) AS avg_score FROM v1_control_matches
""")).fetchone()
return {
"total_v1_controls": total_v1.cnt if total_v1 else 0,
"v1_with_matches": matched_v1.cnt if matched_v1 else 0,
"v1_without_matches": (total_v1.cnt if total_v1 else 0) - (matched_v1.cnt if matched_v1 else 0),
"total_matches": total_matches.cnt if total_matches else 0,
"avg_similarity_score": round(float(avg_score.avg_score), 3) if avg_score and avg_score.avg_score else None,
}