Control-Pipeline (Pass 0a/0b, BatchDedup, Generator) als eigenstaendiger Service in Core, damit Compliance-Repo unabhaengig refaktoriert werden kann. Schreibt weiterhin ins compliance-Schema der shared PostgreSQL. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
332 lines
13 KiB
Python
332 lines
13 KiB
Python
"""V1 Control Enrichment Service — Match Eigenentwicklung controls to regulations.

Finds regulatory coverage for v1 controls (generation_strategy='ungrouped',
pipeline_version 1 or unset, no source_citation) by embedding similarity search.

Reuses embedding + Qdrant helpers from control_dedup.py.
"""
|
|
|
|
import logging
|
|
from typing import Optional
|
|
|
|
from sqlalchemy import text
|
|
|
|
from db.session import SessionLocal
|
|
from services.control_dedup import (
|
|
get_embedding,
|
|
qdrant_search_cross_regulation,
|
|
)
|
|
|
|
# Module-level logger, named after this module.
logger: logging.Logger = logging.getLogger(__name__)

# Minimum similarity score a search hit needs before it is stored as a match.
# Deliberately lower than the dedup threshold (0.85) because these matches are
# informational only; typical top scores for v1 controls are 0.70-0.77.
V1_MATCH_THRESHOLD: float = 0.70

# Upper bound on the number of matches stored per v1 control.
V1_MAX_MATCHES: int = 5
|
|
|
|
|
|
def _is_eigenentwicklung_query() -> str:
|
|
"""SQL WHERE clause identifying v1 Eigenentwicklung controls."""
|
|
return """
|
|
generation_strategy = 'ungrouped'
|
|
AND (pipeline_version = '1' OR pipeline_version IS NULL)
|
|
AND source_citation IS NULL
|
|
AND parent_control_uuid IS NULL
|
|
AND release_state NOT IN ('rejected', 'merged', 'deprecated')
|
|
"""
|
|
|
|
|
|
async def count_v1_controls() -> int:
    """Return the number of controls matching the v1 Eigenentwicklung filter.

    Returns:
        The row count from canonical_controls, or 0 if the query yields no row.
    """
    where_clause = _is_eigenentwicklung_query()
    statement = text(f"""
        SELECT COUNT(*) AS cnt
        FROM canonical_controls
        WHERE {where_clause}
    """)

    with SessionLocal() as db:
        result = db.execute(statement).fetchone()
        if not result:
            return 0
        return result.cnt
|
|
|
|
|
|
def _resolve_regulatory_control(db, control_uuid: str):
    """Resolve a matched control to the row that carries a source_citation.

    Looks up ``control_uuid`` in canonical_controls. If that row has a
    source_citation it is returned; otherwise its direct parent (via
    parent_control_uuid) is returned when the parent has one. Only one parent
    level is followed.

    Returns:
        The resolved row, or None if the control does not exist or neither it
        nor its parent has a source_citation.
    """
    row = db.execute(text("""
        SELECT c.id, c.control_id, c.title, c.source_citation,
               c.severity, c.category, c.parent_control_uuid
        FROM canonical_controls c
        WHERE c.id = CAST(:uuid AS uuid)
    """), {"uuid": control_uuid}).fetchone()
    if not row:
        return None
    if row.source_citation:
        return row
    if not row.parent_control_uuid:
        return None

    parent = db.execute(text("""
        SELECT id, control_id, title, source_citation,
               severity, category, parent_control_uuid
        FROM canonical_controls
        WHERE id = CAST(:uuid AS uuid)
          AND source_citation IS NOT NULL
    """), {"uuid": str(row.parent_control_uuid)}).fetchone()
    # A non-NULL but empty citation is treated like a missing one.
    if parent and parent.source_citation:
        return parent
    return None


def _store_v1_matches(db, v1, hits) -> list[dict]:
    """Upsert the regulatory matches of one v1 control.

    Walks the search hits in the order given, skips hits below
    V1_MATCH_THRESHOLD, hits without a control_uuid and the v1 control itself,
    resolves each remaining hit to a control with a source_citation, and
    upserts at most V1_MAX_MATCHES distinct resolved controls into
    v1_control_matches. Does not commit.

    Returns:
        One summary dict per stored match, in rank order.
    """
    stored: list[dict] = []
    seen_parents: set[str] = set()
    v1_uuid = str(v1.id)

    for hit in hits:
        # Stop before doing further lookups once the cap is reached.
        if len(stored) >= V1_MAX_MATCHES:
            break

        score = hit.get("score", 0)
        if score < V1_MATCH_THRESHOLD:
            continue

        payload = hit.get("payload") or {}
        matched_uuid = payload.get("control_uuid")
        if not matched_uuid or matched_uuid == v1_uuid:
            continue

        reg_row = _resolve_regulatory_control(db, matched_uuid)
        if reg_row is None:
            continue

        # Several hits can resolve to the same control; keep the first only.
        parent_key = str(reg_row.id)
        if parent_key in seen_parents:
            continue
        seen_parents.add(parent_key)

        rank = len(stored) + 1

        citation = reg_row.source_citation
        if isinstance(citation, dict):
            matched_source = citation.get("source")
            matched_article = citation.get("article")
        else:
            matched_source = None
            matched_article = None

        # Link to the resolved regulatory control, not to the raw hit.
        db.execute(text("""
            INSERT INTO v1_control_matches
                (v1_control_uuid, matched_control_uuid, similarity_score,
                 match_rank, matched_source, matched_article, match_method)
            VALUES
                (CAST(:v1_uuid AS uuid), CAST(:matched_uuid AS uuid), :score,
                 :rank, :source, :article, 'embedding')
            ON CONFLICT (v1_control_uuid, matched_control_uuid) DO UPDATE
                SET similarity_score = EXCLUDED.similarity_score,
                    match_rank = EXCLUDED.match_rank
        """), {
            "v1_uuid": v1_uuid,
            "matched_uuid": parent_key,
            "score": round(score, 3),
            "rank": rank,
            "source": matched_source,
            "article": matched_article,
        })

        stored.append({
            "v1_control_id": v1.control_id,
            "v1_title": v1.title,
            "matched_control_id": reg_row.control_id,
            "matched_title": reg_row.title,
            "matched_source": matched_source,
            "matched_article": matched_article,
            "similarity_score": round(score, 3),
            "match_rank": rank,
        })

    return stored


async def enrich_v1_matches(
    dry_run: bool = True,
    batch_size: int = 100,
    offset: int = 0,
) -> dict:
    """Find regulatory matches for v1 Eigenentwicklung controls.

    Loads one page of v1 controls ordered by control_id. For each control it
    embeds "title — objective", searches the atomic_controls_dedup collection
    and upserts the resolved matches into v1_control_matches. The batch is
    committed once at the end.

    Each control is written inside its own SAVEPOINT, so a failure on one
    control rolls back only that control's rows and is reported in "errors";
    the remaining controls of the batch are still processed.

    Args:
        dry_run: If True, nothing is embedded or written; the result only
            lists up to 20 controls of the selected page.
        batch_size: Number of v1 controls to process per call.
        offset: Pagination offset (v1 control index).

    Returns:
        Stats dict. Always contains "dry_run" and "total_v1". A real run adds
        "offset", "batch_size", "next_offset" (None once a page comes back
        short), "processed", "matches_inserted", the first 10 "errors" and up
        to 20 "sample_matches". An empty page yields "processed": 0 and a
        "message" instead.
    """
    sample_limit = 20

    with SessionLocal() as db:
        # 1. Load v1 controls (paginated)
        v1_controls = db.execute(text(f"""
            SELECT id, control_id, title, objective, category
            FROM canonical_controls
            WHERE {_is_eigenentwicklung_query()}
            ORDER BY control_id
            LIMIT :limit OFFSET :offset
        """), {"limit": batch_size, "offset": offset}).fetchall()

        # Count total for pagination
        total_row = db.execute(text(f"""
            SELECT COUNT(*) AS cnt
            FROM canonical_controls
            WHERE {_is_eigenentwicklung_query()}
        """)).fetchone()
        total_v1 = total_row.cnt if total_row else 0

        if not v1_controls:
            return {
                "dry_run": dry_run,
                "processed": 0,
                "total_v1": total_v1,
                "message": "Kein weiterer Batch — alle v1 Controls verarbeitet.",
            }

        if dry_run:
            return {
                "dry_run": True,
                "total_v1": total_v1,
                "offset": offset,
                "batch_size": batch_size,
                "sample_controls": [
                    {
                        "control_id": r.control_id,
                        "title": r.title,
                        "category": r.category,
                    }
                    for r in v1_controls[:sample_limit]
                ],
            }

        # 2. Process each v1 control
        processed = 0
        matches_inserted = 0
        errors: list[dict] = []
        sample_matches: list[dict] = []

        for v1 in v1_controls:
            try:
                # Skip missing parts so a NULL objective is not embedded as
                # the literal text "None".
                search_text = " — ".join(
                    part for part in (v1.title, v1.objective) if part
                )

                embedding = await get_embedding(search_text)
                if not embedding:
                    errors.append({
                        "control_id": v1.control_id,
                        "error": "Embedding fehlgeschlagen",
                    })
                    continue

                # Cross-regulation search, no pattern filter.
                results = await qdrant_search_cross_regulation(
                    embedding, top_k=20,
                    collection="atomic_controls_dedup",
                )

                # SAVEPOINT per control: a failing statement would otherwise
                # abort the whole transaction and break every later control.
                with db.begin_nested():
                    control_matches = _store_v1_matches(db, v1, results)
            except Exception as e:
                logger.warning("V1 enrichment error for %s: %s", v1.control_id, e)
                errors.append({
                    "control_id": v1.control_id,
                    "error": str(e),
                })
                continue

            # Only count what survived the savepoint.
            processed += 1
            matches_inserted += len(control_matches)
            room = sample_limit - len(sample_matches)
            if room > 0:
                sample_matches.extend(control_matches[:room])

        db.commit()

        # A short page means there is nothing left after this batch.
        next_offset = offset + batch_size if len(v1_controls) == batch_size else None

        return {
            "dry_run": False,
            "offset": offset,
            "batch_size": batch_size,
            "next_offset": next_offset,
            "total_v1": total_v1,
            "processed": processed,
            "matches_inserted": matches_inserted,
            "errors": errors[:10],
            "sample_matches": sample_matches,
        }
|
|
|
|
|
|
async def get_v1_matches(control_uuid: str) -> list[dict]:
    """Return the stored regulatory matches of one v1 control.

    Joins v1_control_matches with canonical_controls on the matched control
    and returns the rows ordered by match_rank.

    Args:
        control_uuid: The UUID of the v1 control.

    Returns:
        One dict per match holding the matched control's details, the stored
        source/article, the similarity score as float, the rank and the
        match method.
    """
    statement = text("""
        SELECT
            m.similarity_score,
            m.match_rank,
            m.matched_source,
            m.matched_article,
            m.match_method,
            c.control_id AS matched_control_id,
            c.title AS matched_title,
            c.objective AS matched_objective,
            c.severity AS matched_severity,
            c.category AS matched_category,
            c.source_citation AS matched_source_citation
        FROM v1_control_matches m
        JOIN canonical_controls c ON c.id = m.matched_control_uuid
        WHERE m.v1_control_uuid = CAST(:uuid AS uuid)
        ORDER BY m.match_rank
    """)

    with SessionLocal() as db:
        match_rows = db.execute(statement, {"uuid": control_uuid}).fetchall()

        matches: list[dict] = []
        for row in match_rows:
            entry = {
                "matched_control_id": row.matched_control_id,
                "matched_title": row.matched_title,
                "matched_objective": row.matched_objective,
                "matched_severity": row.matched_severity,
                "matched_category": row.matched_category,
                "matched_source": row.matched_source,
                "matched_article": row.matched_article,
                "matched_source_citation": row.matched_source_citation,
                "similarity_score": float(row.similarity_score),
                "match_rank": row.match_rank,
                "match_method": row.match_method,
            }
            matches.append(entry)
        return matches
|
|
|
|
|
|
async def get_v1_enrichment_stats() -> dict:
    """Return overview statistics for the v1 enrichment.

    Returns:
        Dict with:
            total_v1_controls: controls matching the v1 filter.
            v1_with_matches: distinct v1 controls (still matching the filter)
                that have at least one row in v1_control_matches.
            v1_without_matches: difference of the two values above.
            total_matches: all rows in v1_control_matches.
            avg_similarity_score: average score rounded to 3 decimals, or
                None when the table has no scores.
    """
    with SessionLocal() as db:
        total_v1 = db.execute(text(f"""
            SELECT COUNT(*) AS cnt FROM canonical_controls
            WHERE {_is_eigenentwicklung_query()}
        """)).scalar() or 0

        # The filter runs inside a subquery on canonical_controls, so its
        # unqualified column names need no table alias.
        matched_v1 = db.execute(text(f"""
            SELECT COUNT(DISTINCT m.v1_control_uuid) AS cnt
            FROM v1_control_matches m
            WHERE m.v1_control_uuid IN (
                SELECT id FROM canonical_controls
                WHERE {_is_eigenentwicklung_query()}
            )
        """)).scalar() or 0

        # Both aggregates come from the same table; fetch them in one query.
        match_stats = db.execute(text("""
            SELECT COUNT(*) AS cnt, AVG(similarity_score) AS avg_score
            FROM v1_control_matches
        """)).fetchone()

        total_matches = match_stats.cnt if match_stats else 0
        avg_score = match_stats.avg_score if match_stats else None

        return {
            "total_v1_controls": total_v1,
            "v1_with_matches": matched_v1,
            "v1_without_matches": total_v1 - matched_v1,
            "total_matches": total_matches,
            # Compare against None so an average of exactly 0 is still reported.
            "avg_similarity_score": (
                round(float(avg_score), 3) if avg_score is not None else None
            ),
        }
|