"""V1 Control Enrichment Service — Match Eigenentwicklung controls to regulations. Finds regulatory coverage for v1 controls (generation_strategy='ungrouped', pipeline_version=1, no source_citation) by embedding similarity search. Reuses embedding + Qdrant helpers from control_dedup.py. """ import logging from typing import Optional from sqlalchemy import text from db.session import SessionLocal from services.control_dedup import ( get_embedding, qdrant_search_cross_regulation, ) logger = logging.getLogger(__name__) # Similarity threshold — lower than dedup (0.85) since we want informational matches # Typical top scores for v1 controls are 0.70-0.77 V1_MATCH_THRESHOLD = 0.70 V1_MAX_MATCHES = 5 def _is_eigenentwicklung_query() -> str: """SQL WHERE clause identifying v1 Eigenentwicklung controls.""" return """ generation_strategy = 'ungrouped' AND (pipeline_version = '1' OR pipeline_version IS NULL) AND source_citation IS NULL AND parent_control_uuid IS NULL AND release_state NOT IN ('rejected', 'merged', 'deprecated') """ async def count_v1_controls() -> int: """Count how many v1 Eigenentwicklung controls exist.""" with SessionLocal() as db: row = db.execute(text(f""" SELECT COUNT(*) AS cnt FROM canonical_controls WHERE {_is_eigenentwicklung_query()} """)).fetchone() return row.cnt if row else 0 async def enrich_v1_matches( dry_run: bool = True, batch_size: int = 100, offset: int = 0, ) -> dict: """Find regulatory matches for v1 Eigenentwicklung controls. Args: dry_run: If True, only count — don't write matches. batch_size: Number of v1 controls to process per call. offset: Pagination offset (v1 control index). Returns: Stats dict with counts, sample matches, and pagination info. """ with SessionLocal() as db: # 1. Load v1 controls (paginated) v1_controls = db.execute(text(f""" SELECT id, control_id, title, objective, category FROM canonical_controls WHERE {_is_eigenentwicklung_query()} ORDER BY control_id LIMIT :limit OFFSET :offset """), {"limit": batch_size, "offset": offset}).fetchall() # Count total for pagination total_row = db.execute(text(f""" SELECT COUNT(*) AS cnt FROM canonical_controls WHERE {_is_eigenentwicklung_query()} """)).fetchone() total_v1 = total_row.cnt if total_row else 0 if not v1_controls: return { "dry_run": dry_run, "processed": 0, "total_v1": total_v1, "message": "Kein weiterer Batch — alle v1 Controls verarbeitet.", } if dry_run: return { "dry_run": True, "total_v1": total_v1, "offset": offset, "batch_size": batch_size, "sample_controls": [ { "control_id": r.control_id, "title": r.title, "category": r.category, } for r in v1_controls[:20] ], } # 2. Process each v1 control processed = 0 matches_inserted = 0 errors = [] sample_matches = [] for v1 in v1_controls: try: # Build search text search_text = f"{v1.title} — {v1.objective}" # Get embedding embedding = await get_embedding(search_text) if not embedding: errors.append({ "control_id": v1.control_id, "error": "Embedding fehlgeschlagen", }) continue # Search Qdrant (cross-regulation, no pattern filter) # Collection is atomic_controls_dedup (contains ~51k atomare Controls) results = await qdrant_search_cross_regulation( embedding, top_k=20, collection="atomic_controls_dedup", ) # For each hit: resolve to a regulatory parent with source_citation. # Atomic controls in Qdrant usually have parent_control_uuid → parent # has the source_citation. We deduplicate by parent to avoid # listing the same regulation multiple times. rank = 0 seen_parents: set[str] = set() for hit in results: score = hit.get("score", 0) if score < V1_MATCH_THRESHOLD: continue payload = hit.get("payload", {}) matched_uuid = payload.get("control_uuid") if not matched_uuid or matched_uuid == str(v1.id): continue # Try the matched control itself first, then its parent matched_row = db.execute(text(""" SELECT c.id, c.control_id, c.title, c.source_citation, c.severity, c.category, c.parent_control_uuid FROM canonical_controls c WHERE c.id = CAST(:uuid AS uuid) """), {"uuid": matched_uuid}).fetchone() if not matched_row: continue # Resolve to regulatory control (one with source_citation) reg_row = matched_row if not reg_row.source_citation and reg_row.parent_control_uuid: # Look up parent — the parent has the source_citation parent_row = db.execute(text(""" SELECT id, control_id, title, source_citation, severity, category, parent_control_uuid FROM canonical_controls WHERE id = CAST(:uuid AS uuid) AND source_citation IS NOT NULL """), {"uuid": str(reg_row.parent_control_uuid)}).fetchone() if parent_row: reg_row = parent_row if not reg_row.source_citation: continue # Deduplicate by parent UUID parent_key = str(reg_row.id) if parent_key in seen_parents: continue seen_parents.add(parent_key) rank += 1 if rank > V1_MAX_MATCHES: break # Extract source info source_citation = reg_row.source_citation or {} matched_source = source_citation.get("source") if isinstance(source_citation, dict) else None matched_article = source_citation.get("article") if isinstance(source_citation, dict) else None # Insert match — link to the regulatory parent (not the atomic child) db.execute(text(""" INSERT INTO v1_control_matches (v1_control_uuid, matched_control_uuid, similarity_score, match_rank, matched_source, matched_article, match_method) VALUES (CAST(:v1_uuid AS uuid), CAST(:matched_uuid AS uuid), :score, :rank, :source, :article, 'embedding') ON CONFLICT (v1_control_uuid, matched_control_uuid) DO UPDATE SET similarity_score = EXCLUDED.similarity_score, match_rank = EXCLUDED.match_rank """), { "v1_uuid": str(v1.id), "matched_uuid": str(reg_row.id), "score": round(score, 3), "rank": rank, "source": matched_source, "article": matched_article, }) matches_inserted += 1 # Collect sample if len(sample_matches) < 20: sample_matches.append({ "v1_control_id": v1.control_id, "v1_title": v1.title, "matched_control_id": reg_row.control_id, "matched_title": reg_row.title, "matched_source": matched_source, "matched_article": matched_article, "similarity_score": round(score, 3), "match_rank": rank, }) processed += 1 except Exception as e: logger.warning("V1 enrichment error for %s: %s", v1.control_id, e) errors.append({ "control_id": v1.control_id, "error": str(e), }) db.commit() # Pagination next_offset = offset + batch_size if len(v1_controls) == batch_size else None return { "dry_run": False, "offset": offset, "batch_size": batch_size, "next_offset": next_offset, "total_v1": total_v1, "processed": processed, "matches_inserted": matches_inserted, "errors": errors[:10], "sample_matches": sample_matches, } async def get_v1_matches(control_uuid: str) -> list[dict]: """Get all regulatory matches for a specific v1 control. Args: control_uuid: The UUID of the v1 control. Returns: List of match dicts with control details. """ with SessionLocal() as db: rows = db.execute(text(""" SELECT m.similarity_score, m.match_rank, m.matched_source, m.matched_article, m.match_method, c.control_id AS matched_control_id, c.title AS matched_title, c.objective AS matched_objective, c.severity AS matched_severity, c.category AS matched_category, c.source_citation AS matched_source_citation FROM v1_control_matches m JOIN canonical_controls c ON c.id = m.matched_control_uuid WHERE m.v1_control_uuid = CAST(:uuid AS uuid) ORDER BY m.match_rank """), {"uuid": control_uuid}).fetchall() return [ { "matched_control_id": r.matched_control_id, "matched_title": r.matched_title, "matched_objective": r.matched_objective, "matched_severity": r.matched_severity, "matched_category": r.matched_category, "matched_source": r.matched_source, "matched_article": r.matched_article, "matched_source_citation": r.matched_source_citation, "similarity_score": float(r.similarity_score), "match_rank": r.match_rank, "match_method": r.match_method, } for r in rows ] async def get_v1_enrichment_stats() -> dict: """Get overview stats for v1 enrichment.""" with SessionLocal() as db: total_v1 = db.execute(text(f""" SELECT COUNT(*) AS cnt FROM canonical_controls WHERE {_is_eigenentwicklung_query()} """)).fetchone() matched_v1 = db.execute(text(f""" SELECT COUNT(DISTINCT m.v1_control_uuid) AS cnt FROM v1_control_matches m JOIN canonical_controls c ON c.id = m.v1_control_uuid WHERE {_is_eigenentwicklung_query().replace('release_state', 'c.release_state').replace('generation_strategy', 'c.generation_strategy').replace('pipeline_version', 'c.pipeline_version').replace('source_citation', 'c.source_citation').replace('parent_control_uuid', 'c.parent_control_uuid')} """)).fetchone() total_matches = db.execute(text(""" SELECT COUNT(*) AS cnt FROM v1_control_matches """)).fetchone() avg_score = db.execute(text(""" SELECT AVG(similarity_score) AS avg_score FROM v1_control_matches """)).fetchone() return { "total_v1_controls": total_v1.cnt if total_v1 else 0, "v1_with_matches": matched_v1.cnt if matched_v1 else 0, "v1_without_matches": (total_v1.cnt if total_v1 else 0) - (matched_v1.cnt if matched_v1 else 0), "total_matches": total_matches.cnt if total_matches else 0, "avg_similarity_score": round(float(avg_score.avg_score), 3) if avg_score and avg_score.avg_score else None, }