# breakpilot-core/control-pipeline/services/v1_enrichment.py

"""V1 Control Enrichment Service — Match Eigenentwicklung controls to regulations.

Finds regulatory coverage for v1 controls (generation_strategy='ungrouped',
pipeline_version=1, no source_citation) by embedding similarity search.

Reuses embedding + Qdrant helpers from control_dedup.py.
"""

import logging
from typing import Optional

from sqlalchemy import text

from db.session import SessionLocal
from services.control_dedup import (
    get_embedding,
    qdrant_search_cross_regulation,
)

logger = logging.getLogger(__name__)

# Minimum similarity score a search hit needs to be recorded as a match.
# Deliberately lower than the dedup threshold (0.85) because these matches are
# informational only; typical top scores for v1 controls are 0.70-0.77.
V1_MATCH_THRESHOLD = 0.70
# Maximum number of regulatory matches recorded per v1 control.
V1_MAX_MATCHES = 5


def _is_eigenentwicklung_query(alias: str = "") -> str:
    """Return the SQL WHERE fragment identifying v1 Eigenentwicklung controls.

    The fragment selects rows of ``canonical_controls`` that are ungrouped,
    belong to pipeline version 1 (or have no version), carry no source
    citation, have no parent control and are not rejected/merged/deprecated.

    Args:
        alias: Optional table alias used to qualify every column (e.g. ``"c"``
            yields ``c.generation_strategy = ...``). Needed when the fragment
            is embedded in a query that joins other tables. With the default
            empty string the columns stay unqualified.

    Returns:
        The WHERE fragment (without the ``WHERE`` keyword).

    Raises:
        ValueError: If ``alias`` is not a plain identifier. The alias is
            interpolated into SQL, so anything else is rejected.
    """
    if alias and not alias.isidentifier():
        raise ValueError(f"Invalid SQL table alias: {alias!r}")

    col = f"{alias}." if alias else ""
    return f"""
        {col}generation_strategy = 'ungrouped'
        AND ({col}pipeline_version = '1' OR {col}pipeline_version IS NULL)
        AND {col}source_citation IS NULL
        AND {col}parent_control_uuid IS NULL
        AND {col}release_state NOT IN ('rejected', 'merged', 'deprecated')
    """


async def count_v1_controls() -> int:
    """Return the number of v1 Eigenentwicklung controls in canonical_controls.

    Returns:
        The row count for the v1 filter, or 0 if the query yields no row.
    """
    where_clause = _is_eigenentwicklung_query()

    with SessionLocal() as session:
        statement = text(f"""
            SELECT COUNT(*) AS cnt
            FROM canonical_controls
            WHERE {where_clause}
        """)
        result_row = session.execute(statement).fetchone()

    if not result_row:
        return 0
    return result_row.cnt


def _build_v1_search_text(title: Optional[str], objective: Optional[str]) -> str:
    """Return the text to embed for a v1 control, skipping missing parts."""
    # Avoids embedding the literal word "None" when a column is NULL.
    return " — ".join(part for part in (title, objective) if part)


def _resolve_regulatory_control(db, control_uuid: str):
    """Return the control row that carries a source_citation for a hit, or None.

    Tries the matched control itself first; if it has no source_citation but
    has a parent, the parent is used when the parent has one.
    """
    matched_row = db.execute(text("""
        SELECT c.id, c.control_id, c.title, c.source_citation,
               c.severity, c.category, c.parent_control_uuid
        FROM canonical_controls c
        WHERE c.id = CAST(:uuid AS uuid)
    """), {"uuid": control_uuid}).fetchone()

    if not matched_row:
        return None
    if matched_row.source_citation:
        return matched_row
    if not matched_row.parent_control_uuid:
        return None

    parent_row = db.execute(text("""
        SELECT id, control_id, title, source_citation,
               severity, category, parent_control_uuid
        FROM canonical_controls
        WHERE id = CAST(:uuid AS uuid)
          AND source_citation IS NOT NULL
    """), {"uuid": str(matched_row.parent_control_uuid)}).fetchone()

    # An empty citation (e.g. {}) is treated like a missing one.
    if parent_row and parent_row.source_citation:
        return parent_row
    return None


def _store_v1_matches(db, v1, results) -> list[dict]:
    """Upsert up to V1_MAX_MATCHES matches for one v1 control.

    Returns one summary dict per stored match, in rank order. Nothing is
    committed here; the caller owns the transaction.
    """
    stored: list[dict] = []
    # Several atomic hits can resolve to the same regulatory control; each
    # regulatory control is recorded only once per v1 control.
    seen_regulatory_ids: set[str] = set()

    for hit in results or []:
        score = hit.get("score", 0)
        if score < V1_MATCH_THRESHOLD:
            continue

        payload = hit.get("payload") or {}
        matched_uuid = payload.get("control_uuid")
        if not matched_uuid or matched_uuid == str(v1.id):
            continue

        reg_row = _resolve_regulatory_control(db, matched_uuid)
        if reg_row is None:
            continue

        reg_key = str(reg_row.id)
        if reg_key in seen_regulatory_ids:
            continue
        seen_regulatory_ids.add(reg_key)

        rank = len(stored) + 1

        citation = reg_row.source_citation or {}
        is_mapping = isinstance(citation, dict)
        matched_source = citation.get("source") if is_mapping else None
        matched_article = citation.get("article") if is_mapping else None

        # Link to the resolved regulatory control, not the atomic child hit.
        db.execute(text("""
            INSERT INTO v1_control_matches
                (v1_control_uuid, matched_control_uuid, similarity_score,
                 match_rank, matched_source, matched_article, match_method)
            VALUES
                (CAST(:v1_uuid AS uuid), CAST(:matched_uuid AS uuid), :score,
                 :rank, :source, :article, 'embedding')
            ON CONFLICT (v1_control_uuid, matched_control_uuid) DO UPDATE
            SET similarity_score = EXCLUDED.similarity_score,
                match_rank = EXCLUDED.match_rank
        """), {
            "v1_uuid": str(v1.id),
            "matched_uuid": reg_key,
            "score": round(score, 3),
            "rank": rank,
            "source": matched_source,
            "article": matched_article,
        })

        stored.append({
            "v1_control_id": v1.control_id,
            "v1_title": v1.title,
            "matched_control_id": reg_row.control_id,
            "matched_title": reg_row.title,
            "matched_source": matched_source,
            "matched_article": matched_article,
            "similarity_score": round(score, 3),
            "match_rank": rank,
        })

        # Stop as soon as the cap is reached so no further lookups are issued.
        if len(stored) >= V1_MAX_MATCHES:
            break

    return stored


async def enrich_v1_matches(
    dry_run: bool = True,
    batch_size: int = 100,
    offset: int = 0,
) -> dict:
    """Find regulatory matches for one page of v1 Eigenentwicklung controls.

    For every v1 control in the page, the title/objective text is embedded,
    the ``atomic_controls_dedup`` collection is searched, and each hit above
    V1_MATCH_THRESHOLD is resolved to a control with a source_citation (the
    hit itself or its parent). Up to V1_MAX_MATCHES distinct regulatory
    controls per v1 control are upserted into ``v1_control_matches``.

    Each control is written inside its own SAVEPOINT: if one control fails
    (embedding, search or SQL error), only that control's writes are rolled
    back and reported in ``errors``; the rest of the batch is still committed.

    Args:
        dry_run: If True, nothing is embedded or written; the result only
            lists up to 20 controls of the requested page.
        batch_size: Number of v1 controls to process per call.
        offset: Pagination offset (v1 control index, ordered by control_id).

    Returns:
        Stats dict. For a real run: counts (``processed``,
        ``matches_inserted``), up to 10 ``errors``, up to 20
        ``sample_matches`` and ``next_offset`` (None when the page was not
        full). For a dry run or an empty page a reduced dict is returned.
    """
    where_clause = _is_eigenentwicklung_query()
    sample_limit = 20

    with SessionLocal() as db:
        # 1. Load the requested page of v1 controls.
        v1_controls = db.execute(text(f"""
            SELECT id, control_id, title, objective, category
            FROM canonical_controls
            WHERE {where_clause}
            ORDER BY control_id
            LIMIT :limit OFFSET :offset
        """), {"limit": batch_size, "offset": offset}).fetchall()

        # Total count, reported back so callers can paginate.
        total_row = db.execute(text(f"""
            SELECT COUNT(*) AS cnt
            FROM canonical_controls
            WHERE {where_clause}
        """)).fetchone()
        total_v1 = total_row.cnt if total_row else 0

        if not v1_controls:
            return {
                "dry_run": dry_run,
                "processed": 0,
                "total_v1": total_v1,
                "message": "Kein weiterer Batch — alle v1 Controls verarbeitet.",
            }

        if dry_run:
            return {
                "dry_run": True,
                "total_v1": total_v1,
                "offset": offset,
                "batch_size": batch_size,
                "sample_controls": [
                    {
                        "control_id": r.control_id,
                        "title": r.title,
                        "category": r.category,
                    }
                    for r in v1_controls[:sample_limit]
                ],
            }

        # 2. Process each v1 control independently.
        processed = 0
        matches_inserted = 0
        errors = []
        sample_matches = []

        for v1 in v1_controls:
            try:
                embedding = await get_embedding(
                    _build_v1_search_text(v1.title, v1.objective)
                )
                if not embedding:
                    errors.append({
                        "control_id": v1.control_id,
                        "error": "Embedding fehlgeschlagen",
                    })
                    continue

                # Cross-regulation search without pattern filter.
                results = await qdrant_search_cross_regulation(
                    embedding, top_k=20,
                    collection="atomic_controls_dedup",
                )

                # SAVEPOINT per control: a failing statement would otherwise
                # abort the whole transaction and silently discard the
                # matches of every other control in this batch.
                with db.begin_nested():
                    stored = _store_v1_matches(db, v1, results)

            except Exception as e:
                logger.warning("V1 enrichment error for %s: %s", v1.control_id, e)
                errors.append({
                    "control_id": v1.control_id,
                    "error": str(e),
                })
                continue

            # Count only after the savepoint was released successfully.
            processed += 1
            matches_inserted += len(stored)
            free_slots = sample_limit - len(sample_matches)
            if free_slots > 0:
                sample_matches.extend(stored[:free_slots])

        db.commit()

    # A full page means there may be more controls to process.
    next_offset = offset + batch_size if len(v1_controls) == batch_size else None

    return {
        "dry_run": False,
        "offset": offset,
        "batch_size": batch_size,
        "next_offset": next_offset,
        "total_v1": total_v1,
        "processed": processed,
        "matches_inserted": matches_inserted,
        "errors": errors[:10],
        "sample_matches": sample_matches,
    }


async def get_v1_matches(control_uuid: str) -> list[dict]:
    """Return the stored regulatory matches of one v1 control, best rank first.

    Args:
        control_uuid: UUID of the v1 control whose matches are requested.

    Returns:
        One dict per match, combining the stored match data (score, rank,
        source, article, method) with details of the matched control.
    """
    with SessionLocal() as session:
        query = text("""
            SELECT
                m.similarity_score,
                m.match_rank,
                m.matched_source,
                m.matched_article,
                m.match_method,
                c.control_id AS matched_control_id,
                c.title AS matched_title,
                c.objective AS matched_objective,
                c.severity AS matched_severity,
                c.category AS matched_category,
                c.source_citation AS matched_source_citation
            FROM v1_control_matches m
            JOIN canonical_controls c ON c.id = m.matched_control_uuid
            WHERE m.v1_control_uuid = CAST(:uuid AS uuid)
            ORDER BY m.match_rank
        """)
        match_rows = session.execute(query, {"uuid": control_uuid}).fetchall()

        matches: list[dict] = []
        for row in match_rows:
            entry = {
                "matched_control_id": row.matched_control_id,
                "matched_title": row.matched_title,
                "matched_objective": row.matched_objective,
                "matched_severity": row.matched_severity,
                "matched_category": row.matched_category,
                "matched_source": row.matched_source,
                "matched_article": row.matched_article,
                "matched_source_citation": row.matched_source_citation,
                # Converted so the score is a plain float in the result.
                "similarity_score": float(row.similarity_score),
                "match_rank": row.match_rank,
                "match_method": row.match_method,
            }
            matches.append(entry)
        return matches


async def get_v1_enrichment_stats() -> dict:
    """Return overview statistics for the v1 enrichment.

    Returns:
        Dict with the number of v1 controls, how many of them have at least
        one stored match, how many have none, the total number of stored
        matches and the average similarity score (rounded to 3 decimals, or
        None when there are no matches).
    """
    base_filter = _is_eigenentwicklung_query()

    # The matched-controls query joins v1_control_matches, so every column of
    # the filter is qualified with the canonical_controls alias "c".
    qualified_filter = base_filter
    for column in (
        "release_state",
        "generation_strategy",
        "pipeline_version",
        "source_citation",
        "parent_control_uuid",
    ):
        qualified_filter = qualified_filter.replace(column, f"c.{column}")

    with SessionLocal() as db:
        total_row = db.execute(text(f"""
            SELECT COUNT(*) AS cnt FROM canonical_controls
            WHERE {base_filter}
        """)).fetchone()

        matched_row = db.execute(text(f"""
            SELECT COUNT(DISTINCT m.v1_control_uuid) AS cnt
            FROM v1_control_matches m
            JOIN canonical_controls c ON c.id = m.v1_control_uuid
            WHERE {qualified_filter}
        """)).fetchone()

        matches_row = db.execute(text("""
            SELECT COUNT(*) AS cnt FROM v1_control_matches
        """)).fetchone()

        avg_row = db.execute(text("""
            SELECT AVG(similarity_score) AS avg_score FROM v1_control_matches
        """)).fetchone()

    total_v1 = total_row.cnt if total_row else 0
    v1_with_matches = matched_row.cnt if matched_row else 0
    total_matches = matches_row.cnt if matches_row else 0

    # AVG() is NULL only when the table is empty; an average of 0 is a real
    # value and must not be reported as "no data".
    avg_value = avg_row.avg_score if avg_row else None
    avg_similarity = round(float(avg_value), 3) if avg_value is not None else None

    return {
        "total_v1_controls": total_v1,
        "v1_with_matches": v1_with_matches,
        "v1_without_matches": total_v1 - v1_with_matches,
        "total_matches": total_matches,
        "avg_similarity_score": avg_similarity,
    }