Files
breakpilot-compliance/backend-compliance/compliance/api/control_generator_routes.py
Benjamin Admin 3b2006ebce
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 44s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 13s
CI/CD / Deploy (push) Successful in 4s
feat(iace): add hazard-matching-engine with component library, tag system, and pattern engine
Implements Phases 1-4 of the IACE Hazard-Matching-Engine:
- 120 machine components (C001-C120) in 11 categories
- 20 energy sources (EN01-EN20)
- ~85 tag taxonomy across 5 domains
- 44 hazard patterns with AND/NOT matching logic
- Pattern engine with tag resolution and confidence scoring
- 8 new API endpoints (component-library, energy-sources, tags, patterns, match/apply)
- Completeness gate G09 for pattern matching
- 320 tests passing (36 new)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 08:50:11 +01:00

785 lines
28 KiB
Python

"""
FastAPI routes for the Control Generator Pipeline.
Endpoints:
POST /v1/canonical/generate — Start generation run
GET /v1/canonical/generate/status/{job_id} — Job status
GET /v1/canonical/generate/jobs — All jobs
GET /v1/canonical/generate/review-queue — Controls needing review
POST /v1/canonical/generate/review/{control_id} — Complete review
GET /v1/canonical/generate/processed-stats — Processing stats per collection
GET /v1/canonical/blocked-sources — Blocked sources list
POST /v1/canonical/blocked-sources/cleanup — Start cleanup workflow
GET /v1/canonical/controls-customer — Controls filtered for customer visibility
POST /v1/canonical/generate/backfill-citations — Start citation backfill
GET /v1/canonical/generate/backfill-status/{backfill_id} — Citation backfill status
POST /v1/canonical/generate/backfill-domain — Start domain/audience backfill
GET /v1/canonical/generate/domain-backfill-status/{backfill_id} — Domain backfill status
"""
import asyncio
import json
import logging
from typing import Optional, List
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from database import SessionLocal
from compliance.services.control_generator import (
ControlGeneratorPipeline,
GeneratorConfig,
ALL_COLLECTIONS,
)
from compliance.services.citation_backfill import CitationBackfill, BackfillResult
from compliance.services.rag_client import get_rag_client
# Module-level logger; handlers and levels are configured by the application.
logger = logging.getLogger(__name__)
# All routes in this module are mounted under the /v1/canonical prefix.
router = APIRouter(prefix="/v1/canonical", tags=["control-generator"])
# =============================================================================
# REQUEST / RESPONSE MODELS
# =============================================================================
class GenerateRequest(BaseModel):
    """Request body for POST /generate."""
    domain: Optional[str] = None  # optional domain filter forwarded to the pipeline
    collections: Optional[List[str]] = None  # collections to scan; None lets the pipeline decide
    max_controls: int = 50  # upper bound on controls generated in one run
    batch_size: int = 5  # chunks processed per pipeline batch
    skip_web_search: bool = False  # skip the web-search step in the pipeline
    dry_run: bool = False  # if True, the endpoint runs synchronously and returns the result inline
class GenerateResponse(BaseModel):
    """Response body for POST /generate.

    For background runs only job_id/status/message are populated; the
    counters are filled in for dry runs, which execute synchronously.
    """
    job_id: str
    status: str
    message: str
    total_chunks_scanned: int = 0
    controls_generated: int = 0
    controls_verified: int = 0
    controls_needs_review: int = 0
    controls_too_close: int = 0
    controls_duplicates_found: int = 0
    errors: list = []  # mutable default is safe: pydantic copies field defaults per instance
    controls: list = []  # generated controls, returned for dry runs
class ReviewRequest(BaseModel):
    """Request body for POST /generate/review/{control_id}."""
    action: str  # "approve", "reject", "needs_rework"
    release_state: Optional[str] = None  # Override release_state (used for "approve" only)
    notes: Optional[str] = None  # reviewer notes; NOTE(review): not persisted by review_control
class ProcessedStats(BaseModel):
    """Shape of one entry in GET /generate/processed-stats.

    NOTE(review): not wired up as a response_model in this module; kept as
    documentation of the per-collection stats dict built by the endpoint.
    """
    collection: str
    total_chunks_estimated: int
    processed_chunks: int
    pending_chunks: int
    direct_adopted: int
    llm_reformed: int
    skipped: int
class BlockedSourceResponse(BaseModel):
    """Shape of one entry in GET /blocked-sources.

    NOTE(review): not wired up as a response_model in this module; kept as
    documentation of the per-source dict built by the endpoint.
    """
    id: str
    regulation_code: str
    document_title: str
    reason: str
    deletion_status: str
    qdrant_collection: Optional[str] = None
    marked_at: str
# =============================================================================
# ENDPOINTS
# =============================================================================
async def _run_pipeline_background(config: GeneratorConfig, job_id: str):
    """Run the generation pipeline in the background.

    Opens its own DB session (the request-scoped one is gone by the time
    this task runs). On pipeline failure the job row is marked 'failed' on
    a best-effort basis; a failure of that bookkeeping update is logged but
    never raised, so the task always exits cleanly.
    """
    db = SessionLocal()
    try:
        config.existing_job_id = job_id
        pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
        result = await pipeline.run(config)
        logger.info(
            "Background generation job %s completed: %d controls from %d chunks",
            job_id, result.controls_generated, result.total_chunks_scanned,
        )
    except Exception as e:
        # logger.exception preserves the traceback, which error() dropped.
        logger.exception("Background generation job %s failed", job_id)
        # Best effort: mark the job as failed so pollers are not stuck on 'running'.
        try:
            # The session may be in an aborted transaction after the failure;
            # roll back first so the UPDATE below can actually run.
            db.rollback()
            db.execute(
                text("""
                    UPDATE canonical_generation_jobs
                    SET status = 'failed', errors = :errors, completed_at = NOW()
                    WHERE id = CAST(:job_id AS uuid)
                """),
                {"job_id": job_id, "errors": json.dumps([str(e)])},
            )
            db.commit()
        except Exception as update_err:
            # Do not swallow silently: at least record why the status update failed.
            logger.error("Could not mark job %s as failed: %s", job_id, update_err)
    finally:
        db.close()
# Strong references to fire-and-forget generation tasks. asyncio keeps only
# weak references to tasks, so without this a running background job could be
# garbage-collected mid-flight.
_generation_tasks: set = set()

@router.post("/generate", response_model=GenerateResponse)
async def start_generation(req: GenerateRequest):
    """Start a control generation run (runs in background).

    Returns immediately with job_id. Use GET /generate/status/{job_id} to poll
    progress. With dry_run=True the pipeline executes synchronously and the
    full result (including generated controls) is returned in the response.
    """
    config = GeneratorConfig(
        collections=req.collections,
        domain=req.domain,
        batch_size=req.batch_size,
        max_controls=req.max_controls,
        skip_web_search=req.skip_web_search,
        dry_run=req.dry_run,
    )
    if req.dry_run:
        # Dry run: execute synchronously and return controls.
        db = SessionLocal()
        try:
            pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
            result = await pipeline.run(config)
            return GenerateResponse(
                job_id=result.job_id,
                status=result.status,
                message=f"Dry run: {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
                total_chunks_scanned=result.total_chunks_scanned,
                controls_generated=result.controls_generated,
                controls_verified=result.controls_verified,
                controls_needs_review=result.controls_needs_review,
                controls_too_close=result.controls_too_close,
                controls_duplicates_found=result.controls_duplicates_found,
                errors=result.errors,
                controls=result.controls,
            )
        except Exception as e:
            logger.error("Dry run failed: %s", e)
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            db.close()
    # Create the job record first so we can return its ID to the caller.
    db = SessionLocal()
    try:
        result = db.execute(
            text("""
                INSERT INTO canonical_generation_jobs (status, config)
                VALUES ('running', :config)
                RETURNING id
            """),
            {"config": json.dumps(config.model_dump())},
        )
        db.commit()
        row = result.fetchone()
        job_id = str(row[0]) if row else None
    except Exception as e:
        logger.error("Failed to create job: %s", e)
        raise HTTPException(status_code=500, detail=f"Failed to create job: {e}")
    finally:
        db.close()
    if not job_id:
        raise HTTPException(status_code=500, detail="Failed to create job record")
    # Launch the pipeline in background, keeping a strong reference to the
    # task until it finishes (see _generation_tasks above).
    task = asyncio.create_task(_run_pipeline_background(config, job_id))
    _generation_tasks.add(task)
    task.add_done_callback(_generation_tasks.discard)
    return GenerateResponse(
        job_id=job_id,
        status="running",
        message="Generation started in background. Poll /generate/status/{job_id} for progress.",
    )
@router.get("/generate/status/{job_id}")
async def get_job_status(job_id: str):
    """Return the stored state of a single generation job, or 404."""
    session = SessionLocal()
    try:
        rows = session.execute(
            text("SELECT * FROM canonical_generation_jobs WHERE id = CAST(:id AS uuid)"),
            {"id": job_id},
        )
        record = rows.fetchone()
        if record is None:
            raise HTTPException(status_code=404, detail="Job not found")
        job = dict(zip(rows.keys(), record))
        # Timestamps are not JSON-serializable as-is; render them as strings.
        for ts_field in ("started_at", "completed_at", "created_at"):
            if job.get(ts_field):
                job[ts_field] = str(job[ts_field])
        job["id"] = str(job["id"])
        return job
    finally:
        session.close()
@router.get("/generate/jobs")
async def list_jobs(
    limit: int = Query(20, ge=1, le=100),
    offset: int = Query(0, ge=0),
):
    """List generation jobs, newest first, paged via limit/offset."""
    session = SessionLocal()
    try:
        rows = session.execute(
            text("""
                SELECT id, status, total_chunks_scanned, controls_generated,
                       controls_verified, controls_needs_review, controls_too_close,
                       controls_duplicates_found, created_at, completed_at
                FROM canonical_generation_jobs
                ORDER BY created_at DESC
                LIMIT :limit OFFSET :offset
            """),
            {"limit": limit, "offset": offset},
        )
        columns = rows.keys()
        jobs = []
        for record in rows:
            entry = dict(zip(columns, record))
            entry["id"] = str(entry["id"])
            for ts_field in ("created_at", "completed_at"):
                if entry.get(ts_field):
                    entry[ts_field] = str(entry[ts_field])
            jobs.append(entry)
        # NOTE: "total" is the size of this page, not the overall row count.
        return {"jobs": jobs, "total": len(jobs)}
    finally:
        session.close()
@router.get("/generate/review-queue")
async def get_review_queue(
    release_state: str = Query("needs_review", regex="^(needs_review|too_close|duplicate)$"),
    limit: int = Query(50, ge=1, le=200),
):
    """Return generated controls waiting on a manual review decision."""
    session = SessionLocal()
    try:
        rows = session.execute(
            text("""
                SELECT c.id, c.control_id, c.title, c.objective, c.severity,
                       c.release_state, c.license_rule, c.customer_visible,
                       c.generation_metadata, c.open_anchors, c.tags,
                       c.created_at
                FROM canonical_controls c
                WHERE c.release_state = :state
                ORDER BY c.created_at DESC
                LIMIT :limit
            """),
            {"state": release_state, "limit": limit},
        )
        columns = rows.keys()
        controls = []
        for record in rows:
            item = dict(zip(columns, record))
            item["id"] = str(item["id"])
            item["created_at"] = str(item["created_at"])
            # JSON columns may come back as raw strings; decode when possible,
            # otherwise leave the raw string in place.
            for json_field in ("generation_metadata", "open_anchors", "tags"):
                raw = item.get(json_field)
                if isinstance(raw, str):
                    try:
                        item[json_field] = json.loads(raw)
                    except (json.JSONDecodeError, TypeError):
                        pass
            controls.append(item)
        return {"controls": controls, "total": len(controls)}
    finally:
        session.close()
@router.post("/generate/review/{control_id}")
async def review_control(control_id: str, req: ReviewRequest):
    """Apply a reviewer decision (approve/reject/needs_rework) to a control."""
    session = SessionLocal()
    try:
        # The control must exist and currently be in a reviewable state.
        found = session.execute(
            text("SELECT id, release_state FROM canonical_controls WHERE control_id = :cid"),
            {"cid": control_id},
        ).fetchone()
        if found is None:
            raise HTTPException(status_code=404, detail="Control not found")
        current_state = found[1]
        if current_state not in ("needs_review", "too_close", "duplicate"):
            raise HTTPException(status_code=400, detail=f"Control is in state '{current_state}', not reviewable")
        # Map the reviewer action onto the resulting release_state.
        if req.action == "approve":
            new_state = req.release_state or "draft"
        else:
            fixed_actions = {"reject": "deprecated", "needs_rework": "needs_review"}
            if req.action not in fixed_actions:
                raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")
            new_state = fixed_actions[req.action]
        allowed = ("draft", "review", "approved", "deprecated", "needs_review", "too_close", "duplicate")
        if new_state not in allowed:
            raise HTTPException(status_code=400, detail=f"Invalid release_state: {new_state}")
        session.execute(
            text("""
                UPDATE canonical_controls
                SET release_state = :state, updated_at = NOW()
                WHERE control_id = :cid
            """),
            {"state": new_state, "cid": control_id},
        )
        session.commit()
        return {"control_id": control_id, "release_state": new_state, "action": req.action}
    finally:
        session.close()
@router.get("/generate/processed-stats")
async def get_processed_stats():
    """Return per-collection chunk-processing statistics."""
    session = SessionLocal()
    try:
        rows = session.execute(
            text("""
                SELECT
                    collection,
                    COUNT(*) as processed_chunks,
                    COUNT(*) FILTER (WHERE processing_path = 'structured') as direct_adopted,
                    COUNT(*) FILTER (WHERE processing_path = 'llm_reform') as llm_reformed,
                    COUNT(*) FILTER (WHERE processing_path = 'skipped') as skipped
                FROM canonical_processed_chunks
                GROUP BY collection
                ORDER BY collection
            """)
        )
        columns = rows.keys()
        stats = []
        for record in rows:
            entry = dict(zip(columns, record))
            # Real totals would require the Qdrant API; 0 is a placeholder.
            entry["total_chunks_estimated"] = 0
            entry["pending_chunks"] = 0
            stats.append(entry)
        return {"stats": stats}
    finally:
        session.close()
# =============================================================================
# BLOCKED SOURCES
# =============================================================================
@router.get("/blocked-sources")
async def list_blocked_sources():
    """List every blocked (Rule 3) source, most recently marked first."""
    session = SessionLocal()
    try:
        rows = session.execute(
            text("""
                SELECT id, regulation_code, document_title, reason,
                       deletion_status, qdrant_collection, marked_at
                FROM canonical_blocked_sources
                ORDER BY marked_at DESC
            """)
        )
        columns = rows.keys()
        sources = []
        for record in rows:
            entry = dict(zip(columns, record))
            # UUID and timestamp need explicit conversion for JSON output.
            entry["id"] = str(entry["id"])
            entry["marked_at"] = str(entry["marked_at"])
            sources.append(entry)
        return {"sources": sources}
    finally:
        session.close()
@router.post("/blocked-sources/cleanup")
async def start_cleanup():
    """Start cleanup workflow for blocked sources.

    Marks all pending blocked sources for deletion. Actual RAG chunk deletion
    and file removal remain a separate manual step.
    """
    session = SessionLocal()
    try:
        rows = session.execute(
            text("""
                UPDATE canonical_blocked_sources
                SET deletion_status = 'marked_for_deletion'
                WHERE deletion_status = 'pending'
                RETURNING regulation_code
            """)
        )
        regulation_codes = [record[0] for record in rows]
        session.commit()
        return {
            "status": "marked_for_deletion",
            "marked_count": len(regulation_codes),
            "regulation_codes": regulation_codes,
            "message": "Sources marked for deletion. Run manual cleanup to remove RAG chunks and files.",
        }
    finally:
        session.close()
# =============================================================================
# CUSTOMER VIEW FILTER
# =============================================================================
@router.get("/controls-customer")
async def get_controls_customer_view(
    severity: Optional[str] = Query(None),
    domain: Optional[str] = Query(None),
):
    """Controls filtered for customer visibility.

    Rule 3 controls have source_citation and source_original_text hidden;
    generation_metadata is NEVER shown to customers.
    """
    session = SessionLocal()
    try:
        sql = """
            SELECT c.id, c.control_id, c.title, c.objective, c.rationale,
                   c.scope, c.requirements, c.test_procedure, c.evidence,
                   c.severity, c.risk_score, c.implementation_effort,
                   c.open_anchors, c.release_state, c.tags,
                   c.license_rule, c.customer_visible,
                   c.source_original_text, c.source_citation,
                   c.created_at, c.updated_at
            FROM canonical_controls c
            WHERE c.release_state IN ('draft', 'approved')
        """
        bind_params: dict = {}
        if severity:
            sql += " AND c.severity = :severity"
            bind_params["severity"] = severity
        if domain:
            # The domain filter matches the control_id prefix, e.g. "AUTH-%".
            sql += " AND c.control_id LIKE :domain"
            bind_params["domain"] = f"{domain.upper()}-%"
        sql += " ORDER BY c.control_id"
        rows = session.execute(text(sql), bind_params)
        columns = rows.keys()
        controls = []
        for record in rows:
            item = dict(zip(columns, record))
            item["id"] = str(item["id"])
            for ts_field in ("created_at", "updated_at"):
                if item.get(ts_field):
                    item[ts_field] = str(item[ts_field])
            # JSON columns may arrive as raw strings; decode where possible.
            for json_field in ("scope", "requirements", "test_procedure", "evidence",
                               "open_anchors", "tags", "source_citation"):
                raw = item.get(json_field)
                if isinstance(raw, str):
                    try:
                        item[json_field] = json.loads(raw)
                    except (json.JSONDecodeError, TypeError):
                        pass
            # Customer visibility rules:
            # - NEVER show generation_metadata (defensive pop; not in SELECT above).
            # - Rule 3 (customer_visible = false): hide citation and original text.
            item.pop("generation_metadata", None)
            if not item.get("customer_visible", True):
                item["source_citation"] = None
                item["source_original_text"] = None
            controls.append(item)
        return {"controls": controls, "total": len(controls)}
    finally:
        session.close()
# =============================================================================
# CITATION BACKFILL
# =============================================================================
class BackfillRequest(BaseModel):
    """Request body for POST /generate/backfill-citations."""
    dry_run: bool = True  # Default to dry_run for safety (preview only, no DB changes)
    limit: int = 0  # 0 = all controls
class BackfillResponse(BaseModel):
    """Response body for POST /generate/backfill-citations.

    The immediate response carries only `status`; the counters are filled in
    by the background task and polled via GET /generate/backfill-status/{id}.
    """
    status: str
    total_controls: int = 0
    matched_hash: int = 0
    matched_regex: int = 0
    matched_llm: int = 0
    unmatched: int = 0
    updated: int = 0
    errors: list = []
# In-memory status registry for citation-backfill jobs, keyed by short id.
# NOTE(review): process-local and unbounded — entries persist until restart.
_backfill_status: dict = {}

async def _run_backfill_background(dry_run: bool, limit: int, backfill_id: str):
    """Run the citation backfill in the background with its own DB session.

    The outcome is published to _backfill_status[backfill_id] so it can be
    polled via GET /generate/backfill-status/{backfill_id}.
    """
    db = SessionLocal()
    try:
        backfill = CitationBackfill(db=db, rag_client=get_rag_client())
        result = await backfill.run(dry_run=dry_run, limit=limit)
        _backfill_status[backfill_id] = {
            "status": "completed",
            "total_controls": result.total_controls,
            "matched_hash": result.matched_hash,
            "matched_regex": result.matched_regex,
            "matched_llm": result.matched_llm,
            "unmatched": result.unmatched,
            "updated": result.updated,
            "errors": result.errors[:50],  # cap so the status payload stays small
        }
        logger.info("Backfill %s completed: %d updated", backfill_id, result.updated)
    except Exception as e:
        # logger.exception preserves the traceback, which plain error() dropped.
        logger.exception("Backfill %s failed", backfill_id)
        _backfill_status[backfill_id] = {"status": "failed", "errors": [str(e)]}
    finally:
        db.close()
# Strong references to running citation-backfill tasks; asyncio itself keeps
# only weak references, so an unreferenced task can be garbage-collected.
_citation_backfill_tasks: set = set()

@router.post("/generate/backfill-citations", response_model=BackfillResponse)
async def start_backfill(req: BackfillRequest):
    """Backfill article/paragraph into existing control source_citations.

    Uses 3-tier matching: hash lookup → regex parse → Ollama LLM.
    Default is dry_run=True (preview only, no DB changes).
    """
    import uuid
    backfill_id = str(uuid.uuid4())[:8]
    _backfill_status[backfill_id] = {"status": "running"}
    # Always run in background (RAG index build takes minutes). Retain a
    # strong reference so the task is not garbage-collected mid-run.
    task = asyncio.create_task(_run_backfill_background(req.dry_run, req.limit, backfill_id))
    _citation_backfill_tasks.add(task)
    task.add_done_callback(_citation_backfill_tasks.discard)
    return BackfillResponse(
        status=f"running (id={backfill_id})",
    )
@router.get("/generate/backfill-status/{backfill_id}")
async def get_backfill_status(backfill_id: str):
    """Return the recorded status of a citation-backfill job, or 404."""
    if job_status := _backfill_status.get(backfill_id):
        return job_status
    raise HTTPException(status_code=404, detail="Backfill job not found")
# =============================================================================
# DOMAIN + TARGET AUDIENCE BACKFILL
# =============================================================================
class DomainBackfillRequest(BaseModel):
    """Request body for POST /generate/backfill-domain."""
    dry_run: bool = True  # Preview only; no DB writes
    job_id: Optional[str] = None  # Only backfill controls from this generation job
    limit: int = 0  # 0 = all matching controls
# In-memory status registry for domain-backfill jobs, keyed by short id.
_domain_backfill_status: dict = {}

async def _run_domain_backfill(req: DomainBackfillRequest, backfill_id: str):
    """Backfill domain, category, and target_audience for existing controls using Anthropic.

    Selects controls whose target_audience is missing/empty (optionally scoped
    to one generation job), classifies them in batches of 10 via the Anthropic
    Messages API, and — unless req.dry_run — writes category/target_audience
    back to canonical_controls. Progress is published to
    _domain_backfill_status[backfill_id] after every batch.
    """
    import os
    import httpx
    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
    ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
    if not ANTHROPIC_API_KEY:
        # Without credentials there is nothing to do; record the failure for pollers.
        _domain_backfill_status[backfill_id] = {
            "status": "failed", "error": "ANTHROPIC_API_KEY not set"
        }
        return
    db = SessionLocal()
    try:
        # Find controls needing backfill (target_audience missing or empty).
        where_clauses = ["(target_audience IS NULL OR target_audience = '[]' OR target_audience = 'null')"]
        params: dict = {}
        if req.job_id:
            where_clauses.append("generation_metadata->>'job_id' = :job_id")
            params["job_id"] = req.job_id
        query = f"""
            SELECT id, control_id, title, objective, category, source_original_text, tags
            FROM canonical_controls
            WHERE {' AND '.join(where_clauses)}
            ORDER BY control_id
        """
        if req.limit > 0:
            # req.limit is a validated int from the pydantic model, so inlining it is safe.
            query += f" LIMIT {req.limit}"
        result = db.execute(text(query), params)
        controls = [dict(zip(result.keys(), row)) for row in result]
        total = len(controls)
        updated = 0
        errors = []
        _domain_backfill_status[backfill_id] = {
            "status": "running", "total": total, "updated": 0, "errors": []
        }
        # Process in batches of 10 controls per LLM request.
        BATCH_SIZE = 10
        for batch_start in range(0, total, BATCH_SIZE):
            batch = controls[batch_start:batch_start + BATCH_SIZE]
            # Build one prompt entry per control; texts are truncated to keep
            # the request token count bounded.
            entries = []
            for idx, ctrl in enumerate(batch):
                text_for_analysis = ctrl.get("objective") or ctrl.get("title") or ""
                original = ctrl.get("source_original_text") or ""
                if original:
                    text_for_analysis += f"\n\nQuelltext-Auszug: {original[:500]}"
                entries.append(
                    f"--- CONTROL {idx + 1}: {ctrl['control_id']} ---\n"
                    f"Titel: {ctrl.get('title', '')}\n"
                    f"Objective: {text_for_analysis[:800]}\n"
                    f"Tags: {json.dumps(ctrl.get('tags', []))}"
                )
            # NOTE(review): entries are joined with no separator, so consecutive
            # control blocks run together on one line — confirm intended.
            prompt = f"""Analysiere die folgenden {len(batch)} Controls und bestimme fuer jedes:
1. domain: Das Fachgebiet (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT)
2. category: Die Kategorie (encryption, authentication, network, data_protection, logging, incident, continuity, compliance, supply_chain, physical, personnel, application, system, risk, governance, hardware, identity, public_administration, labor_law, finance, trade_regulation, environmental, health)
3. target_audience: Liste der Zielgruppen (moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
Antworte mit einem JSON-Array mit {len(batch)} Objekten. Jedes Objekt hat:
- control_index: 1-basierter Index
- domain: Fachgebiet-Kuerzel
- category: Kategorie
- target_audience: Liste der Zielgruppen
{"".join(entries)}"""
            try:
                headers = {
                    "x-api-key": ANTHROPIC_API_KEY,
                    "anthropic-version": "2023-06-01",
                    "content-type": "application/json",
                }
                payload = {
                    "model": ANTHROPIC_MODEL,
                    "max_tokens": 4096,
                    "system": "Du bist ein Compliance-Experte. Klassifiziere Controls nach Fachgebiet und Zielgruppe. Antworte NUR mit validem JSON.",
                    "messages": [{"role": "user", "content": prompt}],
                }
                async with httpx.AsyncClient(timeout=60.0) as client:
                    resp = await client.post(
                        "https://api.anthropic.com/v1/messages",
                        headers=headers,
                        json=payload,
                    )
                if resp.status_code != 200:
                    errors.append(f"Anthropic API {resp.status_code} at batch {batch_start}")
                    continue
                raw = resp.json().get("content", [{}])[0].get("text", "")
                # Parse response: take the first JSON array found in the model output.
                import re
                bracket_match = re.search(r"\[.*\]", raw, re.DOTALL)
                if not bracket_match:
                    errors.append(f"No JSON array in response at batch {batch_start}")
                    continue
                results_list = json.loads(bracket_match.group(0))
                for item in results_list:
                    # control_index is 1-based per the prompt contract; ignore
                    # out-of-range indices rather than failing the batch.
                    idx = item.get("control_index", 0) - 1
                    if idx < 0 or idx >= len(batch):
                        continue
                    ctrl = batch[idx]
                    ctrl_id = str(ctrl["id"])
                    new_domain = item.get("domain", "")
                    new_category = item.get("category", "")
                    new_audience = item.get("target_audience", [])
                    if not isinstance(new_audience, list):
                        new_audience = []
                    # Build new control_id prefix from domain if domain changed.
                    # (Computed but never written back — see the note below.)
                    old_prefix = ctrl["control_id"].split("-")[0] if ctrl["control_id"] else ""
                    new_prefix = new_domain.upper()[:4] if new_domain else old_prefix
                    if not req.dry_run:
                        update_parts = []
                        update_params: dict = {"ctrl_id": ctrl_id}
                        if new_category:
                            update_parts.append("category = :category")
                            update_params["category"] = new_category
                        if new_audience:
                            update_parts.append("target_audience = :target_audience")
                            update_params["target_audience"] = json.dumps(new_audience)
                        # Note: We do NOT rename control_ids here — that would
                        # break references and cause unique constraint violations.
                        if update_parts:
                            update_parts.append("updated_at = NOW()")
                            db.execute(
                                text(f"UPDATE canonical_controls SET {', '.join(update_parts)} WHERE id = CAST(:ctrl_id AS uuid)"),
                                update_params,
                            )
                    # NOTE(review): counted outside the dry_run guard, so dry
                    # runs report the number of would-be updates.
                    updated += 1
                if not req.dry_run:
                    db.commit()
            except Exception as e:
                errors.append(f"Batch {batch_start}: {str(e)}")
                db.rollback()
            # Publish progress after every batch so pollers see movement.
            _domain_backfill_status[backfill_id] = {
                "status": "running", "total": total, "updated": updated,
                "progress": f"{min(batch_start + BATCH_SIZE, total)}/{total}",
                "errors": errors[-10:],
            }
        _domain_backfill_status[backfill_id] = {
            "status": "completed", "total": total, "updated": updated,
            "errors": errors[-50:],
        }
        logger.info("Domain backfill %s completed: %d/%d updated", backfill_id, updated, total)
    except Exception as e:
        logger.error("Domain backfill %s failed: %s", backfill_id, e)
        _domain_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
    finally:
        db.close()
# Strong references to running domain-backfill tasks; asyncio keeps only weak
# references, so an unreferenced task can be garbage-collected mid-run.
_domain_backfill_tasks: set = set()

@router.post("/generate/backfill-domain")
async def start_domain_backfill(req: DomainBackfillRequest):
    """Backfill domain, category, and target_audience for controls using Anthropic API.

    Finds controls where target_audience is NULL and enriches them.
    Default is dry_run=True (preview only).
    """
    import uuid
    backfill_id = str(uuid.uuid4())[:8]
    _domain_backfill_status[backfill_id] = {"status": "starting"}
    task = asyncio.create_task(_run_domain_backfill(req, backfill_id))
    _domain_backfill_tasks.add(task)
    task.add_done_callback(_domain_backfill_tasks.discard)
    # Fix: direct pollers to the domain-backfill status route. The previous
    # message pointed at /generate/backfill-status/, which reads the citation
    # backfill registry and would 404 for this id.
    return {"status": "running", "backfill_id": backfill_id,
            "message": f"Domain backfill started. Poll /generate/domain-backfill-status/{backfill_id}"}
@router.get("/generate/domain-backfill-status/{backfill_id}")
async def get_domain_backfill_status(backfill_id: str):
    """Return the recorded status of a domain-backfill job, or 404."""
    if job_status := _domain_backfill_status.get(backfill_id):
        return job_status
    raise HTTPException(status_code=404, detail="Domain backfill job not found")