""" Legal Corpus API - Pipeline Routes Pipeline checkpoints, history, start/stop, and status endpoints. Extracted from legal_corpus_api.py to keep files under 500 LOC. """ import os import asyncio from datetime import datetime from fastapi import APIRouter, HTTPException, BackgroundTasks from pydantic import BaseModel import logging logger = logging.getLogger(__name__) class StartPipelineRequest(BaseModel): force_reindex: bool = False skip_ingestion: bool = False # Create a separate router for pipeline-related endpoints pipeline_router = APIRouter(prefix="/api/v1/admin/pipeline", tags=["pipeline"]) @pipeline_router.get("/checkpoints") async def get_pipeline_checkpoints(): """ Get current pipeline checkpoint state. Returns the current state of the compliance pipeline including: - Pipeline ID and overall status - Start and completion times - All checkpoints with their validations and metrics - Summary data """ from pipeline_checkpoints import CheckpointManager state = CheckpointManager.load_state() if state is None: return { "status": "no_data", "message": "No pipeline run data available yet.", "pipeline_id": None, "checkpoints": [], "summary": {} } # Enrich with validation summary validation_summary = { "passed": 0, "warning": 0, "failed": 0, "total": 0 } for checkpoint in state.get("checkpoints", []): for validation in checkpoint.get("validations", []): validation_summary["total"] += 1 status = validation.get("status", "not_run") if status in validation_summary: validation_summary[status] += 1 state["validation_summary"] = validation_summary return state @pipeline_router.get("/checkpoints/history") async def get_pipeline_history(): """ Get list of previous pipeline runs (if stored). For now, returns only current run. """ from pipeline_checkpoints import CheckpointManager state = CheckpointManager.load_state() if state is None: return {"runs": []} return { "runs": [{ "pipeline_id": state.get("pipeline_id"), "status": state.get("status"), "started_at": state.get("started_at"), "completed_at": state.get("completed_at"), }] } # Pipeline state for start/stop pipeline_process_state = { "running": False, "pid": None, "started_at": None, } @pipeline_router.post("/start") async def start_pipeline(request: StartPipelineRequest, background_tasks: BackgroundTasks): """ Start the compliance pipeline in the background. This runs the full_compliance_pipeline.py script which: 1. Ingests all legal documents (unless skip_ingestion=True) 2. Extracts requirements and controls 3. Generates compliance measures 4. Creates checkpoint data for monitoring """ global pipeline_process_state from pipeline_checkpoints import CheckpointManager state = CheckpointManager.load_state() if state and state.get("status") == "running": raise HTTPException( status_code=409, detail="Pipeline is already running" ) if pipeline_process_state["running"]: raise HTTPException( status_code=409, detail="Pipeline start already in progress" ) pipeline_process_state["running"] = True pipeline_process_state["started_at"] = datetime.now().isoformat() background_tasks.add_task( run_pipeline_background, request.force_reindex, request.skip_ingestion ) return { "status": "starting", "message": "Compliance pipeline is starting in background", "started_at": pipeline_process_state["started_at"], } async def run_pipeline_background(force_reindex: bool, skip_ingestion: bool): """Background task to run the compliance pipeline.""" global pipeline_process_state try: import subprocess import sys cmd = [sys.executable, "full_compliance_pipeline.py"] if force_reindex: cmd.append("--force-reindex") if skip_ingestion: cmd.append("--skip-ingestion") logger.info(f"Starting pipeline: {' '.join(cmd)}") process = subprocess.Popen( cmd, cwd=os.path.dirname(os.path.abspath(__file__)), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, ) pipeline_process_state["pid"] = process.pid while process.poll() is None: await asyncio.sleep(5) return_code = process.returncode if return_code != 0: output = process.stdout.read() if process.stdout else "" logger.error(f"Pipeline failed with code {return_code}: {output}") else: logger.info("Pipeline completed successfully") except Exception as e: logger.error(f"Failed to run pipeline: {e}") finally: pipeline_process_state["running"] = False pipeline_process_state["pid"] = None @pipeline_router.get("/status") async def get_pipeline_status(): """Get current pipeline running status.""" from pipeline_checkpoints import CheckpointManager state = CheckpointManager.load_state() checkpoint_status = state.get("status") if state else "no_data" return { "process_running": pipeline_process_state["running"], "process_pid": pipeline_process_state["pid"], "process_started_at": pipeline_process_state["started_at"], "checkpoint_status": checkpoint_status, "current_phase": state.get("current_phase") if state else None, }