backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
207 lines
5.7 KiB
Python
207 lines
5.7 KiB
Python
"""
|
|
Legal Corpus API - Pipeline Routes
|
|
|
|
Pipeline checkpoints, history, start/stop, and status endpoints.
|
|
|
|
Extracted from legal_corpus_api.py to keep files under 500 LOC.
|
|
"""
|
|
|
|
import os
|
|
import asyncio
|
|
from datetime import datetime
|
|
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
|
from pydantic import BaseModel
|
|
import logging
|
|
|
|
# Module-level logger named after this module, used by the background pipeline task below.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class StartPipelineRequest(BaseModel):
    """Request body for the pipeline start endpoint.

    Both flags default to False and are passed through to the background
    task, which turns them into command-line switches for the pipeline
    script.
    """

    # When True, the pipeline script is started with --force-reindex.
    force_reindex: bool = False
    # When True, the pipeline script is started with --skip-ingestion.
    skip_ingestion: bool = False
|
|
|
|
|
|
# Dedicated router that groups every pipeline-related admin endpoint
# under a common URL prefix and OpenAPI tag.
pipeline_router = APIRouter(
    prefix="/api/v1/admin/pipeline",
    tags=["pipeline"],
)
|
|
|
|
|
|
@pipeline_router.get("/checkpoints")
async def get_pipeline_checkpoints():
    """
    Return the stored pipeline checkpoint state, enriched with a validation tally.

    When no state has been stored yet, a placeholder payload with status
    "no_data" is returned. Otherwise the loaded state is returned with an
    extra "validation_summary" entry that counts, across all checkpoints,
    how many validations passed, warned or failed, plus the overall total.
    Validations with any other status only contribute to the total.
    """
    # Imported lazily, as in the rest of this module.
    from pipeline_checkpoints import CheckpointManager

    pipeline_state = CheckpointManager.load_state()

    # Guard clause: nothing has been recorded yet.
    if pipeline_state is None:
        return {
            "status": "no_data",
            "message": "No pipeline run data available yet.",
            "pipeline_id": None,
            "checkpoints": [],
            "summary": {},
        }

    # One counter per tracked outcome, plus the running total.
    tally = dict.fromkeys(("passed", "warning", "failed", "total"), 0)

    for entry in pipeline_state.get("checkpoints", []):
        for check in entry.get("validations", []):
            tally["total"] += 1
            outcome = check.get("status", "not_run")
            if outcome in tally:
                tally[outcome] += 1

    pipeline_state["validation_summary"] = tally
    return pipeline_state
|
|
|
|
|
|
@pipeline_router.get("/checkpoints/history")
async def get_pipeline_history():
    """
    List known pipeline runs.

    Only the currently stored run is available, so the result contains at
    most one entry; it is empty when no state has been stored.
    """
    from pipeline_checkpoints import CheckpointManager

    current = CheckpointManager.load_state()
    if current is None:
        return {"runs": []}

    # Expose only the identifying and timing fields of the run.
    exposed_fields = ("pipeline_id", "status", "started_at", "completed_at")
    run_entry = {field: current.get(field) for field in exposed_fields}
    return {"runs": [run_entry]}
|
|
|
|
|
|
# In-memory bookkeeping for the pipeline child process launched by the
# start endpoint: whether a launch is active, the child's PID, and the
# ISO timestamp of the most recent start request.
pipeline_process_state = dict(
    running=False,
    pid=None,
    started_at=None,
)
|
|
|
|
|
|
@pipeline_router.post("/start")
async def start_pipeline(request: StartPipelineRequest, background_tasks: BackgroundTasks):
    """
    Schedule the compliance pipeline to run as a background task.

    The background task launches the full_compliance_pipeline.py script which:
    1. Ingests all legal documents (unless skip_ingestion=True)
    2. Extracts requirements and controls
    3. Generates compliance measures
    4. Creates checkpoint data for monitoring

    Raises:
        HTTPException: 409 when the stored checkpoint state reports a
            running pipeline, or when a start issued through this module
            is still in progress.
    """
    from pipeline_checkpoints import CheckpointManager

    # First guard: the persisted checkpoint state says a run is active.
    checkpoint_state = CheckpointManager.load_state()
    checkpoint_says_running = (
        bool(checkpoint_state) and checkpoint_state.get("status") == "running"
    )
    if checkpoint_says_running:
        raise HTTPException(status_code=409, detail="Pipeline is already running")

    # Second guard: this process has already launched (or is launching) a run.
    if pipeline_process_state["running"]:
        raise HTTPException(status_code=409, detail="Pipeline start already in progress")

    # Mark the launch before scheduling so a second request is rejected.
    pipeline_process_state.update(
        running=True,
        started_at=datetime.now().isoformat(),
    )

    background_tasks.add_task(
        run_pipeline_background,
        request.force_reindex,
        request.skip_ingestion,
    )

    response = {
        "status": "starting",
        "message": "Compliance pipeline is starting in background",
        "started_at": pipeline_process_state["started_at"],
    }
    return response
|
|
|
|
|
|
async def run_pipeline_background(force_reindex: bool, skip_ingestion: bool):
    """Background task to run the compliance pipeline.

    Launches full_compliance_pipeline.py with the current interpreter, from
    the directory containing this file, and waits for it to exit without
    blocking the event loop.

    Args:
        force_reindex: When True, pass --force-reindex to the script.
        skip_ingestion: When True, pass --skip-ingestion to the script.

    The child's PID is published in pipeline_process_state while it runs.
    On a non-zero exit code the combined stdout/stderr is logged as an
    error. Any exception is logged (with traceback) rather than raised,
    and the process state is always reset on exit.
    """
    try:
        import subprocess
        import sys

        cmd = [sys.executable, "full_compliance_pipeline.py"]
        if force_reindex:
            cmd.append("--force-reindex")
        if skip_ingestion:
            cmd.append("--skip-ingestion")

        logger.info(f"Starting pipeline: {' '.join(cmd)}")

        process = subprocess.Popen(
            cmd,
            cwd=os.path.dirname(os.path.abspath(__file__)),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )

        pipeline_process_state["pid"] = process.pid

        # The child's output pipe must be drained while it runs: polling
        # without reading lets the OS pipe buffer fill up, at which point
        # the child blocks on write and never exits. communicate() reads
        # until EOF, waits for the child and closes the pipe; it is run in
        # a worker thread so the event loop stays responsive.
        loop = asyncio.get_running_loop()
        output, _ = await loop.run_in_executor(None, process.communicate)

        return_code = process.returncode

        if return_code != 0:
            logger.error(f"Pipeline failed with code {return_code}: {output or ''}")
        else:
            logger.info("Pipeline completed successfully")

    except Exception as e:
        # Boundary of a fire-and-forget task: log with traceback, do not raise.
        logger.exception(f"Failed to run pipeline: {e}")

    finally:
        # Always release the "running" flag so a new start is possible.
        pipeline_process_state["running"] = False
        pipeline_process_state["pid"] = None
|
|
|
|
|
|
@pipeline_router.get("/status")
async def get_pipeline_status():
    """Report the in-memory process state together with the stored checkpoint status."""
    from pipeline_checkpoints import CheckpointManager

    snapshot = CheckpointManager.load_state()

    # A missing or empty checkpoint state is reported as "no_data".
    if snapshot:
        checkpoint_status = snapshot.get("status")
        current_phase = snapshot.get("current_phase")
    else:
        checkpoint_status = "no_data"
        current_phase = None

    return {
        "process_running": pipeline_process_state["running"],
        "process_pid": pipeline_process_state["pid"],
        "process_started_at": pipeline_process_state["started_at"],
        "checkpoint_status": checkpoint_status,
        "current_phase": current_phase,
    }
|