Files
breakpilot-lehrer/klausur-service/backend/legal_corpus_pipeline.py
Benjamin Admin 34da9f4cda [split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:01:18 +02:00

207 lines
5.7 KiB
Python

"""
Legal Corpus API - Pipeline Routes
Pipeline checkpoints, history, start/stop, and status endpoints.
Extracted from legal_corpus_api.py to keep files under 500 LOC.
"""
import os
import asyncio
from datetime import datetime
from fastapi import APIRouter, HTTPException, BackgroundTasks
from pydantic import BaseModel
import logging
logger = logging.getLogger(__name__)
class StartPipelineRequest(BaseModel):
    """Request body for POST /api/v1/admin/pipeline/start."""
    # Forwarded to the pipeline script as --force-reindex when True.
    force_reindex: bool = False
    # Forwarded to the pipeline script as --skip-ingestion when True.
    skip_ingestion: bool = False
# Create a separate router for pipeline-related endpoints.
# All routes below are mounted under /api/v1/admin/pipeline and tagged
# "pipeline" so they can be included independently of the main
# legal-corpus router by the application entry point.
pipeline_router = APIRouter(prefix="/api/v1/admin/pipeline", tags=["pipeline"])
@pipeline_router.get("/checkpoints")
async def get_pipeline_checkpoints():
    """
    Get current pipeline checkpoint state.

    Returns the current state of the compliance pipeline including:
    - Pipeline ID and overall status
    - Start and completion times
    - All checkpoints with their validations and metrics
    - Summary data

    The state dict is enriched with a ``validation_summary`` tallying
    passed / warning / failed validations across all checkpoints.
    """
    from pipeline_checkpoints import CheckpointManager

    state = CheckpointManager.load_state()
    if state is None:
        # No pipeline run has been recorded yet — return a placeholder.
        return {
            "status": "no_data",
            "message": "No pipeline run data available yet.",
            "pipeline_id": None,
            "checkpoints": [],
            "summary": {},
        }

    # Flatten every validation across all checkpoints and tally outcomes.
    tally = {"passed": 0, "warning": 0, "failed": 0, "total": 0}
    all_validations = (
        validation
        for checkpoint in state.get("checkpoints", [])
        for validation in checkpoint.get("validations", [])
    )
    for validation in all_validations:
        tally["total"] += 1
        outcome = validation.get("status", "not_run")
        if outcome in tally:
            tally[outcome] += 1

    state["validation_summary"] = tally
    return state
@pipeline_router.get("/checkpoints/history")
async def get_pipeline_history():
    """
    Get list of previous pipeline runs (if stored).
    For now, returns only current run.
    """
    from pipeline_checkpoints import CheckpointManager

    state = CheckpointManager.load_state()
    if state is None:
        return {"runs": []}

    # Project only the summary fields of the (single) current run.
    run_fields = ("pipeline_id", "status", "started_at", "completed_at")
    return {"runs": [{field: state.get(field) for field in run_fields}]}
# Pipeline state for start/stop.
# In-process bookkeeping for the background pipeline run:
#   running    - True while a launch/run initiated by this process is active
#   pid        - PID of the spawned pipeline subprocess, or None
#   started_at - ISO-8601 timestamp of the last start request, or None
# NOTE(review): this is per-process state — it does not survive restarts and
# is not shared across workers; cross-process status comes from the
# CheckpointManager state file instead.
pipeline_process_state = {
    "running": False,
    "pid": None,
    "started_at": None,
}
@pipeline_router.post("/start")
async def start_pipeline(request: StartPipelineRequest, background_tasks: BackgroundTasks):
    """
    Start the compliance pipeline in the background.

    This runs the full_compliance_pipeline.py script which:
    1. Ingests all legal documents (unless skip_ingestion=True)
    2. Extracts requirements and controls
    3. Generates compliance measures
    4. Creates checkpoint data for monitoring

    Raises:
        HTTPException(409): if a run is already active, either according to
            the persisted checkpoint state or this process's own flag.
    """
    from pipeline_checkpoints import CheckpointManager

    # Guard 1: the persisted checkpoint file reports an active run.
    checkpoint_state = CheckpointManager.load_state()
    if checkpoint_state and checkpoint_state.get("status") == "running":
        raise HTTPException(status_code=409, detail="Pipeline is already running")

    # Guard 2: this process already has a launch in flight.
    if pipeline_process_state["running"]:
        raise HTTPException(status_code=409, detail="Pipeline start already in progress")

    # Mark the launch before scheduling so concurrent requests are rejected.
    # (No `global` needed — the dict is mutated, never rebound.)
    started = datetime.now().isoformat()
    pipeline_process_state["running"] = True
    pipeline_process_state["started_at"] = started

    background_tasks.add_task(
        run_pipeline_background, request.force_reindex, request.skip_ingestion
    )

    return {
        "status": "starting",
        "message": "Compliance pipeline is starting in background",
        "started_at": started,
    }
async def run_pipeline_background(force_reindex: bool, skip_ingestion: bool):
    """Background task to run the compliance pipeline.

    Spawns ``full_compliance_pipeline.py`` as a subprocess and waits for it
    without blocking the event loop.

    Bug fixed: the previous implementation used ``subprocess.Popen`` with
    ``stdout=PIPE`` / ``stderr=STDOUT`` but only read the pipe after the
    process exited, polling ``poll()`` every 5 seconds in between. A pipeline
    writing more output than the OS pipe buffer (~64 KiB) would block on a
    full pipe and never exit — a deadlock. ``asyncio``'s ``communicate()``
    drains stdout concurrently while waiting, which also removes the manual
    polling loop and keeps this coroutine fully non-blocking.

    Args:
        force_reindex: pass ``--force-reindex`` to the pipeline script.
        skip_ingestion: pass ``--skip-ingestion`` to the pipeline script.

    Side effects:
        Updates ``pipeline_process_state`` (pid while running; ``running`` and
        ``pid`` are always cleared in ``finally`` so a new run can start).
    """
    try:
        import sys

        cmd = [sys.executable, "full_compliance_pipeline.py"]
        if force_reindex:
            cmd.append("--force-reindex")
        if skip_ingestion:
            cmd.append("--skip-ingestion")
        logger.info(f"Starting pipeline: {' '.join(cmd)}")

        process = await asyncio.create_subprocess_exec(
            *cmd,
            cwd=os.path.dirname(os.path.abspath(__file__)),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )
        pipeline_process_state["pid"] = process.pid

        # communicate() concurrently drains stdout and waits for exit, so the
        # child can never stall on a full pipe buffer.
        stdout_data, _ = await process.communicate()

        if process.returncode != 0:
            output = stdout_data.decode(errors="replace") if stdout_data else ""
            logger.error(f"Pipeline failed with code {process.returncode}: {output}")
        else:
            logger.info("Pipeline completed successfully")
    except Exception as e:
        # Best-effort background task: log and fall through to cleanup.
        logger.error(f"Failed to run pipeline: {e}")
    finally:
        pipeline_process_state["running"] = False
        pipeline_process_state["pid"] = None
@pipeline_router.get("/status")
async def get_pipeline_status():
    """Get current pipeline running status."""
    from pipeline_checkpoints import CheckpointManager

    # Derive both checkpoint-file fields with one truthiness check,
    # mirroring the per-process flags kept in pipeline_process_state.
    state = CheckpointManager.load_state()
    if state:
        checkpoint_status = state.get("status")
        current_phase = state.get("current_phase")
    else:
        checkpoint_status = "no_data"
        current_phase = None

    return {
        "process_running": pipeline_process_state["running"],
        "process_pid": pipeline_process_state["pid"],
        "process_started_at": pipeline_process_state["started_at"],
        "checkpoint_status": checkpoint_status,
        "current_phase": current_phase,
    }