[split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
206
klausur-service/backend/legal_corpus_pipeline.py
Normal file
206
klausur-service/backend/legal_corpus_pipeline.py
Normal file
@@ -0,0 +1,206 @@
|
||||
"""
|
||||
Legal Corpus API - Pipeline Routes
|
||||
|
||||
Pipeline checkpoints, history, start/stop, and status endpoints.
|
||||
|
||||
Extracted from legal_corpus_api.py to keep files under 500 LOC.
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StartPipelineRequest(BaseModel):
    """Request body for POST /api/v1/admin/pipeline/start."""

    # Forwarded to the pipeline script as --force-reindex — presumably
    # re-indexes documents even if already indexed; confirm against
    # full_compliance_pipeline.py.
    force_reindex: bool = False
    # Forwarded as --skip-ingestion; skips the document ingestion phase.
    skip_ingestion: bool = False
|
||||
|
||||
|
||||
# Create a separate router for pipeline-related endpoints.
# All routes below are served under /api/v1/admin/pipeline; the module
# docstring says this was extracted from legal_corpus_api.py, so that
# module presumably mounts this router — verify when wiring the app.
pipeline_router = APIRouter(prefix="/api/v1/admin/pipeline", tags=["pipeline"])
|
||||
|
||||
|
||||
@pipeline_router.get("/checkpoints")
async def get_pipeline_checkpoints():
    """
    Get current pipeline checkpoint state.

    Returns the current state of the compliance pipeline including:
    - Pipeline ID and overall status
    - Start and completion times
    - All checkpoints with their validations and metrics
    - Summary data

    The returned state is additionally enriched with a
    ``validation_summary`` dict tallying passed/warning/failed counts
    across all checkpoint validations.
    """
    # Imported lazily so the module loads even if the checkpoint
    # machinery is unavailable at import time.
    from pipeline_checkpoints import CheckpointManager

    state = CheckpointManager.load_state()

    # No run has been recorded yet — return an explicit empty payload.
    if state is None:
        return {
            "status": "no_data",
            "message": "No pipeline run data available yet.",
            "pipeline_id": None,
            "checkpoints": [],
            "summary": {},
        }

    # Flatten all validations across checkpoints, then tally outcomes.
    all_validations = [
        validation
        for checkpoint in state.get("checkpoints", [])
        for validation in checkpoint.get("validations", [])
    ]

    tally = {"passed": 0, "warning": 0, "failed": 0, "total": 0}
    tally["total"] = len(all_validations)
    for validation in all_validations:
        outcome = validation.get("status", "not_run")
        # Outcomes outside the known buckets (e.g. "not_run") are
        # counted in "total" but not broken out.
        if outcome in tally:
            tally[outcome] += 1

    state["validation_summary"] = tally
    return state
|
||||
|
||||
|
||||
@pipeline_router.get("/checkpoints/history")
async def get_pipeline_history():
    """
    Get list of previous pipeline runs (if stored).
    For now, returns only current run.
    """
    # Lazy import keeps module import cheap and decoupled.
    from pipeline_checkpoints import CheckpointManager

    state = CheckpointManager.load_state()
    if state is None:
        return {"runs": []}

    # Project the stored state down to the run-summary fields.
    run_summary = {
        field: state.get(field)
        for field in ("pipeline_id", "status", "started_at", "completed_at")
    }
    return {"runs": [run_summary]}
|
||||
|
||||
|
||||
# Pipeline state for start/stop.
# In-process tracker for the spawned pipeline subprocess.
# NOTE(review): this is per-interpreter state — with multiple worker
# processes each worker holds its own copy; confirm the app runs
# single-worker before relying on it for mutual exclusion.
pipeline_process_state = {
    "running": False,    # True while a background run is in flight
    "pid": None,         # OS pid of the spawned pipeline subprocess
    "started_at": None,  # ISO-8601 timestamp set when a run starts
}
|
||||
|
||||
|
||||
@pipeline_router.post("/start")
async def start_pipeline(request: StartPipelineRequest, background_tasks: BackgroundTasks):
    """
    Start the compliance pipeline in the background.

    This runs the full_compliance_pipeline.py script which:
    1. Ingests all legal documents (unless skip_ingestion=True)
    2. Extracts requirements and controls
    3. Generates compliance measures
    4. Creates checkpoint data for monitoring

    Raises:
        HTTPException(409): if the checkpoint state reports a running
            pipeline, or a background start is already in flight.
    """
    # Lazy import, mirroring the other endpoints in this module.
    from pipeline_checkpoints import CheckpointManager

    # Guard 1: checkpoint state on disk says a pipeline is running.
    checkpoint_state = CheckpointManager.load_state()
    if checkpoint_state and checkpoint_state.get("status") == "running":
        raise HTTPException(
            status_code=409,
            detail="Pipeline is already running"
        )

    # Guard 2: this process already launched a run that hasn't finished.
    if pipeline_process_state["running"]:
        raise HTTPException(
            status_code=409,
            detail="Pipeline start already in progress"
        )

    started_at = datetime.now().isoformat()
    pipeline_process_state["running"] = True
    pipeline_process_state["started_at"] = started_at

    # The actual work happens after the response is sent.
    background_tasks.add_task(
        run_pipeline_background, request.force_reindex, request.skip_ingestion
    )

    return {
        "status": "starting",
        "message": "Compliance pipeline is starting in background",
        "started_at": started_at,
    }
|
||||
|
||||
|
||||
async def run_pipeline_background(force_reindex: bool, skip_ingestion: bool):
    """Background task to run the compliance pipeline.

    Spawns full_compliance_pipeline.py as a subprocess, waits for it to
    finish without blocking the event loop, logs the outcome, and always
    clears ``pipeline_process_state`` when done.

    Args:
        force_reindex: pass --force-reindex to the pipeline script.
        skip_ingestion: pass --skip-ingestion to the pipeline script.
    """
    try:
        import subprocess
        import sys

        cmd = [sys.executable, "full_compliance_pipeline.py"]
        if force_reindex:
            cmd.append("--force-reindex")
        if skip_ingestion:
            cmd.append("--skip-ingestion")

        logger.info(f"Starting pipeline: {' '.join(cmd)}")

        process = subprocess.Popen(
            cmd,
            cwd=os.path.dirname(os.path.abspath(__file__)),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )

        pipeline_process_state["pid"] = process.pid

        # BUG FIX: the previous version polled process.poll() in a sleep
        # loop without ever reading stdout. With stdout=PIPE the child
        # blocks as soon as the OS pipe buffer fills, deadlocking the
        # pipeline on any run with substantial output. communicate()
        # drains the pipe and waits for exit; run it in the default
        # executor so the event loop stays responsive.
        loop = asyncio.get_event_loop()
        output, _ = await loop.run_in_executor(None, process.communicate)

        if process.returncode != 0:
            logger.error(f"Pipeline failed with code {process.returncode}: {output}")
        else:
            logger.info("Pipeline completed successfully")

    except Exception as e:
        # Best-effort background task: log and fall through to cleanup.
        logger.error(f"Failed to run pipeline: {e}")

    finally:
        # Always release the in-process "running" latch so start_pipeline
        # can launch the next run.
        pipeline_process_state["running"] = False
        pipeline_process_state["pid"] = None
|
||||
|
||||
|
||||
@pipeline_router.get("/status")
async def get_pipeline_status():
    """Get current pipeline running status."""
    # Lazy import, consistent with the rest of this module.
    from pipeline_checkpoints import CheckpointManager

    state = CheckpointManager.load_state()

    if state is None:
        checkpoint_status = "no_data"
        current_phase = None
    else:
        checkpoint_status = state.get("status")
        current_phase = state.get("current_phase")

    # Combine in-process subprocess tracking with on-disk checkpoint data.
    return {
        "process_running": pipeline_process_state["running"],
        "process_pid": pipeline_process_state["pid"],
        "process_started_at": pipeline_process_state["started_at"],
        "checkpoint_status": checkpoint_status,
        "current_phase": current_phase,
    }
|
||||
Reference in New Issue
Block a user