Files
breakpilot-lehrer/klausur-service/backend/admin_rag.py
Benjamin Admin 6811264756 [split-required] Split final batch of monoliths >1000 LOC
Python (6 files in klausur-service):
- rbac.py (1,132 → 4), admin_api.py (1,012 → 4)
- routes/eh.py (1,111 → 4), ocr_pipeline_geometry.py (1,105 → 5)

Python (2 files in backend-lehrer):
- unit_api.py (1,226 → 6), game_api.py (1,129 → 5)

Website (6 page files):
- 4x klausur-korrektur pages (1,249-1,328 LOC each) → shared components
  in website/components/klausur-korrektur/ (17 shared files)
- companion (1,057 → 10), magic-help (1,017 → 8)

All re-export barrels preserve backward compatibility.
Zero import errors verified.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 23:17:30 +02:00

282 lines
8.4 KiB
Python

"""
Admin API - RAG Upload & Metrics
Endpoints for uploading documents, tracking uploads, RAG metrics,
search feedback, storage stats, and service initialization.
Extracted from admin_api.py for file-size compliance.
"""
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query, UploadFile, File, Form
from pydantic import BaseModel
from typing import Optional, List, Dict
from datetime import datetime
from pathlib import Path
import zipfile
import tempfile
import os
from nibis_ingestion import run_ingestion, DOCS_BASE_PATH
# Import ingestion status from nibis module for auto-ingest
from admin_nibis import _ingestion_status
# Optional: MinIO and PostgreSQL integrations.
# Each backend is probed once at import time; when one is missing the
# endpoints below degrade gracefully (placeholder metrics, "not available"
# payloads) instead of failing.
try:
    from minio_storage import upload_rag_document, get_storage_stats, init_minio_bucket
    MINIO_AVAILABLE = True
except ImportError:
    MINIO_AVAILABLE = False
try:
    from metrics_db import (
        init_metrics_tables, store_feedback, log_search, log_upload,
        calculate_metrics, get_recent_feedback, get_upload_history
    )
    METRICS_DB_AVAILABLE = True
except ImportError:
    METRICS_DB_AVAILABLE = False

router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])

# Upload directory configuration: overridable via the RAG_UPLOAD_BASE env
# var, defaulting to the ingestion module's document root.
RAG_UPLOAD_BASE = Path(os.getenv("RAG_UPLOAD_BASE", str(DOCS_BASE_PATH)))

# In-memory store for upload tracking; capped at the last 100 records by
# the upload endpoint. Not persistent across restarts.
_upload_history: List[Dict] = []
class UploadResult(BaseModel):
    """Response body for POST /rag/upload."""
    status: str  # "success" when the upload was processed
    files_received: int  # uploaded files handled in this request (currently always 1)
    pdfs_extracted: int  # PDFs written into the target directory
    target_directory: str  # directory the PDFs were stored under
    errors: List[str]  # non-fatal error messages collected during processing
def _extract_pdfs_from_zip(zip_bytes: bytes, target_dir: Path) -> int:
    """Extract every PDF member of a ZIP archive into *target_dir*.

    macOS resource-fork entries (``__MACOSX``) are skipped and the archive's
    internal directory structure is flattened to bare file names, which also
    prevents path-traversal via crafted member names.

    Returns the number of PDFs written.
    """
    extracted = 0
    # zipfile needs a seekable file, so spool the upload to a temp file first.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp:
        tmp.write(zip_bytes)
        tmp_path = tmp.name
    try:
        with zipfile.ZipFile(tmp_path, 'r') as zf:
            for member in zf.namelist():
                if member.lower().endswith(".pdf") and not member.startswith("__MACOSX"):
                    pdf_name = Path(member).name  # flatten: drop archive dirs
                    if pdf_name:
                        target_path = target_dir / pdf_name
                        with zf.open(member) as src, open(target_path, 'wb') as dst:
                            dst.write(src.read())
                        extracted += 1
    finally:
        os.unlink(tmp_path)
    return extracted


@router.post("/rag/upload", response_model=UploadResult)
async def upload_rag_documents(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    collection: str = Form(default="bp_nibis_eh"),
    year: Optional[int] = Form(default=None),
    auto_ingest: bool = Form(default=False),
):
    """
    Upload documents for RAG indexing.

    Supports:
    - ZIP archives (automatically extracted)
    - Individual PDF files

    PDFs land in ``<RAG_UPLOAD_BASE>/za-download/<year>/``. The upload is
    tracked in memory (and in PostgreSQL when available), and when
    *auto_ingest* is set a background ingestion run is scheduled unless one
    is already in flight.

    Raises:
        HTTPException 400: neither a .zip nor a .pdf was uploaded.
        HTTPException 500: any unexpected processing error.
    """
    errors: List[str] = []
    pdfs_extracted = 0
    # Default to the current calendar year when none is given.
    target_year = year or datetime.now().year
    target_dir = RAG_UPLOAD_BASE / "za-download" / str(target_year)
    target_dir.mkdir(parents=True, exist_ok=True)
    try:
        filename = file.filename or "upload"
        if filename.lower().endswith(".zip"):
            content = await file.read()
            pdfs_extracted = _extract_pdfs_from_zip(content, target_dir)
        elif filename.lower().endswith(".pdf"):
            target_path = target_dir / filename
            content = await file.read()
            with open(target_path, 'wb') as f:
                f.write(content)
            pdfs_extracted = 1
        else:
            # BUG FIX: the original message read "(unknown)" because the
            # f-string contained no placeholder; report the actual filename.
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type: {filename}. Only .zip and .pdf are allowed."
            )
        # Track upload in memory (bounded ring of the last 100 records).
        upload_record = {
            "timestamp": datetime.now().isoformat(),
            "filename": filename,
            "collection": collection,
            "year": target_year,
            "pdfs_extracted": pdfs_extracted,
            "target_directory": str(target_dir),
        }
        _upload_history.append(upload_record)
        if len(_upload_history) > 100:
            _upload_history.pop(0)
        # Persist the record in PostgreSQL when the metrics DB is wired up.
        if METRICS_DB_AVAILABLE:
            await log_upload(
                filename=filename,
                collection_name=collection,
                year=target_year,
                pdfs_extracted=pdfs_extracted,
                minio_path=str(target_dir),
            )
        # Schedule ingestion unless a run is already active.
        if auto_ingest and not _ingestion_status["running"]:
            async def run_auto_ingest():
                # NOTE: _ingestion_status is imported from admin_nibis; mutate
                # it in place only. (The original declared `global`, which
                # would have rebound this module's name — not admin_nibis's —
                # had an assignment ever been added.)
                _ingestion_status["running"] = True
                _ingestion_status["last_run"] = datetime.now().isoformat()
                try:
                    result = await run_ingestion(
                        ewh_only=True,
                        dry_run=False,
                        year_filter=target_year,
                    )
                    _ingestion_status["last_result"] = result
                except Exception as e:
                    _ingestion_status["last_result"] = {"error": str(e), "errors": [str(e)]}
                finally:
                    _ingestion_status["running"] = False
            background_tasks.add_task(run_auto_ingest)
        return UploadResult(
            status="success",
            files_received=1,
            pdfs_extracted=pdfs_extracted,
            target_directory=str(target_dir),
            errors=errors,
        )
    except HTTPException:
        # Re-raise client errors untouched (don't wrap them as 500s).
        raise
    except Exception as e:
        errors.append(str(e))
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/rag/upload/history")
async def get_upload_history_endpoint(limit: int = Query(default=20, le=100)):
    """Return the most recent in-memory upload records, newest first."""
    recent = _upload_history[-limit:]
    recent.reverse()  # slice copies, so this does not touch the store
    return {"uploads": recent, "total": len(_upload_history)}
@router.get("/rag/metrics")
async def get_rag_metrics(
    collection: Optional[str] = Query(default=None),
    days: int = Query(default=7, le=90),
):
    """Return RAG quality metrics from the metrics DB, or placeholder
    values when PostgreSQL is unavailable or not connected."""
    if METRICS_DB_AVAILABLE:
        db_metrics = await calculate_metrics(collection_name=collection, days=days)
        if db_metrics.get("connected"):
            return db_metrics
    # Fallback payload so the admin UI still renders something useful.
    placeholder = {
        "precision_at_5": 0.78,
        "recall_at_10": 0.85,
        "mrr": 0.72,
        "avg_latency_ms": 52,
        "total_ratings": len(_upload_history),
        "error_rate": 0.3,
        "score_distribution": {"0.9+": 23, "0.7-0.9": 41, "0.5-0.7": 28, "<0.5": 8},
        "note": "Placeholder metrics - PostgreSQL not connected",
        "connected": False,
    }
    return placeholder
@router.post("/rag/search/feedback")
async def submit_search_feedback(
    result_id: str = Form(...),
    rating: int = Form(..., ge=1, le=5),
    notes: Optional[str] = Form(default=None),
    query: Optional[str] = Form(default=None),
    collection: Optional[str] = Form(default=None),
    score: Optional[float] = Form(default=None),
):
    """Accept a 1-5 rating for a search result and persist it to the
    metrics DB when available; always echo the feedback back."""
    feedback_record = {
        "timestamp": datetime.now().isoformat(),
        "result_id": result_id,
        "rating": rating,
        "notes": notes,
    }
    persisted = False
    if METRICS_DB_AVAILABLE:
        persisted = await store_feedback(
            result_id=result_id,
            rating=rating,
            query_text=query,
            collection_name=collection,
            score=score,
            notes=notes,
        )
    status = "stored" if persisted else "received"
    return {"status": status, "feedback": feedback_record, "persisted": persisted}
@router.get("/rag/storage/stats")
async def get_storage_statistics():
    """Report MinIO storage statistics, or a disconnected marker."""
    if not MINIO_AVAILABLE:
        # Guard clause: no MinIO integration loaded at import time.
        return {"error": "MinIO not available", "connected": False}
    return await get_storage_stats()
@router.post("/rag/init")
async def initialize_rag_services():
    """Initialize the optional RAG backing services (MinIO bucket and
    PostgreSQL metrics tables); report per-service success flags."""
    services = {"minio": False, "postgres": False}
    if MINIO_AVAILABLE:
        services["minio"] = await init_minio_bucket()
    if METRICS_DB_AVAILABLE:
        services["postgres"] = await init_metrics_tables()
    return {"status": "initialized", "services": services}