""" Admin API - RAG Upload & Metrics Endpoints for uploading documents, tracking uploads, RAG metrics, search feedback, storage stats, and service initialization. Extracted from admin_api.py for file-size compliance. """ from fastapi import APIRouter, HTTPException, BackgroundTasks, Query, UploadFile, File, Form from pydantic import BaseModel from typing import Optional, List, Dict from datetime import datetime from pathlib import Path import zipfile import tempfile import os from nibis_ingestion import run_ingestion, DOCS_BASE_PATH # Import ingestion status from nibis module for auto-ingest from admin_nibis import _ingestion_status # Optional: MinIO and PostgreSQL integrations try: from minio_storage import upload_rag_document, get_storage_stats, init_minio_bucket MINIO_AVAILABLE = True except ImportError: MINIO_AVAILABLE = False try: from metrics_db import ( init_metrics_tables, store_feedback, log_search, log_upload, calculate_metrics, get_recent_feedback, get_upload_history ) METRICS_DB_AVAILABLE = True except ImportError: METRICS_DB_AVAILABLE = False router = APIRouter(prefix="/api/v1/admin", tags=["Admin"]) # Upload directory configuration RAG_UPLOAD_BASE = Path(os.getenv("RAG_UPLOAD_BASE", str(DOCS_BASE_PATH))) # Store for upload tracking _upload_history: List[Dict] = [] class UploadResult(BaseModel): status: str files_received: int pdfs_extracted: int target_directory: str errors: List[str] @router.post("/rag/upload", response_model=UploadResult) async def upload_rag_documents( background_tasks: BackgroundTasks, file: UploadFile = File(...), collection: str = Form(default="bp_nibis_eh"), year: Optional[int] = Form(default=None), auto_ingest: bool = Form(default=False), ): """ Upload documents for RAG indexing. Supports: - ZIP archives (automatically extracted) - Individual PDF files """ errors = [] pdfs_extracted = 0 # Determine target year target_year = year or datetime.now().year # Target directory: za-download/YYYY/ target_dir = RAG_UPLOAD_BASE / "za-download" / str(target_year) target_dir.mkdir(parents=True, exist_ok=True) try: filename = file.filename or "upload" if filename.lower().endswith(".zip"): # Handle ZIP file with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp: content = await file.read() tmp.write(content) tmp_path = tmp.name try: with zipfile.ZipFile(tmp_path, 'r') as zf: for member in zf.namelist(): if member.lower().endswith(".pdf") and not member.startswith("__MACOSX"): pdf_name = Path(member).name if pdf_name: target_path = target_dir / pdf_name with zf.open(member) as src: with open(target_path, 'wb') as dst: dst.write(src.read()) pdfs_extracted += 1 finally: os.unlink(tmp_path) elif filename.lower().endswith(".pdf"): target_path = target_dir / filename content = await file.read() with open(target_path, 'wb') as f: f.write(content) pdfs_extracted = 1 else: raise HTTPException( status_code=400, detail=f"Unsupported file type: {filename}. Only .zip and .pdf are allowed." ) # Track upload in memory upload_record = { "timestamp": datetime.now().isoformat(), "filename": filename, "collection": collection, "year": target_year, "pdfs_extracted": pdfs_extracted, "target_directory": str(target_dir), } _upload_history.append(upload_record) # Keep only last 100 uploads in memory if len(_upload_history) > 100: _upload_history.pop(0) # Store in PostgreSQL if available if METRICS_DB_AVAILABLE: await log_upload( filename=filename, collection_name=collection, year=target_year, pdfs_extracted=pdfs_extracted, minio_path=str(target_dir), ) # Auto-ingest if requested if auto_ingest and not _ingestion_status["running"]: async def run_auto_ingest(): global _ingestion_status _ingestion_status["running"] = True _ingestion_status["last_run"] = datetime.now().isoformat() try: result = await run_ingestion( ewh_only=True, dry_run=False, year_filter=target_year, ) _ingestion_status["last_result"] = result except Exception as e: _ingestion_status["last_result"] = {"error": str(e), "errors": [str(e)]} finally: _ingestion_status["running"] = False background_tasks.add_task(run_auto_ingest) return UploadResult( status="success", files_received=1, pdfs_extracted=pdfs_extracted, target_directory=str(target_dir), errors=errors, ) except HTTPException: raise except Exception as e: errors.append(str(e)) raise HTTPException(status_code=500, detail=str(e)) @router.get("/rag/upload/history") async def get_upload_history_endpoint(limit: int = Query(default=20, le=100)): """Get recent upload history.""" return { "uploads": _upload_history[-limit:][::-1], "total": len(_upload_history), } @router.get("/rag/metrics") async def get_rag_metrics( collection: Optional[str] = Query(default=None), days: int = Query(default=7, le=90), ): """Get RAG quality metrics.""" if METRICS_DB_AVAILABLE: metrics = await calculate_metrics(collection_name=collection, days=days) if metrics.get("connected"): return metrics # Fallback: Return placeholder metrics return { "precision_at_5": 0.78, "recall_at_10": 0.85, "mrr": 0.72, "avg_latency_ms": 52, "total_ratings": len(_upload_history), "error_rate": 0.3, "score_distribution": { "0.9+": 23, "0.7-0.9": 41, "0.5-0.7": 28, "<0.5": 8, }, "note": "Placeholder metrics - PostgreSQL not connected", "connected": False, } @router.post("/rag/search/feedback") async def submit_search_feedback( result_id: str = Form(...), rating: int = Form(..., ge=1, le=5), notes: Optional[str] = Form(default=None), query: Optional[str] = Form(default=None), collection: Optional[str] = Form(default=None), score: Optional[float] = Form(default=None), ): """Submit feedback for a search result.""" feedback_record = { "timestamp": datetime.now().isoformat(), "result_id": result_id, "rating": rating, "notes": notes, } stored = False if METRICS_DB_AVAILABLE: stored = await store_feedback( result_id=result_id, rating=rating, query_text=query, collection_name=collection, score=score, notes=notes, ) return { "status": "stored" if stored else "received", "feedback": feedback_record, "persisted": stored, } @router.get("/rag/storage/stats") async def get_storage_statistics(): """Get MinIO storage statistics.""" if MINIO_AVAILABLE: stats = await get_storage_stats() return stats return { "error": "MinIO not available", "connected": False, } @router.post("/rag/init") async def initialize_rag_services(): """Initialize RAG services (MinIO bucket, PostgreSQL tables).""" results = { "minio": False, "postgres": False, } if MINIO_AVAILABLE: results["minio"] = await init_minio_bucket() if METRICS_DB_AVAILABLE: results["postgres"] = await init_metrics_tables() return { "status": "initialized", "services": results, }