"""
OCR Labeling - Upload, Run-OCR, and Export Route Handlers

Extracted from ocr_labeling_routes.py to keep files under 500 LOC.

Endpoints:
- POST /sessions/{id}/upload - Upload images for labeling
- POST /run-ocr/{item_id} - Run OCR on existing item
- POST /export - Export training data
- GET /training-samples - List training samples
- GET /images/{path} - Serve images from local storage
- GET /exports - List exports
"""

import json
import logging
import os
import uuid
from typing import Optional, List

from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Query
from fastapi.responses import FileResponse

from metrics_db import (
    get_ocr_labeling_session,
    add_ocr_labeling_item,
    get_ocr_labeling_item,
    export_training_samples,
    get_training_samples,
)
from ocr_labeling_models import (
    ExportRequest,
    LOCAL_STORAGE_PATH,
)
from ocr_labeling_helpers import (
    compute_image_hash,
    run_ocr_on_image,
    save_image_locally,
    MINIO_AVAILABLE,
    TRAINING_EXPORT_AVAILABLE,
)

# Conditional imports. Every call site below is guarded by MINIO_AVAILABLE /
# TRAINING_EXPORT_AVAILABLE and wrapped in a try/except, so a missing optional
# module degrades the endpoint instead of breaking module import.
try:
    from minio_storage import upload_ocr_image, get_ocr_image
except ImportError:
    pass

try:
    from training_export_service import TrainingSample, get_training_export_service
except ImportError:
    pass

logger = logging.getLogger(__name__)

# OCR model used when a session does not carry its own 'ocr_model' entry.
_DEFAULT_OCR_MODEL = 'llama3.2-vision:11b'

# Extensions kept as-is on upload; anything else is stored as 'png'.
_ALLOWED_EXTENSIONS = ('png', 'jpg', 'jpeg', 'pdf')

# Extension -> Content-Type used when serving files from local storage.
_CONTENT_TYPES = {
    'png': 'image/png',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'pdf': 'application/pdf',
}

router = APIRouter(prefix="/api/v1/ocr-label", tags=["OCR Labeling"])


def _normalized_extension(filename: Optional[str]) -> str:
    """Return the lower-cased extension of *filename*, or 'png' if missing/unsupported."""
    if not filename:
        return 'png'
    extension = filename.split('.')[-1].lower()
    return extension if extension in _ALLOWED_EXTENSIONS else 'png'


def _store_image(session_id: str, item_id: str, content: bytes, extension: str) -> str:
    """Store an uploaded image via MinIO when enabled, else (or on failure) locally."""
    if MINIO_AVAILABLE:
        try:
            return upload_ocr_image(session_id, item_id, content, extension)
        except Exception as e:
            # Best effort: any MinIO problem falls through to local storage.
            logger.warning("MinIO upload failed, using local storage: %s", e)
    return save_image_locally(session_id, item_id, content, extension)


def _resolve_local_image(path: str) -> Optional[str]:
    """
    Map a client-supplied relative path to a regular file under LOCAL_STORAGE_PATH.

    Returns the resolved absolute path, or None when the path resolves outside
    the storage root (e.g. via '..' segments, an absolute path, or a symlink)
    or does not point at a regular file.
    """
    try:
        base_dir = os.path.realpath(LOCAL_STORAGE_PATH)
        candidate = os.path.realpath(os.path.join(base_dir, path))
        if os.path.commonpath([base_dir, candidate]) != base_dir:
            return None
    except ValueError:
        # Raised for embedded NUL bytes or paths on different drives.
        return None
    return candidate if os.path.isfile(candidate) else None


@router.post("/sessions/{session_id}/upload")
async def upload_images(
    session_id: str,
    files: List[UploadFile] = File(...),
    run_ocr: bool = Form(True),
    metadata: Optional[str] = Form(None),
):
    """
    Upload images to a labeling session.

    Args:
        session_id: Session to add images to
        files: Image files to upload (PNG, JPG, PDF)
        run_ocr: Whether to run OCR immediately (default: True)
        metadata: Optional JSON metadata (subject, year, etc.)

    Returns:
        Dict with the session id, the number of items that were stored and
        the list of stored items.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    session = await get_ocr_labeling_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    meta_dict = None
    if metadata:
        try:
            meta_dict = json.loads(metadata)
        except json.JSONDecodeError:
            # Keep non-JSON metadata rather than rejecting the upload.
            meta_dict = {"raw": metadata}

    results = []
    ocr_model = session.get('ocr_model', _DEFAULT_OCR_MODEL)

    for file in files:
        content = await file.read()
        image_hash = compute_image_hash(content)
        item_id = str(uuid.uuid4())
        extension = _normalized_extension(file.filename)

        image_path = _store_image(session_id, item_id, content, extension)

        ocr_text = None
        ocr_confidence = None

        # PDFs are stored but never passed to the OCR helper here.
        if run_ocr and extension != 'pdf':
            ocr_text, ocr_confidence = await run_ocr_on_image(
                content,
                file.filename or f"{item_id}.{extension}",
                model=ocr_model,
            )

        success = await add_ocr_labeling_item(
            item_id=item_id,
            session_id=session_id,
            image_path=image_path,
            image_hash=image_hash,
            ocr_text=ocr_text,
            ocr_confidence=ocr_confidence,
            # Only record the model when OCR actually produced text.
            ocr_model=ocr_model if ocr_text else None,
            metadata=meta_dict,
        )

        if not success:
            # The item is left out of the response; make the drop visible.
            logger.warning(
                "Failed to store labeling item %s (file %r) in session %s",
                item_id, file.filename, session_id,
            )
            continue

        results.append({
            "id": item_id,
            "filename": file.filename,
            "image_path": image_path,
            "image_hash": image_hash,
            "ocr_text": ocr_text,
            "ocr_confidence": ocr_confidence,
            "status": "pending",
        })

    return {
        "session_id": session_id,
        "uploaded_count": len(results),
        "items": results,
    }


@router.post("/export")
async def export_data(request: ExportRequest):
    """
    Export labeled data for training.

    Samples are fetched through export_training_samples(); when the training
    export service is enabled they are additionally handed to it. A failure
    of that service is logged and does not fail the request.

    Args:
        request: Export parameters (export_format, session_id, batch_id).

    Returns:
        Dict with the export format, batch id, sample count and samples, plus
        'export_path' / 'manifest_path' when the export service produced a result.
    """
    db_samples = await export_training_samples(
        export_format=request.export_format,
        session_id=request.session_id,
        batch_id=request.batch_id,
        exported_by="admin",
    )

    if not db_samples:
        return {
            "export_format": request.export_format,
            "batch_id": request.batch_id,
            "exported_count": 0,
            "samples": [],
            "message": "No labeled samples found to export",
        }

    export_result = None
    if TRAINING_EXPORT_AVAILABLE:
        try:
            export_service = get_training_export_service()
            training_samples = [
                TrainingSample(
                    id=s.get('id', s.get('item_id', '')),
                    image_path=s.get('image_path', ''),
                    ground_truth=s.get('ground_truth', ''),
                    ocr_text=s.get('ocr_text'),
                    ocr_confidence=s.get('ocr_confidence'),
                    metadata=s.get('metadata'),
                )
                for s in db_samples
            ]
            export_result = export_service.export(
                samples=training_samples,
                export_format=request.export_format,
                batch_id=request.batch_id,
            )
        except Exception as e:
            # Best effort: the samples are still returned to the caller.
            logger.exception("Training export failed: %s", e)

    response = {
        "export_format": request.export_format,
        # Prefer the caller's batch id; otherwise use the one from the export result.
        "batch_id": request.batch_id or (export_result.batch_id if export_result else None),
        "exported_count": len(db_samples),
        "samples": db_samples,
    }

    if export_result:
        response["export_path"] = export_result.export_path
        response["manifest_path"] = export_result.manifest_path

    return response


@router.get("/training-samples")
async def list_training_samples(
    export_format: Optional[str] = Query(None),
    batch_id: Optional[str] = Query(None),
    limit: int = Query(100, ge=1, le=1000),
):
    """
    Get exported training samples.

    Args:
        export_format: Optional export format filter.
        batch_id: Optional batch id filter.
        limit: Maximum number of samples to request (1-1000, default 100).

    Returns:
        Dict with 'count' and the 'samples' list.
    """
    samples = await get_training_samples(
        export_format=export_format,
        batch_id=batch_id,
        limit=limit,
    )

    return {
        "count": len(samples),
        "samples": samples,
    }


@router.get("/images/{path:path}")
async def get_image(path: str):
    """
    Serve an image from local storage.

    Args:
        path: Path of the file relative to LOCAL_STORAGE_PATH.

    Returns:
        FileResponse with a Content-Type derived from the file extension.

    Raises:
        HTTPException: 404 if the path does not resolve to a regular file
            inside LOCAL_STORAGE_PATH.
    """
    # The path comes straight from the URL, so it must not be allowed to
    # escape the storage directory.
    filepath = _resolve_local_image(path)
    if filepath is None:
        raise HTTPException(status_code=404, detail="Image not found")

    extension = filepath.split('.')[-1].lower()
    content_type = _CONTENT_TYPES.get(extension, 'application/octet-stream')

    return FileResponse(filepath, media_type=content_type)


@router.post("/run-ocr/{item_id}")
async def run_ocr_for_item(item_id: str):
    """
    Run OCR on an existing item.

    Loads the item's image from local storage or MinIO, runs OCR with the
    model of the item's session (or the default model) and writes the result
    to the ocr_labeling_items table.

    Args:
        item_id: Id of the labeling item to process.

    Returns:
        Dict with the item id, OCR text, OCR confidence and the model used.

    Raises:
        HTTPException: 404 if the item or its local image file is missing;
            500 if the image cannot be loaded or OCR returns no text.
    """
    item = await get_ocr_labeling_item(item_id)
    if not item:
        raise HTTPException(status_code=404, detail="Item not found")

    image_path = item['image_path']

    if image_path.startswith(LOCAL_STORAGE_PATH):
        if not os.path.exists(image_path):
            raise HTTPException(status_code=404, detail="Image file not found")
        with open(image_path, 'rb') as f:
            image_data = f.read()
    elif MINIO_AVAILABLE:
        try:
            image_data = get_ocr_image(image_path)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to load image: {e}") from e
    else:
        raise HTTPException(status_code=500, detail="Cannot load image")

    session = await get_ocr_labeling_session(item['session_id'])
    ocr_model = session.get('ocr_model', _DEFAULT_OCR_MODEL) if session else _DEFAULT_OCR_MODEL

    ocr_text, ocr_confidence = await run_ocr_on_image(
        image_data,
        os.path.basename(image_path),
        model=ocr_model,
    )

    if ocr_text is None:
        raise HTTPException(status_code=500, detail="OCR failed")

    # Imported at call time, as in the original layout of this handler.
    from metrics_db import get_pool

    pool = await get_pool()
    if pool:
        async with pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE ocr_labeling_items
                SET ocr_text = $2, ocr_confidence = $3, ocr_model = $4
                WHERE id = $1
                """,
                item_id, ocr_text, ocr_confidence, ocr_model
            )
    else:
        # The OCR result is still returned, but it was not persisted.
        logger.warning("No database pool available; OCR result for item %s not saved", item_id)

    return {
        "item_id": item_id,
        "ocr_text": ocr_text,
        "ocr_confidence": ocr_confidence,
        "ocr_model": ocr_model,
    }


@router.get("/exports")
async def list_exports(export_format: Optional[str] = Query(None)):
    """
    List all available training data exports.

    Args:
        export_format: Optional export format filter passed to the export service.

    Returns:
        Dict with 'count' and 'exports', or an empty 'exports' list with a
        'message' when the training export service is not available.

    Raises:
        HTTPException: 500 if the export service fails to list exports.
    """
    if not TRAINING_EXPORT_AVAILABLE:
        return {
            "exports": [],
            "message": "Training export service not available",
        }

    try:
        export_service = get_training_export_service()
        exports = export_service.list_exports(export_format=export_format)
        return {
            "count": len(exports),
            "exports": exports,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to list exports: {e}") from e