# breakpilot-lehrer/klausur-service/backend/ocr_labeling_upload_routes.py

"""
OCR Labeling - Upload, Run-OCR, and Export Route Handlers

Extracted from ocr_labeling_routes.py to keep files under 500 LOC.

Endpoints:
- POST /sessions/{id}/upload - Upload images for labeling
- POST /run-ocr/{item_id} - Run OCR on existing item
- POST /export - Export training data
- GET /training-samples - List training samples
- GET /images/{path} - Serve images from local storage
- GET /exports - List exports
"""

from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Query
from typing import Optional, List
import uuid
import os

from metrics_db import (
    get_ocr_labeling_session,
    add_ocr_labeling_item,
    get_ocr_labeling_item,
    export_training_samples,
    get_training_samples,
)

from ocr_labeling_models import (
    ExportRequest,
    LOCAL_STORAGE_PATH,
)
from ocr_labeling_helpers import (
    compute_image_hash, run_ocr_on_image,
    save_image_locally,
    MINIO_AVAILABLE, TRAINING_EXPORT_AVAILABLE,
)

# Conditional imports: both modules are optional. When one is missing, the
# ImportError is swallowed and its names stay undefined, so the handlers in
# this file only touch them behind the MINIO_AVAILABLE /
# TRAINING_EXPORT_AVAILABLE flags imported from ocr_labeling_helpers.
# NOTE(review): presumably those flags mirror whether these imports
# succeed -- confirm in ocr_labeling_helpers.
try:
    from minio_storage import upload_ocr_image, get_ocr_image
except ImportError:
    pass

try:
    from training_export_service import TrainingSample, get_training_export_service
except ImportError:
    pass


# Every route declared in this module is registered under /api/v1/ocr-label.
router = APIRouter(prefix="/api/v1/ocr-label", tags=["OCR Labeling"])


@router.post("/sessions/{session_id}/upload")
async def upload_images(
    session_id: str,
    files: List[UploadFile] = File(...),
    run_ocr: bool = Form(True),
    metadata: Optional[str] = Form(None),
):
    """
    Store uploaded files as new items of a labeling session.

    Each file is written to storage (MinIO when available, local disk as a
    fallback), optionally passed through OCR, and registered as an item of
    the session.

    Args:
        session_id: Identifier of the session that receives the items.
        files: Uploaded files. Extensions other than png/jpg/jpeg/pdf are
            stored with the extension "png".
        run_ocr: When true, OCR runs immediately for every non-PDF file.
        metadata: Optional metadata string. Valid JSON is attached in parsed
            form; anything else is wrapped as {"raw": <string>}.

    Returns:
        Dict with the session id, the number of items that were registered
        and the list of those items. Files whose registration did not
        succeed are left out of the list.

    Raises:
        HTTPException: 404 when the session does not exist.
    """
    import json

    session = await get_ocr_labeling_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    # Metadata is best-effort: unparsable input is kept verbatim under "raw".
    parsed_meta = None
    if metadata:
        try:
            parsed_meta = json.loads(metadata)
        except json.JSONDecodeError:
            parsed_meta = {"raw": metadata}

    ocr_model = session.get('ocr_model', 'llama3.2-vision:11b')
    accepted_extensions = ('png', 'jpg', 'jpeg', 'pdf')
    registered = []

    for upload in files:
        payload = await upload.read()
        digest = compute_image_hash(payload)
        item_id = str(uuid.uuid4())

        # Default to "png" unless the filename carries an accepted extension.
        extension = 'png'
        if upload.filename:
            candidate = upload.filename.split('.')[-1].lower()
            if candidate in accepted_extensions:
                extension = candidate

        # Prefer MinIO; fall back to local disk when it is unavailable or fails.
        use_local = not MINIO_AVAILABLE
        if MINIO_AVAILABLE:
            try:
                stored_path = upload_ocr_image(session_id, item_id, payload, extension)
            except Exception as e:
                print(f"MinIO upload failed, using local storage: {e}")
                use_local = True
        if use_local:
            stored_path = save_image_locally(session_id, item_id, payload, extension)

        text, confidence = None, None
        if run_ocr and extension != 'pdf':
            text, confidence = await run_ocr_on_image(
                payload,
                upload.filename or f"{item_id}.{extension}",
                model=ocr_model,
            )

        saved = await add_ocr_labeling_item(
            item_id=item_id,
            session_id=session_id,
            image_path=stored_path,
            image_hash=digest,
            ocr_text=text,
            ocr_confidence=confidence,
            # The model is only recorded when OCR actually produced text.
            ocr_model=ocr_model if text else None,
            metadata=parsed_meta,
        )
        if not saved:
            continue

        registered.append({
            "id": item_id,
            "filename": upload.filename,
            "image_path": stored_path,
            "image_hash": digest,
            "ocr_text": text,
            "ocr_confidence": confidence,
            "status": "pending",
        })

    return {
        "session_id": session_id,
        "uploaded_count": len(registered),
        "items": registered,
    }


@router.post("/export")
async def export_data(request: ExportRequest):
    """
    Export labeled data for training.

    The samples are fetched through export_training_samples. When the
    training export service is available they are additionally handed to
    it; a failure there is printed and does not fail the request.

    Args:
        request: Export parameters (export_format, session_id, batch_id).

    Returns:
        Dict with the export format, batch id, sample count and the samples.
        "export_path" and "manifest_path" are added when the export service
        produced a result; a "message" is included when nothing was found.
    """
    rows = await export_training_samples(
        export_format=request.export_format,
        session_id=request.session_id,
        batch_id=request.batch_id,
        exported_by="admin",
    )

    if not rows:
        return {
            "export_format": request.export_format,
            "batch_id": request.batch_id,
            "exported_count": 0,
            "samples": [],
            "message": "No labeled samples found to export",
        }

    file_export = None
    if TRAINING_EXPORT_AVAILABLE:
        try:
            service = get_training_export_service()
            file_export = service.export(
                samples=[
                    TrainingSample(
                        # Rows may carry their identifier as "id" or "item_id".
                        id=row.get('id', row.get('item_id', '')),
                        image_path=row.get('image_path', ''),
                        ground_truth=row.get('ground_truth', ''),
                        ocr_text=row.get('ocr_text'),
                        ocr_confidence=row.get('ocr_confidence'),
                        metadata=row.get('metadata'),
                    )
                    for row in rows
                ],
                export_format=request.export_format,
                batch_id=request.batch_id,
            )
        except Exception as e:
            print(f"Training export failed: {e}")

    # A batch id supplied by the caller wins over the one from the export.
    batch_id = request.batch_id
    if not batch_id:
        batch_id = file_export.batch_id if file_export else None

    response = {
        "export_format": request.export_format,
        "batch_id": batch_id,
        "exported_count": len(rows),
        "samples": rows,
    }

    if file_export:
        response.update(
            export_path=file_export.export_path,
            manifest_path=file_export.manifest_path,
        )

    return response


@router.get("/training-samples")
async def list_training_samples(
    export_format: Optional[str] = Query(None),
    batch_id: Optional[str] = Query(None),
    limit: int = Query(100, ge=1, le=1000),
):
    """
    Get exported training samples.

    Args:
        export_format: Optional filter passed on to get_training_samples.
        batch_id: Optional filter passed on to get_training_samples.
        limit: Maximum number of samples requested (1-1000, default 100).

    Returns:
        Dict with the number of samples returned and the samples themselves.
    """
    rows = await get_training_samples(
        export_format=export_format,
        batch_id=batch_id,
        limit=limit,
    )
    return {"count": len(rows), "samples": rows}


@router.get("/images/{path:path}")
async def get_image(path: str):
    """
    Serve an image from local storage.

    The requested path is resolved against LOCAL_STORAGE_PATH and must end
    up inside that directory. Paths that escape it (via ".." segments, an
    absolute path, or a symlink pointing elsewhere) are answered exactly
    like a missing file so that nothing outside the storage directory can
    be read or probed.

    Args:
        path: Path of the file relative to the local storage directory.

    Returns:
        FileResponse for the file, with a media type derived from its
        extension (application/octet-stream for unknown extensions).

    Raises:
        HTTPException: 404 when the path leaves the storage directory or
            does not point to a regular file.
    """
    from fastapi.responses import FileResponse

    filepath = ""
    try:
        storage_root = os.path.realpath(LOCAL_STORAGE_PATH)
        # os.path.join drops storage_root when `path` is absolute, and ".."
        # segments can climb out of it, so containment is verified on the
        # fully resolved path instead of trusting the join.
        filepath = os.path.realpath(os.path.join(storage_root, path))
        inside_storage = os.path.commonpath([storage_root, filepath]) == storage_root
    except ValueError:
        # Raised for embedded NUL bytes or paths on different drives.
        inside_storage = False

    # isfile (not exists) so that directories are rejected as well.
    if not inside_storage or not os.path.isfile(filepath):
        raise HTTPException(status_code=404, detail="Image not found")

    extension = filepath.split('.')[-1].lower()
    content_type = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
        'pdf': 'application/pdf',
    }.get(extension, 'application/octet-stream')

    return FileResponse(filepath, media_type=content_type)


@router.post("/run-ocr/{item_id}")
async def run_ocr_for_item(item_id: str):
    """
    Run OCR on an existing item.

    The item's image is read from local storage when its path starts with
    LOCAL_STORAGE_PATH, otherwise from MinIO when that is available. The
    OCR result is written back to the item when a database pool is
    available.

    Args:
        item_id: Identifier of the labeling item.

    Returns:
        Dict with the item id, recognized text, confidence and model used.

    Raises:
        HTTPException: 404 when the item or its local image file is
            missing; 500 when the image cannot be loaded or OCR returns no
            text.
    """
    item = await get_ocr_labeling_item(item_id)
    if not item:
        raise HTTPException(status_code=404, detail="Item not found")

    stored_path = item['image_path']
    is_local = stored_path.startswith(LOCAL_STORAGE_PATH)

    # A non-local image can only be loaded through MinIO.
    if not is_local and not MINIO_AVAILABLE:
        raise HTTPException(status_code=500, detail="Cannot load image")

    if is_local:
        if not os.path.exists(stored_path):
            raise HTTPException(status_code=404, detail="Image file not found")
        with open(stored_path, 'rb') as handle:
            raw_image = handle.read()
    else:
        try:
            raw_image = get_ocr_image(stored_path)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to load image: {e}")

    # Use the session's configured model, or the default when there is none.
    default_model = 'llama3.2-vision:11b'
    session = await get_ocr_labeling_session(item['session_id'])
    if session:
        ocr_model = session.get('ocr_model', default_model)
    else:
        ocr_model = default_model

    ocr_text, ocr_confidence = await run_ocr_on_image(
        raw_image,
        os.path.basename(stored_path),
        model=ocr_model,
    )
    if ocr_text is None:
        raise HTTPException(status_code=500, detail="OCR failed")

    from metrics_db import get_pool
    pool = await get_pool()
    # Without a pool the result is still returned, just not stored.
    if pool:
        async with pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE ocr_labeling_items
                SET ocr_text = $2, ocr_confidence = $3, ocr_model = $4
                WHERE id = $1
                """,
                item_id, ocr_text, ocr_confidence, ocr_model
            )

    return dict(
        item_id=item_id,
        ocr_text=ocr_text,
        ocr_confidence=ocr_confidence,
        ocr_model=ocr_model,
    )


@router.get("/exports")
async def list_exports(export_format: Optional[str] = Query(None)):
    """
    List all available training data exports.

    Args:
        export_format: Optional filter passed on to the export service.

    Returns:
        Dict with the number of exports and the exports themselves, or an
        empty list plus a message when the export service is unavailable.

    Raises:
        HTTPException: 500 when the export service fails to list exports.
    """
    if TRAINING_EXPORT_AVAILABLE:
        try:
            service = get_training_export_service()
            available = service.list_exports(export_format=export_format)
            return {"count": len(available), "exports": available}
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to list exports: {e}")

    return {
        "exports": [],
        "message": "Training export service not available",
    }