backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
314 lines
9.3 KiB
Python
314 lines
9.3 KiB
Python
"""
OCR Labeling - Upload, Run-OCR, and Export Route Handlers

Extracted from ocr_labeling_routes.py to keep files under 500 LOC.

Endpoints (all paths are relative to the /api/v1/ocr-label router prefix):
- POST /sessions/{session_id}/upload - Upload images for labeling
- POST /run-ocr/{item_id} - Run OCR on existing item
- POST /export - Export training data
- GET /training-samples - List training samples
- GET /images/{path} - Serve images from local storage
- GET /exports - List exports
"""
|
|
|
|
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Query
|
|
from typing import Optional, List
|
|
import uuid
|
|
import os
|
|
|
|
from metrics_db import (
|
|
get_ocr_labeling_session,
|
|
add_ocr_labeling_item,
|
|
get_ocr_labeling_item,
|
|
export_training_samples,
|
|
get_training_samples,
|
|
)
|
|
|
|
from ocr_labeling_models import (
|
|
ExportRequest,
|
|
LOCAL_STORAGE_PATH,
|
|
)
|
|
from ocr_labeling_helpers import (
|
|
compute_image_hash, run_ocr_on_image,
|
|
save_image_locally,
|
|
MINIO_AVAILABLE, TRAINING_EXPORT_AVAILABLE,
|
|
)
|
|
|
|
# Conditional imports
|
|
try:
|
|
from minio_storage import upload_ocr_image, get_ocr_image
|
|
except ImportError:
|
|
pass
|
|
|
|
try:
|
|
from training_export_service import TrainingSample, get_training_export_service
|
|
except ImportError:
|
|
pass
|
|
|
|
|
|
# Router collecting the OCR-labeling endpoints defined in this module; every
# route registered on it is served under the /api/v1/ocr-label prefix.
router = APIRouter(prefix="/api/v1/ocr-label", tags=["OCR Labeling"])
|
|
|
|
|
|
# File extensions kept as-is when storing an upload; anything else is stored as 'png'.
_ALLOWED_UPLOAD_EXTENSIONS = frozenset({'png', 'jpg', 'jpeg', 'pdf'})


def _parse_upload_metadata(metadata: Optional[str]) -> Optional[dict]:
    """Turn the raw ``metadata`` form field into a dict, or None when absent.

    Text that is not valid JSON, or that is valid JSON but not an object
    (for example ``2024`` or ``[1, 2]``), is preserved under the ``"raw"`` key
    so the stored metadata is always a dict. JSON ``null`` maps to None.
    """
    import json

    if not metadata:
        return None
    try:
        parsed = json.loads(metadata)
    except json.JSONDecodeError:
        return {"raw": metadata}
    if parsed is None or isinstance(parsed, dict):
        return parsed
    # Same treatment as unparsable input: keep the caller's text, but in dict form.
    return {"raw": metadata}


def _store_uploaded_image(session_id: str, item_id: str, content: bytes, extension: str):
    """Store the image via MinIO when available, falling back to local storage."""
    if MINIO_AVAILABLE:
        try:
            return upload_ocr_image(session_id, item_id, content, extension)
        except Exception as e:
            # Best effort: a MinIO failure must not fail the upload.
            print(f"MinIO upload failed, using local storage: {e}")
    return save_image_locally(session_id, item_id, content, extension)


@router.post("/sessions/{session_id}/upload")
async def upload_images(
    session_id: str,
    files: List[UploadFile] = File(...),
    run_ocr: bool = Form(True),
    metadata: Optional[str] = Form(None),
):
    """
    Upload images to a labeling session.

    Args:
        session_id: Session to add images to
        files: Image files to upload (PNG, JPG, PDF)
        run_ocr: Whether to run OCR immediately (default: True)
        metadata: Optional JSON metadata (subject, year, etc.). Input that is
            not a JSON object is stored as ``{"raw": <original text>}``.

    Returns:
        Dict with ``session_id``, ``uploaded_count`` and ``items``. An item is
        listed only if ``add_ocr_labeling_item`` reported success for it.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    session = await get_ocr_labeling_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    meta_dict = _parse_upload_metadata(metadata)
    ocr_model = session.get('ocr_model', 'llama3.2-vision:11b')
    results = []

    for file in files:
        content = await file.read()
        image_hash = compute_image_hash(content)
        item_id = str(uuid.uuid4())

        # Extension comes from the client-supplied filename; unknown or
        # missing extensions are stored as 'png'.
        extension = file.filename.rsplit('.', 1)[-1].lower() if file.filename else 'png'
        if extension not in _ALLOWED_UPLOAD_EXTENSIONS:
            extension = 'png'

        image_path = _store_uploaded_image(session_id, item_id, content, extension)

        ocr_text = None
        ocr_confidence = None

        # PDFs are stored but never sent to OCR here.
        if run_ocr and extension != 'pdf':
            ocr_text, ocr_confidence = await run_ocr_on_image(
                content,
                file.filename or f"{item_id}.{extension}",
                model=ocr_model
            )

        success = await add_ocr_labeling_item(
            item_id=item_id,
            session_id=session_id,
            image_path=image_path,
            image_hash=image_hash,
            ocr_text=ocr_text,
            ocr_confidence=ocr_confidence,
            # Record the model only when it actually produced text.
            ocr_model=ocr_model if ocr_text else None,
            metadata=meta_dict,
        )

        if success:
            results.append({
                "id": item_id,
                "filename": file.filename,
                "image_path": image_path,
                "image_hash": image_hash,
                "ocr_text": ocr_text,
                "ocr_confidence": ocr_confidence,
                "status": "pending",
            })

    return {
        "session_id": session_id,
        "uploaded_count": len(results),
        "items": results,
    }
|
|
|
|
|
|
@router.post("/export")
async def export_data(request: ExportRequest):
    """Export labeled data for training.

    Marks matching samples as exported through ``export_training_samples``
    and, when the training export service is available, also hands them to
    that service. A failure inside the export service is printed and
    otherwise ignored; the database samples are still returned.

    Returns:
        Dict with ``export_format``, ``batch_id``, ``exported_count`` and
        ``samples``; plus ``export_path`` and ``manifest_path`` when the export
        service produced a result, or ``message`` when nothing was found.
    """
    fmt = request.export_format
    requested_batch = request.batch_id

    rows = await export_training_samples(
        export_format=fmt,
        session_id=request.session_id,
        batch_id=requested_batch,
        exported_by="admin",
    )

    # Nothing to export: answer early with an explanatory message.
    if not rows:
        return {
            "export_format": fmt,
            "batch_id": requested_batch,
            "exported_count": 0,
            "samples": [],
            "message": "No labeled samples found to export",
        }

    result = None
    if TRAINING_EXPORT_AVAILABLE:
        try:
            service = get_training_export_service()
            converted = [
                TrainingSample(
                    id=row.get('id', row.get('item_id', '')),
                    image_path=row.get('image_path', ''),
                    ground_truth=row.get('ground_truth', ''),
                    ocr_text=row.get('ocr_text'),
                    ocr_confidence=row.get('ocr_confidence'),
                    metadata=row.get('metadata'),
                )
                for row in rows
            ]
            result = service.export(
                samples=converted,
                export_format=fmt,
                batch_id=requested_batch,
            )
        except Exception as exc:
            # Best effort: the database export already succeeded.
            print(f"Training export failed: {exc}")

    # Prefer the caller's batch id; otherwise use the one the service assigned.
    if requested_batch:
        batch = requested_batch
    elif result:
        batch = result.batch_id
    else:
        batch = None

    payload = {
        "export_format": fmt,
        "batch_id": batch,
        "exported_count": len(rows),
        "samples": rows,
    }

    if result:
        payload["export_path"] = result.export_path
        payload["manifest_path"] = result.manifest_path

    return payload
|
|
|
|
|
|
@router.get("/training-samples")
async def list_training_samples(
    export_format: Optional[str] = Query(None),
    batch_id: Optional[str] = Query(None),
    limit: int = Query(100, ge=1, le=1000),
):
    """Get exported training samples.

    Args:
        export_format: Optional filter passed through to ``get_training_samples``.
        batch_id: Optional filter passed through to ``get_training_samples``.
        limit: Maximum number of samples requested (1-1000, default 100).

    Returns:
        Dict with the number of samples (``count``) and the samples themselves.
    """
    found = await get_training_samples(
        export_format=export_format, batch_id=batch_id, limit=limit
    )
    return {"count": len(found), "samples": found}
|
|
|
|
|
|
@router.get("/images/{path:path}")
async def get_image(path: str):
    """Serve an image from local storage.

    Args:
        path: Location of the requested file relative to LOCAL_STORAGE_PATH.
            The value comes straight from the URL and is treated as untrusted.

    Returns:
        A FileResponse for the file. The media type is chosen from the file
        extension; unknown extensions use ``application/octet-stream``.

    Raises:
        HTTPException: 404 if the path resolves outside LOCAL_STORAGE_PATH,
            does not exist, or is not a regular file.
    """
    from fastapi.responses import FileResponse

    storage_root = os.path.realpath(LOCAL_STORAGE_PATH)
    # realpath collapses ".." segments and resolves symlinks. It also exposes
    # the case where `path` is absolute and os.path.join discards the root.
    filepath = os.path.realpath(os.path.join(storage_root, path))

    try:
        inside_root = os.path.commonpath([storage_root, filepath]) == storage_root
    except ValueError:
        # commonpath raises when the paths cannot be compared (e.g. different drives).
        inside_root = False

    # Answer 404 for escapes as well, so callers cannot probe the filesystem.
    # isfile (not exists) also rejects directories, which cannot be served.
    if not inside_root or not os.path.isfile(filepath):
        raise HTTPException(status_code=404, detail="Image not found")

    extension = filepath.rsplit('.', 1)[-1].lower()
    content_type = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
        'pdf': 'application/pdf',
    }.get(extension, 'application/octet-stream')

    return FileResponse(filepath, media_type=content_type)
|
|
|
|
|
|
@router.post("/run-ocr/{item_id}")
async def run_ocr_for_item(item_id: str):
    """Run OCR on an existing item.

    Loads the item's image (from local storage when its path starts with
    LOCAL_STORAGE_PATH, otherwise from MinIO when available), runs OCR with
    the session's model, and writes the result back to the item row when a
    database pool is available.

    Returns:
        Dict with ``item_id``, ``ocr_text``, ``ocr_confidence`` and ``ocr_model``.

    Raises:
        HTTPException: 404 if the item or its local image file is missing;
            500 if the image cannot be loaded or OCR yields no text.
    """
    record = await get_ocr_labeling_item(item_id)
    if not record:
        raise HTTPException(status_code=404, detail="Item not found")

    image_path = record['image_path']

    if image_path.startswith(LOCAL_STORAGE_PATH):
        if not os.path.exists(image_path):
            raise HTTPException(status_code=404, detail="Image file not found")
        with open(image_path, 'rb') as handle:
            image_bytes = handle.read()
    elif not MINIO_AVAILABLE:
        # Not a local path and no object storage to ask.
        raise HTTPException(status_code=500, detail="Cannot load image")
    else:
        try:
            image_bytes = get_ocr_image(image_path)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to load image: {e}")

    # Use the model configured on the owning session, or the default one.
    default_model = 'llama3.2-vision:11b'
    owning_session = await get_ocr_labeling_session(record['session_id'])
    if owning_session:
        ocr_model = owning_session.get('ocr_model', default_model)
    else:
        ocr_model = default_model

    ocr_text, ocr_confidence = await run_ocr_on_image(
        image_bytes,
        os.path.basename(image_path),
        model=ocr_model
    )

    if ocr_text is None:
        raise HTTPException(status_code=500, detail="OCR failed")

    from metrics_db import get_pool

    update_sql = """
                UPDATE ocr_labeling_items
                SET ocr_text = $2, ocr_confidence = $3, ocr_model = $4
                WHERE id = $1
                """
    db_pool = await get_pool()
    if db_pool:
        async with db_pool.acquire() as connection:
            await connection.execute(
                update_sql,
                item_id, ocr_text, ocr_confidence, ocr_model
            )

    return {
        "item_id": item_id,
        "ocr_text": ocr_text,
        "ocr_confidence": ocr_confidence,
        "ocr_model": ocr_model,
    }
|
|
|
|
|
|
@router.get("/exports")
async def list_exports(export_format: Optional[str] = Query(None)):
    """List all available training data exports.

    Args:
        export_format: Optional filter forwarded to the export service.

    Returns:
        Dict with ``count`` and ``exports``; or an empty ``exports`` list with
        a ``message`` when the training export service is not available.

    Raises:
        HTTPException: 500 if the export service fails while listing.
    """
    if TRAINING_EXPORT_AVAILABLE:
        try:
            service = get_training_export_service()
            available = service.list_exports(export_format=export_format)
            listing = {
                "count": len(available),
                "exports": available,
            }
            return listing
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to list exports: {e}")

    # Export service not installed: report that instead of failing.
    return {
        "exports": [],
        "message": "Training export service not available",
    }
|