Restructure: Move ocr_pipeline + labeling + crop into ocr/ package
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m25s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 21:51:43 +02:00
parent 59c400b9aa
commit 0504d22b8e
98 changed files with 10351 additions and 10152 deletions

View File

@@ -0,0 +1,6 @@
"""
OCR Labeling sub-package — labeling API, models, helpers, and route handlers.
Moved from backend/ flat modules (ocr_labeling_*.py).
Backward-compatible shim files remain at the old locations.
"""

View File

@@ -0,0 +1,81 @@
"""
OCR Labeling API — Barrel Re-export
Split into:
- models.py (formerly ocr_labeling_models.py) — Pydantic models and constants
- helpers.py (formerly ocr_labeling_helpers.py) — OCR wrappers, image storage, hashing
- routes.py (formerly ocr_labeling_routes.py) — Session/queue/labeling route handlers
- upload_routes.py (formerly ocr_labeling_upload_routes.py) — Upload, run-OCR, export route handlers
All public names are re-exported here for backward compatibility.
"""
# Models
from .models import ( # noqa: F401
LOCAL_STORAGE_PATH,
SessionCreate,
SessionResponse,
ItemResponse,
ConfirmRequest,
CorrectRequest,
SkipRequest,
ExportRequest,
StatsResponse,
)
# Helpers
from .helpers import ( # noqa: F401
VISION_OCR_AVAILABLE,
PADDLEOCR_AVAILABLE,
TROCR_AVAILABLE,
DONUT_AVAILABLE,
MINIO_AVAILABLE,
TRAINING_EXPORT_AVAILABLE,
compute_image_hash,
run_ocr_on_image,
run_vision_ocr_wrapper,
run_paddleocr_wrapper,
run_trocr_wrapper,
run_donut_wrapper,
save_image_locally,
get_image_url,
)
# Conditional re-exports from helpers' optional imports
try:
from minio_storage import upload_ocr_image, get_ocr_image, MINIO_BUCKET # noqa: F401
except ImportError:
pass
try:
from training_export_service import ( # noqa: F401
TrainingExportService,
TrainingSample,
get_training_export_service,
)
except ImportError:
pass
try:
from hybrid_vocab_extractor import run_paddle_ocr # noqa: F401
except ImportError:
pass
try:
from services.trocr_service import run_trocr_ocr # noqa: F401
except ImportError:
pass
try:
from services.donut_ocr_service import run_donut_ocr # noqa: F401
except ImportError:
pass
try:
from vision_ocr_service import get_vision_ocr_service, VisionOCRService # noqa: F401
except ImportError:
pass
# Routes (router is the main export for app.include_router)
from .routes import router # noqa: F401
from .upload_routes import router as upload_router # noqa: F401

View File

@@ -0,0 +1,205 @@
"""
OCR Labeling - Helper Functions and OCR Wrappers
Extracted from ocr_labeling_api.py to keep files under 500 LOC.
PRIVACY (DATENSCHUTZ):
- All processing happens locally (Mac Mini with Ollama)
- No data is sent to external servers
"""
import os
import hashlib
from .models import LOCAL_STORAGE_PATH
# Optional dependency detection.
# Each backend below is imported on a best-effort basis. The matching
# *_AVAILABLE flag records the outcome and is checked by the wrapper
# functions in this module before they touch the imported names.
# A missing backend only prints a warning; it never aborts the import
# of this module.

# Try to import Vision OCR service
try:
    import sys
    # NOTE: this mutates sys.path for the whole process. The services
    # directory is inserted at position 0, so it takes precedence over
    # every other entry when resolving `vision_ocr_service`.
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'backend', 'klausur', 'services'))
    from vision_ocr_service import get_vision_ocr_service, VisionOCRService
    VISION_OCR_AVAILABLE = True
except ImportError:
    VISION_OCR_AVAILABLE = False
    print("Warning: Vision OCR service not available")
# Try to import PaddleOCR from hybrid_vocab_extractor
try:
    from hybrid_vocab_extractor import run_paddle_ocr
    PADDLEOCR_AVAILABLE = True
except ImportError:
    PADDLEOCR_AVAILABLE = False
    print("Warning: PaddleOCR not available")
# Try to import TrOCR service
try:
    from services.trocr_service import run_trocr_ocr
    TROCR_AVAILABLE = True
except ImportError:
    TROCR_AVAILABLE = False
    print("Warning: TrOCR service not available")
# Try to import Donut service
try:
    from services.donut_ocr_service import run_donut_ocr
    DONUT_AVAILABLE = True
except ImportError:
    DONUT_AVAILABLE = False
    print("Warning: Donut OCR service not available")
# Try to import MinIO storage
try:
    from minio_storage import upload_ocr_image, get_ocr_image, MINIO_BUCKET
    MINIO_AVAILABLE = True
except ImportError:
    MINIO_AVAILABLE = False
    print("Warning: MinIO storage not available, using local storage")
# Try to import Training Export Service
try:
    from training_export_service import (
        TrainingExportService,
        TrainingSample,
        get_training_export_service,
    )
    TRAINING_EXPORT_AVAILABLE = True
except ImportError:
    TRAINING_EXPORT_AVAILABLE = False
    print("Warning: Training export service not available")
# =============================================================================
# Helper Functions
# =============================================================================
def compute_image_hash(image_data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of the raw image bytes."""
    digest = hashlib.sha256()
    digest.update(image_data)
    return digest.hexdigest()
async def run_ocr_on_image(image_data: bytes, filename: str, model: str = "llama3.2-vision:11b") -> tuple:
    """
    Dispatch an image to the OCR backend selected by ``model``.

    Recognized model names:
    - "paddleocr": handled by run_paddleocr_wrapper
    - "donut": handled by run_donut_wrapper
    - "trocr": handled by run_trocr_wrapper
    - anything else (including the default "llama3.2-vision:11b"):
      handled by run_vision_ocr_wrapper

    Args:
        image_data: Raw bytes of the image.
        filename: File name forwarded to the selected wrapper.
        model: Name of the OCR model to use.

    Returns:
        Tuple of (ocr_text, confidence) as produced by the selected wrapper.
    """
    print(f"Running OCR with model: {model}")
    # Backends with a dedicated wrapper; every other name goes to the Vision LLM.
    dedicated_wrappers = {
        "paddleocr": run_paddleocr_wrapper,
        "donut": run_donut_wrapper,
        "trocr": run_trocr_wrapper,
    }
    selected_wrapper = dedicated_wrappers.get(model, run_vision_ocr_wrapper)
    return await selected_wrapper(image_data, filename)
async def run_vision_ocr_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Extract text from an image with the Vision OCR service.

    The service is always asked to treat the image as handwriting.

    Returns:
        (text, confidence) from the service result, or (None, 0.0) when the
        service could not be imported, reports itself unavailable, or raises.
    """
    no_result = (None, 0.0)
    if not VISION_OCR_AVAILABLE:
        print("Vision OCR service not available")
        return no_result
    try:
        ocr_service = get_vision_ocr_service()
        service_ready = await ocr_service.is_available()
        if service_ready:
            extraction = await ocr_service.extract_text(
                image_data,
                filename=filename,
                is_handwriting=True,
            )
            return extraction.text, extraction.confidence
        print("Vision OCR service not available (is_available check failed)")
    except Exception as exc:
        # Best effort: any failure is reported as "no result" to the caller.
        print(f"Vision OCR failed: {exc}")
    return no_result
async def run_paddleocr_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run PaddleOCR (via hybrid_vocab_extractor) on an image.

    Falls back to run_vision_ocr_wrapper when PaddleOCR could not be imported
    or raises. Empty PaddleOCR text is NOT retried with the fallback.

    Returns:
        (raw_text, confidence) where confidence is the mean of the region
        confidences, or 0.5 when no regions were returned; (None, 0.0) when
        PaddleOCR produced no text; otherwise the fallback wrapper's result.
    """
    if PADDLEOCR_AVAILABLE:
        try:
            # run_paddle_ocr yields a (regions, raw_text) pair
            detected_regions, full_text = run_paddle_ocr(image_data)
            if not full_text:
                print("PaddleOCR returned empty text")
                return None, 0.0
            mean_confidence = (
                sum(region.confidence for region in detected_regions) / len(detected_regions)
                if detected_regions
                else 0.5
            )
            return full_text, mean_confidence
        except Exception as exc:
            print(f"PaddleOCR failed: {exc}, falling back to Vision OCR")
    else:
        print("PaddleOCR not available, falling back to Vision OCR")
    return await run_vision_ocr_wrapper(image_data, filename)
async def run_trocr_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run TrOCR on an image.

    Falls back to run_vision_ocr_wrapper when the TrOCR service could not be
    imported or raises.

    Returns:
        The (text, confidence) pair from run_trocr_ocr, or the fallback
        wrapper's result.
    """
    if TROCR_AVAILABLE:
        try:
            recognized_text, score = await run_trocr_ocr(image_data)
        except Exception as exc:
            print(f"TrOCR failed: {exc}, falling back to Vision OCR")
        else:
            return recognized_text, score
    else:
        print("TrOCR not available, falling back to Vision OCR")
    return await run_vision_ocr_wrapper(image_data, filename)
async def run_donut_wrapper(image_data: bytes, filename: str) -> tuple:
    """
    Run Donut OCR on an image.

    Falls back to run_vision_ocr_wrapper when the Donut service could not be
    imported or raises.

    Returns:
        The (text, confidence) pair from run_donut_ocr, or the fallback
        wrapper's result.
    """
    if DONUT_AVAILABLE:
        try:
            recognized_text, score = await run_donut_ocr(image_data)
        except Exception as exc:
            print(f"Donut OCR failed: {exc}, falling back to Vision OCR")
        else:
            return recognized_text, score
    else:
        print("Donut not available, falling back to Vision OCR")
    return await run_vision_ocr_wrapper(image_data, filename)
def save_image_locally(session_id: str, item_id: str, image_data: bytes, extension: str = "png") -> str:
    """
    Write image bytes to ``<LOCAL_STORAGE_PATH>/<session_id>/<item_id>.<extension>``.

    The per-session directory is created if it does not exist yet.

    Returns:
        The path of the file that was written.
    """
    target_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, f"{item_id}.{extension}")
    with open(target_path, 'wb') as out_file:
        out_file.write(image_data)
    return target_path
def get_image_url(image_path: str) -> str:
    """
    Translate a stored image path into something the frontend can request.

    Paths under LOCAL_STORAGE_PATH become
    ``/api/v1/ocr-label/images/<path relative to the storage root>``.
    Any other value (e.g. a MinIO URL or key) is returned unchanged.
    """
    if not image_path.startswith(LOCAL_STORAGE_PATH):
        # Not a local file: the stored value is handed back as-is.
        return image_path
    path_below_root = image_path[len(LOCAL_STORAGE_PATH):].lstrip('/')
    return f"/api/v1/ocr-label/images/{path_below_root}"

View File

@@ -0,0 +1,86 @@
"""
OCR Labeling - Pydantic Models and Constants
Extracted from ocr_labeling_api.py to keep files under 500 LOC.
"""
import os
from pydantic import BaseModel
from typing import Optional, Dict
from datetime import datetime
# Root directory for locally stored OCR labeling images (used as the fallback
# when MinIO is not available). Read once at import time from the
# OCR_STORAGE_PATH environment variable; defaults to "/app/ocr-labeling".
LOCAL_STORAGE_PATH = os.getenv("OCR_STORAGE_PATH", "/app/ocr-labeling")
# =============================================================================
# Pydantic Models
# =============================================================================
class SessionCreate(BaseModel):
    # Request body of the session-creation endpoint (POST /sessions).
    name: str
    source_type: str = "klausur"  # klausur, handwriting_sample, scan
    description: Optional[str] = None
    # Name of the OCR model for this session; may be sent as null explicitly.
    ocr_model: Optional[str] = "llama3.2-vision:11b"
class SessionResponse(BaseModel):
    # Response shape of the session endpoints (create, list, get).
    id: str
    name: str
    source_type: str
    description: Optional[str]
    ocr_model: Optional[str]
    # Per-session item counters. The route handlers fill in 0 when a counter
    # is absent from the stored session data.
    total_items: int
    labeled_items: int
    confirmed_items: int
    corrected_items: int
    skipped_items: int
    created_at: datetime
class ItemResponse(BaseModel):
    # Response shape of the queue and single-item endpoints.
    id: str
    session_id: str
    session_name: str
    # Stored location of the image (local file path or storage key).
    image_path: str
    # Value derived from image_path by get_image_url in the route handlers.
    image_url: Optional[str]
    ocr_text: Optional[str]
    ocr_confidence: Optional[float]
    ground_truth: Optional[str]
    # The route handlers default this to "pending" when it is not stored.
    status: str
    metadata: Optional[Dict]
    created_at: datetime
class ConfirmRequest(BaseModel):
    # Request body of POST /confirm: marks an item's OCR text as correct.
    item_id: str
    # Optional labeling duration in whole seconds, forwarded to the database layer.
    label_time_seconds: Optional[int] = None
class CorrectRequest(BaseModel):
    # Request body of POST /correct: supplies corrected text for an item.
    item_id: str
    # The corrected text to store as ground truth.
    ground_truth: str
    # Optional labeling duration in whole seconds, forwarded to the database layer.
    label_time_seconds: Optional[int] = None
class SkipRequest(BaseModel):
    # Request body of POST /skip: identifies the item to skip.
    item_id: str
class ExportRequest(BaseModel):
    # Request body of POST /export.
    export_format: str = "generic"  # generic, trocr, llama_vision
    # Optional filters/identifiers forwarded unchanged to the export functions.
    session_id: Optional[str] = None
    batch_id: Optional[str] = None
class StatsResponse(BaseModel):
    # Shape of labeling statistics.
    # NOTE(review): the GET /stats handler in routes.py returns the database
    # result directly without declaring this as its response_model; confirm
    # whether this model is still used elsewhere.
    total_sessions: Optional[int] = None
    total_items: int
    labeled_items: int
    confirmed_items: int
    corrected_items: int
    pending_items: int
    exportable_items: Optional[int] = None
    accuracy_rate: float
    avg_label_time_seconds: Optional[float] = None

View File

@@ -0,0 +1,241 @@
"""
OCR Labeling - Session and Labeling Route Handlers
Extracted from ocr_labeling_api.py to keep files under 500 LOC.
Endpoints:
- POST /sessions - Create labeling session
- GET /sessions - List sessions
- GET /sessions/{id} - Get session
- GET /queue - Get labeling queue
- GET /items/{id} - Get item
- POST /confirm - Confirm OCR
- POST /correct - Correct ground truth
- POST /skip - Skip item
- GET /stats - Get statistics
"""
from fastapi import APIRouter, HTTPException, Query
from typing import Optional, List
from datetime import datetime
import uuid
from metrics_db import (
create_ocr_labeling_session,
get_ocr_labeling_sessions,
get_ocr_labeling_session,
get_ocr_labeling_queue,
get_ocr_labeling_item,
confirm_ocr_label,
correct_ocr_label,
skip_ocr_item,
get_ocr_labeling_stats,
)
from .models import (
SessionCreate, SessionResponse, ItemResponse,
ConfirmRequest, CorrectRequest, SkipRequest,
)
from .helpers import get_image_url
# Router for the session, queue, labeling-action and stats endpoints.
# All routes registered on it are served under /api/v1/ocr-label.
router = APIRouter(prefix="/api/v1/ocr-label", tags=["OCR Labeling"])
# =============================================================================
# Session Endpoints
# =============================================================================
@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """
    Create a new OCR labeling session.

    A UUID4 string is generated as the session id and the session is stored
    through create_ocr_labeling_session.

    Returns:
        SessionResponse echoing the submitted fields, with all item counters
        at zero and created_at set to the current UTC time.

    Raises:
        HTTPException 500: when the database layer reports failure.
    """
    new_session_id = str(uuid.uuid4())
    submitted = {
        "name": session.name,
        "source_type": session.source_type,
        "description": session.description,
        "ocr_model": session.ocr_model,
    }
    created = await create_ocr_labeling_session(session_id=new_session_id, **submitted)
    if not created:
        raise HTTPException(status_code=500, detail="Failed to create session")
    # A session that was just created has no items yet.
    return SessionResponse(
        id=new_session_id,
        **submitted,
        total_items=0,
        labeled_items=0,
        confirmed_items=0,
        corrected_items=0,
        skipped_items=0,
        created_at=datetime.utcnow(),
    )
@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """
    List OCR labeling sessions.

    Args:
        limit: Maximum number of sessions to return (1-100, default 50).

    Returns:
        One SessionResponse per stored session. Missing counters default to 0
        and a missing created_at defaults to the current UTC time.
    """
    stored_sessions = await get_ocr_labeling_sessions(limit=limit)
    responses = []
    for row in stored_sessions:
        responses.append(
            SessionResponse(
                id=row['id'],
                name=row['name'],
                source_type=row['source_type'],
                description=row.get('description'),
                ocr_model=row.get('ocr_model'),
                total_items=row.get('total_items', 0),
                labeled_items=row.get('labeled_items', 0),
                confirmed_items=row.get('confirmed_items', 0),
                corrected_items=row.get('corrected_items', 0),
                skipped_items=row.get('skipped_items', 0),
                created_at=row.get('created_at', datetime.utcnow()),
            )
        )
    return responses
@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """
    Fetch a single OCR labeling session by id.

    Returns:
        SessionResponse for the session. Missing counters default to 0 and a
        missing created_at defaults to the current UTC time.

    Raises:
        HTTPException 404: when no session with this id is found.
    """
    row = await get_ocr_labeling_session(session_id)
    if not row:
        raise HTTPException(status_code=404, detail="Session not found")
    counter_names = (
        'total_items',
        'labeled_items',
        'confirmed_items',
        'corrected_items',
        'skipped_items',
    )
    counters = {counter: row.get(counter, 0) for counter in counter_names}
    return SessionResponse(
        id=row['id'],
        name=row['name'],
        source_type=row['source_type'],
        description=row.get('description'),
        ocr_model=row.get('ocr_model'),
        created_at=row.get('created_at', datetime.utcnow()),
        **counters,
    )
# =============================================================================
# Queue and Item Endpoints
# =============================================================================
@router.get("/queue", response_model=List[ItemResponse])
async def get_labeling_queue(
    session_id: Optional[str] = Query(None),
    status: str = Query("pending"),
    limit: int = Query(10, ge=1, le=50),
):
    """
    Return items from the labeling queue.

    Args:
        session_id: Optional session filter, passed to the database layer.
        status: Item status to select (default "pending").
        limit: Maximum number of items (1-50, default 10).

    Returns:
        One ItemResponse per queued item, with image_url derived from the
        stored image_path.
    """
    queued = await get_ocr_labeling_queue(
        session_id=session_id,
        status=status,
        limit=limit,
    )
    responses = []
    for entry in queued:
        responses.append(
            ItemResponse(
                id=entry['id'],
                session_id=entry['session_id'],
                session_name=entry.get('session_name', ''),
                image_path=entry['image_path'],
                image_url=get_image_url(entry['image_path']),
                ocr_text=entry.get('ocr_text'),
                ocr_confidence=entry.get('ocr_confidence'),
                ground_truth=entry.get('ground_truth'),
                status=entry.get('status', 'pending'),
                metadata=entry.get('metadata'),
                created_at=entry.get('created_at', datetime.utcnow()),
            )
        )
    return responses
@router.get("/items/{item_id}", response_model=ItemResponse)
async def get_item(item_id: str):
    """
    Fetch a single labeling item by id.

    Returns:
        ItemResponse for the item, with image_url derived from the stored
        image_path.

    Raises:
        HTTPException 404: when no item with this id is found.
    """
    record = await get_ocr_labeling_item(item_id)
    if not record:
        raise HTTPException(status_code=404, detail="Item not found")
    # Fields that are simply None when the record does not carry them.
    nullable_fields = {
        key: record.get(key)
        for key in ('ocr_text', 'ocr_confidence', 'ground_truth', 'metadata')
    }
    return ItemResponse(
        id=record['id'],
        session_id=record['session_id'],
        session_name=record.get('session_name', ''),
        image_path=record['image_path'],
        image_url=get_image_url(record['image_path']),
        status=record.get('status', 'pending'),
        created_at=record.get('created_at', datetime.utcnow()),
        **nullable_fields,
    )
# =============================================================================
# Labeling Action Endpoints
# =============================================================================
@router.post("/confirm")
async def confirm_item(request: ConfirmRequest):
    """
    Mark an item's OCR text as correct.

    The label is always attributed to the fixed user name "admin".

    Raises:
        HTTPException 400: when the database layer reports failure.
    """
    confirmed = await confirm_ocr_label(
        item_id=request.item_id,
        labeled_by="admin",
        label_time_seconds=request.label_time_seconds,
    )
    if confirmed:
        return {"status": "confirmed", "item_id": request.item_id}
    raise HTTPException(status_code=400, detail="Failed to confirm item")
@router.post("/correct")
async def correct_item(request: CorrectRequest):
    """
    Store corrected ground-truth text for an item.

    The label is always attributed to the fixed user name "admin".

    Raises:
        HTTPException 400: when the database layer reports failure.
    """
    corrected = await correct_ocr_label(
        item_id=request.item_id,
        ground_truth=request.ground_truth,
        labeled_by="admin",
        label_time_seconds=request.label_time_seconds,
    )
    if corrected:
        return {"status": "corrected", "item_id": request.item_id}
    raise HTTPException(status_code=400, detail="Failed to correct item")
@router.post("/skip")
async def skip_item(request: SkipRequest):
    """
    Skip an item (for example because the image is unusable).

    The action is always attributed to the fixed user name "admin".

    Raises:
        HTTPException 400: when the database layer reports failure.
    """
    skipped = await skip_ocr_item(
        item_id=request.item_id,
        labeled_by="admin",
    )
    if skipped:
        return {"status": "skipped", "item_id": request.item_id}
    raise HTTPException(status_code=400, detail="Failed to skip item")
@router.get("/stats")
async def get_stats(session_id: Optional[str] = Query(None)):
    """
    Return labeling statistics, optionally restricted to one session.

    The statistics from the database layer are returned unchanged.

    Raises:
        HTTPException 500: when the statistics contain an "error" entry; its
            value is used as the error detail.
    """
    statistics = await get_ocr_labeling_stats(session_id=session_id)
    if "error" not in statistics:
        return statistics
    raise HTTPException(status_code=500, detail=statistics["error"])

View File

@@ -0,0 +1,313 @@
"""
OCR Labeling - Upload, Run-OCR, and Export Route Handlers
Extracted from ocr_labeling_routes.py to keep files under 500 LOC.
Endpoints:
- POST /sessions/{id}/upload - Upload images for labeling
- POST /run-ocr/{item_id} - Run OCR on existing item
- POST /export - Export training data
- GET /training-samples - List training samples
- GET /images/{path} - Serve images from local storage
- GET /exports - List exports
"""
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Query
from typing import Optional, List
import uuid
import os
from metrics_db import (
get_ocr_labeling_session,
add_ocr_labeling_item,
get_ocr_labeling_item,
export_training_samples,
get_training_samples,
)
from .models import (
ExportRequest,
LOCAL_STORAGE_PATH,
)
from .helpers import (
compute_image_hash, run_ocr_on_image,
save_image_locally,
MINIO_AVAILABLE, TRAINING_EXPORT_AVAILABLE,
)
# Conditional imports
try:
from minio_storage import upload_ocr_image, get_ocr_image
except ImportError:
pass
try:
from training_export_service import TrainingSample, get_training_export_service
except ImportError:
pass
# Router for the upload, run-OCR, export and image-serving endpoints.
# It uses the same /api/v1/ocr-label prefix as the router in routes.py and is
# re-exported by the package's barrel module under the name `upload_router`.
router = APIRouter(prefix="/api/v1/ocr-label", tags=["OCR Labeling"])
@router.post("/sessions/{session_id}/upload")
async def upload_images(
    session_id: str,
    files: List[UploadFile] = File(...),
    run_ocr: bool = Form(True),
    metadata: Optional[str] = Form(None),
):
    """
    Upload images to a labeling session.

    Each file is hashed, stored (MinIO when available, otherwise local disk),
    optionally run through OCR, and registered as a labeling item.

    Args:
        session_id: Session to add images to
        files: Image files to upload (PNG, JPG, PDF)
        run_ocr: Whether to run OCR immediately (default: True). PDFs are
            never sent to OCR.
        metadata: Optional JSON metadata (subject, year, etc.). Text that is
            not a JSON object is stored as {"raw": <text>}.

    Returns:
        Dict with "session_id", "uploaded_count" and "items" (one entry per
        item that was stored successfully).

    Raises:
        HTTPException 404: when the session does not exist.
    """
    import json

    session = await get_ocr_labeling_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    meta_dict = None
    if metadata:
        try:
            meta_dict = json.loads(metadata)
        except json.JSONDecodeError:
            meta_dict = {"raw": metadata}
        else:
            # Valid JSON that is not an object (list, number, string, ...)
            # does not fit the dict-typed `metadata` field of ItemResponse,
            # so it is kept as raw text like unparseable input.
            if not isinstance(meta_dict, dict):
                meta_dict = {"raw": metadata}

    results = []
    # `or` (rather than a .get default) also covers a session whose stored
    # ocr_model is None, so the model that actually runs is the one recorded.
    ocr_model = session.get('ocr_model') or 'llama3.2-vision:11b'
    allowed_extensions = ('png', 'jpg', 'jpeg', 'pdf')

    for file in files:
        content = await file.read()
        image_hash = compute_image_hash(content)
        item_id = str(uuid.uuid4())

        # Unknown or missing extensions are stored as .png
        extension = file.filename.split('.')[-1].lower() if file.filename else 'png'
        if extension not in allowed_extensions:
            extension = 'png'

        if MINIO_AVAILABLE:
            try:
                image_path = upload_ocr_image(session_id, item_id, content, extension)
            except Exception as e:
                print(f"MinIO upload failed, using local storage: {e}")
                image_path = save_image_locally(session_id, item_id, content, extension)
        else:
            image_path = save_image_locally(session_id, item_id, content, extension)

        ocr_text = None
        ocr_confidence = None
        if run_ocr and extension != 'pdf':
            ocr_text, ocr_confidence = await run_ocr_on_image(
                content,
                file.filename or f"{item_id}.{extension}",
                model=ocr_model,
            )

        success = await add_ocr_labeling_item(
            item_id=item_id,
            session_id=session_id,
            image_path=image_path,
            image_hash=image_hash,
            ocr_text=ocr_text,
            ocr_confidence=ocr_confidence,
            # The model is only recorded when OCR actually produced text.
            ocr_model=ocr_model if ocr_text else None,
            metadata=meta_dict,
        )
        if success:
            results.append({
                "id": item_id,
                "filename": file.filename,
                "image_path": image_path,
                "image_hash": image_hash,
                "ocr_text": ocr_text,
                "ocr_confidence": ocr_confidence,
                "status": "pending",
            })
        else:
            # The item is left out of the response; leave a trace so the
            # already-stored image can be found.
            print(f"Failed to register labeling item {item_id} ({file.filename}) stored at {image_path}")

    return {
        "session_id": session_id,
        "uploaded_count": len(results),
        "items": results,
    }
@router.post("/export")
async def export_data(request: ExportRequest):
    """
    Export labeled data for training.

    The samples are first exported through the database layer. When the
    training export service is available they are additionally handed to it;
    a failure of that second step is only printed and does not fail the
    request.

    Returns:
        Dict with "export_format", "batch_id", "exported_count" and "samples".
        "export_path" and "manifest_path" are added when the export service
        produced a result. When nothing is exportable, a "message" entry is
        returned with an empty sample list instead.
    """
    db_samples = await export_training_samples(
        export_format=request.export_format,
        session_id=request.session_id,
        batch_id=request.batch_id,
        exported_by="admin",
    )
    if not db_samples:
        return {
            "export_format": request.export_format,
            "batch_id": request.batch_id,
            "exported_count": 0,
            "samples": [],
            "message": "No labeled samples found to export",
        }

    service_result = None
    if TRAINING_EXPORT_AVAILABLE:
        try:
            exporter = get_training_export_service()
            prepared_samples = [
                TrainingSample(
                    id=row.get('id', row.get('item_id', '')),
                    image_path=row.get('image_path', ''),
                    ground_truth=row.get('ground_truth', ''),
                    ocr_text=row.get('ocr_text'),
                    ocr_confidence=row.get('ocr_confidence'),
                    metadata=row.get('metadata'),
                )
                for row in db_samples
            ]
            service_result = exporter.export(
                samples=prepared_samples,
                export_format=request.export_format,
                batch_id=request.batch_id,
            )
        except Exception as exc:
            print(f"Training export failed: {exc}")

    # Prefer the batch id from the request; otherwise use the one assigned by
    # the export service, if it ran.
    effective_batch_id = request.batch_id
    if not effective_batch_id:
        effective_batch_id = service_result.batch_id if service_result else None

    payload = {
        "export_format": request.export_format,
        "batch_id": effective_batch_id,
        "exported_count": len(db_samples),
        "samples": db_samples,
    }
    if service_result:
        payload["export_path"] = service_result.export_path
        payload["manifest_path"] = service_result.manifest_path
    return payload
@router.get("/training-samples")
async def list_training_samples(
    export_format: Optional[str] = Query(None),
    batch_id: Optional[str] = Query(None),
    limit: int = Query(100, ge=1, le=1000),
):
    """
    Return previously exported training samples.

    Args:
        export_format: Optional export format filter.
        batch_id: Optional batch filter.
        limit: Maximum number of samples (1-1000, default 100).

    Returns:
        Dict with "count" and the matching "samples".
    """
    matching_samples = await get_training_samples(
        export_format=export_format,
        batch_id=batch_id,
        limit=limit,
    )
    sample_count = len(matching_samples)
    return {"count": sample_count, "samples": matching_samples}
@router.get("/images/{path:path}")
async def get_image(path: str):
    """
    Serve an image from local storage.

    Args:
        path: Location of the file relative to LOCAL_STORAGE_PATH. It comes
            straight from the request URL and is treated as untrusted.

    Returns:
        FileResponse with a media type chosen from the file extension
        (png, jpg/jpeg, pdf; anything else is application/octet-stream).

    Raises:
        HTTPException 404: when the path resolves outside the storage root,
            cannot be resolved, or does not point to a regular file.
    """
    from fastapi.responses import FileResponse

    try:
        # Resolve symlinks and ".." segments before checking containment, so
        # a crafted path such as "../../etc/passwd" cannot leave the root.
        storage_root = os.path.realpath(LOCAL_STORAGE_PATH)
        filepath = os.path.realpath(os.path.join(storage_root, path))
        inside_root = os.path.commonpath([storage_root, filepath]) == storage_root
    except ValueError:
        # Raised e.g. for embedded NUL bytes or paths on different drives.
        inside_root = False

    # isfile (not exists) so a directory request is answered with 404 instead
    # of failing later while the response is being sent.
    if not inside_root or not os.path.isfile(filepath):
        raise HTTPException(status_code=404, detail="Image not found")

    extension = filepath.split('.')[-1].lower()
    content_type = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
        'pdf': 'application/pdf',
    }.get(extension, 'application/octet-stream')
    return FileResponse(filepath, media_type=content_type)
@router.post("/run-ocr/{item_id}")
async def run_ocr_for_item(item_id: str):
    """
    Run OCR on an existing item and store the result on the item.

    The image is read from local storage when its path starts with
    LOCAL_STORAGE_PATH, otherwise from MinIO (when available). The OCR model
    is taken from the item's session.

    Returns:
        Dict with "item_id", "ocr_text", "ocr_confidence" and "ocr_model".

    Raises:
        HTTPException 404: when the item or its local image file is missing.
        HTTPException 500: when the image cannot be loaded or OCR yields no
            text.
    """
    item = await get_ocr_labeling_item(item_id)
    if not item:
        raise HTTPException(status_code=404, detail="Item not found")

    image_path = item['image_path']
    if image_path.startswith(LOCAL_STORAGE_PATH):
        # Open directly instead of checking existence first: this avoids the
        # check-then-open race and also maps a directory path to 404.
        try:
            with open(image_path, 'rb') as f:
                image_data = f.read()
        except (FileNotFoundError, IsADirectoryError) as e:
            raise HTTPException(status_code=404, detail="Image file not found") from e
    elif MINIO_AVAILABLE:
        try:
            image_data = get_ocr_image(image_path)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to load image: {e}") from e
    else:
        raise HTTPException(status_code=500, detail="Cannot load image")

    session = await get_ocr_labeling_session(item['session_id'])
    # `or` also covers a session whose stored ocr_model is None, which a
    # .get default would pass through unchanged.
    ocr_model = (session.get('ocr_model') if session else None) or 'llama3.2-vision:11b'

    ocr_text, ocr_confidence = await run_ocr_on_image(
        image_data,
        os.path.basename(image_path),
        model=ocr_model,
    )
    if ocr_text is None:
        raise HTTPException(status_code=500, detail="OCR failed")

    from metrics_db import get_pool
    pool = await get_pool()
    if pool:
        async with pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE ocr_labeling_items
                SET ocr_text = $2, ocr_confidence = $3, ocr_model = $4
                WHERE id = $1
                """,
                item_id, ocr_text, ocr_confidence, ocr_model
            )
    else:
        # The OCR result is still returned to the caller, but make the
        # missing persistence visible instead of skipping it silently.
        print(f"Warning: no database pool available, OCR result for item {item_id} was not saved")

    return {
        "item_id": item_id,
        "ocr_text": ocr_text,
        "ocr_confidence": ocr_confidence,
        "ocr_model": ocr_model,
    }
@router.get("/exports")
async def list_exports(export_format: Optional[str] = Query(None)):
    """
    List the available training data exports.

    Args:
        export_format: Optional format filter passed to the export service.

    Returns:
        Dict with "count" and "exports"; or an empty "exports" list plus a
        "message" when the training export service is not available.

    Raises:
        HTTPException 500: when the export service raises.
    """
    if TRAINING_EXPORT_AVAILABLE:
        try:
            exporter = get_training_export_service()
            available_exports = exporter.list_exports(export_format=export_format)
            return {
                "count": len(available_exports),
                "exports": available_exports,
            }
        except Exception as exc:
            raise HTTPException(status_code=500, detail=f"Failed to list exports: {exc}")
    return {
        "exports": [],
        "message": "Training export service not available",
    }