Deleted pages: - /ai/model-management (mock data only, no real backend) - /ai/ocr-compare (old /vocab/ backend, replaced by ocr-kombi) - /ai/ocr-pipeline (minimal session browser, redundant) - /ai/ocr-overlay (legacy monolith, redundant) - /ai/gpu (vast.ai GPU management, no longer used) - /infrastructure/gpu (same) - /communication/video-chat (moved to core) - /communication/matrix (moved to core) Deleted backends: - backend-lehrer/infra/vast_client.py + vast_power.py - backend-lehrer/meetings_api.py + jitsi_api.py - website/app/api/admin/gpu/ - edu-search-service/scripts/vast_ai_extractor.py Total: ~7,800 LOC removed. All code preserved in git history. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
120 lines
3.6 KiB
Python
120 lines
3.6 KiB
Python
"""
|
|
LightOnOCR-2-1B Service
|
|
|
|
End-to-end VLM OCR fuer gedruckten und gemischten Text.
|
|
1B Parameter, Apple MPS-faehig (M-Serie).
|
|
|
|
Modell: lightonai/LightOnOCR-2-1B
|
|
Lizenz: Apache 2.0
|
|
Quelle: https://huggingface.co/lightonai/LightOnOCR-2-1B
|
|
|
|
Unterstuetzte Dokumenttypen:
|
|
- Buchseiten, Vokabelseiten
|
|
- Arbeitsblaetter, Klausuren
|
|
- Gemischt gedruckt/handschriftlich
|
|
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
import io
|
|
import logging
|
|
import os
|
|
from typing import Optional, Tuple
|
|
|
|
logger = logging.getLogger(__name__)

# Hugging Face model id; overridable via the LIGHTON_OCR_MODEL env var.
LIGHTON_MODEL_ID = os.getenv("LIGHTON_OCR_MODEL", "lightonai/LightOnOCR-2-1B")

# Lazily-initialized module-level singletons, populated on first use.
_lighton_model = None
_lighton_processor = None
# Tri-state probe cache: None = not yet checked, else the cached True/False.
_lighton_available: Optional[bool] = None


def _check_lighton_available() -> bool:
    """Check if LightOnOCR dependencies (transformers, torch) are available.

    The probe is attempted at most once per process; the outcome is cached
    in the module-level ``_lighton_available`` flag.
    """
    global _lighton_available
    if _lighton_available is None:
        try:
            from transformers import AutoModelForImageTextToText, AutoProcessor  # noqa: F401
            import torch  # noqa: F401
        except ImportError as e:
            logger.warning(f"LightOnOCR deps not available: {e}")
            _lighton_available = False
        else:
            _lighton_available = True
    return _lighton_available
|
|
|
|
|
|
def get_lighton_model() -> Tuple:
    """
    Lazy-load LightOnOCR-2-1B processor and model.

    Returns (processor, model) or (None, None) on failure.
    Device priority: MPS (Apple Silicon) > CUDA > CPU.
    """
    global _lighton_model, _lighton_processor

    # Fast path: already loaded earlier in this process.
    if _lighton_model is not None:
        return _lighton_processor, _lighton_model
    # Without transformers/torch there is nothing we can do.
    if not _check_lighton_available():
        return None, None

    try:
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor

        # Best available accelerator, falling back to CPU.
        device = (
            "mps" if torch.backends.mps.is_available()
            else "cuda" if torch.cuda.is_available()
            else "cpu"
        )
        dtype = torch.bfloat16

        logger.info(f"Loading LightOnOCR-2-1B on {device} ({dtype}) from {LIGHTON_MODEL_ID} ...")
        _lighton_processor = AutoProcessor.from_pretrained(LIGHTON_MODEL_ID)
        # eval() returns the module itself, so the chain both moves the
        # weights to the target device and switches to inference mode.
        _lighton_model = (
            AutoModelForImageTextToText.from_pretrained(
                LIGHTON_MODEL_ID, torch_dtype=dtype
            )
            .to(device)
            .eval()
        )
        logger.info("LightOnOCR-2-1B loaded successfully")

    except Exception as e:
        # Reset both singletons so a later call may retry from scratch.
        logger.error(f"Failed to load LightOnOCR-2-1B: {e}")
        _lighton_model = None
        _lighton_processor = None

    return _lighton_processor, _lighton_model
|
|
|
|
|
|
def run_lighton_ocr_sync(image_bytes: bytes) -> Optional[str]:
    """
    Run LightOnOCR on image bytes (synchronous).

    Returns extracted text or None on error.
    Generic — works for any document/page region.
    """
    processor, model = get_lighton_model()
    # Bail out early when the model could not be loaded.
    if any(part is None for part in (processor, model)):
        return None

    try:
        import torch
        from PIL import Image as _PILImage

        image = _PILImage.open(io.BytesIO(image_bytes)).convert("RGB")
        # Single-turn chat with one image and no extra text prompt.
        messages = [{"role": "user", "content": [{"type": "image"}]}]

        inputs = processor.apply_chat_template(
            messages, images=[image],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            generated = model.generate(**inputs, max_new_tokens=1024)

        # NOTE(review): generated[0] is decoded in full; generate() output
        # normally includes the prompt tokens — confirm the processor/model
        # strips or tolerates them for this checkpoint.
        decoded = processor.decode(generated[0], skip_special_tokens=True)
        if not decoded:
            return None
        return decoded.strip()

    except Exception as e:
        logger.error(f"LightOnOCR inference failed: {e}")
        return None
|