This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/klausur-service/backend/training_export_service.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

449 lines
15 KiB
Python

"""
Training Export Service for OCR Labeling Data
Exports labeled OCR data in formats suitable for fine-tuning:
- TrOCR (Microsoft's Transformer-based OCR model)
- llama3.2-vision (Meta's Vision-Language Model)
- Generic JSONL format
DATENSCHUTZ/PRIVACY:
- Alle Daten bleiben lokal auf dem Mac Mini
- Keine Cloud-Uploads ohne explizite Zustimmung
- Export-Pfade sind konfigurierbar
"""
import base64
import hashlib
import json
import os
import shutil
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
# Export directory configuration.
# The base directory is overridable via the OCR_EXPORT_PATH environment
# variable; each export format gets its own subdirectory underneath it.
EXPORT_BASE_PATH = os.getenv("OCR_EXPORT_PATH", "/app/ocr-exports")
TROCR_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "trocr")  # TrOCR JSONL batches
LLAMA_VISION_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "llama-vision")  # chat-format JSONL
GENERIC_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "generic")  # framework-agnostic JSONL/JSON
@dataclass
class TrainingSample:
    """A single training sample for OCR fine-tuning.

    Pairs an image on disk with its verified transcription, plus the
    original OCR output (if any) for comparison during curation.
    """
    # Unique sample identifier; also used as the exported image file stem.
    id: str
    # Path to the source image on disk.
    image_path: str
    # Verified transcription used as the training label.
    ground_truth: str
    # Raw OCR output for this image, if available.
    ocr_text: Optional[str] = None
    # OCR engine confidence for ocr_text, if available.
    ocr_confidence: Optional[float] = None
    # Arbitrary extra attributes exported alongside the sample (generic format only).
    metadata: Optional[Dict[str, Any]] = None
@dataclass
class ExportResult:
    """Result of a training data export."""
    # Format identifier: "trocr", "llama_vision", or "generic".
    export_format: str
    # Path of the batch directory that was written.
    export_path: str
    # Number of samples included in the export.
    sample_count: int
    # Batch identifier (also the directory name under the format's export root).
    batch_id: str
    # Creation timestamp of the export.
    created_at: datetime
    # Path of the batch's manifest.json.
    manifest_path: str
class TrOCRExporter:
    """
    Export training data for TrOCR fine-tuning.

    TrOCR expects:
    - Image files (PNG/JPG)
    - A CSV/TSV file with: image_path, text
    - Or a JSONL file with: {"file_name": "img.png", "text": "ground truth"}

    We use the JSONL format for flexibility.
    """

    def __init__(self, export_path: str = TROCR_EXPORT_PATH):
        """Remember the target directory and make sure it exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def _stage_image(self, sample: TrainingSample, images_path: str) -> str:
        """Copy the sample's image into the batch's images/ directory.

        Returns a batch-relative reference ("images/<id><ext>") on success,
        or the untouched source path when the image file does not exist.
        """
        if not os.path.exists(sample.image_path):
            return sample.image_path
        image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
        shutil.copy2(sample.image_path, os.path.join(images_path, image_filename))
        return f"images/{image_filename}"

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in TrOCR format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # Build one JSONL record per sample, staging images if requested.
        export_data = []
        for sample in samples:
            image_ref = (
                self._stage_image(sample, images_path)
                if copy_images
                else sample.image_path
            )
            export_data.append({
                "file_name": image_ref,
                "text": sample.ground_truth,
                "id": sample.id,
            })

        # One JSON object per line; keep non-ASCII (umlauts, ß) readable.
        jsonl_path = os.path.join(batch_path, "train.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # Single timezone-aware timestamp shared by manifest and result.
        # (datetime.utcnow() is deprecated and returned a naive datetime;
        # the original also called it twice, producing two different values.)
        created_at = datetime.now(timezone.utc)
        manifest = {
            "format": "trocr",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data": "train.jsonl",
                "images": "images/",
            },
            "model_config": {
                "base_model": "microsoft/trocr-base-handwritten",
                "task": "handwriting-recognition",
            },
        }
        # Explicit UTF-8 so the manifest does not depend on the platform's
        # default encoding.
        manifest_path = os.path.join(batch_path, "manifest.json")
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        return ExportResult(
            export_format="trocr",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
class LlamaVisionExporter:
    """
    Export training data for llama3.2-vision fine-tuning.

    Llama Vision fine-tuning expects:
    - JSONL format with base64-encoded images or image URLs
    - Format: {"messages": [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "..."}]}, {"role": "assistant", "content": "..."}]}

    We create a supervised fine-tuning dataset.
    """

    def __init__(self, export_path: str = LLAMA_VISION_EXPORT_PATH):
        """Remember the target directory and make sure it exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def _encode_image_base64(self, image_path: str) -> Optional[str]:
        """Return the image as a base64 string, or None when unreadable.

        Best-effort by design: a missing/unreadable file falls back to the
        plain path reference instead of aborting the export.
        """
        try:
            with open(image_path, 'rb') as f:
                return base64.b64encode(f.read()).decode('utf-8')
        except OSError:
            return None

    def _stage_image(self, sample: TrainingSample, images_path: str) -> str:
        """Copy the sample's image into the batch's images/ directory.

        Returns a batch-relative reference ("images/<id><ext>") on success,
        or the untouched source path when the image file does not exist.
        """
        if not os.path.exists(sample.image_path):
            return sample.image_path
        image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
        shutil.copy2(sample.image_path, os.path.join(images_path, image_filename))
        return f"images/{image_filename}"

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        include_base64: bool = False,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in Llama Vision fine-tuning format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            include_base64: Whether to include base64-encoded images in JSONL
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # OCR instruction prompt (German, matches the labeled data's language).
        system_prompt = (
            "Du bist ein OCR-Experte für deutsche Handschrift. "
            "Lies den handgeschriebenen Text im Bild und gib ihn wortgetreu wieder."
        )

        export_data = []
        for sample in samples:
            image_ref = (
                self._stage_image(sample, images_path)
                if copy_images
                else sample.image_path
            )

            # Default: reference the (possibly staged) image by path.
            image_entry = {"type": "image_url", "image_url": {"url": image_ref}}

            # Optionally inline the original image as a data URL so the
            # JSONL is self-contained. Falls back to the path reference
            # when the source image cannot be read.
            if include_base64:
                b64 = self._encode_image_base64(sample.image_path)
                if b64:
                    ext = Path(sample.image_path).suffix.lower().replace('.', '')
                    mime = {'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg'}.get(ext, 'image/png')
                    image_entry = {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime};base64,{b64}"}
                    }

            export_data.append({
                "id": sample.id,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": [
                        image_entry,
                        {"type": "text", "text": "Lies den handgeschriebenen Text in diesem Bild."},
                    ]},
                    {"role": "assistant", "content": sample.ground_truth},
                ],
            })

        # One JSON object per line; keep non-ASCII (umlauts, ß) readable.
        jsonl_path = os.path.join(batch_path, "train.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # Single timezone-aware timestamp shared by manifest and result
        # (datetime.utcnow() is deprecated and returned a naive datetime).
        created_at = datetime.now(timezone.utc)
        manifest = {
            "format": "llama_vision",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data": "train.jsonl",
                "images": "images/",
            },
            "model_config": {
                "base_model": "llama3.2-vision:11b",
                "task": "handwriting-ocr",
                "system_prompt": system_prompt,
            },
        }
        # Explicit UTF-8 + ensure_ascii=False: the manifest embeds the
        # German system prompt, which was previously escaped and written
        # with the platform's default encoding.
        manifest_path = os.path.join(batch_path, "manifest.json")
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        return ExportResult(
            export_format="llama_vision",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
class GenericExporter:
    """
    Export training data in a generic JSONL format.

    This format is compatible with most ML frameworks and can be
    easily converted to other formats.
    """

    def __init__(self, export_path: str = GENERIC_EXPORT_PATH):
        """Remember the target directory and make sure it exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def _stage_image(self, sample: TrainingSample, images_path: str) -> str:
        """Copy the sample's image into the batch's images/ directory.

        Returns a batch-relative reference ("images/<id><ext>") on success,
        or the untouched source path when the image file does not exist.
        """
        if not os.path.exists(sample.image_path):
            return sample.image_path
        image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
        shutil.copy2(sample.image_path, os.path.join(images_path, image_filename))
        return f"images/{image_filename}"

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in generic JSONL format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # Full record per sample: label, raw OCR output, and metadata.
        export_data = []
        for sample in samples:
            image_ref = (
                self._stage_image(sample, images_path)
                if copy_images
                else sample.image_path
            )
            export_data.append({
                "id": sample.id,
                "image_path": image_ref,
                "ground_truth": sample.ground_truth,
                "ocr_text": sample.ocr_text,
                "ocr_confidence": sample.ocr_confidence,
                "metadata": sample.metadata or {},
            })

        # One JSON object per line; keep non-ASCII (umlauts, ß) readable.
        jsonl_path = os.path.join(batch_path, "data.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # Also write the same records as a single JSON array for convenience.
        json_path = os.path.join(batch_path, "data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        # Single timezone-aware timestamp shared by manifest and result
        # (datetime.utcnow() is deprecated and returned a naive datetime).
        created_at = datetime.now(timezone.utc)
        manifest = {
            "format": "generic",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data_jsonl": "data.jsonl",
                "data_json": "data.json",
                "images": "images/",
            },
        }
        # Explicit UTF-8 so the manifest does not depend on the platform's
        # default encoding.
        manifest_path = os.path.join(batch_path, "manifest.json")
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        return ExportResult(
            export_format="generic",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
class TrainingExportService:
    """
    Main service for exporting OCR labeling data to various training formats.
    """

    def __init__(self):
        """Instantiate one exporter per supported format."""
        self.trocr_exporter = TrOCRExporter()
        self.llama_vision_exporter = LlamaVisionExporter()
        self.generic_exporter = GenericExporter()

    def export(
        self,
        samples: List[TrainingSample],
        export_format: str,
        batch_id: Optional[str] = None,
        **kwargs,
    ) -> ExportResult:
        """
        Export training samples in the specified format.

        Args:
            samples: List of training samples
            export_format: 'trocr', 'llama_vision', or 'generic'
            batch_id: Optional batch ID (generated if not provided)
            **kwargs: Additional format-specific options

        Returns:
            ExportResult with export details

        Raises:
            ValueError: If export_format is not one of the known formats.
        """
        if not batch_id:
            # Timestamp-based default ID; timezone-aware UTC keeps the same
            # "%Y%m%d_%H%M%S" text as before (datetime.utcnow() is deprecated).
            batch_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        if export_format == "trocr":
            return self.trocr_exporter.export(samples, batch_id, **kwargs)
        elif export_format == "llama_vision":
            return self.llama_vision_exporter.export(samples, batch_id, **kwargs)
        elif export_format == "generic":
            return self.generic_exporter.export(samples, batch_id, **kwargs)
        raise ValueError(f"Unknown export format: {export_format}")

    def list_exports(self, export_format: Optional[str] = None) -> List[Dict]:
        """
        List all available exports.

        Args:
            export_format: Optional filter by format

        Returns:
            List of export manifests, newest first (by "created_at").
        """
        paths_to_check = []
        if export_format is None or export_format == "trocr":
            paths_to_check.append((TROCR_EXPORT_PATH, "trocr"))
        if export_format is None or export_format == "llama_vision":
            paths_to_check.append((LLAMA_VISION_EXPORT_PATH, "llama_vision"))
        if export_format is None or export_format == "generic":
            paths_to_check.append((GENERIC_EXPORT_PATH, "generic"))

        exports = []
        for base_path, fmt in paths_to_check:
            if not os.path.exists(base_path):
                continue
            # Each subdirectory with a manifest.json is a batch; entries
            # without one (stray files, partial exports) are skipped.
            for batch_dir in os.listdir(base_path):
                manifest_path = os.path.join(base_path, batch_dir, "manifest.json")
                if not os.path.exists(manifest_path):
                    continue
                # Explicit UTF-8: manifests may embed non-ASCII prompts.
                with open(manifest_path, 'r', encoding='utf-8') as f:
                    manifest = json.load(f)
                manifest["export_path"] = os.path.join(base_path, batch_dir)
                exports.append(manifest)
        return sorted(exports, key=lambda x: x.get("created_at", ""), reverse=True)
# Singleton instance, lazily created on first call to
# get_training_export_service().
_export_service: Optional[TrainingExportService] = None
def get_training_export_service() -> TrainingExportService:
    """Get or create the training export service singleton.

    NOTE(review): the check-then-set below is not guarded by a lock, so
    concurrent first calls could each build an instance; the last
    assignment wins.
    """
    global _export_service
    if _export_service is None:
        _export_service = TrainingExportService()
    return _export_service