"""
Training Export Service for OCR Labeling Data

Exports labeled OCR data in formats suitable for fine-tuning:
- TrOCR (Microsoft's Transformer-based OCR model)
- llama3.2-vision (Meta's Vision-Language Model)
- Generic JSONL format

PRIVACY:
- All data stays local on the Mac Mini
- No cloud uploads without explicit consent
- Export paths are configurable
"""

import os
import json
import base64
import shutil
import logging
from pathlib import Path
from typing import List, Dict, Optional, Any, Iterable, Tuple
from dataclasses import dataclass
from datetime import datetime, timezone
import hashlib

logger = logging.getLogger(__name__)

# Export directory configuration
EXPORT_BASE_PATH = os.getenv("OCR_EXPORT_PATH", "/app/ocr-exports")
TROCR_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "trocr")
LLAMA_VISION_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "llama-vision")
GENERIC_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "generic")

# File extension (without dot, lower-case) -> MIME type used in data URLs.
# Unknown extensions fall back to "image/png".
_MIME_BY_EXTENSION = {
    'png': 'image/png',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
}


@dataclass
class TrainingSample:
    """A single training sample for OCR fine-tuning."""

    id: str
    image_path: str
    ground_truth: str
    ocr_text: Optional[str] = None
    ocr_confidence: Optional[float] = None
    metadata: Optional[Dict[str, Any]] = None


@dataclass
class ExportResult:
    """Result of a training data export."""

    export_format: str
    export_path: str
    sample_count: int
    batch_id: str
    created_at: datetime
    manifest_path: str


def _utc_now() -> datetime:
    """Return the current UTC time as a naive datetime.

    Equivalent in shape to the deprecated ``datetime.utcnow()`` so that
    stored ISO timestamps keep their existing format (no UTC offset suffix).
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _prepare_batch_dirs(export_path: str, batch_id: str) -> Tuple[str, str]:
    """Create ``<export_path>/<batch_id>/images`` and return (batch, images) paths."""
    batch_path = os.path.join(export_path, batch_id)
    images_path = os.path.join(batch_path, "images")
    os.makedirs(images_path, exist_ok=True)
    return batch_path, images_path


def _stage_image(sample: TrainingSample, images_path: str, copy_images: bool) -> str:
    """Copy the sample image into the batch and return the reference to store.

    Returns ``images/<id><suffix>`` when the image was copied. When copying
    is disabled or the source is not an existing regular file, the original
    ``image_path`` is returned unchanged.
    """
    # isfile (not exists): a directory path must not reach shutil.copy2.
    if copy_images and os.path.isfile(sample.image_path):
        image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
        shutil.copy2(sample.image_path, os.path.join(images_path, image_filename))
        return f"images/{image_filename}"
    return sample.image_path


def _write_jsonl(path: str, records: Iterable[Dict[str, Any]]) -> None:
    """Write one JSON object per line, keeping non-ASCII text readable."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in records)


def _base_manifest(
    export_format: str,
    batch_id: str,
    sample_count: int,
    created_at: datetime,
    files: Dict[str, str],
) -> Dict[str, Any]:
    """Build the manifest fields shared by every export format."""
    return {
        "format": export_format,
        "version": "1.0",
        "batch_id": batch_id,
        "sample_count": sample_count,
        "created_at": created_at.isoformat(),
        "files": files,
    }


def _write_manifest(batch_path: str, manifest: Dict[str, Any]) -> str:
    """Write ``manifest.json`` into the batch directory and return its path."""
    manifest_path = os.path.join(batch_path, "manifest.json")
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2)
    return manifest_path


class TrOCRExporter:
    """
    Export training data for TrOCR fine-tuning.

    TrOCR expects:
    - Image files (PNG/JPG)
    - A CSV/TSV file with: image_path, text
    - Or a JSONL file with: {"file_name": "img.png", "text": "ground truth"}

    We use the JSONL format for flexibility.
    """

    def __init__(self, export_path: str = TROCR_EXPORT_PATH):
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in TrOCR format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path, images_path = _prepare_batch_dirs(self.export_path, batch_id)

        export_data = [
            {
                "file_name": _stage_image(sample, images_path, copy_images),
                "text": sample.ground_truth,
                "id": sample.id,
            }
            for sample in samples
        ]
        _write_jsonl(os.path.join(batch_path, "train.jsonl"), export_data)

        # One timestamp shared by the manifest and the returned result.
        created_at = _utc_now()
        manifest = _base_manifest(
            "trocr",
            batch_id,
            len(samples),
            created_at,
            {"data": "train.jsonl", "images": "images/"},
        )
        manifest["model_config"] = {
            "base_model": "microsoft/trocr-base-handwritten",
            "task": "handwriting-recognition",
        }
        manifest_path = _write_manifest(batch_path, manifest)

        return ExportResult(
            export_format="trocr",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )


class LlamaVisionExporter:
    """
    Export training data for llama3.2-vision fine-tuning.

    Llama Vision fine-tuning expects:
    - JSONL format with base64-encoded images or image URLs
    - Format: {"messages": [{"role": "user", "content": [{"type": "image"},
      {"type": "text", "text": "..."}]}, {"role": "assistant", "content": "..."}]}

    We create a supervised fine-tuning dataset.
    """

    def __init__(self, export_path: str = LLAMA_VISION_EXPORT_PATH):
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def _encode_image_base64(self, image_path: str) -> Optional[str]:
        """Return the file content as base64 text, or None if it cannot be read."""
        try:
            with open(image_path, 'rb') as f:
                return base64.b64encode(f.read()).decode('utf-8')
        except (OSError, ValueError):
            # Best effort: an unreadable or invalid path falls back to a
            # plain path reference in the export instead of aborting it.
            return None

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        include_base64: bool = False,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in Llama Vision fine-tuning format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            include_base64: Whether to include base64-encoded images in JSONL
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path, images_path = _prepare_batch_dirs(self.export_path, batch_id)

        # OCR instruction prompt (runtime text, intentionally German)
        system_prompt = (
            "Du bist ein OCR-Experte für deutsche Handschrift. "
            "Lies den handgeschriebenen Text im Bild und gib ihn wortgetreu wieder."
        )

        export_data = []
        for sample in samples:
            image_url = _stage_image(sample, images_path, copy_images)

            # Optionally inline the image as a data URL; keep the path
            # reference when the source file cannot be read.
            if include_base64:
                b64 = self._encode_image_base64(sample.image_path)
                if b64:
                    ext = Path(sample.image_path).suffix.lower().replace('.', '')
                    mime = _MIME_BY_EXTENSION.get(ext, 'image/png')
                    image_url = f"data:{mime};base64,{b64}"

            user_content = [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": "Lies den handgeschriebenen Text in diesem Bild."},
            ]
            export_data.append({
                "id": sample.id,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": sample.ground_truth},
                ],
            })

        _write_jsonl(os.path.join(batch_path, "train.jsonl"), export_data)

        # One timestamp shared by the manifest and the returned result.
        created_at = _utc_now()
        manifest = _base_manifest(
            "llama_vision",
            batch_id,
            len(samples),
            created_at,
            {"data": "train.jsonl", "images": "images/"},
        )
        manifest["model_config"] = {
            "base_model": "llama3.2-vision:11b",
            "task": "handwriting-ocr",
            "system_prompt": system_prompt,
        }
        manifest_path = _write_manifest(batch_path, manifest)

        return ExportResult(
            export_format="llama_vision",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )


class GenericExporter:
    """
    Export training data in a generic JSONL format.

    This format is compatible with most ML frameworks and can be
    easily converted to other formats.
    """

    def __init__(self, export_path: str = GENERIC_EXPORT_PATH):
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in generic JSONL format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path, images_path = _prepare_batch_dirs(self.export_path, batch_id)

        export_data = [
            {
                "id": sample.id,
                "image_path": _stage_image(sample, images_path, copy_images),
                "ground_truth": sample.ground_truth,
                "ocr_text": sample.ocr_text,
                "ocr_confidence": sample.ocr_confidence,
                "metadata": sample.metadata or {},
            }
            for sample in samples
        ]
        _write_jsonl(os.path.join(batch_path, "data.jsonl"), export_data)

        # Also write as single JSON for convenience
        json_path = os.path.join(batch_path, "data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        # One timestamp shared by the manifest and the returned result.
        created_at = _utc_now()
        manifest = _base_manifest(
            "generic",
            batch_id,
            len(samples),
            created_at,
            {
                "data_jsonl": "data.jsonl",
                "data_json": "data.json",
                "images": "images/",
            },
        )
        manifest_path = _write_manifest(batch_path, manifest)

        return ExportResult(
            export_format="generic",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )


class TrainingExportService:
    """
    Main service for exporting OCR labeling data to various training formats.
    """

    def __init__(self):
        self.trocr_exporter = TrOCRExporter()
        self.llama_vision_exporter = LlamaVisionExporter()
        self.generic_exporter = GenericExporter()

    def export(
        self,
        samples: List[TrainingSample],
        export_format: str,
        batch_id: Optional[str] = None,
        **kwargs,
    ) -> ExportResult:
        """
        Export training samples in the specified format.

        Args:
            samples: List of training samples
            export_format: 'trocr', 'llama_vision', or 'generic'
            batch_id: Optional batch ID (generated from the current UTC
                time, second resolution, if not provided)
            **kwargs: Additional format-specific options

        Returns:
            ExportResult with export details

        Raises:
            ValueError: If export_format is not one of the known formats.
        """
        if not batch_id:
            batch_id = _utc_now().strftime("%Y%m%d_%H%M%S")

        exporters = {
            "trocr": self.trocr_exporter,
            "llama_vision": self.llama_vision_exporter,
            "generic": self.generic_exporter,
        }
        exporter = exporters.get(export_format)
        if exporter is None:
            raise ValueError(f"Unknown export format: {export_format}")
        return exporter.export(samples, batch_id, **kwargs)

    def list_exports(self, export_format: Optional[str] = None) -> List[Dict]:
        """
        List all available exports.

        Args:
            export_format: Optional filter by format

        Returns:
            List of export manifests (each extended with an "export_path"
            key), newest first by their "created_at" value. Batches whose
            manifest is missing, unreadable, or not a JSON object are skipped.
        """
        # Read the directories from the exporters so the listing always
        # matches where this service actually writes its exports.
        locations = [
            ("trocr", self.trocr_exporter.export_path),
            ("llama_vision", self.llama_vision_exporter.export_path),
            ("generic", self.generic_exporter.export_path),
        ]

        exports = []
        for fmt, base_path in locations:
            if export_format is not None and export_format != fmt:
                continue
            try:
                batch_dirs = os.listdir(base_path)
            except (FileNotFoundError, NotADirectoryError):
                continue

            for batch_dir in batch_dirs:
                manifest_path = os.path.join(base_path, batch_dir, "manifest.json")
                try:
                    with open(manifest_path, 'r', encoding='utf-8') as f:
                        manifest = json.load(f)
                except (FileNotFoundError, NotADirectoryError):
                    # Not an export batch (no manifest present).
                    continue
                except (OSError, ValueError) as exc:
                    # One broken manifest must not hide all other exports.
                    logger.warning("Skipping unreadable manifest %s: %s", manifest_path, exc)
                    continue
                if not isinstance(manifest, dict):
                    logger.warning("Skipping manifest %s: not a JSON object", manifest_path)
                    continue

                manifest["export_path"] = os.path.join(base_path, batch_dir)
                exports.append(manifest)

        # Coerce to str so a missing or non-string timestamp cannot break sorting.
        return sorted(
            exports,
            key=lambda entry: str(entry.get("created_at") or ""),
            reverse=True,
        )


# Singleton instance
_export_service: Optional[TrainingExportService] = None


def get_training_export_service() -> TrainingExportService:
    """Get or create the training export service singleton."""
    global _export_service
    if _export_service is None:
        _export_service = TrainingExportService()
    return _export_service