# Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service,
# School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""
|
|
Training Export Service for OCR Labeling Data
|
|
|
|
Exports labeled OCR data in formats suitable for fine-tuning:
|
|
- TrOCR (Microsoft's Transformer-based OCR model)
|
|
- llama3.2-vision (Meta's Vision-Language Model)
|
|
- Generic JSONL format
|
|
|
|
DATENSCHUTZ/PRIVACY:
|
|
- Alle Daten bleiben lokal auf dem Mac Mini
|
|
- Keine Cloud-Uploads ohne explizite Zustimmung
|
|
- Export-Pfade sind konfigurierbar
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import base64
|
|
import shutil
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional, Any
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
import hashlib
|
|
|
|
# Export directory configuration
|
|
EXPORT_BASE_PATH = os.getenv("OCR_EXPORT_PATH", "/app/ocr-exports")
|
|
TROCR_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "trocr")
|
|
LLAMA_VISION_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "llama-vision")
|
|
GENERIC_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "generic")
|
|
|
|
|
|
@dataclass
class TrainingSample:
    """A single training sample for OCR fine-tuning."""

    id: str                                     # unique sample identifier
    image_path: str                             # path to the source image file
    ground_truth: str                           # human-verified transcription
    ocr_text: Optional[str] = None              # raw OCR output, when available
    ocr_confidence: Optional[float] = None      # OCR confidence score, when available
    metadata: Optional[Dict[str, Any]] = None   # arbitrary extra information


@dataclass
class ExportResult:
    """Result of a training data export."""

    export_format: str      # one of: "trocr", "llama_vision", "generic"
    export_path: str        # directory the batch was written to
    sample_count: int       # number of samples exported
    batch_id: str           # unique batch identifier
    created_at: datetime    # timestamp of the export
    manifest_path: str      # path to the written manifest.json


class TrOCRExporter:
    """
    Export training data for TrOCR fine-tuning.

    TrOCR expects:
    - Image files (PNG/JPG)
    - A CSV/TSV file with: image_path, text
    - Or a JSONL file with: {"file_name": "img.png", "text": "ground truth"}

    We use the JSONL format for flexibility.
    """

    def __init__(self, export_path: str = TROCR_EXPORT_PATH):
        """Create the exporter and ensure the export directory exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in TrOCR format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images into the export directory.
                Samples whose source image is missing fall back to referencing
                the original path.

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        export_data = []
        for sample in samples:
            if copy_images and os.path.exists(sample.image_path):
                # Keep the original extension so image loaders can sniff the format.
                image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
                dest_path = os.path.join(images_path, image_filename)
                shutil.copy2(sample.image_path, dest_path)
                image_ref = f"images/{image_filename}"
            else:
                # Reference the image in place (path kept as-is).
                image_ref = sample.image_path

            export_data.append({
                "file_name": image_ref,
                "text": sample.ground_truth,
                "id": sample.id,
            })

        # Write the JSONL training file: one sample per line, UTF-8, no ASCII
        # escaping so German ground truth stays human-readable.
        jsonl_path = os.path.join(batch_path, "train.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # switch to datetime.now(timezone.utc) once tz-aware timestamps are
        # acceptable to manifest consumers.
        manifest = {
            "format": "trocr",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": datetime.utcnow().isoformat(),
            "files": {
                "data": "train.jsonl",
                "images": "images/",
            },
            "model_config": {
                "base_model": "microsoft/trocr-base-handwritten",
                "task": "handwriting-recognition",
            },
        }
        manifest_path = os.path.join(batch_path, "manifest.json")
        # FIX: write the manifest explicitly as UTF-8 without ASCII escaping,
        # matching the JSONL file; the previous default was locale-dependent.
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        return ExportResult(
            export_format="trocr",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=datetime.utcnow(),
            manifest_path=manifest_path,
        )


class LlamaVisionExporter:
    """
    Export training data for llama3.2-vision fine-tuning.

    Llama Vision fine-tuning expects:
    - JSONL format with base64-encoded images or image URLs
    - Format: {"messages": [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "..."}]}, {"role": "assistant", "content": "..."}]}

    We create a supervised fine-tuning dataset.
    """

    def __init__(self, export_path: str = LLAMA_VISION_EXPORT_PATH):
        """Create the exporter and ensure the export directory exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def _encode_image_base64(self, image_path: str) -> Optional[str]:
        """Encode an image file to base64; return None if it cannot be read (best effort)."""
        try:
            with open(image_path, 'rb') as f:
                return base64.b64encode(f.read()).decode('utf-8')
        except OSError:
            # Narrowed from a bare Exception: only file-access failures are expected here.
            return None

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        include_base64: bool = False,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in Llama Vision fine-tuning format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            include_base64: Whether to embed base64 data URLs in the JSONL.
                Falls back to the path reference when the image cannot be read.
            copy_images: Whether to copy images into the export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # OCR instruction prompt (German: "You are an OCR expert for German
        # handwriting. Read the handwritten text in the image and reproduce it
        # verbatim.") — must stay byte-identical; it is part of the dataset.
        system_prompt = (
            "Du bist ein OCR-Experte für deutsche Handschrift. "
            "Lies den handgeschriebenen Text im Bild und gib ihn wortgetreu wieder."
        )

        export_data = []
        for sample in samples:
            # Stage the image inside the batch directory when requested.
            if copy_images and os.path.exists(sample.image_path):
                image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
                dest_path = os.path.join(images_path, image_filename)
                shutil.copy2(sample.image_path, dest_path)
                image_ref = f"images/{image_filename}"
            else:
                image_ref = sample.image_path

            # Decide the image URL once (path reference or base64 data URL)
            # instead of mutating the content list after the fact.
            image_url = image_ref
            if include_base64:
                b64 = self._encode_image_base64(sample.image_path)
                if b64:
                    ext = Path(sample.image_path).suffix.lower().replace('.', '')
                    mime = {'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg'}.get(ext, 'image/png')
                    image_url = f"data:{mime};base64,{b64}"

            user_content = [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": "Lies den handgeschriebenen Text in diesem Bild."},
            ]

            export_data.append({
                "id": sample.id,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": sample.ground_truth},
                ],
            })

        # Write the JSONL training file (UTF-8, no ASCII escaping).
        jsonl_path = os.path.join(batch_path, "train.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # switch to datetime.now(timezone.utc) once consumers tolerate it.
        manifest = {
            "format": "llama_vision",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": datetime.utcnow().isoformat(),
            "files": {
                "data": "train.jsonl",
                "images": "images/",
            },
            "model_config": {
                "base_model": "llama3.2-vision:11b",
                "task": "handwriting-ocr",
                "system_prompt": system_prompt,
            },
        }
        manifest_path = os.path.join(batch_path, "manifest.json")
        # FIX: the manifest contains German umlauts (system_prompt); write it as
        # UTF-8 without ASCII escaping instead of locale-default + \u-escapes.
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        return ExportResult(
            export_format="llama_vision",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=datetime.utcnow(),
            manifest_path=manifest_path,
        )


class GenericExporter:
    """
    Export training data in a generic JSONL format.

    This format is compatible with most ML frameworks and can be
    easily converted to other formats.
    """

    def __init__(self, export_path: str = GENERIC_EXPORT_PATH):
        """Create the exporter and ensure the export directory exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in generic JSONL format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images into the export directory.
                Samples whose source image is missing fall back to referencing
                the original path.

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        export_data = []
        for sample in samples:
            if copy_images and os.path.exists(sample.image_path):
                # Keep the original extension so downstream tools can sniff it.
                image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
                dest_path = os.path.join(images_path, image_filename)
                shutil.copy2(sample.image_path, dest_path)
                image_ref = f"images/{image_filename}"
            else:
                image_ref = sample.image_path

            export_data.append({
                "id": sample.id,
                "image_path": image_ref,
                "ground_truth": sample.ground_truth,
                "ocr_text": sample.ocr_text,
                "ocr_confidence": sample.ocr_confidence,
                "metadata": sample.metadata or {},
            })

        # Write the JSONL file (one record per line).
        jsonl_path = os.path.join(batch_path, "data.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # Also write the same records as a single JSON array for convenience.
        json_path = os.path.join(batch_path, "data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # switch to datetime.now(timezone.utc) once consumers tolerate it.
        manifest = {
            "format": "generic",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": datetime.utcnow().isoformat(),
            "files": {
                "data_jsonl": "data.jsonl",
                "data_json": "data.json",
                "images": "images/",
            },
        }
        manifest_path = os.path.join(batch_path, "manifest.json")
        # FIX: write the manifest explicitly as UTF-8 without ASCII escaping,
        # consistent with the data files; the previous default was locale-dependent.
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        return ExportResult(
            export_format="generic",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=datetime.utcnow(),
            manifest_path=manifest_path,
        )


class TrainingExportService:
    """
    Main service for exporting OCR labeling data to various training formats.
    """

    def __init__(self):
        # One exporter per supported format, each writing under its own base path.
        self.trocr_exporter = TrOCRExporter()
        self.llama_vision_exporter = LlamaVisionExporter()
        self.generic_exporter = GenericExporter()

    def export(
        self,
        samples: List[TrainingSample],
        export_format: str,
        batch_id: Optional[str] = None,
        **kwargs,
    ) -> ExportResult:
        """
        Export training samples in the specified format.

        Args:
            samples: List of training samples
            export_format: 'trocr', 'llama_vision', or 'generic'
            batch_id: Optional batch ID (generated if not provided)
            **kwargs: Additional format-specific options

        Returns:
            ExportResult with export details

        Raises:
            ValueError: If export_format is not one of the supported formats.
        """
        if not batch_id:
            # NOTE(review): second-resolution IDs can collide when two exports
            # start within the same second; consider adding microseconds or a
            # random suffix if that ever happens in practice.
            batch_id = datetime.utcnow().strftime("%Y%m%d_%H%M%S")

        # Dispatch table instead of an if/elif chain; keeps format names in one place.
        exporters = {
            "trocr": self.trocr_exporter,
            "llama_vision": self.llama_vision_exporter,
            "generic": self.generic_exporter,
        }
        exporter = exporters.get(export_format)
        if exporter is None:
            raise ValueError(f"Unknown export format: {export_format}")
        return exporter.export(samples, batch_id, **kwargs)

    def list_exports(self, export_format: Optional[str] = None) -> List[Dict]:
        """
        List all available exports.

        Args:
            export_format: Optional filter by format

        Returns:
            List of export manifests, newest first (by 'created_at').
        """
        exports = []

        paths_to_check = []
        if export_format is None or export_format == "trocr":
            paths_to_check.append((TROCR_EXPORT_PATH, "trocr"))
        if export_format is None or export_format == "llama_vision":
            paths_to_check.append((LLAMA_VISION_EXPORT_PATH, "llama_vision"))
        if export_format is None or export_format == "generic":
            paths_to_check.append((GENERIC_EXPORT_PATH, "generic"))

        for base_path, _fmt in paths_to_check:
            if not os.path.isdir(base_path):
                continue
            for batch_dir in os.listdir(base_path):
                manifest_path = os.path.join(base_path, batch_dir, "manifest.json")
                if not os.path.exists(manifest_path):
                    continue
                # ROBUSTNESS FIX: one corrupt or unreadable manifest must not
                # break the whole listing; skip it instead of raising.
                try:
                    with open(manifest_path, 'r', encoding='utf-8') as f:
                        manifest = json.load(f)
                except (OSError, json.JSONDecodeError):
                    continue
                manifest["export_path"] = os.path.join(base_path, batch_dir)
                exports.append(manifest)

        return sorted(exports, key=lambda x: x.get("created_at", ""), reverse=True)


# Lazily-created, process-wide singleton instance.
_export_service: Optional[TrainingExportService] = None


def get_training_export_service() -> TrainingExportService:
    """Return the shared TrainingExportService, creating it on first access."""
    global _export_service
    if _export_service is None:
        _export_service = TrainingExportService()
    return _export_service