This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/klausur-service/backend/training_export_service.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

449 lines
15 KiB
Python

"""
Training Export Service for OCR Labeling Data
Exports labeled OCR data in formats suitable for fine-tuning:
- TrOCR (Microsoft's Transformer-based OCR model)
- llama3.2-vision (Meta's Vision-Language Model)
- Generic JSONL format
DATENSCHUTZ/PRIVACY:
- Alle Daten bleiben lokal auf dem Mac Mini
- Keine Cloud-Uploads ohne explizite Zustimmung
- Export-Pfade sind konfigurierbar
"""
import base64
import hashlib
import json
import os
import shutil
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
# Export directory configuration.
# The base directory is overridable via the OCR_EXPORT_PATH environment
# variable; each export format gets its own subdirectory underneath it.
EXPORT_BASE_PATH = os.getenv("OCR_EXPORT_PATH", "/app/ocr-exports")
TROCR_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "trocr")  # TrOCR JSONL batches
LLAMA_VISION_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "llama-vision")  # chat-format JSONL
GENERIC_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "generic")  # framework-agnostic JSONL/JSON
@dataclass
class TrainingSample:
    """A single training sample for OCR fine-tuning.

    Pairs an image on disk with its verified transcription, plus the
    original OCR output (if any) for comparison during curation.
    """
    # Unique sample identifier; also used as the exported image file stem.
    id: str
    # Path to the source image on disk.
    image_path: str
    # Verified transcription used as the training label.
    ground_truth: str
    # Raw OCR output for this image, if available.
    ocr_text: Optional[str] = None
    # OCR engine confidence for ocr_text, if available.
    ocr_confidence: Optional[float] = None
    # Arbitrary extra attributes exported alongside the sample (generic format only).
    metadata: Optional[Dict[str, Any]] = None
@dataclass
class ExportResult:
    """Result of a training data export."""
    # Format identifier: "trocr", "llama_vision", or "generic".
    export_format: str
    # Path of the batch directory that was written.
    export_path: str
    # Number of samples included in the export.
    sample_count: int
    # Batch identifier (also the directory name under the format's export root).
    batch_id: str
    # Creation timestamp of the export.
    created_at: datetime
    # Path of the batch's manifest.json.
    manifest_path: str
class TrOCRExporter:
    """
    Export training data for TrOCR fine-tuning.

    TrOCR expects:
    - Image files (PNG/JPG)
    - A CSV/TSV file with: image_path, text
    - Or a JSONL file with: {"file_name": "img.png", "text": "ground truth"}

    We use the JSONL format for flexibility.
    """

    def __init__(self, export_path: str = TROCR_EXPORT_PATH):
        """Remember the target directory and make sure it exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def _stage_image(self, sample: TrainingSample, images_path: str) -> str:
        """Copy the sample's image into the batch's images/ directory.

        Returns a batch-relative reference ("images/<id><ext>") on success,
        or the untouched source path when the image file does not exist.
        """
        if not os.path.exists(sample.image_path):
            return sample.image_path
        image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
        shutil.copy2(sample.image_path, os.path.join(images_path, image_filename))
        return f"images/{image_filename}"

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in TrOCR format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # Build one JSONL record per sample, staging images if requested.
        export_data = []
        for sample in samples:
            image_ref = (
                self._stage_image(sample, images_path)
                if copy_images
                else sample.image_path
            )
            export_data.append({
                "file_name": image_ref,
                "text": sample.ground_truth,
                "id": sample.id,
            })

        # One JSON object per line; keep non-ASCII (umlauts, ß) readable.
        jsonl_path = os.path.join(batch_path, "train.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # Single timezone-aware timestamp shared by manifest and result.
        # (datetime.utcnow() is deprecated and returned a naive datetime;
        # the original also called it twice, producing two different values.)
        created_at = datetime.now(timezone.utc)
        manifest = {
            "format": "trocr",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data": "train.jsonl",
                "images": "images/",
            },
            "model_config": {
                "base_model": "microsoft/trocr-base-handwritten",
                "task": "handwriting-recognition",
            },
        }
        # Explicit UTF-8 so the manifest does not depend on the platform's
        # default encoding.
        manifest_path = os.path.join(batch_path, "manifest.json")
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        return ExportResult(
            export_format="trocr",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
class LlamaVisionExporter:
    """
    Export training data for llama3.2-vision fine-tuning.

    Llama Vision fine-tuning expects:
    - JSONL format with base64-encoded images or image URLs
    - Format: {"messages": [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "..."}]}, {"role": "assistant", "content": "..."}]}

    We create a supervised fine-tuning dataset.
    """

    def __init__(self, export_path: str = LLAMA_VISION_EXPORT_PATH):
        """Remember the target directory and make sure it exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def _encode_image_base64(self, image_path: str) -> Optional[str]:
        """Return the image as a base64 string, or None when unreadable.

        Best-effort by design: a missing/unreadable file falls back to the
        plain path reference instead of aborting the export.
        """
        try:
            with open(image_path, 'rb') as f:
                return base64.b64encode(f.read()).decode('utf-8')
        except OSError:
            return None

    def _stage_image(self, sample: TrainingSample, images_path: str) -> str:
        """Copy the sample's image into the batch's images/ directory.

        Returns a batch-relative reference ("images/<id><ext>") on success,
        or the untouched source path when the image file does not exist.
        """
        if not os.path.exists(sample.image_path):
            return sample.image_path
        image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
        shutil.copy2(sample.image_path, os.path.join(images_path, image_filename))
        return f"images/{image_filename}"

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        include_base64: bool = False,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in Llama Vision fine-tuning format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            include_base64: Whether to include base64-encoded images in JSONL
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # OCR instruction prompt (German, matches the labeled data's language).
        system_prompt = (
            "Du bist ein OCR-Experte für deutsche Handschrift. "
            "Lies den handgeschriebenen Text im Bild und gib ihn wortgetreu wieder."
        )

        export_data = []
        for sample in samples:
            image_ref = (
                self._stage_image(sample, images_path)
                if copy_images
                else sample.image_path
            )

            # Default: reference the (possibly staged) image by path.
            image_entry = {"type": "image_url", "image_url": {"url": image_ref}}

            # Optionally inline the original image as a data URL so the
            # JSONL is self-contained. Falls back to the path reference
            # when the source image cannot be read.
            if include_base64:
                b64 = self._encode_image_base64(sample.image_path)
                if b64:
                    ext = Path(sample.image_path).suffix.lower().replace('.', '')
                    mime = {'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg'}.get(ext, 'image/png')
                    image_entry = {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime};base64,{b64}"}
                    }

            export_data.append({
                "id": sample.id,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": [
                        image_entry,
                        {"type": "text", "text": "Lies den handgeschriebenen Text in diesem Bild."},
                    ]},
                    {"role": "assistant", "content": sample.ground_truth},
                ],
            })

        # One JSON object per line; keep non-ASCII (umlauts, ß) readable.
        jsonl_path = os.path.join(batch_path, "train.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # Single timezone-aware timestamp shared by manifest and result
        # (datetime.utcnow() is deprecated and returned a naive datetime).
        created_at = datetime.now(timezone.utc)
        manifest = {
            "format": "llama_vision",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data": "train.jsonl",
                "images": "images/",
            },
            "model_config": {
                "base_model": "llama3.2-vision:11b",
                "task": "handwriting-ocr",
                "system_prompt": system_prompt,
            },
        }
        # Explicit UTF-8 + ensure_ascii=False: the manifest embeds the
        # German system prompt, which was previously escaped and written
        # with the platform's default encoding.
        manifest_path = os.path.join(batch_path, "manifest.json")
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        return ExportResult(
            export_format="llama_vision",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
class GenericExporter:
    """
    Export training data in a generic JSONL format.

    This format is compatible with most ML frameworks and can be
    easily converted to other formats.
    """

    def __init__(self, export_path: str = GENERIC_EXPORT_PATH):
        """Remember the target directory and make sure it exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def _stage_image(self, sample: TrainingSample, images_path: str) -> str:
        """Copy the sample's image into the batch's images/ directory.

        Returns a batch-relative reference ("images/<id><ext>") on success,
        or the untouched source path when the image file does not exist.
        """
        if not os.path.exists(sample.image_path):
            return sample.image_path
        image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
        shutil.copy2(sample.image_path, os.path.join(images_path, image_filename))
        return f"images/{image_filename}"

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in generic JSONL format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # Full record per sample: label, raw OCR output, and metadata.
        export_data = []
        for sample in samples:
            image_ref = (
                self._stage_image(sample, images_path)
                if copy_images
                else sample.image_path
            )
            export_data.append({
                "id": sample.id,
                "image_path": image_ref,
                "ground_truth": sample.ground_truth,
                "ocr_text": sample.ocr_text,
                "ocr_confidence": sample.ocr_confidence,
                "metadata": sample.metadata or {},
            })

        # One JSON object per line; keep non-ASCII (umlauts, ß) readable.
        jsonl_path = os.path.join(batch_path, "data.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # Also write the same records as a single JSON array for convenience.
        json_path = os.path.join(batch_path, "data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        # Single timezone-aware timestamp shared by manifest and result
        # (datetime.utcnow() is deprecated and returned a naive datetime).
        created_at = datetime.now(timezone.utc)
        manifest = {
            "format": "generic",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data_jsonl": "data.jsonl",
                "data_json": "data.json",
                "images": "images/",
            },
        }
        # Explicit UTF-8 so the manifest does not depend on the platform's
        # default encoding.
        manifest_path = os.path.join(batch_path, "manifest.json")
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        return ExportResult(
            export_format="generic",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
class TrainingExportService:
    """
    Main service for exporting OCR labeling data to various training formats.
    """

    def __init__(self):
        """Instantiate one exporter per supported format."""
        self.trocr_exporter = TrOCRExporter()
        self.llama_vision_exporter = LlamaVisionExporter()
        self.generic_exporter = GenericExporter()

    def export(
        self,
        samples: List[TrainingSample],
        export_format: str,
        batch_id: Optional[str] = None,
        **kwargs,
    ) -> ExportResult:
        """
        Export training samples in the specified format.

        Args:
            samples: List of training samples
            export_format: 'trocr', 'llama_vision', or 'generic'
            batch_id: Optional batch ID (generated if not provided)
            **kwargs: Additional format-specific options

        Returns:
            ExportResult with export details

        Raises:
            ValueError: If export_format is not one of the known formats.
        """
        if not batch_id:
            # Timestamp-based default ID; timezone-aware UTC keeps the same
            # "%Y%m%d_%H%M%S" text as before (datetime.utcnow() is deprecated).
            batch_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        if export_format == "trocr":
            return self.trocr_exporter.export(samples, batch_id, **kwargs)
        elif export_format == "llama_vision":
            return self.llama_vision_exporter.export(samples, batch_id, **kwargs)
        elif export_format == "generic":
            return self.generic_exporter.export(samples, batch_id, **kwargs)
        raise ValueError(f"Unknown export format: {export_format}")

    def list_exports(self, export_format: Optional[str] = None) -> List[Dict]:
        """
        List all available exports.

        Args:
            export_format: Optional filter by format

        Returns:
            List of export manifests, newest first (by "created_at").
        """
        paths_to_check = []
        if export_format is None or export_format == "trocr":
            paths_to_check.append((TROCR_EXPORT_PATH, "trocr"))
        if export_format is None or export_format == "llama_vision":
            paths_to_check.append((LLAMA_VISION_EXPORT_PATH, "llama_vision"))
        if export_format is None or export_format == "generic":
            paths_to_check.append((GENERIC_EXPORT_PATH, "generic"))

        exports = []
        for base_path, fmt in paths_to_check:
            if not os.path.exists(base_path):
                continue
            # Each subdirectory with a manifest.json is a batch; entries
            # without one (stray files, partial exports) are skipped.
            for batch_dir in os.listdir(base_path):
                manifest_path = os.path.join(base_path, batch_dir, "manifest.json")
                if not os.path.exists(manifest_path):
                    continue
                # Explicit UTF-8: manifests may embed non-ASCII prompts.
                with open(manifest_path, 'r', encoding='utf-8') as f:
                    manifest = json.load(f)
                manifest["export_path"] = os.path.join(base_path, batch_dir)
                exports.append(manifest)
        return sorted(exports, key=lambda x: x.get("created_at", ""), reverse=True)
# Singleton instance, lazily created on first call to
# get_training_export_service().
_export_service: Optional[TrainingExportService] = None
def get_training_export_service() -> TrainingExportService:
    """Get or create the training export service singleton.

    NOTE(review): the check-then-set below is not guarded by a lock, so
    concurrent first calls could each build an instance; the last
    assignment wins.
    """
    global _export_service
    if _export_service is None:
        _export_service = TrainingExportService()
    return _export_service