fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
448
klausur-service/backend/training_export_service.py
Normal file
448
klausur-service/backend/training_export_service.py
Normal file
@@ -0,0 +1,448 @@
|
||||
"""
|
||||
Training Export Service for OCR Labeling Data
|
||||
|
||||
Exports labeled OCR data in formats suitable for fine-tuning:
|
||||
- TrOCR (Microsoft's Transformer-based OCR model)
|
||||
- llama3.2-vision (Meta's Vision-Language Model)
|
||||
- Generic JSONL format
|
||||
|
||||
DATENSCHUTZ/PRIVACY:
|
||||
- Alle Daten bleiben lokal auf dem Mac Mini
|
||||
- Keine Cloud-Uploads ohne explizite Zustimmung
|
||||
- Export-Pfade sind konfigurierbar
|
||||
"""
|
||||
|
||||
import base64
import hashlib
import json
import os
import shutil
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
|
||||
|
||||
# Export directory configuration.
# The base path is overridable via the OCR_EXPORT_PATH env var so exports can
# be redirected (privacy requirement per module docstring: data stays local).
EXPORT_BASE_PATH = os.getenv("OCR_EXPORT_PATH", "/app/ocr-exports")
TROCR_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "trocr")
LLAMA_VISION_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "llama-vision")
GENERIC_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "generic")
|
||||
|
||||
|
||||
@dataclass
class TrainingSample:
    """A single training sample for OCR fine-tuning."""
    # Stable unique identifier; also used as the exported image file stem.
    id: str
    # Source path of the sample image on disk.
    image_path: str
    # Human-verified transcription used as the training label.
    ground_truth: str
    # Raw OCR engine output, if available (written only by the generic export).
    ocr_text: Optional[str] = None
    # Confidence reported by the OCR engine, if available.
    ocr_confidence: Optional[float] = None
    # Arbitrary extra info, passed through verbatim by the generic export.
    metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
@dataclass
class ExportResult:
    """Result of a training data export."""
    # Which exporter produced the batch: "trocr", "llama_vision", or "generic".
    export_format: str
    # Directory containing the exported batch.
    export_path: str
    # Number of samples written.
    sample_count: int
    # Identifier of this batch (also the batch subdirectory name).
    batch_id: str
    # UTC timestamp of the export.
    created_at: datetime
    # Path to the batch's manifest.json.
    manifest_path: str
|
||||
|
||||
|
||||
class TrOCRExporter:
    """
    Export training data for TrOCR fine-tuning.

    TrOCR expects:
    - Image files (PNG/JPG)
    - A CSV/TSV file with: image_path, text
    - Or a JSONL file with: {"file_name": "img.png", "text": "ground truth"}

    We use the JSONL format for flexibility.
    """

    def __init__(self, export_path: str = TROCR_EXPORT_PATH):
        """Create the exporter and ensure its export directory exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in TrOCR format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # One JSONL record per sample. Images are referenced relative to the
        # batch directory when copied; otherwise by their original source path.
        export_data = []
        for sample in samples:
            if copy_images and os.path.exists(sample.image_path):
                image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
                dest_path = os.path.join(images_path, image_filename)
                shutil.copy2(sample.image_path, dest_path)
                image_ref = f"images/{image_filename}"
            else:
                image_ref = sample.image_path

            export_data.append({
                "file_name": image_ref,
                "text": sample.ground_truth,
                "id": sample.id,
            })

        # Write JSONL file (ensure_ascii=False keeps German umlauts readable).
        jsonl_path = os.path.join(batch_path, "train.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # FIX: datetime.utcnow() is deprecated (Python 3.12) and returns a
        # naive datetime. Take one aware UTC timestamp so the manifest and the
        # returned result agree exactly.
        created_at = datetime.now(timezone.utc)

        # Write manifest describing the batch layout and intended base model.
        manifest = {
            "format": "trocr",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data": "train.jsonl",
                "images": "images/",
            },
            "model_config": {
                "base_model": "microsoft/trocr-base-handwritten",
                "task": "handwriting-recognition",
            },
        }
        manifest_path = os.path.join(batch_path, "manifest.json")
        # FIX: explicit encoding (platform default may not be UTF-8).
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2)

        return ExportResult(
            export_format="trocr",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
|
||||
|
||||
|
||||
class LlamaVisionExporter:
    """
    Export training data for llama3.2-vision fine-tuning.

    Llama Vision fine-tuning expects:
    - JSONL format with base64-encoded images or image URLs
    - Format: {"messages": [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "..."}]}, {"role": "assistant", "content": "..."}]}

    We create a supervised fine-tuning dataset.
    """

    def __init__(self, export_path: str = LLAMA_VISION_EXPORT_PATH):
        """Create the exporter and ensure its export directory exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def _encode_image_base64(self, image_path: str) -> Optional[str]:
        """Encode image to base64, or None if the file cannot be read."""
        try:
            with open(image_path, 'rb') as f:
                return base64.b64encode(f.read()).decode('utf-8')
        # FIX: narrowed from bare `except Exception` — only file-access errors
        # are expected here, and the caller already handles None (falls back
        # to a path reference).
        except OSError:
            return None

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        include_base64: bool = False,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in Llama Vision fine-tuning format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            include_base64: Whether to include base64-encoded images in JSONL
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # OCR instruction prompt (German, matches the handwriting domain).
        system_prompt = (
            "Du bist ein OCR-Experte für deutsche Handschrift. "
            "Lies den handgeschriebenen Text im Bild und gib ihn wortgetreu wieder."
        )

        # Build one chat-format record per sample (system/user/assistant).
        export_data = []
        for sample in samples:
            # Copy image if requested; otherwise reference the source path.
            if copy_images and os.path.exists(sample.image_path):
                image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
                dest_path = os.path.join(images_path, image_filename)
                shutil.copy2(sample.image_path, dest_path)
                image_ref = f"images/{image_filename}"
            else:
                image_ref = sample.image_path

            user_content = [
                {"type": "image_url", "image_url": {"url": image_ref}},
                {"type": "text", "text": "Lies den handgeschriebenen Text in diesem Bild."},
            ]

            # Optionally embed the image as a base64 data URL instead of a
            # path reference (self-contained JSONL, but much larger files).
            if include_base64:
                b64 = self._encode_image_base64(sample.image_path)
                if b64:
                    ext = Path(sample.image_path).suffix.lower().replace('.', '')
                    mime = {'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg'}.get(ext, 'image/png')
                    user_content[0] = {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime};base64,{b64}"}
                    }

            export_data.append({
                "id": sample.id,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": sample.ground_truth},
                ],
            })

        # Write JSONL file (ensure_ascii=False keeps German umlauts readable).
        jsonl_path = os.path.join(batch_path, "train.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # FIX: datetime.utcnow() is deprecated (Python 3.12) and naive; take
        # one aware UTC timestamp shared by manifest and result.
        created_at = datetime.now(timezone.utc)

        # Write manifest describing the batch layout and intended base model.
        manifest = {
            "format": "llama_vision",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data": "train.jsonl",
                "images": "images/",
            },
            "model_config": {
                "base_model": "llama3.2-vision:11b",
                "task": "handwriting-ocr",
                "system_prompt": system_prompt,
            },
        }
        manifest_path = os.path.join(batch_path, "manifest.json")
        # FIX: explicit encoding (platform default may not be UTF-8).
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2)

        return ExportResult(
            export_format="llama_vision",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
|
||||
|
||||
|
||||
class GenericExporter:
    """
    Export training data in a generic JSONL format.

    This format is compatible with most ML frameworks and can be
    easily converted to other formats.
    """

    def __init__(self, export_path: str = GENERIC_EXPORT_PATH):
        """Create the exporter and ensure its export directory exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in generic JSONL format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # One record per sample; unlike the model-specific exporters, this
        # format preserves the OCR output, confidence, and metadata.
        export_data = []
        for sample in samples:
            # Copy image if requested; otherwise reference the source path.
            if copy_images and os.path.exists(sample.image_path):
                image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
                dest_path = os.path.join(images_path, image_filename)
                shutil.copy2(sample.image_path, dest_path)
                image_ref = f"images/{image_filename}"
            else:
                image_ref = sample.image_path

            export_data.append({
                "id": sample.id,
                "image_path": image_ref,
                "ground_truth": sample.ground_truth,
                "ocr_text": sample.ocr_text,
                "ocr_confidence": sample.ocr_confidence,
                "metadata": sample.metadata or {},
            })

        # Write JSONL file (ensure_ascii=False keeps German umlauts readable).
        jsonl_path = os.path.join(batch_path, "data.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # Also write as single JSON for convenience.
        json_path = os.path.join(batch_path, "data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        # FIX: datetime.utcnow() is deprecated (Python 3.12) and naive; take
        # one aware UTC timestamp shared by manifest and result.
        created_at = datetime.now(timezone.utc)

        # Write manifest describing the batch layout.
        manifest = {
            "format": "generic",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data_jsonl": "data.jsonl",
                "data_json": "data.json",
                "images": "images/",
            },
        }
        manifest_path = os.path.join(batch_path, "manifest.json")
        # FIX: explicit encoding (platform default may not be UTF-8).
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2)

        return ExportResult(
            export_format="generic",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
|
||||
|
||||
|
||||
class TrainingExportService:
    """
    Main service for exporting OCR labeling data to various training formats.
    """

    def __init__(self):
        """Instantiate one exporter per supported format."""
        self.trocr_exporter = TrOCRExporter()
        self.llama_vision_exporter = LlamaVisionExporter()
        self.generic_exporter = GenericExporter()

    def export(
        self,
        samples: List[TrainingSample],
        export_format: str,
        batch_id: Optional[str] = None,
        **kwargs,
    ) -> ExportResult:
        """
        Export training samples in the specified format.

        Args:
            samples: List of training samples
            export_format: 'trocr', 'llama_vision', or 'generic'
            batch_id: Optional batch ID (generated if not provided)
            **kwargs: Additional format-specific options

        Returns:
            ExportResult with export details

        Raises:
            ValueError: If export_format is not a supported format.
        """
        if not batch_id:
            # FIX: datetime.utcnow() is deprecated (Python 3.12); use an
            # explicit UTC-aware clock. Same "%Y%m%d_%H%M%S" format as before.
            batch_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

        # Dispatch table instead of an if/elif chain: keeps the supported
        # formats in one place and scales to new exporters.
        exporters = {
            "trocr": self.trocr_exporter,
            "llama_vision": self.llama_vision_exporter,
            "generic": self.generic_exporter,
        }
        exporter = exporters.get(export_format)
        if exporter is None:
            raise ValueError(f"Unknown export format: {export_format}")
        return exporter.export(samples, batch_id, **kwargs)

    def list_exports(self, export_format: Optional[str] = None) -> List[Dict]:
        """
        List all available exports.

        Args:
            export_format: Optional filter by format

        Returns:
            List of export manifests, newest first (by manifest "created_at").
        """
        exports = []

        paths_to_check = []
        if export_format is None or export_format == "trocr":
            paths_to_check.append((TROCR_EXPORT_PATH, "trocr"))
        if export_format is None or export_format == "llama_vision":
            paths_to_check.append((LLAMA_VISION_EXPORT_PATH, "llama_vision"))
        if export_format is None or export_format == "generic":
            paths_to_check.append((GENERIC_EXPORT_PATH, "generic"))

        for base_path, _fmt in paths_to_check:
            if not os.path.exists(base_path):
                continue
            for batch_dir in os.listdir(base_path):
                manifest_path = os.path.join(base_path, batch_dir, "manifest.json")
                if not os.path.exists(manifest_path):
                    continue
                # FIX: a single unreadable/corrupt manifest previously raised
                # and broke the entire listing; skip it instead. Explicit
                # encoding added as well.
                try:
                    with open(manifest_path, 'r', encoding='utf-8') as f:
                        manifest = json.load(f)
                except (OSError, json.JSONDecodeError):
                    continue
                manifest["export_path"] = os.path.join(base_path, batch_dir)
                exports.append(manifest)

        return sorted(exports, key=lambda x: x.get("created_at", ""), reverse=True)
|
||||
|
||||
|
||||
# Singleton instance
|
||||
# Lazily-created module-level singleton; populated on first access.
_export_service: Optional[TrainingExportService] = None


def get_training_export_service() -> TrainingExportService:
    """Return the shared TrainingExportService, creating it on first use."""
    global _export_service
    if _export_service is not None:
        return _export_service
    _export_service = TrainingExportService()
    return _export_service
|
||||
Reference in New Issue
Block a user