fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
448
klausur-service/backend/training_export_service.py
Normal file
448
klausur-service/backend/training_export_service.py
Normal file
@@ -0,0 +1,448 @@
|
||||
"""
|
||||
Training Export Service for OCR Labeling Data
|
||||
|
||||
Exports labeled OCR data in formats suitable for fine-tuning:
|
||||
- TrOCR (Microsoft's Transformer-based OCR model)
|
||||
- llama3.2-vision (Meta's Vision-Language Model)
|
||||
- Generic JSONL format
|
||||
|
||||
DATENSCHUTZ/PRIVACY:
|
||||
- Alle Daten bleiben lokal auf dem Mac Mini
|
||||
- Keine Cloud-Uploads ohne explizite Zustimmung
|
||||
- Export-Pfade sind konfigurierbar
|
||||
"""
|
||||
|
||||
import base64
import hashlib
import json
import os
import shutil
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
|
||||
|
||||
# Export directory configuration.
# The base path is overridable via the OCR_EXPORT_PATH env var so exports can
# be redirected (privacy requirement per module docstring: data stays local).
EXPORT_BASE_PATH = os.getenv("OCR_EXPORT_PATH", "/app/ocr-exports")
TROCR_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "trocr")
LLAMA_VISION_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "llama-vision")
GENERIC_EXPORT_PATH = os.path.join(EXPORT_BASE_PATH, "generic")
|
||||
|
||||
|
||||
@dataclass
class TrainingSample:
    """A single training sample for OCR fine-tuning."""
    # Stable unique identifier; also used as the exported image file stem.
    id: str
    # Source path of the sample image on disk.
    image_path: str
    # Human-verified transcription used as the training label.
    ground_truth: str
    # Raw OCR engine output, if available (written only by the generic export).
    ocr_text: Optional[str] = None
    # Confidence reported by the OCR engine, if available.
    ocr_confidence: Optional[float] = None
    # Arbitrary extra info, passed through verbatim by the generic export.
    metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
@dataclass
class ExportResult:
    """Result of a training data export."""
    # Which exporter produced the batch: "trocr", "llama_vision", or "generic".
    export_format: str
    # Directory containing the exported batch.
    export_path: str
    # Number of samples written.
    sample_count: int
    # Identifier of this batch (also the batch subdirectory name).
    batch_id: str
    # UTC timestamp of the export.
    created_at: datetime
    # Path to the batch's manifest.json.
    manifest_path: str
|
||||
|
||||
|
||||
class TrOCRExporter:
    """
    Export training data for TrOCR fine-tuning.

    TrOCR expects:
    - Image files (PNG/JPG)
    - A CSV/TSV file with: image_path, text
    - Or a JSONL file with: {"file_name": "img.png", "text": "ground truth"}

    We use the JSONL format for flexibility.
    """

    def __init__(self, export_path: str = TROCR_EXPORT_PATH):
        """Create the exporter and ensure its export directory exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in TrOCR format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # One JSONL record per sample. Images are referenced relative to the
        # batch directory when copied; otherwise by their original source path.
        export_data = []
        for sample in samples:
            if copy_images and os.path.exists(sample.image_path):
                image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
                dest_path = os.path.join(images_path, image_filename)
                shutil.copy2(sample.image_path, dest_path)
                image_ref = f"images/{image_filename}"
            else:
                image_ref = sample.image_path

            export_data.append({
                "file_name": image_ref,
                "text": sample.ground_truth,
                "id": sample.id,
            })

        # Write JSONL file (ensure_ascii=False keeps German umlauts readable).
        jsonl_path = os.path.join(batch_path, "train.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # FIX: datetime.utcnow() is deprecated (Python 3.12) and returns a
        # naive datetime. Take one aware UTC timestamp so the manifest and the
        # returned result agree exactly.
        created_at = datetime.now(timezone.utc)

        # Write manifest describing the batch layout and intended base model.
        manifest = {
            "format": "trocr",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data": "train.jsonl",
                "images": "images/",
            },
            "model_config": {
                "base_model": "microsoft/trocr-base-handwritten",
                "task": "handwriting-recognition",
            },
        }
        manifest_path = os.path.join(batch_path, "manifest.json")
        # FIX: explicit encoding (platform default may not be UTF-8).
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2)

        return ExportResult(
            export_format="trocr",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
|
||||
|
||||
|
||||
class LlamaVisionExporter:
    """
    Export training data for llama3.2-vision fine-tuning.

    Llama Vision fine-tuning expects:
    - JSONL format with base64-encoded images or image URLs
    - Format: {"messages": [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "..."}]}, {"role": "assistant", "content": "..."}]}

    We create a supervised fine-tuning dataset.
    """

    def __init__(self, export_path: str = LLAMA_VISION_EXPORT_PATH):
        """Create the exporter and ensure its export directory exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def _encode_image_base64(self, image_path: str) -> Optional[str]:
        """Encode image to base64, or None if the file cannot be read."""
        try:
            with open(image_path, 'rb') as f:
                return base64.b64encode(f.read()).decode('utf-8')
        # FIX: narrowed from bare `except Exception` — only file-access errors
        # are expected here, and the caller already handles None (falls back
        # to a path reference).
        except OSError:
            return None

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        include_base64: bool = False,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in Llama Vision fine-tuning format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            include_base64: Whether to include base64-encoded images in JSONL
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # OCR instruction prompt (German, matches the handwriting domain).
        system_prompt = (
            "Du bist ein OCR-Experte für deutsche Handschrift. "
            "Lies den handgeschriebenen Text im Bild und gib ihn wortgetreu wieder."
        )

        # Build one chat-format record per sample (system/user/assistant).
        export_data = []
        for sample in samples:
            # Copy image if requested; otherwise reference the source path.
            if copy_images and os.path.exists(sample.image_path):
                image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
                dest_path = os.path.join(images_path, image_filename)
                shutil.copy2(sample.image_path, dest_path)
                image_ref = f"images/{image_filename}"
            else:
                image_ref = sample.image_path

            user_content = [
                {"type": "image_url", "image_url": {"url": image_ref}},
                {"type": "text", "text": "Lies den handgeschriebenen Text in diesem Bild."},
            ]

            # Optionally embed the image as a base64 data URL instead of a
            # path reference (self-contained JSONL, but much larger files).
            if include_base64:
                b64 = self._encode_image_base64(sample.image_path)
                if b64:
                    ext = Path(sample.image_path).suffix.lower().replace('.', '')
                    mime = {'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg'}.get(ext, 'image/png')
                    user_content[0] = {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime};base64,{b64}"}
                    }

            export_data.append({
                "id": sample.id,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": sample.ground_truth},
                ],
            })

        # Write JSONL file (ensure_ascii=False keeps German umlauts readable).
        jsonl_path = os.path.join(batch_path, "train.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # FIX: datetime.utcnow() is deprecated (Python 3.12) and naive; take
        # one aware UTC timestamp shared by manifest and result.
        created_at = datetime.now(timezone.utc)

        # Write manifest describing the batch layout and intended base model.
        manifest = {
            "format": "llama_vision",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data": "train.jsonl",
                "images": "images/",
            },
            "model_config": {
                "base_model": "llama3.2-vision:11b",
                "task": "handwriting-ocr",
                "system_prompt": system_prompt,
            },
        }
        manifest_path = os.path.join(batch_path, "manifest.json")
        # FIX: explicit encoding (platform default may not be UTF-8).
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2)

        return ExportResult(
            export_format="llama_vision",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
|
||||
|
||||
|
||||
class GenericExporter:
    """
    Export training data in a generic JSONL format.

    This format is compatible with most ML frameworks and can be
    easily converted to other formats.
    """

    def __init__(self, export_path: str = GENERIC_EXPORT_PATH):
        """Create the exporter and ensure its export directory exists."""
        self.export_path = export_path
        os.makedirs(export_path, exist_ok=True)

    def export(
        self,
        samples: List[TrainingSample],
        batch_id: str,
        copy_images: bool = True,
    ) -> ExportResult:
        """
        Export samples in generic JSONL format.

        Args:
            samples: List of training samples
            batch_id: Unique batch identifier
            copy_images: Whether to copy images to export directory

        Returns:
            ExportResult with export details
        """
        batch_path = os.path.join(self.export_path, batch_id)
        images_path = os.path.join(batch_path, "images")
        os.makedirs(images_path, exist_ok=True)

        # One record per sample; unlike the model-specific exporters, this
        # format preserves the OCR output, confidence, and metadata.
        export_data = []
        for sample in samples:
            # Copy image if requested; otherwise reference the source path.
            if copy_images and os.path.exists(sample.image_path):
                image_filename = f"{sample.id}{Path(sample.image_path).suffix}"
                dest_path = os.path.join(images_path, image_filename)
                shutil.copy2(sample.image_path, dest_path)
                image_ref = f"images/{image_filename}"
            else:
                image_ref = sample.image_path

            export_data.append({
                "id": sample.id,
                "image_path": image_ref,
                "ground_truth": sample.ground_truth,
                "ocr_text": sample.ocr_text,
                "ocr_confidence": sample.ocr_confidence,
                "metadata": sample.metadata or {},
            })

        # Write JSONL file (ensure_ascii=False keeps German umlauts readable).
        jsonl_path = os.path.join(batch_path, "data.jsonl")
        with open(jsonl_path, 'w', encoding='utf-8') as f:
            for item in export_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

        # Also write as single JSON for convenience.
        json_path = os.path.join(batch_path, "data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        # FIX: datetime.utcnow() is deprecated (Python 3.12) and naive; take
        # one aware UTC timestamp shared by manifest and result.
        created_at = datetime.now(timezone.utc)

        # Write manifest describing the batch layout.
        manifest = {
            "format": "generic",
            "version": "1.0",
            "batch_id": batch_id,
            "sample_count": len(samples),
            "created_at": created_at.isoformat(),
            "files": {
                "data_jsonl": "data.jsonl",
                "data_json": "data.json",
                "images": "images/",
            },
        }
        manifest_path = os.path.join(batch_path, "manifest.json")
        # FIX: explicit encoding (platform default may not be UTF-8).
        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2)

        return ExportResult(
            export_format="generic",
            export_path=batch_path,
            sample_count=len(samples),
            batch_id=batch_id,
            created_at=created_at,
            manifest_path=manifest_path,
        )
|
||||
|
||||
|
||||
class TrainingExportService:
    """
    Main service for exporting OCR labeling data to various training formats.
    """

    def __init__(self):
        """Instantiate one exporter per supported format."""
        self.trocr_exporter = TrOCRExporter()
        self.llama_vision_exporter = LlamaVisionExporter()
        self.generic_exporter = GenericExporter()

    def export(
        self,
        samples: List[TrainingSample],
        export_format: str,
        batch_id: Optional[str] = None,
        **kwargs,
    ) -> ExportResult:
        """
        Export training samples in the specified format.

        Args:
            samples: List of training samples
            export_format: 'trocr', 'llama_vision', or 'generic'
            batch_id: Optional batch ID (generated if not provided)
            **kwargs: Additional format-specific options

        Returns:
            ExportResult with export details

        Raises:
            ValueError: If export_format is not a supported format.
        """
        if not batch_id:
            # FIX: datetime.utcnow() is deprecated (Python 3.12); use an
            # explicit UTC-aware clock. Same "%Y%m%d_%H%M%S" format as before.
            batch_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

        # Dispatch table instead of an if/elif chain: keeps the supported
        # formats in one place and scales to new exporters.
        exporters = {
            "trocr": self.trocr_exporter,
            "llama_vision": self.llama_vision_exporter,
            "generic": self.generic_exporter,
        }
        exporter = exporters.get(export_format)
        if exporter is None:
            raise ValueError(f"Unknown export format: {export_format}")
        return exporter.export(samples, batch_id, **kwargs)

    def list_exports(self, export_format: Optional[str] = None) -> List[Dict]:
        """
        List all available exports.

        Args:
            export_format: Optional filter by format

        Returns:
            List of export manifests, newest first (by manifest "created_at").
        """
        exports = []

        paths_to_check = []
        if export_format is None or export_format == "trocr":
            paths_to_check.append((TROCR_EXPORT_PATH, "trocr"))
        if export_format is None or export_format == "llama_vision":
            paths_to_check.append((LLAMA_VISION_EXPORT_PATH, "llama_vision"))
        if export_format is None or export_format == "generic":
            paths_to_check.append((GENERIC_EXPORT_PATH, "generic"))

        for base_path, _fmt in paths_to_check:
            if not os.path.exists(base_path):
                continue
            for batch_dir in os.listdir(base_path):
                manifest_path = os.path.join(base_path, batch_dir, "manifest.json")
                if not os.path.exists(manifest_path):
                    continue
                # FIX: a single unreadable/corrupt manifest previously raised
                # and broke the entire listing; skip it instead. Explicit
                # encoding added as well.
                try:
                    with open(manifest_path, 'r', encoding='utf-8') as f:
                        manifest = json.load(f)
                except (OSError, json.JSONDecodeError):
                    continue
                manifest["export_path"] = os.path.join(base_path, batch_dir)
                exports.append(manifest)

        return sorted(exports, key=lambda x: x.get("created_at", ""), reverse=True)
|
||||
|
||||
|
||||
# Singleton instance
|
||||
# Lazily-created module-level singleton; populated on first access.
_export_service: Optional[TrainingExportService] = None


def get_training_export_service() -> TrainingExportService:
    """Return the shared TrainingExportService, creating it on first use."""
    global _export_service
    if _export_service is not None:
        return _export_service
    _export_service = TrainingExportService()
    return _export_service
|
||||
Reference in New Issue
Block a user