fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
424
backend/klausur/services/processing_service.py
Normal file
424
backend/klausur/services/processing_service.py
Normal file
@@ -0,0 +1,424 @@
|
||||
"""
|
||||
Background Processing Service for Klausur Correction.
|
||||
|
||||
Orchestrates the complete correction pipeline:
|
||||
1. Load documents from storage
|
||||
2. Run TrOCR for text extraction
|
||||
3. Run AI correction for grading
|
||||
4. Save results to database
|
||||
|
||||
PRIVACY BY DESIGN:
|
||||
- Only pseudonymized doc_tokens used throughout
|
||||
- No student names in processing pipeline
|
||||
- All data stays on self-hosted infrastructure
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Callable
|
||||
from dataclasses import dataclass
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ..db_models import (
|
||||
ExamSession, PseudonymizedDocument,
|
||||
SessionStatus, DocumentStatus
|
||||
)
|
||||
from ..repository import KlausurRepository
|
||||
from .trocr_client import get_trocr_client, TrOCRClient
|
||||
from .vision_ocr_service import get_vision_ocr_service, VisionOCRService
|
||||
from .correction_service import (
|
||||
get_correction_service, ExamCorrectionService,
|
||||
QuestionRubric, CorrectionResult
|
||||
)
|
||||
from .storage_service import get_storage_service, KlausurStorageService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ProcessingProgress:
    """Snapshot of pipeline progress, streamed to clients over SSE."""
    session_id: str
    total_documents: int
    processed_documents: int
    current_document: Optional[str] = None
    current_step: str = "idle"  # ocr, correction, saving
    error: Optional[str] = None

    @property
    def percentage(self) -> int:
        """Completed fraction as a whole-number percent; 0 for an empty session."""
        done, total = self.processed_documents, self.total_documents
        # Same truncating arithmetic as int(done / total * 100).
        return int(done / total * 100) if total else 0
|
||||
|
||||
|
||||
class ProcessingService:
    """
    Background service for exam correction processing.

    Pipeline per document: OCR extraction (Vision-LLM preferred for
    handwriting, TrOCR as fallback) -> optional AI correction -> result
    persistence via KlausurRepository. Progress snapshots are pushed to an
    optional callback for SSE streaming.

    PRIVACY: only pseudonymized doc_tokens flow through this service; no
    student names appear anywhere in the pipeline.

    Usage:
        service = ProcessingService(db_session)
        await service.process_session(session_id, teacher_id)
    """

    def __init__(
        self,
        db: Session,
        trocr_client: Optional[TrOCRClient] = None,
        vision_ocr_service: Optional[VisionOCRService] = None,
        correction_service: Optional[ExamCorrectionService] = None,
        storage_service: Optional[KlausurStorageService] = None,
        prefer_vision_ocr: bool = True  # Vision-LLM as primary engine for handwriting
    ):
        """
        Args:
            db: Active SQLAlchemy session; this service commits on it directly.
            trocr_client: TrOCR client override (default: module singleton).
            vision_ocr_service: Vision-OCR override (default: module singleton).
            correction_service: AI correction override (default: module singleton).
            storage_service: Storage override (default: module singleton).
            prefer_vision_ocr: Prefer Vision-LLM over TrOCR when both are available.
        """
        self.db = db
        self.repo = KlausurRepository(db)
        self.trocr = trocr_client or get_trocr_client()
        self.vision_ocr = vision_ocr_service or get_vision_ocr_service()
        self.correction = correction_service or get_correction_service()
        self.storage = storage_service or get_storage_service()
        self.prefer_vision_ocr = prefer_vision_ocr

        # Progress callback for SSE streaming; None until a consumer registers.
        self._progress_callback: Optional[Callable[[ProcessingProgress], None]] = None

    def set_progress_callback(self, callback: Callable[[ProcessingProgress], None]):
        """Set callback for progress updates (SSE streaming)."""
        self._progress_callback = callback

    def _notify_progress(self, progress: ProcessingProgress):
        """Forward a progress snapshot to the registered callback, if any.

        A failing callback (e.g. a disconnected SSE consumer) is logged and
        swallowed so it can never abort document processing.
        """
        if self._progress_callback:
            try:
                self._progress_callback(progress)
            except Exception as e:
                logger.warning(f"Progress callback failed: {e}")

    async def process_session(
        self,
        session_id: str,
        teacher_id: str,
        use_ai_correction: bool = True
    ) -> bool:
        """
        Process all documents in a session.

        Args:
            session_id: Exam session ID
            teacher_id: Teacher ID for isolation
            use_ai_correction: Whether to run AI correction (requires LLM)

        Returns:
            True if processing completed successfully; False when the session
            does not exist (for this teacher) or contains no documents.
        """
        # Guard: session must exist and belong to this teacher.
        session = self.repo.get_session(session_id, teacher_id)
        if not session:
            logger.error(f"Session not found: {session_id}")
            return False

        # Guard: nothing to do without documents.
        documents = self.repo.list_documents(session_id, teacher_id)
        if not documents:
            logger.warning(f"No documents in session: {session_id}")
            return False

        total = len(documents)
        processed = 0

        logger.info(f"Starting processing for session {session_id}: {total} documents")

        # Probe OCR backends once up front (Vision-LLM preferred for handwriting).
        vision_ocr_available = await self.vision_ocr.is_available()
        trocr_available = await self.trocr.is_available()

        if vision_ocr_available and self.prefer_vision_ocr:
            logger.info("Using Vision-LLM (llama3.2-vision) for OCR - optimal for handwriting")
            use_vision_ocr = True
        elif trocr_available:
            logger.info("Using TrOCR for OCR")
            use_vision_ocr = False
        elif vision_ocr_available:
            logger.info("TrOCR not available, falling back to Vision-LLM")
            use_vision_ocr = True
        else:
            # Neither backend reachable: the OCR step is skipped per-document
            # below because both availability flags are False here.
            logger.warning("No OCR service available - OCR will be skipped")
            use_vision_ocr = False

        # Process each document independently; one failure must not stop the rest.
        for doc in documents:
            progress = ProcessingProgress(
                session_id=session_id,
                total_documents=total,
                processed_documents=processed,
                current_document=doc.doc_token[:8],
                current_step="ocr"
            )
            self._notify_progress(progress)

            try:
                # Step 1: OCR extraction (Vision-LLM or TrOCR), only for
                # freshly uploaded documents.
                if (vision_ocr_available or trocr_available) and doc.status == DocumentStatus.UPLOADED:
                    await self._process_ocr(session_id, doc, teacher_id, use_vision_ocr=use_vision_ocr)

                # Step 2: AI correction (requires OCR text to grade).
                progress.current_step = "correction"
                self._notify_progress(progress)

                if use_ai_correction and doc.ocr_text:
                    await self._process_correction(session, doc, teacher_id)
                else:
                    # No AI pass: just mark the document as done.
                    self._mark_document_completed(doc, teacher_id)

                processed += 1

            except Exception as e:
                logger.error(f"Failed to process document {doc.doc_token}: {e}")
                self._mark_document_failed(doc, str(e), teacher_id)

        # NOTE(review): the session is marked COMPLETED even when every
        # document failed (processed == 0) — confirm whether a FAILED session
        # status is desired in that case before changing.
        self.repo.update_session_status(session_id, teacher_id, SessionStatus.COMPLETED)

        # Final progress snapshot.
        progress = ProcessingProgress(
            session_id=session_id,
            total_documents=total,
            processed_documents=processed,
            current_step="complete"
        )
        self._notify_progress(progress)

        logger.info(f"Completed processing session {session_id}: {processed}/{total} documents")
        return True

    async def _process_ocr(
        self,
        session_id: str,
        doc: PseudonymizedDocument,
        teacher_id: str,
        use_vision_ocr: bool = True
    ):
        """
        Run OCR on a document.

        Args:
            session_id: Session ID
            doc: Document to process
            teacher_id: Teacher ID
            use_vision_ocr: True to use Vision-LLM (llama3.2-vision), False for TrOCR

        Never raises on OCR failure: a placeholder text with confidence 0 is
        stored instead so the pipeline can continue to the correction step.
        """
        doc.status = DocumentStatus.OCR_PROCESSING
        # Naive UTC timestamp, consistent with the rest of this module
        # (datetime.utcnow is deprecated in 3.12+ — migrate all call sites together).
        doc.processing_started_at = datetime.utcnow()
        self.db.commit()

        # Locate the image in storage: prefer the redacted variant, then try
        # every supported extension.
        image_data = None
        for is_redacted in [True, False]:
            for ext in ["png", "jpg", "jpeg", "pdf"]:
                image_data = self.storage.get_document(
                    session_id, doc.doc_token, ext, is_redacted=is_redacted
                )
                if image_data:
                    logger.debug(f"Found document: {doc.doc_token[:8]}.{ext} (redacted={is_redacted})")
                    break
            if image_data:
                break

        if not image_data:
            logger.warning(f"No image found for document {doc.doc_token}")
            # Placeholder so downstream steps have something to show.
            doc.ocr_text = "[Kein Bild gefunden - Manuelle Eingabe erforderlich]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED
            self.db.commit()
            return

        # Call the selected OCR backend.
        try:
            if use_vision_ocr:
                # Vision-LLM (llama3.2-vision) - better for handwriting.
                result = await self.vision_ocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    is_handwriting=True  # exams are assumed handwritten
                )
                ocr_method = "Vision-LLM"
            else:
                result = await self.trocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    detect_lines=True
                )
                ocr_method = "TrOCR"

            doc.ocr_text = result.text
            # Confidence is stored as an integer percentage (0-100).
            doc.ocr_confidence = int(result.confidence * 100)
            doc.status = DocumentStatus.OCR_COMPLETED

            logger.info(
                f"OCR completed ({ocr_method}) for {doc.doc_token[:8]}: "
                f"{len(result.text)} chars, {result.confidence:.0%} confidence"
            )

        except Exception as e:
            logger.error(f"OCR failed for {doc.doc_token}: {e}")
            doc.ocr_text = f"[OCR Fehler: {str(e)[:100]}]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED  # continue to AI anyway

        self.db.commit()

    async def _process_correction(
        self,
        session: ExamSession,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Run AI correction on a document.

        Without rubrics the document is completed with a manual-review hint;
        a failing AI call stores the error as feedback but still completes the
        document so the session can finish.
        """
        doc.status = DocumentStatus.AI_PROCESSING
        self.db.commit()

        # Build rubrics from the session's question definitions.
        rubrics = self._build_rubrics(session)

        if not rubrics:
            # No grading criteria defined - recommend manual correction.
            doc.ai_feedback = "Keine Bewertungskriterien definiert. Manuelle Korrektur empfohlen."
            doc.ai_score = None
            doc.ai_grade = None
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            self.db.commit()

            # Update session stats.
            session.processed_count += 1
            self.db.commit()
            return

        try:
            result = await self.correction.correct_exam(
                doc_token=doc.doc_token,
                ocr_text=doc.ocr_text,
                rubrics=rubrics,
                subject=session.subject or "Allgemein"
            )

            # Persist grading results, including per-question details.
            doc.ai_feedback = result.overall_feedback
            doc.ai_score = result.total_score
            doc.ai_grade = result.grade
            doc.ai_details = {
                "max_score": result.max_score,
                "processing_time_ms": result.processing_time_ms,
                "questions": [
                    {
                        "number": q.question_number,
                        "points": q.points_awarded,
                        "max_points": q.max_points,
                        "feedback": q.feedback,
                        "strengths": q.strengths,
                        "improvements": q.improvements
                    }
                    for q in result.question_results
                ]
            }
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()

            logger.info(
                f"Correction completed for {doc.doc_token[:8]}: "
                f"{result.total_score}/{result.max_score} ({result.grade})"
            )

        except Exception as e:
            logger.error(f"AI correction failed for {doc.doc_token}: {e}")
            doc.ai_feedback = f"KI-Korrektur fehlgeschlagen: {str(e)[:200]}"
            doc.status = DocumentStatus.COMPLETED  # mark complete anyway
            doc.processing_completed_at = datetime.utcnow()

        # Update session stats.
        session.processed_count += 1
        self.db.commit()

    def _build_rubrics(self, session: ExamSession) -> List[QuestionRubric]:
        """Build the QuestionRubric list from the session's question dicts.

        Missing per-question fields fall back to sensible defaults
        (sequential numbering, 10 points, session-level rubric text).
        Returns an empty list when the session has no questions.
        """
        return [
            QuestionRubric(
                question_number=q.get("number", i + 1),
                question_text=q.get("text", f"Frage {i + 1}"),
                max_points=q.get("points", 10),
                expected_answer=q.get("expected_answer", ""),
                grading_criteria=q.get("rubric", session.rubric or "")
            )
            for i, q in enumerate(session.questions or [])
        ]

    def _mark_document_completed(
        self,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Mark document as completed without AI correction."""
        doc.status = DocumentStatus.COMPLETED
        doc.processing_completed_at = datetime.utcnow()
        if not doc.ai_feedback:
            doc.ai_feedback = "Verarbeitung abgeschlossen (ohne KI-Korrektur)"
        self.db.commit()

        # Update session stats (only when the relationship is loaded).
        if doc.session:
            doc.session.processed_count += 1
            self.db.commit()

    def _mark_document_failed(
        self,
        doc: PseudonymizedDocument,
        error: str,
        teacher_id: str
    ):
        """Mark document as failed, storing a truncated error message."""
        doc.status = DocumentStatus.FAILED
        doc.processing_error = error[:500]  # fit the DB column
        doc.processing_completed_at = datetime.utcnow()
        self.db.commit()
|
||||
|
||||
|
||||
# Background task function for FastAPI
|
||||
async def process_session_background(
    session_id: str,
    teacher_id: str,
    db_url: str
):
    """
    Background task for session processing.

    Opens a dedicated DB session (background tasks must not reuse the
    request-scoped one) and guarantees it is closed when processing ends.
    """
    from ..database import SessionLocal

    db = SessionLocal()
    try:
        await ProcessingService(db).process_session(session_id, teacher_id)
    finally:
        db.close()
|
||||
|
||||
|
||||
# Singleton for main service
# NOTE(review): this slot is never read or written by get_processing_service()
# below — it looks vestigial; confirm no other module touches it before removal.
_processing_service: Optional[ProcessingService] = None


def get_processing_service(db: Session) -> ProcessingService:
    """Get processing service instance.

    Returns a fresh ProcessingService bound to the given DB session —
    one instance per call, despite the unused singleton slot above.
    """
    return ProcessingService(db)
|
||||
Reference in New Issue
Block a user