fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
379
backend/klausur/services/correction_service.py
Normal file
379
backend/klausur/services/correction_service.py
Normal file
@@ -0,0 +1,379 @@
|
||||
"""
|
||||
Exam Correction Service using Self-Hosted LLM.
|
||||
|
||||
PRIVACY BY DESIGN:
|
||||
- Only pseudonymized text (doc_token + OCR content) is sent to LLM
|
||||
- No student names or personal data in prompts
|
||||
- All processing happens on self-hosted infrastructure (SysEleven)
|
||||
- No data sent to external APIs (unless explicitly configured)
|
||||
|
||||
This service generates AI-assisted corrections and feedback for exam answers.
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
from dataclasses import dataclass
|
||||
|
||||
from llm_gateway.services.inference import get_inference_service, InferenceResult
|
||||
from llm_gateway.models.chat import ChatCompletionRequest, ChatMessage
|
||||
from llm_gateway.config import get_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class QuestionRubric:
    """Rubric for a single exam question.

    Teacher-defined grading input for one question; these fields are
    embedded verbatim into the correction prompt by
    ExamCorrectionService.correct_question.
    """
    # 1-based position of the question within the exam
    question_number: int
    # The question text as posed to the student
    question_text: str
    # Maximum number of points awardable for this question
    max_points: int
    # Reference/model answer the grading is compared against
    expected_answer: str
    # Free-text criteria the LLM is instructed to grade by
    grading_criteria: str
|
||||
|
||||
|
||||
@dataclass
class QuestionResult:
    """AI correction result for a single question.

    Produced by ExamCorrectionService.correct_question, either from the
    LLM's JSON answer or from a fallback when parsing/inference fails.
    """
    # 1-based question number, copied from the rubric
    question_number: int
    # Points granted by the LLM (clamped to the rubric's max_points)
    points_awarded: int
    # Maximum points for this question, copied from the rubric
    max_points: int
    # Short justification of the awarded points
    feedback: str
    # Notable strengths of the answer (may be empty on fallback)
    strengths: List[str]
    # Suggested improvements (fallback inserts a manual-review note)
    improvements: List[str]
|
||||
|
||||
|
||||
@dataclass
class CorrectionResult:
    """Complete correction result for an exam.

    Aggregates per-question results plus the overall score, grade, and
    feedback. Only the pseudonymized doc_token identifies the exam —
    no student names or personal data are stored here.
    """
    # Pseudonymized document identifier (never a student name)
    doc_token: str
    # Sum of points awarded across all questions
    total_score: int
    # Sum of the rubrics' maximum points
    max_score: int
    # German school grade derived via calculate_grade()
    grade: str
    # LLM-generated motivating summary (or a plain score fallback)
    overall_feedback: str
    # One entry per rubric, in rubric order
    question_results: List[QuestionResult]
    # Wall-clock duration of correct_exam in milliseconds
    processing_time_ms: int
|
||||
|
||||
|
||||
# German grading scale (can be customized).
# Pairs of (minimum percentage, grade), ordered from best to worst so the
# FIRST threshold that the percentage reaches determines the grade.
GERMAN_GRADES = [
    (95, "1+"),  # sehr gut plus
    (90, "1"),   # sehr gut
    (85, "1-"),  # sehr gut minus
    (80, "2+"),  # gut plus
    (75, "2"),   # gut
    (70, "2-"),  # gut minus
    (65, "3+"),  # befriedigend plus
    (60, "3"),   # befriedigend
    (55, "3-"),  # befriedigend minus
    (50, "4+"),  # ausreichend plus
    (45, "4"),   # ausreichend
    (40, "4-"),  # ausreichend minus
    (33, "5+"),  # mangelhaft plus
    (27, "5"),   # mangelhaft
    (20, "5-"),  # mangelhaft minus
    (0, "6"),    # ungenuegend
]


def calculate_grade(percentage: float) -> str:
    """Map a score percentage to a German school grade.

    Thresholds are scanned best-to-worst; a percentage below every
    threshold (e.g. a negative value) falls back to "6".
    """
    return next(
        (grade for threshold, grade in GERMAN_GRADES if percentage >= threshold),
        "6",
    )
|
||||
|
||||
|
||||
class ExamCorrectionService:
    """
    Service for AI-assisted exam correction.

    PRIVACY GUARANTEES:
    1. Prompts contain NO personal data
    2. Only doc_token is used as reference
    3. Processing on self-hosted LLM
    4. Results stored with pseudonymized identifiers
    """

    # System prompt for exam correction (German).
    # NOTE: fixed typo "ermutigzendes" -> "ermutigendes".
    CORRECTION_SYSTEM_PROMPT = """Du bist ein erfahrener Lehrer und korrigierst Schuelerantworten.

WICHTIGE REGELN:
1. Bewerte NUR den fachlichen Inhalt der Antwort
2. Ignoriere Rechtschreibfehler (ausser bei Deutschklausuren)
3. Gib konstruktives, ermutigendes Feedback
4. Beziehe dich auf die Bewertungskriterien
5. Sei fair und konsistent

AUSGABEFORMAT (JSON):
{
"points": <Punktzahl>,
"feedback": "<Kurze Begruendung der Bewertung>",
"strengths": ["<Staerke 1>", "<Staerke 2>"],
"improvements": ["<Verbesserungsvorschlag 1>"]
}

Antworte NUR mit dem JSON-Objekt, ohne weitere Erklaerungen."""

    OVERALL_FEEDBACK_PROMPT = """Basierend auf den einzelnen Bewertungen, erstelle eine Gesamtrueckmeldung.

Einzelbewertungen:
{question_results}

Gesamtpunktzahl: {total_score}/{max_score} ({percentage}%)
Note: {grade}

Erstelle eine motivierende Gesamtrueckmeldung (2-3 Saetze), die:
1. Die Staerken hervorhebt
2. Konstruktive Verbesserungsvorschlaege macht
3. Ermutigt und motiviert

Antworte nur mit dem Feedback-Text, ohne JSON-Formatierung."""

    def __init__(self, model: Optional[str] = None):
        """
        Initialize the correction service.

        Args:
            model: LLM model to use (default: qwen2.5:14b from config)

        PRIVACY:
            The model runs on self-hosted infrastructure via Ollama;
            no data is sent to external servers.
        """
        config = get_config()
        # Use configured correction model (default: qwen2.5:14b)
        self.model = model or config.correction_model
        self.inference = get_inference_service()
        logger.info(f"Correction service initialized with model: {self.model}")

    async def correct_question(
        self,
        student_answer: str,
        rubric: "QuestionRubric",
        subject: str = "Allgemein"
    ) -> "QuestionResult":
        """
        Correct a single question answer.

        Args:
            student_answer: The student's OCR-extracted answer (pseudonymized)
            rubric: Grading rubric for this question
            subject: Subject for context

        Returns:
            QuestionResult with points and feedback. Never raises: on any
            inference/parsing failure a zero-point fallback result asking
            for manual review is returned instead.
        """
        # Build prompt with NO personal data (only rubric fields and the
        # pseudonymized OCR answer text).
        user_prompt = f"""Fach: {subject}
Frage {rubric.question_number}: {rubric.question_text}
Maximale Punktzahl: {rubric.max_points}

Erwartete Antwort:
{rubric.expected_answer}

Bewertungskriterien:
{rubric.grading_criteria}

---

Schuelerantwort:
{student_answer}

---

Bewerte diese Antwort nach den Kriterien."""

        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="system", content=self.CORRECTION_SYSTEM_PROMPT),
                ChatMessage(role="user", content=user_prompt),
            ],
            temperature=0.3,  # Lower temperature for consistent grading
            max_tokens=500,
        )

        try:
            response = await self.inference.complete(request)
            content = response.choices[0].message.content or "{}"

            # Parse JSON response
            import json
            try:
                result = json.loads(content)
            except json.JSONDecodeError:
                # Fallback parsing: keep the raw text as feedback and flag
                # the answer for manual review at half points.
                logger.warning(f"Failed to parse LLM response as JSON: {content[:100]}")
                result = {
                    "points": rubric.max_points // 2,
                    "feedback": content[:200],
                    "strengths": [],
                    "improvements": ["Automatische Bewertung fehlgeschlagen - manuelle Pruefung erforderlich"]
                }

            # Coerce defensively: the LLM may emit a non-numeric value.
            try:
                raw_points = int(result.get("points", 0))
            except (TypeError, ValueError):
                raw_points = 0
            # Clamp to [0, max_points] — previously only the upper bound
            # was enforced, so a negative LLM value leaked through.
            points = max(0, min(raw_points, rubric.max_points))

            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=points,
                max_points=rubric.max_points,
                feedback=result.get("feedback", ""),
                strengths=result.get("strengths", []),
                improvements=result.get("improvements", []),
            )

        except Exception as e:
            # Deliberate broad catch: a failed question must not abort the
            # whole exam correction — return a zero-point manual-review result.
            logger.error(f"Correction failed for question {rubric.question_number}: {e}")
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=0,
                max_points=rubric.max_points,
                feedback=f"Automatische Bewertung fehlgeschlagen: {str(e)}",
                strengths=[],
                improvements=["Manuelle Korrektur erforderlich"],
            )

    async def correct_exam(
        self,
        doc_token: str,
        ocr_text: str,
        rubrics: List["QuestionRubric"],
        subject: str = "Allgemein"
    ) -> "CorrectionResult":
        """
        Correct a complete exam with multiple questions.

        Args:
            doc_token: Pseudonymized document identifier
            ocr_text: Full OCR text of the exam (already redacted)
            rubrics: List of question rubrics
            subject: Subject name

        Returns:
            CorrectionResult with all scores and feedback
        """
        import time
        start_time = time.time()

        # Split OCR text into per-question answers (simple heuristic)
        answers = self._extract_answers(ocr_text, len(rubrics))

        # Correct each question sequentially
        question_results = []
        for i, rubric in enumerate(rubrics):
            # _extract_answers pads to len(rubrics); guard anyway.
            answer = answers[i] if i < len(answers) else ""
            result = await self.correct_question(answer, rubric, subject)
            question_results.append(result)

        # Calculate totals (guard against an empty rubric list)
        total_score = sum(r.points_awarded for r in question_results)
        max_score = sum(r.max_points for r in question_results)
        percentage = (total_score / max_score * 100) if max_score > 0 else 0
        grade = calculate_grade(percentage)

        # Generate overall feedback
        overall_feedback = await self._generate_overall_feedback(
            question_results, total_score, max_score, percentage, grade
        )

        processing_time_ms = int((time.time() - start_time) * 1000)

        return CorrectionResult(
            doc_token=doc_token,
            total_score=total_score,
            max_score=max_score,
            grade=grade,
            overall_feedback=overall_feedback,
            question_results=question_results,
            processing_time_ms=processing_time_ms,
        )

    async def _generate_overall_feedback(
        self,
        question_results: List["QuestionResult"],
        total_score: int,
        max_score: int,
        percentage: float,
        grade: str
    ) -> str:
        """Generate motivating overall feedback.

        Falls back to a plain score summary if the LLM call fails.
        """
        # Summarize question results (feedback truncated to keep the
        # prompt small)
        results_summary = "\n".join([
            f"Frage {r.question_number}: {r.points_awarded}/{r.max_points} Punkte - {r.feedback[:100]}"
            for r in question_results
        ])

        prompt = self.OVERALL_FEEDBACK_PROMPT.format(
            question_results=results_summary,
            total_score=total_score,
            max_score=max_score,
            percentage=f"{percentage:.1f}",
            grade=grade,
        )

        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="user", content=prompt),
            ],
            temperature=0.5,
            max_tokens=200,
        )

        try:
            response = await self.inference.complete(request)
            return response.choices[0].message.content or "Gute Arbeit! Weiter so."
        except Exception as e:
            # Best-effort: feedback is non-essential, so degrade gracefully.
            logger.error(f"Failed to generate overall feedback: {e}")
            return f"Gesamtergebnis: {total_score}/{max_score} Punkte ({grade})"

    def _extract_answers(self, ocr_text: str, num_questions: int) -> List[str]:
        """
        Extract individual answers from OCR text.

        Simple heuristic: split by question markers (1., 2., etc.)
        More sophisticated extraction can be implemented.

        Returns exactly num_questions entries: missing answers are padded
        with "", surplus answers are truncated.
        """
        import re

        # Question markers: a line-leading number followed by "." or ")".
        pattern = r'(?:^|\n)\s*(\d+)[.\)]\s*'
        # re.split with one capturing group yields
        # [preamble, num1, answer1, num2, answer2, ...], so the answers
        # sit at every second index starting from 2. This replaces the
        # previous manual while-loop, which could spin forever if the
        # parts list ever had even length (the index was only advanced
        # inside a conditional branch).
        parts = re.split(pattern, ocr_text)
        answers = [answer.strip() for answer in parts[2::2]]

        # Pad with empty answers if needed
        while len(answers) < num_questions:
            answers.append("")

        return answers[:num_questions]
|
||||
|
||||
|
||||
# Module-level singleton, lazily created by get_correction_service()
_correction_service: Optional[ExamCorrectionService] = None


def get_correction_service(model: Optional[str] = None) -> ExamCorrectionService:
    """
    Get or create the correction service singleton.

    Args:
        model: Optional model override. If None, uses config.correction_model (qwen2.5:14b)

    Returns:
        ExamCorrectionService instance

    PRIVACY: all processing happens locally via Ollama - no cloud API.
    """
    global _correction_service
    # Build a fresh instance on first use, or when a different model is
    # explicitly requested; otherwise reuse the cached one.
    must_create = _correction_service is None or (
        model and _correction_service.model != model
    )
    if must_create:
        _correction_service = ExamCorrectionService(model=model)
    return _correction_service
|
||||
Reference in New Issue
Block a user