"""
Exam Correction Service using Self-Hosted LLM.

PRIVACY BY DESIGN:
- Only pseudonymized text (doc_token + OCR content) is sent to LLM
- No student names or personal data in prompts
- All processing happens on self-hosted infrastructure (SysEleven)
- No data sent to external APIs (unless explicitly configured)

This service generates AI-assisted corrections and feedback for exam answers.
"""

import json
import logging
import re
import time
from dataclasses import dataclass
from typing import Any, List, Optional

from llm_gateway.services.inference import get_inference_service, InferenceResult
from llm_gateway.models.chat import ChatCompletionRequest, ChatMessage
from llm_gateway.config import get_config

logger = logging.getLogger(__name__)

# A question marker is a run of digits followed by "." or ")" at the start of
# the text or of a line. Compiled once because it is reused for every exam.
_QUESTION_MARKER_RE = re.compile(r'(?:^|\n)\s*(\d+)[.\)]\s*')

# Matches a Markdown code fence (optionally tagged "json") and captures its body.
_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)```", re.DOTALL | re.IGNORECASE)


@dataclass
class QuestionRubric:
    """Rubric for a single exam question."""

    question_number: int
    question_text: str
    max_points: int
    expected_answer: str
    grading_criteria: str


@dataclass
class QuestionResult:
    """AI correction result for a single question."""

    question_number: int
    points_awarded: int
    max_points: int
    feedback: str
    strengths: List[str]
    improvements: List[str]


@dataclass
class CorrectionResult:
    """Complete correction result for an exam."""

    doc_token: str  # Pseudonymized identifier
    total_score: int
    max_score: int
    grade: str
    overall_feedback: str
    question_results: List[QuestionResult]
    processing_time_ms: int


# German grading scale (can be customized).
# Entries are (minimum percentage, grade) and MUST stay sorted by descending
# threshold: calculate_grade returns the first entry whose threshold is met.
GERMAN_GRADES = [
    (95, "1+"),  # sehr gut plus
    (90, "1"),   # sehr gut
    (85, "1-"),  # sehr gut minus
    (80, "2+"),  # gut plus
    (75, "2"),   # gut
    (70, "2-"),  # gut minus
    (65, "3+"),  # befriedigend plus
    (60, "3"),   # befriedigend
    (55, "3-"),  # befriedigend minus
    (50, "4+"),  # ausreichend plus
    (45, "4"),   # ausreichend
    (40, "4-"),  # ausreichend minus
    (33, "5+"),  # mangelhaft plus
    (27, "5"),   # mangelhaft
    (20, "5-"),  # mangelhaft minus
    (0, "6"),    # ungenuegend
]


def calculate_grade(percentage: float) -> str:
    """
    Calculate German grade from percentage.

    Args:
        percentage: Achieved score as a percentage of the maximum score.

    Returns:
        The grade of the first GERMAN_GRADES entry whose threshold is less
        than or equal to ``percentage``; "6" if no threshold is met (e.g. a
        negative percentage).
    """
    for threshold, grade in GERMAN_GRADES:
        if percentage >= threshold:
            return grade
    return "6"


def _parse_llm_json(content: str) -> Optional[dict]:
    """
    Extract a JSON object from an LLM reply.

    Tries, in order: the whole reply, the body of a Markdown code fence, and
    the span from the first "{" to the last "}". Models frequently wrap the
    requested JSON in fences or add a sentence around it.

    Returns:
        The parsed object, or None if no candidate parses to a JSON object.
    """
    candidates = [content.strip()]

    fenced = _JSON_FENCE_RE.search(content)
    if fenced:
        candidates.append(fenced.group(1))

    first_brace = content.find("{")
    last_brace = content.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(content[first_brace:last_brace + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        # Only an object can carry the expected keys; lists/numbers are rejected.
        if isinstance(parsed, dict):
            return parsed
    return None


def _as_str_list(value: Any) -> List[str]:
    """Normalise an LLM-provided value to a list of strings."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value] if value else []
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    return [str(value)]


class ExamCorrectionService:
    """
    Service for AI-assisted exam correction.

    PRIVACY GUARANTEES:
    1. Prompts contain NO personal data
    2. Only doc_token is used as reference
    3. Processing on self-hosted LLM
    4. Results stored with pseudonymized identifiers
    """

    # System prompt for exam correction (German)
    # NOTE(review): the value slots of the JSON template below appear empty —
    # confirm against the intended prompt.
    CORRECTION_SYSTEM_PROMPT = """Du bist ein erfahrener Lehrer und korrigierst Schuelerantworten.

WICHTIGE REGELN:
1. Bewerte NUR den fachlichen Inhalt der Antwort
2. Ignoriere Rechtschreibfehler (ausser bei Deutschklausuren)
3. Gib konstruktives, ermutigzendes Feedback
4. Beziehe dich auf die Bewertungskriterien
5. Sei fair und konsistent

AUSGABEFORMAT (JSON):
{
  "points": ,
  "feedback": "",
  "strengths": ["", ""],
  "improvements": [""]
}

Antworte NUR mit dem JSON-Objekt, ohne weitere Erklaerungen."""

    # Template for the overall feedback request; filled via str.format.
    OVERALL_FEEDBACK_PROMPT = """Basierend auf den einzelnen Bewertungen, erstelle eine Gesamtrueckmeldung.

Einzelbewertungen:
{question_results}

Gesamtpunktzahl: {total_score}/{max_score} ({percentage}%)
Note: {grade}

Erstelle eine motivierende Gesamtrueckmeldung (2-3 Saetze), die:
1. Die Staerken hervorhebt
2. Konstruktive Verbesserungsvorschlaege macht
3. Ermutigt und motiviert

Antworte nur mit dem Feedback-Text, ohne JSON-Formatierung."""

    def __init__(self, model: Optional[str] = None):
        """
        Initialize the correction service.

        Args:
            model: LLM model to use (default: qwen2.5:14b from config)

        PRIVACY: The model runs locally on the Mac Mini via Ollama.
        No data is sent to external servers.
        """
        config = get_config()
        # Use configured correction model (default: qwen2.5:14b)
        self.model = model or config.correction_model
        self.inference = get_inference_service()
        logger.info("Correction service initialized with model: %s", self.model)

    async def correct_question(
        self,
        student_answer: str,
        rubric: QuestionRubric,
        subject: str = "Allgemein"
    ) -> QuestionResult:
        """
        Correct a single question answer.

        Never raises: any failure while calling the model or interpreting its
        reply yields a zero-point result flagged for manual correction.

        Args:
            student_answer: The student's OCR-extracted answer (pseudonymized)
            rubric: Grading rubric for this question
            subject: Subject for context

        Returns:
            QuestionResult with points and feedback. Points are clamped to
            the range 0..rubric.max_points.
        """
        # Build prompt with NO personal data
        user_prompt = f"""Fach: {subject}
Frage {rubric.question_number}: {rubric.question_text}
Maximale Punktzahl: {rubric.max_points}

Erwartete Antwort:
{rubric.expected_answer}

Bewertungskriterien:
{rubric.grading_criteria}

---

Schuelerantwort:
{student_answer}

---

Bewerte diese Antwort nach den Kriterien."""

        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="system", content=self.CORRECTION_SYSTEM_PROMPT),
                ChatMessage(role="user", content=user_prompt),
            ],
            temperature=0.3,  # Lower temperature for consistent grading
            max_tokens=500,
        )

        try:
            response = await self.inference.complete(request)
            content = response.choices[0].message.content or "{}"

            result = _parse_llm_json(content)
            if result is None:
                # Fallback: keep the raw reply as feedback and flag the
                # question for manual review.
                logger.warning(
                    "Failed to parse LLM response as JSON: %s", content[:100]
                )
                result = {
                    "points": rubric.max_points // 2,
                    "feedback": content[:200],
                    "strengths": [],
                    "improvements": [
                        "Automatische Bewertung fehlgeschlagen - manuelle Pruefung erforderlich"
                    ],
                }

            # Clamp to [0, max_points]: the model may over- or under-shoot.
            # A non-numeric "points" value raises here and is handled below.
            points = max(0, min(int(result.get("points", 0)), rubric.max_points))

            # The model may return null or a non-string; downstream code
            # slices the feedback, so it must always be a str.
            feedback = result.get("feedback")
            feedback = "" if feedback is None else str(feedback)

            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=points,
                max_points=rubric.max_points,
                feedback=feedback,
                strengths=_as_str_list(result.get("strengths")),
                improvements=_as_str_list(result.get("improvements")),
            )

        except Exception as e:
            # Deliberate best-effort boundary: one failed question must not
            # abort the correction of the whole exam.
            logger.error(
                "Correction failed for question %s: %s", rubric.question_number, e
            )
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=0,
                max_points=rubric.max_points,
                feedback=f"Automatische Bewertung fehlgeschlagen: {str(e)}",
                strengths=[],
                improvements=["Manuelle Korrektur erforderlich"],
            )

    async def correct_exam(
        self,
        doc_token: str,
        ocr_text: str,
        rubrics: List[QuestionRubric],
        subject: str = "Allgemein"
    ) -> CorrectionResult:
        """
        Correct a complete exam with multiple questions.

        Questions are corrected one after another; answers are matched to
        rubrics by position, not by question number.

        Args:
            doc_token: Pseudonymized document identifier
            ocr_text: Full OCR text of the exam (already redacted)
            rubrics: List of question rubrics
            subject: Subject name

        Returns:
            CorrectionResult with all scores and feedback
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Split OCR text into answers (simple heuristic)
        answers = self._extract_answers(ocr_text, len(rubrics))

        # Correct each question
        question_results: List[QuestionResult] = []
        for index, rubric in enumerate(rubrics):
            answer = answers[index] if index < len(answers) else ""
            question_results.append(
                await self.correct_question(answer, rubric, subject)
            )

        # Calculate totals
        total_score = sum(r.points_awarded for r in question_results)
        max_score = sum(r.max_points for r in question_results)
        percentage = (total_score / max_score * 100) if max_score > 0 else 0.0
        grade = calculate_grade(percentage)

        # Generate overall feedback
        overall_feedback = await self._generate_overall_feedback(
            question_results, total_score, max_score, percentage, grade
        )

        processing_time_ms = int((time.perf_counter() - start_time) * 1000)

        return CorrectionResult(
            doc_token=doc_token,
            total_score=total_score,
            max_score=max_score,
            grade=grade,
            overall_feedback=overall_feedback,
            question_results=question_results,
            processing_time_ms=processing_time_ms,
        )

    async def _generate_overall_feedback(
        self,
        question_results: List[QuestionResult],
        total_score: int,
        max_score: int,
        percentage: float,
        grade: str
    ) -> str:
        """
        Generate motivating overall feedback.

        Returns:
            The model's feedback text, a generic sentence if the model
            returns empty content, or a plain score summary if the request
            fails.
        """
        # Summarize question results. str(... or "") keeps the slice safe
        # even if a result carries a None or non-string feedback.
        results_summary = "\n".join(
            f"Frage {r.question_number}: {r.points_awarded}/{r.max_points} Punkte - "
            f"{str(r.feedback or '')[:100]}"
            for r in question_results
        )

        prompt = self.OVERALL_FEEDBACK_PROMPT.format(
            question_results=results_summary,
            total_score=total_score,
            max_score=max_score,
            percentage=f"{percentage:.1f}",
            grade=grade,
        )

        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="user", content=prompt),
            ],
            temperature=0.5,
            max_tokens=200,
        )

        try:
            response = await self.inference.complete(request)
            return response.choices[0].message.content or "Gute Arbeit! Weiter so."
        except Exception as e:
            # Best effort: the exam result is still usable without the
            # generated text, so fall back to a plain summary.
            logger.error("Failed to generate overall feedback: %s", e)
            return f"Gesamtergebnis: {total_score}/{max_score} Punkte ({grade})"

    def _extract_answers(self, ocr_text: str, num_questions: int) -> List[str]:
        """
        Extract individual answers from OCR text.

        Simple heuristic: split by question markers (1., 2., etc.)
        More sophisticated extraction can be implemented.

        Limitations of the heuristic: text before the first marker is
        discarded; the marker's number is ignored (answers are returned in
        order of appearance); any line starting with digits followed by "."
        or ")" counts as a marker, including numbered lists inside an answer.

        Args:
            ocr_text: Full OCR text of the exam.
            num_questions: Number of answers to return.

        Returns:
            Exactly ``num_questions`` stripped answer strings (none if the
            count is not positive), padded with "" when fewer markers are
            found and truncated when more are found.
        """
        wanted = max(num_questions, 0)

        # re.split with one capture group yields
        # [preamble, number, answer, number, answer, ...]; the answers are
        # therefore every second element starting at index 2.
        parts = _QUESTION_MARKER_RE.split(ocr_text)
        answers = [part.strip() for part in parts[2::2]]

        # Pad with empty answers if needed
        answers.extend([""] * (wanted - len(answers)))

        return answers[:wanted]


# Singleton instance
_correction_service: Optional[ExamCorrectionService] = None


def get_correction_service(model: Optional[str] = None) -> ExamCorrectionService:
    """
    Get or create the correction service singleton.

    The singleton is replaced only when a model is explicitly requested and
    differs from the current instance's model.

    Args:
        model: Optional model override. If None, uses config.correction_model
            (qwen2.5:14b)

    Returns:
        ExamCorrectionService instance

    PRIVACY: All processing happens locally via Ollama - no cloud API.
    """
    global _correction_service

    needs_new_instance = _correction_service is None or (
        # Only recreate if explicitly requesting different model
        bool(model) and _correction_service.model != model
    )
    if needs_new_instance:
        _correction_service = ExamCorrectionService(model=model)

    return _correction_service