This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/backend/klausur/services/correction_service.py
Benjamin Admin bfdaf63ba9 fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

380 lines
12 KiB
Python

"""
Exam Correction Service using Self-Hosted LLM.
PRIVACY BY DESIGN:
- Only pseudonymized text (doc_token + OCR content) is sent to LLM
- No student names or personal data in prompts
- All processing happens on self-hosted infrastructure (SysEleven)
- No data sent to external APIs (unless explicitly configured)
This service generates AI-assisted corrections and feedback for exam answers.
"""
import logging
from typing import Optional, List
from dataclasses import dataclass
from llm_gateway.services.inference import get_inference_service, InferenceResult
from llm_gateway.models.chat import ChatCompletionRequest, ChatMessage
from llm_gateway.config import get_config
logger = logging.getLogger(__name__)
@dataclass
class QuestionRubric:
    """Rubric for a single exam question (input to the correction service)."""
    # Question number as printed on the exam; echoed into the prompt and result
    question_number: int
    # The question as posed to the student
    question_text: str
    # Maximum points awardable for this question (upper clamp for LLM output)
    max_points: int
    # Model solution the LLM compares the student's answer against
    expected_answer: str
    # Free-text grading criteria, inserted verbatim into the prompt
    grading_criteria: str
@dataclass
class QuestionResult:
    """AI correction result for a single question."""
    # Matches QuestionRubric.question_number
    question_number: int
    # Points the LLM awarded (clamped to the rubric's max_points by the caller)
    points_awarded: int
    # Copied from the rubric so the result is self-describing
    max_points: int
    # Short German justification of the grading
    feedback: str
    # Things the student did well, as listed by the LLM
    strengths: List[str]
    # Concrete suggestions for improvement
    improvements: List[str]
@dataclass
class CorrectionResult:
    """Complete correction result for an exam."""
    # Pseudonymized identifier — the only reference to the document; no personal data
    doc_token: str
    # Sum of points_awarded over all questions
    total_score: int
    # Sum of max_points over all questions
    max_score: int
    # German grade string derived from the percentage (see GERMAN_GRADES)
    grade: str
    # LLM-generated motivating summary across all questions
    overall_feedback: str
    # Per-question results in rubric order
    question_results: List[QuestionResult]
    # Wall-clock duration of the whole correction run
    processing_time_ms: int
# German grading scale, ordered from best to worst. Each entry maps a minimum
# percentage threshold to its grade string; the scale can be customized.
GERMAN_GRADES = [
    (95, "1+"),  # sehr gut plus
    (90, "1"),   # sehr gut
    (85, "1-"),  # sehr gut minus
    (80, "2+"),  # gut plus
    (75, "2"),   # gut
    (70, "2-"),  # gut minus
    (65, "3+"),  # befriedigend plus
    (60, "3"),   # befriedigend
    (55, "3-"),  # befriedigend minus
    (50, "4+"),  # ausreichend plus
    (45, "4"),   # ausreichend
    (40, "4-"),  # ausreichend minus
    (33, "5+"),  # mangelhaft plus
    (27, "5"),   # mangelhaft
    (20, "5-"),  # mangelhaft minus
    (0, "6"),    # ungenuegend
]


def calculate_grade(percentage: float) -> str:
    """Return the German grade for the first threshold the percentage meets.

    Falls back to "6" for values below every threshold (e.g. negative input).
    """
    return next(
        (grade for threshold, grade in GERMAN_GRADES if percentage >= threshold),
        "6",
    )
class ExamCorrectionService:
    """
    Service for AI-assisted exam correction.

    PRIVACY GUARANTEES:
    1. Prompts contain NO personal data
    2. Only doc_token is used as reference
    3. Processing on self-hosted LLM
    4. Results stored with pseudonymized identifiers
    """

    # System prompt for exam correction (German). Tells the model to grade
    # content only, ignore spelling (except for German exams), and answer
    # with a bare JSON object so the response can be machine-parsed.
    # (Fixed typo: "ermutigzendes" -> "ermutigendes".)
    CORRECTION_SYSTEM_PROMPT = """Du bist ein erfahrener Lehrer und korrigierst Schuelerantworten.
WICHTIGE REGELN:
1. Bewerte NUR den fachlichen Inhalt der Antwort
2. Ignoriere Rechtschreibfehler (ausser bei Deutschklausuren)
3. Gib konstruktives, ermutigendes Feedback
4. Beziehe dich auf die Bewertungskriterien
5. Sei fair und konsistent
AUSGABEFORMAT (JSON):
{
"points": <Punktzahl>,
"feedback": "<Kurze Begruendung der Bewertung>",
"strengths": ["<Staerke 1>", "<Staerke 2>"],
"improvements": ["<Verbesserungsvorschlag 1>"]
}
Antworte NUR mit dem JSON-Objekt, ohne weitere Erklaerungen."""

    # Prompt used to condense the per-question results into one short,
    # motivating summary (plain text, no JSON).
    OVERALL_FEEDBACK_PROMPT = """Basierend auf den einzelnen Bewertungen, erstelle eine Gesamtrueckmeldung.
Einzelbewertungen:
{question_results}
Gesamtpunktzahl: {total_score}/{max_score} ({percentage}%)
Note: {grade}
Erstelle eine motivierende Gesamtrueckmeldung (2-3 Saetze), die:
1. Die Staerken hervorhebt
2. Konstruktive Verbesserungsvorschlaege macht
3. Ermutigt und motiviert
Antworte nur mit dem Feedback-Text, ohne JSON-Formatierung."""

    def __init__(self, model: Optional[str] = None):
        """
        Initialize the correction service.

        Args:
            model: LLM model to use (default: qwen2.5:14b from config)

        PRIVACY:
            The model runs locally via Ollama (on the Mac Mini);
            no data is sent to external servers.
        """
        config = get_config()
        # Use configured correction model (default: qwen2.5:14b)
        self.model = model or config.correction_model
        self.inference = get_inference_service()
        logger.info(f"Correction service initialized with model: {self.model}")

    @staticmethod
    def _strip_code_fences(content: str) -> str:
        """
        Strip Markdown code fences (``` or ```json) that LLMs sometimes wrap
        around JSON output despite being told not to.

        Returns:
            The inner text with surrounding whitespace removed; content
            without fences is returned unchanged (aside from stripping).
        """
        text = content.strip()
        if text.startswith("```"):
            text = text[3:]
            # Optional language tag directly after the opening fence
            if text.lower().startswith("json"):
                text = text[4:]
            text = text.strip()
            if text.endswith("```"):
                text = text[:-3].strip()
        return text

    async def correct_question(
        self,
        student_answer: str,
        rubric: QuestionRubric,
        subject: str = "Allgemein"
    ) -> QuestionResult:
        """
        Correct a single question answer.

        Args:
            student_answer: The student's OCR-extracted answer (pseudonymized)
            rubric: Grading rubric for this question
            subject: Subject for context

        Returns:
            QuestionResult with points and feedback. Never raises: on any
            failure a zero-point result flagged for manual review is
            returned instead.
        """
        import json

        # Build prompt with NO personal data (privacy by design)
        user_prompt = f"""Fach: {subject}
Frage {rubric.question_number}: {rubric.question_text}
Maximale Punktzahl: {rubric.max_points}
Erwartete Antwort:
{rubric.expected_answer}
Bewertungskriterien:
{rubric.grading_criteria}
---
Schuelerantwort:
{student_answer}
---
Bewerte diese Antwort nach den Kriterien."""
        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="system", content=self.CORRECTION_SYSTEM_PROMPT),
                ChatMessage(role="user", content=user_prompt),
            ],
            temperature=0.3,  # Lower temperature for consistent grading
            max_tokens=500,
        )
        try:
            response = await self.inference.complete(request)
            content = response.choices[0].message.content or "{}"
            try:
                # Tolerate Markdown-fenced JSON before falling back
                result = json.loads(self._strip_code_fences(content))
            except json.JSONDecodeError:
                # Fallback: half points, raw text as feedback, flag for
                # manual review rather than silently dropping the answer
                logger.warning(f"Failed to parse LLM response as JSON: {content[:100]}")
                result = {
                    "points": rubric.max_points // 2,
                    "feedback": content[:200],
                    "strengths": [],
                    "improvements": ["Automatische Bewertung fehlgeschlagen - manuelle Pruefung erforderlich"]
                }
            # Clamp to [0, max_points]: never trust the LLM to stay in range
            # (the original only clamped from above, letting negatives through)
            points = max(0, min(int(result.get("points", 0)), rubric.max_points))
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=points,
                max_points=rubric.max_points,
                feedback=result.get("feedback", ""),
                strengths=result.get("strengths", []),
                improvements=result.get("improvements", []),
            )
        except Exception as e:
            # Inference errors, malformed responses, int() failures, ...:
            # degrade to a zero-point result that demands manual correction
            logger.error(f"Correction failed for question {rubric.question_number}: {e}")
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=0,
                max_points=rubric.max_points,
                feedback=f"Automatische Bewertung fehlgeschlagen: {str(e)}",
                strengths=[],
                improvements=["Manuelle Korrektur erforderlich"],
            )

    async def correct_exam(
        self,
        doc_token: str,
        ocr_text: str,
        rubrics: List[QuestionRubric],
        subject: str = "Allgemein"
    ) -> CorrectionResult:
        """
        Correct a complete exam with multiple questions.

        Args:
            doc_token: Pseudonymized document identifier
            ocr_text: Full OCR text of the exam (already redacted)
            rubrics: List of question rubrics
            subject: Subject name

        Returns:
            CorrectionResult with all scores and feedback
        """
        import time
        start_time = time.time()

        # Split OCR text into answers (simple heuristic); missing answers
        # become empty strings so indexing stays aligned with rubrics
        answers = self._extract_answers(ocr_text, len(rubrics))

        # Correct each question sequentially (one LLM call per question)
        question_results = []
        for i, rubric in enumerate(rubrics):
            answer = answers[i] if i < len(answers) else ""
            result = await self.correct_question(answer, rubric, subject)
            question_results.append(result)

        # Calculate totals; guard against an empty/zero-point rubric set
        total_score = sum(r.points_awarded for r in question_results)
        max_score = sum(r.max_points for r in question_results)
        percentage = (total_score / max_score * 100) if max_score > 0 else 0
        grade = calculate_grade(percentage)

        # Generate overall feedback
        overall_feedback = await self._generate_overall_feedback(
            question_results, total_score, max_score, percentage, grade
        )
        processing_time_ms = int((time.time() - start_time) * 1000)
        return CorrectionResult(
            doc_token=doc_token,
            total_score=total_score,
            max_score=max_score,
            grade=grade,
            overall_feedback=overall_feedback,
            question_results=question_results,
            processing_time_ms=processing_time_ms,
        )

    async def _generate_overall_feedback(
        self,
        question_results: List[QuestionResult],
        total_score: int,
        max_score: int,
        percentage: float,
        grade: str
    ) -> str:
        """
        Generate motivating overall feedback via a second LLM call.

        Falls back to a plain score summary if the call fails.
        """
        # Summarize question results (feedback truncated to keep prompt small)
        results_summary = "\n".join([
            f"Frage {r.question_number}: {r.points_awarded}/{r.max_points} Punkte - {r.feedback[:100]}"
            for r in question_results
        ])
        prompt = self.OVERALL_FEEDBACK_PROMPT.format(
            question_results=results_summary,
            total_score=total_score,
            max_score=max_score,
            percentage=f"{percentage:.1f}",
            grade=grade,
        )
        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="user", content=prompt),
            ],
            temperature=0.5,
            max_tokens=200,
        )
        try:
            response = await self.inference.complete(request)
            return response.choices[0].message.content or "Gute Arbeit! Weiter so."
        except Exception as e:
            logger.error(f"Failed to generate overall feedback: {e}")
            return f"Gesamtergebnis: {total_score}/{max_score} Punkte ({grade})"

    def _extract_answers(self, ocr_text: str, num_questions: int) -> List[str]:
        """
        Extract individual answers from OCR text.

        Simple heuristic: split by question markers ("1.", "2)", ...).
        More sophisticated extraction can be implemented.

        Returns:
            Exactly num_questions answers; missing ones are empty strings.
        """
        import re
        # Capturing group keeps the question number in the split output:
        # parts = [prefix, num1, answer1, num2, answer2, ...]
        pattern = r'(?:^|\n)\s*(\d+)[.\)]\s*'
        parts = re.split(pattern, ocr_text)
        answers = []
        i = 1  # parts[0] is the text before the first marker; skip it
        while i < len(parts):
            if i + 1 < len(parts):
                # parts[i] is the question number, parts[i+1] is the answer
                answers.append(parts[i + 1].strip())
            i += 2
        # Pad with empty answers if fewer markers were found than expected
        while len(answers) < num_questions:
            answers.append("")
        return answers[:num_questions]
# Module-level singleton holding the lazily created service instance.
_correction_service: Optional[ExamCorrectionService] = None


def get_correction_service(model: Optional[str] = None) -> ExamCorrectionService:
    """
    Get or create the correction service singleton.

    Args:
        model: Optional model override. If None, uses config.correction_model
            (qwen2.5:14b)

    Returns:
        ExamCorrectionService instance

    PRIVACY: All processing happens locally via Ollama - no cloud API.
    """
    global _correction_service
    cached = _correction_service
    # Build a fresh instance only on first use, or when a different model
    # is explicitly requested; otherwise reuse the cached service.
    if cached is None or (model and cached.model != model):
        cached = ExamCorrectionService(model=model)
        _correction_service = cached
    return cached