fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
379
backend/klausur/services/correction_service.py
Normal file
379
backend/klausur/services/correction_service.py
Normal file
@@ -0,0 +1,379 @@
|
||||
"""
|
||||
Exam Correction Service using Self-Hosted LLM.
|
||||
|
||||
PRIVACY BY DESIGN:
|
||||
- Only pseudonymized text (doc_token + OCR content) is sent to LLM
|
||||
- No student names or personal data in prompts
|
||||
- All processing happens on self-hosted infrastructure (SysEleven)
|
||||
- No data sent to external APIs (unless explicitly configured)
|
||||
|
||||
This service generates AI-assisted corrections and feedback for exam answers.
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
from dataclasses import dataclass
|
||||
|
||||
from llm_gateway.services.inference import get_inference_service, InferenceResult
|
||||
from llm_gateway.models.chat import ChatCompletionRequest, ChatMessage
|
||||
from llm_gateway.config import get_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class QuestionRubric:
    """Rubric for a single exam question.

    Teacher-defined grading input for one question; these fields are
    embedded verbatim into the correction prompt by
    ExamCorrectionService.correct_question.
    """
    # 1-based position of the question within the exam
    question_number: int
    # The question text as posed to the student
    question_text: str
    # Maximum number of points awardable for this question
    max_points: int
    # Reference/model answer the grading is compared against
    expected_answer: str
    # Free-text criteria the LLM is instructed to grade by
    grading_criteria: str
|
||||
|
||||
|
||||
@dataclass
class QuestionResult:
    """AI correction result for a single question.

    Produced by ExamCorrectionService.correct_question, either from the
    LLM's JSON answer or from a fallback when parsing/inference fails.
    """
    # 1-based question number, copied from the rubric
    question_number: int
    # Points granted by the LLM (clamped to the rubric's max_points)
    points_awarded: int
    # Maximum points for this question, copied from the rubric
    max_points: int
    # Short justification of the awarded points
    feedback: str
    # Notable strengths of the answer (may be empty on fallback)
    strengths: List[str]
    # Suggested improvements (fallback inserts a manual-review note)
    improvements: List[str]
|
||||
|
||||
|
||||
@dataclass
class CorrectionResult:
    """Complete correction result for an exam.

    Aggregates per-question results plus the overall score, grade, and
    feedback. Only the pseudonymized doc_token identifies the exam —
    no student names or personal data are stored here.
    """
    # Pseudonymized document identifier (never a student name)
    doc_token: str
    # Sum of points awarded across all questions
    total_score: int
    # Sum of the rubrics' maximum points
    max_score: int
    # German school grade derived via calculate_grade()
    grade: str
    # LLM-generated motivating summary (or a plain score fallback)
    overall_feedback: str
    # One entry per rubric, in rubric order
    question_results: List[QuestionResult]
    # Wall-clock duration of correct_exam in milliseconds
    processing_time_ms: int
|
||||
|
||||
|
||||
# German grading scale (can be customized).
# Pairs of (minimum percentage, grade), ordered from best to worst so the
# FIRST threshold that the percentage reaches determines the grade.
GERMAN_GRADES = [
    (95, "1+"),  # sehr gut plus
    (90, "1"),   # sehr gut
    (85, "1-"),  # sehr gut minus
    (80, "2+"),  # gut plus
    (75, "2"),   # gut
    (70, "2-"),  # gut minus
    (65, "3+"),  # befriedigend plus
    (60, "3"),   # befriedigend
    (55, "3-"),  # befriedigend minus
    (50, "4+"),  # ausreichend plus
    (45, "4"),   # ausreichend
    (40, "4-"),  # ausreichend minus
    (33, "5+"),  # mangelhaft plus
    (27, "5"),   # mangelhaft
    (20, "5-"),  # mangelhaft minus
    (0, "6"),    # ungenuegend
]


def calculate_grade(percentage: float) -> str:
    """Map a score percentage to a German school grade.

    Thresholds are scanned best-to-worst; a percentage below every
    threshold (e.g. a negative value) falls back to "6".
    """
    return next(
        (grade for threshold, grade in GERMAN_GRADES if percentage >= threshold),
        "6",
    )
|
||||
|
||||
|
||||
class ExamCorrectionService:
    """
    Service for AI-assisted exam correction.

    PRIVACY GUARANTEES:
    1. Prompts contain NO personal data
    2. Only doc_token is used as reference
    3. Processing on self-hosted LLM
    4. Results stored with pseudonymized identifiers
    """

    # System prompt for exam correction (German).
    # NOTE: fixed typo "ermutigzendes" -> "ermutigendes".
    CORRECTION_SYSTEM_PROMPT = """Du bist ein erfahrener Lehrer und korrigierst Schuelerantworten.

WICHTIGE REGELN:
1. Bewerte NUR den fachlichen Inhalt der Antwort
2. Ignoriere Rechtschreibfehler (ausser bei Deutschklausuren)
3. Gib konstruktives, ermutigendes Feedback
4. Beziehe dich auf die Bewertungskriterien
5. Sei fair und konsistent

AUSGABEFORMAT (JSON):
{
"points": <Punktzahl>,
"feedback": "<Kurze Begruendung der Bewertung>",
"strengths": ["<Staerke 1>", "<Staerke 2>"],
"improvements": ["<Verbesserungsvorschlag 1>"]
}

Antworte NUR mit dem JSON-Objekt, ohne weitere Erklaerungen."""

    OVERALL_FEEDBACK_PROMPT = """Basierend auf den einzelnen Bewertungen, erstelle eine Gesamtrueckmeldung.

Einzelbewertungen:
{question_results}

Gesamtpunktzahl: {total_score}/{max_score} ({percentage}%)
Note: {grade}

Erstelle eine motivierende Gesamtrueckmeldung (2-3 Saetze), die:
1. Die Staerken hervorhebt
2. Konstruktive Verbesserungsvorschlaege macht
3. Ermutigt und motiviert

Antworte nur mit dem Feedback-Text, ohne JSON-Formatierung."""

    def __init__(self, model: Optional[str] = None):
        """
        Initialize the correction service.

        Args:
            model: LLM model to use (default: qwen2.5:14b from config)

        PRIVACY:
            The model runs on self-hosted infrastructure via Ollama;
            no data is sent to external servers.
        """
        config = get_config()
        # Use configured correction model (default: qwen2.5:14b)
        self.model = model or config.correction_model
        self.inference = get_inference_service()
        logger.info(f"Correction service initialized with model: {self.model}")

    async def correct_question(
        self,
        student_answer: str,
        rubric: "QuestionRubric",
        subject: str = "Allgemein"
    ) -> "QuestionResult":
        """
        Correct a single question answer.

        Args:
            student_answer: The student's OCR-extracted answer (pseudonymized)
            rubric: Grading rubric for this question
            subject: Subject for context

        Returns:
            QuestionResult with points and feedback. Never raises: on any
            inference/parsing failure a zero-point fallback result asking
            for manual review is returned instead.
        """
        # Build prompt with NO personal data (only rubric fields and the
        # pseudonymized OCR answer text).
        user_prompt = f"""Fach: {subject}
Frage {rubric.question_number}: {rubric.question_text}
Maximale Punktzahl: {rubric.max_points}

Erwartete Antwort:
{rubric.expected_answer}

Bewertungskriterien:
{rubric.grading_criteria}

---

Schuelerantwort:
{student_answer}

---

Bewerte diese Antwort nach den Kriterien."""

        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="system", content=self.CORRECTION_SYSTEM_PROMPT),
                ChatMessage(role="user", content=user_prompt),
            ],
            temperature=0.3,  # Lower temperature for consistent grading
            max_tokens=500,
        )

        try:
            response = await self.inference.complete(request)
            content = response.choices[0].message.content or "{}"

            # Parse JSON response
            import json
            try:
                result = json.loads(content)
            except json.JSONDecodeError:
                # Fallback parsing: keep the raw text as feedback and flag
                # the answer for manual review at half points.
                logger.warning(f"Failed to parse LLM response as JSON: {content[:100]}")
                result = {
                    "points": rubric.max_points // 2,
                    "feedback": content[:200],
                    "strengths": [],
                    "improvements": ["Automatische Bewertung fehlgeschlagen - manuelle Pruefung erforderlich"]
                }

            # Coerce defensively: the LLM may emit a non-numeric value.
            try:
                raw_points = int(result.get("points", 0))
            except (TypeError, ValueError):
                raw_points = 0
            # Clamp to [0, max_points] — previously only the upper bound
            # was enforced, so a negative LLM value leaked through.
            points = max(0, min(raw_points, rubric.max_points))

            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=points,
                max_points=rubric.max_points,
                feedback=result.get("feedback", ""),
                strengths=result.get("strengths", []),
                improvements=result.get("improvements", []),
            )

        except Exception as e:
            # Deliberate broad catch: a failed question must not abort the
            # whole exam correction — return a zero-point manual-review result.
            logger.error(f"Correction failed for question {rubric.question_number}: {e}")
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=0,
                max_points=rubric.max_points,
                feedback=f"Automatische Bewertung fehlgeschlagen: {str(e)}",
                strengths=[],
                improvements=["Manuelle Korrektur erforderlich"],
            )

    async def correct_exam(
        self,
        doc_token: str,
        ocr_text: str,
        rubrics: List["QuestionRubric"],
        subject: str = "Allgemein"
    ) -> "CorrectionResult":
        """
        Correct a complete exam with multiple questions.

        Args:
            doc_token: Pseudonymized document identifier
            ocr_text: Full OCR text of the exam (already redacted)
            rubrics: List of question rubrics
            subject: Subject name

        Returns:
            CorrectionResult with all scores and feedback
        """
        import time
        start_time = time.time()

        # Split OCR text into per-question answers (simple heuristic)
        answers = self._extract_answers(ocr_text, len(rubrics))

        # Correct each question sequentially
        question_results = []
        for i, rubric in enumerate(rubrics):
            # _extract_answers pads to len(rubrics); guard anyway.
            answer = answers[i] if i < len(answers) else ""
            result = await self.correct_question(answer, rubric, subject)
            question_results.append(result)

        # Calculate totals (guard against an empty rubric list)
        total_score = sum(r.points_awarded for r in question_results)
        max_score = sum(r.max_points for r in question_results)
        percentage = (total_score / max_score * 100) if max_score > 0 else 0
        grade = calculate_grade(percentage)

        # Generate overall feedback
        overall_feedback = await self._generate_overall_feedback(
            question_results, total_score, max_score, percentage, grade
        )

        processing_time_ms = int((time.time() - start_time) * 1000)

        return CorrectionResult(
            doc_token=doc_token,
            total_score=total_score,
            max_score=max_score,
            grade=grade,
            overall_feedback=overall_feedback,
            question_results=question_results,
            processing_time_ms=processing_time_ms,
        )

    async def _generate_overall_feedback(
        self,
        question_results: List["QuestionResult"],
        total_score: int,
        max_score: int,
        percentage: float,
        grade: str
    ) -> str:
        """Generate motivating overall feedback.

        Falls back to a plain score summary if the LLM call fails.
        """
        # Summarize question results (feedback truncated to keep the
        # prompt small)
        results_summary = "\n".join([
            f"Frage {r.question_number}: {r.points_awarded}/{r.max_points} Punkte - {r.feedback[:100]}"
            for r in question_results
        ])

        prompt = self.OVERALL_FEEDBACK_PROMPT.format(
            question_results=results_summary,
            total_score=total_score,
            max_score=max_score,
            percentage=f"{percentage:.1f}",
            grade=grade,
        )

        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="user", content=prompt),
            ],
            temperature=0.5,
            max_tokens=200,
        )

        try:
            response = await self.inference.complete(request)
            return response.choices[0].message.content or "Gute Arbeit! Weiter so."
        except Exception as e:
            # Best-effort: feedback is non-essential, so degrade gracefully.
            logger.error(f"Failed to generate overall feedback: {e}")
            return f"Gesamtergebnis: {total_score}/{max_score} Punkte ({grade})"

    def _extract_answers(self, ocr_text: str, num_questions: int) -> List[str]:
        """
        Extract individual answers from OCR text.

        Simple heuristic: split by question markers (1., 2., etc.)
        More sophisticated extraction can be implemented.

        Returns exactly num_questions entries: missing answers are padded
        with "", surplus answers are truncated.
        """
        import re

        # Question markers: a line-leading number followed by "." or ")".
        pattern = r'(?:^|\n)\s*(\d+)[.\)]\s*'
        # re.split with one capturing group yields
        # [preamble, num1, answer1, num2, answer2, ...], so the answers
        # sit at every second index starting from 2. This replaces the
        # previous manual while-loop, which could spin forever if the
        # parts list ever had even length (the index was only advanced
        # inside a conditional branch).
        parts = re.split(pattern, ocr_text)
        answers = [answer.strip() for answer in parts[2::2]]

        # Pad with empty answers if needed
        while len(answers) < num_questions:
            answers.append("")

        return answers[:num_questions]
|
||||
|
||||
|
||||
# Module-level singleton, lazily created by get_correction_service()
_correction_service: Optional[ExamCorrectionService] = None


def get_correction_service(model: Optional[str] = None) -> ExamCorrectionService:
    """
    Get or create the correction service singleton.

    Args:
        model: Optional model override. If None, uses config.correction_model (qwen2.5:14b)

    Returns:
        ExamCorrectionService instance

    PRIVACY: all processing happens locally via Ollama - no cloud API.
    """
    global _correction_service
    # Build a fresh instance on first use, or when a different model is
    # explicitly requested; otherwise reuse the cached one.
    must_create = _correction_service is None or (
        model and _correction_service.model != model
    )
    if must_create:
        _correction_service = ExamCorrectionService(model=model)
    return _correction_service
|
||||
Reference in New Issue
Block a user