A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
380 lines · 12 KiB · Python
"""
|
|
Exam Correction Service using Self-Hosted LLM.
|
|
|
|
PRIVACY BY DESIGN:
|
|
- Only pseudonymized text (doc_token + OCR content) is sent to LLM
|
|
- No student names or personal data in prompts
|
|
- All processing happens on self-hosted infrastructure (SysEleven)
|
|
- No data sent to external APIs (unless explicitly configured)
|
|
|
|
This service generates AI-assisted corrections and feedback for exam answers.
|
|
"""
|
|
import logging
|
|
from typing import Optional, List
|
|
from dataclasses import dataclass
|
|
|
|
from llm_gateway.services.inference import get_inference_service, InferenceResult
|
|
from llm_gateway.models.chat import ChatCompletionRequest, ChatMessage
|
|
from llm_gateway.config import get_config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class QuestionRubric:
    """Rubric for a single exam question."""

    question_number: int  # 1-based position of the question in the exam
    question_text: str  # the question as posed to the student
    max_points: int  # maximum points awardable for this question
    expected_answer: str  # reference answer the LLM grades against
    grading_criteria: str  # free-text criteria included in the grading prompt
@dataclass
class QuestionResult:
    """AI correction result for a single question."""

    question_number: int  # matches QuestionRubric.question_number
    points_awarded: int  # points granted by the LLM (capped at max_points)
    max_points: int  # copied from the rubric
    feedback: str  # short justification of the score
    strengths: List[str]  # strengths the LLM identified in the answer
    improvements: List[str]  # improvement suggestions from the LLM
@dataclass
class CorrectionResult:
    """Complete correction result for an exam."""

    doc_token: str  # Pseudonymized identifier (no personal data)
    total_score: int  # sum of points_awarded over all questions
    max_score: int  # sum of max_points over all questions
    grade: str  # German grade derived from the percentage (see GERMAN_GRADES)
    overall_feedback: str  # LLM-generated summary feedback for the whole exam
    question_results: List[QuestionResult]  # per-question results, in rubric order
    processing_time_ms: int  # wall-clock time spent correcting, in milliseconds
# German grading scale as (minimum percentage, grade) pairs, ordered from
# best to worst so the first matching threshold wins. Customize here.
GERMAN_GRADES = [
    (95, "1+"),  # sehr gut plus
    (90, "1"),   # sehr gut
    (85, "1-"),  # sehr gut minus
    (80, "2+"),  # gut plus
    (75, "2"),   # gut
    (70, "2-"),  # gut minus
    (65, "3+"),  # befriedigend plus
    (60, "3"),   # befriedigend
    (55, "3-"),  # befriedigend minus
    (50, "4+"),  # ausreichend plus
    (45, "4"),   # ausreichend
    (40, "4-"),  # ausreichend minus
    (33, "5+"),  # mangelhaft plus
    (27, "5"),   # mangelhaft
    (20, "5-"),  # mangelhaft minus
    (0, "6"),    # ungenuegend
]


def calculate_grade(percentage: float) -> str:
    """Return the German grade for a percentage score (0-100)."""
    # Thresholds are sorted descending, so the first hit is the right grade.
    return next(
        (grade for threshold, grade in GERMAN_GRADES if percentage >= threshold),
        "6",  # defensive fallback; unreachable for percentage >= 0
    )
class ExamCorrectionService:
    """
    Service for AI-assisted exam correction.

    PRIVACY GUARANTEES:
    1. Prompts contain NO personal data
    2. Only doc_token is used as reference
    3. Processing on self-hosted LLM
    4. Results stored with pseudonymized identifiers
    """

    # System prompt for per-question grading (German). The model is told to
    # reply with a bare JSON object; correct_question has a fallback parser
    # for the case where it does not.
    CORRECTION_SYSTEM_PROMPT = """Du bist ein erfahrener Lehrer und korrigierst Schuelerantworten.

WICHTIGE REGELN:
1. Bewerte NUR den fachlichen Inhalt der Antwort
2. Ignoriere Rechtschreibfehler (ausser bei Deutschklausuren)
3. Gib konstruktives, ermutigzendes Feedback
4. Beziehe dich auf die Bewertungskriterien
5. Sei fair und konsistent

AUSGABEFORMAT (JSON):
{
    "points": <Punktzahl>,
    "feedback": "<Kurze Begruendung der Bewertung>",
    "strengths": ["<Staerke 1>", "<Staerke 2>"],
    "improvements": ["<Verbesserungsvorschlag 1>"]
}

Antworte NUR mit dem JSON-Objekt, ohne weitere Erklaerungen."""

    # Template for the exam-level summary feedback (German); filled via
    # str.format in _generate_overall_feedback.
    OVERALL_FEEDBACK_PROMPT = """Basierend auf den einzelnen Bewertungen, erstelle eine Gesamtrueckmeldung.

Einzelbewertungen:
{question_results}

Gesamtpunktzahl: {total_score}/{max_score} ({percentage}%)
Note: {grade}

Erstelle eine motivierende Gesamtrueckmeldung (2-3 Saetze), die:
1. Die Staerken hervorhebt
2. Konstruktive Verbesserungsvorschlaege macht
3. Ermutigt und motiviert

Antworte nur mit dem Feedback-Text, ohne JSON-Formatierung."""

    def __init__(self, model: Optional[str] = None):
        """
        Initialize the correction service.

        Args:
            model: LLM model to use (default: qwen2.5:14b from config)

        PRIVACY:
            The model runs on self-hosted infrastructure (local Ollama
            instance). No data is sent to external servers.
        """
        config = get_config()
        # Use configured correction model (default: qwen2.5:14b)
        self.model = model or config.correction_model
        self.inference = get_inference_service()
        logger.info(f"Correction service initialized with model: {self.model}")

    async def correct_question(
        self,
        student_answer: str,
        rubric: QuestionRubric,
        subject: str = "Allgemein"
    ) -> QuestionResult:
        """
        Correct a single question answer.

        Args:
            student_answer: The student's OCR-extracted answer (pseudonymized)
            rubric: Grading rubric for this question
            subject: Subject for context

        Returns:
            QuestionResult with points and feedback. On failure, a zero-point
            result flagged for manual review is returned instead of raising,
            so one bad question never aborts a whole exam run.
        """
        # Build prompt with NO personal data
        user_prompt = f"""Fach: {subject}
Frage {rubric.question_number}: {rubric.question_text}
Maximale Punktzahl: {rubric.max_points}

Erwartete Antwort:
{rubric.expected_answer}

Bewertungskriterien:
{rubric.grading_criteria}

---

Schuelerantwort:
{student_answer}

---

Bewerte diese Antwort nach den Kriterien."""

        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="system", content=self.CORRECTION_SYSTEM_PROMPT),
                ChatMessage(role="user", content=user_prompt),
            ],
            temperature=0.3,  # Lower temperature for consistent grading
            max_tokens=500,
        )

        try:
            response = await self.inference.complete(request)
            content = response.choices[0].message.content or "{}"

            # Parse JSON response
            import json
            try:
                result = json.loads(content)
            except json.JSONDecodeError:
                # Fallback: keep the raw text as feedback and award half
                # points, flagging the question for manual review.
                logger.warning(f"Failed to parse LLM response as JSON: {content[:100]}")
                result = {
                    "points": rubric.max_points // 2,
                    "feedback": content[:200],
                    "strengths": [],
                    "improvements": ["Automatische Bewertung fehlgeschlagen - manuelle Pruefung erforderlich"]
                }

            # FIX: tolerate non-numeric "points" values (e.g. "7", 7.5, null)
            # instead of letting int() raise and downgrade a parsed response
            # to a total failure; clamp into [0, max_points] so a misbehaving
            # model can neither award negative points nor exceed the rubric.
            try:
                raw_points = int(float(result.get("points", 0)))
            except (TypeError, ValueError):
                raw_points = 0
            points = max(0, min(raw_points, rubric.max_points))

            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=points,
                max_points=rubric.max_points,
                feedback=result.get("feedback", ""),
                strengths=result.get("strengths", []),
                improvements=result.get("improvements", []),
            )

        except Exception as e:
            # Best-effort: never propagate; return a reviewable zero result.
            logger.error(f"Correction failed for question {rubric.question_number}: {e}")
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=0,
                max_points=rubric.max_points,
                feedback=f"Automatische Bewertung fehlgeschlagen: {str(e)}",
                strengths=[],
                improvements=["Manuelle Korrektur erforderlich"],
            )

    async def correct_exam(
        self,
        doc_token: str,
        ocr_text: str,
        rubrics: List[QuestionRubric],
        subject: str = "Allgemein"
    ) -> CorrectionResult:
        """
        Correct a complete exam with multiple questions.

        Args:
            doc_token: Pseudonymized document identifier
            ocr_text: Full OCR text of the exam (already redacted)
            rubrics: List of question rubrics
            subject: Subject name

        Returns:
            CorrectionResult with all scores and feedback
        """
        import time
        start_time = time.time()

        # Split OCR text into per-question answers (simple marker heuristic)
        answers = self._extract_answers(ocr_text, len(rubrics))

        # Correct each question sequentially; missing answers grade as "".
        question_results = []
        for i, rubric in enumerate(rubrics):
            answer = answers[i] if i < len(answers) else ""
            result = await self.correct_question(answer, rubric, subject)
            question_results.append(result)

        # Calculate totals (guard against a zero-point exam)
        total_score = sum(r.points_awarded for r in question_results)
        max_score = sum(r.max_points for r in question_results)
        percentage = (total_score / max_score * 100) if max_score > 0 else 0
        grade = calculate_grade(percentage)

        # Generate overall feedback
        overall_feedback = await self._generate_overall_feedback(
            question_results, total_score, max_score, percentage, grade
        )

        processing_time_ms = int((time.time() - start_time) * 1000)

        return CorrectionResult(
            doc_token=doc_token,
            total_score=total_score,
            max_score=max_score,
            grade=grade,
            overall_feedback=overall_feedback,
            question_results=question_results,
            processing_time_ms=processing_time_ms,
        )

    async def _generate_overall_feedback(
        self,
        question_results: List[QuestionResult],
        total_score: int,
        max_score: int,
        percentage: float,
        grade: str
    ) -> str:
        """Generate motivating overall feedback via a second LLM call.

        Falls back to a plain score summary if the call fails.
        """
        # Summarize question results (feedback truncated to keep prompt small)
        results_summary = "\n".join([
            f"Frage {r.question_number}: {r.points_awarded}/{r.max_points} Punkte - {r.feedback[:100]}"
            for r in question_results
        ])

        prompt = self.OVERALL_FEEDBACK_PROMPT.format(
            question_results=results_summary,
            total_score=total_score,
            max_score=max_score,
            percentage=f"{percentage:.1f}",
            grade=grade,
        )

        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="user", content=prompt),
            ],
            temperature=0.5,
            max_tokens=200,
        )

        try:
            response = await self.inference.complete(request)
            return response.choices[0].message.content or "Gute Arbeit! Weiter so."
        except Exception as e:
            logger.error(f"Failed to generate overall feedback: {e}")
            return f"Gesamtergebnis: {total_score}/{max_score} Punkte ({grade})"

    def _extract_answers(self, ocr_text: str, num_questions: int) -> List[str]:
        """
        Extract individual answers from OCR text.

        Simple heuristic: split by question markers ("1.", "2)", ...).
        NOTE(review): text before the first marker (or all text when no
        markers are found) is discarded — confirm this is intended for
        single-question exams. More sophisticated extraction can be added.
        """
        import re

        # Question markers: a line-leading number followed by "." or ")"
        pattern = r'(?:^|\n)\s*(\d+)[.\)]\s*'
        parts = re.split(pattern, ocr_text)

        # re.split with a capturing group yields:
        # [preamble, num1, answer1, num2, answer2, ...]
        answers = []
        i = 1  # Skip preamble before the first marker
        while i < len(parts):
            if i + 1 < len(parts):
                # parts[i] is the question number, parts[i+1] is the answer
                answers.append(parts[i + 1].strip())
            i += 2

        # Pad with empty answers so the result always has num_questions items
        while len(answers) < num_questions:
            answers.append("")

        return answers[:num_questions]
# Lazily created module-level singleton.
_correction_service: Optional[ExamCorrectionService] = None


def get_correction_service(model: Optional[str] = None) -> ExamCorrectionService:
    """
    Get or create the correction service singleton.

    Args:
        model: Optional model override. If None, uses config.correction_model (qwen2.5:14b)

    Returns:
        ExamCorrectionService instance

    PRIVACY: all processing happens locally via Ollama - no cloud API.
    """
    global _correction_service
    # Recreate only when no instance exists yet, or when a different model
    # is explicitly requested.
    needs_new = _correction_service is None or (
        model and _correction_service.model != model
    )
    if needs_new:
        _correction_service = ExamCorrectionService(model=model)
    return _correction_service