This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/backend/klausur/services/correction_service.py
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

380 lines
12 KiB
Python

"""
Exam Correction Service using Self-Hosted LLM.
PRIVACY BY DESIGN:
- Only pseudonymized text (doc_token + OCR content) is sent to LLM
- No student names or personal data in prompts
- All processing happens on self-hosted infrastructure (SysEleven)
- No data sent to external APIs (unless explicitly configured)
This service generates AI-assisted corrections and feedback for exam answers.
"""
import json
import logging
import re
import time
from dataclasses import dataclass
from typing import List, Optional

from llm_gateway.config import get_config
from llm_gateway.models.chat import ChatCompletionRequest, ChatMessage
from llm_gateway.services.inference import get_inference_service, InferenceResult
logger = logging.getLogger(__name__)
@dataclass
class QuestionRubric:
    """Rubric for a single exam question.

    One instance per question; passed to the LLM prompt verbatim, so the
    text fields must already be free of personal data.
    """
    question_number: int  # 1-based question index on the exam sheet
    question_text: str  # the question as posed to the student
    max_points: int  # maximum points achievable for this question
    expected_answer: str  # reference answer the grader compares against
    grading_criteria: str  # free-text criteria the LLM is told to apply
@dataclass
class QuestionResult:
    """AI correction result for a single question."""
    question_number: int  # mirrors QuestionRubric.question_number
    points_awarded: int  # points given by the LLM (capped at max_points)
    max_points: int  # copied from the rubric for convenient reporting
    feedback: str  # short per-question justification from the LLM
    strengths: List[str]  # things the answer did well
    improvements: List[str]  # concrete suggestions for the student
@dataclass
class CorrectionResult:
    """Complete correction result for an exam."""
    doc_token: str  # pseudonymized identifier -- never a student name
    total_score: int  # sum of points_awarded across all questions
    max_score: int  # sum of max_points across all questions
    grade: str  # German grade string derived via calculate_grade()
    overall_feedback: str  # motivating whole-exam summary text
    question_results: List[QuestionResult]  # per-question details
    processing_time_ms: int  # wall-clock duration of the correction run
# German grading scale (can be customized).
# Ordered best-to-worst; the first threshold the percentage meets or
# exceeds determines the grade.
GERMAN_GRADES = [
    (95, "1+"),  # sehr gut plus
    (90, "1"),   # sehr gut
    (85, "1-"),  # sehr gut minus
    (80, "2+"),  # gut plus
    (75, "2"),   # gut
    (70, "2-"),  # gut minus
    (65, "3+"),  # befriedigend plus
    (60, "3"),   # befriedigend
    (55, "3-"),  # befriedigend minus
    (50, "4+"),  # ausreichend plus
    (45, "4"),   # ausreichend
    (40, "4-"),  # ausreichend minus
    (33, "5+"),  # mangelhaft plus
    (27, "5"),   # mangelhaft
    (20, "5-"),  # mangelhaft minus
    (0, "6"),    # ungenuegend
]


def calculate_grade(percentage: float) -> str:
    """Map a percentage score onto the German grade scale.

    Returns the grade of the first (i.e. highest) threshold that
    ``percentage`` meets or exceeds; "6" only as a defensive fallback
    for inputs below every threshold (negative percentages).
    """
    return next(
        (grade for threshold, grade in GERMAN_GRADES if percentage >= threshold),
        "6",
    )
class ExamCorrectionService:
    """
    Service for AI-assisted exam correction.

    PRIVACY GUARANTEES:
    1. Prompts contain NO personal data
    2. Only doc_token is used as reference
    3. Processing on self-hosted LLM
    4. Results stored with pseudonymized identifiers
    """

    # System prompt for exam correction (German). Instructs the model to
    # grade only the factual content and to reply with a bare JSON object.
    CORRECTION_SYSTEM_PROMPT = """Du bist ein erfahrener Lehrer und korrigierst Schuelerantworten.
WICHTIGE REGELN:
1. Bewerte NUR den fachlichen Inhalt der Antwort
2. Ignoriere Rechtschreibfehler (ausser bei Deutschklausuren)
3. Gib konstruktives, ermutigendes Feedback
4. Beziehe dich auf die Bewertungskriterien
5. Sei fair und konsistent
AUSGABEFORMAT (JSON):
{
"points": <Punktzahl>,
"feedback": "<Kurze Begruendung der Bewertung>",
"strengths": ["<Staerke 1>", "<Staerke 2>"],
"improvements": ["<Verbesserungsvorschlag 1>"]
}
Antworte NUR mit dem JSON-Objekt, ohne weitere Erklaerungen."""

    # Prompt template for the whole-exam summary (plain text, no JSON).
    OVERALL_FEEDBACK_PROMPT = """Basierend auf den einzelnen Bewertungen, erstelle eine Gesamtrueckmeldung.
Einzelbewertungen:
{question_results}
Gesamtpunktzahl: {total_score}/{max_score} ({percentage}%)
Note: {grade}
Erstelle eine motivierende Gesamtrueckmeldung (2-3 Saetze), die:
1. Die Staerken hervorhebt
2. Konstruktive Verbesserungsvorschlaege macht
3. Ermutigt und motiviert
Antworte nur mit dem Feedback-Text, ohne JSON-Formatierung."""

    def __init__(self, model: Optional[str] = None):
        """
        Initialize the correction service.

        Args:
            model: LLM model to use (default: qwen2.5:14b from config)

        PRIVACY:
            The model runs locally via Ollama on self-hosted hardware.
            No data is sent to external servers.
        """
        config = get_config()
        # Use configured correction model (default: qwen2.5:14b)
        self.model = model or config.correction_model
        self.inference = get_inference_service()
        # Lazy %-style args avoid formatting when the level is disabled.
        logger.info("Correction service initialized with model: %s", self.model)

    @staticmethod
    def _clamp_points(raw: object, max_points: int) -> int:
        """Coerce an LLM-provided score into an int within [0, max_points].

        Tolerates ints, floats and numeric strings (e.g. "7.5"); anything
        unparseable yields 0 rather than raising, so a malformed score
        never aborts the whole question.
        """
        try:
            points = int(float(raw))  # float() accepts "7" and "7.5" alike
        except (TypeError, ValueError):
            points = 0
        # Clamp: a hallucinated score can neither exceed the maximum nor
        # go negative.
        return max(0, min(points, max_points))

    async def correct_question(
        self,
        student_answer: str,
        rubric: QuestionRubric,
        subject: str = "Allgemein"
    ) -> QuestionResult:
        """
        Correct a single question answer.

        Args:
            student_answer: The student's OCR-extracted answer (pseudonymized)
            rubric: Grading rubric for this question
            subject: Subject for context

        Returns:
            QuestionResult with points and feedback. Never raises: on LLM
            failure a 0-point result flagged for manual review is returned.
        """
        # Build prompt with NO personal data
        user_prompt = f"""Fach: {subject}
Frage {rubric.question_number}: {rubric.question_text}
Maximale Punktzahl: {rubric.max_points}
Erwartete Antwort:
{rubric.expected_answer}
Bewertungskriterien:
{rubric.grading_criteria}
---
Schuelerantwort:
{student_answer}
---
Bewerte diese Antwort nach den Kriterien."""
        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="system", content=self.CORRECTION_SYSTEM_PROMPT),
                ChatMessage(role="user", content=user_prompt),
            ],
            temperature=0.3,  # Lower temperature for consistent grading
            max_tokens=500,
        )
        try:
            response = await self.inference.complete(request)
            content = response.choices[0].message.content or "{}"
            # Parse JSON response
            try:
                result = json.loads(content)
            except json.JSONDecodeError:
                # Fallback: award half points and flag for manual review.
                logger.warning("Failed to parse LLM response as JSON: %s", content[:100])
                result = {
                    "points": rubric.max_points // 2,
                    "feedback": content[:200],
                    "strengths": [],
                    "improvements": ["Automatische Bewertung fehlgeschlagen - manuelle Pruefung erforderlich"]
                }
            points = self._clamp_points(result.get("points", 0), rubric.max_points)
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=points,
                max_points=rubric.max_points,
                feedback=result.get("feedback", ""),
                strengths=result.get("strengths", []),
                improvements=result.get("improvements", []),
            )
        except Exception as e:
            logger.error("Correction failed for question %s: %s", rubric.question_number, e)
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=0,
                max_points=rubric.max_points,
                feedback=f"Automatische Bewertung fehlgeschlagen: {str(e)}",
                strengths=[],
                improvements=["Manuelle Korrektur erforderlich"],
            )

    async def correct_exam(
        self,
        doc_token: str,
        ocr_text: str,
        rubrics: List[QuestionRubric],
        subject: str = "Allgemein"
    ) -> CorrectionResult:
        """
        Correct a complete exam with multiple questions.

        Args:
            doc_token: Pseudonymized document identifier
            ocr_text: Full OCR text of the exam (already redacted)
            rubrics: List of question rubrics
            subject: Subject name

        Returns:
            CorrectionResult with all scores and feedback
        """
        start_time = time.time()
        # Split OCR text into per-question answers (simple positional
        # heuristic; see _extract_answers).
        answers = self._extract_answers(ocr_text, len(rubrics))
        # Correct each question sequentially.
        question_results = []
        for i, rubric in enumerate(rubrics):
            answer = answers[i] if i < len(answers) else ""
            result = await self.correct_question(answer, rubric, subject)
            question_results.append(result)
        # Calculate totals; guard against a zero-point exam.
        total_score = sum(r.points_awarded for r in question_results)
        max_score = sum(r.max_points for r in question_results)
        percentage = (total_score / max_score * 100) if max_score > 0 else 0
        grade = calculate_grade(percentage)
        # Generate overall feedback
        overall_feedback = await self._generate_overall_feedback(
            question_results, total_score, max_score, percentage, grade
        )
        processing_time_ms = int((time.time() - start_time) * 1000)
        return CorrectionResult(
            doc_token=doc_token,
            total_score=total_score,
            max_score=max_score,
            grade=grade,
            overall_feedback=overall_feedback,
            question_results=question_results,
            processing_time_ms=processing_time_ms,
        )

    async def _generate_overall_feedback(
        self,
        question_results: List[QuestionResult],
        total_score: int,
        max_score: int,
        percentage: float,
        grade: str
    ) -> str:
        """Generate motivating overall feedback via the LLM.

        Falls back to a plain score summary if the LLM call fails, so
        exam correction still completes.
        """
        # Summarize question results (per-question feedback truncated to
        # 100 chars to keep the prompt small).
        results_summary = "\n".join([
            f"Frage {r.question_number}: {r.points_awarded}/{r.max_points} Punkte - {r.feedback[:100]}"
            for r in question_results
        ])
        prompt = self.OVERALL_FEEDBACK_PROMPT.format(
            question_results=results_summary,
            total_score=total_score,
            max_score=max_score,
            percentage=f"{percentage:.1f}",
            grade=grade,
        )
        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="user", content=prompt),
            ],
            temperature=0.5,
            max_tokens=200,
        )
        try:
            response = await self.inference.complete(request)
            return response.choices[0].message.content or "Gute Arbeit! Weiter so."
        except Exception as e:
            logger.error("Failed to generate overall feedback: %s", e)
            return f"Gesamtergebnis: {total_score}/{max_score} Punkte ({grade})"

    def _extract_answers(self, ocr_text: str, num_questions: int) -> List[str]:
        """
        Extract individual answers from OCR text.

        Simple heuristic: split by question markers ("1.", "2)", ...).
        Answers are assigned to rubrics by POSITION, not by the parsed
        number. More sophisticated extraction can be implemented.

        Args:
            ocr_text: Redacted OCR text of the whole exam.
            num_questions: Number of rubrics; the result always has
                exactly this length (padded with "" or truncated).

        Returns:
            List of per-question answer strings.
        """
        # A marker is a line-leading number followed by "." or ")".
        pattern = r'(?:^|\n)\s*(\d+)[.\)]\s*'
        # re.split with one capture group yields
        # [preamble, num, answer, num, answer, ...] -- always odd length.
        parts = re.split(pattern, ocr_text)
        # parts[0] (text before the first marker) is discarded; pair each
        # captured number with the text that follows it.
        answers = [parts[i + 1].strip() for i in range(1, len(parts) - 1, 2)]
        # Pad with empty answers if fewer markers were found than expected.
        while len(answers) < num_questions:
            answers.append("")
        return answers[:num_questions]
# Module-level singleton holding the shared correction service.
_correction_service: Optional[ExamCorrectionService] = None


def get_correction_service(model: Optional[str] = None) -> ExamCorrectionService:
    """
    Get or create the correction service singleton.

    Args:
        model: Optional model override. If None, uses
            config.correction_model (qwen2.5:14b).

    Returns:
        ExamCorrectionService instance

    PRIVACY: All processing runs locally via Ollama - no cloud API.
    """
    global _correction_service
    # Rebuild only when nothing is cached yet, or when the caller
    # explicitly requests a different model than the cached instance uses.
    model_switch_requested = (
        _correction_service is not None
        and bool(model)
        and _correction_service.model != model
    )
    if _correction_service is None or model_switch_requested:
        _correction_service = ExamCorrectionService(model=model)
    return _correction_service