fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
28
backend/klausur/services/__init__.py
Normal file
28
backend/klausur/services/__init__.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""
|
||||
Services for Klausurkorrektur Module.
|
||||
|
||||
- PseudonymizationService: QR code generation, header redaction
|
||||
- CorrectionService: LLM integration for AI-assisted grading
|
||||
- RosterParser: Parse Klassenbuch photos and roster files
|
||||
- SchoolResolver: School/class selection and auto-creation
|
||||
- ModuleLinker: Cross-module links (Notenbuch, Elternabend, etc.)
|
||||
"""
|
||||
|
||||
from .pseudonymizer import PseudonymizationService, get_pseudonymizer
|
||||
from .correction_service import ExamCorrectionService, get_correction_service
|
||||
from .roster_parser import RosterParser, get_roster_parser
|
||||
from .school_resolver import SchoolResolver, get_school_resolver
|
||||
from .module_linker import ModuleLinker, get_module_linker
|
||||
|
||||
__all__ = [
|
||||
"PseudonymizationService",
|
||||
"get_pseudonymizer",
|
||||
"ExamCorrectionService",
|
||||
"get_correction_service",
|
||||
"RosterParser",
|
||||
"get_roster_parser",
|
||||
"SchoolResolver",
|
||||
"get_school_resolver",
|
||||
"ModuleLinker",
|
||||
"get_module_linker",
|
||||
]
|
||||
379
backend/klausur/services/correction_service.py
Normal file
379
backend/klausur/services/correction_service.py
Normal file
@@ -0,0 +1,379 @@
|
||||
"""
|
||||
Exam Correction Service using Self-Hosted LLM.
|
||||
|
||||
PRIVACY BY DESIGN:
|
||||
- Only pseudonymized text (doc_token + OCR content) is sent to LLM
|
||||
- No student names or personal data in prompts
|
||||
- All processing happens on self-hosted infrastructure (SysEleven)
|
||||
- No data sent to external APIs (unless explicitly configured)
|
||||
|
||||
This service generates AI-assisted corrections and feedback for exam answers.
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
from dataclasses import dataclass
|
||||
|
||||
from llm_gateway.services.inference import get_inference_service, InferenceResult
|
||||
from llm_gateway.models.chat import ChatCompletionRequest, ChatMessage
|
||||
from llm_gateway.config import get_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class QuestionRubric:
    """Grading rubric for a single exam question."""

    # 1-based question index as printed on the exam sheet
    question_number: int
    question_text: str
    max_points: int
    # model solution the student answer is compared against
    expected_answer: str
    # free-text grading criteria handed to the LLM
    grading_criteria: str


@dataclass
class QuestionResult:
    """AI correction outcome for a single question."""

    question_number: int
    points_awarded: int
    max_points: int
    feedback: str
    strengths: List[str]
    improvements: List[str]


@dataclass
class CorrectionResult:
    """Complete correction outcome for one exam."""

    # Pseudonymized identifier — never a student name.
    doc_token: str
    total_score: int
    max_score: int
    grade: str
    overall_feedback: str
    question_results: List[QuestionResult]
    processing_time_ms: int
|
||||
|
||||
|
||||
# German grading scale as (minimum percentage, grade label) pairs,
# ordered best-to-worst so the first threshold reached wins.
GERMAN_GRADES = [
    (95, "1+"),  # sehr gut plus
    (90, "1"),   # sehr gut
    (85, "1-"),  # sehr gut minus
    (80, "2+"),  # gut plus
    (75, "2"),   # gut
    (70, "2-"),  # gut minus
    (65, "3+"),  # befriedigend plus
    (60, "3"),   # befriedigend
    (55, "3-"),  # befriedigend minus
    (50, "4+"),  # ausreichend plus
    (45, "4"),   # ausreichend
    (40, "4-"),  # ausreichend minus
    (33, "5+"),  # mangelhaft plus
    (27, "5"),   # mangelhaft
    (20, "5-"),  # mangelhaft minus
    (0, "6"),    # ungenuegend
]


def calculate_grade(percentage: float) -> str:
    """Map a percentage score onto the German grade scale.

    Returns the label of the highest threshold that *percentage*
    reaches; anything below every threshold falls back to "6".
    """
    return next(
        (label for threshold, label in GERMAN_GRADES if percentage >= threshold),
        "6",
    )
|
||||
|
||||
|
||||
class ExamCorrectionService:
    """
    Service for AI-assisted exam correction.

    PRIVACY GUARANTEES:
    1. Prompts contain NO personal data
    2. Only doc_token is used as reference
    3. Processing on self-hosted LLM
    4. Results stored with pseudonymized identifiers
    """

    # System prompt for exam correction (German).
    # FIX: corrected typo "ermutigzendes" -> "ermutigendes" in rule 3.
    CORRECTION_SYSTEM_PROMPT = """Du bist ein erfahrener Lehrer und korrigierst Schuelerantworten.

WICHTIGE REGELN:
1. Bewerte NUR den fachlichen Inhalt der Antwort
2. Ignoriere Rechtschreibfehler (ausser bei Deutschklausuren)
3. Gib konstruktives, ermutigendes Feedback
4. Beziehe dich auf die Bewertungskriterien
5. Sei fair und konsistent

AUSGABEFORMAT (JSON):
{
    "points": <Punktzahl>,
    "feedback": "<Kurze Begruendung der Bewertung>",
    "strengths": ["<Staerke 1>", "<Staerke 2>"],
    "improvements": ["<Verbesserungsvorschlag 1>"]
}

Antworte NUR mit dem JSON-Objekt, ohne weitere Erklaerungen."""

    OVERALL_FEEDBACK_PROMPT = """Basierend auf den einzelnen Bewertungen, erstelle eine Gesamtrueckmeldung.

Einzelbewertungen:
{question_results}

Gesamtpunktzahl: {total_score}/{max_score} ({percentage}%)
Note: {grade}

Erstelle eine motivierende Gesamtrueckmeldung (2-3 Saetze), die:
1. Die Staerken hervorhebt
2. Konstruktive Verbesserungsvorschlaege macht
3. Ermutigt und motiviert

Antworte nur mit dem Feedback-Text, ohne JSON-Formatierung."""

    def __init__(self, model: Optional[str] = None):
        """
        Initialize the correction service.

        Args:
            model: LLM model to use (default: qwen2.5:14b from config)

        DATENSCHUTZ/PRIVACY:
            The model runs locally via Ollama; no data is sent to
            external servers.
        """
        config = get_config()
        # Use configured correction model (default: qwen2.5:14b)
        self.model = model or config.correction_model
        self.inference = get_inference_service()
        logger.info(f"Correction service initialized with model: {self.model}")

    @staticmethod
    def _parse_correction_json(content: str, rubric: QuestionRubric) -> dict:
        """Parse the LLM's JSON answer; on failure return a manual-review stub.

        The stub awards half points and flags the answer for manual checking,
        preserving the original fallback behavior.
        """
        import json
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            logger.warning(f"Failed to parse LLM response as JSON: {content[:100]}")
            return {
                "points": rubric.max_points // 2,
                "feedback": content[:200],
                "strengths": [],
                "improvements": ["Automatische Bewertung fehlgeschlagen - manuelle Pruefung erforderlich"]
            }

    async def correct_question(
        self,
        student_answer: str,
        rubric: QuestionRubric,
        subject: str = "Allgemein"
    ) -> QuestionResult:
        """
        Correct a single question answer.

        Args:
            student_answer: The student's OCR-extracted answer (pseudonymized)
            rubric: Grading rubric for this question
            subject: Subject for context

        Returns:
            QuestionResult with points and feedback; on any LLM error a
            zero-point result requesting manual correction is returned.
        """
        # Build prompt with NO personal data
        user_prompt = f"""Fach: {subject}
Frage {rubric.question_number}: {rubric.question_text}
Maximale Punktzahl: {rubric.max_points}

Erwartete Antwort:
{rubric.expected_answer}

Bewertungskriterien:
{rubric.grading_criteria}

---

Schuelerantwort:
{student_answer}

---

Bewerte diese Antwort nach den Kriterien."""

        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="system", content=self.CORRECTION_SYSTEM_PROMPT),
                ChatMessage(role="user", content=user_prompt),
            ],
            temperature=0.3,  # Lower temperature for consistent grading
            max_tokens=500,
        )

        try:
            response = await self.inference.complete(request)
            content = response.choices[0].message.content or "{}"

            result = self._parse_correction_json(content, rubric)

            # Coerce and clamp the awarded points into [0, max_points].
            # FIX: the original only capped the upper bound, so a negative
            # LLM value would have propagated into the exam total.
            try:
                raw_points = int(result.get("points", 0))
            except (TypeError, ValueError):
                raw_points = 0
            points = max(0, min(raw_points, rubric.max_points))

            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=points,
                max_points=rubric.max_points,
                feedback=result.get("feedback", ""),
                strengths=result.get("strengths", []),
                improvements=result.get("improvements", []),
            )

        except Exception as e:
            logger.error(f"Correction failed for question {rubric.question_number}: {e}")
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=0,
                max_points=rubric.max_points,
                feedback=f"Automatische Bewertung fehlgeschlagen: {str(e)}",
                strengths=[],
                improvements=["Manuelle Korrektur erforderlich"],
            )

    async def correct_exam(
        self,
        doc_token: str,
        ocr_text: str,
        rubrics: List[QuestionRubric],
        subject: str = "Allgemein"
    ) -> CorrectionResult:
        """
        Correct a complete exam with multiple questions.

        Args:
            doc_token: Pseudonymized document identifier
            ocr_text: Full OCR text of the exam (already redacted)
            rubrics: List of question rubrics
            subject: Subject name

        Returns:
            CorrectionResult with all scores and feedback
        """
        import time
        start_time = time.time()

        # Split OCR text into per-question answers (simple heuristic)
        answers = self._extract_answers(ocr_text, len(rubrics))

        # Correct each question in order; missing answers grade as empty.
        question_results = []
        for i, rubric in enumerate(rubrics):
            answer = answers[i] if i < len(answers) else ""
            result = await self.correct_question(answer, rubric, subject)
            question_results.append(result)

        # Aggregate totals and derive the German grade.
        total_score = sum(r.points_awarded for r in question_results)
        max_score = sum(r.max_points for r in question_results)
        percentage = (total_score / max_score * 100) if max_score > 0 else 0
        grade = calculate_grade(percentage)

        # Generate the motivating overall feedback text.
        overall_feedback = await self._generate_overall_feedback(
            question_results, total_score, max_score, percentage, grade
        )

        processing_time_ms = int((time.time() - start_time) * 1000)

        return CorrectionResult(
            doc_token=doc_token,
            total_score=total_score,
            max_score=max_score,
            grade=grade,
            overall_feedback=overall_feedback,
            question_results=question_results,
            processing_time_ms=processing_time_ms,
        )

    async def _generate_overall_feedback(
        self,
        question_results: List[QuestionResult],
        total_score: int,
        max_score: int,
        percentage: float,
        grade: str
    ) -> str:
        """Generate motivating overall feedback.

        Falls back to a plain score summary if the LLM call fails.
        """
        # Summarize per-question results (feedback truncated to 100 chars)
        results_summary = "\n".join([
            f"Frage {r.question_number}: {r.points_awarded}/{r.max_points} Punkte - {r.feedback[:100]}"
            for r in question_results
        ])

        prompt = self.OVERALL_FEEDBACK_PROMPT.format(
            question_results=results_summary,
            total_score=total_score,
            max_score=max_score,
            percentage=f"{percentage:.1f}",
            grade=grade,
        )

        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="user", content=prompt),
            ],
            temperature=0.5,
            max_tokens=200,
        )

        try:
            response = await self.inference.complete(request)
            return response.choices[0].message.content or "Gute Arbeit! Weiter so."
        except Exception as e:
            logger.error(f"Failed to generate overall feedback: {e}")
            return f"Gesamtergebnis: {total_score}/{max_score} Punkte ({grade})"

    def _extract_answers(self, ocr_text: str, num_questions: int) -> List[str]:
        """
        Extract individual answers from OCR text.

        Simple heuristic: split by question markers ("1.", "2)", ...).
        The result is padded with empty strings and truncated so exactly
        *num_questions* entries are returned.
        """
        import re

        # Markers are a number followed by "." or ")" at a line start.
        pattern = r'(?:^|\n)\s*(\d+)[.\)]\s*'
        parts = re.split(pattern, ocr_text)

        answers = []
        i = 1  # Skip the text before the first marker
        while i < len(parts):
            if i + 1 < len(parts):
                # parts[i] is the question number, parts[i+1] the answer text
                answers.append(parts[i + 1].strip())
            i += 2

        # Pad with empty answers if fewer markers were found
        while len(answers) < num_questions:
            answers.append("")

        return answers[:num_questions]
|
||||
|
||||
|
||||
# Lazily created module-wide service instance.
_correction_service: Optional[ExamCorrectionService] = None


def get_correction_service(model: Optional[str] = None) -> ExamCorrectionService:
    """Return the shared correction service, creating it on demand.

    Args:
        model: Optional model override. If None, the configured
            correction model (qwen2.5:14b) is used. Passing a different
            model than the current singleton's recreates the instance.

    Returns:
        ExamCorrectionService instance

    DATENSCHUTZ: all processing happens locally via Ollama - no cloud API.
    """
    global _correction_service
    # Create on first use, or recreate when a different model is
    # explicitly requested (same truthiness semantics as before).
    if _correction_service is None or (model and _correction_service.model != model):
        _correction_service = ExamCorrectionService(model=model)
    return _correction_service
|
||||
630
backend/klausur/services/module_linker.py
Normal file
630
backend/klausur/services/module_linker.py
Normal file
@@ -0,0 +1,630 @@
|
||||
"""
|
||||
Module Linker Service - Cross-Module Verknuepfungen.
|
||||
|
||||
Verknuepft Klausur-Ergebnisse mit anderen BreakPilot-Modulen:
|
||||
- Notenbuch (School Service)
|
||||
- Elternabend (Gespraechsvorschlaege)
|
||||
- Zeugnisse (Notenuebernahme)
|
||||
- Kalender (Termine)
|
||||
|
||||
Privacy:
|
||||
- Verknuepfungen nutzen doc_tokens (pseudonymisiert)
|
||||
- Deanonymisierung nur Client-seitig moeglich
|
||||
"""
|
||||
|
||||
import httpx
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Any
|
||||
from datetime import datetime, timedelta
|
||||
from enum import Enum
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DATA CLASSES
|
||||
# ============================================================================
|
||||
|
||||
class LinkType(str, Enum):
    """Kind of cross-module link."""
    NOTENBUCH = "notenbuch"
    ELTERNABEND = "elternabend"
    ZEUGNIS = "zeugnis"
    CALENDAR = "calendar"
    KLASSENBUCH = "klassenbuch"


class MeetingUrgency(str, Enum):
    """Urgency of a suggested parent-teacher meeting."""
    LOW = "niedrig"
    MEDIUM = "mittel"
    HIGH = "hoch"


@dataclass
class CorrectionResult:
    """Correction outcome for one exam (pseudonymized)."""
    doc_token: str
    score: float        # points achieved
    max_score: float
    grade: str          # e.g. "2+"
    feedback: str
    question_results: List[Dict[str, Any]] = field(default_factory=list)


@dataclass
class GradeEntry:
    """Entry for the grade book (Notenbuch)."""
    student_id: str     # real student id inside the grade book
    doc_token: str      # pseudonymized id coming from the exam module
    grade: str
    points: float
    max_points: float
    exam_name: str
    date: str


@dataclass
class ParentMeetingSuggestion:
    """Suggestion for a parent-teacher meeting (pseudonymized)."""
    doc_token: str
    reason: str
    urgency: MeetingUrgency
    grade: str
    subject: str
    suggested_topics: List[str] = field(default_factory=list)


@dataclass
class CalendarEvent:
    """Calendar entry for a scheduled parent meeting."""
    id: str
    title: str
    description: str
    start_time: datetime
    end_time: datetime
    event_type: str
    linked_doc_tokens: List[str] = field(default_factory=list)


@dataclass
class ModuleLink:
    """Link from an exam session to an entity in another module."""
    id: str
    klausur_session_id: str
    link_type: LinkType
    target_module: str
    target_entity_id: str
    target_url: Optional[str] = None
    link_metadata: Dict[str, Any] = field(default_factory=dict)
    # NOTE(review): naive UTC timestamp; datetime.utcnow is deprecated in
    # Python 3.12 — kept as-is to avoid mixing naive and aware datetimes.
    created_at: datetime = field(default_factory=datetime.utcnow)


@dataclass
class LinkResult:
    """Outcome of a linking operation."""
    success: bool
    link: Optional[ModuleLink] = None
    message: str = ""
    target_url: Optional[str] = None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# MODULE LINKER
|
||||
# ============================================================================
|
||||
|
||||
class ModuleLinker:
    """
    Links Klausur (exam) results to other BreakPilot modules:
    Notenbuch, Elternabend, Zeugnis and Kalender.

    Privacy: links carry pseudonymized doc_tokens only; de-anonymization
    is only possible client-side unless the teacher explicitly supplies
    an identity map.

    Example:
        linker = ModuleLinker()

        # Transfer grades to the grade book
        result = await linker.link_to_notenbuch(
            session_id="session-123",
            class_id="class-456",
            results=correction_results
        )

        # Suggest parent-teacher meetings
        suggestions = linker.suggest_elternabend(
            results=correction_results,
            subject="Mathematik"
        )
    """

    # Grade thresholds (fraction of max points) per German grade.
    GRADE_THRESHOLDS = {
        "1+": 0.95, "1": 0.90, "1-": 0.85,
        "2+": 0.80, "2": 0.75, "2-": 0.70,
        "3+": 0.65, "3": 0.60, "3-": 0.55,
        "4+": 0.50, "4": 0.45, "4-": 0.40,
        "5+": 0.33, "5": 0.25, "5-": 0.17,
        "6": 0.0
    }

    # Grades that trigger a parent-meeting suggestion.
    MEETING_TRIGGER_GRADES = ["4", "4-", "5+", "5", "5-", "6"]

    # Numeric value per grade label. Hoisted to a single class constant
    # (the original duplicated this dict in two methods).
    _GRADE_VALUES = {
        "1+": 0.7, "1": 1.0, "1-": 1.3,
        "2+": 1.7, "2": 2.0, "2-": 2.3,
        "3+": 2.7, "3": 3.0, "3-": 3.3,
        "4+": 3.7, "4": 4.0, "4-": 4.3,
        "5+": 4.7, "5": 5.0, "5-": 5.3,
        "6": 6.0
    }

    # Upper bounds for converting a numeric average back to a grade label
    # (replaces the original 16-branch elif ladder; identical results).
    _NUMERIC_BOUNDS = [
        (1.15, "1+"), (1.5, "1"), (1.85, "1-"),
        (2.15, "2+"), (2.5, "2"), (2.85, "2-"),
        (3.15, "3+"), (3.5, "3"), (3.85, "3-"),
        (4.15, "4+"), (4.5, "4"), (4.85, "4-"),
        (5.15, "5+"), (5.5, "5"), (5.85, "5-"),
    ]

    def __init__(self):
        # Service endpoints are configurable via environment variables.
        self.school_service_url = os.getenv(
            "SCHOOL_SERVICE_URL",
            "http://school-service:8084"
        )
        self.calendar_service_url = os.getenv(
            "CALENDAR_SERVICE_URL",
            "http://calendar-service:8085"
        )

    # =========================================================================
    # NOTENBUCH INTEGRATION
    # =========================================================================

    async def link_to_notenbuch(
        self,
        session_id: str,
        class_id: str,
        subject: str,
        results: List[CorrectionResult],
        exam_name: str,
        exam_date: str,
        identity_map: Optional[Dict[str, str]] = None
    ) -> LinkResult:
        """
        Transfer grades to the grade book (School Service).

        Args:
            session_id: Klausur session id
            class_id: Class id in the School Service
            subject: Subject
            results: List of correction results
            exam_name: Name of the exam
            exam_date: Date of the exam
            identity_map: Optional doc_token -> student_id mapping

        Note:
            The identity_map is only used server-side when the teacher
            explicitly approves the link; normally the mapping stays
            client-side.
        """
        try:
            # Build the per-student grade payload (pseudonymized by default).
            grades_data = []
            for result in results:
                grade_entry = {
                    "doc_token": result.doc_token,
                    "grade": result.grade,
                    "points": result.score,
                    "max_points": result.max_score,
                    "percentage": result.score / result.max_score if result.max_score > 0 else 0
                }

                # Attach the real student id only when a mapping was provided.
                if identity_map and result.doc_token in identity_map:
                    grade_entry["student_id"] = identity_map[result.doc_token]

                grades_data.append(grade_entry)

            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/classes/{class_id}/exams",
                    json={
                        "name": exam_name,
                        "subject": subject,
                        "date": exam_date,
                        "max_points": results[0].max_score if results else 100,
                        "grades": grades_data,
                        "klausur_session_id": session_id
                    }
                )

            if response.status_code in (200, 201):
                data = response.json()
                return LinkResult(
                    success=True,
                    link=ModuleLink(
                        id=data.get('id', ''),
                        klausur_session_id=session_id,
                        link_type=LinkType.NOTENBUCH,
                        target_module="school",
                        target_entity_id=data.get('id', ''),
                        target_url=f"/app?module=school&class={class_id}&exam={data.get('id')}"
                    ),
                    message=f"Noten erfolgreich uebertragen ({len(results)} Eintraege)",
                    target_url=f"/app?module=school&class={class_id}"
                )

            return LinkResult(
                success=False,
                message=f"Fehler beim Uebertragen: {response.status_code}"
            )

        except Exception as e:
            return LinkResult(
                success=False,
                message=f"Verbindungsfehler: {str(e)}"
            )

    # =========================================================================
    # ELTERNABEND VORSCHLAEGE
    # =========================================================================

    def suggest_elternabend(
        self,
        results: List[CorrectionResult],
        subject: str,
        threshold_grade: str = "4"
    ) -> List[ParentMeetingSuggestion]:
        """
        Suggest parent-teacher meetings for weak students.

        Args:
            results: List of correction results
            subject: Subject
            threshold_grade: NOTE(review): currently unused — triggering is
                based on MEETING_TRIGGER_GRADES; the original computed an
                index from this parameter but never used it (dead code,
                removed here).

        Returns:
            Meeting suggestions (pseudonymized), sorted by urgency.
        """
        suggestions = []

        for result in results:
            # Only grades in the trigger list warrant a meeting.
            if result.grade in self.MEETING_TRIGGER_GRADES:
                urgency = self._determine_urgency(result.grade)
                topics = self._generate_meeting_topics(result, subject)

                suggestions.append(ParentMeetingSuggestion(
                    doc_token=result.doc_token,
                    reason=f"Note {result.grade} in {subject}",
                    urgency=urgency,
                    grade=result.grade,
                    subject=subject,
                    suggested_topics=topics
                ))

        # Sort by urgency: high first.
        urgency_order = {
            MeetingUrgency.HIGH: 0,
            MeetingUrgency.MEDIUM: 1,
            MeetingUrgency.LOW: 2
        }
        suggestions.sort(key=lambda s: urgency_order[s.urgency])

        return suggestions

    def _determine_urgency(self, grade: str) -> MeetingUrgency:
        """Determine meeting urgency from the grade."""
        if grade in ["5-", "6"]:
            return MeetingUrgency.HIGH
        elif grade in ["5", "5+"]:
            return MeetingUrgency.MEDIUM
        else:
            return MeetingUrgency.LOW

    def _generate_meeting_topics(
        self,
        result: CorrectionResult,
        subject: str
    ) -> List[str]:
        """Generate meeting topics based on the correction result.

        Returns at most 5 topics.
        """
        topics = []

        # General topic always included.
        topics.append(f"Leistungsstand in {subject}")

        # Topics derived from feedback keywords.
        # BUG FIX: the original tested "Verstaendnis" (capital V) against a
        # lower-cased string, so that branch could never match.
        feedback_lower = result.feedback.lower()
        if "verstaendnis" in feedback_lower or "grundlagen" in feedback_lower:
            topics.append("Grundlagenverstaendnis foerdern")

        if "uebung" in feedback_lower:
            topics.append("Zusaetzliche Uebungsmoeglichkeiten")

        # Topics derived from per-question results.
        if result.question_results:
            weak_areas = []
            for qr in result.question_results:
                max_pts = qr.get('max_points', 1) or 1  # guard against 0
                if qr.get('points_awarded', 0) / max_pts < 0.5:
                    weak_areas.append(qr.get('question_text', ''))

            if weak_areas:
                topics.append("Gezielte Foerderung in Schwachstellen")

        # Pad with default topics when fewer than 3 were generated.
        if not topics or len(topics) < 3:
            topics.extend([
                "Lernstrategien besprechen",
                "Unterstuetzungsmoeglichkeiten zu Hause",
                "Nachhilfe-Optionen"
            ])

        return topics[:5]  # Max 5 topics

    async def create_elternabend_link(
        self,
        session_id: str,
        suggestions: List[ParentMeetingSuggestion],
        teacher_id: str
    ) -> LinkResult:
        """Create links to the Elternabend module (metadata only for now)."""
        # TODO: integrate with the Elternabend module; currently only
        # aggregate metadata is stored.
        return LinkResult(
            success=True,
            link=ModuleLink(
                id=f"elternabend-{session_id}",
                klausur_session_id=session_id,
                link_type=LinkType.ELTERNABEND,
                target_module="elternabend",
                target_entity_id="",
                link_metadata={
                    "suggestion_count": len(suggestions),
                    "high_urgency_count": sum(
                        1 for s in suggestions if s.urgency == MeetingUrgency.HIGH
                    )
                }
            ),
            message=f"{len(suggestions)} Elterngespraeche vorgeschlagen",
            target_url="/app?module=elternabend"
        )

    # =========================================================================
    # ZEUGNIS INTEGRATION
    # =========================================================================

    async def update_zeugnis(
        self,
        class_id: str,
        subject: str,
        grades: Dict[str, str],
        exam_weight: float = 1.0
    ) -> LinkResult:
        """
        Update the certificate (Zeugnis) aggregation with new grades.

        Args:
            class_id: Class id
            subject: Subject
            grades: doc_token -> grade mapping
            exam_weight: Weight of the exam (default: 1.0)
        """
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/classes/{class_id}/grades/aggregate",
                    json={
                        "subject": subject,
                        "grades": grades,
                        "weight": exam_weight,
                        "type": "klausur"
                    }
                )

            if response.status_code in (200, 201):
                return LinkResult(
                    success=True,
                    message="Zeugnis-Daten aktualisiert",
                    target_url=f"/app?module=school&class={class_id}&tab=certificates"
                )

            return LinkResult(
                success=False,
                message=f"Fehler: {response.status_code}"
            )

        except Exception as e:
            return LinkResult(
                success=False,
                message=f"Verbindungsfehler: {str(e)}"
            )

    # =========================================================================
    # KALENDER INTEGRATION
    # =========================================================================

    async def create_calendar_events(
        self,
        teacher_id: str,
        suggestions: List[ParentMeetingSuggestion],
        default_duration_minutes: int = 30
    ) -> List[CalendarEvent]:
        """
        Create calendar entries for parent-teacher meetings.

        Slots start next week at 14:00 and roll over to the next day once
        a slot would start at or after 18:00. Posting to the calendar
        service is best-effort: failures are logged, events are still
        returned.

        Args:
            teacher_id: Teacher id
            suggestions: Meeting suggestions
            default_duration_minutes: Default duration per meeting
        """
        events = []

        # First slot: next Monday at 14:00.
        start_date = datetime.now() + timedelta(days=7 - datetime.now().weekday())
        start_date = start_date.replace(hour=14, minute=0, second=0, microsecond=0)

        slot_index = 0
        for suggestion in suggestions:
            event_start = start_date + timedelta(minutes=slot_index * default_duration_minutes)
            event_end = event_start + timedelta(minutes=default_duration_minutes)

            # Roll over to the next day once past 18:00.
            if event_start.hour >= 18:
                start_date += timedelta(days=1)
                start_date = start_date.replace(hour=14)
                slot_index = 0
                event_start = start_date
                event_end = event_start + timedelta(minutes=default_duration_minutes)

            event = CalendarEvent(
                id=f"meeting-{suggestion.doc_token[:8]}",
                title=f"Elterngespraech ({suggestion.grade})",
                description=f"Anlass: {suggestion.reason}\n\nThemen:\n" +
                            "\n".join(f"- {t}" for t in suggestion.suggested_topics),
                start_time=event_start,
                end_time=event_end,
                event_type="parent_meeting",
                linked_doc_tokens=[suggestion.doc_token]
            )
            events.append(event)
            slot_index += 1

        # Push events to the calendar service (best effort).
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                for event in events:
                    await client.post(
                        f"{self.calendar_service_url}/api/events",
                        json={
                            "teacher_id": teacher_id,
                            "title": event.title,
                            "description": event.description,
                            "start": event.start_time.isoformat(),
                            "end": event.end_time.isoformat(),
                            "type": event.event_type,
                            "metadata": {
                                "doc_tokens": event.linked_doc_tokens
                            }
                        }
                    )
        except Exception as e:
            print(f"[ModuleLinker] Calendar service error: {e}")

        return events

    # =========================================================================
    # STATISTIKEN
    # =========================================================================

    def calculate_grade_statistics(
        self,
        results: List[CorrectionResult]
    ) -> Dict[str, Any]:
        """
        Compute grade statistics.

        Returns:
            Dict with average, distribution, median, best/worst grade,
            pass/fail counts; empty dict for empty input.
        """
        if not results:
            return {}

        grades = [r.grade for r in results]
        points = [r.score for r in results]
        max_points = results[0].max_score if results else 100

        # Numeric average (unknown grades default to 4.0, as before).
        numeric_grades = [self._GRADE_VALUES.get(g, 4.0) for g in grades]
        avg_grade = sum(numeric_grades) / len(numeric_grades)

        # Per-label distribution.
        distribution = {}
        for grade in grades:
            distribution[grade] = distribution.get(grade, 0) + 1

        # Distribution grouped by full grade.
        percent_distribution = {
            "sehr gut (1)": sum(1 for g in grades if g.startswith("1")),
            "gut (2)": sum(1 for g in grades if g.startswith("2")),
            "befriedigend (3)": sum(1 for g in grades if g.startswith("3")),
            "ausreichend (4)": sum(1 for g in grades if g.startswith("4")),
            "mangelhaft (5)": sum(1 for g in grades if g.startswith("5")),
            "ungenuegend (6)": sum(1 for g in grades if g == "6")
        }

        return {
            "count": len(results),
            "average_grade": round(avg_grade, 2),
            "average_grade_display": self._numeric_to_grade(avg_grade),
            "average_points": round(sum(points) / len(points), 1),
            "max_points": max_points,
            "average_percent": round((sum(points) / len(points) / max_points) * 100, 1),
            "best_grade": min(grades, key=lambda g: self._GRADE_VALUES.get(g, 6)),
            "worst_grade": max(grades, key=lambda g: self._GRADE_VALUES.get(g, 0)),
            "median_grade": self._calculate_median_grade(grades),
            "distribution": distribution,
            "percent_distribution": percent_distribution,
            "passing_count": sum(1 for g in grades if not g.startswith("5") and g != "6"),
            "failing_count": sum(1 for g in grades if g.startswith("5") or g == "6")
        }

    def _numeric_to_grade(self, value: float) -> str:
        """Convert a numeric grade value back to its display label."""
        for bound, label in self._NUMERIC_BOUNDS:
            if value <= bound:
                return label
        return "6"

    def _calculate_median_grade(self, grades: List[str]) -> str:
        """Compute the median grade label."""
        numeric = sorted(self._GRADE_VALUES.get(g, 4.0) for g in grades)
        n = len(numeric)
        if n % 2 == 0:
            median = (numeric[n // 2 - 1] + numeric[n // 2]) / 2
        else:
            median = numeric[n // 2]

        return self._numeric_to_grade(median)
|
||||
|
||||
|
||||
# Lazily-created module-wide singleton
_module_linker: Optional[ModuleLinker] = None


def get_module_linker() -> ModuleLinker:
    """Return the shared ModuleLinker instance, creating it on first use."""
    global _module_linker
    if _module_linker is None:
        _module_linker = ModuleLinker()
    return _module_linker
|
||||
424
backend/klausur/services/processing_service.py
Normal file
424
backend/klausur/services/processing_service.py
Normal file
@@ -0,0 +1,424 @@
|
||||
"""
|
||||
Background Processing Service for Klausur Correction.
|
||||
|
||||
Orchestrates the complete correction pipeline:
|
||||
1. Load documents from storage
|
||||
2. Run TrOCR for text extraction
|
||||
3. Run AI correction for grading
|
||||
4. Save results to database
|
||||
|
||||
PRIVACY BY DESIGN:
|
||||
- Only pseudonymized doc_tokens used throughout
|
||||
- No student names in processing pipeline
|
||||
- All data stays on self-hosted infrastructure
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Callable
|
||||
from dataclasses import dataclass
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ..db_models import (
|
||||
ExamSession, PseudonymizedDocument,
|
||||
SessionStatus, DocumentStatus
|
||||
)
|
||||
from ..repository import KlausurRepository
|
||||
from .trocr_client import get_trocr_client, TrOCRClient
|
||||
from .vision_ocr_service import get_vision_ocr_service, VisionOCRService
|
||||
from .correction_service import (
|
||||
get_correction_service, ExamCorrectionService,
|
||||
QuestionRubric, CorrectionResult
|
||||
)
|
||||
from .storage_service import get_storage_service, KlausurStorageService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ProcessingProgress:
    """Snapshot of correction-pipeline progress, streamed to clients via SSE."""
    session_id: str
    total_documents: int
    processed_documents: int
    # Truncated doc_token of the document currently being worked on.
    current_document: Optional[str] = None
    # Pipeline phase: idle, ocr, correction, saving (or complete at the end).
    current_step: str = "idle"
    error: Optional[str] = None

    @property
    def percentage(self) -> int:
        """Completion as an integer percentage; 0 when there is nothing to do."""
        if not self.total_documents:
            return 0
        return int(self.processed_documents / self.total_documents * 100)
|
||||
|
||||
|
||||
class ProcessingService:
|
||||
"""
|
||||
Background service for exam correction processing.
|
||||
|
||||
Usage:
|
||||
service = ProcessingService(db_session)
|
||||
await service.process_session(session_id, teacher_id)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
db: Session,
|
||||
trocr_client: Optional[TrOCRClient] = None,
|
||||
vision_ocr_service: Optional[VisionOCRService] = None,
|
||||
correction_service: Optional[ExamCorrectionService] = None,
|
||||
storage_service: Optional[KlausurStorageService] = None,
|
||||
prefer_vision_ocr: bool = True # Vision-LLM als Primär für Handschrift
|
||||
):
|
||||
self.db = db
|
||||
self.repo = KlausurRepository(db)
|
||||
self.trocr = trocr_client or get_trocr_client()
|
||||
self.vision_ocr = vision_ocr_service or get_vision_ocr_service()
|
||||
self.correction = correction_service or get_correction_service()
|
||||
self.storage = storage_service or get_storage_service()
|
||||
self.prefer_vision_ocr = prefer_vision_ocr
|
||||
|
||||
# Progress callback for SSE streaming
|
||||
self._progress_callback: Optional[Callable[[ProcessingProgress], None]] = None
|
||||
|
||||
def set_progress_callback(self, callback: Callable[[ProcessingProgress], None]):
|
||||
"""Set callback for progress updates (SSE streaming)."""
|
||||
self._progress_callback = callback
|
||||
|
||||
def _notify_progress(self, progress: ProcessingProgress):
|
||||
"""Notify progress to callback if set."""
|
||||
if self._progress_callback:
|
||||
try:
|
||||
self._progress_callback(progress)
|
||||
except Exception as e:
|
||||
logger.warning(f"Progress callback failed: {e}")
|
||||
|
||||
    async def process_session(
        self,
        session_id: str,
        teacher_id: str,
        use_ai_correction: bool = True
    ) -> bool:
        """
        Process all documents in a session.

        Per-document pipeline: OCR (Vision-LLM preferred for handwriting,
        TrOCR as fallback) followed by optional AI correction. A failure in
        one document is recorded and does not abort the batch; the session
        is marked COMPLETED afterwards either way.

        Args:
            session_id: Exam session ID
            teacher_id: Teacher ID for isolation
            use_ai_correction: Whether to run AI correction (requires LLM)

        Returns:
            True if processing completed successfully (False when the
            session does not exist or contains no documents)
        """
        # Get session (teacher_id scopes the lookup for tenant isolation)
        session = self.repo.get_session(session_id, teacher_id)
        if not session:
            logger.error(f"Session not found: {session_id}")
            return False

        # Get documents
        documents = self.repo.list_documents(session_id, teacher_id)
        if not documents:
            logger.warning(f"No documents in session: {session_id}")
            return False

        total = len(documents)
        processed = 0

        logger.info(f"Starting processing for session {session_id}: {total} documents")

        # Check OCR service availability (Vision-LLM preferred for handwriting)
        vision_ocr_available = await self.vision_ocr.is_available()
        trocr_available = await self.trocr.is_available()

        # Pick the OCR backend once for the whole batch.
        if vision_ocr_available and self.prefer_vision_ocr:
            logger.info("Using Vision-LLM (llama3.2-vision) for OCR - optimal for handwriting")
            use_vision_ocr = True
        elif trocr_available:
            logger.info("Using TrOCR for OCR")
            use_vision_ocr = False
        elif vision_ocr_available:
            # prefer_vision_ocr was False, but Vision-LLM is the only option left.
            logger.info("TrOCR not available, falling back to Vision-LLM")
            use_vision_ocr = True
        else:
            logger.warning("No OCR service available - OCR will be skipped")
            use_vision_ocr = False
            trocr_available = False

        # Process each document
        for doc in documents:
            progress = ProcessingProgress(
                session_id=session_id,
                total_documents=total,
                processed_documents=processed,
                # Only a truncated token is exposed - never student identity.
                current_document=doc.doc_token[:8],
                current_step="ocr"
            )
            self._notify_progress(progress)

            try:
                # Step 1: OCR extraction (Vision-LLM or TrOCR).
                # Only documents still in UPLOADED state are OCR'd, so
                # re-running a session does not redo finished documents.
                if (vision_ocr_available or trocr_available) and doc.status == DocumentStatus.UPLOADED:
                    await self._process_ocr(session_id, doc, teacher_id, use_vision_ocr=use_vision_ocr)

                # Step 2: AI correction
                progress.current_step = "correction"
                self._notify_progress(progress)

                if use_ai_correction and doc.ocr_text:
                    await self._process_correction(session, doc, teacher_id)
                else:
                    # Just mark as completed without AI
                    self._mark_document_completed(doc, teacher_id)

                processed += 1

            except Exception as e:
                # Record the failure on the document and continue the batch.
                logger.error(f"Failed to process document {doc.doc_token}: {e}")
                self._mark_document_failed(doc, str(e), teacher_id)

        # Update session status
        self.repo.update_session_status(session_id, teacher_id, SessionStatus.COMPLETED)

        # Final progress
        progress = ProcessingProgress(
            session_id=session_id,
            total_documents=total,
            processed_documents=processed,
            current_step="complete"
        )
        self._notify_progress(progress)

        logger.info(f"Completed processing session {session_id}: {processed}/{total} documents")
        return True
|
||||
|
||||
    async def _process_ocr(
        self,
        session_id: str,
        doc: PseudonymizedDocument,
        teacher_id: str,
        use_vision_ocr: bool = True
    ):
        """
        Run OCR on a document.

        Marks the document OCR_PROCESSING, loads its image from storage
        (preferring the redacted variant), extracts text, and stores the
        text and confidence on the document. This method never raises:
        failures are written into ``ocr_text`` as a bracketed marker and
        the document still advances to OCR_COMPLETED so the AI step can run.

        Args:
            session_id: Session ID
            doc: Document to process
            teacher_id: Teacher ID
            use_vision_ocr: True to use Vision-LLM (llama3.2-vision), False for TrOCR
        """
        # Update status
        doc.status = DocumentStatus.OCR_PROCESSING
        doc.processing_started_at = datetime.utcnow()
        self.db.commit()

        # Try to get document from storage (check both redacted and original).
        # Redacted first: privacy-by-design prefers the header-blanked scan.
        image_data = None
        for is_redacted in [True, False]:  # Prefer redacted version
            for ext in ["png", "jpg", "jpeg", "pdf"]:
                image_data = self.storage.get_document(
                    session_id, doc.doc_token, ext, is_redacted=is_redacted
                )
                if image_data:
                    logger.debug(f"Found document: {doc.doc_token[:8]}.{ext} (redacted={is_redacted})")
                    break
            if image_data:
                break

        if not image_data:
            logger.warning(f"No image found for document {doc.doc_token}")
            # Use placeholder OCR text for testing; confidence 0 signals "no OCR".
            doc.ocr_text = "[Kein Bild gefunden - Manuelle Eingabe erforderlich]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED
            self.db.commit()
            return

        # Call OCR service (Vision-LLM or TrOCR)
        try:
            if use_vision_ocr:
                # Use Vision-LLM (llama3.2-vision) - better for handwriting
                result = await self.vision_ocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    is_handwriting=True  # Assume handwriting for exams
                )
                ocr_method = "Vision-LLM"
            else:
                # Use TrOCR
                result = await self.trocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    detect_lines=True
                )
                ocr_method = "TrOCR"

            doc.ocr_text = result.text
            # Confidence is stored as an integer percent (0-100).
            doc.ocr_confidence = int(result.confidence * 100)
            doc.status = DocumentStatus.OCR_COMPLETED

            logger.info(
                f"OCR completed ({ocr_method}) for {doc.doc_token[:8]}: "
                f"{len(result.text)} chars, {result.confidence:.0%} confidence"
            )

        except Exception as e:
            logger.error(f"OCR failed for {doc.doc_token}: {e}")
            doc.ocr_text = f"[OCR Fehler: {str(e)[:100]}]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED  # Continue to AI anyway

        self.db.commit()
|
||||
|
||||
    async def _process_correction(
        self,
        session: ExamSession,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Run AI correction on a document.

        Builds rubrics from the session's question definitions and asks the
        correction service to grade ``doc.ocr_text``. Whatever happens, the
        document ends in COMPLETED state and the session's processed counter
        is incremented exactly once:
        - no rubrics: a manual-correction hint is stored, no score/grade;
        - AI failure: the error text is stored as feedback instead of a score.
        """
        doc.status = DocumentStatus.AI_PROCESSING
        self.db.commit()

        # Build rubrics from session questions
        rubrics = self._build_rubrics(session)

        if not rubrics:
            # No rubrics defined - use simple scoring
            doc.ai_feedback = "Keine Bewertungskriterien definiert. Manuelle Korrektur empfohlen."
            doc.ai_score = None
            doc.ai_grade = None
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            self.db.commit()

            # Update session stats
            session.processed_count += 1
            self.db.commit()
            return

        try:
            # Run AI correction
            result = await self.correction.correct_exam(
                doc_token=doc.doc_token,
                ocr_text=doc.ocr_text,
                rubrics=rubrics,
                subject=session.subject or "Allgemein"
            )

            # Save results, including a per-question breakdown for the UI.
            doc.ai_feedback = result.overall_feedback
            doc.ai_score = result.total_score
            doc.ai_grade = result.grade
            doc.ai_details = {
                "max_score": result.max_score,
                "processing_time_ms": result.processing_time_ms,
                "questions": [
                    {
                        "number": q.question_number,
                        "points": q.points_awarded,
                        "max_points": q.max_points,
                        "feedback": q.feedback,
                        "strengths": q.strengths,
                        "improvements": q.improvements
                    }
                    for q in result.question_results
                ]
            }
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()

            logger.info(
                f"Correction completed for {doc.doc_token[:8]}: "
                f"{result.total_score}/{result.max_score} ({result.grade})"
            )

        except Exception as e:
            logger.error(f"AI correction failed for {doc.doc_token}: {e}")
            doc.ai_feedback = f"KI-Korrektur fehlgeschlagen: {str(e)[:200]}"
            doc.status = DocumentStatus.COMPLETED  # Mark complete anyway
            doc.processing_completed_at = datetime.utcnow()

        # Update session stats (runs for both success and failure paths)
        session.processed_count += 1
        self.db.commit()
|
||||
|
||||
def _build_rubrics(self, session: ExamSession) -> List[QuestionRubric]:
|
||||
"""Build QuestionRubric list from session questions."""
|
||||
rubrics = []
|
||||
|
||||
if not session.questions:
|
||||
return rubrics
|
||||
|
||||
for i, q in enumerate(session.questions):
|
||||
rubric = QuestionRubric(
|
||||
question_number=q.get("number", i + 1),
|
||||
question_text=q.get("text", f"Frage {i + 1}"),
|
||||
max_points=q.get("points", 10),
|
||||
expected_answer=q.get("expected_answer", ""),
|
||||
grading_criteria=q.get("rubric", session.rubric or "")
|
||||
)
|
||||
rubrics.append(rubric)
|
||||
|
||||
return rubrics
|
||||
|
||||
def _mark_document_completed(
|
||||
self,
|
||||
doc: PseudonymizedDocument,
|
||||
teacher_id: str
|
||||
):
|
||||
"""Mark document as completed without AI correction."""
|
||||
doc.status = DocumentStatus.COMPLETED
|
||||
doc.processing_completed_at = datetime.utcnow()
|
||||
if not doc.ai_feedback:
|
||||
doc.ai_feedback = "Verarbeitung abgeschlossen (ohne KI-Korrektur)"
|
||||
self.db.commit()
|
||||
|
||||
# Update session stats
|
||||
if doc.session:
|
||||
doc.session.processed_count += 1
|
||||
self.db.commit()
|
||||
|
||||
def _mark_document_failed(
|
||||
self,
|
||||
doc: PseudonymizedDocument,
|
||||
error: str,
|
||||
teacher_id: str
|
||||
):
|
||||
"""Mark document as failed."""
|
||||
doc.status = DocumentStatus.FAILED
|
||||
doc.processing_error = error[:500]
|
||||
doc.processing_completed_at = datetime.utcnow()
|
||||
self.db.commit()
|
||||
|
||||
|
||||
# Background task function for FastAPI
async def process_session_background(
    session_id: str,
    teacher_id: str,
    db_url: str
):
    """
    Entry point for FastAPI background tasks.

    A background task outlives its originating request, so this opens a
    dedicated SQLAlchemy session instead of reusing the request-scoped one,
    and always closes it when processing ends.

    NOTE(review): ``db_url`` is currently unused - SessionLocal is configured
    at import time; confirm whether the parameter can be dropped.
    """
    from ..database import SessionLocal

    db = SessionLocal()
    try:
        await ProcessingService(db).process_session(session_id, teacher_id)
    finally:
        db.close()
|
||||
|
||||
|
||||
# NOTE: ProcessingService is intentionally NOT a singleton. Each instance is
# bound to a specific SQLAlchemy session, so a shared module-level instance
# would leak a stale session across requests. (A dead `_processing_service`
# global previously suggested singleton behavior that never existed.)


def get_processing_service(db: Session) -> ProcessingService:
    """Create a ProcessingService bound to the given DB session.

    Args:
        db: Request-scoped SQLAlchemy session.

    Returns:
        A fresh ProcessingService; results are never cached because the
        service holds the session for its whole lifetime.
    """
    return ProcessingService(db)
|
||||
376
backend/klausur/services/pseudonymizer.py
Normal file
376
backend/klausur/services/pseudonymizer.py
Normal file
@@ -0,0 +1,376 @@
|
||||
"""
|
||||
Pseudonymization Service for Klausurkorrektur.
|
||||
|
||||
Implements privacy-by-design principles:
|
||||
- QR code generation with random doc_tokens
|
||||
- Header redaction to remove personal data before OCR
|
||||
- No student identity data leaves the teacher's device
|
||||
|
||||
DSGVO Art. 4 Nr. 5 Compliance:
|
||||
The doc_token is a 128-bit random UUID that cannot be used to
|
||||
identify a student without the encrypted identity map.
|
||||
"""
|
||||
import uuid
|
||||
import io
|
||||
import logging
|
||||
from typing import List, Tuple, Optional
|
||||
from dataclasses import dataclass
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Optional imports (graceful fallback if not installed)
|
||||
try:
|
||||
import qrcode
|
||||
HAS_QRCODE = True
|
||||
except ImportError:
|
||||
HAS_QRCODE = False
|
||||
logger.warning("qrcode not installed - QR generation disabled")
|
||||
|
||||
try:
|
||||
import cv2
|
||||
import numpy as np
|
||||
HAS_CV2 = True
|
||||
except ImportError:
|
||||
HAS_CV2 = False
|
||||
logger.warning("opencv-python not installed - image processing disabled")
|
||||
|
||||
try:
|
||||
from pyzbar.pyzbar import decode as pyzbar_decode
|
||||
HAS_PYZBAR = True
|
||||
except ImportError:
|
||||
HAS_PYZBAR = False
|
||||
logger.warning("pyzbar not installed - QR reading disabled")
|
||||
|
||||
|
||||
@dataclass
class RedactionResult:
    """Result of header redaction."""
    # PNG bytes of the redacted page; on failure this is the unmodified input.
    redacted_image: bytes
    # Pixel height of the source image (0 when redaction failed).
    original_height: int
    # Pixel height of the blanked header strip (0 when redaction failed).
    redacted_height: int
    # False when redaction could not be applied and the original bytes were
    # returned - callers must check this before treating the image as clean.
    redaction_applied: bool
|
||||
|
||||
|
||||
@dataclass
class QRDetectionResult:
    """Result of QR code detection."""
    # Decoded UUID token, or None when no valid QR code was found.
    doc_token: Optional[str]
    # 1.0 for a successfully decoded+validated code, 0.0 otherwise.
    confidence: float
    bbox: Optional[Tuple[int, int, int, int]]  # x, y, width, height
|
||||
|
||||
|
||||
class PseudonymizationService:
|
||||
"""
|
||||
Service for document pseudonymization.
|
||||
|
||||
PRIVACY GUARANTEES:
|
||||
1. doc_tokens are cryptographically random (UUID4)
|
||||
2. No deterministic relationship between token and student
|
||||
3. Header redaction removes visible personal data
|
||||
4. Identity mapping is encrypted client-side
|
||||
"""
|
||||
|
||||
# Default header height to redact (in pixels, assuming 300 DPI scan)
|
||||
DEFAULT_HEADER_HEIGHT = 300 # ~1 inch / 2.5cm
|
||||
|
||||
@staticmethod
|
||||
def generate_doc_token() -> str:
|
||||
"""
|
||||
Generate a cryptographically random document token.
|
||||
|
||||
Uses UUID4 which provides 122 bits of randomness.
|
||||
This ensures no correlation between tokens is possible.
|
||||
"""
|
||||
return str(uuid.uuid4())
|
||||
|
||||
@staticmethod
|
||||
def generate_batch_tokens(count: int) -> List[str]:
|
||||
"""Generate multiple unique doc_tokens."""
|
||||
return [PseudonymizationService.generate_doc_token() for _ in range(count)]
|
||||
|
||||
    def generate_qr_code(
        self,
        doc_token: str,
        size: int = 200,
        border: int = 2
    ) -> bytes:
        """
        Generate a QR code image for a doc_token.

        Args:
            doc_token: The pseudonymization token
            size: Size of the QR code in pixels
            border: Border size in QR modules

        Returns:
            PNG image as bytes

        Raises:
            RuntimeError: If the optional `qrcode` dependency is not installed.
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")

        # Medium error correction (~15%) tolerates print/scan artifacts on paper.
        qr = qrcode.QRCode(
            version=1,
            error_correction=qrcode.constants.ERROR_CORRECT_M,
            box_size=10,
            border=border,
        )
        qr.add_data(doc_token)
        qr.make(fit=True)

        img = qr.make_image(fill_color="black", back_color="white")
        # Normalize to the requested pixel size so sheet layout is predictable.
        img = img.resize((size, size), Image.Resampling.LANCZOS)

        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()
|
||||
|
||||
    def generate_qr_sheet(
        self,
        doc_tokens: List[str],
        page_size: Tuple[int, int] = (2480, 3508),  # A4 at 300 DPI
        qr_size: int = 200,
        margin: int = 100,
        labels: Optional[List[str]] = None
    ) -> bytes:
        """
        Generate a printable sheet of QR codes.

        Codes are laid out on a grid; tokens that do not fit on one page are
        skipped with a warning (no second page is generated).

        Args:
            doc_tokens: List of tokens to generate QR codes for
            page_size: Page dimensions (width, height) in pixels
            qr_size: Size of each QR code
            margin: Page margin
            labels: Optional labels (e.g., "Nr. 1", "Nr. 2") - NO student names!

        Returns:
            PNG image of the full sheet

        Raises:
            RuntimeError: If the optional `qrcode` dependency is not installed
                (raised by generate_qr_code).
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")

        width, height = page_size
        img = Image.new('RGB', (width, height), 'white')
        draw = ImageDraw.Draw(img)

        # Calculate grid: each cell holds one QR code plus room for two text lines.
        usable_width = width - 2 * margin
        usable_height = height - 2 * margin
        cell_width = qr_size + 50
        cell_height = qr_size + 80  # Extra space for label

        cols = usable_width // cell_width
        rows = usable_height // cell_height

        # Try to load a font (fallback to default)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
        except (IOError, OSError):
            font = ImageFont.load_default()

        # Generate QR codes, filling the grid row by row.
        for i, token in enumerate(doc_tokens):
            if i >= cols * rows:
                logger.warning(f"Sheet full, skipping {len(doc_tokens) - i} tokens")
                break

            row = i // cols
            col = i % cols

            x = margin + col * cell_width
            y = margin + row * cell_height

            # Generate QR code
            qr_bytes = self.generate_qr_code(token, qr_size)
            qr_img = Image.open(io.BytesIO(qr_bytes))
            img.paste(qr_img, (x, y))

            # Add label (number only, NO names - privacy by design)
            label = labels[i] if labels and i < len(labels) else f"Nr. {i + 1}"
            draw.text((x, y + qr_size + 5), label, fill="black", font=font)

            # Add truncated token for manual verification against the database
            token_short = token[:8] + "..."
            draw.text((x, y + qr_size + 25), token_short, fill="gray", font=font)

        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()
|
||||
|
||||
    def detect_qr_code(self, image_bytes: bytes) -> QRDetectionResult:
        """
        Detect and decode QR code from an image.

        Only QR payloads that parse as a UUID are accepted, so unrelated QR
        codes on the page are ignored. Without the optional pyzbar
        dependency a no-match result is returned instead of raising.

        Args:
            image_bytes: Image data (PNG, JPEG, etc.)

        Returns:
            QRDetectionResult with doc_token if found
        """
        if not HAS_PYZBAR:
            # Graceful degradation: QR reading is an optional capability.
            return QRDetectionResult(
                doc_token=None,
                confidence=0.0,
                bbox=None
            )

        try:
            img = Image.open(io.BytesIO(image_bytes))

            # Decode QR codes (pyzbar may also find other barcode types)
            decoded = pyzbar_decode(img)

            for obj in decoded:
                if obj.type == 'QRCODE':
                    token = obj.data.decode('utf-8')
                    # Validate it looks like a UUID
                    try:
                        uuid.UUID(token)
                        rect = obj.rect
                        return QRDetectionResult(
                            doc_token=token,
                            confidence=1.0,
                            bbox=(rect.left, rect.top, rect.width, rect.height)
                        )
                    except ValueError:
                        # Not one of our tokens - keep scanning remaining codes.
                        continue

            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)

        except Exception as e:
            logger.error(f"QR detection failed: {e}")
            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)
|
||||
|
||||
    def redact_header(
        self,
        image_bytes: bytes,
        header_height: Optional[int] = None,
        fill_color: Tuple[int, int, int] = (255, 255, 255)
    ) -> RedactionResult:
        """
        Redact the header area of a scanned exam page.

        This removes the area where student name/class/date typically appears.
        The redaction is permanent - no original data is preserved.

        CAUTION: on any processing error the ORIGINAL, unredacted bytes are
        returned with ``redaction_applied=False`` - callers must check that
        flag before treating the output as anonymized.

        Args:
            image_bytes: Original scanned image
            header_height: Height in pixels to redact (None = use DEFAULT_HEADER_HEIGHT)
            fill_color: RGB color to fill redacted area (default: white)

        Returns:
            RedactionResult with redacted image
        """
        try:
            img = Image.open(io.BytesIO(image_bytes))
            width, height = img.size

            # Determine header height (class default assumes a 300 DPI scan)
            redact_height = header_height or self.DEFAULT_HEADER_HEIGHT

            # Create a copy and paint an opaque rectangle over the header strip
            redacted = img.copy()
            draw = ImageDraw.Draw(redacted)
            draw.rectangle([(0, 0), (width, redact_height)], fill=fill_color)

            # Save result
            buffer = io.BytesIO()
            redacted.save(buffer, format="PNG")

            return RedactionResult(
                redacted_image=buffer.getvalue(),
                original_height=height,
                redacted_height=redact_height,
                redaction_applied=True
            )

        except Exception as e:
            logger.error(f"Header redaction failed: {e}")
            # Fall back to the untouched input; redaction_applied signals this.
            return RedactionResult(
                redacted_image=image_bytes,
                original_height=0,
                redacted_height=0,
                redaction_applied=False
            )
|
||||
|
||||
    def smart_redact_header(
        self,
        image_bytes: bytes,
        preserve_qr: bool = True
    ) -> RedactionResult:
        """
        Smart header redaction that can keep the QR code visible.

        Blanks the top ~12% of the page; when a QR code lies inside that
        strip and ``preserve_qr`` is set, only the area around the code is
        blanked so the token stays machine-readable. Falls back to
        :meth:`redact_header` when OpenCV is unavailable or anything fails.

        Args:
            image_bytes: Original scanned image
            preserve_qr: If True, don't redact QR code areas

        Returns:
            RedactionResult with intelligently redacted image
        """
        if not HAS_CV2:
            # Fallback to simple redaction
            return self.redact_header(image_bytes)

        try:
            # Convert to OpenCV format (BGR ndarray)
            nparr = np.frombuffer(image_bytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            height, width = img.shape[:2]

            # Detect QR code position if present
            qr_result = self.detect_qr_code(image_bytes)

            # Calculate redaction area (top portion of page)
            # Typically header is in top 10-15% of page
            header_height = int(height * 0.12)

            # If QR code is in header area, adjust redaction
            if preserve_qr and qr_result.bbox:
                qr_x, qr_y, qr_w, qr_h = qr_result.bbox
                if qr_y < header_height:
                    # QR is in header - redact around it.
                    # Mask semantics: 255 = blank this pixel, 0 = keep it.
                    mask = np.ones((header_height, width), dtype=np.uint8) * 255

                    # Leave QR area unredacted (clamped to the header strip)
                    mask[max(0, qr_y):min(header_height, qr_y + qr_h),
                         max(0, qr_x):min(width, qr_x + qr_w)] = 0

                    # Apply white fill where mask is 255; the slice is a view,
                    # so this writes through into `img`.
                    img[:header_height][mask == 255] = [255, 255, 255]
                else:
                    # QR not in header - simple redaction
                    img[:header_height] = [255, 255, 255]
            else:
                # Simple header redaction
                img[:header_height] = [255, 255, 255]

            # Encode result
            _, buffer = cv2.imencode('.png', img)

            return RedactionResult(
                redacted_image=buffer.tobytes(),
                original_height=height,
                redacted_height=header_height,
                redaction_applied=True
            )

        except Exception as e:
            logger.error(f"Smart redaction failed: {e}")
            # Fall back to the fixed-height strategy rather than failing open.
            return self.redact_header(image_bytes)
|
||||
|
||||
|
||||
# Lazily-created module-wide singleton
_pseudonymizer: Optional[PseudonymizationService] = None


def get_pseudonymizer() -> PseudonymizationService:
    """Return the shared PseudonymizationService instance, creating it on first use."""
    global _pseudonymizer
    if _pseudonymizer is None:
        _pseudonymizer = PseudonymizationService()
    return _pseudonymizer
|
||||
502
backend/klausur/services/roster_parser.py
Normal file
502
backend/klausur/services/roster_parser.py
Normal file
@@ -0,0 +1,502 @@
|
||||
"""
|
||||
Roster Parser Service - Klassenbuch und Schuelerlisten parsen.
|
||||
|
||||
Unterstuetzt:
|
||||
- Klassenbuch-Fotos (OCR mit PaddleOCR)
|
||||
- PDF-Schuelerlisten (SchILD, ASV, etc.)
|
||||
- CSV-Dateien
|
||||
- Manuelle Eingabe
|
||||
|
||||
Privacy-First:
|
||||
- Alle Verarbeitung serverseitig (kein externer Upload)
|
||||
- Daten bleiben im Lehrer-Namespace
|
||||
"""
|
||||
|
||||
import re
|
||||
import csv
|
||||
import io
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Tuple
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# Optionale Imports
|
||||
try:
|
||||
from services.file_processor import get_file_processor, ProcessingResult
|
||||
HAS_OCR = True
|
||||
except ImportError:
|
||||
HAS_OCR = False
|
||||
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
HAS_PDF = True
|
||||
except ImportError:
|
||||
HAS_PDF = False
|
||||
|
||||
|
||||
@dataclass
class RosterEntry:
    """A single student row parsed from a roster source (Klassenbuch, PDF, CSV)."""
    first_name: str
    last_name: str
    # School-internal student number, when present in the source.
    student_number: Optional[str] = None
    parent_email: Optional[str] = None
    parent_phone: Optional[str] = None
    # Birth date as found in the source (dd.mm.yyyy-style string; presumably
    # not normalized here - TODO confirm against _parse_roster_line).
    birth_date: Optional[str] = None
    # Extra columns/fields that did not map to a known attribute.
    additional_data: Dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ParsedRoster:
    """Outcome of parsing one roster source."""
    entries: List[RosterEntry]
    source_type: str  # klassenbuch, pdf, csv
    # Parser/OCR confidence (0.0 when parsing was impossible; presumably 0..1
    # as reported by the OCR backend - confirm against file_processor).
    confidence: float
    # Human-readable problems encountered (e.g. OCR support missing).
    warnings: List[str] = field(default_factory=list)
    # Full raw text the entries were extracted from, for debugging/review.
    raw_text: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class NameMatch:
    """Result of matching one detected name against the roster."""
    # The name string as detected (e.g. from OCR).
    detected_name: str
    # Roster entry it was matched to, or None when no match was found.
    matched_entry: Optional[RosterEntry]
    confidence: float
    match_type: str  # exact, first_name, fuzzy, none
|
||||
|
||||
|
||||
class RosterParser:
|
||||
"""
|
||||
Parst Klassenlisten aus verschiedenen Quellen.
|
||||
|
||||
Beispiel:
|
||||
parser = RosterParser()
|
||||
|
||||
# Klassenbuch-Foto
|
||||
roster = parser.parse_klassenbuch_image(image_bytes)
|
||||
|
||||
# PDF-Liste
|
||||
roster = parser.parse_pdf_roster(pdf_bytes)
|
||||
|
||||
# Namen matchen
|
||||
matches = parser.match_first_names(
|
||||
detected=["Max", "Anna", "Tim"],
|
||||
roster=roster.entries
|
||||
)
|
||||
"""
|
||||
|
||||
    # Regex patterns for contact data embedded in roster lines.
    EMAIL_PATTERN = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
    # German phone numbers: +49 or leading 0, digit groups split by space/./-.
    PHONE_PATTERN = re.compile(r'(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}')
    # Dates in d.m.yy / dd.mm.yyyy form; groups capture day, month, year.
    DATE_PATTERN = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.(\d{2,4})\b')

    # Common German first names (sample set, used to validate OCR output).
    COMMON_FIRST_NAMES = {
        'max', 'anna', 'tim', 'lena', 'paul', 'marie', 'felix', 'emma',
        'leon', 'sophia', 'lukas', 'mia', 'jonas', 'hannah', 'elias', 'emilia',
        'ben', 'lea', 'noah', 'lina', 'finn', 'amelie', 'luis', 'laura',
        'moritz', 'clara', 'henry', 'julia', 'julian', 'emily', 'david', 'johanna',
        'niklas', 'charlotte', 'simon', 'maja', 'alexander', 'sarah', 'jan', 'lisa',
        'tom', 'nele', 'luca', 'sophie', 'erik', 'alina', 'fabian', 'paula',
        'philipp', 'luisa', 'tobias', 'melina', 'vincent', 'lara', 'maximilian', 'elena'
    }
|
||||
|
||||
def __init__(self):
|
||||
self.file_processor = get_file_processor() if HAS_OCR else None
|
||||
|
||||
# =========================================================================
|
||||
# KLASSENBUCH-FOTO PARSING
|
||||
# =========================================================================
|
||||
|
||||
def parse_klassenbuch_image(self, image_bytes: bytes) -> ParsedRoster:
|
||||
"""
|
||||
Parst ein Klassenbuch-Foto via OCR.
|
||||
|
||||
Args:
|
||||
image_bytes: Bild als Bytes (PNG, JPG)
|
||||
|
||||
Returns:
|
||||
ParsedRoster mit extrahierten Schuelerdaten
|
||||
"""
|
||||
if not HAS_OCR or not self.file_processor:
|
||||
return ParsedRoster(
|
||||
entries=[],
|
||||
source_type='klassenbuch',
|
||||
confidence=0.0,
|
||||
warnings=['OCR nicht verfuegbar (PaddleOCR nicht installiert)']
|
||||
)
|
||||
|
||||
# OCR ausfuehren
|
||||
result: ProcessingResult = self.file_processor.process_file(
|
||||
image_bytes,
|
||||
filename='klassenbuch.png',
|
||||
processing_mode='ocr_handwriting'
|
||||
)
|
||||
|
||||
# Text in Zeilen aufteilen
|
||||
lines = result.text.split('\n')
|
||||
entries = []
|
||||
warnings = []
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line or len(line) < 3:
|
||||
continue
|
||||
|
||||
entry = self._parse_roster_line(line)
|
||||
if entry:
|
||||
entries.append(entry)
|
||||
|
||||
return ParsedRoster(
|
||||
entries=entries,
|
||||
source_type='klassenbuch',
|
||||
confidence=result.confidence,
|
||||
warnings=warnings,
|
||||
raw_text=result.text
|
||||
)
|
||||
|
||||
def _parse_roster_line(self, line: str) -> Optional[RosterEntry]:
|
||||
"""Parst eine einzelne Zeile aus dem Klassenbuch."""
|
||||
# Bereinigen
|
||||
line = re.sub(r'\s+', ' ', line).strip()
|
||||
|
||||
# Nummer am Anfang entfernen (z.B. "1. Max Mustermann")
|
||||
line = re.sub(r'^\d+[\.\)\s]+', '', line)
|
||||
|
||||
# Email extrahieren
|
||||
email_match = self.EMAIL_PATTERN.search(line)
|
||||
email = email_match.group() if email_match else None
|
||||
if email:
|
||||
line = line.replace(email, '')
|
||||
|
||||
# Telefon extrahieren
|
||||
phone_match = self.PHONE_PATTERN.search(line)
|
||||
phone = phone_match.group() if phone_match else None
|
||||
if phone:
|
||||
line = line.replace(phone, '')
|
||||
|
||||
# Geburtsdatum extrahieren
|
||||
date_match = self.DATE_PATTERN.search(line)
|
||||
birth_date = date_match.group() if date_match else None
|
||||
if birth_date:
|
||||
line = line.replace(birth_date, '')
|
||||
|
||||
# Namen parsen (Rest der Zeile)
|
||||
line = re.sub(r'\s+', ' ', line).strip()
|
||||
if not line:
|
||||
return None
|
||||
|
||||
first_name, last_name = self._parse_name(line)
|
||||
if not first_name:
|
||||
return None
|
||||
|
||||
return RosterEntry(
|
||||
first_name=first_name,
|
||||
last_name=last_name or '',
|
||||
parent_email=email,
|
||||
parent_phone=phone,
|
||||
birth_date=birth_date
|
||||
)
|
||||
|
||||
def _parse_name(self, text: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""
|
||||
Parst einen Namen in Vor- und Nachname.
|
||||
|
||||
Formate:
|
||||
- "Max Mustermann"
|
||||
- "Mustermann, Max"
|
||||
- "Max M."
|
||||
- "Max"
|
||||
"""
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return None, None
|
||||
|
||||
# Format: "Nachname, Vorname"
|
||||
if ',' in text:
|
||||
parts = text.split(',', 1)
|
||||
last_name = parts[0].strip()
|
||||
first_name = parts[1].strip() if len(parts) > 1 else ''
|
||||
return first_name, last_name
|
||||
|
||||
# Format: "Vorname Nachname" oder "Vorname"
|
||||
parts = text.split()
|
||||
if len(parts) == 1:
|
||||
return parts[0], None
|
||||
elif len(parts) == 2:
|
||||
return parts[0], parts[1]
|
||||
else:
|
||||
# Erster Teil ist Vorname, Rest ist Nachname
|
||||
return parts[0], ' '.join(parts[1:])
|
||||
|
||||
# =========================================================================
|
||||
# PDF ROSTER PARSING
|
||||
# =========================================================================
|
||||
|
||||
def parse_pdf_roster(self, pdf_bytes: bytes) -> ParsedRoster:
|
||||
"""
|
||||
Parst eine PDF-Schuelerliste.
|
||||
|
||||
Unterstuetzt gaengige Schulverwaltungs-Exporte:
|
||||
- SchILD-NRW
|
||||
- ASV (Bayern)
|
||||
- Untis
|
||||
- Generic CSV-in-PDF
|
||||
"""
|
||||
if not HAS_PDF:
|
||||
return ParsedRoster(
|
||||
entries=[],
|
||||
source_type='pdf',
|
||||
confidence=0.0,
|
||||
warnings=['PDF-Parsing nicht verfuegbar (PyMuPDF nicht installiert)']
|
||||
)
|
||||
|
||||
entries = []
|
||||
warnings = []
|
||||
raw_text = ''
|
||||
|
||||
try:
|
||||
doc = fitz.open(stream=pdf_bytes, filetype='pdf')
|
||||
|
||||
for page in doc:
|
||||
text = page.get_text()
|
||||
raw_text += text + '\n'
|
||||
|
||||
# Tabellen extrahieren
|
||||
tables = page.find_tables()
|
||||
for table in tables:
|
||||
df = table.to_pandas()
|
||||
for _, row in df.iterrows():
|
||||
entry = self._parse_table_row(row.to_dict())
|
||||
if entry:
|
||||
entries.append(entry)
|
||||
|
||||
# Falls keine Tabellen: Zeilenweise parsen
|
||||
if not tables:
|
||||
for line in text.split('\n'):
|
||||
entry = self._parse_roster_line(line)
|
||||
if entry:
|
||||
entries.append(entry)
|
||||
|
||||
doc.close()
|
||||
|
||||
except Exception as e:
|
||||
warnings.append(f'PDF-Parsing Fehler: {str(e)}')
|
||||
|
||||
# Duplikate entfernen
|
||||
entries = self._deduplicate_entries(entries)
|
||||
|
||||
return ParsedRoster(
|
||||
entries=entries,
|
||||
source_type='pdf',
|
||||
confidence=0.9 if entries else 0.0,
|
||||
warnings=warnings,
|
||||
raw_text=raw_text
|
||||
)
|
||||
|
||||
def _parse_table_row(self, row: Dict) -> Optional[RosterEntry]:
|
||||
"""Parst eine Tabellenzeile in einen RosterEntry."""
|
||||
# Spalten-Mappings (verschiedene Formate)
|
||||
name_columns = ['name', 'schueler', 'schüler', 'student', 'nachname', 'last_name']
|
||||
first_name_columns = ['vorname', 'first_name', 'firstname']
|
||||
email_columns = ['email', 'e-mail', 'mail', 'eltern_email', 'parent_email']
|
||||
phone_columns = ['telefon', 'phone', 'tel', 'handy', 'mobile', 'eltern_tel']
|
||||
|
||||
first_name = None
|
||||
last_name = None
|
||||
email = None
|
||||
phone = None
|
||||
|
||||
for key, value in row.items():
|
||||
if not value or str(value).strip() == '':
|
||||
continue
|
||||
|
||||
key_lower = str(key).lower()
|
||||
value_str = str(value).strip()
|
||||
|
||||
if any(col in key_lower for col in first_name_columns):
|
||||
first_name = value_str
|
||||
elif any(col in key_lower for col in name_columns):
|
||||
# Kann "Vorname Nachname" oder nur "Nachname" sein
|
||||
if first_name:
|
||||
last_name = value_str
|
||||
else:
|
||||
first_name, last_name = self._parse_name(value_str)
|
||||
elif any(col in key_lower for col in email_columns):
|
||||
if self.EMAIL_PATTERN.match(value_str):
|
||||
email = value_str
|
||||
elif any(col in key_lower for col in phone_columns):
|
||||
phone = value_str
|
||||
|
||||
if not first_name:
|
||||
return None
|
||||
|
||||
return RosterEntry(
|
||||
first_name=first_name,
|
||||
last_name=last_name or '',
|
||||
parent_email=email,
|
||||
parent_phone=phone
|
||||
)
|
||||
|
||||
# =========================================================================
|
||||
# CSV PARSING
|
||||
# =========================================================================
|
||||
|
||||
def parse_csv_roster(self, csv_content: str) -> ParsedRoster:
|
||||
"""
|
||||
Parst eine CSV-Schuelerliste.
|
||||
|
||||
Args:
|
||||
csv_content: CSV als String
|
||||
|
||||
Returns:
|
||||
ParsedRoster
|
||||
"""
|
||||
entries = []
|
||||
warnings = []
|
||||
|
||||
try:
|
||||
# Delimiter erraten
|
||||
dialect = csv.Sniffer().sniff(csv_content[:1024])
|
||||
reader = csv.DictReader(io.StringIO(csv_content), dialect=dialect)
|
||||
|
||||
for row in reader:
|
||||
entry = self._parse_table_row(row)
|
||||
if entry:
|
||||
entries.append(entry)
|
||||
|
||||
except csv.Error as e:
|
||||
warnings.append(f'CSV-Parsing Fehler: {str(e)}')
|
||||
|
||||
# Fallback: Zeilenweise parsen
|
||||
for line in csv_content.split('\n'):
|
||||
entry = self._parse_roster_line(line)
|
||||
if entry:
|
||||
entries.append(entry)
|
||||
|
||||
return ParsedRoster(
|
||||
entries=entries,
|
||||
source_type='csv',
|
||||
confidence=0.95 if entries else 0.0,
|
||||
warnings=warnings,
|
||||
raw_text=csv_content
|
||||
)
|
||||
|
||||
# =========================================================================
|
||||
# NAME MATCHING
|
||||
# =========================================================================
|
||||
|
||||
def match_first_names(
|
||||
self,
|
||||
detected: List[str],
|
||||
roster: List[RosterEntry],
|
||||
threshold: float = 0.7
|
||||
) -> List[NameMatch]:
|
||||
"""
|
||||
Matched erkannte Vornamen zu Roster-Eintraegen.
|
||||
|
||||
Args:
|
||||
detected: Liste erkannter Vornamen (z.B. ["Max", "Anna"])
|
||||
roster: Vollstaendige Schuelerliste
|
||||
threshold: Mindest-Konfidenz fuer Fuzzy-Matching
|
||||
|
||||
Returns:
|
||||
Liste von NameMatch-Objekten
|
||||
"""
|
||||
matches = []
|
||||
used_entries = set()
|
||||
|
||||
for name in detected:
|
||||
name_lower = name.lower().strip()
|
||||
best_match = None
|
||||
best_confidence = 0.0
|
||||
match_type = 'none'
|
||||
|
||||
for i, entry in enumerate(roster):
|
||||
if i in used_entries:
|
||||
continue
|
||||
|
||||
entry_first_lower = entry.first_name.lower().strip()
|
||||
|
||||
# Exakter Match
|
||||
if name_lower == entry_first_lower:
|
||||
best_match = entry
|
||||
best_confidence = 1.0
|
||||
match_type = 'exact'
|
||||
used_entries.add(i)
|
||||
break
|
||||
|
||||
# Vorname-Anfang Match (z.B. "Max" matched "Maximilian")
|
||||
if entry_first_lower.startswith(name_lower) or name_lower.startswith(entry_first_lower):
|
||||
confidence = min(len(name_lower), len(entry_first_lower)) / max(len(name_lower), len(entry_first_lower))
|
||||
if confidence > best_confidence and confidence >= threshold:
|
||||
best_match = entry
|
||||
best_confidence = confidence
|
||||
match_type = 'first_name'
|
||||
|
||||
# Fuzzy Match
|
||||
ratio = SequenceMatcher(None, name_lower, entry_first_lower).ratio()
|
||||
if ratio > best_confidence and ratio >= threshold:
|
||||
best_match = entry
|
||||
best_confidence = ratio
|
||||
match_type = 'fuzzy'
|
||||
|
||||
if best_match and match_type != 'exact':
|
||||
# Entry als verwendet markieren
|
||||
for i, entry in enumerate(roster):
|
||||
if entry is best_match:
|
||||
used_entries.add(i)
|
||||
break
|
||||
|
||||
matches.append(NameMatch(
|
||||
detected_name=name,
|
||||
matched_entry=best_match,
|
||||
confidence=best_confidence,
|
||||
match_type=match_type
|
||||
))
|
||||
|
||||
return matches
|
||||
|
||||
# =========================================================================
|
||||
# HELPERS
|
||||
# =========================================================================
|
||||
|
||||
def _deduplicate_entries(self, entries: List[RosterEntry]) -> List[RosterEntry]:
|
||||
"""Entfernt Duplikate basierend auf Vor- und Nachname."""
|
||||
seen = set()
|
||||
unique = []
|
||||
|
||||
for entry in entries:
|
||||
key = (entry.first_name.lower(), entry.last_name.lower())
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
unique.append(entry)
|
||||
|
||||
return unique
|
||||
|
||||
def validate_entry(self, entry: RosterEntry) -> List[str]:
|
||||
"""Validiert einen RosterEntry und gibt Warnungen zurueck."""
|
||||
warnings = []
|
||||
|
||||
# Vorname pruefen
|
||||
if not entry.first_name:
|
||||
warnings.append('Kein Vorname')
|
||||
elif len(entry.first_name) < 2:
|
||||
warnings.append('Vorname zu kurz')
|
||||
|
||||
# Email validieren
|
||||
if entry.parent_email and not self.EMAIL_PATTERN.match(entry.parent_email):
|
||||
warnings.append('Ungueltige Email-Adresse')
|
||||
|
||||
return warnings
|
||||
|
||||
|
||||
# Lazily-created module-level singleton.
_roster_parser: Optional[RosterParser] = None


def get_roster_parser() -> RosterParser:
    """Return the process-wide RosterParser instance, creating it on first call."""
    global _roster_parser
    parser = _roster_parser
    if parser is None:
        parser = RosterParser()
        _roster_parser = parser
    return parser
|
||||
613
backend/klausur/services/school_resolver.py
Normal file
613
backend/klausur/services/school_resolver.py
Normal file
@@ -0,0 +1,613 @@
|
||||
"""
|
||||
School Resolver Service - Schul-Auswahl und Klassen-Erstellung.
|
||||
|
||||
Funktionen:
|
||||
- Bundesland -> Schulform -> Schule Kaskade
|
||||
- Auto-Erstellung von Klassen aus erkannten Daten
|
||||
- Integration mit Go School Service (Port 8084)
|
||||
|
||||
Privacy:
|
||||
- Schuldaten sind Stammdaten (kein DSGVO-Problem)
|
||||
- Schueler-Erstellung nur im Lehrer-Namespace
|
||||
"""
|
||||
|
||||
import httpx
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Any
|
||||
from enum import Enum
|
||||
|
||||
|
||||
# ============================================================================
# CONSTANTS
# ============================================================================

# German federal states, keyed by the official two-letter code.
BUNDESLAENDER = {
    "BW": "Baden-Wuerttemberg",
    "BY": "Bayern",
    "BE": "Berlin",
    "BB": "Brandenburg",
    "HB": "Bremen",
    "HH": "Hamburg",
    "HE": "Hessen",
    "MV": "Mecklenburg-Vorpommern",
    "NI": "Niedersachsen",
    "NW": "Nordrhein-Westfalen",
    "RP": "Rheinland-Pfalz",
    "SL": "Saarland",
    "SN": "Sachsen",
    "ST": "Sachsen-Anhalt",
    "SH": "Schleswig-Holstein",
    "TH": "Thueringen"
}

# School types: display name, covered grade levels, and short code.
SCHULFORMEN = {
    "grundschule": {
        "name": "Grundschule",
        "grades": [1, 2, 3, 4],
        "short": "GS"
    },
    "hauptschule": {
        "name": "Hauptschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "HS"
    },
    "realschule": {
        "name": "Realschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "RS"
    },
    "gymnasium": {
        "name": "Gymnasium",
        "grades": [5, 6, 7, 8, 9, 10, 11, 12, 13],
        "short": "GYM"
    },
    "gesamtschule": {
        "name": "Gesamtschule",
        "grades": [5, 6, 7, 8, 9, 10, 11, 12, 13],
        "short": "IGS"
    },
    "oberschule": {
        "name": "Oberschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "OBS"
    },
    "sekundarschule": {
        "name": "Sekundarschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "SEK"
    },
    "foerderschule": {
        "name": "Foerderschule",
        "grades": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "short": "FS"
    },
    "berufsschule": {
        "name": "Berufsschule",
        "grades": [10, 11, 12, 13],
        "short": "BS"
    },
    "fachoberschule": {
        "name": "Fachoberschule",
        "grades": [11, 12, 13],
        "short": "FOS"
    }
}

# School subjects with their standard display names and short codes.
# NOTE(review): "Sp" is used as the short code for both "spanisch" and
# "sport" — confirm whether the codes must be unique.
FAECHER = {
    "mathematik": {"name": "Mathematik", "short": "Ma"},
    "deutsch": {"name": "Deutsch", "short": "De"},
    "englisch": {"name": "Englisch", "short": "En"},
    "franzoesisch": {"name": "Franzoesisch", "short": "Fr"},
    "spanisch": {"name": "Spanisch", "short": "Sp"},
    "latein": {"name": "Latein", "short": "La"},
    "physik": {"name": "Physik", "short": "Ph"},
    "chemie": {"name": "Chemie", "short": "Ch"},
    "biologie": {"name": "Biologie", "short": "Bio"},
    "geschichte": {"name": "Geschichte", "short": "Ge"},
    "erdkunde": {"name": "Erdkunde", "short": "Ek"},
    "politik": {"name": "Politik", "short": "Po"},
    "wirtschaft": {"name": "Wirtschaft", "short": "Wi"},
    "kunst": {"name": "Kunst", "short": "Ku"},
    "musik": {"name": "Musik", "short": "Mu"},
    "sport": {"name": "Sport", "short": "Sp"},
    "religion": {"name": "Religion", "short": "Re"},
    "ethik": {"name": "Ethik", "short": "Et"},
    "informatik": {"name": "Informatik", "short": "If"},
    "sachunterricht": {"name": "Sachunterricht", "short": "SU"}
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DATA CLASSES
|
||||
# ============================================================================
|
||||
|
||||
@dataclass
class School:
    """A school (master data only — no student PII)."""
    id: str
    name: str
    bundesland: str  # federal-state code, e.g. "NI"
    schulform: str   # school-type key, e.g. "grundschule"
    address: Optional[str] = None
    city: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class SchoolClass:
    """A school class."""
    id: str
    school_id: str
    name: str  # e.g. "3a"
    grade_level: int  # e.g. 3
    school_year: str  # e.g. "2025/2026"
    teacher_id: str
    student_count: int = 0
|
||||
|
||||
|
||||
@dataclass
class Student:
    """A student (master data; no PII in the exam-correction context)."""
    id: str
    class_id: str
    first_name: str
    last_name: str
    student_number: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class DetectedClassInfo:
    """Class information detected from exam documents."""
    class_name: str  # e.g. "3a"
    grade_level: Optional[int] = None  # e.g. 3
    subject: Optional[str] = None
    date: Optional[str] = None
    # Detected students; dict keys as supplied by the detector
    # (e.g. "firstName"/"lastName" — see SchoolResolver._bulk_create_students).
    students: List[Dict[str, str]] = field(default_factory=list)
    confidence: float = 0.0
|
||||
|
||||
|
||||
@dataclass
class SchoolContext:
    """Complete school context for one teacher."""
    teacher_id: str
    school: Optional[School] = None
    classes: List[SchoolClass] = field(default_factory=list)
    current_school_year: str = "2025/2026"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# SCHOOL RESOLVER
|
||||
# ============================================================================
|
||||
|
||||
class SchoolResolver:
    """
    Manages school and class context.

    Talks to the Go School Service over HTTP; every remote call degrades
    gracefully to a local in-memory fallback when the service is unreachable.

    Example:
        resolver = SchoolResolver()

        # School cascade
        schools = await resolver.search_schools("Niedersachsen", "Grundschule", "Jever")

        # Auto-create a class
        class_obj = await resolver.auto_create_class(
            teacher_id="teacher-123",
            school_id="school-456",
            detected_info=DetectedClassInfo(
                class_name="3a",
                students=[{"firstName": "Max"}, {"firstName": "Anna"}]
            )
        )
    """

    def __init__(self):
        # Base URL of the school service (overridable via environment).
        self.school_service_url = os.getenv(
            "SCHOOL_SERVICE_URL",
            "http://school-service:8084"
        )
        # Local in-memory fallback used when the service is unreachable.
        # NOTE(review): this state is per-instance and not persisted.
        self._local_schools: Dict[str, School] = {}
        self._local_classes: Dict[str, SchoolClass] = {}

    # =========================================================================
    # BUNDESLAND / SCHULFORM LOOKUP
    # =========================================================================

    def get_bundeslaender(self) -> Dict[str, str]:
        """Return all federal states (code -> name)."""
        return BUNDESLAENDER

    def get_schulformen(self) -> Dict[str, Dict]:
        """Return all school types."""
        return SCHULFORMEN

    def get_faecher(self) -> Dict[str, Dict]:
        """Return all subjects."""
        return FAECHER

    def get_grades_for_schulform(self, schulform: str) -> List[int]:
        """Return the grade levels for a school type (all levels if unknown)."""
        if schulform in SCHULFORMEN:
            return SCHULFORMEN[schulform]["grades"]
        return list(range(1, 14))  # default: all grade levels 1-13

    def detect_grade_from_class_name(self, class_name: str) -> Optional[int]:
        """
        Detect the grade level from a class name.

        Examples:
            - "3a" -> 3
            - "10b" -> 10
            - "Q1" -> 11
            - "EF" -> 10

        Returns None when no grade can be derived.
        """
        import re

        # Standard format: number + optional letter.
        match = re.match(r'^(\d{1,2})[a-zA-Z]?$', class_name)
        if match:
            return int(match.group(1))

        # Upper-secondary (Oberstufe) naming schemes.
        upper_grades = {
            'ef': 10, 'e': 10,
            'q1': 11, 'q2': 12,
            'k1': 11, 'k2': 12,
            '11': 11, '12': 12, '13': 13
        }

        class_lower = class_name.lower()
        if class_lower in upper_grades:
            return upper_grades[class_lower]

        return None

    def normalize_subject(self, detected_subject: str) -> Optional[str]:
        """
        Normalise a detected subject name to a FAECHER key.

        Example: "Mathe" -> "mathematik"

        Returns None when no mapping is found.
        """
        subject_lower = detected_subject.lower().strip()

        # Direct match against the canonical keys.
        if subject_lower in FAECHER:
            return subject_lower

        # Common abbreviations and variants.
        subject_aliases = {
            'mathe': 'mathematik',
            'bio': 'biologie',
            'phy': 'physik',
            'che': 'chemie',
            'geo': 'erdkunde',
            'geographie': 'erdkunde',
            'powi': 'politik',
            'sowi': 'politik',
            'reli': 'religion',
            'info': 'informatik',
            'su': 'sachunterricht'
        }

        if subject_lower in subject_aliases:
            return subject_aliases[subject_lower]

        # Loose substring match (first key whose prefix overlaps wins —
        # dict insertion order decides ties).
        for key in FAECHER:
            if key.startswith(subject_lower) or subject_lower.startswith(key[:3]):
                return key

        return None

    # =========================================================================
    # SCHOOL SERVICE INTEGRATION
    # =========================================================================

    async def search_schools(
        self,
        bundesland: Optional[str] = None,
        schulform: Optional[str] = None,
        name_query: Optional[str] = None,
        limit: int = 20
    ) -> List[School]:
        """
        Search schools in the School Service.

        Args:
            bundesland: federal-state code (e.g. "NI")
            schulform: school-type key (e.g. "grundschule")
            name_query: search term for the school name
            limit: max number of results

        Returns:
            Matching schools; empty list on service error (school can then
            be created manually).
        """
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                params = {}
                if bundesland:
                    params['state'] = bundesland
                if schulform:
                    params['type'] = schulform
                if name_query:
                    params['q'] = name_query
                params['limit'] = limit

                response = await client.get(
                    f"{self.school_service_url}/api/schools",
                    params=params
                )

                if response.status_code == 200:
                    data = response.json()
                    return [
                        School(
                            id=s['id'],
                            name=s['name'],
                            bundesland=s.get('state', bundesland or ''),
                            schulform=s.get('type', schulform or ''),
                            address=s.get('address'),
                            city=s.get('city')
                        )
                        for s in data.get('schools', [])
                    ]

        except Exception as e:
            print(f"[SchoolResolver] Service error: {e}")

        # Fallback: empty list (the school can be created manually).
        return []

    async def get_or_create_school(
        self,
        teacher_id: str,
        bundesland: str,
        schulform: str,
        school_name: str,
        city: Optional[str] = None
    ) -> School:
        """
        Get or create a school.

        Returns an existing school when one matches; otherwise creates a
        new one via the service, falling back to a local in-memory record
        when the service is unreachable.
        """
        # Search first.
        existing = await self.search_schools(
            bundesland=bundesland,
            schulform=schulform,
            name_query=school_name,
            limit=1
        )

        if existing:
            return existing[0]

        # Create via the service.
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/schools",
                    json={
                        "name": school_name,
                        "state": bundesland,
                        "type": schulform,
                        "city": city,
                        "created_by": teacher_id
                    }
                )

                if response.status_code in (200, 201):
                    data = response.json()
                    return School(
                        id=data['id'],
                        name=school_name,
                        bundesland=bundesland,
                        schulform=schulform,
                        city=city
                    )

        except Exception as e:
            print(f"[SchoolResolver] Create school error: {e}")

        # Fallback: create a local in-memory school.
        import uuid
        school_id = str(uuid.uuid4())
        school = School(
            id=school_id,
            name=school_name,
            bundesland=bundesland,
            schulform=schulform,
            city=city
        )
        self._local_schools[school_id] = school
        return school

    # =========================================================================
    # CLASS MANAGEMENT
    # =========================================================================

    async def get_classes_for_teacher(
        self,
        teacher_id: str,
        school_id: Optional[str] = None
    ) -> List[SchoolClass]:
        """Return all classes of a teacher (local fallback on service error)."""
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                params = {"teacher_id": teacher_id}
                if school_id:
                    params["school_id"] = school_id

                response = await client.get(
                    f"{self.school_service_url}/api/classes",
                    params=params
                )

                if response.status_code == 200:
                    data = response.json()
                    return [
                        SchoolClass(
                            id=c['id'],
                            school_id=c.get('school_id', ''),
                            name=c['name'],
                            grade_level=c.get('grade_level', 0),
                            school_year=c.get('school_year', '2025/2026'),
                            teacher_id=teacher_id,
                            student_count=c.get('student_count', 0)
                        )
                        for c in data.get('classes', [])
                    ]

        except Exception as e:
            print(f"[SchoolResolver] Get classes error: {e}")

        # NOTE(review): fallback returns ALL local classes, not filtered by
        # teacher_id/school_id — confirm intended.
        return list(self._local_classes.values())

    async def auto_create_class(
        self,
        teacher_id: str,
        school_id: str,
        detected_info: DetectedClassInfo,
        school_year: str = "2025/2026"
    ) -> SchoolClass:
        """
        Auto-create a class from detected data.

        Args:
            teacher_id: teacher ID
            school_id: school ID
            detected_info: information detected from exam documents
            school_year: school year
        """
        # Prefer the detected grade level, then derive it from the class
        # name, defaulting to 0 when neither is available.
        grade_level = detected_info.grade_level or self.detect_grade_from_class_name(
            detected_info.class_name
        ) or 0

        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/classes",
                    json={
                        "school_id": school_id,
                        "name": detected_info.class_name,
                        "grade_level": grade_level,
                        "school_year": school_year,
                        "teacher_id": teacher_id
                    }
                )

                if response.status_code in (200, 201):
                    data = response.json()
                    class_id = data['id']

                    # Add the detected students to the new class.
                    if detected_info.students:
                        await self._bulk_create_students(
                            class_id,
                            detected_info.students
                        )

                    return SchoolClass(
                        id=class_id,
                        school_id=school_id,
                        name=detected_info.class_name,
                        grade_level=grade_level,
                        school_year=school_year,
                        teacher_id=teacher_id,
                        student_count=len(detected_info.students)
                    )

        except Exception as e:
            print(f"[SchoolResolver] Create class error: {e}")

        # Fallback: local in-memory class (students are not stored locally).
        import uuid
        class_id = str(uuid.uuid4())
        school_class = SchoolClass(
            id=class_id,
            school_id=school_id,
            name=detected_info.class_name,
            grade_level=grade_level,
            school_year=school_year,
            teacher_id=teacher_id,
            student_count=len(detected_info.students)
        )
        self._local_classes[class_id] = school_class
        return school_class

    async def _bulk_create_students(
        self,
        class_id: str,
        students: List[Dict[str, str]]
    ) -> List[Student]:
        """Create multiple students at once (empty list on service error)."""
        created = []

        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/classes/{class_id}/students/bulk",
                    json={
                        "students": [
                            {
                                # Accept both camelCase and snake_case input keys.
                                "first_name": s.get("firstName", s.get("first_name", "")),
                                "last_name": s.get("lastName", s.get("last_name", ""))
                            }
                            for s in students
                        ]
                    }
                )

                if response.status_code in (200, 201):
                    data = response.json()
                    created = [
                        Student(
                            id=s['id'],
                            class_id=class_id,
                            first_name=s['first_name'],
                            last_name=s.get('last_name', '')
                        )
                        for s in data.get('students', [])
                    ]

        except Exception as e:
            print(f"[SchoolResolver] Bulk create students error: {e}")

        return created

    # =========================================================================
    # CONTEXT MANAGEMENT
    # =========================================================================

    async def get_teacher_context(self, teacher_id: str) -> SchoolContext:
        """
        Return the complete school context of a teacher.

        Includes the school, the teacher's classes, and the current
        school year (the SchoolContext default).
        """
        context = SchoolContext(teacher_id=teacher_id)

        # Load classes.
        classes = await self.get_classes_for_teacher(teacher_id)
        context.classes = classes

        # Derive the school from the first class.
        # NOTE(review): search_schools() is called without filters here and
        # scanned linearly for the matching ID — a direct lookup endpoint
        # would be cheaper if the service offers one.
        if classes and classes[0].school_id:
            schools = await self.search_schools()
            for school in schools:
                if school.id == classes[0].school_id:
                    context.school = school
                    break

        return context
|
||||
|
||||
|
||||
# Lazily-created module-level singleton.
_school_resolver: Optional[SchoolResolver] = None


def get_school_resolver() -> SchoolResolver:
    """Return the process-wide SchoolResolver instance, creating it on first call."""
    global _school_resolver
    resolver = _school_resolver
    if resolver is None:
        resolver = SchoolResolver()
        _school_resolver = resolver
    return resolver
|
||||
197
backend/klausur/services/storage_service.py
Normal file
197
backend/klausur/services/storage_service.py
Normal file
@@ -0,0 +1,197 @@
|
||||
"""
|
||||
Storage Service for Klausur Documents.
|
||||
|
||||
PRIVACY BY DESIGN:
|
||||
- Documents stored with doc_token as identifier (not student names)
|
||||
- Organized by session_id/doc_token for teacher isolation
|
||||
- Auto-cleanup when retention period expires
|
||||
"""
|
||||
import os
|
||||
import io
|
||||
import logging
|
||||
from typing import Optional, BinaryIO
|
||||
from pathlib import Path
|
||||
from minio import Minio
|
||||
from minio.error import S3Error
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class KlausurStorageService:
|
||||
"""
|
||||
MinIO/S3 Storage Service for exam documents.
|
||||
|
||||
Structure:
|
||||
klausur-exams/
|
||||
{session_id}/
|
||||
{doc_token}.{ext}
|
||||
{doc_token}_redacted.{ext} # After header redaction
|
||||
"""
|
||||
|
||||
    def __init__(self):
        """Read MinIO connection settings from the environment (lazy connect)."""
        self.endpoint = os.getenv("MINIO_ENDPOINT", "minio:9000")
        # NOTE(review): defaults are development credentials; production
        # deployments must override them via environment variables.
        self.access_key = os.getenv("MINIO_ROOT_USER", "breakpilot_dev")
        self.secret_key = os.getenv("MINIO_ROOT_PASSWORD", "breakpilot_dev_123")
        self.secure = os.getenv("MINIO_SECURE", "false").lower() == "true"
        self.bucket_name = os.getenv("KLAUSUR_BUCKET", "klausur-exams")

        # Client is created lazily on first access of the `client` property.
        self._client: Optional[Minio] = None
|
||||
|
||||
    @property
    def client(self) -> Minio:
        """Lazy-init MinIO client; ensures the bucket exists on first use."""
        if self._client is None:
            self._client = Minio(
                self.endpoint,
                access_key=self.access_key,
                secret_key=self.secret_key,
                secure=self.secure
            )
            # _client is assigned first because _ensure_bucket uses
            # self._client directly (avoids re-entering this property).
            self._ensure_bucket()
        return self._client
|
||||
|
||||
def _ensure_bucket(self):
|
||||
"""Create bucket if it doesn't exist."""
|
||||
try:
|
||||
if not self._client.bucket_exists(self.bucket_name):
|
||||
self._client.make_bucket(self.bucket_name)
|
||||
logger.info(f"Created Klausur bucket: {self.bucket_name}")
|
||||
except S3Error as e:
|
||||
logger.warning(f"MinIO bucket check failed: {e}")
|
||||
|
||||
def upload_document(
|
||||
self,
|
||||
session_id: str,
|
||||
doc_token: str,
|
||||
file_data: bytes,
|
||||
file_extension: str = "png",
|
||||
is_redacted: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
Upload exam document to storage.
|
||||
|
||||
Args:
|
||||
session_id: Exam session ID
|
||||
doc_token: Pseudonymized document token
|
||||
file_data: Document binary data
|
||||
file_extension: File extension (png, jpg, pdf)
|
||||
is_redacted: Whether this is the redacted version
|
||||
|
||||
Returns:
|
||||
Object path in storage
|
||||
"""
|
||||
suffix = "_redacted" if is_redacted else ""
|
||||
object_name = f"{session_id}/{doc_token}{suffix}.{file_extension}"
|
||||
|
||||
# Determine content type
|
||||
content_types = {
|
||||
"png": "image/png",
|
||||
"jpg": "image/jpeg",
|
||||
"jpeg": "image/jpeg",
|
||||
"pdf": "application/pdf",
|
||||
}
|
||||
content_type = content_types.get(file_extension.lower(), "application/octet-stream")
|
||||
|
||||
try:
|
||||
self.client.put_object(
|
||||
bucket_name=self.bucket_name,
|
||||
object_name=object_name,
|
||||
data=io.BytesIO(file_data),
|
||||
length=len(file_data),
|
||||
content_type=content_type
|
||||
)
|
||||
logger.info(f"Uploaded document: {object_name}")
|
||||
return object_name
|
||||
|
||||
except S3Error as e:
|
||||
logger.error(f"Failed to upload document: {e}")
|
||||
raise
|
||||
|
||||
def get_document(
|
||||
self,
|
||||
session_id: str,
|
||||
doc_token: str,
|
||||
file_extension: str = "png",
|
||||
is_redacted: bool = False
|
||||
) -> Optional[bytes]:
|
||||
"""
|
||||
Download exam document from storage.
|
||||
|
||||
Args:
|
||||
session_id: Exam session ID
|
||||
doc_token: Pseudonymized document token
|
||||
file_extension: File extension
|
||||
is_redacted: Whether to get the redacted version
|
||||
|
||||
Returns:
|
||||
Document binary data or None if not found
|
||||
"""
|
||||
suffix = "_redacted" if is_redacted else ""
|
||||
object_name = f"{session_id}/{doc_token}{suffix}.{file_extension}"
|
||||
|
||||
try:
|
||||
response = self.client.get_object(self.bucket_name, object_name)
|
||||
data = response.read()
|
||||
response.close()
|
||||
response.release_conn()
|
||||
return data
|
||||
|
||||
except S3Error as e:
|
||||
if e.code == "NoSuchKey":
|
||||
logger.warning(f"Document not found: {object_name}")
|
||||
return None
|
||||
logger.error(f"Failed to get document: {e}")
|
||||
raise
|
||||
|
||||
def delete_session_documents(self, session_id: str) -> int:
|
||||
"""
|
||||
Delete all documents for a session.
|
||||
|
||||
Args:
|
||||
session_id: Exam session ID
|
||||
|
||||
Returns:
|
||||
Number of deleted objects
|
||||
"""
|
||||
deleted_count = 0
|
||||
prefix = f"{session_id}/"
|
||||
|
||||
try:
|
||||
objects = self.client.list_objects(self.bucket_name, prefix=prefix)
|
||||
for obj in objects:
|
||||
self.client.remove_object(self.bucket_name, obj.object_name)
|
||||
deleted_count += 1
|
||||
logger.debug(f"Deleted: {obj.object_name}")
|
||||
|
||||
logger.info(f"Deleted {deleted_count} documents for session {session_id}")
|
||||
return deleted_count
|
||||
|
||||
except S3Error as e:
|
||||
logger.error(f"Failed to delete session documents: {e}")
|
||||
raise
|
||||
|
||||
def document_exists(
|
||||
self,
|
||||
session_id: str,
|
||||
doc_token: str,
|
||||
file_extension: str = "png"
|
||||
) -> bool:
|
||||
"""Check if document exists in storage."""
|
||||
object_name = f"{session_id}/{doc_token}.{file_extension}"
|
||||
try:
|
||||
self.client.stat_object(self.bucket_name, object_name)
|
||||
return True
|
||||
except S3Error:
|
||||
return False
|
||||
|
||||
|
||||
# Singleton instance
_storage_service: Optional[KlausurStorageService] = None


def get_storage_service() -> KlausurStorageService:
    """Return the shared KlausurStorageService, instantiating it on first call."""
    global _storage_service
    service = _storage_service
    if service is None:
        service = KlausurStorageService()
        _storage_service = service
    return service
||||
214
backend/klausur/services/trocr_client.py
Normal file
214
backend/klausur/services/trocr_client.py
Normal file
@@ -0,0 +1,214 @@
|
||||
"""
|
||||
TrOCR Client - Connects to external TrOCR service (Mac Mini).
|
||||
|
||||
This client forwards OCR requests to the TrOCR service running on
|
||||
the Mac Mini, enabling handwriting recognition without requiring
|
||||
local GPU/ML dependencies.
|
||||
|
||||
Privacy: Images are sent over the local network only - no cloud.
|
||||
"""
|
||||
import os
|
||||
import httpx
|
||||
import logging
|
||||
from typing import Optional, List, Dict
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Mac Mini TrOCR Service URL
|
||||
TROCR_SERVICE_URL = os.environ.get(
|
||||
"TROCR_SERVICE_URL",
|
||||
"http://192.168.178.163:8084"
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class OCRResult:
    """Result of one OCR request forwarded to the remote TrOCR service."""
    text: str                  # extracted text (line breaks preserved by the service)
    confidence: float          # confidence score as reported by the service
    processing_time_ms: int    # server-side processing time in milliseconds
    device: str = "remote"     # where inference ran; "remote" for this client
|
||||
|
||||
|
||||
class TrOCRClient:
    """
    Async HTTP client for the external TrOCR service (Mac Mini).

    Forwards images over the local network; no cloud services involved.

    Usage:
        client = TrOCRClient()

        # Check if service is available
        if await client.is_available():
            result = await client.extract_text(image_bytes)
            print(result.text)
    """

    def __init__(self, base_url: Optional[str] = None):
        # Falls back to the module-level TROCR_SERVICE_URL (env-configurable).
        self.base_url = base_url or TROCR_SERVICE_URL
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create the HTTP client (recreated if it was closed)."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                base_url=self.base_url,
                timeout=300.0  # 5 min timeout for model loading
            )
        return self._client

    async def close(self):
        """Close the HTTP client (safe to call when already closed/never opened)."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    async def is_available(self) -> bool:
        """Check if the TrOCR service answers its /health endpoint.

        Returns False (never raises) on any connection/timeout error.
        """
        try:
            client = await self._get_client()
            # Short timeout here overrides the client's 300 s default.
            response = await client.get("/health", timeout=5.0)
            return response.status_code == 200
        except Exception as e:
            logger.warning(f"TrOCR service not available: {e}")
            return False

    async def get_status(self) -> Dict:
        """Get TrOCR service status; returns an error dict instead of raising."""
        try:
            client = await self._get_client()
            response = await client.get("/api/v1/status")
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Failed to get TrOCR status: {e}")
            return {
                "status": "unavailable",
                "error": str(e)
            }

    async def extract_text(
        self,
        image_data: bytes,
        filename: str = "image.png",
        detect_lines: bool = True
    ) -> OCRResult:
        """
        Extract text from an image using TrOCR.

        Args:
            image_data: Raw image bytes
            filename: Original filename
            detect_lines: Whether to detect individual lines

        Returns:
            OCRResult with extracted text

        Raises:
            httpx.TimeoutException: If the request times out (model may be loading).
            Exception: Any other HTTP/transport failure is logged and re-raised.
        """
        try:
            client = await self._get_client()

            # NOTE: content type is always sent as image/png regardless of
            # the actual filename extension.
            files = {"file": (filename, image_data, "image/png")}
            params = {"detect_lines": str(detect_lines).lower()}

            response = await client.post(
                "/api/v1/extract",
                files=files,
                params=params
            )
            response.raise_for_status()

            data = response.json()

            # Defensive defaults in case the service omits fields.
            return OCRResult(
                text=data.get("text", ""),
                confidence=data.get("confidence", 0.0),
                processing_time_ms=data.get("processing_time_ms", 0),
                device=data.get("device", "remote")
            )

        except httpx.TimeoutException:
            logger.error("TrOCR request timed out (model may be loading)")
            raise
        except Exception as e:
            logger.error(f"TrOCR extraction failed: {e}")
            raise

    async def batch_extract(
        self,
        images: List[bytes],
        filenames: Optional[List[str]] = None,
        detect_lines: bool = True
    ) -> List[OCRResult]:
        """
        Extract text from multiple images in a single request.

        Args:
            images: List of image bytes
            filenames: Optional list of filenames (auto-generated when omitted)
            detect_lines: Whether to detect individual lines
                (NOTE: accepted but not forwarded to the batch endpoint)

        Returns:
            List of OCRResult

        Raises:
            Exception: Any HTTP/transport failure is logged and re-raised.
        """
        if filenames is None:
            filenames = [f"image_{i}.png" for i in range(len(images))]

        try:
            client = await self._get_client()

            # Repeated "files" parts; zip truncates to the shorter of the
            # two lists if lengths differ.
            files = [
                ("files", (fn, img, "image/png"))
                for fn, img in zip(filenames, images)
            ]

            response = await client.post(
                "/api/v1/batch-extract",
                files=files
            )
            response.raise_for_status()

            data = response.json()
            results = []

            for item in data.get("results", []):
                results.append(OCRResult(
                    # Defaults differ from extract_text (0.85 vs 0.0);
                    # batch responses carry no per-item timing.
                    text=item.get("text", ""),
                    confidence=item.get("confidence", 0.85),
                    processing_time_ms=0,
                    device="remote"
                ))

            return results

        except Exception as e:
            logger.error(f"TrOCR batch extraction failed: {e}")
            raise
|
||||
|
||||
|
||||
# Singleton instance
_trocr_client: Optional[TrOCRClient] = None


def get_trocr_client() -> TrOCRClient:
    """Return the shared TrOCRClient, creating it lazily on first access."""
    global _trocr_client
    client = _trocr_client
    if client is None:
        client = TrOCRClient()
        _trocr_client = client
    return client
|
||||
|
||||
|
||||
async def extract_text_from_image(
    image_data: bytes,
    filename: str = "image.png"
) -> OCRResult:
    """
    Convenience wrapper: run OCR on a single image via the shared TrOCR client.

    Args:
        image_data: Raw image bytes
        filename: Original filename

    Returns:
        OCRResult with extracted text
    """
    return await get_trocr_client().extract_text(image_data, filename)
|
||||
577
backend/klausur/services/trocr_service.py
Normal file
577
backend/klausur/services/trocr_service.py
Normal file
@@ -0,0 +1,577 @@
|
||||
"""
|
||||
TrOCR Service for Handwriting Recognition.
|
||||
|
||||
Uses Microsoft's TrOCR model for extracting handwritten text from exam images.
|
||||
Supports fine-tuning with teacher corrections via LoRA adapters.
|
||||
|
||||
PRIVACY BY DESIGN:
|
||||
- All processing happens locally
|
||||
- No data sent to external services
|
||||
- Fine-tuning data stays on-premise
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
import json
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Model paths
|
||||
MODEL_CACHE_DIR = Path(os.environ.get("TROCR_CACHE_DIR", "/app/models/trocr"))
|
||||
LORA_ADAPTERS_DIR = Path(os.environ.get("TROCR_LORA_DIR", "/app/models/trocr/lora"))
|
||||
TRAINING_DATA_DIR = Path(os.environ.get("TROCR_TRAINING_DIR", "/app/data/trocr_training"))
|
||||
|
||||
|
||||
@dataclass
class OCRResult:
    """Result of a local TrOCR extraction (text, score, line geometry, timing)."""
    text: str                  # extracted text; lines joined with "\n" when line detection is on
    confidence: float          # estimated confidence in [0, 1]
    bounding_boxes: List[Dict]  # [{"x": 0, "y": 0, "w": 100, "h": 20, "text": "..."}]
    processing_time_ms: int    # wall-clock processing time in milliseconds
|
||||
|
||||
|
||||
@dataclass
class TrainingExample:
    """A single (image, ground-truth text) pair collected for fine-tuning."""
    image_path: str     # path of the saved example image on disk
    ground_truth: str   # correct transcription provided by the teacher
    teacher_id: str     # ID of the teacher who supplied the correction
    created_at: str     # ISO-8601 UTC timestamp of when the example was added
|
||||
|
||||
|
||||
class TrOCRService:
    """
    Handwriting recognition service using TrOCR (runs fully on-premise).

    Features:
    - Line-by-line handwriting extraction
    - Confidence scoring
    - LoRA fine-tuning support
    - Batch processing
    """

    # Available models (from smallest to largest); values are HF model IDs.
    MODELS = {
        "trocr-small": "microsoft/trocr-small-handwritten",
        "trocr-base": "microsoft/trocr-base-handwritten",  # Recommended
        "trocr-large": "microsoft/trocr-large-handwritten",
    }

    def __init__(self, model_name: str = "trocr-base", device: str = "auto"):
        """
        Initialize TrOCR service. Model weights are NOT loaded here; loading
        is deferred until the first extraction (see _load_model).

        Args:
            model_name: One of "trocr-small", "trocr-base", "trocr-large"
                (unknown names silently fall back to "trocr-base")
            device: "cpu", "cuda", "mps" (Apple Silicon), or "auto"
        """
        self.model_name = model_name
        self.model_id = self.MODELS.get(model_name, self.MODELS["trocr-base"])
        self.device = self._get_device(device)

        # Lazily populated by _load_model / _load_lora_adapter.
        self._processor = None
        self._model = None
        self._lora_adapter = None

        # Create directories (cache, adapters, training data) up front.
        MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        LORA_ADAPTERS_DIR.mkdir(parents=True, exist_ok=True)
        TRAINING_DATA_DIR.mkdir(parents=True, exist_ok=True)

        logger.info(f"TrOCR Service initialized: model={model_name}, device={self.device}")

    def _get_device(self, device: str) -> str:
        """Determine the best device for inference.

        An explicit device is returned as-is; "auto" probes CUDA then MPS,
        and falls back to CPU (also when torch is not installed).
        """
        if device != "auto":
            return device

        try:
            import torch
            if torch.cuda.is_available():
                return "cuda"
            elif torch.backends.mps.is_available():
                return "mps"
            return "cpu"
        except ImportError:
            return "cpu"

    def _load_model(self):
        """Lazy-load the TrOCR processor + model; no-op if already loaded.

        Raises on missing dependencies or load failure (callers decide how
        to surface that).
        """
        if self._model is not None:
            return

        try:
            from transformers import TrOCRProcessor, VisionEncoderDecoderModel
            import torch

            logger.info(f"Loading TrOCR model: {self.model_id}")

            self._processor = TrOCRProcessor.from_pretrained(
                self.model_id,
                cache_dir=str(MODEL_CACHE_DIR)
            )

            self._model = VisionEncoderDecoderModel.from_pretrained(
                self.model_id,
                cache_dir=str(MODEL_CACHE_DIR)
            )

            # Move to device
            if self.device == "cuda":
                self._model = self._model.cuda()
            elif self.device == "mps":
                self._model = self._model.to("mps")

            # Load LoRA adapter if one was previously saved by fine_tune().
            adapter_path = LORA_ADAPTERS_DIR / f"{self.model_name}_adapter"
            if adapter_path.exists():
                self._load_lora_adapter(adapter_path)

            logger.info(f"TrOCR model loaded successfully on {self.device}")

        except ImportError as e:
            logger.error(f"Missing dependencies: {e}")
            logger.error("Install with: pip install transformers torch pillow")
            raise
        except Exception as e:
            logger.error(f"Failed to load TrOCR model: {e}")
            raise

    def _load_lora_adapter(self, adapter_path: Path):
        """Load a LoRA adapter for a fine-tuned model.

        Failures only warn (the base model remains usable without the adapter).
        """
        try:
            from peft import PeftModel

            logger.info(f"Loading LoRA adapter from {adapter_path}")
            self._model = PeftModel.from_pretrained(self._model, str(adapter_path))
            self._lora_adapter = str(adapter_path)
            logger.info("LoRA adapter loaded successfully")

        except ImportError:
            logger.warning("peft not installed, skipping LoRA adapter")
        except Exception as e:
            logger.warning(f"Failed to load LoRA adapter: {e}")

    async def extract_text(
        self,
        image_data: bytes,
        detect_lines: bool = True
    ) -> OCRResult:
        """
        Extract handwritten text from an image.

        Args:
            image_data: Raw image bytes (PNG, JPG, etc.)
            detect_lines: If True, detect and process individual lines

        Returns:
            OCRResult with extracted text and confidence. Extraction errors
            are swallowed and yield an empty result with confidence 0.0.
            NOTE: _load_model() runs before the try-block, so model-loading
            failures still propagate to the caller.
        """
        import time
        start_time = time.time()

        self._load_model()

        try:
            from PIL import Image
            import torch

            # Load image; RGB conversion normalizes palette/alpha inputs.
            image = Image.open(BytesIO(image_data)).convert("RGB")

            if detect_lines:
                # Detect text lines and process each
                lines, bboxes = await self._detect_and_extract_lines(image)
                text = "\n".join(lines)
                confidence = 0.85  # Average confidence estimate (not measured per line)
            else:
                # Process whole image
                text, confidence = await self._extract_single(image)
                bboxes = []

            processing_time_ms = int((time.time() - start_time) * 1000)

            return OCRResult(
                text=text,
                confidence=confidence,
                bounding_boxes=bboxes,
                processing_time_ms=processing_time_ms
            )

        except Exception as e:
            logger.error(f"OCR extraction failed: {e}")
            return OCRResult(
                text="",
                confidence=0.0,
                bounding_boxes=[],
                processing_time_ms=int((time.time() - start_time) * 1000)
            )

    async def _extract_single(self, image) -> Tuple[str, float]:
        """Extract text from a single image (no line detection).

        Returns (stripped text, estimated confidence).
        """
        import torch

        # Preprocess into model input tensors.
        pixel_values = self._processor(
            images=image,
            return_tensors="pt"
        ).pixel_values

        if self.device == "cuda":
            pixel_values = pixel_values.cuda()
        elif self.device == "mps":
            pixel_values = pixel_values.to("mps")

        # Generate with beam search; scores are kept for confidence estimation.
        with torch.no_grad():
            generated_ids = self._model.generate(
                pixel_values,
                max_length=128,
                num_beams=4,
                return_dict_in_generate=True,
                output_scores=True
            )

        # Decode token ids back to text.
        text = self._processor.batch_decode(
            generated_ids.sequences,
            skip_special_tokens=True
        )[0]

        # Estimate confidence from generation scores
        confidence = self._estimate_confidence(generated_ids)

        return text.strip(), confidence

    async def _detect_and_extract_lines(self, image) -> Tuple[List[str], List[Dict]]:
        """Detect text lines and run OCR on each line crop separately.

        Returns (list of non-empty line texts, list of bounding-box dicts).
        Falls back to whole-image extraction when no lines are detected
        (in that case the bbox list is empty).
        """
        from PIL import Image
        import numpy as np

        # Convert to numpy for line detection
        img_array = np.array(image.convert("L"))  # Grayscale

        # Simple horizontal projection for line detection
        lines_y = self._detect_line_positions(img_array)

        if not lines_y:
            # Fallback: process whole image
            text, _ = await self._extract_single(image)
            return [text], []

        # Extract each line
        results = []
        bboxes = []
        width = image.width

        for i, (y_start, y_end) in enumerate(lines_y):
            # Crop line (full width, detected vertical span).
            line_img = image.crop((0, y_start, width, y_end))

            # Ensure minimum height; tiny slivers are likely noise.
            if line_img.height < 20:
                continue

            # Extract text
            text, conf = await self._extract_single(line_img)

            if text.strip():
                results.append(text)
                bboxes.append({
                    "x": 0,
                    "y": y_start,
                    "w": width,
                    "h": y_end - y_start,
                    "text": text,
                    "confidence": conf
                })

        return results, bboxes

    def _detect_line_positions(self, img_array) -> List[Tuple[int, int]]:
        """Detect horizontal text line positions using a projection profile.

        Args:
            img_array: 2-D grayscale numpy array (0=black, 255=white).

        Returns:
            List of (y_start, y_end) row ranges, top to bottom.
        """
        import numpy as np

        # Horizontal projection (sum of ink per row; image is inverted so
        # dark pixels contribute positively).
        projection = np.sum(255 - img_array, axis=1)

        # Rows above 10% of the peak projection count as text.
        threshold = np.max(projection) * 0.1
        text_rows = projection > threshold

        # Find line boundaries by scanning for text/non-text transitions.
        lines = []
        in_line = False
        line_start = 0

        for i, is_text in enumerate(text_rows):
            if is_text and not in_line:
                in_line = True
                line_start = max(0, i - 5)  # Add padding
            elif not is_text and in_line:
                in_line = False
                line_end = min(len(text_rows) - 1, i + 5)  # Add padding
                if line_end - line_start > 15:  # Minimum line height
                    lines.append((line_start, line_end))

        # Handle last line (text runs to the bottom edge; no min-height check).
        if in_line:
            lines.append((line_start, len(text_rows) - 1))

        return lines

    def _estimate_confidence(self, generated_output) -> float:
        """Estimate confidence as the mean max-softmax probability per step.

        Falls back to 0.75 when scores are unavailable or anything fails.
        """
        try:
            import torch

            if hasattr(generated_output, 'scores') and generated_output.scores:
                # Average probability of selected tokens
                probs = []
                for score in generated_output.scores:
                    prob = torch.softmax(score, dim=-1).max().item()
                    probs.append(prob)
                return sum(probs) / len(probs) if probs else 0.5
            return 0.75  # Default confidence
        except Exception:
            return 0.75

    async def batch_extract(
        self,
        images: List[bytes],
        detect_lines: bool = True
    ) -> List[OCRResult]:
        """
        Extract text from multiple images, sequentially.

        Args:
            images: List of image bytes
            detect_lines: If True, detect lines in each image

        Returns:
            List of OCRResult (same order as the input images)
        """
        results = []
        for img_data in images:
            result = await self.extract_text(img_data, detect_lines)
            results.append(result)
        return results

    # ==========================================
    # FINE-TUNING SUPPORT
    # ==========================================

    def add_training_example(
        self,
        image_data: bytes,
        ground_truth: str,
        teacher_id: str
    ) -> str:
        """
        Add a training example for fine-tuning. Stores the image as
        {id}.png and its metadata as {id}.json in TRAINING_DATA_DIR.

        Args:
            image_data: Image bytes
            ground_truth: Correct text (teacher-provided)
            teacher_id: ID of the teacher providing correction

        Returns:
            Example ID (a random UUID4 string)
        """
        import uuid
        from datetime import datetime

        example_id = str(uuid.uuid4())

        # Save image
        image_path = TRAINING_DATA_DIR / f"{example_id}.png"
        with open(image_path, "wb") as f:
            f.write(image_data)

        # Save metadata
        example = TrainingExample(
            image_path=str(image_path),
            ground_truth=ground_truth,
            teacher_id=teacher_id,
            created_at=datetime.utcnow().isoformat()
        )

        meta_path = TRAINING_DATA_DIR / f"{example_id}.json"
        with open(meta_path, "w") as f:
            json.dump(example.__dict__, f, indent=2)

        logger.info(f"Training example added: {example_id}")
        return example_id

    def get_training_examples(self, teacher_id: Optional[str] = None) -> List[TrainingExample]:
        """Get all training examples, optionally filtered by teacher."""
        examples = []

        for meta_file in TRAINING_DATA_DIR.glob("*.json"):
            with open(meta_file) as f:
                data = json.load(f)
                example = TrainingExample(**data)

                if teacher_id is None or example.teacher_id == teacher_id:
                    examples.append(example)

        return examples

    async def fine_tune(
        self,
        teacher_id: Optional[str] = None,
        epochs: int = 3,
        learning_rate: float = 5e-5
    ) -> Dict:
        """
        Fine-tune the model with collected training examples.

        Uses LoRA for efficient fine-tuning; the resulting adapter is saved
        to LORA_ADAPTERS_DIR and the model is reloaded with it.

        Args:
            teacher_id: If provided, only use examples from this teacher
            epochs: Number of training epochs
            learning_rate: Learning rate for fine-tuning

        Returns:
            Training statistics dict; {"status": "error", ...} on failure
            (this method does not raise).
        """
        examples = self.get_training_examples(teacher_id)

        # Require a minimal dataset before attempting any training.
        if len(examples) < 10:
            return {
                "status": "error",
                "message": f"Need at least 10 examples, have {len(examples)}"
            }

        try:
            from peft import LoraConfig, get_peft_model, TaskType
            from transformers import Trainer, TrainingArguments
            from PIL import Image
            import torch

            self._load_model()

            logger.info(f"Starting fine-tuning with {len(examples)} examples")

            # Configure LoRA (low-rank updates on the attention projections).
            lora_config = LoraConfig(
                task_type=TaskType.SEQ_2_SEQ_LM,
                r=16,  # LoRA rank
                lora_alpha=32,
                lora_dropout=0.1,
                target_modules=["q_proj", "v_proj"]  # Attention layers
            )

            # Apply LoRA
            model = get_peft_model(self._model, lora_config)

            # Prepare dataset: loads each image from disk and tokenizes the
            # teacher-provided ground truth as labels.
            class OCRDataset(torch.utils.data.Dataset):
                def __init__(self, examples, processor):
                    self.examples = examples
                    self.processor = processor

                def __len__(self):
                    return len(self.examples)

                def __getitem__(self, idx):
                    ex = self.examples[idx]
                    image = Image.open(ex.image_path).convert("RGB")

                    pixel_values = self.processor(
                        images=image, return_tensors="pt"
                    ).pixel_values.squeeze()

                    labels = self.processor.tokenizer(
                        ex.ground_truth,
                        return_tensors="pt",
                        padding="max_length",
                        max_length=128
                    ).input_ids.squeeze()

                    return {
                        "pixel_values": pixel_values,
                        "labels": labels
                    }

            dataset = OCRDataset(examples, self._processor)

            # Training arguments
            output_dir = LORA_ADAPTERS_DIR / f"{self.model_name}_adapter"
            training_args = TrainingArguments(
                output_dir=str(output_dir),
                num_train_epochs=epochs,
                per_device_train_batch_size=4,
                learning_rate=learning_rate,
                save_strategy="epoch",
                logging_steps=10,
                remove_unused_columns=False,
            )

            # Train
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=dataset,
            )

            train_result = trainer.train()

            # Save adapter
            model.save_pretrained(str(output_dir))

            # Reload model with new adapter (forces a fresh _load_model,
            # which picks the adapter up from disk).
            self._model = None
            self._load_model()

            return {
                "status": "success",
                "examples_used": len(examples),
                "epochs": epochs,
                "adapter_path": str(output_dir),
                "train_loss": train_result.training_loss
            }

        except ImportError as e:
            logger.error(f"Missing dependencies for fine-tuning: {e}")
            return {
                "status": "error",
                "message": f"Missing dependencies: {e}. Install with: pip install peft"
            }
        except Exception as e:
            logger.error(f"Fine-tuning failed: {e}")
            return {
                "status": "error",
                "message": str(e)
            }

    def get_model_info(self) -> Dict:
        """Get information about the configured model and training state."""
        adapter_path = LORA_ADAPTERS_DIR / f"{self.model_name}_adapter"

        return {
            "model_name": self.model_name,
            "model_id": self.model_id,
            "device": self.device,
            "is_loaded": self._model is not None,
            "has_lora_adapter": adapter_path.exists(),
            "lora_adapter_path": str(adapter_path) if adapter_path.exists() else None,
            "training_examples_count": len(list(TRAINING_DATA_DIR.glob("*.json"))),
        }
|
||||
|
||||
|
||||
# Singleton instance
_trocr_service: Optional[TrOCRService] = None


def get_trocr_service(model_name: str = "trocr-base") -> TrOCRService:
    """Return the shared TrOCRService; rebuilt when a different model is requested."""
    global _trocr_service
    current = _trocr_service
    if current is None or current.model_name != model_name:
        current = TrOCRService(model_name=model_name)
        _trocr_service = current
    return current
|
||||
309
backend/klausur/services/vision_ocr_service.py
Normal file
309
backend/klausur/services/vision_ocr_service.py
Normal file
@@ -0,0 +1,309 @@
|
||||
"""
|
||||
Vision-OCR Service - Handschrifterkennung mit Llama 3.2 Vision.
|
||||
|
||||
DATENSCHUTZ/PRIVACY BY DESIGN:
|
||||
- Alle Verarbeitung erfolgt lokal auf dem Mac Mini
|
||||
- Keine Daten verlassen das lokale Netzwerk
|
||||
- Keine Cloud-APIs beteiligt
|
||||
- Perfekt für DSGVO-konforme Schulumgebungen
|
||||
|
||||
Verwendet llama3.2-vision:11b über Ollama für OCR/Handschrifterkennung.
|
||||
Dies ist eine Alternative zu TrOCR mit besserer Handschrifterkennung.
|
||||
"""
|
||||
import os
|
||||
import base64
|
||||
import httpx
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
from llm_gateway.config import get_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class VisionOCRResult:
    """Result of an OCR extraction performed by a local vision LLM via Ollama."""
    text: str                        # transcribed text returned by the model
    confidence: float                # confidence estimate (source of value not visible here)
    processing_time_ms: int          # wall-clock processing time in milliseconds
    model: str = "llama3.2-vision:11b"  # model tag used for the request
    device: str = "local-ollama"     # inference backend; always local Ollama
|
||||
|
||||
|
||||
# OCR System Prompt für optimale Handschrifterkennung
|
||||
HANDWRITING_OCR_PROMPT = """Du bist ein Experte für Handschrifterkennung (OCR).
|
||||
|
||||
AUFGABE: Extrahiere den handschriftlichen Text aus dem Bild so genau wie möglich.
|
||||
|
||||
WICHTIGE REGELN:
|
||||
1. Transkribiere NUR den sichtbaren Text - erfinde nichts dazu
|
||||
2. Behalte die Zeilenstruktur bei (jede Zeile auf einer neuen Zeile)
|
||||
3. Bei unleserlichen Stellen: [unleserlich] oder [?] verwenden
|
||||
4. Ignoriere Linien, Kästchen und andere Formatierungen
|
||||
5. Korrigiere KEINE Rechtschreibfehler - transkribiere exakt was da steht
|
||||
6. Bei Aufzählungen: Nummern/Punkte beibehalten (1., 2., a), b), etc.)
|
||||
|
||||
AUSGABE: Nur der transkribierte Text, keine Erklärungen oder Kommentare."""
|
||||
|
||||
# Alternative Prompt für gedruckten Text
|
||||
PRINTED_OCR_PROMPT = """Extrahiere den gesamten Text aus diesem Bild.
|
||||
Behalte die Struktur bei (Absätze, Listen, etc.).
|
||||
Gib nur den extrahierten Text zurück, ohne Kommentare."""
|
||||
|
||||
|
||||
class VisionOCRService:
    """
    OCR service backed by Llama 3.2 Vision running on Ollama.

    Runs entirely locally (e.g. on a Mac mini) — no cloud connection is
    required, which makes it suitable for privacy-compliant (DSGVO) exam
    correction in schools.

    Usage:
        service = VisionOCRService()

        if await service.is_available():
            result = await service.extract_text(image_bytes)
            print(result.text)
    """

    def __init__(self, ollama_url: Optional[str] = None, model: Optional[str] = None):
        """
        Initialize the Vision OCR service.

        Args:
            ollama_url: Ollama API URL (default: from config)
            model: Vision model to use (default: llama3.2-vision:11b)
        """
        config = get_config()
        self.ollama_url = ollama_url or (config.ollama.base_url if config.ollama else "http://localhost:11434")
        self.model = model or config.vision_model
        # Lazily created shared HTTP client; see _get_client().
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or lazily (re)create the shared HTTP client."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=300.0  # 5 min timeout for large images
            )
        return self._client

    async def close(self):
        """Close the HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    async def is_available(self) -> bool:
        """Check if Ollama with a vision-capable model is available."""
        try:
            client = await self._get_client()

            # Check Ollama health
            response = await client.get(
                f"{self.ollama_url}/api/tags",
                timeout=5.0
            )

            if response.status_code != 200:
                return False

            # Check if a vision model is installed
            data = response.json()
            models = [m.get("name", "") for m in data.get("models", [])]

            # Accept any vision-capable model (llama*-vision, llava, ...)
            has_vision = any(
                "vision" in m.lower() or "llava" in m.lower()
                for m in models
            )

            if not has_vision:
                logger.warning(f"No vision model found. Available: {models}")
                return False

            return True

        except Exception as e:
            logger.warning(f"Vision OCR service not available: {e}")
            return False

    async def get_status(self) -> dict:
        """Get service status (availability, configured model, installed vision models)."""
        try:
            client = await self._get_client()
            response = await client.get(f"{self.ollama_url}/api/tags")

            if response.status_code == 200:
                data = response.json()
                models = data.get("models", [])
                vision_models = [
                    m for m in models
                    if "vision" in m.get("name", "").lower() or "llava" in m.get("name", "").lower()
                ]

                return {
                    "status": "available",
                    "ollama_url": self.ollama_url,
                    "configured_model": self.model,
                    "vision_models": [m.get("name") for m in vision_models],
                    "total_models": len(models)
                }
            else:
                return {
                    "status": "unavailable",
                    "error": f"HTTP {response.status_code}"
                }

        except Exception as e:
            return {
                "status": "unavailable",
                "error": str(e)
            }

    async def extract_text(
        self,
        image_data: bytes,
        filename: str = "image.png",
        is_handwriting: bool = True
    ) -> VisionOCRResult:
        """
        Extract text from an image using the Vision LLM.

        Args:
            image_data: Raw image bytes (PNG, JPG, etc.)
            filename: Original filename (for logging)
            is_handwriting: True for handwriting, False for printed text

        Returns:
            VisionOCRResult with extracted text

        Raises:
            httpx.TimeoutException: if the model does not answer in time
            httpx.HTTPStatusError: on non-2xx responses from Ollama
        """
        start_time = time.time()

        try:
            client = await self._get_client()

            # Encode image as base64 (Ollama expects base64 in "images")
            image_base64 = base64.b64encode(image_data).decode("utf-8")

            # Select the prompt matching the content type
            prompt = HANDWRITING_OCR_PROMPT if is_handwriting else PRINTED_OCR_PROMPT

            # Ollama Vision API request
            payload = {
                "model": self.model,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt,
                        "images": [image_base64]
                    }
                ],
                "stream": False,
                "options": {
                    "temperature": 0.1,  # Low temperature for consistent OCR
                    "num_predict": 2048,  # Max tokens for extracted text
                }
            }

            # Fix: interpolate the actual filename (was a literal placeholder)
            logger.info(f"Sending image to Vision OCR: {filename} ({len(image_data)} bytes)")

            response = await client.post(
                f"{self.ollama_url}/api/chat",
                json=payload,
                timeout=180.0  # 3 min timeout
            )
            response.raise_for_status()

            data = response.json()

            extracted_text = data.get("message", {}).get("content", "")

            processing_time_ms = int((time.time() - start_time) * 1000)

            # Estimate confidence based on response quality
            confidence = self._estimate_confidence(extracted_text)

            logger.info(
                f"Vision OCR completed for {filename}: "
                f"{len(extracted_text)} chars in {processing_time_ms}ms"
            )

            return VisionOCRResult(
                text=extracted_text.strip(),
                confidence=confidence,
                processing_time_ms=processing_time_ms,
                model=self.model,
                device="local-ollama"
            )

        except httpx.TimeoutException:
            logger.error(f"Vision OCR timed out for {filename}")
            raise
        except Exception as e:
            logger.error(f"Vision OCR failed for {filename}: {e}")
            raise

    def _estimate_confidence(self, text: str) -> float:
        """
        Estimate OCR confidence based on text quality.

        This is a heuristic - real confidence would need model output.
        """
        if not text:
            return 0.0

        # Count uncertain markers emitted by the OCR prompt
        uncertain_markers = text.count("[unleserlich]") + text.count("[?]")

        # Count reasonable text vs markers
        text_length = len(text.replace("[unleserlich]", "").replace("[?]", ""))

        if text_length == 0:
            return 0.1

        # Base confidence
        confidence = 0.85

        # Reduce for uncertain markers
        confidence -= min(uncertain_markers * 0.05, 0.3)

        # Very short text might be incomplete
        if text_length < 20:
            confidence -= 0.1

        return max(confidence, 0.1)
|
||||
|
||||
|
||||
# Module-level singleton, created lazily on first access
_vision_ocr_service: Optional[VisionOCRService] = None


def get_vision_ocr_service() -> VisionOCRService:
    """Return the process-wide VisionOCRService, creating it on first use."""
    global _vision_ocr_service
    if _vision_ocr_service is not None:
        return _vision_ocr_service
    _vision_ocr_service = VisionOCRService()
    return _vision_ocr_service
|
||||
|
||||
|
||||
async def extract_handwriting(
    image_data: bytes,
    filename: str = "image.png"
) -> VisionOCRResult:
    """
    Extract handwritten text from an image (convenience wrapper).

    Delegates to the VisionOCRService singleton with handwriting mode
    enabled. Runs Llama 3.2 Vision locally via Ollama, so no data leaves
    the machine (DSGVO-compliant).

    Args:
        image_data: Raw image bytes
        filename: Original filename

    Returns:
        VisionOCRResult with extracted text
    """
    ocr = get_vision_ocr_service()
    return await ocr.extract_text(image_data, filename, is_handwriting=True)
|
||||
Reference in New Issue
Block a user