fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit 21a844cb8a
1986 changed files with 744143 additions and 1731 deletions

View File

@@ -0,0 +1,28 @@
"""
Services for Klausurkorrektur Module.
- PseudonymizationService: QR code generation, header redaction
- CorrectionService: LLM integration for AI-assisted grading
- RosterParser: Parse Klassenbuch photos and roster files
- SchoolResolver: School/class selection and auto-creation
- ModuleLinker: Cross-module links (Notenbuch, Elternabend, etc.)
"""
from .pseudonymizer import PseudonymizationService, get_pseudonymizer
from .correction_service import ExamCorrectionService, get_correction_service
from .roster_parser import RosterParser, get_roster_parser
from .school_resolver import SchoolResolver, get_school_resolver
from .module_linker import ModuleLinker, get_module_linker
__all__ = [
"PseudonymizationService",
"get_pseudonymizer",
"ExamCorrectionService",
"get_correction_service",
"RosterParser",
"get_roster_parser",
"SchoolResolver",
"get_school_resolver",
"ModuleLinker",
"get_module_linker",
]

View File

@@ -0,0 +1,379 @@
"""
Exam Correction Service using Self-Hosted LLM.
PRIVACY BY DESIGN:
- Only pseudonymized text (doc_token + OCR content) is sent to LLM
- No student names or personal data in prompts
- All processing happens on self-hosted infrastructure (SysEleven)
- No data sent to external APIs (unless explicitly configured)
This service generates AI-assisted corrections and feedback for exam answers.
"""
import logging
from typing import Optional, List
from dataclasses import dataclass
from llm_gateway.services.inference import get_inference_service, InferenceResult
from llm_gateway.models.chat import ChatCompletionRequest, ChatMessage
from llm_gateway.config import get_config
logger = logging.getLogger(__name__)
@dataclass
class QuestionRubric:
    """Rubric for a single exam question."""
    # 1-based number of the question within the exam
    question_number: int
    # the question text (inserted verbatim into the grading prompt)
    question_text: str
    # maximum points for a perfect answer
    max_points: int
    # model solution the student answer is compared against
    expected_answer: str
    # free-text grading criteria, passed verbatim to the LLM
    grading_criteria: str
@dataclass
class QuestionResult:
    """AI correction result for a single question."""
    question_number: int       # matches QuestionRubric.question_number
    points_awarded: int        # points granted by the LLM (capped at max_points)
    max_points: int            # copied from the rubric
    feedback: str              # short justification of the score
    strengths: List[str]       # what the student did well
    improvements: List[str]    # concrete suggestions for improvement
@dataclass
class CorrectionResult:
    """Complete correction result for an exam."""
    doc_token: str  # Pseudonymized identifier
    total_score: int       # sum of points_awarded over all questions
    max_score: int         # sum of max_points over all questions
    grade: str             # German grade label, e.g. "2+"
    overall_feedback: str  # LLM-generated summary feedback
    question_results: List[QuestionResult]  # per-question details
    processing_time_ms: int  # wall-clock duration of correct_exam()
# German grading scale (can be customized).
# Pairs of (minimum percentage, grade label), ordered best-to-worst so the
# first threshold that the score reaches determines the grade.
GERMAN_GRADES = [
    (95, "1+"),  # sehr gut plus
    (90, "1"),   # sehr gut
    (85, "1-"),  # sehr gut minus
    (80, "2+"),  # gut plus
    (75, "2"),   # gut
    (70, "2-"),  # gut minus
    (65, "3+"),  # befriedigend plus
    (60, "3"),   # befriedigend
    (55, "3-"),  # befriedigend minus
    (50, "4+"),  # ausreichend plus
    (45, "4"),   # ausreichend
    (40, "4-"),  # ausreichend minus
    (33, "5+"),  # mangelhaft plus
    (27, "5"),   # mangelhaft
    (20, "5-"),  # mangelhaft minus
    (0, "6"),    # ungenuegend
]


def calculate_grade(percentage: float) -> str:
    """Return the German grade label for a percentage score (0-100)."""
    reached = (label for threshold, label in GERMAN_GRADES if percentage >= threshold)
    return next(reached, "6")
class ExamCorrectionService:
    """
    Service for AI-assisted exam correction.

    PRIVACY GUARANTEES:
    1. Prompts contain NO personal data
    2. Only doc_token is used as reference
    3. Processing on self-hosted LLM
    4. Results stored with pseudonymized identifiers
    """

    # System prompt for exam correction (German).
    # Fixed typo: "ermutigzendes" -> "ermutigendes".
    CORRECTION_SYSTEM_PROMPT = """Du bist ein erfahrener Lehrer und korrigierst Schuelerantworten.
WICHTIGE REGELN:
1. Bewerte NUR den fachlichen Inhalt der Antwort
2. Ignoriere Rechtschreibfehler (ausser bei Deutschklausuren)
3. Gib konstruktives, ermutigendes Feedback
4. Beziehe dich auf die Bewertungskriterien
5. Sei fair und konsistent
AUSGABEFORMAT (JSON):
{
"points": <Punktzahl>,
"feedback": "<Kurze Begruendung der Bewertung>",
"strengths": ["<Staerke 1>", "<Staerke 2>"],
"improvements": ["<Verbesserungsvorschlag 1>"]
}
Antworte NUR mit dem JSON-Objekt, ohne weitere Erklaerungen."""

    # Prompt template for the exam-level summary feedback (German).
    OVERALL_FEEDBACK_PROMPT = """Basierend auf den einzelnen Bewertungen, erstelle eine Gesamtrueckmeldung.
Einzelbewertungen:
{question_results}
Gesamtpunktzahl: {total_score}/{max_score} ({percentage}%)
Note: {grade}
Erstelle eine motivierende Gesamtrueckmeldung (2-3 Saetze), die:
1. Die Staerken hervorhebt
2. Konstruktive Verbesserungsvorschlaege macht
3. Ermutigt und motiviert
Antworte nur mit dem Feedback-Text, ohne JSON-Formatierung."""

    def __init__(self, model: Optional[str] = None):
        """
        Initialize the correction service.

        Args:
            model: LLM model to use; falls back to config.correction_model
                (default: qwen2.5:14b).

        PRIVACY: the model runs locally via Ollama on self-hosted
        infrastructure; no data is sent to external servers.
        """
        config = get_config()
        # Use configured correction model unless explicitly overridden
        self.model = model or config.correction_model
        self.inference = get_inference_service()
        logger.info(f"Correction service initialized with model: {self.model}")

    async def correct_question(
        self,
        student_answer: str,
        rubric: QuestionRubric,
        subject: str = "Allgemein"
    ) -> QuestionResult:
        """
        Correct a single question answer.

        Args:
            student_answer: The student's OCR-extracted answer (pseudonymized)
            rubric: Grading rubric for this question
            subject: Subject for context

        Returns:
            QuestionResult with points and feedback. On LLM failure a
            zero-point result asking for manual review is returned instead
            of raising.
        """
        # Build prompt with NO personal data — only rubric content and the
        # (already pseudonymized) answer text.
        user_prompt = f"""Fach: {subject}
Frage {rubric.question_number}: {rubric.question_text}
Maximale Punktzahl: {rubric.max_points}
Erwartete Antwort:
{rubric.expected_answer}
Bewertungskriterien:
{rubric.grading_criteria}
---
Schuelerantwort:
{student_answer}
---
Bewerte diese Antwort nach den Kriterien."""
        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="system", content=self.CORRECTION_SYSTEM_PROMPT),
                ChatMessage(role="user", content=user_prompt),
            ],
            temperature=0.3,  # Lower temperature for consistent grading
            max_tokens=500,
        )
        try:
            response = await self.inference.complete(request)
            content = response.choices[0].message.content or "{}"
            # Parse JSON response
            import json
            try:
                result = json.loads(content)
            except json.JSONDecodeError:
                # Fallback: keep the raw text as feedback, award half points,
                # and flag the document for manual review.
                logger.warning(f"Failed to parse LLM response as JSON: {content[:100]}")
                result = {
                    "points": rubric.max_points // 2,
                    "feedback": content[:200],
                    "strengths": [],
                    "improvements": ["Automatische Bewertung fehlgeschlagen - manuelle Pruefung erforderlich"]
                }
            # Robust point extraction:
            # - tolerate "7", 7.0, "7.5" (int("7.5") used to raise and fail
            #   the whole question)
            # - clamp into [0, max_points] (previously only the upper bound
            #   was enforced, so negative points could slip through)
            try:
                points = int(float(result.get("points", 0)))
            except (TypeError, ValueError):
                points = 0
            points = max(0, min(points, rubric.max_points))
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=points,
                max_points=rubric.max_points,
                feedback=result.get("feedback", ""),
                strengths=result.get("strengths", []),
                improvements=result.get("improvements", []),
            )
        except Exception as e:
            logger.error(f"Correction failed for question {rubric.question_number}: {e}")
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=0,
                max_points=rubric.max_points,
                feedback=f"Automatische Bewertung fehlgeschlagen: {str(e)}",
                strengths=[],
                improvements=["Manuelle Korrektur erforderlich"],
            )

    async def correct_exam(
        self,
        doc_token: str,
        ocr_text: str,
        rubrics: List[QuestionRubric],
        subject: str = "Allgemein"
    ) -> CorrectionResult:
        """
        Correct a complete exam with multiple questions.

        Args:
            doc_token: Pseudonymized document identifier
            ocr_text: Full OCR text of the exam (already redacted)
            rubrics: List of question rubrics
            subject: Subject name

        Returns:
            CorrectionResult with all scores, grade and feedback.
        """
        import time
        start_time = time.time()
        # Split OCR text into answers (simple question-marker heuristic)
        answers = self._extract_answers(ocr_text, len(rubrics))
        # Correct each question sequentially (keeps LLM load predictable)
        question_results = []
        for i, rubric in enumerate(rubrics):
            answer = answers[i] if i < len(answers) else ""
            result = await self.correct_question(answer, rubric, subject)
            question_results.append(result)
        # Calculate totals
        total_score = sum(r.points_awarded for r in question_results)
        max_score = sum(r.max_points for r in question_results)
        percentage = (total_score / max_score * 100) if max_score > 0 else 0
        grade = calculate_grade(percentage)
        # Generate overall feedback
        overall_feedback = await self._generate_overall_feedback(
            question_results, total_score, max_score, percentage, grade
        )
        processing_time_ms = int((time.time() - start_time) * 1000)
        return CorrectionResult(
            doc_token=doc_token,
            total_score=total_score,
            max_score=max_score,
            grade=grade,
            overall_feedback=overall_feedback,
            question_results=question_results,
            processing_time_ms=processing_time_ms,
        )

    async def _generate_overall_feedback(
        self,
        question_results: List[QuestionResult],
        total_score: int,
        max_score: int,
        percentage: float,
        grade: str
    ) -> str:
        """Generate motivating overall feedback; falls back to a plain
        score summary when the LLM call fails."""
        # Summarize per-question results for the prompt
        results_summary = "\n".join([
            f"Frage {r.question_number}: {r.points_awarded}/{r.max_points} Punkte - {r.feedback[:100]}"
            for r in question_results
        ])
        prompt = self.OVERALL_FEEDBACK_PROMPT.format(
            question_results=results_summary,
            total_score=total_score,
            max_score=max_score,
            percentage=f"{percentage:.1f}",
            grade=grade,
        )
        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="user", content=prompt),
            ],
            temperature=0.5,
            max_tokens=200,
        )
        try:
            response = await self.inference.complete(request)
            return response.choices[0].message.content or "Gute Arbeit! Weiter so."
        except Exception as e:
            logger.error(f"Failed to generate overall feedback: {e}")
            return f"Gesamtergebnis: {total_score}/{max_score} Punkte ({grade})"

    def _extract_answers(self, ocr_text: str, num_questions: int) -> List[str]:
        """
        Extract individual answers from OCR text.

        Heuristic: split on leading question markers ("1.", "2)", ...).
        re.split with one capture group yields
        [preamble, number1, answer1, number2, answer2, ...], so the
        answers are every second element starting at index 2. (Replaces a
        manual while-loop that could not terminate on even-length input.)
        """
        import re
        pattern = r'(?:^|\n)\s*(\d+)[.\)]\s*'
        parts = re.split(pattern, ocr_text)
        answers = [part.strip() for part in parts[2::2]]
        # Pad with empty answers so every rubric has a slot, then truncate
        # to exactly num_questions.
        answers.extend([""] * (num_questions - len(answers)))
        return answers[:num_questions]
# Singleton instance
_correction_service: Optional[ExamCorrectionService] = None


def get_correction_service(model: Optional[str] = None) -> ExamCorrectionService:
    """
    Return the process-wide correction-service singleton.

    Args:
        model: Optional model override. If None, the cached instance (built
            with config.correction_model, e.g. qwen2.5:14b) is reused; a
            different model triggers a one-off rebuild.

    Returns:
        ExamCorrectionService instance

    PRIVACY: all processing happens locally via Ollama - no cloud API.
    """
    global _correction_service
    cached = _correction_service
    needs_rebuild = cached is None or (model and cached.model != model)
    if needs_rebuild:
        cached = ExamCorrectionService(model=model)
        _correction_service = cached
    return cached

View File

@@ -0,0 +1,630 @@
"""
Module Linker Service - Cross-Module Verknuepfungen.
Verknuepft Klausur-Ergebnisse mit anderen BreakPilot-Modulen:
- Notenbuch (School Service)
- Elternabend (Gespraechsvorschlaege)
- Zeugnisse (Notenuebernahme)
- Kalender (Termine)
Privacy:
- Verknuepfungen nutzen doc_tokens (pseudonymisiert)
- Deanonymisierung nur Client-seitig moeglich
"""
import httpx
import os
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
from datetime import datetime, timedelta
from enum import Enum
# ============================================================================
# DATA CLASSES
# ============================================================================
class LinkType(str, Enum):
    """Kind of cross-module link created from a Klausur session."""
    NOTENBUCH = "notenbuch"      # grade book (school service)
    ELTERNABEND = "elternabend"  # parent-meeting module
    ZEUGNIS = "zeugnis"          # report cards
    CALENDAR = "calendar"        # calendar events
    KLASSENBUCH = "klassenbuch"  # class register
class MeetingUrgency(str, Enum):
    """Urgency of a suggested parent meeting (values are German labels)."""
    LOW = "niedrig"
    MEDIUM = "mittel"
    HIGH = "hoch"
@dataclass
class CorrectionResult:
    """Correction result (pseudonymized).

    NOTE(review): shadows the identically named dataclass in
    correction_service (which uses total_score/max_score) — confirm which
    one callers import.
    """
    doc_token: str
    score: float  # points achieved
    max_score: float
    grade: str  # e.g. "2+"
    feedback: str
    question_results: List[Dict[str, Any]] = field(default_factory=list)  # per-question dicts
@dataclass
class GradeEntry:
    """Grade-book entry.

    NOTE(review): not referenced by the visible linker code — the payload
    in link_to_notenbuch builds plain dicts instead; confirm it is still
    used elsewhere.
    """
    student_id: str  # in the grade book: the real student ID
    doc_token: str   # from the Klausur module: pseudonymized
    grade: str
    points: float
    max_points: float
    exam_name: str
    date: str
@dataclass
class ParentMeetingSuggestion:
    """Suggestion for a parent meeting."""
    doc_token: str  # pseudonymized
    reason: str     # human-readable trigger, e.g. grade in subject
    urgency: MeetingUrgency
    grade: str
    subject: str
    suggested_topics: List[str] = field(default_factory=list)  # talking points
@dataclass
class CalendarEvent:
    """Calendar entry."""
    id: str
    title: str
    description: str
    start_time: datetime
    end_time: datetime
    event_type: str  # e.g. "parent_meeting"
    linked_doc_tokens: List[str] = field(default_factory=list)  # pseudonymized refs
@dataclass
class ModuleLink:
    """Link from a Klausur session to another module."""
    id: str
    klausur_session_id: str
    link_type: LinkType
    target_module: str  # e.g. "school", "elternabend"
    target_entity_id: str
    target_url: Optional[str] = None
    link_metadata: Dict[str, Any] = field(default_factory=dict)
    # NOTE(review): utcnow() yields a naive datetime and is deprecated since
    # Python 3.12 — consider datetime.now(timezone.utc).
    created_at: datetime = field(default_factory=datetime.utcnow)
@dataclass
class LinkResult:
    """Outcome of a linking operation."""
    success: bool
    link: Optional[ModuleLink] = None  # populated on success where applicable
    message: str = ""                  # human-readable status text
    target_url: Optional[str] = None   # deep link into the target module UI
# ============================================================================
# MODULE LINKER
# ============================================================================
class ModuleLinker:
"""
Verknuepft Klausur-Ergebnisse mit anderen Modulen.
Beispiel:
linker = ModuleLinker()
# Noten ins Notenbuch uebertragen
result = await linker.link_to_notenbuch(
session_id="session-123",
class_id="class-456",
results=correction_results
)
# Elterngespraeche vorschlagen
suggestions = linker.suggest_elternabend(
results=correction_results,
subject="Mathematik"
)
"""
# Notenschwellen fuer Elterngespraeche
GRADE_THRESHOLDS = {
"1+": 0.95, "1": 0.90, "1-": 0.85,
"2+": 0.80, "2": 0.75, "2-": 0.70,
"3+": 0.65, "3": 0.60, "3-": 0.55,
"4+": 0.50, "4": 0.45, "4-": 0.40,
"5+": 0.33, "5": 0.25, "5-": 0.17,
"6": 0.0
}
# Noten die Gespraeche erfordern
MEETING_TRIGGER_GRADES = ["4", "4-", "5+", "5", "5-", "6"]
def __init__(self):
self.school_service_url = os.getenv(
"SCHOOL_SERVICE_URL",
"http://school-service:8084"
)
self.calendar_service_url = os.getenv(
"CALENDAR_SERVICE_URL",
"http://calendar-service:8085"
)
# =========================================================================
# NOTENBUCH INTEGRATION
# =========================================================================
async def link_to_notenbuch(
self,
session_id: str,
class_id: str,
subject: str,
results: List[CorrectionResult],
exam_name: str,
exam_date: str,
identity_map: Optional[Dict[str, str]] = None
) -> LinkResult:
"""
Uebertraegt Noten ins Notenbuch (School Service).
Args:
session_id: Klausur-Session-ID
class_id: Klassen-ID im School Service
subject: Fach
results: Liste der Korrektur-Ergebnisse
exam_name: Name der Klausur
exam_date: Datum der Klausur
identity_map: Optional: doc_token -> student_id Mapping
Note:
Das identity_map wird nur serverseitig genutzt, wenn der
Lehrer explizit die Verknuepfung freigibt. Normalerweise
bleibt das Mapping Client-seitig.
"""
try:
# Noten-Daten aufbereiten
grades_data = []
for result in results:
grade_entry = {
"doc_token": result.doc_token,
"grade": result.grade,
"points": result.score,
"max_points": result.max_score,
"percentage": result.score / result.max_score if result.max_score > 0 else 0
}
# Falls identity_map vorhanden: Student-ID hinzufuegen
if identity_map and result.doc_token in identity_map:
grade_entry["student_id"] = identity_map[result.doc_token]
grades_data.append(grade_entry)
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.post(
f"{self.school_service_url}/api/classes/{class_id}/exams",
json={
"name": exam_name,
"subject": subject,
"date": exam_date,
"max_points": results[0].max_score if results else 100,
"grades": grades_data,
"klausur_session_id": session_id
}
)
if response.status_code in (200, 201):
data = response.json()
return LinkResult(
success=True,
link=ModuleLink(
id=data.get('id', ''),
klausur_session_id=session_id,
link_type=LinkType.NOTENBUCH,
target_module="school",
target_entity_id=data.get('id', ''),
target_url=f"/app?module=school&class={class_id}&exam={data.get('id')}"
),
message=f"Noten erfolgreich uebertragen ({len(results)} Eintraege)",
target_url=f"/app?module=school&class={class_id}"
)
return LinkResult(
success=False,
message=f"Fehler beim Uebertragen: {response.status_code}"
)
except Exception as e:
return LinkResult(
success=False,
message=f"Verbindungsfehler: {str(e)}"
)
# =========================================================================
# ELTERNABEND VORSCHLAEGE
# =========================================================================
def suggest_elternabend(
self,
results: List[CorrectionResult],
subject: str,
threshold_grade: str = "4"
) -> List[ParentMeetingSuggestion]:
"""
Schlaegt Elterngespraeche fuer schwache Schueler vor.
Args:
results: Liste der Korrektur-Ergebnisse
subject: Fach
threshold_grade: Ab dieser Note wird ein Gespraech vorgeschlagen
Returns:
Liste von Gespraechs-Vorschlaegen (pseudonymisiert)
"""
suggestions = []
threshold_idx = list(self.GRADE_THRESHOLDS.keys()).index(threshold_grade) \
if threshold_grade in self.GRADE_THRESHOLDS else 9
for result in results:
# Pruefe ob Note Gespraech erfordert
if result.grade in self.MEETING_TRIGGER_GRADES:
urgency = self._determine_urgency(result.grade)
topics = self._generate_meeting_topics(result, subject)
suggestions.append(ParentMeetingSuggestion(
doc_token=result.doc_token,
reason=f"Note {result.grade} in {subject}",
urgency=urgency,
grade=result.grade,
subject=subject,
suggested_topics=topics
))
# Nach Dringlichkeit sortieren
urgency_order = {
MeetingUrgency.HIGH: 0,
MeetingUrgency.MEDIUM: 1,
MeetingUrgency.LOW: 2
}
suggestions.sort(key=lambda s: urgency_order[s.urgency])
return suggestions
def _determine_urgency(self, grade: str) -> MeetingUrgency:
"""Bestimmt die Dringlichkeit basierend auf der Note."""
if grade in ["5-", "6"]:
return MeetingUrgency.HIGH
elif grade in ["5", "5+"]:
return MeetingUrgency.MEDIUM
else:
return MeetingUrgency.LOW
def _generate_meeting_topics(
self,
result: CorrectionResult,
subject: str
) -> List[str]:
"""Generiert Gespraechsthemen basierend auf den Ergebnissen."""
topics = []
# Allgemeine Themen
topics.append(f"Leistungsstand in {subject}")
# Basierend auf Feedback
if "Verstaendnis" in result.feedback.lower() or "grundlagen" in result.feedback.lower():
topics.append("Grundlagenverstaendnis foerdern")
if "uebung" in result.feedback.lower():
topics.append("Zusaetzliche Uebungsmoeglichkeiten")
# Basierend auf Aufgaben-Ergebnissen
if result.question_results:
weak_areas = []
for qr in result.question_results:
if qr.get('points_awarded', 0) / qr.get('max_points', 1) < 0.5:
weak_areas.append(qr.get('question_text', ''))
if weak_areas:
topics.append("Gezielte Foerderung in Schwachstellen")
# Standard-Themen
if not topics or len(topics) < 3:
topics.extend([
"Lernstrategien besprechen",
"Unterstuetzungsmoeglichkeiten zu Hause",
"Nachhilfe-Optionen"
])
return topics[:5] # Max 5 Themen
async def create_elternabend_link(
self,
session_id: str,
suggestions: List[ParentMeetingSuggestion],
teacher_id: str
) -> LinkResult:
"""Erstellt Verknuepfungen zum Elternabend-Modul."""
# TODO: Integration mit Elternabend-Modul
# Vorerst nur Metadaten speichern
return LinkResult(
success=True,
link=ModuleLink(
id=f"elternabend-{session_id}",
klausur_session_id=session_id,
link_type=LinkType.ELTERNABEND,
target_module="elternabend",
target_entity_id="",
link_metadata={
"suggestion_count": len(suggestions),
"high_urgency_count": sum(
1 for s in suggestions if s.urgency == MeetingUrgency.HIGH
)
}
),
message=f"{len(suggestions)} Elterngespraeche vorgeschlagen",
target_url="/app?module=elternabend"
)
# =========================================================================
# ZEUGNIS INTEGRATION
# =========================================================================
async def update_zeugnis(
self,
class_id: str,
subject: str,
grades: Dict[str, str],
exam_weight: float = 1.0
) -> LinkResult:
"""
Aktualisiert Zeugnis-Aggregation mit neuen Noten.
Args:
class_id: Klassen-ID
subject: Fach
grades: doc_token -> Note Mapping
exam_weight: Gewichtung der Klausur (Standard: 1.0)
"""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.post(
f"{self.school_service_url}/api/classes/{class_id}/grades/aggregate",
json={
"subject": subject,
"grades": grades,
"weight": exam_weight,
"type": "klausur"
}
)
if response.status_code in (200, 201):
return LinkResult(
success=True,
message="Zeugnis-Daten aktualisiert",
target_url=f"/app?module=school&class={class_id}&tab=certificates"
)
return LinkResult(
success=False,
message=f"Fehler: {response.status_code}"
)
except Exception as e:
return LinkResult(
success=False,
message=f"Verbindungsfehler: {str(e)}"
)
# =========================================================================
# KALENDER INTEGRATION
# =========================================================================
async def create_calendar_events(
self,
teacher_id: str,
suggestions: List[ParentMeetingSuggestion],
default_duration_minutes: int = 30
) -> List[CalendarEvent]:
"""
Erstellt Kalender-Eintraege fuer Elterngespraeche.
Args:
teacher_id: ID des Lehrers
suggestions: Liste der Gespraechs-Vorschlaege
default_duration_minutes: Standard-Dauer pro Gespraech
"""
events = []
# Zeitslots generieren (ab naechster Woche, nachmittags)
start_date = datetime.now() + timedelta(days=7 - datetime.now().weekday())
start_date = start_date.replace(hour=14, minute=0, second=0, microsecond=0)
slot_index = 0
for suggestion in suggestions:
# Zeitslot berechnen
event_start = start_date + timedelta(minutes=slot_index * default_duration_minutes)
event_end = event_start + timedelta(minutes=default_duration_minutes)
# Naechster Tag wenn nach 18 Uhr
if event_start.hour >= 18:
start_date += timedelta(days=1)
start_date = start_date.replace(hour=14)
slot_index = 0
event_start = start_date
event_end = event_start + timedelta(minutes=default_duration_minutes)
event = CalendarEvent(
id=f"meeting-{suggestion.doc_token[:8]}",
title=f"Elterngespraech ({suggestion.grade})",
description=f"Anlass: {suggestion.reason}\n\nThemen:\n" +
"\n".join(f"- {t}" for t in suggestion.suggested_topics),
start_time=event_start,
end_time=event_end,
event_type="parent_meeting",
linked_doc_tokens=[suggestion.doc_token]
)
events.append(event)
slot_index += 1
# An Kalender-Service senden
try:
async with httpx.AsyncClient(timeout=10.0) as client:
for event in events:
await client.post(
f"{self.calendar_service_url}/api/events",
json={
"teacher_id": teacher_id,
"title": event.title,
"description": event.description,
"start": event.start_time.isoformat(),
"end": event.end_time.isoformat(),
"type": event.event_type,
"metadata": {
"doc_tokens": event.linked_doc_tokens
}
}
)
except Exception as e:
print(f"[ModuleLinker] Calendar service error: {e}")
return events
# =========================================================================
# STATISTIKEN
# =========================================================================
def calculate_grade_statistics(
self,
results: List[CorrectionResult]
) -> Dict[str, Any]:
"""
Berechnet Notenstatistiken.
Returns:
Dict mit Durchschnitt, Verteilung, Median, etc.
"""
if not results:
return {}
# Notenwerte (fuer Durchschnitt)
grade_values = {
"1+": 0.7, "1": 1.0, "1-": 1.3,
"2+": 1.7, "2": 2.0, "2-": 2.3,
"3+": 2.7, "3": 3.0, "3-": 3.3,
"4+": 3.7, "4": 4.0, "4-": 4.3,
"5+": 4.7, "5": 5.0, "5-": 5.3,
"6": 6.0
}
# Noten sammeln
grades = [r.grade for r in results]
points = [r.score for r in results]
max_points = results[0].max_score if results else 100
# Durchschnitt berechnen
numeric_grades = [grade_values.get(g, 4.0) for g in grades]
avg_grade = sum(numeric_grades) / len(numeric_grades)
# Notenverteilung
distribution = {}
for grade in grades:
distribution[grade] = distribution.get(grade, 0) + 1
# Prozent-Verteilung
percent_distribution = {
"sehr gut (1)": sum(1 for g in grades if g.startswith("1")),
"gut (2)": sum(1 for g in grades if g.startswith("2")),
"befriedigend (3)": sum(1 for g in grades if g.startswith("3")),
"ausreichend (4)": sum(1 for g in grades if g.startswith("4")),
"mangelhaft (5)": sum(1 for g in grades if g.startswith("5")),
"ungenuegend (6)": sum(1 for g in grades if g == "6")
}
return {
"count": len(results),
"average_grade": round(avg_grade, 2),
"average_grade_display": self._numeric_to_grade(avg_grade),
"average_points": round(sum(points) / len(points), 1),
"max_points": max_points,
"average_percent": round((sum(points) / len(points) / max_points) * 100, 1),
"best_grade": min(grades, key=lambda g: grade_values.get(g, 6)),
"worst_grade": max(grades, key=lambda g: grade_values.get(g, 0)),
"median_grade": self._calculate_median_grade(grades),
"distribution": distribution,
"percent_distribution": percent_distribution,
"passing_count": sum(1 for g in grades if not g.startswith("5") and g != "6"),
"failing_count": sum(1 for g in grades if g.startswith("5") or g == "6")
}
def _numeric_to_grade(self, value: float) -> str:
"""Konvertiert Notenwert zu Note."""
if value <= 1.15:
return "1+"
elif value <= 1.5:
return "1"
elif value <= 1.85:
return "1-"
elif value <= 2.15:
return "2+"
elif value <= 2.5:
return "2"
elif value <= 2.85:
return "2-"
elif value <= 3.15:
return "3+"
elif value <= 3.5:
return "3"
elif value <= 3.85:
return "3-"
elif value <= 4.15:
return "4+"
elif value <= 4.5:
return "4"
elif value <= 4.85:
return "4-"
elif value <= 5.15:
return "5+"
elif value <= 5.5:
return "5"
elif value <= 5.85:
return "5-"
else:
return "6"
def _calculate_median_grade(self, grades: List[str]) -> str:
"""Berechnet die Median-Note."""
grade_values = {
"1+": 0.7, "1": 1.0, "1-": 1.3,
"2+": 1.7, "2": 2.0, "2-": 2.3,
"3+": 2.7, "3": 3.0, "3-": 3.3,
"4+": 3.7, "4": 4.0, "4-": 4.3,
"5+": 4.7, "5": 5.0, "5-": 5.3,
"6": 6.0
}
numeric = sorted([grade_values.get(g, 4.0) for g in grades])
n = len(numeric)
if n % 2 == 0:
median = (numeric[n // 2 - 1] + numeric[n // 2]) / 2
else:
median = numeric[n // 2]
return self._numeric_to_grade(median)
# Singleton
_module_linker: Optional[ModuleLinker] = None


def get_module_linker() -> ModuleLinker:
    """Return the shared ModuleLinker instance, creating it on first use."""
    global _module_linker
    instance = _module_linker
    if instance is None:
        instance = ModuleLinker()
        _module_linker = instance
    return instance

View File

@@ -0,0 +1,424 @@
"""
Background Processing Service for Klausur Correction.
Orchestrates the complete correction pipeline:
1. Load documents from storage
2. Run TrOCR for text extraction
3. Run AI correction for grading
4. Save results to database
PRIVACY BY DESIGN:
- Only pseudonymized doc_tokens used throughout
- No student names in processing pipeline
- All data stays on self-hosted infrastructure
"""
import asyncio
import logging
from datetime import datetime
from typing import Optional, List, Callable
from dataclasses import dataclass
from sqlalchemy.orm import Session
from ..db_models import (
ExamSession, PseudonymizedDocument,
SessionStatus, DocumentStatus
)
from ..repository import KlausurRepository
from .trocr_client import get_trocr_client, TrOCRClient
from .vision_ocr_service import get_vision_ocr_service, VisionOCRService
from .correction_service import (
get_correction_service, ExamCorrectionService,
QuestionRubric, CorrectionResult
)
from .storage_service import get_storage_service, KlausurStorageService
logger = logging.getLogger(__name__)
@dataclass
class ProcessingProgress:
    """Progress snapshot for SSE streaming."""
    session_id: str
    total_documents: int
    processed_documents: int
    current_document: Optional[str] = None
    current_step: str = "idle"  # ocr, correction, saving
    error: Optional[str] = None

    @property
    def percentage(self) -> int:
        """Completed fraction as an integer percent (0 when nothing to do)."""
        done, total = self.processed_documents, self.total_documents
        return int(done / total * 100) if total != 0 else 0
class ProcessingService:
"""
Background service for exam correction processing.
Usage:
service = ProcessingService(db_session)
await service.process_session(session_id, teacher_id)
"""
def __init__(
    self,
    db: Session,
    trocr_client: Optional[TrOCRClient] = None,
    vision_ocr_service: Optional[VisionOCRService] = None,
    correction_service: Optional[ExamCorrectionService] = None,
    storage_service: Optional[KlausurStorageService] = None,
    prefer_vision_ocr: bool = True  # vision LLM as primary OCR for handwriting
):
    """Wire up the correction pipeline.

    Dependencies may be injected explicitly (e.g. for testing); any left
    as None fall back to the shared singleton accessors.
    """
    self.db = db
    self.repo = KlausurRepository(db)
    self.prefer_vision_ocr = prefer_vision_ocr
    # Injected dependency or shared singleton fallback
    self.trocr = trocr_client or get_trocr_client()
    self.vision_ocr = vision_ocr_service or get_vision_ocr_service()
    self.correction = correction_service or get_correction_service()
    self.storage = storage_service or get_storage_service()
    # Progress callback for SSE streaming (set via set_progress_callback)
    self._progress_callback: Optional[Callable[[ProcessingProgress], None]] = None
def set_progress_callback(self, callback: Callable[[ProcessingProgress], None]):
"""Set callback for progress updates (SSE streaming)."""
self._progress_callback = callback
def _notify_progress(self, progress: ProcessingProgress):
"""Notify progress to callback if set."""
if self._progress_callback:
try:
self._progress_callback(progress)
except Exception as e:
logger.warning(f"Progress callback failed: {e}")
async def process_session(
    self,
    session_id: str,
    teacher_id: str,
    use_ai_correction: bool = True
) -> bool:
    """
    Process all documents in a session: OCR, optional AI correction,
    status bookkeeping and progress notifications.

    Args:
        session_id: Exam session ID
        teacher_id: Teacher ID for isolation (tenant scoping on all repo calls)
        use_ai_correction: Whether to run AI correction (requires LLM)

    Returns:
        True if the pipeline ran to the end (individual documents may
        still have failed; see per-document status), False if the session
        or its documents could not be found.
    """
    # Session must exist and belong to this teacher
    session = self.repo.get_session(session_id, teacher_id)
    if not session:
        logger.error(f"Session not found: {session_id}")
        return False
    # Get documents
    documents = self.repo.list_documents(session_id, teacher_id)
    if not documents:
        logger.warning(f"No documents in session: {session_id}")
        return False
    total = len(documents)
    processed = 0
    logger.info(f"Starting processing for session {session_id}: {total} documents")
    # Select the OCR backend: Vision-LLM is preferred for handwriting,
    # TrOCR is the fallback; with neither available OCR is skipped.
    vision_ocr_available = await self.vision_ocr.is_available()
    trocr_available = await self.trocr.is_available()
    if vision_ocr_available and self.prefer_vision_ocr:
        logger.info("Using Vision-LLM (llama3.2-vision) for OCR - optimal for handwriting")
        use_vision_ocr = True
    elif trocr_available:
        logger.info("Using TrOCR for OCR")
        use_vision_ocr = False
    elif vision_ocr_available:
        logger.info("TrOCR not available, falling back to Vision-LLM")
        use_vision_ocr = True
    else:
        logger.warning("No OCR service available - OCR will be skipped")
        use_vision_ocr = False
        trocr_available = False  # NOTE(review): already False when this branch is reached
    # Process each document; failures are isolated per document
    for doc in documents:
        progress = ProcessingProgress(
            session_id=session_id,
            total_documents=total,
            processed_documents=processed,
            current_document=doc.doc_token[:8],  # short token only, no personal data
            current_step="ocr"
        )
        self._notify_progress(progress)
        try:
            # Step 1: OCR extraction — only for freshly uploaded documents
            if (vision_ocr_available or trocr_available) and doc.status == DocumentStatus.UPLOADED:
                await self._process_ocr(session_id, doc, teacher_id, use_vision_ocr=use_vision_ocr)
            # Step 2: AI correction
            progress.current_step = "correction"
            self._notify_progress(progress)
            if use_ai_correction and doc.ocr_text:
                await self._process_correction(session, doc, teacher_id)
            else:
                # No AI pass requested/possible: just mark as completed
                self._mark_document_completed(doc, teacher_id)
            processed += 1
        except Exception as e:
            logger.error(f"Failed to process document {doc.doc_token}: {e}")
            self._mark_document_failed(doc, str(e), teacher_id)
    # Update session status
    # NOTE(review): the session is marked COMPLETED even when every
    # document failed — confirm this is intended.
    self.repo.update_session_status(session_id, teacher_id, SessionStatus.COMPLETED)
    # Final progress event for SSE consumers
    progress = ProcessingProgress(
        session_id=session_id,
        total_documents=total,
        processed_documents=processed,
        current_step="complete"
    )
    self._notify_progress(progress)
    logger.info(f"Completed processing session {session_id}: {processed}/{total} documents")
    return True
async def _process_ocr(
    self,
    session_id: str,
    doc: PseudonymizedDocument,
    teacher_id: str,
    use_vision_ocr: bool = True
):
    """
    Run OCR on a document.

    Looks up the scanned image in storage (preferring the redacted
    version so no personal data reaches OCR), extracts text with the
    selected backend, and persists the result on the document row.
    This step never raises: a missing image or an OCR error is recorded
    in ``ocr_text`` and the document still reaches OCR_COMPLETED so the
    pipeline can continue to the correction step.

    Args:
        session_id: Session ID
        doc: Document to process
        teacher_id: Teacher ID (not read here; kept for interface
            symmetry with the other processing steps)
        use_vision_ocr: True to use Vision-LLM (llama3.2-vision), False for TrOCR
    """
    # Mark the document as in-progress before any I/O so the UI can
    # show progress even if OCR takes long.
    doc.status = DocumentStatus.OCR_PROCESSING
    doc.processing_started_at = datetime.utcnow()
    self.db.commit()
    # Try to get document from storage (check both redacted and original).
    # Redacted is preferred: the header with personal data is blanked out.
    image_data = None
    for is_redacted in [True, False]:  # Prefer redacted version
        for ext in ["png", "jpg", "jpeg", "pdf"]:
            image_data = self.storage.get_document(
                session_id, doc.doc_token, ext, is_redacted=is_redacted
            )
            if image_data:
                logger.debug(f"Found document: {doc.doc_token[:8]}.{ext} (redacted={is_redacted})")
                break
        if image_data:
            break
    if not image_data:
        logger.warning(f"No image found for document {doc.doc_token}")
        # Use placeholder OCR text for testing
        doc.ocr_text = "[Kein Bild gefunden - Manuelle Eingabe erforderlich]"
        doc.ocr_confidence = 0
        doc.status = DocumentStatus.OCR_COMPLETED
        self.db.commit()
        return
    # Call OCR service (Vision-LLM or TrOCR)
    try:
        if use_vision_ocr:
            # Use Vision-LLM (llama3.2-vision) - better for handwriting
            result = await self.vision_ocr.extract_text(
                image_data,
                filename=f"{doc.doc_token}.png",
                is_handwriting=True  # Assume handwriting for exams
            )
            ocr_method = "Vision-LLM"
        else:
            # Use TrOCR
            result = await self.trocr.extract_text(
                image_data,
                filename=f"{doc.doc_token}.png",
                detect_lines=True
            )
            ocr_method = "TrOCR"
        doc.ocr_text = result.text
        # Confidence is persisted as an integer percentage (0-100).
        doc.ocr_confidence = int(result.confidence * 100)
        doc.status = DocumentStatus.OCR_COMPLETED
        logger.info(
            f"OCR completed ({ocr_method}) for {doc.doc_token[:8]}: "
            f"{len(result.text)} chars, {result.confidence:.0%} confidence"
        )
    except Exception as e:
        logger.error(f"OCR failed for {doc.doc_token}: {e}")
        # Record the error in ocr_text instead of failing the document,
        # so AI correction (or manual review) can still proceed.
        doc.ocr_text = f"[OCR Fehler: {str(e)[:100]}]"
        doc.ocr_confidence = 0
        doc.status = DocumentStatus.OCR_COMPLETED  # Continue to AI anyway
    self.db.commit()
async def _process_correction(
    self,
    session: ExamSession,
    doc: PseudonymizedDocument,
    teacher_id: str
):
    """Run AI correction on a document.

    Builds grading rubrics from the session's question definitions and
    asks the correction service to grade the OCR text. The document
    always ends in COMPLETED status: if no rubrics exist or the AI call
    fails, the outcome is recorded in ``ai_feedback`` instead of failing
    the document, and the session's processed counter is incremented
    either way.

    Args:
        session: Exam session providing questions/rubric and subject.
        doc: Document whose ``ocr_text`` is graded.
        teacher_id: Teacher ID (not read here; kept for interface
            symmetry with the other processing steps)
    """
    doc.status = DocumentStatus.AI_PROCESSING
    self.db.commit()
    # Build rubrics from session questions
    rubrics = self._build_rubrics(session)
    if not rubrics:
        # No rubrics defined - skip AI grading and recommend manual review.
        doc.ai_feedback = "Keine Bewertungskriterien definiert. Manuelle Korrektur empfohlen."
        doc.ai_score = None
        doc.ai_grade = None
        doc.status = DocumentStatus.COMPLETED
        doc.processing_completed_at = datetime.utcnow()
        self.db.commit()
        # Update session stats
        session.processed_count += 1
        self.db.commit()
        return
    try:
        # Run AI correction
        result = await self.correction.correct_exam(
            doc_token=doc.doc_token,
            ocr_text=doc.ocr_text,
            rubrics=rubrics,
            subject=session.subject or "Allgemein"
        )
        # Save results
        doc.ai_feedback = result.overall_feedback
        doc.ai_score = result.total_score
        doc.ai_grade = result.grade
        # Persist per-question details for later review in the UI.
        doc.ai_details = {
            "max_score": result.max_score,
            "processing_time_ms": result.processing_time_ms,
            "questions": [
                {
                    "number": q.question_number,
                    "points": q.points_awarded,
                    "max_points": q.max_points,
                    "feedback": q.feedback,
                    "strengths": q.strengths,
                    "improvements": q.improvements
                }
                for q in result.question_results
            ]
        }
        doc.status = DocumentStatus.COMPLETED
        doc.processing_completed_at = datetime.utcnow()
        logger.info(
            f"Correction completed for {doc.doc_token[:8]}: "
            f"{result.total_score}/{result.max_score} ({result.grade})"
        )
    except Exception as e:
        logger.error(f"AI correction failed for {doc.doc_token}: {e}")
        doc.ai_feedback = f"KI-Korrektur fehlgeschlagen: {str(e)[:200]}"
        doc.status = DocumentStatus.COMPLETED  # Mark complete anyway
        doc.processing_completed_at = datetime.utcnow()
    # Update session stats
    session.processed_count += 1
    self.db.commit()
def _build_rubrics(self, session: ExamSession) -> List[QuestionRubric]:
    """Translate the session's question definitions into grading rubrics.

    Missing fields fall back to defaults: sequential numbering, a generic
    question label, 10 points, and the session-level rubric text.
    """
    return [
        QuestionRubric(
            question_number=q.get("number", i + 1),
            question_text=q.get("text", f"Frage {i + 1}"),
            max_points=q.get("points", 10),
            expected_answer=q.get("expected_answer", ""),
            grading_criteria=q.get("rubric", session.rubric or "")
        )
        for i, q in enumerate(session.questions or [])
    ]
def _mark_document_completed(
    self,
    doc: PseudonymizedDocument,
    teacher_id: str
):
    """Finalize a document that was processed without AI correction."""
    doc.status = DocumentStatus.COMPLETED
    doc.processing_completed_at = datetime.utcnow()
    # Only fill in a default message when no feedback exists yet.
    if not doc.ai_feedback:
        doc.ai_feedback = "Verarbeitung abgeschlossen (ohne KI-Korrektur)"
    self.db.commit()
    # Bump the parent session's processed counter, if the document
    # is attached to a session.
    parent_session = doc.session
    if parent_session:
        parent_session.processed_count += 1
        self.db.commit()
def _mark_document_failed(
    self,
    doc: PseudonymizedDocument,
    error: str,
    teacher_id: str
):
    """Record a processing failure on the document and persist it."""
    doc.status = DocumentStatus.FAILED
    # The error column is bounded: store at most 500 characters.
    doc.processing_error = error[:500]
    doc.processing_completed_at = datetime.utcnow()
    self.db.commit()
# Background task function for FastAPI
async def process_session_background(
    session_id: str,
    teacher_id: str,
    db_url: str
):
    """
    Background task for session processing.

    This function creates its own DB session for use in background tasks
    (FastAPI background tasks must not reuse the request-scoped session).

    Args:
        session_id: Session to process.
        teacher_id: Owner of the session.
        db_url: NOTE(review): currently unused - the session factory is
            taken from ``..database.SessionLocal`` regardless. Confirm
            whether callers can drop this parameter.
    """
    from ..database import SessionLocal
    db = SessionLocal()
    try:
        service = ProcessingService(db)
        await service.process_session(session_id, teacher_id)
    finally:
        # Always release the connection, even if processing raised.
        db.close()
# Singleton for main service
# NOTE(review): this module-level holder is never assigned by
# get_processing_service below, which returns a fresh instance per call
# (each caller carries its own SQLAlchemy session, so a shared singleton
# would be unsafe). Confirm whether the holder can be removed.
_processing_service: Optional[ProcessingService] = None
def get_processing_service(db: Session) -> ProcessingService:
    """Get processing service instance (a new one per call, bound to *db*)."""
    return ProcessingService(db)

View File

@@ -0,0 +1,376 @@
"""
Pseudonymization Service for Klausurkorrektur.
Implements privacy-by-design principles:
- QR code generation with random doc_tokens
- Header redaction to remove personal data before OCR
- No student identity data leaves the teacher's device
DSGVO Art. 4 Nr. 5 Compliance:
The doc_token is a 128-bit random UUID that cannot be used to
identify a student without the encrypted identity map.
"""
import uuid
import io
import logging
from typing import List, Tuple, Optional
from dataclasses import dataclass
from PIL import Image, ImageDraw, ImageFont
logger = logging.getLogger(__name__)
# Optional imports (graceful fallback if not installed)
try:
import qrcode
HAS_QRCODE = True
except ImportError:
HAS_QRCODE = False
logger.warning("qrcode not installed - QR generation disabled")
try:
import cv2
import numpy as np
HAS_CV2 = True
except ImportError:
HAS_CV2 = False
logger.warning("opencv-python not installed - image processing disabled")
try:
from pyzbar.pyzbar import decode as pyzbar_decode
HAS_PYZBAR = True
except ImportError:
HAS_PYZBAR = False
logger.warning("pyzbar not installed - QR reading disabled")
@dataclass
class RedactionResult:
    """Result of header redaction."""
    redacted_image: bytes        # PNG bytes; on failure, the unmodified input bytes
    original_height: int         # source image height in pixels (0 when redaction failed)
    redacted_height: int         # height of the blanked-out header strip in pixels
    redaction_applied: bool      # False when redaction failed and the input was passed through
@dataclass
class QRDetectionResult:
    """Result of QR code detection."""
    doc_token: Optional[str]     # decoded UUID string, or None if no valid QR found
    confidence: float            # 1.0 on a successful decode, 0.0 otherwise
    bbox: Optional[Tuple[int, int, int, int]]  # x, y, width, height
class PseudonymizationService:
    """
    Service for document pseudonymization.

    PRIVACY GUARANTEES:
    1. doc_tokens are cryptographically random (UUID4)
    2. No deterministic relationship between token and student
    3. Header redaction removes visible personal data
    4. Identity mapping is encrypted client-side

    All image-processing methods degrade gracefully when the optional
    dependencies (qrcode, opencv-python, pyzbar) are missing.
    """

    # Default header height to redact (in pixels, assuming 300 DPI scan)
    DEFAULT_HEADER_HEIGHT = 300  # ~1 inch / 2.5cm

    @staticmethod
    def generate_doc_token() -> str:
        """
        Generate a cryptographically random document token.

        Uses UUID4 which provides 122 bits of randomness.
        This ensures no correlation between tokens is possible.
        """
        return str(uuid.uuid4())

    @staticmethod
    def generate_batch_tokens(count: int) -> List[str]:
        """Generate multiple unique doc_tokens."""
        return [PseudonymizationService.generate_doc_token() for _ in range(count)]

    def generate_qr_code(
        self,
        doc_token: str,
        size: int = 200,
        border: int = 2
    ) -> bytes:
        """
        Generate a QR code image for a doc_token.

        Args:
            doc_token: The pseudonymization token
            size: Size of the QR code in pixels
            border: Border size in QR modules

        Returns:
            PNG image as bytes

        Raises:
            RuntimeError: If the optional qrcode dependency is missing.
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")
        qr = qrcode.QRCode(
            version=1,
            # Medium error correction: QR stays readable with minor damage.
            error_correction=qrcode.constants.ERROR_CORRECT_M,
            box_size=10,
            border=border,
        )
        qr.add_data(doc_token)
        qr.make(fit=True)
        img = qr.make_image(fill_color="black", back_color="white")
        # Rescale to the requested pixel size (LANCZOS keeps edges crisp).
        img = img.resize((size, size), Image.Resampling.LANCZOS)
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

    def generate_qr_sheet(
        self,
        doc_tokens: List[str],
        page_size: Tuple[int, int] = (2480, 3508),  # A4 at 300 DPI
        qr_size: int = 200,
        margin: int = 100,
        labels: Optional[List[str]] = None
    ) -> bytes:
        """
        Generate a printable sheet of QR codes.

        Codes are laid out in a grid; tokens that do not fit on one page
        are skipped with a warning.

        Args:
            doc_tokens: List of tokens to generate QR codes for
            page_size: Page dimensions (width, height) in pixels
            qr_size: Size of each QR code
            margin: Page margin
            labels: Optional labels (e.g., "Nr. 1", "Nr. 2") - NO student names!

        Returns:
            PNG image of the full sheet

        Raises:
            RuntimeError: If the optional qrcode dependency is missing.
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")
        width, height = page_size
        img = Image.new('RGB', (width, height), 'white')
        draw = ImageDraw.Draw(img)
        # Calculate grid
        usable_width = width - 2 * margin
        usable_height = height - 2 * margin
        cell_width = qr_size + 50
        cell_height = qr_size + 80  # Extra space for label
        cols = usable_width // cell_width
        rows = usable_height // cell_height
        # Try to load a font (fallback to default)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
        except (IOError, OSError):
            font = ImageFont.load_default()
        # Generate QR codes
        for i, token in enumerate(doc_tokens):
            if i >= cols * rows:
                logger.warning(f"Sheet full, skipping {len(doc_tokens) - i} tokens")
                break
            row = i // cols
            col = i % cols
            x = margin + col * cell_width
            y = margin + row * cell_height
            # Generate QR code
            qr_bytes = self.generate_qr_code(token, qr_size)
            qr_img = Image.open(io.BytesIO(qr_bytes))
            img.paste(qr_img, (x, y))
            # Add label (number only, NO names)
            label = labels[i] if labels and i < len(labels) else f"Nr. {i + 1}"
            draw.text((x, y + qr_size + 5), label, fill="black", font=font)
            # Add truncated token for verification
            token_short = token[:8] + "..."
            draw.text((x, y + qr_size + 25), token_short, fill="gray", font=font)
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

    def detect_qr_code(self, image_bytes: bytes) -> QRDetectionResult:
        """
        Detect and decode QR code from an image.

        Only QR payloads that parse as a UUID are accepted; other codes
        on the page are ignored. Returns an empty result (token None,
        confidence 0.0) if pyzbar is missing or decoding fails.

        Args:
            image_bytes: Image data (PNG, JPEG, etc.)

        Returns:
            QRDetectionResult with doc_token if found
        """
        if not HAS_PYZBAR:
            return QRDetectionResult(
                doc_token=None,
                confidence=0.0,
                bbox=None
            )
        try:
            img = Image.open(io.BytesIO(image_bytes))
            # Decode QR codes
            decoded = pyzbar_decode(img)
            for obj in decoded:
                if obj.type == 'QRCODE':
                    token = obj.data.decode('utf-8')
                    # Validate it looks like a UUID
                    try:
                        uuid.UUID(token)
                        rect = obj.rect
                        return QRDetectionResult(
                            doc_token=token,
                            confidence=1.0,
                            bbox=(rect.left, rect.top, rect.width, rect.height)
                        )
                    except ValueError:
                        # Not a UUID - keep looking at remaining codes.
                        continue
            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)
        except Exception as e:
            logger.error(f"QR detection failed: {e}")
            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)

    def redact_header(
        self,
        image_bytes: bytes,
        header_height: Optional[int] = None,
        fill_color: Tuple[int, int, int] = (255, 255, 255)
    ) -> RedactionResult:
        """
        Redact the header area of a scanned exam page.

        This removes the area where student name/class/date typically appears.
        The redaction is permanent - no original data is preserved.
        On any error the original bytes are returned with
        ``redaction_applied=False``.

        Args:
            image_bytes: Original scanned image
            header_height: Height in pixels to redact (None = use
                DEFAULT_HEADER_HEIGHT)
            fill_color: RGB color to fill redacted area (default: white)

        Returns:
            RedactionResult with redacted image
        """
        try:
            img = Image.open(io.BytesIO(image_bytes))
            width, height = img.size
            # Determine header height
            redact_height = header_height or self.DEFAULT_HEADER_HEIGHT
            # Create a copy and redact header
            redacted = img.copy()
            draw = ImageDraw.Draw(redacted)
            draw.rectangle([(0, 0), (width, redact_height)], fill=fill_color)
            # Save result
            buffer = io.BytesIO()
            redacted.save(buffer, format="PNG")
            return RedactionResult(
                redacted_image=buffer.getvalue(),
                original_height=height,
                redacted_height=redact_height,
                redaction_applied=True
            )
        except Exception as e:
            logger.error(f"Header redaction failed: {e}")
            # Fall back to returning the input unchanged - callers must
            # check redaction_applied before trusting the output.
            return RedactionResult(
                redacted_image=image_bytes,
                original_height=0,
                redacted_height=0,
                redaction_applied=False
            )

    def smart_redact_header(
        self,
        image_bytes: bytes,
        preserve_qr: bool = True
    ) -> RedactionResult:
        """
        Smart header redaction that detects text regions.

        Redacts the top ~12% of the page, optionally leaving a detected
        QR code area untouched so the token stays machine-readable.
        Falls back to the simple fixed-height redaction when OpenCV is
        not installed or anything fails.

        Args:
            image_bytes: Original scanned image
            preserve_qr: If True, don't redact QR code areas

        Returns:
            RedactionResult with intelligently redacted image
        """
        if not HAS_CV2:
            # Fallback to simple redaction
            return self.redact_header(image_bytes)
        try:
            # Convert to OpenCV format
            nparr = np.frombuffer(image_bytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            height, width = img.shape[:2]
            # Detect QR code position if present
            qr_result = self.detect_qr_code(image_bytes)
            # Calculate redaction area (top portion of page)
            # Typically header is in top 10-15% of page
            header_height = int(height * 0.12)
            # If QR code is in header area, adjust redaction
            if preserve_qr and qr_result.bbox:
                qr_x, qr_y, qr_w, qr_h = qr_result.bbox
                if qr_y < header_height:
                    # QR is in header - redact around it
                    # Create mask
                    mask = np.ones((header_height, width), dtype=np.uint8) * 255
                    # Leave QR area unredacted
                    mask[max(0, qr_y):min(header_height, qr_y + qr_h),
                         max(0, qr_x):min(width, qr_x + qr_w)] = 0
                    # Apply white fill where mask is 255.
                    # img[:header_height] is a numpy view, so the masked
                    # assignment writes through into img itself.
                    img[:header_height][mask == 255] = [255, 255, 255]
                else:
                    # QR not in header - simple redaction
                    img[:header_height] = [255, 255, 255]
            else:
                # Simple header redaction
                img[:header_height] = [255, 255, 255]
            # Encode result
            _, buffer = cv2.imencode('.png', img)
            return RedactionResult(
                redacted_image=buffer.tobytes(),
                original_height=height,
                redacted_height=header_height,
                redaction_applied=True
            )
        except Exception as e:
            logger.error(f"Smart redaction failed: {e}")
            return self.redact_header(image_bytes)
# Module-level singleton holder (created lazily on first access)
_pseudonymizer: Optional[PseudonymizationService] = None


def get_pseudonymizer() -> PseudonymizationService:
    """Return the process-wide PseudonymizationService, creating it on first use."""
    global _pseudonymizer
    instance = _pseudonymizer
    if instance is None:
        instance = PseudonymizationService()
        _pseudonymizer = instance
    return instance

View File

@@ -0,0 +1,502 @@
"""
Roster Parser Service - Klassenbuch und Schuelerlisten parsen.
Unterstuetzt:
- Klassenbuch-Fotos (OCR mit PaddleOCR)
- PDF-Schuelerlisten (SchILD, ASV, etc.)
- CSV-Dateien
- Manuelle Eingabe
Privacy-First:
- Alle Verarbeitung serverseitig (kein externer Upload)
- Daten bleiben im Lehrer-Namespace
"""
import re
import csv
import io
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Tuple
from difflib import SequenceMatcher
# Optionale Imports
try:
from services.file_processor import get_file_processor, ProcessingResult
HAS_OCR = True
except ImportError:
HAS_OCR = False
try:
import fitz # PyMuPDF
HAS_PDF = True
except ImportError:
HAS_PDF = False
@dataclass
class RosterEntry:
    """One entry (student) in a class roster."""
    first_name: str
    last_name: str
    student_number: Optional[str] = None
    parent_email: Optional[str] = None
    parent_phone: Optional[str] = None
    birth_date: Optional[str] = None           # as found in the source, e.g. "01.02.2015"
    additional_data: Dict[str, str] = field(default_factory=dict)
@dataclass
class ParsedRoster:
    """Result of roster parsing."""
    entries: List[RosterEntry]
    source_type: str                           # klassenbuch, pdf, csv
    confidence: float                          # parser confidence in [0, 1]
    warnings: List[str] = field(default_factory=list)
    raw_text: Optional[str] = None             # raw extracted text, for debugging/review
@dataclass
class NameMatch:
    """Result of matching one detected name against the roster."""
    detected_name: str
    matched_entry: Optional[RosterEntry]       # None when no match above threshold
    confidence: float
    match_type: str                            # exact, first_name, fuzzy, none
class RosterParser:
    """
    Parses class rosters from various sources.

    Example:
        parser = RosterParser()
        # Class register photo
        roster = parser.parse_klassenbuch_image(image_bytes)
        # PDF roster
        roster = parser.parse_pdf_roster(pdf_bytes)
        # Match detected names
        matches = parser.match_first_names(
            detected=["Max", "Anna", "Tim"],
            roster=roster.entries
        )
    """

    # Regex patterns for contact data
    EMAIL_PATTERN = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
    PHONE_PATTERN = re.compile(r'(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}')
    DATE_PATTERN = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.(\d{2,4})\b')

    # Common German first names (excerpt, intended for validation)
    # NOTE(review): currently not referenced anywhere in this class -
    # confirm whether it is used elsewhere or can be removed.
    COMMON_FIRST_NAMES = {
        'max', 'anna', 'tim', 'lena', 'paul', 'marie', 'felix', 'emma',
        'leon', 'sophia', 'lukas', 'mia', 'jonas', 'hannah', 'elias', 'emilia',
        'ben', 'lea', 'noah', 'lina', 'finn', 'amelie', 'luis', 'laura',
        'moritz', 'clara', 'henry', 'julia', 'julian', 'emily', 'david', 'johanna',
        'niklas', 'charlotte', 'simon', 'maja', 'alexander', 'sarah', 'jan', 'lisa',
        'tom', 'nele', 'luca', 'sophie', 'erik', 'alina', 'fabian', 'paula',
        'philipp', 'luisa', 'tobias', 'melina', 'vincent', 'lara', 'maximilian', 'elena'
    }

    def __init__(self):
        # OCR backend is optional; None means image parsing is disabled.
        self.file_processor = get_file_processor() if HAS_OCR else None

    # =========================================================================
    # CLASS REGISTER (KLASSENBUCH) PHOTO PARSING
    # =========================================================================
    def parse_klassenbuch_image(self, image_bytes: bytes) -> ParsedRoster:
        """
        Parse a class register photo via OCR.

        Args:
            image_bytes: Image as bytes (PNG, JPG)

        Returns:
            ParsedRoster with extracted student data (empty with a
            warning when OCR is unavailable)
        """
        if not HAS_OCR or not self.file_processor:
            return ParsedRoster(
                entries=[],
                source_type='klassenbuch',
                confidence=0.0,
                warnings=['OCR nicht verfuegbar (PaddleOCR nicht installiert)']
            )
        # Run OCR
        result: ProcessingResult = self.file_processor.process_file(
            image_bytes,
            filename='klassenbuch.png',
            processing_mode='ocr_handwriting'
        )
        # Split text into lines; each non-trivial line may be one student.
        lines = result.text.split('\n')
        entries = []
        warnings = []
        for line in lines:
            line = line.strip()
            if not line or len(line) < 3:
                continue
            entry = self._parse_roster_line(line)
            if entry:
                entries.append(entry)
        return ParsedRoster(
            entries=entries,
            source_type='klassenbuch',
            confidence=result.confidence,
            warnings=warnings,
            raw_text=result.text
        )

    def _parse_roster_line(self, line: str) -> Optional[RosterEntry]:
        """Parse a single line from the class register into a RosterEntry.

        Strips a leading ordinal (e.g. "1. "), then extracts email, phone
        and birth date; whatever remains is treated as the name.
        Returns None when no usable name is left.
        """
        # Normalize whitespace
        line = re.sub(r'\s+', ' ', line).strip()
        # Remove leading number (e.g. "1. Max Mustermann")
        line = re.sub(r'^\d+[\.\)\s]+', '', line)
        # Extract email
        email_match = self.EMAIL_PATTERN.search(line)
        email = email_match.group() if email_match else None
        if email:
            line = line.replace(email, '')
        # Extract phone number
        phone_match = self.PHONE_PATTERN.search(line)
        phone = phone_match.group() if phone_match else None
        if phone:
            line = line.replace(phone, '')
        # Extract birth date
        date_match = self.DATE_PATTERN.search(line)
        birth_date = date_match.group() if date_match else None
        if birth_date:
            line = line.replace(birth_date, '')
        # Parse the name (remainder of the line)
        line = re.sub(r'\s+', ' ', line).strip()
        if not line:
            return None
        first_name, last_name = self._parse_name(line)
        if not first_name:
            return None
        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone,
            birth_date=birth_date
        )

    def _parse_name(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Split a name into first and last name.

        Supported formats:
            - "Max Mustermann"
            - "Mustermann, Max"
            - "Max M."
            - "Max"
        """
        text = text.strip()
        if not text:
            return None, None
        # Format: "Lastname, Firstname"
        if ',' in text:
            parts = text.split(',', 1)
            last_name = parts[0].strip()
            first_name = parts[1].strip() if len(parts) > 1 else ''
            return first_name, last_name
        # Format: "Firstname Lastname" or just "Firstname"
        parts = text.split()
        if len(parts) == 1:
            return parts[0], None
        elif len(parts) == 2:
            return parts[0], parts[1]
        else:
            # First part is the first name, the rest is the last name
            return parts[0], ' '.join(parts[1:])

    # =========================================================================
    # PDF ROSTER PARSING
    # =========================================================================
    def parse_pdf_roster(self, pdf_bytes: bytes) -> ParsedRoster:
        """
        Parse a PDF student roster.

        Supports common school-administration exports:
        - SchILD-NRW
        - ASV (Bavaria)
        - Untis
        - Generic CSV-in-PDF

        Tables are extracted per page; pages without tables fall back to
        line-by-line parsing. Duplicates are removed at the end.
        """
        if not HAS_PDF:
            return ParsedRoster(
                entries=[],
                source_type='pdf',
                confidence=0.0,
                warnings=['PDF-Parsing nicht verfuegbar (PyMuPDF nicht installiert)']
            )
        entries = []
        warnings = []
        raw_text = ''
        try:
            doc = fitz.open(stream=pdf_bytes, filetype='pdf')
            for page in doc:
                text = page.get_text()
                raw_text += text + '\n'
                # Extract tables
                tables = page.find_tables()
                for table in tables:
                    df = table.to_pandas()
                    for _, row in df.iterrows():
                        entry = self._parse_table_row(row.to_dict())
                        if entry:
                            entries.append(entry)
                # If no tables found: parse line by line
                # NOTE(review): `tables` is PyMuPDF's table-finder result,
                # not a plain list - confirm its truthiness reflects
                # "no tables detected" on the installed PyMuPDF version.
                if not tables:
                    for line in text.split('\n'):
                        entry = self._parse_roster_line(line)
                        if entry:
                            entries.append(entry)
            doc.close()
        except Exception as e:
            warnings.append(f'PDF-Parsing Fehler: {str(e)}')
        # Remove duplicates
        entries = self._deduplicate_entries(entries)
        return ParsedRoster(
            entries=entries,
            source_type='pdf',
            confidence=0.9 if entries else 0.0,
            warnings=warnings,
            raw_text=raw_text
        )

    def _parse_table_row(self, row: Dict) -> Optional[RosterEntry]:
        """Parse one table row into a RosterEntry.

        Column headers are matched case-insensitively against known
        German/English aliases. Returns None when no first name is found.
        """
        # Column mappings (various export formats)
        name_columns = ['name', 'schueler', 'schüler', 'student', 'nachname', 'last_name']
        first_name_columns = ['vorname', 'first_name', 'firstname']
        email_columns = ['email', 'e-mail', 'mail', 'eltern_email', 'parent_email']
        phone_columns = ['telefon', 'phone', 'tel', 'handy', 'mobile', 'eltern_tel']
        first_name = None
        last_name = None
        email = None
        phone = None
        for key, value in row.items():
            if not value or str(value).strip() == '':
                continue
            key_lower = str(key).lower()
            value_str = str(value).strip()
            if any(col in key_lower for col in first_name_columns):
                first_name = value_str
            elif any(col in key_lower for col in name_columns):
                # Could be "Firstname Lastname" or just "Lastname"
                if first_name:
                    last_name = value_str
                else:
                    first_name, last_name = self._parse_name(value_str)
            elif any(col in key_lower for col in email_columns):
                if self.EMAIL_PATTERN.match(value_str):
                    email = value_str
            elif any(col in key_lower for col in phone_columns):
                phone = value_str
        if not first_name:
            return None
        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone
        )

    # =========================================================================
    # CSV PARSING
    # =========================================================================
    def parse_csv_roster(self, csv_content: str) -> ParsedRoster:
        """
        Parse a CSV student roster.

        The delimiter is auto-detected; if CSV parsing fails entirely,
        the content is parsed line by line as free text.

        Args:
            csv_content: CSV as a string

        Returns:
            ParsedRoster
        """
        entries = []
        warnings = []
        try:
            # Guess the delimiter from a sample
            dialect = csv.Sniffer().sniff(csv_content[:1024])
            reader = csv.DictReader(io.StringIO(csv_content), dialect=dialect)
            for row in reader:
                entry = self._parse_table_row(row)
                if entry:
                    entries.append(entry)
        except csv.Error as e:
            warnings.append(f'CSV-Parsing Fehler: {str(e)}')
            # Fallback: parse line by line
            for line in csv_content.split('\n'):
                entry = self._parse_roster_line(line)
                if entry:
                    entries.append(entry)
        return ParsedRoster(
            entries=entries,
            source_type='csv',
            confidence=0.95 if entries else 0.0,
            warnings=warnings,
            raw_text=csv_content
        )

    # =========================================================================
    # NAME MATCHING
    # =========================================================================
    def match_first_names(
        self,
        detected: List[str],
        roster: List[RosterEntry],
        threshold: float = 0.7
    ) -> List[NameMatch]:
        """
        Match detected first names against roster entries.

        Greedy one-to-one matching: each roster entry is consumed by at
        most one detected name, in detection order. Exact matches win
        immediately; otherwise the best prefix or fuzzy match at or above
        *threshold* is used.

        Args:
            detected: Detected first names (e.g. ["Max", "Anna"])
            roster: Complete student roster
            threshold: Minimum confidence for fuzzy matching

        Returns:
            One NameMatch per detected name (match_type "none" when
            nothing qualified)
        """
        matches = []
        used_entries = set()
        for name in detected:
            name_lower = name.lower().strip()
            best_match = None
            best_confidence = 0.0
            match_type = 'none'
            for i, entry in enumerate(roster):
                if i in used_entries:
                    continue
                entry_first_lower = entry.first_name.lower().strip()
                # Exact match - take it and stop searching
                if name_lower == entry_first_lower:
                    best_match = entry
                    best_confidence = 1.0
                    match_type = 'exact'
                    used_entries.add(i)
                    break
                # Prefix match (e.g. "Max" matches "Maximilian");
                # confidence is the length ratio of the two names.
                if entry_first_lower.startswith(name_lower) or name_lower.startswith(entry_first_lower):
                    confidence = min(len(name_lower), len(entry_first_lower)) / max(len(name_lower), len(entry_first_lower))
                    if confidence > best_confidence and confidence >= threshold:
                        best_match = entry
                        best_confidence = confidence
                        match_type = 'first_name'
                # Fuzzy match via sequence similarity
                ratio = SequenceMatcher(None, name_lower, entry_first_lower).ratio()
                if ratio > best_confidence and ratio >= threshold:
                    best_match = entry
                    best_confidence = ratio
                    match_type = 'fuzzy'
            if best_match and match_type != 'exact':
                # Mark the winning entry as consumed (exact matches were
                # already marked inside the loop).
                for i, entry in enumerate(roster):
                    if entry is best_match:
                        used_entries.add(i)
                        break
            matches.append(NameMatch(
                detected_name=name,
                matched_entry=best_match,
                confidence=best_confidence,
                match_type=match_type
            ))
        return matches

    # =========================================================================
    # HELPERS
    # =========================================================================
    def _deduplicate_entries(self, entries: List[RosterEntry]) -> List[RosterEntry]:
        """Remove duplicates based on (first name, last name), keeping order."""
        seen = set()
        unique = []
        for entry in entries:
            key = (entry.first_name.lower(), entry.last_name.lower())
            if key not in seen:
                seen.add(key)
                unique.append(entry)
        return unique

    def validate_entry(self, entry: RosterEntry) -> List[str]:
        """Validate a RosterEntry and return a list of warnings (empty = OK)."""
        warnings = []
        # Check first name
        if not entry.first_name:
            warnings.append('Kein Vorname')
        elif len(entry.first_name) < 2:
            warnings.append('Vorname zu kurz')
        # Validate email
        if entry.parent_email and not self.EMAIL_PATTERN.match(entry.parent_email):
            warnings.append('Ungueltige Email-Adresse')
        return warnings
# Module-level singleton holder (created lazily on first access)
_roster_parser: Optional[RosterParser] = None


def get_roster_parser() -> RosterParser:
    """Return the process-wide RosterParser, creating it on first use."""
    global _roster_parser
    instance = _roster_parser
    if instance is None:
        instance = RosterParser()
        _roster_parser = instance
    return instance

View File

@@ -0,0 +1,613 @@
"""
School Resolver Service - Schul-Auswahl und Klassen-Erstellung.
Funktionen:
- Bundesland -> Schulform -> Schule Kaskade
- Auto-Erstellung von Klassen aus erkannten Daten
- Integration mit Go School Service (Port 8084)
Privacy:
- Schuldaten sind Stammdaten (kein DSGVO-Problem)
- Schueler-Erstellung nur im Lehrer-Namespace
"""
import httpx
import os
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
from enum import Enum
# ============================================================================
# CONSTANTS
# ============================================================================

# German federal states, keyed by their two-letter code.
BUNDESLAENDER = {
    "BW": "Baden-Wuerttemberg",
    "BY": "Bayern",
    "BE": "Berlin",
    "BB": "Brandenburg",
    "HB": "Bremen",
    "HH": "Hamburg",
    "HE": "Hessen",
    "MV": "Mecklenburg-Vorpommern",
    "NI": "Niedersachsen",
    "NW": "Nordrhein-Westfalen",
    "RP": "Rheinland-Pfalz",
    "SL": "Saarland",
    "SN": "Sachsen",
    "ST": "Sachsen-Anhalt",
    "SH": "Schleswig-Holstein",
    "TH": "Thueringen"
}

# German school types: display name, covered grade levels, and short code.
SCHULFORMEN = {
    "grundschule": {
        "name": "Grundschule",
        "grades": [1, 2, 3, 4],
        "short": "GS"
    },
    "hauptschule": {
        "name": "Hauptschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "HS"
    },
    "realschule": {
        "name": "Realschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "RS"
    },
    "gymnasium": {
        "name": "Gymnasium",
        "grades": [5, 6, 7, 8, 9, 10, 11, 12, 13],
        "short": "GYM"
    },
    "gesamtschule": {
        "name": "Gesamtschule",
        "grades": [5, 6, 7, 8, 9, 10, 11, 12, 13],
        "short": "IGS"
    },
    "oberschule": {
        "name": "Oberschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "OBS"
    },
    "sekundarschule": {
        "name": "Sekundarschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "SEK"
    },
    "foerderschule": {
        "name": "Foerderschule",
        "grades": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "short": "FS"
    },
    "berufsschule": {
        "name": "Berufsschule",
        "grades": [10, 11, 12, 13],
        "short": "BS"
    },
    "fachoberschule": {
        "name": "Fachoberschule",
        "grades": [11, 12, 13],
        "short": "FOS"
    }
}

# School subjects with their standard display names and short codes.
FAECHER = {
    "mathematik": {"name": "Mathematik", "short": "Ma"},
    "deutsch": {"name": "Deutsch", "short": "De"},
    "englisch": {"name": "Englisch", "short": "En"},
    "franzoesisch": {"name": "Franzoesisch", "short": "Fr"},
    "spanisch": {"name": "Spanisch", "short": "Sp"},
    "latein": {"name": "Latein", "short": "La"},
    "physik": {"name": "Physik", "short": "Ph"},
    "chemie": {"name": "Chemie", "short": "Ch"},
    "biologie": {"name": "Biologie", "short": "Bio"},
    "geschichte": {"name": "Geschichte", "short": "Ge"},
    "erdkunde": {"name": "Erdkunde", "short": "Ek"},
    "politik": {"name": "Politik", "short": "Po"},
    "wirtschaft": {"name": "Wirtschaft", "short": "Wi"},
    "kunst": {"name": "Kunst", "short": "Ku"},
    "musik": {"name": "Musik", "short": "Mu"},
    "sport": {"name": "Sport", "short": "Sp"},
    "religion": {"name": "Religion", "short": "Re"},
    "ethik": {"name": "Ethik", "short": "Et"},
    "informatik": {"name": "Informatik", "short": "If"},
    "sachunterricht": {"name": "Sachunterricht", "short": "SU"}
}
# ============================================================================
# DATA CLASSES
# ============================================================================
@dataclass
class School:
    """School (master data - no DSGVO concerns)."""
    id: str
    name: str
    bundesland: str          # presumably a BUNDESLAENDER key, e.g. "NI" - confirm against callers
    schulform: str           # presumably a SCHULFORMEN key, e.g. "gymnasium" - confirm against callers
    address: Optional[str] = None
    city: Optional[str] = None
@dataclass
class SchoolClass:
    """School class."""
    id: str
    school_id: str
    name: str                # e.g. "3a"
    grade_level: int         # e.g. 3
    school_year: str         # e.g. "2025/2026"
    teacher_id: str
    student_count: int = 0
@dataclass
class Student:
    """Student (master data; no PII in the exam-correction context)."""
    id: str
    class_id: str
    first_name: str
    last_name: str
    student_number: Optional[str] = None
@dataclass
class DetectedClassInfo:
    """Class information detected from exam documents."""
    class_name: str                      # e.g. "3a"
    grade_level: Optional[int] = None    # e.g. 3
    subject: Optional[str] = None
    date: Optional[str] = None
    students: List[Dict[str, str]] = field(default_factory=list)
    confidence: float = 0.0              # detection confidence in [0, 1]
@dataclass
class SchoolContext:
    """Complete school context for one teacher."""
    teacher_id: str
    school: Optional[School] = None
    classes: List[SchoolClass] = field(default_factory=list)
    current_school_year: str = "2025/2026"
# ============================================================================
# SCHOOL RESOLVER
# ============================================================================
class SchoolResolver:
    """
    Manages school and class context for a teacher.
    Talks to the external school service and keeps in-memory fallbacks
    when that service is unreachable.
    Example:
        resolver = SchoolResolver()
        # School selection cascade (state -> school type -> name)
        schools = await resolver.search_schools("Niedersachsen", "Grundschule", "Jever")
        # Auto-create a class from detected exam data
        class_obj = await resolver.auto_create_class(
            teacher_id="teacher-123",
            school_id="school-456",
            detected_info=DetectedClassInfo(
                class_name="3a",
                students=[{"firstName": "Max"}, {"firstName": "Anna"}]
            )
        )
    """
    def __init__(self):
        # Base URL of the external school service; overridable via env var.
        self.school_service_url = os.getenv(
            "SCHOOL_SERVICE_URL",
            "http://school-service:8084"
        )
        # In-memory fallback stores, used when the service is unreachable.
        # NOTE(review): these are per-process and not persisted - verify that
        # losing them on restart is acceptable.
        self._local_schools: Dict[str, School] = {}
        self._local_classes: Dict[str, SchoolClass] = {}
    # =========================================================================
    # BUNDESLAND / SCHULFORM LOOKUP
    # =========================================================================
    def get_bundeslaender(self) -> Dict[str, str]:
        """Return the mapping of all German federal states (Bundeslaender)."""
        return BUNDESLAENDER
    def get_schulformen(self) -> Dict[str, Dict]:
        """Return all supported school types (Schulformen)."""
        return SCHULFORMEN
    def get_faecher(self) -> Dict[str, Dict]:
        """Return all supported subjects (Faecher)."""
        return FAECHER
    def get_grades_for_schulform(self, schulform: str) -> List[int]:
        """Return the grade levels offered by the given school type."""
        if schulform in SCHULFORMEN:
            return SCHULFORMEN[schulform]["grades"]
        return list(range(1, 14))  # default: all grade levels (1-13)
    def detect_grade_from_class_name(self, class_name: str) -> Optional[int]:
        """
        Detect the grade level from a class name.
        Examples:
            - "3a" -> 3
            - "10b" -> 10
            - "Q1" -> 11
            - "EF" -> 10
        """
        import re
        # Standard format: one or two digits plus an optional letter.
        match = re.match(r'^(\d{1,2})[a-zA-Z]?$', class_name)
        if match:
            return int(match.group(1))
        # Upper-secondary (Oberstufe) naming schemes.
        upper_grades = {
            'ef': 10, 'e': 10,
            'q1': 11, 'q2': 12,
            'k1': 11, 'k2': 12,
            '11': 11, '12': 12, '13': 13
        }
        class_lower = class_name.lower()
        if class_lower in upper_grades:
            return upper_grades[class_lower]
        return None
    def normalize_subject(self, detected_subject: str) -> Optional[str]:
        """
        Normalize a detected subject name to its canonical key.
        Example: "Mathe" -> "mathematik"
        """
        subject_lower = detected_subject.lower().strip()
        # Exact match against the canonical subject keys.
        if subject_lower in FAECHER:
            return subject_lower
        # Common abbreviations and variants.
        subject_aliases = {
            'mathe': 'mathematik',
            'bio': 'biologie',
            'phy': 'physik',
            'che': 'chemie',
            'geo': 'erdkunde',
            'geographie': 'erdkunde',
            'powi': 'politik',
            'sowi': 'politik',
            'reli': 'religion',
            'info': 'informatik',
            'su': 'sachunterricht'
        }
        if subject_lower in subject_aliases:
            return subject_aliases[subject_lower]
        # Prefix-based fuzzy match.
        # NOTE(review): an empty input matches the first key via
        # startswith("") - verify callers never pass "".
        for key in FAECHER:
            if key.startswith(subject_lower) or subject_lower.startswith(key[:3]):
                return key
        return None
    # =========================================================================
    # SCHOOL SERVICE INTEGRATION
    # =========================================================================
    async def search_schools(
        self,
        bundesland: Optional[str] = None,
        schulform: Optional[str] = None,
        name_query: Optional[str] = None,
        limit: int = 20
    ) -> List[School]:
        """
        Search schools via the school service.
        Args:
            bundesland: Federal-state code (e.g. "NI")
            schulform: School-type key (e.g. "grundschule")
            name_query: Search term for the school name
            limit: Maximum number of results
        Returns:
            Matching schools; an empty list on service errors or non-200.
        """
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                params = {}
                if bundesland:
                    params['state'] = bundesland
                if schulform:
                    params['type'] = schulform
                if name_query:
                    params['q'] = name_query
                params['limit'] = limit
                response = await client.get(
                    f"{self.school_service_url}/api/schools",
                    params=params
                )
                if response.status_code == 200:
                    data = response.json()
                    return [
                        School(
                            id=s['id'],
                            name=s['name'],
                            bundesland=s.get('state', bundesland or ''),
                            schulform=s.get('type', schulform or ''),
                            address=s.get('address'),
                            city=s.get('city')
                        )
                        for s in data.get('schools', [])
                    ]
        except Exception as e:
            print(f"[SchoolResolver] Service error: {e}")
        # Fallback: empty list (the school can still be created manually).
        return []
    async def get_or_create_school(
        self,
        teacher_id: str,
        bundesland: str,
        schulform: str,
        school_name: str,
        city: Optional[str] = None
    ) -> School:
        """
        Fetch or create a school.
        Returns the existing school when one matches; otherwise creates it
        via the service, falling back to a local-only record on failure.
        """
        # Look for an existing match first.
        existing = await self.search_schools(
            bundesland=bundesland,
            schulform=schulform,
            name_query=school_name,
            limit=1
        )
        if existing:
            return existing[0]
        # Create a new school via the service.
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/schools",
                    json={
                        "name": school_name,
                        "state": bundesland,
                        "type": schulform,
                        "city": city,
                        "created_by": teacher_id
                    }
                )
                if response.status_code in (200, 201):
                    data = response.json()
                    return School(
                        id=data['id'],
                        name=school_name,
                        bundesland=bundesland,
                        schulform=schulform,
                        city=city
                    )
        except Exception as e:
            print(f"[SchoolResolver] Create school error: {e}")
        # Fallback: create a local-only school record.
        import uuid
        school_id = str(uuid.uuid4())
        school = School(
            id=school_id,
            name=school_name,
            bundesland=bundesland,
            schulform=schulform,
            city=city
        )
        self._local_schools[school_id] = school
        return school
    # =========================================================================
    # CLASS MANAGEMENT
    # =========================================================================
    async def get_classes_for_teacher(
        self,
        teacher_id: str,
        school_id: Optional[str] = None
    ) -> List[SchoolClass]:
        """Fetch all classes of a teacher, optionally scoped to one school."""
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                params = {"teacher_id": teacher_id}
                if school_id:
                    params["school_id"] = school_id
                response = await client.get(
                    f"{self.school_service_url}/api/classes",
                    params=params
                )
                if response.status_code == 200:
                    data = response.json()
                    return [
                        SchoolClass(
                            id=c['id'],
                            school_id=c.get('school_id', ''),
                            name=c['name'],
                            grade_level=c.get('grade_level', 0),
                            school_year=c.get('school_year', '2025/2026'),
                            teacher_id=teacher_id,
                            student_count=c.get('student_count', 0)
                        )
                        for c in data.get('classes', [])
                    ]
        except Exception as e:
            print(f"[SchoolResolver] Get classes error: {e}")
        # Fallback: locally created classes only (not filtered by teacher).
        return list(self._local_classes.values())
    async def auto_create_class(
        self,
        teacher_id: str,
        school_id: str,
        detected_info: DetectedClassInfo,
        school_year: str = "2025/2026"
    ) -> SchoolClass:
        """
        Automatically create a class from detected exam data.
        Args:
            teacher_id: ID of the teacher
            school_id: ID of the school
            detected_info: Information detected from scanned exams
            school_year: School-year label
        Returns:
            The created class (service-backed, or local-only on failure).
        """
        # Prefer the explicitly detected grade; otherwise derive it from the name.
        grade_level = detected_info.grade_level or self.detect_grade_from_class_name(
            detected_info.class_name
        ) or 0
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/classes",
                    json={
                        "school_id": school_id,
                        "name": detected_info.class_name,
                        "grade_level": grade_level,
                        "school_year": school_year,
                        "teacher_id": teacher_id
                    }
                )
                if response.status_code in (200, 201):
                    data = response.json()
                    class_id = data['id']
                    # Attach the detected students to the new class.
                    if detected_info.students:
                        await self._bulk_create_students(
                            class_id,
                            detected_info.students
                        )
                    return SchoolClass(
                        id=class_id,
                        school_id=school_id,
                        name=detected_info.class_name,
                        grade_level=grade_level,
                        school_year=school_year,
                        teacher_id=teacher_id,
                        student_count=len(detected_info.students)
                    )
        except Exception as e:
            print(f"[SchoolResolver] Create class error: {e}")
        # Fallback: create a local-only class record.
        import uuid
        class_id = str(uuid.uuid4())
        school_class = SchoolClass(
            id=class_id,
            school_id=school_id,
            name=detected_info.class_name,
            grade_level=grade_level,
            school_year=school_year,
            teacher_id=teacher_id,
            student_count=len(detected_info.students)
        )
        self._local_classes[class_id] = school_class
        return school_class
    async def _bulk_create_students(
        self,
        class_id: str,
        students: List[Dict[str, str]]
    ) -> List[Student]:
        """Create several students in one service call; returns only those created."""
        created = []
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/classes/{class_id}/students/bulk",
                    json={
                        "students": [
                            {
                                # Accept both camelCase and snake_case input keys.
                                "first_name": s.get("firstName", s.get("first_name", "")),
                                "last_name": s.get("lastName", s.get("last_name", ""))
                            }
                            for s in students
                        ]
                    }
                )
                if response.status_code in (200, 201):
                    data = response.json()
                    created = [
                        Student(
                            id=s['id'],
                            class_id=class_id,
                            first_name=s['first_name'],
                            last_name=s.get('last_name', '')
                        )
                        for s in data.get('students', [])
                    ]
        except Exception as e:
            print(f"[SchoolResolver] Bulk create students error: {e}")
        return created
    # =========================================================================
    # CONTEXT MANAGEMENT
    # =========================================================================
    async def get_teacher_context(self, teacher_id: str) -> SchoolContext:
        """
        Fetch the complete school context for a teacher.
        Includes the school, all classes, and the current school year.
        """
        context = SchoolContext(teacher_id=teacher_id)
        # Load the teacher's classes.
        classes = await self.get_classes_for_teacher(teacher_id)
        context.classes = classes
        # Derive the school from the first class.
        # NOTE(review): the unfiltered search may not include the school if
        # the service paginates beyond the default limit - verify.
        if classes and classes[0].school_id:
            schools = await self.search_schools()
            for school in schools:
                if school.id == classes[0].school_id:
                    context.school = school
                    break
        return context
# Module-level singleton holder.
_school_resolver: Optional[SchoolResolver] = None
def get_school_resolver() -> SchoolResolver:
    """Return the process-wide SchoolResolver instance, creating it lazily."""
    global _school_resolver
    if _school_resolver is not None:
        return _school_resolver
    _school_resolver = SchoolResolver()
    return _school_resolver

View File

@@ -0,0 +1,197 @@
"""
Storage Service for Klausur Documents.
PRIVACY BY DESIGN:
- Documents stored with doc_token as identifier (not student names)
- Organized by session_id/doc_token for teacher isolation
- Auto-cleanup when retention period expires
"""
import os
import io
import logging
from typing import Optional, BinaryIO
from pathlib import Path
from minio import Minio
from minio.error import S3Error
logger = logging.getLogger(__name__)
class KlausurStorageService:
    """
    MinIO/S3 Storage Service for exam documents.
    Structure:
        klausur-exams/
            {session_id}/
                {doc_token}.{ext}
                {doc_token}_redacted.{ext}  # After header redaction
    """
    def __init__(self):
        # Connection settings from the environment; defaults match the dev stack.
        self.endpoint = os.getenv("MINIO_ENDPOINT", "minio:9000")
        self.access_key = os.getenv("MINIO_ROOT_USER", "breakpilot_dev")
        self.secret_key = os.getenv("MINIO_ROOT_PASSWORD", "breakpilot_dev_123")
        self.secure = os.getenv("MINIO_SECURE", "false").lower() == "true"
        self.bucket_name = os.getenv("KLAUSUR_BUCKET", "klausur-exams")
        self._client: Optional[Minio] = None
    @property
    def client(self) -> Minio:
        """Lazy-init MinIO client; ensures the bucket exists on first use."""
        if self._client is None:
            self._client = Minio(
                self.endpoint,
                access_key=self.access_key,
                secret_key=self.secret_key,
                secure=self.secure
            )
            self._ensure_bucket()
        return self._client
    def _ensure_bucket(self):
        """Create bucket if it doesn't exist."""
        try:
            if not self._client.bucket_exists(self.bucket_name):
                self._client.make_bucket(self.bucket_name)
                logger.info(f"Created Klausur bucket: {self.bucket_name}")
        except S3Error as e:
            # Non-fatal here: a later put/get will surface the real error.
            logger.warning(f"MinIO bucket check failed: {e}")
    def upload_document(
        self,
        session_id: str,
        doc_token: str,
        file_data: bytes,
        file_extension: str = "png",
        is_redacted: bool = False
    ) -> str:
        """
        Upload exam document to storage.
        Args:
            session_id: Exam session ID
            doc_token: Pseudonymized document token
            file_data: Document binary data
            file_extension: File extension (png, jpg, pdf)
            is_redacted: Whether this is the redacted version
        Returns:
            Object path in storage
        Raises:
            S3Error: if the upload fails
        """
        suffix = "_redacted" if is_redacted else ""
        object_name = f"{session_id}/{doc_token}{suffix}.{file_extension}"
        # Determine content type (defaults to octet-stream for unknown extensions).
        content_types = {
            "png": "image/png",
            "jpg": "image/jpeg",
            "jpeg": "image/jpeg",
            "pdf": "application/pdf",
        }
        content_type = content_types.get(file_extension.lower(), "application/octet-stream")
        try:
            self.client.put_object(
                bucket_name=self.bucket_name,
                object_name=object_name,
                data=io.BytesIO(file_data),
                length=len(file_data),
                content_type=content_type
            )
            logger.info(f"Uploaded document: {object_name}")
            return object_name
        except S3Error as e:
            logger.error(f"Failed to upload document: {e}")
            raise
    def get_document(
        self,
        session_id: str,
        doc_token: str,
        file_extension: str = "png",
        is_redacted: bool = False
    ) -> Optional[bytes]:
        """
        Download exam document from storage.
        Args:
            session_id: Exam session ID
            doc_token: Pseudonymized document token
            file_extension: File extension
            is_redacted: Whether to get the redacted version
        Returns:
            Document binary data, or None if the object does not exist
        Raises:
            S3Error: on storage errors other than a missing key
        """
        suffix = "_redacted" if is_redacted else ""
        object_name = f"{session_id}/{doc_token}{suffix}.{file_extension}"
        try:
            response = self.client.get_object(self.bucket_name, object_name)
            data = response.read()
            # Release the underlying HTTP connection back to the pool.
            response.close()
            response.release_conn()
            return data
        except S3Error as e:
            if e.code == "NoSuchKey":
                # A missing document is an expected case, not an error.
                logger.warning(f"Document not found: {object_name}")
                return None
            logger.error(f"Failed to get document: {e}")
            raise
    def delete_session_documents(self, session_id: str) -> int:
        """
        Delete all documents for a session.
        Args:
            session_id: Exam session ID
        Returns:
            Number of deleted objects
        Raises:
            S3Error: if listing/deletion fails (may leave a partial delete)
        """
        deleted_count = 0
        prefix = f"{session_id}/"
        # NOTE(review): list_objects is non-recursive by default; that is fine
        # for the flat {session_id}/{doc_token} layout, but nested keys would
        # be missed if the layout ever changes - verify before restructuring.
        try:
            objects = self.client.list_objects(self.bucket_name, prefix=prefix)
            for obj in objects:
                self.client.remove_object(self.bucket_name, obj.object_name)
                deleted_count += 1
                logger.debug(f"Deleted: {obj.object_name}")
            logger.info(f"Deleted {deleted_count} documents for session {session_id}")
            return deleted_count
        except S3Error as e:
            logger.error(f"Failed to delete session documents: {e}")
            raise
    def document_exists(
        self,
        session_id: str,
        doc_token: str,
        file_extension: str = "png"
    ) -> bool:
        """Check if a document exists in storage (non-redacted variant only)."""
        object_name = f"{session_id}/{doc_token}.{file_extension}"
        try:
            self.client.stat_object(self.bucket_name, object_name)
            return True
        except S3Error:
            return False
# Shared storage-service instance for this process.
_storage_service: Optional[KlausurStorageService] = None
def get_storage_service() -> KlausurStorageService:
    """Return the lazily-created storage service singleton."""
    global _storage_service
    if _storage_service is not None:
        return _storage_service
    _storage_service = KlausurStorageService()
    return _storage_service

View File

@@ -0,0 +1,214 @@
"""
TrOCR Client - Connects to external TrOCR service (Mac Mini).
This client forwards OCR requests to the TrOCR service running on
the Mac Mini, enabling handwriting recognition without requiring
local GPU/ML dependencies.
Privacy: Images are sent over the local network only - no cloud.
"""
import os
import httpx
import logging
from typing import Optional, List, Dict
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# Base URL of the external TrOCR service (Mac Mini on the local network).
# Overridable via the TROCR_SERVICE_URL environment variable.
TROCR_SERVICE_URL = os.environ.get(
    "TROCR_SERVICE_URL",
    "http://192.168.178.163:8084"
)
@dataclass
class OCRResult:
    """Result from TrOCR extraction."""
    text: str  # extracted (possibly multi-line) text
    confidence: float  # confidence estimate in [0, 1]
    processing_time_ms: int  # processing time reported by the service
    device: str = "remote"  # where inference ran; "remote" for this client
class TrOCRClient:
    """
    Client for external TrOCR service.
    Usage:
        client = TrOCRClient()
        # Check if service is available
        if await client.is_available():
            result = await client.extract_text(image_bytes)
            print(result.text)
    """
    def __init__(self, base_url: Optional[str] = None):
        self.base_url = base_url or TROCR_SERVICE_URL
        self._client: Optional[httpx.AsyncClient] = None
    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create HTTP client (recreated if it was closed)."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                base_url=self.base_url,
                timeout=300.0  # 5 min timeout for model loading
            )
        return self._client
    async def close(self):
        """Close the HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
    async def is_available(self) -> bool:
        """Check if TrOCR service is available (never raises)."""
        try:
            client = await self._get_client()
            # Short per-request timeout: a health probe should answer quickly.
            response = await client.get("/health", timeout=5.0)
            return response.status_code == 200
        except Exception as e:
            logger.warning(f"TrOCR service not available: {e}")
            return False
    async def get_status(self) -> Dict:
        """Get TrOCR service status; degrades to an error dict instead of raising."""
        try:
            client = await self._get_client()
            response = await client.get("/api/v1/status")
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Failed to get TrOCR status: {e}")
            return {
                "status": "unavailable",
                "error": str(e)
            }
    async def extract_text(
        self,
        image_data: bytes,
        filename: str = "image.png",
        detect_lines: bool = True
    ) -> OCRResult:
        """
        Extract text from an image using TrOCR.
        Args:
            image_data: Raw image bytes
            filename: Original filename
            detect_lines: Whether to detect individual lines
        Returns:
            OCRResult with extracted text
        Raises:
            httpx.TimeoutException: if the request times out (model may be loading)
            Exception: any other transport or HTTP error is re-raised
        """
        try:
            client = await self._get_client()
            files = {"file": (filename, image_data, "image/png")}
            # Booleans are sent as lowercase strings ("true"/"false") in the query.
            params = {"detect_lines": str(detect_lines).lower()}
            response = await client.post(
                "/api/v1/extract",
                files=files,
                params=params
            )
            response.raise_for_status()
            data = response.json()
            return OCRResult(
                text=data.get("text", ""),
                confidence=data.get("confidence", 0.0),
                processing_time_ms=data.get("processing_time_ms", 0),
                device=data.get("device", "remote")
            )
        except httpx.TimeoutException:
            logger.error("TrOCR request timed out (model may be loading)")
            raise
        except Exception as e:
            logger.error(f"TrOCR extraction failed: {e}")
            raise
    async def batch_extract(
        self,
        images: List[bytes],
        filenames: Optional[List[str]] = None,
        detect_lines: bool = True
    ) -> List[OCRResult]:
        """
        Extract text from multiple images.
        Args:
            images: List of image bytes
            filenames: Optional list of filenames
            detect_lines: Whether to detect individual lines
        Returns:
            List of OCRResult
        """
        if filenames is None:
            filenames = [f"image_{i}.png" for i in range(len(images))]
        try:
            client = await self._get_client()
            files = [
                ("files", (fn, img, "image/png"))
                for fn, img in zip(filenames, images)
            ]
            # NOTE(review): detect_lines is accepted but never forwarded to
            # the batch endpoint - confirm whether the service expects it.
            response = await client.post(
                "/api/v1/batch-extract",
                files=files
            )
            response.raise_for_status()
            data = response.json()
            results = []
            for item in data.get("results", []):
                results.append(OCRResult(
                    text=item.get("text", ""),
                    confidence=item.get("confidence", 0.85),
                    processing_time_ms=0,
                    device="remote"
                ))
            return results
        except Exception as e:
            logger.error(f"TrOCR batch extraction failed: {e}")
            raise
# Shared client instance for this process.
_trocr_client: Optional[TrOCRClient] = None
def get_trocr_client() -> TrOCRClient:
    """Return the process-wide TrOCR client, creating it on first use."""
    global _trocr_client
    if _trocr_client is not None:
        return _trocr_client
    _trocr_client = TrOCRClient()
    return _trocr_client
async def extract_text_from_image(
    image_data: bytes,
    filename: str = "image.png"
) -> OCRResult:
    """
    Convenience wrapper: run OCR on one image via the shared client.

    Args:
        image_data: Raw image bytes
        filename: Original filename

    Returns:
        OCRResult with extracted text
    """
    return await get_trocr_client().extract_text(image_data, filename)

View File

@@ -0,0 +1,577 @@
"""
TrOCR Service for Handwriting Recognition.
Uses Microsoft's TrOCR model for extracting handwritten text from exam images.
Supports fine-tuning with teacher corrections via LoRA adapters.
PRIVACY BY DESIGN:
- All processing happens locally
- No data sent to external services
- Fine-tuning data stays on-premise
"""
import logging
import os
from pathlib import Path
from typing import Optional, List, Dict, Tuple
from dataclasses import dataclass
from io import BytesIO
import json
logger = logging.getLogger(__name__)
# Model paths (all overridable via environment variables).
MODEL_CACHE_DIR = Path(os.environ.get("TROCR_CACHE_DIR", "/app/models/trocr"))  # downloaded model weights
LORA_ADAPTERS_DIR = Path(os.environ.get("TROCR_LORA_DIR", "/app/models/trocr/lora"))  # fine-tuned LoRA adapters
TRAINING_DATA_DIR = Path(os.environ.get("TROCR_TRAINING_DIR", "/app/data/trocr_training"))  # teacher-corrected examples
@dataclass
class OCRResult:
    """Result from TrOCR extraction."""
    text: str  # full extracted text (detected lines joined with "\n")
    confidence: float  # estimate in [0, 1]
    bounding_boxes: List[Dict]  # [{"x": 0, "y": 0, "w": 100, "h": 20, "text": "..."}]
    processing_time_ms: int  # wall-clock extraction time
@dataclass
class TrainingExample:
    """A single training example for fine-tuning."""
    image_path: str  # path to the saved image on disk
    ground_truth: str  # teacher-provided correct transcription
    teacher_id: str  # who supplied the correction
    created_at: str  # ISO-8601 timestamp
class TrOCRService:
    """
    Handwriting recognition service using TrOCR.
    Features:
    - Line-by-line handwriting extraction
    - Confidence scoring
    - LoRA fine-tuning support
    - Batch processing
    """
    # Available models (from smallest to largest)
    MODELS = {
        "trocr-small": "microsoft/trocr-small-handwritten",
        "trocr-base": "microsoft/trocr-base-handwritten",  # Recommended
        "trocr-large": "microsoft/trocr-large-handwritten",
    }
    def __init__(self, model_name: str = "trocr-base", device: str = "auto"):
        """
        Initialize TrOCR service.
        Args:
            model_name: One of "trocr-small", "trocr-base", "trocr-large"
            device: "cpu", "cuda", "mps" (Apple Silicon), or "auto"
        """
        self.model_name = model_name
        # Unknown names silently fall back to the recommended base model.
        self.model_id = self.MODELS.get(model_name, self.MODELS["trocr-base"])
        self.device = self._get_device(device)
        self._processor = None
        self._model = None
        self._lora_adapter = None
        # Create directories eagerly so later writes cannot fail on a missing path.
        MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        LORA_ADAPTERS_DIR.mkdir(parents=True, exist_ok=True)
        TRAINING_DATA_DIR.mkdir(parents=True, exist_ok=True)
        logger.info(f"TrOCR Service initialized: model={model_name}, device={self.device}")
    def _get_device(self, device: str) -> str:
        """Determine the best device for inference ("auto" probes torch)."""
        if device != "auto":
            return device
        try:
            import torch
            if torch.cuda.is_available():
                return "cuda"
            elif torch.backends.mps.is_available():
                return "mps"
            return "cpu"
        except ImportError:
            # torch missing entirely: CPU-only fallback.
            return "cpu"
    def _load_model(self):
        """Lazy-load the TrOCR model (no-op if already loaded)."""
        if self._model is not None:
            return
        try:
            from transformers import TrOCRProcessor, VisionEncoderDecoderModel
            import torch
            logger.info(f"Loading TrOCR model: {self.model_id}")
            self._processor = TrOCRProcessor.from_pretrained(
                self.model_id,
                cache_dir=str(MODEL_CACHE_DIR)
            )
            self._model = VisionEncoderDecoderModel.from_pretrained(
                self.model_id,
                cache_dir=str(MODEL_CACHE_DIR)
            )
            # Move to device
            if self.device == "cuda":
                self._model = self._model.cuda()
            elif self.device == "mps":
                self._model = self._model.to("mps")
            # Load LoRA adapter if one has been trained previously.
            adapter_path = LORA_ADAPTERS_DIR / f"{self.model_name}_adapter"
            if adapter_path.exists():
                self._load_lora_adapter(adapter_path)
            logger.info(f"TrOCR model loaded successfully on {self.device}")
        except ImportError as e:
            logger.error(f"Missing dependencies: {e}")
            logger.error("Install with: pip install transformers torch pillow")
            raise
        except Exception as e:
            logger.error(f"Failed to load TrOCR model: {e}")
            raise
    def _load_lora_adapter(self, adapter_path: Path):
        """Load a LoRA adapter for fine-tuned model (best-effort, never raises)."""
        try:
            from peft import PeftModel
            logger.info(f"Loading LoRA adapter from {adapter_path}")
            self._model = PeftModel.from_pretrained(self._model, str(adapter_path))
            self._lora_adapter = str(adapter_path)
            logger.info("LoRA adapter loaded successfully")
        except ImportError:
            # Adapter support is optional; the base model still works.
            logger.warning("peft not installed, skipping LoRA adapter")
        except Exception as e:
            logger.warning(f"Failed to load LoRA adapter: {e}")
    async def extract_text(
        self,
        image_data: bytes,
        detect_lines: bool = True
    ) -> OCRResult:
        """
        Extract handwritten text from an image.
        Args:
            image_data: Raw image bytes (PNG, JPG, etc.)
            detect_lines: If True, detect and process individual lines
        Returns:
            OCRResult with extracted text and confidence; on failure an
            empty result (text="", confidence=0.0) is returned, not raised.
        """
        import time
        start_time = time.time()
        self._load_model()
        try:
            from PIL import Image
            import torch
            # Load image
            image = Image.open(BytesIO(image_data)).convert("RGB")
            if detect_lines:
                # Detect text lines and process each
                lines, bboxes = await self._detect_and_extract_lines(image)
                text = "\n".join(lines)
                confidence = 0.85  # Average confidence estimate
            else:
                # Process whole image
                text, confidence = await self._extract_single(image)
                bboxes = []
            processing_time_ms = int((time.time() - start_time) * 1000)
            return OCRResult(
                text=text,
                confidence=confidence,
                bounding_boxes=bboxes,
                processing_time_ms=processing_time_ms
            )
        except Exception as e:
            logger.error(f"OCR extraction failed: {e}")
            # Swallow the error so batch runs can continue past one bad image.
            return OCRResult(
                text="",
                confidence=0.0,
                bounding_boxes=[],
                processing_time_ms=int((time.time() - start_time) * 1000)
            )
    async def _extract_single(self, image) -> Tuple[str, float]:
        """Extract text from a single image (no line detection)."""
        import torch
        # Preprocess
        pixel_values = self._processor(
            images=image,
            return_tensors="pt"
        ).pixel_values
        if self.device == "cuda":
            pixel_values = pixel_values.cuda()
        elif self.device == "mps":
            pixel_values = pixel_values.to("mps")
        # Generate (beam search; scores kept for confidence estimation)
        with torch.no_grad():
            generated_ids = self._model.generate(
                pixel_values,
                max_length=128,
                num_beams=4,
                return_dict_in_generate=True,
                output_scores=True
            )
        # Decode
        text = self._processor.batch_decode(
            generated_ids.sequences,
            skip_special_tokens=True
        )[0]
        # Estimate confidence from generation scores
        confidence = self._estimate_confidence(generated_ids)
        return text.strip(), confidence
    async def _detect_and_extract_lines(self, image) -> Tuple[List[str], List[Dict]]:
        """Detect text lines and extract each separately."""
        from PIL import Image
        import numpy as np
        # Convert to numpy for line detection
        img_array = np.array(image.convert("L"))  # Grayscale
        # Simple horizontal projection for line detection
        lines_y = self._detect_line_positions(img_array)
        if not lines_y:
            # Fallback: process whole image
            text, _ = await self._extract_single(image)
            return [text], []
        # Extract each line
        results = []
        bboxes = []
        width = image.width
        for i, (y_start, y_end) in enumerate(lines_y):
            # Crop line
            line_img = image.crop((0, y_start, width, y_end))
            # Ensure minimum height
            if line_img.height < 20:
                continue
            # Extract text
            text, conf = await self._extract_single(line_img)
            if text.strip():
                results.append(text)
                bboxes.append({
                    "x": 0,
                    "y": y_start,
                    "w": width,
                    "h": y_end - y_start,
                    "text": text,
                    "confidence": conf
                })
        return results, bboxes
    def _detect_line_positions(self, img_array) -> List[Tuple[int, int]]:
        """Detect horizontal text line positions using a projection profile."""
        import numpy as np
        # Horizontal projection (sum of inverted pixels per row; dark ink scores high)
        projection = np.sum(255 - img_array, axis=1)
        # Threshold to find text rows
        threshold = np.max(projection) * 0.1
        text_rows = projection > threshold
        # Find line boundaries
        lines = []
        in_line = False
        line_start = 0
        for i, is_text in enumerate(text_rows):
            if is_text and not in_line:
                in_line = True
                line_start = max(0, i - 5)  # Add padding
            elif not is_text and in_line:
                in_line = False
                line_end = min(len(text_rows) - 1, i + 5)  # Add padding
                if line_end - line_start > 15:  # Minimum line height
                    lines.append((line_start, line_end))
        # Handle last line
        if in_line:
            lines.append((line_start, len(text_rows) - 1))
        return lines
    def _estimate_confidence(self, generated_output) -> float:
        """Estimate confidence from generation scores (0.75 when unavailable)."""
        try:
            import torch
            if hasattr(generated_output, 'scores') and generated_output.scores:
                # Average probability of selected tokens
                probs = []
                for score in generated_output.scores:
                    prob = torch.softmax(score, dim=-1).max().item()
                    probs.append(prob)
                return sum(probs) / len(probs) if probs else 0.5
            return 0.75  # Default confidence
        except Exception:
            return 0.75
    async def batch_extract(
        self,
        images: List[bytes],
        detect_lines: bool = True
    ) -> List[OCRResult]:
        """
        Extract text from multiple images (sequentially).
        Args:
            images: List of image bytes
            detect_lines: If True, detect lines in each image
        Returns:
            List of OCRResult
        """
        results = []
        for img_data in images:
            result = await self.extract_text(img_data, detect_lines)
            results.append(result)
        return results
    # ==========================================
    # FINE-TUNING SUPPORT
    # ==========================================
    def add_training_example(
        self,
        image_data: bytes,
        ground_truth: str,
        teacher_id: str
    ) -> str:
        """
        Add a training example for fine-tuning.
        Args:
            image_data: Image bytes
            ground_truth: Correct text (teacher-provided)
            teacher_id: ID of the teacher providing correction
        Returns:
            Example ID
        """
        import uuid
        from datetime import datetime
        example_id = str(uuid.uuid4())
        # Save image
        image_path = TRAINING_DATA_DIR / f"{example_id}.png"
        with open(image_path, "wb") as f:
            f.write(image_data)
        # Save metadata alongside as <id>.json
        example = TrainingExample(
            image_path=str(image_path),
            ground_truth=ground_truth,
            teacher_id=teacher_id,
            created_at=datetime.utcnow().isoformat()
        )
        meta_path = TRAINING_DATA_DIR / f"{example_id}.json"
        with open(meta_path, "w") as f:
            json.dump(example.__dict__, f, indent=2)
        logger.info(f"Training example added: {example_id}")
        return example_id
    def get_training_examples(self, teacher_id: Optional[str] = None) -> List[TrainingExample]:
        """Get all training examples, optionally filtered by teacher."""
        examples = []
        for meta_file in TRAINING_DATA_DIR.glob("*.json"):
            with open(meta_file) as f:
                data = json.load(f)
            example = TrainingExample(**data)
            if teacher_id is None or example.teacher_id == teacher_id:
                examples.append(example)
        return examples
    async def fine_tune(
        self,
        teacher_id: Optional[str] = None,
        epochs: int = 3,
        learning_rate: float = 5e-5
    ) -> Dict:
        """
        Fine-tune the model with collected training examples.
        Uses LoRA for efficient fine-tuning.
        Args:
            teacher_id: If provided, only use examples from this teacher
            epochs: Number of training epochs
            learning_rate: Learning rate for fine-tuning
        Returns:
            Training statistics dict with "status": "success" or "error"
        """
        examples = self.get_training_examples(teacher_id)
        if len(examples) < 10:
            return {
                "status": "error",
                "message": f"Need at least 10 examples, have {len(examples)}"
            }
        try:
            from peft import LoraConfig, get_peft_model, TaskType
            from transformers import Trainer, TrainingArguments
            from PIL import Image
            import torch
            self._load_model()
            logger.info(f"Starting fine-tuning with {len(examples)} examples")
            # Configure LoRA
            lora_config = LoraConfig(
                task_type=TaskType.SEQ_2_SEQ_LM,
                r=16,  # LoRA rank
                lora_alpha=32,
                lora_dropout=0.1,
                target_modules=["q_proj", "v_proj"]  # Attention layers
            )
            # Apply LoRA
            model = get_peft_model(self._model, lora_config)
            # Prepare dataset: wraps (image, ground-truth) pairs as model inputs.
            class OCRDataset(torch.utils.data.Dataset):
                def __init__(self, examples, processor):
                    self.examples = examples
                    self.processor = processor
                def __len__(self):
                    return len(self.examples)
                def __getitem__(self, idx):
                    ex = self.examples[idx]
                    image = Image.open(ex.image_path).convert("RGB")
                    pixel_values = self.processor(
                        images=image, return_tensors="pt"
                    ).pixel_values.squeeze()
                    labels = self.processor.tokenizer(
                        ex.ground_truth,
                        return_tensors="pt",
                        padding="max_length",
                        max_length=128
                    ).input_ids.squeeze()
                    return {
                        "pixel_values": pixel_values,
                        "labels": labels
                    }
            dataset = OCRDataset(examples, self._processor)
            # Training arguments
            output_dir = LORA_ADAPTERS_DIR / f"{self.model_name}_adapter"
            training_args = TrainingArguments(
                output_dir=str(output_dir),
                num_train_epochs=epochs,
                per_device_train_batch_size=4,
                learning_rate=learning_rate,
                save_strategy="epoch",
                logging_steps=10,
                remove_unused_columns=False,
            )
            # Train
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=dataset,
            )
            train_result = trainer.train()
            # Save adapter
            model.save_pretrained(str(output_dir))
            # Reload model so inference picks up the new adapter.
            self._model = None
            self._load_model()
            return {
                "status": "success",
                "examples_used": len(examples),
                "epochs": epochs,
                "adapter_path": str(output_dir),
                "train_loss": train_result.training_loss
            }
        except ImportError as e:
            logger.error(f"Missing dependencies for fine-tuning: {e}")
            return {
                "status": "error",
                "message": f"Missing dependencies: {e}. Install with: pip install peft"
            }
        except Exception as e:
            logger.error(f"Fine-tuning failed: {e}")
            return {
                "status": "error",
                "message": str(e)
            }
    def get_model_info(self) -> Dict:
        """Get information about the loaded model and training data."""
        adapter_path = LORA_ADAPTERS_DIR / f"{self.model_name}_adapter"
        return {
            "model_name": self.model_name,
            "model_id": self.model_id,
            "device": self.device,
            "is_loaded": self._model is not None,
            "has_lora_adapter": adapter_path.exists(),
            "lora_adapter_path": str(adapter_path) if adapter_path.exists() else None,
            "training_examples_count": len(list(TRAINING_DATA_DIR.glob("*.json"))),
        }
# Cached service instance shared across the process.
_trocr_service: Optional[TrOCRService] = None
def get_trocr_service(model_name: str = "trocr-base") -> TrOCRService:
    """Return the shared TrOCRService, rebuilding it when the model changes."""
    global _trocr_service
    needs_new = _trocr_service is None or _trocr_service.model_name != model_name
    if needs_new:
        _trocr_service = TrOCRService(model_name=model_name)
    return _trocr_service

View File

@@ -0,0 +1,309 @@
"""
Vision-OCR Service - Handschrifterkennung mit Llama 3.2 Vision.
DATENSCHUTZ/PRIVACY BY DESIGN:
- Alle Verarbeitung erfolgt lokal auf dem Mac Mini
- Keine Daten verlassen das lokale Netzwerk
- Keine Cloud-APIs beteiligt
- Perfekt für DSGVO-konforme Schulumgebungen
Verwendet llama3.2-vision:11b über Ollama für OCR/Handschrifterkennung.
Dies ist eine Alternative zu TrOCR mit besserer Handschrifterkennung.
"""
import os
import base64
import httpx
import logging
import time
from typing import Optional
from dataclasses import dataclass
from llm_gateway.config import get_config
logger = logging.getLogger(__name__)
@dataclass
class VisionOCRResult:
    """Result from Vision-LLM OCR extraction."""
    text: str  # transcribed text
    confidence: float  # confidence estimate in [0, 1]
    processing_time_ms: int  # wall-clock extraction time
    model: str = "llama3.2-vision:11b"  # which vision model produced the text
    device: str = "local-ollama"  # inference runs on the local Ollama host
# OCR system prompt tuned for handwriting recognition. The prompt is German
# and is sent to the LLM verbatim — do not translate or reformat it.
HANDWRITING_OCR_PROMPT = """Du bist ein Experte für Handschrifterkennung (OCR).
AUFGABE: Extrahiere den handschriftlichen Text aus dem Bild so genau wie möglich.
WICHTIGE REGELN:
1. Transkribiere NUR den sichtbaren Text - erfinde nichts dazu
2. Behalte die Zeilenstruktur bei (jede Zeile auf einer neuen Zeile)
3. Bei unleserlichen Stellen: [unleserlich] oder [?] verwenden
4. Ignoriere Linien, Kästchen und andere Formatierungen
5. Korrigiere KEINE Rechtschreibfehler - transkribiere exakt was da steht
6. Bei Aufzählungen: Nummern/Punkte beibehalten (1., 2., a), b), etc.)
AUSGABE: Nur der transkribierte Text, keine Erklärungen oder Kommentare."""
# Alternative prompt for printed (non-handwritten) text; also sent verbatim.
PRINTED_OCR_PROMPT = """Extrahiere den gesamten Text aus diesem Bild.
Behalte die Struktur bei (Absätze, Listen, etc.).
Gib nur den extrahierten Text zurück, ohne Kommentare."""
class VisionOCRService:
    """
    OCR service backed by Llama 3.2 Vision via Ollama.

    Runs entirely on the local machine (Mac Mini) — no cloud connection
    required, which makes it suitable for GDPR-compliant exam grading
    in schools.

    Usage:
        service = VisionOCRService()
        if await service.is_available():
            result = await service.extract_text(image_bytes)
            print(result.text)
    """

    def __init__(self, ollama_url: Optional[str] = None, model: Optional[str] = None):
        """
        Initialize the Vision OCR service.

        Args:
            ollama_url: Ollama API base URL (default: taken from config,
                falling back to http://localhost:11434).
            model: Vision model to use (default: config.vision_model).
        """
        config = get_config()
        self.ollama_url = ollama_url or (
            config.ollama.base_url if config.ollama else "http://localhost:11434"
        )
        self.model = model or config.vision_model
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or lazily (re)create the shared async HTTP client."""
        if self._client is None or self._client.is_closed:
            # 5 min default timeout: large images + slow local inference.
            self._client = httpx.AsyncClient(timeout=300.0)
        return self._client

    async def close(self):
        """Close the underlying HTTP client, if one is open."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    async def is_available(self) -> bool:
        """Return True if Ollama is reachable and a vision model is installed."""
        try:
            client = await self._get_client()
            # Cheap health check against the model listing endpoint.
            response = await client.get(
                f"{self.ollama_url}/api/tags",
                timeout=5.0
            )
            if response.status_code != 200:
                return False
            data = response.json()
            models = [m.get("name", "") for m in data.get("models", [])]
            # Any installed model whose name mentions "vision" or "llava" counts.
            has_vision = any(
                "vision" in m.lower() or "llava" in m.lower()
                for m in models
            )
            if not has_vision:
                logger.warning(f"No vision model found. Available: {models}")
                return False
            return True
        except Exception as e:
            # Best-effort probe: report unavailable instead of raising.
            logger.warning(f"Vision OCR service not available: {e}")
            return False

    async def get_status(self) -> dict:
        """Return a status dict describing Ollama reachability and vision models."""
        try:
            client = await self._get_client()
            response = await client.get(f"{self.ollama_url}/api/tags")
            if response.status_code == 200:
                data = response.json()
                models = data.get("models", [])
                vision_models = [
                    m for m in models
                    if "vision" in m.get("name", "").lower() or "llava" in m.get("name", "").lower()
                ]
                return {
                    "status": "available",
                    "ollama_url": self.ollama_url,
                    "configured_model": self.model,
                    "vision_models": [m.get("name") for m in vision_models],
                    "total_models": len(models)
                }
            else:
                return {
                    "status": "unavailable",
                    "error": f"HTTP {response.status_code}"
                }
        except Exception as e:
            return {
                "status": "unavailable",
                "error": str(e)
            }

    async def extract_text(
        self,
        image_data: bytes,
        filename: str = "image.png",
        is_handwriting: bool = True
    ) -> VisionOCRResult:
        """
        Extract text from an image using the vision LLM.

        Args:
            image_data: Raw image bytes (PNG, JPG, etc.).
            filename: Original filename (used for logging only).
            is_handwriting: True for handwriting, False for printed text.

        Returns:
            VisionOCRResult with the extracted text.

        Raises:
            httpx.TimeoutException: If the model does not answer within 3 min.
            httpx.HTTPStatusError: On a non-2xx response from Ollama.
        """
        start_time = time.time()
        try:
            client = await self._get_client()
            # Ollama expects images as base64 strings in the message payload.
            image_base64 = base64.b64encode(image_data).decode("utf-8")
            prompt = HANDWRITING_OCR_PROMPT if is_handwriting else PRINTED_OCR_PROMPT
            payload = {
                "model": self.model,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt,
                        "images": [image_base64]
                    }
                ],
                "stream": False,
                "options": {
                    "temperature": 0.1,  # low temperature for consistent OCR
                    "num_predict": 2048,  # max tokens for extracted text
                }
            }
            # BUGFIX: log the actual filename (was the literal text "(unknown)").
            logger.info(f"Sending image to Vision OCR: {filename} ({len(image_data)} bytes)")
            response = await client.post(
                f"{self.ollama_url}/api/chat",
                json=payload,
                timeout=180.0  # 3 min per image
            )
            response.raise_for_status()
            data = response.json()
            extracted_text = data.get("message", {}).get("content", "")
            processing_time_ms = int((time.time() - start_time) * 1000)
            # Heuristic only — Ollama does not expose token-level confidence.
            confidence = self._estimate_confidence(extracted_text)
            logger.info(
                f"Vision OCR completed for {filename}: "
                f"{len(extracted_text)} chars in {processing_time_ms}ms"
            )
            return VisionOCRResult(
                text=extracted_text.strip(),
                confidence=confidence,
                processing_time_ms=processing_time_ms,
                model=self.model,
                device="local-ollama"
            )
        except httpx.TimeoutException:
            logger.error(f"Vision OCR timed out for {filename}")
            raise
        except Exception as e:
            logger.error(f"Vision OCR failed for {filename}: {e}")
            raise

    def _estimate_confidence(self, text: str) -> float:
        """
        Estimate OCR confidence based on text quality.

        This is a heuristic — true confidence would require model output.
        Starts at 0.85, penalizes [unleserlich]/[?] markers and very short
        results, and clamps to the range [0.1, 0.85].
        """
        if not text:
            return 0.0
        # Count uncertainty markers the prompt asks the model to emit.
        uncertain_markers = text.count("[unleserlich]") + text.count("[?]")
        # Length of real text once markers are stripped out.
        text_length = len(text.replace("[unleserlich]", "").replace("[?]", ""))
        if text_length == 0:
            return 0.1
        confidence = 0.85
        # Each marker costs 0.05, capped at a 0.3 total penalty.
        confidence -= min(uncertain_markers * 0.05, 0.3)
        # Very short text might be an incomplete transcription.
        if text_length < 20:
            confidence -= 0.1
        return max(confidence, 0.1)
# Singleton instance
_vision_ocr_service: Optional[VisionOCRService] = None


def get_vision_ocr_service() -> VisionOCRService:
    """Return the process-wide Vision OCR service, creating it on first use."""
    global _vision_ocr_service
    service = _vision_ocr_service
    if service is None:
        service = VisionOCRService()
        _vision_ocr_service = service
    return service
async def extract_handwriting(
    image_data: bytes,
    filename: str = "image.png"
) -> VisionOCRResult:
    """
    Convenience wrapper: extract handwriting from an image.

    Delegates to the shared VisionOCRService (Llama 3.2 Vision via Ollama);
    all processing stays on the local machine, keeping it GDPR-compliant.

    Args:
        image_data: Raw image bytes.
        filename: Original filename (used for logging only).

    Returns:
        VisionOCRResult with the extracted text.
    """
    ocr = get_vision_ocr_service()
    return await ocr.extract_text(image_data, filename, is_handwriting=True)