feat: BreakPilot PWA - Full codebase (clean push without large binaries)
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed

All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
This commit is contained in:
BreakPilot Dev
2026-02-11 13:25:58 +01:00
commit 19855efacc
2512 changed files with 933814 additions and 0 deletions

View File

@@ -0,0 +1,54 @@
"""
Klausurkorrektur Module - Privacy-by-Design Exam Correction.
DSGVO-compliant exam correction with QR-code based pseudonymization.
No personal data is sent to the LLM.
Architecture:
- Pseudonymization via doc_token (128-bit UUID)
- Teacher namespace isolation
- Self-hosted LLM at SysEleven
- Zero-knowledge identity mapping (encrypted client-side)
"""
from .db_models import (
ExamSession, PseudonymizedDocument, QRBatchJob,
SessionStatus, DocumentStatus,
# Magic Onboarding
OnboardingSession, DetectedStudent, ModuleLink,
OnboardingStatus, ModuleLinkType
)
from .repository import KlausurRepository
from .database import get_db, init_db
# Services
from .services.roster_parser import RosterParser, get_roster_parser
from .services.school_resolver import SchoolResolver, get_school_resolver
from .services.module_linker import ModuleLinker, get_module_linker
__all__ = [
# Models
"ExamSession",
"PseudonymizedDocument",
"QRBatchJob",
"SessionStatus",
"DocumentStatus",
# Magic Onboarding Models
"OnboardingSession",
"DetectedStudent",
"ModuleLink",
"OnboardingStatus",
"ModuleLinkType",
# Repository
"KlausurRepository",
# Database
"get_db",
"init_db",
# Services
"RosterParser",
"get_roster_parser",
"SchoolResolver",
"get_school_resolver",
"ModuleLinker",
"get_module_linker",
]

View File

@@ -0,0 +1,47 @@
"""
Database Configuration for Klausur Module.
Uses the same PostgreSQL database as the main backend.
"""
import os
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# Database URL from environment (uses same DB as Backend)
# Database URL from environment (uses same DB as Backend).
# NOTE(review): the fallback embeds development credentials — confirm that
# production deployments always set DATABASE_URL explicitly.
_raw_url = os.getenv(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot"
)
# SQLAlchemy 2.0 requires "postgresql://" instead of the legacy "postgres://"
# scheme (as emitted by e.g. Heroku); only the first occurrence is rewritten.
DATABASE_URL = _raw_url.replace("postgres://", "postgresql://", 1) if _raw_url.startswith("postgres://") else _raw_url
# Engine configuration
engine = create_engine(
    DATABASE_URL,
    pool_pre_ping=True,  # validate pooled connections before handing them out
    pool_size=5,
    max_overflow=10,
    echo=os.getenv("SQL_ECHO", "false").lower() == "true"  # opt-in SQL logging
)
# Declarative Base shared by every model in this module
Base = declarative_base()
# Session factory: explicit commit/flush control (no autocommit/autoflush)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def get_db():
    """Yield a database session for FastAPI dependency injection.

    The session is created per request and always closed afterwards,
    even if the request handler raises.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
def init_db():
    """Creates all tables (for development).

    Importing db_models registers every model class on Base.metadata so
    create_all sees them. NOTE(review): no migration tooling is visible here —
    confirm production schema changes do not rely on this function.
    """
    from . import db_models  # Import models to register them
    Base.metadata.create_all(bind=engine)

View File

@@ -0,0 +1,377 @@
"""
SQLAlchemy Database Models for Klausurkorrektur Module.
Privacy-by-Design: No personal data (student names) is stored in these models.
Only pseudonymized doc_tokens are used to reference exam documents.
"""
from datetime import datetime
from sqlalchemy import (
Column, String, Integer, DateTime, JSON,
Boolean, Text, Enum as SQLEnum, ForeignKey, LargeBinary
)
from sqlalchemy.orm import relationship
import enum
import uuid
from .database import Base
class SessionStatus(str, enum.Enum):
    """Lifecycle status of an exam correction session.

    Inherits from str so values compare/serialize as plain strings.
    """
    CREATED = "created"        # Session created, awaiting uploads
    UPLOADING = "uploading"    # Documents being uploaded
    PROCESSING = "processing"  # OCR and AI correction in progress
    COMPLETED = "completed"    # All documents processed
    ARCHIVED = "archived"      # Session archived (data retention)
    DELETED = "deleted"        # Soft delete
class OnboardingStatus(str, enum.Enum):
    """Status of a magic onboarding session (ordered pipeline stages)."""
    ANALYZING = "analyzing"    # Local LLM extracting headers
    CONFIRMING = "confirming"  # User confirming detected data
    PROCESSING = "processing"  # Cloud LLM correcting exams
    LINKING = "linking"        # Creating module links
    COMPLETE = "complete"      # Onboarding finished
class ModuleLinkType(str, enum.Enum):
    """Type of cross-module link from the Klausur module to other modules."""
    NOTENBUCH = "notenbuch"      # Link to grade book
    ELTERNABEND = "elternabend"  # Link to parent meetings
    ZEUGNIS = "zeugnis"          # Link to certificates
    CALENDAR = "calendar"        # Link to calendar events
    KLASSENBUCH = "klassenbuch"  # Link to class book
class DocumentStatus(str, enum.Enum):
    """Processing status of a single pseudonymized document."""
    UPLOADED = "uploaded"              # Document uploaded, awaiting OCR
    OCR_PROCESSING = "ocr_processing"  # OCR in progress
    OCR_COMPLETED = "ocr_completed"    # OCR done, awaiting AI correction
    AI_PROCESSING = "ai_processing"    # AI correction in progress
    COMPLETED = "completed"            # Fully processed
    FAILED = "failed"                  # Processing failed
class ExamSession(Base):
    """
    Exam Correction Session.

    Groups multiple pseudonymized documents for a single exam correction task.
    No personal data is stored - teacher_id is the only identifying info.
    """
    __tablename__ = 'klausur_sessions'

    # Primary key: random UUID string
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))

    # Teacher isolation (mandatory) — every query must filter on this column
    teacher_id = Column(String(100), nullable=False, index=True)

    # Session metadata
    name = Column(String(200), nullable=False)  # e.g., "Mathe 10a - Klausur 1"
    subject = Column(String(100), default="")
    class_name = Column(String(100), default="")  # e.g., "10a"

    # Exam configuration
    total_points = Column(Integer, default=100)
    rubric = Column(Text, default="")  # grading criteria (free text)
    questions = Column(JSON, default=list)  # [{question, points, rubric}]

    # Lifecycle status (see SessionStatus)
    status = Column(
        SQLEnum(SessionStatus),
        default=SessionStatus.CREATED,
        nullable=False,
        index=True
    )

    # Statistics (anonymized counters, no per-student data)
    document_count = Column(Integer, default=0)
    processed_count = Column(Integer, default=0)

    # Encrypted identity map (only teacher can decrypt)
    # This is stored encrypted with teacher's password
    encrypted_identity_map = Column(LargeBinary, nullable=True)
    identity_map_iv = Column(String(64), nullable=True)  # IV for AES decryption

    # Timestamps
    # NOTE(review): datetime.utcnow yields naive timestamps and is deprecated
    # in Python 3.12 — consider timezone-aware datetimes; confirm DB expects naive UTC.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    completed_at = Column(DateTime, nullable=True)

    # Data retention: auto-delete after this date
    retention_until = Column(DateTime, nullable=True)

    # Magic Onboarding: Link to school class (optional)
    linked_school_class_id = Column(String(36), nullable=True)
    linked_subject_id = Column(String(36), nullable=True)

    # Child documents are deleted with the session (delete-orphan cascade)
    documents = relationship(
        "PseudonymizedDocument",
        back_populates="session",
        cascade="all, delete-orphan"
    )

    def __repr__(self):
        return f"<ExamSession {self.id[:8]}: {self.name} ({self.status.value})>"
class PseudonymizedDocument(Base):
    """
    Pseudonymized Exam Document.

    PRIVACY DESIGN:
    - doc_token is a 128-bit random UUID, NOT derivable from student identity
    - No student name or personal info is stored here
    - Identity mapping is stored encrypted in ExamSession.encrypted_identity_map
    - The backend CANNOT de-pseudonymize documents

    Only the teacher (with their encryption key) can map doc_token -> student name.
    """
    __tablename__ = 'klausur_documents'

    # Primary key doubles as the pseudonymization token
    doc_token = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))

    # Owning session
    session_id = Column(String(36), ForeignKey('klausur_sessions.id'), nullable=False, index=True)

    # Processing status (see DocumentStatus)
    status = Column(
        SQLEnum(DocumentStatus),
        default=DocumentStatus.UPLOADED,
        nullable=False,
        index=True
    )

    # Page info
    page_number = Column(Integer, default=1)
    total_pages = Column(Integer, default=1)

    # OCR result (redacted - no header/name visible)
    ocr_text = Column(Text, default="")
    ocr_confidence = Column(Integer, default=0)  # 0-100

    # AI correction result (pseudonymized)
    ai_feedback = Column(Text, default="")
    ai_score = Column(Integer, nullable=True)  # Points achieved
    ai_grade = Column(String(10), nullable=True)  # e.g., "2+" or "B"
    ai_details = Column(JSON, default=dict)  # Per-question scores

    # Processing metadata
    processing_started_at = Column(DateTime, nullable=True)
    processing_completed_at = Column(DateTime, nullable=True)
    processing_error = Column(Text, nullable=True)

    # Timestamps (naive UTC via datetime.utcnow)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Back-reference to the owning ExamSession
    session = relationship("ExamSession", back_populates="documents")

    def __repr__(self):
        return f"<PseudonymizedDocument {self.doc_token[:8]} ({self.status.value})>"
class QRBatchJob(Base):
    """
    QR Code Generation Batch Job.

    Tracks generation of QR overlay sheets for printing.
    The generated PDF contains QR codes with doc_tokens.
    """
    __tablename__ = 'klausur_qr_batches'

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))

    # Owning session and teacher scope
    session_id = Column(String(36), ForeignKey('klausur_sessions.id'), nullable=False, index=True)
    teacher_id = Column(String(100), nullable=False, index=True)

    # Batch info
    student_count = Column(Integer, nullable=False)
    generated_tokens = Column(JSON, default=list)  # List of generated doc_tokens

    # Generated PDF (stored as path reference, not in DB)
    pdf_path = Column(String(500), nullable=True)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    downloaded_at = Column(DateTime, nullable=True)

    def __repr__(self):
        return f"<QRBatchJob {self.id[:8]}: {self.student_count} students>"
class OnboardingSession(Base):
    """
    Magic Onboarding Session.

    Tracks the automatic class/student detection and setup process.
    Temporary data structure - merged into ExamSession after confirmation.
    """
    __tablename__ = 'klausur_onboarding_sessions'

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))

    # Links: optional exam session plus mandatory teacher scope
    klausur_session_id = Column(String(36), ForeignKey('klausur_sessions.id'), nullable=True)
    teacher_id = Column(String(100), nullable=False, index=True)

    # Detected metadata (from local LLM)
    detected_class = Column(String(100), nullable=True)
    detected_subject = Column(String(100), nullable=True)
    detected_date = Column(DateTime, nullable=True)
    detected_student_count = Column(Integer, default=0)
    detection_confidence = Column(Integer, default=0)  # 0-100

    # Confirmed data (after user review)
    confirmed_class = Column(String(100), nullable=True)
    confirmed_subject = Column(String(100), nullable=True)

    # Linked school entities (after confirmation)
    linked_school_id = Column(String(36), nullable=True)
    linked_class_id = Column(String(36), nullable=True)

    # School context (German federal state / school type / name)
    bundesland = Column(String(50), nullable=True)
    schulform = Column(String(50), nullable=True)
    school_name = Column(String(200), nullable=True)

    # Pipeline status (see OnboardingStatus)
    status = Column(
        SQLEnum(OnboardingStatus),
        default=OnboardingStatus.ANALYZING,
        nullable=False,
        index=True
    )

    # Progress tracking — one timestamp per completed pipeline stage
    analysis_completed_at = Column(DateTime, nullable=True)
    confirmation_completed_at = Column(DateTime, nullable=True)
    processing_started_at = Column(DateTime, nullable=True)
    processing_completed_at = Column(DateTime, nullable=True)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Detected students are deleted with their onboarding session
    detected_students = relationship(
        "DetectedStudent",
        back_populates="onboarding_session",
        cascade="all, delete-orphan"
    )

    def __repr__(self):
        return f"<OnboardingSession {self.id[:8]}: {self.detected_class} ({self.status.value})>"
class DetectedStudent(Base):
    """
    Student detected during Magic Onboarding.

    Temporary storage for detected student data before confirmation.
    After confirmation, students are created in the School Service.
    """
    __tablename__ = 'klausur_detected_students'

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))

    # Owning onboarding session
    onboarding_session_id = Column(
        String(36),
        ForeignKey('klausur_onboarding_sessions.id'),
        nullable=False,
        index=True
    )

    # Detected data (from exam header)
    detected_first_name = Column(String(100), nullable=True)
    detected_last_name_hint = Column(String(100), nullable=True)  # Partial, e.g. "M."

    # Confirmed data (after roster matching)
    confirmed_first_name = Column(String(100), nullable=True)
    confirmed_last_name = Column(String(100), nullable=True)

    # Matched to School Service student
    matched_student_id = Column(String(36), nullable=True)

    # Parent contact (extracted from roster)
    parent_email = Column(String(200), nullable=True)
    parent_phone = Column(String(50), nullable=True)

    # Link to pseudonymized document
    doc_token = Column(String(36), nullable=True)

    # Detection confidence
    confidence = Column(Integer, default=0)  # 0-100

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)

    # Back-reference to the owning OnboardingSession
    onboarding_session = relationship("OnboardingSession", back_populates="detected_students")

    def __repr__(self):
        name = self.confirmed_first_name or self.detected_first_name or "?"
        return f"<DetectedStudent {self.id[:8]}: {name}>"
class ModuleLink(Base):
    """
    Cross-module link from Klausur to other BreakPilot modules.

    Tracks connections to: Notenbuch, Elternabend, Zeugnis, Calendar
    """
    __tablename__ = 'klausur_module_links'

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))

    # Source exam session
    klausur_session_id = Column(
        String(36),
        ForeignKey('klausur_sessions.id'),
        nullable=False,
        index=True
    )

    # Link type (see ModuleLinkType)
    link_type = Column(
        SQLEnum(ModuleLinkType),
        nullable=False,
        index=True
    )

    # Target module reference — entity id and/or direct URL may be set
    target_module = Column(String(50), nullable=False)  # school, calendar, etc.
    target_entity_id = Column(String(36), nullable=True)
    target_url = Column(String(500), nullable=True)

    # Free-form metadata about the link
    link_metadata = Column(JSON, default=dict)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow)

    def __repr__(self):
        return f"<ModuleLink {self.id[:8]}: {self.link_type.value} -> {self.target_module}>"
# Export all models
__all__ = [
"SessionStatus",
"DocumentStatus",
"OnboardingStatus",
"ModuleLinkType",
"ExamSession",
"PseudonymizedDocument",
"QRBatchJob",
"OnboardingSession",
"DetectedStudent",
"ModuleLink",
]

View File

@@ -0,0 +1,377 @@
"""
Repository for Klausurkorrektur Module.
All queries are filtered by teacher_id to ensure complete namespace isolation.
No cross-teacher data access is possible.
"""
from datetime import datetime, timedelta
from typing import Optional, List
from sqlalchemy.orm import Session
from sqlalchemy import and_, func
from .db_models import (
ExamSession, PseudonymizedDocument, QRBatchJob,
SessionStatus, DocumentStatus
)
class KlausurRepository:
    """
    Repository for exam correction data.

    PRIVACY DESIGN:
    - All queries MUST include teacher_id filter
    - No method allows access to other teachers' data
    - Bulk operations are scoped to teacher namespace
    """

    def __init__(self, db: Session):
        # The caller owns the session lifecycle; the repository only commits.
        self.db = db

    # ==================== Session Operations ====================

    def create_session(
        self,
        teacher_id: str,
        name: str,
        subject: str = "",
        class_name: str = "",
        total_points: int = 100,
        rubric: str = "",
        questions: Optional[List[dict]] = None,
        retention_days: int = 30
    ) -> ExamSession:
        """Create a new exam correction session.

        The retention deadline is fixed at creation time so that
        cleanup_expired_sessions() can purge the session later.
        """
        session = ExamSession(
            teacher_id=teacher_id,
            name=name,
            subject=subject,
            class_name=class_name,
            total_points=total_points,
            rubric=rubric,
            questions=questions or [],
            status=SessionStatus.CREATED,
            retention_until=datetime.utcnow() + timedelta(days=retention_days)
        )
        self.db.add(session)
        self.db.commit()
        self.db.refresh(session)
        return session

    def get_session(
        self,
        session_id: str,
        teacher_id: str
    ) -> Optional[ExamSession]:
        """Get a session by ID (teacher-scoped); soft-deleted sessions are invisible."""
        return self.db.query(ExamSession).filter(
            and_(
                ExamSession.id == session_id,
                ExamSession.teacher_id == teacher_id,
                ExamSession.status != SessionStatus.DELETED
            )
        ).first()

    def list_sessions(
        self,
        teacher_id: str,
        include_archived: bool = False,
        limit: int = 50,
        offset: int = 0
    ) -> List[ExamSession]:
        """List all sessions for a teacher, newest first (paginated)."""
        query = self.db.query(ExamSession).filter(
            and_(
                ExamSession.teacher_id == teacher_id,
                ExamSession.status != SessionStatus.DELETED
            )
        )
        if not include_archived:
            query = query.filter(ExamSession.status != SessionStatus.ARCHIVED)
        return query.order_by(ExamSession.created_at.desc()).offset(offset).limit(limit).all()

    def update_session_status(
        self,
        session_id: str,
        teacher_id: str,
        status: SessionStatus
    ) -> Optional[ExamSession]:
        """Update session status; stamps completed_at when moving to COMPLETED."""
        session = self.get_session(session_id, teacher_id)
        if session:
            session.status = status
            if status == SessionStatus.COMPLETED:
                session.completed_at = datetime.utcnow()
            self.db.commit()
            self.db.refresh(session)
        return session

    def update_session_identity_map(
        self,
        session_id: str,
        teacher_id: str,
        encrypted_map: bytes,
        iv: str
    ) -> Optional[ExamSession]:
        """Store encrypted identity map (teacher-scoped).

        The map is opaque ciphertext to the backend; only the teacher holds
        the decryption key.
        """
        session = self.get_session(session_id, teacher_id)
        if session:
            session.encrypted_identity_map = encrypted_map
            session.identity_map_iv = iv
            self.db.commit()
            self.db.refresh(session)
        return session

    def delete_session(
        self,
        session_id: str,
        teacher_id: str,
        hard_delete: bool = False
    ) -> bool:
        """Delete a session.

        Soft delete (default) flips the status to DELETED; hard delete removes
        the row (documents go with it via the delete-orphan cascade).
        Returns True when a session was found and deleted.
        """
        session = self.get_session(session_id, teacher_id)
        if not session:
            return False
        if hard_delete:
            self.db.delete(session)
        else:
            session.status = SessionStatus.DELETED
        self.db.commit()
        return True

    # ==================== Document Operations ====================

    def create_document(
        self,
        session_id: str,
        teacher_id: str,
        doc_token: Optional[str] = None,
        page_number: int = 1,
        total_pages: int = 1
    ) -> Optional[PseudonymizedDocument]:
        """Create a new pseudonymized document.

        Returns None when the session does not belong to the teacher.
        A caller-supplied doc_token (e.g. from a pre-printed QR sheet)
        overrides the auto-generated one.
        """
        # Verify session belongs to teacher before touching anything
        session = self.get_session(session_id, teacher_id)
        if not session:
            return None
        doc = PseudonymizedDocument(
            session_id=session_id,
            page_number=page_number,
            total_pages=total_pages,
            status=DocumentStatus.UPLOADED
        )
        if doc_token:
            doc.doc_token = doc_token
        self.db.add(doc)
        # Keep the denormalized counter in sync
        session.document_count += 1
        self.db.commit()
        self.db.refresh(doc)
        return doc

    def get_document(
        self,
        doc_token: str,
        teacher_id: str
    ) -> Optional[PseudonymizedDocument]:
        """Get a document by token (teacher-scoped via its session)."""
        return self.db.query(PseudonymizedDocument).join(
            ExamSession
        ).filter(
            and_(
                PseudonymizedDocument.doc_token == doc_token,
                ExamSession.teacher_id == teacher_id,
                ExamSession.status != SessionStatus.DELETED
            )
        ).first()

    def list_documents(
        self,
        session_id: str,
        teacher_id: str
    ) -> List[PseudonymizedDocument]:
        """List all documents in a session (teacher-scoped), oldest first."""
        # Verify session belongs to teacher
        session = self.get_session(session_id, teacher_id)
        if not session:
            return []
        return self.db.query(PseudonymizedDocument).filter(
            PseudonymizedDocument.session_id == session_id
        ).order_by(PseudonymizedDocument.created_at).all()

    def update_document_ocr(
        self,
        doc_token: str,
        teacher_id: str,
        ocr_text: str,
        confidence: int = 0
    ) -> Optional[PseudonymizedDocument]:
        """Update document with OCR results and advance it to OCR_COMPLETED."""
        doc = self.get_document(doc_token, teacher_id)
        if doc:
            doc.ocr_text = ocr_text
            doc.ocr_confidence = confidence
            doc.status = DocumentStatus.OCR_COMPLETED
            self.db.commit()
            self.db.refresh(doc)
        return doc

    def update_document_ai_result(
        self,
        doc_token: str,
        teacher_id: str,
        feedback: str,
        score: Optional[int] = None,
        grade: Optional[str] = None,
        details: Optional[dict] = None
    ) -> Optional[PseudonymizedDocument]:
        """Update document with AI correction results.

        BUGFIX: processed_count is only incremented the first time a document
        reaches COMPLETED. Previously every call incremented it, so re-running
        a correction inflated the counter and could prematurely flip the
        session to COMPLETED.
        """
        doc = self.get_document(doc_token, teacher_id)
        if doc:
            first_completion = doc.status != DocumentStatus.COMPLETED
            doc.ai_feedback = feedback
            doc.ai_score = score
            doc.ai_grade = grade
            doc.ai_details = details or {}
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            session = doc.session
            if first_completion:
                session.processed_count += 1
            # Close the session once every document has been processed
            if session.processed_count >= session.document_count:
                session.status = SessionStatus.COMPLETED
                session.completed_at = datetime.utcnow()
            self.db.commit()
            self.db.refresh(doc)
        return doc

    def update_document_status(
        self,
        doc_token: str,
        teacher_id: str,
        status: DocumentStatus,
        error: Optional[str] = None
    ) -> Optional[PseudonymizedDocument]:
        """Update document processing status; records a start timestamp when
        entering either processing phase and keeps any error message."""
        doc = self.get_document(doc_token, teacher_id)
        if doc:
            doc.status = status
            if error:
                doc.processing_error = error
            if status in [DocumentStatus.OCR_PROCESSING, DocumentStatus.AI_PROCESSING]:
                doc.processing_started_at = datetime.utcnow()
            self.db.commit()
            self.db.refresh(doc)
        return doc

    # ==================== QR Batch Operations ====================

    def create_qr_batch(
        self,
        session_id: str,
        teacher_id: str,
        student_count: int,
        generated_tokens: List[str]
    ) -> Optional[QRBatchJob]:
        """Create a QR code batch job (returns None if the session is not
        owned by the teacher)."""
        # Verify session belongs to teacher
        session = self.get_session(session_id, teacher_id)
        if not session:
            return None
        batch = QRBatchJob(
            session_id=session_id,
            teacher_id=teacher_id,
            student_count=student_count,
            generated_tokens=generated_tokens
        )
        self.db.add(batch)
        self.db.commit()
        self.db.refresh(batch)
        return batch

    def get_qr_batch(
        self,
        batch_id: str,
        teacher_id: str
    ) -> Optional[QRBatchJob]:
        """Get a QR batch by ID (teacher-scoped)."""
        return self.db.query(QRBatchJob).filter(
            and_(
                QRBatchJob.id == batch_id,
                QRBatchJob.teacher_id == teacher_id
            )
        ).first()

    # ==================== Statistics (Anonymized) ====================

    def get_session_stats(
        self,
        session_id: str,
        teacher_id: str
    ) -> dict:
        """Get anonymized statistics for a session.

        Returns {} for an unknown/foreign session. BUGFIX: score_average is
        checked with `is not None` — the old truthiness check turned a
        legitimate average of 0 into None.
        """
        session = self.get_session(session_id, teacher_id)
        if not session:
            return {}
        # Count documents by status
        status_counts = self.db.query(
            PseudonymizedDocument.status,
            func.count(PseudonymizedDocument.doc_token)
        ).filter(
            PseudonymizedDocument.session_id == session_id
        ).group_by(PseudonymizedDocument.status).all()
        # Score statistics (anonymized aggregates only)
        score_stats = self.db.query(
            func.avg(PseudonymizedDocument.ai_score),
            func.min(PseudonymizedDocument.ai_score),
            func.max(PseudonymizedDocument.ai_score)
        ).filter(
            and_(
                PseudonymizedDocument.session_id == session_id,
                PseudonymizedDocument.ai_score.isnot(None)
            )
        ).first()
        avg_score = score_stats[0]
        return {
            "session_id": session_id,
            "total_documents": session.document_count,
            "processed_documents": session.processed_count,
            "status_breakdown": {s.value: c for s, c in status_counts},
            "score_average": float(avg_score) if avg_score is not None else None,
            "score_min": score_stats[1],
            "score_max": score_stats[2]
        }

    # ==================== Data Retention ====================

    def cleanup_expired_sessions(self) -> int:
        """Soft-delete sessions past their retention date and wipe their
        encrypted identity maps. Returns the number of sessions deleted."""
        now = datetime.utcnow()
        expired = self.db.query(ExamSession).filter(
            and_(
                ExamSession.retention_until < now,
                ExamSession.status != SessionStatus.DELETED
            )
        ).all()
        count = len(expired)
        for session in expired:
            session.status = SessionStatus.DELETED
            # Clear sensitive data so the identity mapping is unrecoverable
            session.encrypted_identity_map = None
            session.identity_map_iv = None
        self.db.commit()
        return count

1970
backend/klausur/routes.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,28 @@
"""
Services for Klausurkorrektur Module.
- PseudonymizationService: QR code generation, header redaction
- CorrectionService: LLM integration for AI-assisted grading
- RosterParser: Parse Klassenbuch photos and roster files
- SchoolResolver: School/class selection and auto-creation
- ModuleLinker: Cross-module links (Notenbuch, Elternabend, etc.)
"""
from .pseudonymizer import PseudonymizationService, get_pseudonymizer
from .correction_service import ExamCorrectionService, get_correction_service
from .roster_parser import RosterParser, get_roster_parser
from .school_resolver import SchoolResolver, get_school_resolver
from .module_linker import ModuleLinker, get_module_linker
__all__ = [
"PseudonymizationService",
"get_pseudonymizer",
"ExamCorrectionService",
"get_correction_service",
"RosterParser",
"get_roster_parser",
"SchoolResolver",
"get_school_resolver",
"ModuleLinker",
"get_module_linker",
]

View File

@@ -0,0 +1,379 @@
"""
Exam Correction Service using Self-Hosted LLM.
PRIVACY BY DESIGN:
- Only pseudonymized text (doc_token + OCR content) is sent to LLM
- No student names or personal data in prompts
- All processing happens on self-hosted infrastructure (SysEleven)
- No data sent to external APIs (unless explicitly configured)
This service generates AI-assisted corrections and feedback for exam answers.
"""
import logging
from typing import Optional, List
from dataclasses import dataclass
from llm_gateway.services.inference import get_inference_service, InferenceResult
from llm_gateway.models.chat import ChatCompletionRequest, ChatMessage
from llm_gateway.config import get_config
logger = logging.getLogger(__name__)
@dataclass
class QuestionRubric:
    """Rubric for a single exam question.

    All fields are embedded verbatim into the (pseudonymized) grading prompt.
    """
    question_number: int
    question_text: str
    max_points: int
    expected_answer: str
    grading_criteria: str
@dataclass
class QuestionResult:
    """AI correction result for a single question."""
    question_number: int
    points_awarded: int
    max_points: int
    feedback: str
    # Bullet-point lists extracted from the LLM's JSON response
    strengths: List[str]
    improvements: List[str]
@dataclass
class CorrectionResult:
    """Complete correction result for an exam."""
    doc_token: str  # Pseudonymized identifier (never a student name)
    total_score: int
    max_score: int
    grade: str  # German grade label, e.g. "2+"
    overall_feedback: str
    question_results: List[QuestionResult]
    processing_time_ms: int  # wall-clock time spent in correct_exam
# German grading scale (can be customized).
# Ordered best-to-worst as (minimum percentage, grade label); the first
# threshold that the percentage reaches determines the grade.
GERMAN_GRADES = [
    (95, "1+"),  # sehr gut plus
    (90, "1"),   # sehr gut
    (85, "1-"),  # sehr gut minus
    (80, "2+"),  # gut plus
    (75, "2"),   # gut
    (70, "2-"),  # gut minus
    (65, "3+"),  # befriedigend plus
    (60, "3"),   # befriedigend
    (55, "3-"),  # befriedigend minus
    (50, "4+"),  # ausreichend plus
    (45, "4"),   # ausreichend
    (40, "4-"),  # ausreichend minus
    (33, "5+"),  # mangelhaft plus
    (27, "5"),   # mangelhaft minus -- see scale above
    (20, "5-"),  # mangelhaft minus
    (0, "6"),    # ungenuegend
]


def calculate_grade(percentage: float) -> str:
    """Map a score percentage onto the German grade scale.

    Returns the grade of the first (highest) threshold the percentage
    reaches; anything below every threshold falls back to "6".
    """
    return next(
        (label for minimum, label in GERMAN_GRADES if percentage >= minimum),
        "6",
    )
class ExamCorrectionService:
    """
    Service for AI-assisted exam correction.

    PRIVACY GUARANTEES:
    1. Prompts contain NO personal data
    2. Only doc_token is used as reference
    3. Processing on self-hosted LLM
    4. Results stored with pseudonymized identifiers
    """

    # System prompt for exam correction (German). It instructs the model to
    # grade content only, ignore spelling (outside German exams), and answer
    # with exactly one JSON object: {points, feedback, strengths, improvements}.
    # correct_question() parses that JSON from the response.
    CORRECTION_SYSTEM_PROMPT = """Du bist ein erfahrener Lehrer und korrigierst Schuelerantworten.
WICHTIGE REGELN:
1. Bewerte NUR den fachlichen Inhalt der Antwort
2. Ignoriere Rechtschreibfehler (ausser bei Deutschklausuren)
3. Gib konstruktives, ermutigzendes Feedback
4. Beziehe dich auf die Bewertungskriterien
5. Sei fair und konsistent
AUSGABEFORMAT (JSON):
{
"points": <Punktzahl>,
"feedback": "<Kurze Begruendung der Bewertung>",
"strengths": ["<Staerke 1>", "<Staerke 2>"],
"improvements": ["<Verbesserungsvorschlag 1>"]
}
Antworte NUR mit dem JSON-Objekt, ohne weitere Erklaerungen."""

    # Template for the whole-exam summary; filled via str.format in
    # _generate_overall_feedback with the per-question results and totals.
    OVERALL_FEEDBACK_PROMPT = """Basierend auf den einzelnen Bewertungen, erstelle eine Gesamtrueckmeldung.
Einzelbewertungen:
{question_results}
Gesamtpunktzahl: {total_score}/{max_score} ({percentage}%)
Note: {grade}
Erstelle eine motivierende Gesamtrueckmeldung (2-3 Saetze), die:
1. Die Staerken hervorhebt
2. Konstruktive Verbesserungsvorschlaege macht
3. Ermutigt und motiviert
Antworte nur mit dem Feedback-Text, ohne JSON-Formatierung."""
def __init__(self, model: Optional[str] = None):
    """
    Initialize the correction service.

    Args:
        model: LLM model to use; falls back to the configured
            correction model (default: qwen2.5:14b from config).

    PRIVACY:
        The model is served by the self-hosted inference service, so no
        data is sent to external servers. NOTE(review): the original
        comment said the model runs "locally on the Mac Mini via Ollama",
        while the module header says "Self-hosted LLM at SysEleven" —
        confirm which deployment is current.
    """
    config = get_config()
    # Use configured correction model (default: qwen2.5:14b)
    self.model = model or config.correction_model
    self.inference = get_inference_service()
    logger.info(f"Correction service initialized with model: {self.model}")
async def correct_question(
    self,
    student_answer: str,
    rubric: QuestionRubric,
    subject: str = "Allgemein"
) -> QuestionResult:
    """
    Correct a single question answer.

    Args:
        student_answer: The student's OCR-extracted answer (pseudonymized)
        rubric: Grading rubric for this question
        subject: Subject for context

    Returns:
        QuestionResult with points and feedback. On LLM or parse failure a
        zero-point result flagged for manual review is returned instead of
        raising.
    """
    import json

    # Build prompt with NO personal data
    user_prompt = f"""Fach: {subject}
Frage {rubric.question_number}: {rubric.question_text}
Maximale Punktzahl: {rubric.max_points}
Erwartete Antwort:
{rubric.expected_answer}
Bewertungskriterien:
{rubric.grading_criteria}
---
Schuelerantwort:
{student_answer}
---
Bewerte diese Antwort nach den Kriterien."""
    request = ChatCompletionRequest(
        model=self.model,
        messages=[
            ChatMessage(role="system", content=self.CORRECTION_SYSTEM_PROMPT),
            ChatMessage(role="user", content=user_prompt),
        ],
        temperature=0.3,  # Lower temperature for consistent grading
        max_tokens=500,
    )
    try:
        response = await self.inference.complete(request)
        content = response.choices[0].message.content or "{}"
        # Parse JSON response
        try:
            result = json.loads(content)
        except json.JSONDecodeError:
            # Fallback: model ignored the JSON-only instruction — award half
            # points and flag the document for manual review
            logger.warning(f"Failed to parse LLM response as JSON: {content[:100]}")
            result = {
                "points": rubric.max_points // 2,
                "feedback": content[:200],
                "strengths": [],
                "improvements": ["Automatische Bewertung fehlgeschlagen - manuelle Pruefung erforderlich"]
            }
        # BUGFIX: the old int(...) crashed on float-like strings ("5.5"),
        # which the broad except below silently turned into 0 points, and the
        # bare min() allowed negative scores. Parse tolerantly and clamp to
        # the valid range [0, max_points].
        try:
            points = int(float(result.get("points", 0)))
        except (TypeError, ValueError):
            points = 0
        points = max(0, min(points, rubric.max_points))
        return QuestionResult(
            question_number=rubric.question_number,
            points_awarded=points,
            max_points=rubric.max_points,
            feedback=result.get("feedback", ""),
            strengths=result.get("strengths", []),
            improvements=result.get("improvements", []),
        )
    except Exception as e:
        # Never propagate — a failed question becomes a zero-point result
        # that asks the teacher for manual correction.
        logger.error(f"Correction failed for question {rubric.question_number}: {e}")
        return QuestionResult(
            question_number=rubric.question_number,
            points_awarded=0,
            max_points=rubric.max_points,
            feedback=f"Automatische Bewertung fehlgeschlagen: {str(e)}",
            strengths=[],
            improvements=["Manuelle Korrektur erforderlich"],
        )
async def correct_exam(
    self,
    doc_token: str,
    ocr_text: str,
    rubrics: List[QuestionRubric],
    subject: str = "Allgemein"
) -> CorrectionResult:
    """
    Correct a complete exam with multiple questions.

    Args:
        doc_token: Pseudonymized document identifier
        ocr_text: Full OCR text of the exam (already redacted)
        rubrics: List of question rubrics
        subject: Subject name

    Returns:
        CorrectionResult with all per-question scores, the total, the
        German grade, an overall feedback text, and the processing time.
    """
    import time
    started = time.time()

    # Heuristically split the redacted OCR text into one answer per rubric;
    # pad with empty strings so every rubric gets graded.
    raw_answers = self._extract_answers(ocr_text, len(rubrics))
    answers = (raw_answers + [""] * len(rubrics))[:len(rubrics)]

    # Grade each question sequentially
    per_question = []
    for question_rubric, answer_text in zip(rubrics, answers):
        per_question.append(
            await self.correct_question(answer_text, question_rubric, subject)
        )

    # Aggregate totals and derive the German grade
    achieved = sum(q.points_awarded for q in per_question)
    possible = sum(q.max_points for q in per_question)
    pct = (achieved / possible * 100) if possible > 0 else 0
    final_grade = calculate_grade(pct)

    summary = await self._generate_overall_feedback(
        per_question, achieved, possible, pct, final_grade
    )

    elapsed_ms = int((time.time() - started) * 1000)
    return CorrectionResult(
        doc_token=doc_token,
        total_score=achieved,
        max_score=possible,
        grade=final_grade,
        overall_feedback=summary,
        question_results=per_question,
        processing_time_ms=elapsed_ms,
    )
async def _generate_overall_feedback(
self,
question_results: List[QuestionResult],
total_score: int,
max_score: int,
percentage: float,
grade: str
) -> str:
"""Generate motivating overall feedback."""
# Summarize question results
results_summary = "\n".join([
f"Frage {r.question_number}: {r.points_awarded}/{r.max_points} Punkte - {r.feedback[:100]}"
for r in question_results
])
prompt = self.OVERALL_FEEDBACK_PROMPT.format(
question_results=results_summary,
total_score=total_score,
max_score=max_score,
percentage=f"{percentage:.1f}",
grade=grade,
)
request = ChatCompletionRequest(
model=self.model,
messages=[
ChatMessage(role="user", content=prompt),
],
temperature=0.5,
max_tokens=200,
)
try:
response = await self.inference.complete(request)
return response.choices[0].message.content or "Gute Arbeit! Weiter so."
except Exception as e:
logger.error(f"Failed to generate overall feedback: {e}")
return f"Gesamtergebnis: {total_score}/{max_score} Punkte ({grade})"
def _extract_answers(self, ocr_text: str, num_questions: int) -> List[str]:
"""
Extract individual answers from OCR text.
Simple heuristic: split by question markers (1., 2., etc.)
More sophisticated extraction can be implemented.
"""
import re
# Try to find question markers
pattern = r'(?:^|\n)\s*(\d+)[.\)]\s*'
parts = re.split(pattern, ocr_text)
answers = []
i = 1 # Skip first empty part
while i < len(parts):
if i + 1 < len(parts):
# parts[i] is the question number, parts[i+1] is the answer
answers.append(parts[i + 1].strip())
i += 2
# Pad with empty answers if needed
while len(answers) < num_questions:
answers.append("")
return answers[:num_questions]
# Singleton instance (lazily created by get_correction_service)
_correction_service: Optional[ExamCorrectionService] = None
def get_correction_service(model: Optional[str] = None) -> ExamCorrectionService:
    """
    Return the process-wide correction service, creating it on first use.

    Args:
        model: Optional model override. If None, uses
            config.correction_model (qwen2.5:14b).

    Returns:
        ExamCorrectionService instance.

    PRIVACY: all processing happens locally via Ollama - no cloud API.
    """
    global _correction_service
    # Recreate only on first use or when a different model is requested.
    needs_new = (
        _correction_service is None
        or (model is not None and _correction_service.model != model)
    )
    if needs_new:
        _correction_service = ExamCorrectionService(model=model)
    return _correction_service

View File

@@ -0,0 +1,630 @@
"""
Module Linker Service - Cross-Module Verknuepfungen.
Verknuepft Klausur-Ergebnisse mit anderen BreakPilot-Modulen:
- Notenbuch (School Service)
- Elternabend (Gespraechsvorschlaege)
- Zeugnisse (Notenuebernahme)
- Kalender (Termine)
Privacy:
- Verknuepfungen nutzen doc_tokens (pseudonymisiert)
- Deanonymisierung nur Client-seitig moeglich
"""
import logging
import os
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Any, Dict, List, Optional

import httpx
# ============================================================================
# DATA CLASSES
# ============================================================================
class LinkType(str, Enum):
    """Kind of cross-module link created from an exam session."""
    NOTENBUCH = "notenbuch"      # grade book (school service)
    ELTERNABEND = "elternabend"  # parent-meeting module
    ZEUGNIS = "zeugnis"          # report cards / grade aggregation
    CALENDAR = "calendar"        # calendar events
    KLASSENBUCH = "klassenbuch"  # class register
class MeetingUrgency(str, Enum):
    """Urgency level of a suggested parent meeting (German display values)."""
    LOW = "niedrig"
    MEDIUM = "mittel"
    HIGH = "hoch"
@dataclass
class CorrectionResult:
    """Correction result for one exam document (pseudonymized)."""
    doc_token: str    # pseudonymous document identifier, never a name
    score: float      # points achieved
    max_score: float  # maximum achievable points
    grade: str        # display grade, e.g. "2+"
    feedback: str     # overall feedback text
    question_results: List[Dict[str, Any]] = field(default_factory=list)  # per-question detail dicts
@dataclass
class GradeEntry:
    """Grade-book entry linking a pseudonymous result to a student."""
    student_id: str  # real student ID (grade-book side)
    doc_token: str   # pseudonymized token (exam side)
    grade: str       # display grade
    points: float
    max_points: float
    exam_name: str
    date: str        # exam date; format set by the caller (presumably ISO) - confirm
@dataclass
class ParentMeetingSuggestion:
    """Suggestion for a parent meeting (pseudonymized)."""
    doc_token: str  # pseudonymized; de-anonymization happens client-side
    reason: str     # human-readable trigger, e.g. "Note 5 in Mathematik"
    urgency: MeetingUrgency
    grade: str
    subject: str
    suggested_topics: List[str] = field(default_factory=list)  # talking points
@dataclass
class CalendarEvent:
    """Calendar entry, e.g. a scheduled parent meeting."""
    id: str
    title: str
    description: str
    start_time: datetime
    end_time: datetime
    event_type: str  # e.g. "parent_meeting"
    linked_doc_tokens: List[str] = field(default_factory=list)  # pseudonymized links
@dataclass
class ModuleLink:
    """Link from an exam session to an entity in another module."""
    id: str
    klausur_session_id: str
    link_type: LinkType
    target_module: str     # e.g. "school", "elternabend"
    target_entity_id: str  # ID inside the target module
    target_url: Optional[str] = None  # deep link into the target UI
    link_metadata: Dict[str, Any] = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.utcnow)  # naive UTC timestamp
@dataclass
class LinkResult:
    """Outcome of a linking operation."""
    success: bool
    link: Optional[ModuleLink] = None  # set on success where a link entity exists
    message: str = ""                  # human-readable status (German, user-facing)
    target_url: Optional[str] = None   # where the UI should navigate next
# ============================================================================
# MODULE LINKER
# ============================================================================
class ModuleLinker:
    """
    Links exam-correction results to other BreakPilot modules.

    Supported targets:
        - Notenbuch (school service): transfer grades
        - Elternabend: suggest parent meetings for weak results
        - Zeugnis: feed grade aggregation for report cards
        - Kalender: create meeting slots

    Privacy:
        All operations work on pseudonymized ``doc_token`` values. The
        mapping to real students stays client-side unless the teacher
        explicitly supplies an ``identity_map``.

    Example:
        linker = ModuleLinker()

        # Transfer grades into the grade book
        result = await linker.link_to_notenbuch(
            session_id="session-123",
            class_id="class-456",
            results=correction_results
        )

        # Suggest parent meetings
        suggestions = linker.suggest_elternabend(
            results=correction_results,
            subject="Mathematik"
        )
    """

    # Ordered grade scale (best to worst) with the minimum score fraction
    # required for each grade. Key ORDER matters: suggest_elternabend uses
    # the key positions as a severity scale.
    GRADE_THRESHOLDS = {
        "1+": 0.95, "1": 0.90, "1-": 0.85,
        "2+": 0.80, "2": 0.75, "2-": 0.70,
        "3+": 0.65, "3": 0.60, "3-": 0.55,
        "4+": 0.50, "4": 0.45, "4-": 0.40,
        "5+": 0.33, "5": 0.25, "5-": 0.17,
        "6": 0.0
    }

    # Grades that trigger a meeting at the default threshold ("4").
    MEETING_TRIGGER_GRADES = ["4", "4-", "5+", "5", "5-", "6"]

    # Numeric value per display grade, shared by all statistics helpers so
    # averages and medians cannot drift apart (previously this table was
    # duplicated in two methods).
    GRADE_VALUES = {
        "1+": 0.7, "1": 1.0, "1-": 1.3,
        "2+": 1.7, "2": 2.0, "2-": 2.3,
        "3+": 2.7, "3": 3.0, "3-": 3.3,
        "4+": 3.7, "4": 4.0, "4-": 4.3,
        "5+": 4.7, "5": 5.0, "5-": 5.3,
        "6": 6.0
    }

    # Upper bounds for mapping a numeric grade value back to its display
    # form; anything above the last bound is a "6".
    _GRADE_CUTOFFS = [
        (1.15, "1+"), (1.5, "1"), (1.85, "1-"),
        (2.15, "2+"), (2.5, "2"), (2.85, "2-"),
        (3.15, "3+"), (3.5, "3"), (3.85, "3-"),
        (4.15, "4+"), (4.5, "4"), (4.85, "4-"),
        (5.15, "5+"), (5.5, "5"), (5.85, "5-"),
    ]

    def __init__(self):
        # Service endpoints are configurable via environment for tests
        # and alternative deployments.
        self.school_service_url = os.getenv(
            "SCHOOL_SERVICE_URL",
            "http://school-service:8084"
        )
        self.calendar_service_url = os.getenv(
            "CALENDAR_SERVICE_URL",
            "http://calendar-service:8085"
        )

    # =========================================================================
    # NOTENBUCH INTEGRATION
    # =========================================================================

    async def link_to_notenbuch(
        self,
        session_id: str,
        class_id: str,
        subject: str,
        results: List[CorrectionResult],
        exam_name: str,
        exam_date: str,
        identity_map: Optional[Dict[str, str]] = None
    ) -> LinkResult:
        """
        Transfer grades into the grade book (school service).

        Args:
            session_id: Exam session ID.
            class_id: Class ID inside the school service.
            subject: Subject name.
            results: Correction results (pseudonymized).
            exam_name: Name of the exam.
            exam_date: Date of the exam.
            identity_map: Optional doc_token -> student_id mapping.

        Note:
            The identity_map is only used server-side when the teacher
            explicitly releases the link; normally the mapping stays on
            the client.
        """
        try:
            # Build one grade record per pseudonymized result.
            grades_data = []
            for result in results:
                grade_entry = {
                    "doc_token": result.doc_token,
                    "grade": result.grade,
                    "points": result.score,
                    "max_points": result.max_score,
                    "percentage": result.score / result.max_score if result.max_score > 0 else 0
                }
                # Attach the real student ID only when explicitly provided.
                if identity_map and result.doc_token in identity_map:
                    grade_entry["student_id"] = identity_map[result.doc_token]
                grades_data.append(grade_entry)
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/classes/{class_id}/exams",
                    json={
                        "name": exam_name,
                        "subject": subject,
                        "date": exam_date,
                        "max_points": results[0].max_score if results else 100,
                        "grades": grades_data,
                        "klausur_session_id": session_id
                    }
                )
            if response.status_code in (200, 201):
                data = response.json()
                return LinkResult(
                    success=True,
                    link=ModuleLink(
                        id=data.get('id', ''),
                        klausur_session_id=session_id,
                        link_type=LinkType.NOTENBUCH,
                        target_module="school",
                        target_entity_id=data.get('id', ''),
                        target_url=f"/app?module=school&class={class_id}&exam={data.get('id')}"
                    ),
                    message=f"Noten erfolgreich uebertragen ({len(results)} Eintraege)",
                    target_url=f"/app?module=school&class={class_id}"
                )
            return LinkResult(
                success=False,
                message=f"Fehler beim Uebertragen: {response.status_code}"
            )
        except Exception as e:
            return LinkResult(
                success=False,
                message=f"Verbindungsfehler: {str(e)}"
            )

    # =========================================================================
    # ELTERNABEND SUGGESTIONS
    # =========================================================================

    def suggest_elternabend(
        self,
        results: List[CorrectionResult],
        subject: str,
        threshold_grade: str = "4"
    ) -> List[ParentMeetingSuggestion]:
        """
        Suggest parent meetings for students with weak results.

        Args:
            results: Correction results (pseudonymized).
            subject: Subject name.
            threshold_grade: Grades equal to or worse than this trigger a
                suggestion. The default "4" matches MEETING_TRIGGER_GRADES.

        Returns:
            Suggestions (pseudonymized), most urgent first.
        """
        # BUGFIX: threshold_grade was previously computed into an index
        # that was never used; the ordered grade scale now actually
        # drives the trigger. Default behavior is unchanged.
        scale = list(self.GRADE_THRESHOLDS.keys())
        if threshold_grade in scale:
            threshold_idx = scale.index(threshold_grade)
        else:
            threshold_idx = scale.index("4")
        suggestions: List[ParentMeetingSuggestion] = []
        for result in results:
            # Grades not on the scale cannot be compared - skip them.
            if result.grade not in scale:
                continue
            # Positions grow toward worse grades; trigger at/after threshold.
            if scale.index(result.grade) < threshold_idx:
                continue
            suggestions.append(ParentMeetingSuggestion(
                doc_token=result.doc_token,
                reason=f"Note {result.grade} in {subject}",
                urgency=self._determine_urgency(result.grade),
                grade=result.grade,
                subject=subject,
                suggested_topics=self._generate_meeting_topics(result, subject)
            ))
        # Most urgent meetings first.
        urgency_order = {
            MeetingUrgency.HIGH: 0,
            MeetingUrgency.MEDIUM: 1,
            MeetingUrgency.LOW: 2
        }
        suggestions.sort(key=lambda s: urgency_order[s.urgency])
        return suggestions

    def _determine_urgency(self, grade: str) -> MeetingUrgency:
        """Map a display grade to a meeting urgency level."""
        if grade in ["5-", "6"]:
            return MeetingUrgency.HIGH
        elif grade in ["5", "5+"]:
            return MeetingUrgency.MEDIUM
        else:
            return MeetingUrgency.LOW

    def _generate_meeting_topics(
        self,
        result: CorrectionResult,
        subject: str
    ) -> List[str]:
        """Derive talking points for a parent meeting from the result."""
        topics = [f"Leistungsstand in {subject}"]
        # BUGFIX: the original compared the capitalized needle
        # "Verstaendnis" against an already lowercased string, so the
        # check could never match.
        feedback_lc = result.feedback.lower()
        if "verstaendnis" in feedback_lc or "grundlagen" in feedback_lc:
            topics.append("Grundlagenverstaendnis foerdern")
        if "uebung" in feedback_lc:
            topics.append("Zusaetzliche Uebungsmoeglichkeiten")
        # Flag weak questions (< 50% of the available points).
        if result.question_results:
            weak_areas = [
                qr.get('question_text', '')
                for qr in result.question_results
                # "or 1" guards against an explicit max_points of 0.
                if qr.get('points_awarded', 0) / (qr.get('max_points', 1) or 1) < 0.5
            ]
            if weak_areas:
                topics.append("Gezielte Foerderung in Schwachstellen")
        # Top up with standard topics so there are always >= 3.
        if len(topics) < 3:
            topics.extend([
                "Lernstrategien besprechen",
                "Unterstuetzungsmoeglichkeiten zu Hause",
                "Nachhilfe-Optionen"
            ])
        return topics[:5]  # at most 5 topics

    async def create_elternabend_link(
        self,
        session_id: str,
        suggestions: List[ParentMeetingSuggestion],
        teacher_id: str
    ) -> LinkResult:
        """Create a link to the parent-meeting module (metadata only)."""
        # TODO: integrate with the Elternabend module; currently only
        # summary metadata is stored.
        return LinkResult(
            success=True,
            link=ModuleLink(
                id=f"elternabend-{session_id}",
                klausur_session_id=session_id,
                link_type=LinkType.ELTERNABEND,
                target_module="elternabend",
                target_entity_id="",
                link_metadata={
                    "suggestion_count": len(suggestions),
                    "high_urgency_count": sum(
                        1 for s in suggestions if s.urgency == MeetingUrgency.HIGH
                    )
                }
            ),
            message=f"{len(suggestions)} Elterngespraeche vorgeschlagen",
            target_url="/app?module=elternabend"
        )

    # =========================================================================
    # ZEUGNIS INTEGRATION
    # =========================================================================

    async def update_zeugnis(
        self,
        class_id: str,
        subject: str,
        grades: Dict[str, str],
        exam_weight: float = 1.0
    ) -> LinkResult:
        """
        Update the report-card aggregation with new grades.

        Args:
            class_id: Class ID.
            subject: Subject name.
            grades: doc_token -> grade mapping (pseudonymized).
            exam_weight: Weight of this exam (default: 1.0).
        """
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/classes/{class_id}/grades/aggregate",
                    json={
                        "subject": subject,
                        "grades": grades,
                        "weight": exam_weight,
                        "type": "klausur"
                    }
                )
            if response.status_code in (200, 201):
                return LinkResult(
                    success=True,
                    message="Zeugnis-Daten aktualisiert",
                    target_url=f"/app?module=school&class={class_id}&tab=certificates"
                )
            return LinkResult(
                success=False,
                message=f"Fehler: {response.status_code}"
            )
        except Exception as e:
            return LinkResult(
                success=False,
                message=f"Verbindungsfehler: {str(e)}"
            )

    # =========================================================================
    # CALENDAR INTEGRATION
    # =========================================================================

    async def create_calendar_events(
        self,
        teacher_id: str,
        suggestions: List[ParentMeetingSuggestion],
        default_duration_minutes: int = 30
    ) -> List[CalendarEvent]:
        """
        Create calendar entries for suggested parent meetings.

        Slots start next Monday at 14:00 and are assigned back to back;
        once a slot would start at or after 18:00, scheduling continues
        on the following day at 14:00. Events are returned even when the
        calendar service cannot be reached (best effort push).

        Args:
            teacher_id: Teacher the events belong to.
            suggestions: Meeting suggestions (pseudonymized).
            default_duration_minutes: Duration per meeting slot.
        """
        events: List[CalendarEvent] = []
        # First slot: next Monday, 14:00 local time.
        start_date = datetime.now() + timedelta(days=7 - datetime.now().weekday())
        start_date = start_date.replace(hour=14, minute=0, second=0, microsecond=0)
        slot_index = 0
        for suggestion in suggestions:
            event_start = start_date + timedelta(minutes=slot_index * default_duration_minutes)
            event_end = event_start + timedelta(minutes=default_duration_minutes)
            # Roll over to the next day once the afternoon window is full.
            if event_start.hour >= 18:
                start_date += timedelta(days=1)
                start_date = start_date.replace(hour=14)
                slot_index = 0
                event_start = start_date
                event_end = event_start + timedelta(minutes=default_duration_minutes)
            events.append(CalendarEvent(
                id=f"meeting-{suggestion.doc_token[:8]}",
                title=f"Elterngespraech ({suggestion.grade})",
                description=f"Anlass: {suggestion.reason}\n\nThemen:\n" +
                            "\n".join(f"- {t}" for t in suggestion.suggested_topics),
                start_time=event_start,
                end_time=event_end,
                event_type="parent_meeting",
                linked_doc_tokens=[suggestion.doc_token]
            ))
            slot_index += 1
        # Push events to the calendar service.
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                for event in events:
                    await client.post(
                        f"{self.calendar_service_url}/api/events",
                        json={
                            "teacher_id": teacher_id,
                            "title": event.title,
                            "description": event.description,
                            "start": event.start_time.isoformat(),
                            "end": event.end_time.isoformat(),
                            "type": event.event_type,
                            "metadata": {
                                "doc_tokens": event.linked_doc_tokens
                            }
                        }
                    )
        except Exception as e:
            # FIX: log via the logging framework instead of print() so
            # failures show up in the service logs.
            logging.getLogger(__name__).warning(
                "[ModuleLinker] Calendar service error: %s", e
            )
        return events

    # =========================================================================
    # STATISTICS
    # =========================================================================

    def calculate_grade_statistics(
        self,
        results: List[CorrectionResult]
    ) -> Dict[str, Any]:
        """
        Compute grade statistics for a set of correction results.

        Returns:
            Dict with average, distribution, median, pass/fail counts
            etc.; an empty dict for empty input.
        """
        if not results:
            return {}
        grades = [r.grade for r in results]
        points = [r.score for r in results]
        max_points = results[0].max_score if results else 100
        # Numeric average; unknown grades count as a neutral 4.0.
        numeric_grades = [self.GRADE_VALUES.get(g, 4.0) for g in grades]
        avg_grade = sum(numeric_grades) / len(numeric_grades)
        # Raw per-grade counts.
        distribution: Dict[str, int] = {}
        for grade in grades:
            distribution[grade] = distribution.get(grade, 0) + 1
        # Coarse counts per full grade (display labels are user-facing).
        percent_distribution = {
            "sehr gut (1)": sum(1 for g in grades if g.startswith("1")),
            "gut (2)": sum(1 for g in grades if g.startswith("2")),
            "befriedigend (3)": sum(1 for g in grades if g.startswith("3")),
            "ausreichend (4)": sum(1 for g in grades if g.startswith("4")),
            "mangelhaft (5)": sum(1 for g in grades if g.startswith("5")),
            "ungenuegend (6)": sum(1 for g in grades if g == "6")
        }
        avg_points = sum(points) / len(points)
        # Guard against a zero max score (would raise ZeroDivisionError).
        avg_percent = round(avg_points / max_points * 100, 1) if max_points else 0.0
        return {
            "count": len(results),
            "average_grade": round(avg_grade, 2),
            "average_grade_display": self._numeric_to_grade(avg_grade),
            "average_points": round(avg_points, 1),
            "max_points": max_points,
            "average_percent": avg_percent,
            # FIX: the worst-grade lookup previously defaulted unknown
            # grades to 0 (the "best" value), so they could never be
            # selected as worst. Unknown grades now pessimistically
            # count as 6.0 in both lookups.
            "best_grade": min(grades, key=lambda g: self.GRADE_VALUES.get(g, 6.0)),
            "worst_grade": max(grades, key=lambda g: self.GRADE_VALUES.get(g, 6.0)),
            "median_grade": self._calculate_median_grade(grades),
            "distribution": distribution,
            "percent_distribution": percent_distribution,
            "passing_count": sum(1 for g in grades if not g.startswith("5") and g != "6"),
            "failing_count": sum(1 for g in grades if g.startswith("5") or g == "6")
        }

    def _numeric_to_grade(self, value: float) -> str:
        """Map a numeric grade value back to its display grade."""
        for upper_bound, grade in self._GRADE_CUTOFFS:
            if value <= upper_bound:
                return grade
        return "6"

    def _calculate_median_grade(self, grades: List[str]) -> str:
        """Return the median grade of a non-empty grade list."""
        numeric = sorted(self.GRADE_VALUES.get(g, 4.0) for g in grades)
        n = len(numeric)
        if n % 2 == 0:
            median = (numeric[n // 2 - 1] + numeric[n // 2]) / 2
        else:
            median = numeric[n // 2]
        return self._numeric_to_grade(median)
# Singleton (lazily created by get_module_linker)
_module_linker: Optional[ModuleLinker] = None
def get_module_linker() -> ModuleLinker:
    """Return the process-wide ModuleLinker, creating it on first use."""
    global _module_linker
    linker = _module_linker
    if linker is None:
        linker = ModuleLinker()
        _module_linker = linker
    return linker

View File

@@ -0,0 +1,424 @@
"""
Background Processing Service for Klausur Correction.
Orchestrates the complete correction pipeline:
1. Load documents from storage
2. Run TrOCR for text extraction
3. Run AI correction for grading
4. Save results to database
PRIVACY BY DESIGN:
- Only pseudonymized doc_tokens used throughout
- No student names in processing pipeline
- All data stays on self-hosted infrastructure
"""
import asyncio
import logging
from datetime import datetime
from typing import Optional, List, Callable
from dataclasses import dataclass
from sqlalchemy.orm import Session
from ..db_models import (
ExamSession, PseudonymizedDocument,
SessionStatus, DocumentStatus
)
from ..repository import KlausurRepository
from .trocr_client import get_trocr_client, TrOCRClient
from .vision_ocr_service import get_vision_ocr_service, VisionOCRService
from .correction_service import (
get_correction_service, ExamCorrectionService,
QuestionRubric, CorrectionResult
)
from .storage_service import get_storage_service, KlausurStorageService
logger = logging.getLogger(__name__)
@dataclass
class ProcessingProgress:
    """Progress snapshot streamed to the client via SSE."""
    session_id: str
    total_documents: int
    processed_documents: int
    current_document: Optional[str] = None  # truncated doc_token of the current document
    current_step: str = "idle"  # ocr, correction, saving
    error: Optional[str] = None

    @property
    def percentage(self) -> int:
        """Completed fraction as a truncated integer percentage (0 when empty)."""
        return (
            int(self.processed_documents / self.total_documents * 100)
            if self.total_documents != 0
            else 0
        )
class ProcessingService:
    """
    Background service for exam correction processing.

    Pipeline per document: load image from storage -> OCR (Vision-LLM or
    TrOCR) -> optional AI correction -> persist results. Only
    pseudonymized doc_tokens appear in logs and processing.

    Usage:
        service = ProcessingService(db_session)
        await service.process_session(session_id, teacher_id)
    """
    def __init__(
        self,
        db: Session,
        trocr_client: Optional[TrOCRClient] = None,
        vision_ocr_service: Optional[VisionOCRService] = None,
        correction_service: Optional[ExamCorrectionService] = None,
        storage_service: Optional[KlausurStorageService] = None,
        prefer_vision_ocr: bool = True  # prefer the Vision-LLM for handwriting OCR
    ):
        # Dependencies are injectable for tests; defaults come from the
        # module-level singleton accessors.
        self.db = db
        self.repo = KlausurRepository(db)
        self.trocr = trocr_client or get_trocr_client()
        self.vision_ocr = vision_ocr_service or get_vision_ocr_service()
        self.correction = correction_service or get_correction_service()
        self.storage = storage_service or get_storage_service()
        self.prefer_vision_ocr = prefer_vision_ocr
        # Progress callback for SSE streaming (set via set_progress_callback)
        self._progress_callback: Optional[Callable[[ProcessingProgress], None]] = None
    def set_progress_callback(self, callback: Callable[[ProcessingProgress], None]):
        """Set callback for progress updates (SSE streaming)."""
        self._progress_callback = callback
    def _notify_progress(self, progress: ProcessingProgress):
        """Notify progress to callback if set; callback errors never abort processing."""
        if self._progress_callback:
            try:
                self._progress_callback(progress)
            except Exception as e:
                logger.warning(f"Progress callback failed: {e}")
    async def process_session(
        self,
        session_id: str,
        teacher_id: str,
        use_ai_correction: bool = True
    ) -> bool:
        """
        Process all documents in a session.

        A failing document is marked FAILED and does not stop the rest of
        the batch; the session is marked COMPLETED at the end either way.

        Args:
            session_id: Exam session ID
            teacher_id: Teacher ID for isolation
            use_ai_correction: Whether to run AI correction (requires LLM)

        Returns:
            True if processing completed successfully
        """
        # Get session (scoped to the teacher for isolation)
        session = self.repo.get_session(session_id, teacher_id)
        if not session:
            logger.error(f"Session not found: {session_id}")
            return False
        # Get documents
        documents = self.repo.list_documents(session_id, teacher_id)
        if not documents:
            logger.warning(f"No documents in session: {session_id}")
            return False
        total = len(documents)
        processed = 0
        logger.info(f"Starting processing for session {session_id}: {total} documents")
        # Check OCR service availability (Vision-LLM preferred for handwriting)
        vision_ocr_available = await self.vision_ocr.is_available()
        trocr_available = await self.trocr.is_available()
        if vision_ocr_available and self.prefer_vision_ocr:
            logger.info("Using Vision-LLM (llama3.2-vision) for OCR - optimal for handwriting")
            use_vision_ocr = True
        elif trocr_available:
            logger.info("Using TrOCR for OCR")
            use_vision_ocr = False
        elif vision_ocr_available:
            logger.info("TrOCR not available, falling back to Vision-LLM")
            use_vision_ocr = True
        else:
            logger.warning("No OCR service available - OCR will be skipped")
            use_vision_ocr = False
            # NOTE(review): trocr_available is already False on this branch;
            # this assignment is redundant but harmless.
            trocr_available = False
        # Process each document
        for doc in documents:
            # Only a truncated doc_token is exposed in progress updates.
            progress = ProcessingProgress(
                session_id=session_id,
                total_documents=total,
                processed_documents=processed,
                current_document=doc.doc_token[:8],
                current_step="ocr"
            )
            self._notify_progress(progress)
            try:
                # Step 1: OCR extraction (Vision-LLM or TrOCR); only for
                # documents still in the freshly-uploaded state.
                if (vision_ocr_available or trocr_available) and doc.status == DocumentStatus.UPLOADED:
                    await self._process_ocr(session_id, doc, teacher_id, use_vision_ocr=use_vision_ocr)
                # Step 2: AI correction (requires OCR text to exist)
                progress.current_step = "correction"
                self._notify_progress(progress)
                if use_ai_correction and doc.ocr_text:
                    await self._process_correction(session, doc, teacher_id)
                else:
                    # Just mark as completed without AI
                    self._mark_document_completed(doc, teacher_id)
                processed += 1
            except Exception as e:
                # One bad document must not abort the whole batch.
                logger.error(f"Failed to process document {doc.doc_token}: {e}")
                self._mark_document_failed(doc, str(e), teacher_id)
        # Update session status
        self.repo.update_session_status(session_id, teacher_id, SessionStatus.COMPLETED)
        # Final progress
        progress = ProcessingProgress(
            session_id=session_id,
            total_documents=total,
            processed_documents=processed,
            current_step="complete"
        )
        self._notify_progress(progress)
        logger.info(f"Completed processing session {session_id}: {processed}/{total} documents")
        return True
    async def _process_ocr(
        self,
        session_id: str,
        doc: PseudonymizedDocument,
        teacher_id: str,
        use_vision_ocr: bool = True
    ):
        """
        Run OCR on a document.

        OCR failures are recorded in the OCR text itself and the document
        still advances to OCR_COMPLETED so that later stages can run.

        Args:
            session_id: Session ID
            doc: Document to process
            teacher_id: Teacher ID
            use_vision_ocr: True to use Vision-LLM (llama3.2-vision), False for TrOCR
        """
        # Update status
        doc.status = DocumentStatus.OCR_PROCESSING
        doc.processing_started_at = datetime.utcnow()
        self.db.commit()
        # Try to get document from storage (check both redacted and original)
        image_data = None
        for is_redacted in [True, False]:  # Prefer redacted version (privacy)
            for ext in ["png", "jpg", "jpeg", "pdf"]:
                image_data = self.storage.get_document(
                    session_id, doc.doc_token, ext, is_redacted=is_redacted
                )
                if image_data:
                    logger.debug(f"Found document: {doc.doc_token[:8]}.{ext} (redacted={is_redacted})")
                    break
            if image_data:
                break
        if not image_data:
            logger.warning(f"No image found for document {doc.doc_token}")
            # Use placeholder OCR text for testing
            doc.ocr_text = "[Kein Bild gefunden - Manuelle Eingabe erforderlich]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED
            self.db.commit()
            return
        # Call OCR service (Vision-LLM or TrOCR)
        try:
            if use_vision_ocr:
                # Use Vision-LLM (llama3.2-vision) - better for handwriting
                result = await self.vision_ocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    is_handwriting=True  # Assume handwriting for exams
                )
                ocr_method = "Vision-LLM"
            else:
                # Use TrOCR
                result = await self.trocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    detect_lines=True
                )
                ocr_method = "TrOCR"
            # Confidence is stored as an integer percentage (0-100).
            doc.ocr_text = result.text
            doc.ocr_confidence = int(result.confidence * 100)
            doc.status = DocumentStatus.OCR_COMPLETED
            logger.info(
                f"OCR completed ({ocr_method}) for {doc.doc_token[:8]}: "
                f"{len(result.text)} chars, {result.confidence:.0%} confidence"
            )
        except Exception as e:
            logger.error(f"OCR failed for {doc.doc_token}: {e}")
            doc.ocr_text = f"[OCR Fehler: {str(e)[:100]}]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED  # Continue to AI anyway
        self.db.commit()
    async def _process_correction(
        self,
        session: ExamSession,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Run AI correction on a document; failures still mark it COMPLETED."""
        doc.status = DocumentStatus.AI_PROCESSING
        self.db.commit()
        # Build rubrics from session questions
        rubrics = self._build_rubrics(session)
        if not rubrics:
            # No rubrics defined - use simple scoring
            doc.ai_feedback = "Keine Bewertungskriterien definiert. Manuelle Korrektur empfohlen."
            doc.ai_score = None
            doc.ai_grade = None
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            self.db.commit()
            # Update session stats
            session.processed_count += 1
            self.db.commit()
            return
        try:
            # Run AI correction
            result = await self.correction.correct_exam(
                doc_token=doc.doc_token,
                ocr_text=doc.ocr_text,
                rubrics=rubrics,
                subject=session.subject or "Allgemein"
            )
            # Save results (per-question details go into a JSON column)
            doc.ai_feedback = result.overall_feedback
            doc.ai_score = result.total_score
            doc.ai_grade = result.grade
            doc.ai_details = {
                "max_score": result.max_score,
                "processing_time_ms": result.processing_time_ms,
                "questions": [
                    {
                        "number": q.question_number,
                        "points": q.points_awarded,
                        "max_points": q.max_points,
                        "feedback": q.feedback,
                        "strengths": q.strengths,
                        "improvements": q.improvements
                    }
                    for q in result.question_results
                ]
            }
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            logger.info(
                f"Correction completed for {doc.doc_token[:8]}: "
                f"{result.total_score}/{result.max_score} ({result.grade})"
            )
        except Exception as e:
            logger.error(f"AI correction failed for {doc.doc_token}: {e}")
            doc.ai_feedback = f"KI-Korrektur fehlgeschlagen: {str(e)[:200]}"
            doc.status = DocumentStatus.COMPLETED  # Mark complete anyway
            doc.processing_completed_at = datetime.utcnow()
        # Update session stats
        session.processed_count += 1
        self.db.commit()
    def _build_rubrics(self, session: ExamSession) -> List[QuestionRubric]:
        """Build QuestionRubric list from session questions (empty when none defined)."""
        rubrics = []
        if not session.questions:
            return rubrics
        for i, q in enumerate(session.questions):
            # Missing fields fall back to sensible defaults; a per-question
            # rubric overrides the session-wide one.
            rubric = QuestionRubric(
                question_number=q.get("number", i + 1),
                question_text=q.get("text", f"Frage {i + 1}"),
                max_points=q.get("points", 10),
                expected_answer=q.get("expected_answer", ""),
                grading_criteria=q.get("rubric", session.rubric or "")
            )
            rubrics.append(rubric)
        return rubrics
    def _mark_document_completed(
        self,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Mark document as completed without AI correction."""
        doc.status = DocumentStatus.COMPLETED
        doc.processing_completed_at = datetime.utcnow()
        if not doc.ai_feedback:
            doc.ai_feedback = "Verarbeitung abgeschlossen (ohne KI-Korrektur)"
        self.db.commit()
        # Update session stats
        if doc.session:
            doc.session.processed_count += 1
            self.db.commit()
    def _mark_document_failed(
        self,
        doc: PseudonymizedDocument,
        error: str,
        teacher_id: str
    ):
        """Mark document as failed, storing a truncated error message."""
        doc.status = DocumentStatus.FAILED
        doc.processing_error = error[:500]  # keep DB column within bounds
        doc.processing_completed_at = datetime.utcnow()
        self.db.commit()
# Background task function for FastAPI
async def process_session_background(
    session_id: str,
    teacher_id: str,
    db_url: str
):
    """
    Entry point for FastAPI background tasks.

    Opens a dedicated database session (background tasks must not reuse
    the request-scoped session) and runs the full processing pipeline,
    closing the session afterwards in all cases.
    """
    from ..database import SessionLocal
    db = SessionLocal()
    try:
        await ProcessingService(db).process_session(session_id, teacher_id)
    finally:
        db.close()
# Singleton for main service
# NOTE(review): this slot is never assigned anywhere in this file -
# get_processing_service always returns a fresh instance per DB session.
_processing_service: Optional[ProcessingService] = None
def get_processing_service(db: Session) -> ProcessingService:
    """Get processing service instance bound to the given DB session (new per call)."""
    return ProcessingService(db)

View File

@@ -0,0 +1,376 @@
"""
Pseudonymization Service for Klausurkorrektur.
Implements privacy-by-design principles:
- QR code generation with random doc_tokens
- Header redaction to remove personal data before OCR
- No student identity data leaves the teacher's device
DSGVO Art. 4 Nr. 5 Compliance:
The doc_token is a 128-bit random UUID that cannot be used to
identify a student without the encrypted identity map.
"""
import uuid
import io
import logging
from typing import List, Tuple, Optional
from dataclasses import dataclass
from PIL import Image, ImageDraw, ImageFont
logger = logging.getLogger(__name__)
# Optional imports (graceful fallback if not installed)
try:
import qrcode
HAS_QRCODE = True
except ImportError:
HAS_QRCODE = False
logger.warning("qrcode not installed - QR generation disabled")
try:
import cv2
import numpy as np
HAS_CV2 = True
except ImportError:
HAS_CV2 = False
logger.warning("opencv-python not installed - image processing disabled")
try:
from pyzbar.pyzbar import decode as pyzbar_decode
HAS_PYZBAR = True
except ImportError:
HAS_PYZBAR = False
logger.warning("pyzbar not installed - QR reading disabled")
@dataclass
class RedactionResult:
    """Result of header redaction."""
    redacted_image: bytes     # image bytes after redaction
    original_height: int      # image height before redaction (presumably pixels - confirm)
    redacted_height: int      # height after/of redaction - confirm against producer
    redaction_applied: bool   # False when redaction was skipped or unavailable
@dataclass
class QRDetectionResult:
    """Result of QR code detection."""
    doc_token: Optional[str]  # decoded token; None when no QR code was found
    confidence: float         # detection confidence (presumably 0..1 - confirm)
    bbox: Optional[Tuple[int, int, int, int]]  # x, y, width, height
class PseudonymizationService:
    """
    Service for document pseudonymization.
    PRIVACY GUARANTEES:
    1. doc_tokens are cryptographically random (UUID4)
    2. No deterministic relationship between token and student
    3. Header redaction removes visible personal data
    4. Identity mapping is encrypted client-side
    """
    # Default header height to redact (in pixels, assuming 300 DPI scan)
    DEFAULT_HEADER_HEIGHT = 300 # ~1 inch / 2.5cm
    @staticmethod
    def generate_doc_token() -> str:
        """
        Generate a cryptographically random document token.
        Uses UUID4 which provides 122 bits of randomness.
        This ensures no correlation between tokens is possible.
        """
        return str(uuid.uuid4())
    @staticmethod
    def generate_batch_tokens(count: int) -> List[str]:
        """Generate multiple unique doc_tokens."""
        return [PseudonymizationService.generate_doc_token() for _ in range(count)]
    def generate_qr_code(
        self,
        doc_token: str,
        size: int = 200,
        border: int = 2
    ) -> bytes:
        """
        Generate a QR code image for a doc_token.
        Args:
            doc_token: The pseudonymization token
            size: Size of the QR code in pixels
            border: Border size in QR modules
        Returns:
            PNG image as bytes
        Raises:
            RuntimeError: if the optional qrcode dependency is missing
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")
        qr = qrcode.QRCode(
            version=1,
            error_correction=qrcode.constants.ERROR_CORRECT_M,
            box_size=10,
            border=border,
        )
        qr.add_data(doc_token)
        qr.make(fit=True)
        img = qr.make_image(fill_color="black", back_color="white")
        # NOTE(review): assumes qrcode's PilImage forwards resize() to the
        # wrapped PIL image (resize then returns a plain PIL Image) — confirm
        # against the installed qrcode version.
        img = img.resize((size, size), Image.Resampling.LANCZOS)
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()
    def generate_qr_sheet(
        self,
        doc_tokens: List[str],
        page_size: Tuple[int, int] = (2480, 3508), # A4 at 300 DPI
        qr_size: int = 200,
        margin: int = 100,
        labels: Optional[List[str]] = None
    ) -> bytes:
        """
        Generate a printable sheet of QR codes.
        Tokens that do not fit into the page grid are dropped with a warning
        (no second page is generated).
        Args:
            doc_tokens: List of tokens to generate QR codes for
            page_size: Page dimensions (width, height) in pixels
            qr_size: Size of each QR code
            margin: Page margin
            labels: Optional labels (e.g., "Nr. 1", "Nr. 2") - NO student names!
        Returns:
            PNG image of the full sheet
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")
        width, height = page_size
        img = Image.new('RGB', (width, height), 'white')
        draw = ImageDraw.Draw(img)
        # Calculate grid: fixed-size cells with extra vertical room for labels.
        usable_width = width - 2 * margin
        usable_height = height - 2 * margin
        cell_width = qr_size + 50
        cell_height = qr_size + 80 # Extra space for label
        cols = usable_width // cell_width
        rows = usable_height // cell_height
        # Try to load a font (fallback to default)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
        except (IOError, OSError):
            font = ImageFont.load_default()
        # Generate QR codes, filling the grid row by row.
        for i, token in enumerate(doc_tokens):
            if i >= cols * rows:
                logger.warning(f"Sheet full, skipping {len(doc_tokens) - i} tokens")
                break
            row = i // cols
            col = i % cols
            x = margin + col * cell_width
            y = margin + row * cell_height
            # Generate QR code
            qr_bytes = self.generate_qr_code(token, qr_size)
            qr_img = Image.open(io.BytesIO(qr_bytes))
            img.paste(qr_img, (x, y))
            # Add label (number only, NO names)
            label = labels[i] if labels and i < len(labels) else f"Nr. {i + 1}"
            draw.text((x, y + qr_size + 5), label, fill="black", font=font)
            # Add truncated token for verification (first 8 chars only)
            token_short = token[:8] + "..."
            draw.text((x, y + qr_size + 25), token_short, fill="gray", font=font)
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()
    def detect_qr_code(self, image_bytes: bytes) -> QRDetectionResult:
        """
        Detect and decode QR code from an image.
        Payloads that do not parse as a UUID are skipped, so stray QR codes
        on a page cannot be mistaken for doc_tokens.
        Args:
            image_bytes: Image data (PNG, JPEG, etc.)
        Returns:
            QRDetectionResult with doc_token if found; an empty result when
            pyzbar is unavailable or nothing valid was decoded
        """
        if not HAS_PYZBAR:
            return QRDetectionResult(
                doc_token=None,
                confidence=0.0,
                bbox=None
            )
        try:
            img = Image.open(io.BytesIO(image_bytes))
            # Decode QR codes (pyzbar may return several symbols per image)
            decoded = pyzbar_decode(img)
            for obj in decoded:
                if obj.type == 'QRCODE':
                    token = obj.data.decode('utf-8')
                    # Validate it looks like a UUID
                    try:
                        uuid.UUID(token)
                        rect = obj.rect
                        return QRDetectionResult(
                            doc_token=token,
                            confidence=1.0,
                            bbox=(rect.left, rect.top, rect.width, rect.height)
                        )
                    except ValueError:
                        continue
            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)
        except Exception as e:
            logger.error(f"QR detection failed: {e}")
            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)
    def redact_header(
        self,
        image_bytes: bytes,
        header_height: Optional[int] = None,
        fill_color: Tuple[int, int, int] = (255, 255, 255)
    ) -> RedactionResult:
        """
        Redact the header area of a scanned exam page.
        This removes the area where student name/class/date typically appears.
        The redaction is permanent - no original data is preserved.
        On any failure the ORIGINAL bytes are returned with
        redaction_applied=False, so callers must check that flag.
        Args:
            image_bytes: Original scanned image
            header_height: Height in pixels to redact (None = auto-detect)
            fill_color: RGB color to fill redacted area (default: white)
        Returns:
            RedactionResult with redacted image
        """
        try:
            img = Image.open(io.BytesIO(image_bytes))
            width, height = img.size
            # Determine header height
            redact_height = header_height or self.DEFAULT_HEADER_HEIGHT
            # Create a copy and redact header
            # NOTE(review): fill_color is an RGB tuple; a grayscale ('L') scan
            # may reject it — confirm the expected input modes.
            redacted = img.copy()
            draw = ImageDraw.Draw(redacted)
            draw.rectangle([(0, 0), (width, redact_height)], fill=fill_color)
            # Save result
            buffer = io.BytesIO()
            redacted.save(buffer, format="PNG")
            return RedactionResult(
                redacted_image=buffer.getvalue(),
                original_height=height,
                redacted_height=redact_height,
                redaction_applied=True
            )
        except Exception as e:
            logger.error(f"Header redaction failed: {e}")
            return RedactionResult(
                redacted_image=image_bytes,
                original_height=0,
                redacted_height=0,
                redaction_applied=False
            )
    def smart_redact_header(
        self,
        image_bytes: bytes,
        preserve_qr: bool = True
    ) -> RedactionResult:
        """
        Smart header redaction that detects text regions.
        Uses OCR confidence to identify and redact only the header
        area containing personal data. Falls back to the fixed-height
        redact_header() when OpenCV is unavailable or anything fails.
        Args:
            image_bytes: Original scanned image
            preserve_qr: If True, don't redact QR code areas
        Returns:
            RedactionResult with intelligently redacted image
        """
        if not HAS_CV2:
            # Fallback to simple redaction
            return self.redact_header(image_bytes)
        try:
            # Convert to OpenCV format
            nparr = np.frombuffer(image_bytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            height, width = img.shape[:2]
            # Detect QR code position if present
            qr_result = self.detect_qr_code(image_bytes)
            # Calculate redaction area (top portion of page)
            # Typically header is in top 10-15% of page
            header_height = int(height * 0.12)
            # If QR code is in header area, adjust redaction
            if preserve_qr and qr_result.bbox:
                qr_x, qr_y, qr_w, qr_h = qr_result.bbox
                if qr_y < header_height:
                    # QR is in header - redact around it
                    # Create mask: 255 = redact, 0 = keep (QR area)
                    mask = np.ones((header_height, width), dtype=np.uint8) * 255
                    # Leave QR area unredacted
                    mask[max(0, qr_y):min(header_height, qr_y + qr_h),
                         max(0, qr_x):min(width, qr_x + qr_w)] = 0
                    # Apply white fill where mask is 255 (the slice is a view,
                    # so the assignment writes through into img)
                    img[:header_height][mask == 255] = [255, 255, 255]
                else:
                    # QR not in header - simple redaction
                    img[:header_height] = [255, 255, 255]
            else:
                # Simple header redaction
                img[:header_height] = [255, 255, 255]
            # Encode result
            _, buffer = cv2.imencode('.png', img)
            return RedactionResult(
                redacted_image=buffer.tobytes(),
                original_height=height,
                redacted_height=header_height,
                redaction_applied=True
            )
        except Exception as e:
            logger.error(f"Smart redaction failed: {e}")
            return self.redact_header(image_bytes)
# Module-level singleton holder, populated lazily on first access.
_pseudonymizer: Optional[PseudonymizationService] = None
def get_pseudonymizer() -> PseudonymizationService:
    """Return the shared PseudonymizationService, creating it on first use."""
    global _pseudonymizer
    if _pseudonymizer is not None:
        return _pseudonymizer
    _pseudonymizer = PseudonymizationService()
    return _pseudonymizer

View File

@@ -0,0 +1,502 @@
"""
Roster Parser Service - Klassenbuch und Schuelerlisten parsen.
Unterstuetzt:
- Klassenbuch-Fotos (OCR mit PaddleOCR)
- PDF-Schuelerlisten (SchILD, ASV, etc.)
- CSV-Dateien
- Manuelle Eingabe
Privacy-First:
- Alle Verarbeitung serverseitig (kein externer Upload)
- Daten bleiben im Lehrer-Namespace
"""
import re
import csv
import io
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Tuple
from difflib import SequenceMatcher
# Optionale Imports
try:
from services.file_processor import get_file_processor, ProcessingResult
HAS_OCR = True
except ImportError:
HAS_OCR = False
try:
import fitz # PyMuPDF
HAS_PDF = True
except ImportError:
HAS_PDF = False
@dataclass
class RosterEntry:
    """One entry in a student roster."""
    first_name: str
    last_name: str
    student_number: Optional[str] = None
    parent_email: Optional[str] = None
    parent_phone: Optional[str] = None
    birth_date: Optional[str] = None     # kept as the raw detected string (e.g. "01.02.2015")
    additional_data: Dict[str, str] = field(default_factory=dict)
@dataclass
class ParsedRoster:
    """Result of roster parsing."""
    entries: List[RosterEntry]
    source_type: str # klassenbuch, pdf, csv
    confidence: float            # 0.0 when nothing could be parsed
    warnings: List[str] = field(default_factory=list)
    raw_text: Optional[str] = None   # full extracted text, useful for debugging
@dataclass
class NameMatch:
    """Result of matching one detected name against the roster."""
    detected_name: str
    matched_entry: Optional[RosterEntry]   # None when no candidate cleared the threshold
    confidence: float
    match_type: str # exact, first_name, fuzzy, none
class RosterParser:
    """
    Parses class rosters from various sources.
    Example:
        parser = RosterParser()
        # Class-register (Klassenbuch) photo
        roster = parser.parse_klassenbuch_image(image_bytes)
        # PDF roster
        roster = parser.parse_pdf_roster(pdf_bytes)
        # Match names
        matches = parser.match_first_names(
            detected=["Max", "Anna", "Tim"],
            roster=roster.entries
        )
    """
    # Regex patterns for contact data
    EMAIL_PATTERN = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
    PHONE_PATTERN = re.compile(r'(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}')
    DATE_PATTERN = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.(\d{2,4})\b')
    # Common German first names (sample set; currently not referenced by the
    # parsing logic below — presumably intended for validation)
    COMMON_FIRST_NAMES = {
        'max', 'anna', 'tim', 'lena', 'paul', 'marie', 'felix', 'emma',
        'leon', 'sophia', 'lukas', 'mia', 'jonas', 'hannah', 'elias', 'emilia',
        'ben', 'lea', 'noah', 'lina', 'finn', 'amelie', 'luis', 'laura',
        'moritz', 'clara', 'henry', 'julia', 'julian', 'emily', 'david', 'johanna',
        'niklas', 'charlotte', 'simon', 'maja', 'alexander', 'sarah', 'jan', 'lisa',
        'tom', 'nele', 'luca', 'sophie', 'erik', 'alina', 'fabian', 'paula',
        'philipp', 'luisa', 'tobias', 'melina', 'vincent', 'lara', 'maximilian', 'elena'
    }
    def __init__(self):
        # OCR backend is optional; None disables image parsing gracefully.
        self.file_processor = get_file_processor() if HAS_OCR else None
    # =========================================================================
    # KLASSENBUCH PHOTO PARSING
    # =========================================================================
    def parse_klassenbuch_image(self, image_bytes: bytes) -> ParsedRoster:
        """
        Parse a class-register (Klassenbuch) photo via OCR.
        Args:
            image_bytes: image as bytes (PNG, JPG)
        Returns:
            ParsedRoster with extracted student data; empty with a warning
            when OCR is unavailable
        """
        if not HAS_OCR or not self.file_processor:
            return ParsedRoster(
                entries=[],
                source_type='klassenbuch',
                confidence=0.0,
                warnings=['OCR nicht verfuegbar (PaddleOCR nicht installiert)']
            )
        # Run OCR
        result: ProcessingResult = self.file_processor.process_file(
            image_bytes,
            filename='klassenbuch.png',
            processing_mode='ocr_handwriting'
        )
        # Split text into lines; very short lines are noise and skipped
        lines = result.text.split('\n')
        entries = []
        warnings = []
        for line in lines:
            line = line.strip()
            if not line or len(line) < 3:
                continue
            entry = self._parse_roster_line(line)
            if entry:
                entries.append(entry)
        return ParsedRoster(
            entries=entries,
            source_type='klassenbuch',
            confidence=result.confidence,
            warnings=warnings,
            raw_text=result.text
        )
    def _parse_roster_line(self, line: str) -> Optional[RosterEntry]:
        """Parse a single class-register line into a RosterEntry (or None)."""
        # Normalize whitespace
        line = re.sub(r'\s+', ' ', line).strip()
        # Strip a leading list number (e.g. "1. Max Mustermann")
        line = re.sub(r'^\d+[\.\)\s]+', '', line)
        # Extract email, then remove it from the line
        email_match = self.EMAIL_PATTERN.search(line)
        email = email_match.group() if email_match else None
        if email:
            line = line.replace(email, '')
        # Extract phone number
        phone_match = self.PHONE_PATTERN.search(line)
        phone = phone_match.group() if phone_match else None
        if phone:
            line = line.replace(phone, '')
        # Extract birth date (German dd.mm.yyyy format)
        date_match = self.DATE_PATTERN.search(line)
        birth_date = date_match.group() if date_match else None
        if birth_date:
            line = line.replace(birth_date, '')
        # Whatever remains is treated as the name
        line = re.sub(r'\s+', ' ', line).strip()
        if not line:
            return None
        first_name, last_name = self._parse_name(line)
        if not first_name:
            return None
        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone,
            birth_date=birth_date
        )
    def _parse_name(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Split a name into first and last name.
        Supported formats:
            - "Max Mustermann"
            - "Mustermann, Max"
            - "Max M."
            - "Max"
        Returns:
            (first_name, last_name) — either part may be None/empty
        """
        text = text.strip()
        if not text:
            return None, None
        # Format: "Lastname, Firstname"
        if ',' in text:
            parts = text.split(',', 1)
            last_name = parts[0].strip()
            first_name = parts[1].strip() if len(parts) > 1 else ''
            return first_name, last_name
        # Format: "Firstname Lastname" or just "Firstname"
        parts = text.split()
        if len(parts) == 1:
            return parts[0], None
        elif len(parts) == 2:
            return parts[0], parts[1]
        else:
            # First token is the first name, the rest is the last name
            return parts[0], ' '.join(parts[1:])
    # =========================================================================
    # PDF ROSTER PARSING
    # =========================================================================
    def parse_pdf_roster(self, pdf_bytes: bytes) -> ParsedRoster:
        """
        Parse a PDF student roster.
        Supports common school-administration exports:
        - SchILD-NRW
        - ASV (Bavaria)
        - Untis
        - Generic CSV-in-PDF
        Returns an empty roster with a warning if PyMuPDF is missing.
        """
        if not HAS_PDF:
            return ParsedRoster(
                entries=[],
                source_type='pdf',
                confidence=0.0,
                warnings=['PDF-Parsing nicht verfuegbar (PyMuPDF nicht installiert)']
            )
        entries = []
        warnings = []
        raw_text = ''
        try:
            doc = fitz.open(stream=pdf_bytes, filetype='pdf')
            for page in doc:
                text = page.get_text()
                raw_text += text + '\n'
                # Extract tables
                # NOTE(review): find_tables() returns a TableFinder object;
                # verify that `not tables` below is actually falsy when tables
                # exist (i.e. that it defines __len__/__bool__), otherwise the
                # line-wise fallback may never/always trigger.
                tables = page.find_tables()
                for table in tables:
                    df = table.to_pandas()
                    for _, row in df.iterrows():
                        entry = self._parse_table_row(row.to_dict())
                        if entry:
                            entries.append(entry)
                # No tables found: fall back to line-wise parsing
                if not tables:
                    for line in text.split('\n'):
                        entry = self._parse_roster_line(line)
                        if entry:
                            entries.append(entry)
            doc.close()
        except Exception as e:
            warnings.append(f'PDF-Parsing Fehler: {str(e)}')
        # Remove duplicates (same first+last name across pages/tables)
        entries = self._deduplicate_entries(entries)
        return ParsedRoster(
            entries=entries,
            source_type='pdf',
            confidence=0.9 if entries else 0.0,
            warnings=warnings,
            raw_text=raw_text
        )
    def _parse_table_row(self, row: Dict) -> Optional[RosterEntry]:
        """Parse one table row (column-name keyed dict) into a RosterEntry."""
        # Column mappings (various export formats)
        name_columns = ['name', 'schueler', 'schüler', 'student', 'nachname', 'last_name']
        first_name_columns = ['vorname', 'first_name', 'firstname']
        email_columns = ['email', 'e-mail', 'mail', 'eltern_email', 'parent_email']
        phone_columns = ['telefon', 'phone', 'tel', 'handy', 'mobile', 'eltern_tel']
        first_name = None
        last_name = None
        email = None
        phone = None
        for key, value in row.items():
            if not value or str(value).strip() == '':
                continue
            key_lower = str(key).lower()
            value_str = str(value).strip()
            # first_name columns are checked before the generic name columns on
            # purpose: 'vorname' also contains the substring 'name'
            if any(col in key_lower for col in first_name_columns):
                first_name = value_str
            elif any(col in key_lower for col in name_columns):
                # May be "Firstname Lastname" or just "Lastname"
                if first_name:
                    last_name = value_str
                else:
                    first_name, last_name = self._parse_name(value_str)
            elif any(col in key_lower for col in email_columns):
                if self.EMAIL_PATTERN.match(value_str):
                    email = value_str
            elif any(col in key_lower for col in phone_columns):
                phone = value_str
        if not first_name:
            return None
        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone
        )
    # =========================================================================
    # CSV PARSING
    # =========================================================================
    def parse_csv_roster(self, csv_content: str) -> ParsedRoster:
        """
        Parse a CSV student roster.
        Args:
            csv_content: CSV as a string
        Returns:
            ParsedRoster
        """
        entries = []
        warnings = []
        try:
            # Sniff the delimiter from the first KB
            dialect = csv.Sniffer().sniff(csv_content[:1024])
            reader = csv.DictReader(io.StringIO(csv_content), dialect=dialect)
            for row in reader:
                entry = self._parse_table_row(row)
                if entry:
                    entries.append(entry)
        except csv.Error as e:
            warnings.append(f'CSV-Parsing Fehler: {str(e)}')
            # Fallback: parse line by line with the free-text heuristics
            for line in csv_content.split('\n'):
                entry = self._parse_roster_line(line)
                if entry:
                    entries.append(entry)
        return ParsedRoster(
            entries=entries,
            source_type='csv',
            confidence=0.95 if entries else 0.0,
            warnings=warnings,
            raw_text=csv_content
        )
    # =========================================================================
    # NAME MATCHING
    # =========================================================================
    def match_first_names(
        self,
        detected: List[str],
        roster: List[RosterEntry],
        threshold: float = 0.7
    ) -> List[NameMatch]:
        """
        Match detected first names to roster entries.
        Each roster entry is used at most once (greedy, in detection order).
        Args:
            detected: list of detected first names (e.g. ["Max", "Anna"])
            roster: the full student roster
            threshold: minimum confidence for fuzzy matching
        Returns:
            one NameMatch per detected name (match_type 'none' when unmatched)
        """
        matches = []
        used_entries = set()
        for name in detected:
            name_lower = name.lower().strip()
            best_match = None
            best_confidence = 0.0
            match_type = 'none'
            for i, entry in enumerate(roster):
                if i in used_entries:
                    continue
                entry_first_lower = entry.first_name.lower().strip()
                # Exact match wins immediately
                if name_lower == entry_first_lower:
                    best_match = entry
                    best_confidence = 1.0
                    match_type = 'exact'
                    used_entries.add(i)
                    break
                # Prefix match (e.g. "Max" matches "Maximilian");
                # confidence is the length ratio of the two names
                if entry_first_lower.startswith(name_lower) or name_lower.startswith(entry_first_lower):
                    confidence = min(len(name_lower), len(entry_first_lower)) / max(len(name_lower), len(entry_first_lower))
                    if confidence > best_confidence and confidence >= threshold:
                        best_match = entry
                        best_confidence = confidence
                        match_type = 'first_name'
                # Fuzzy match via SequenceMatcher ratio
                ratio = SequenceMatcher(None, name_lower, entry_first_lower).ratio()
                if ratio > best_confidence and ratio >= threshold:
                    best_match = entry
                    best_confidence = ratio
                    match_type = 'fuzzy'
            if best_match and match_type != 'exact':
                # Mark the chosen entry as used (exact matches did this above)
                for i, entry in enumerate(roster):
                    if entry is best_match:
                        used_entries.add(i)
                        break
            matches.append(NameMatch(
                detected_name=name,
                matched_entry=best_match,
                confidence=best_confidence,
                match_type=match_type
            ))
        return matches
    # =========================================================================
    # HELPERS
    # =========================================================================
    def _deduplicate_entries(self, entries: List[RosterEntry]) -> List[RosterEntry]:
        """Remove duplicates keyed on (first_name, last_name), case-insensitive."""
        seen = set()
        unique = []
        for entry in entries:
            key = (entry.first_name.lower(), entry.last_name.lower())
            if key not in seen:
                seen.add(key)
                unique.append(entry)
        return unique
    def validate_entry(self, entry: RosterEntry) -> List[str]:
        """Validate a RosterEntry and return a list of warning strings (German)."""
        warnings = []
        # Check first name
        if not entry.first_name:
            warnings.append('Kein Vorname')
        elif len(entry.first_name) < 2:
            warnings.append('Vorname zu kurz')
        # Validate email
        if entry.parent_email and not self.EMAIL_PATTERN.match(entry.parent_email):
            warnings.append('Ungueltige Email-Adresse')
        return warnings
# Module-level singleton holder, created lazily.
_roster_parser: Optional[RosterParser] = None
def get_roster_parser() -> RosterParser:
    """Return the process-wide RosterParser, instantiating it on first call."""
    global _roster_parser
    if _roster_parser is not None:
        return _roster_parser
    _roster_parser = RosterParser()
    return _roster_parser

View File

@@ -0,0 +1,613 @@
"""
School Resolver Service - Schul-Auswahl und Klassen-Erstellung.
Funktionen:
- Bundesland -> Schulform -> Schule Kaskade
- Auto-Erstellung von Klassen aus erkannten Daten
- Integration mit Go School Service (Port 8084)
Privacy:
- Schuldaten sind Stammdaten (kein DSGVO-Problem)
- Schueler-Erstellung nur im Lehrer-Namespace
"""
import httpx
import os
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
from enum import Enum
# ============================================================================
# CONSTANTS
# ============================================================================
# German federal states, keyed by the official two-letter code.
BUNDESLAENDER = {
    "BW": "Baden-Wuerttemberg",
    "BY": "Bayern",
    "BE": "Berlin",
    "BB": "Brandenburg",
    "HB": "Bremen",
    "HH": "Hamburg",
    "HE": "Hessen",
    "MV": "Mecklenburg-Vorpommern",
    "NI": "Niedersachsen",
    "NW": "Nordrhein-Westfalen",
    "RP": "Rheinland-Pfalz",
    "SL": "Saarland",
    "SN": "Sachsen",
    "ST": "Sachsen-Anhalt",
    "SH": "Schleswig-Holstein",
    "TH": "Thueringen"
}
# School types with display name, the grade levels they cover, and a short code.
SCHULFORMEN = {
    "grundschule": {
        "name": "Grundschule",
        "grades": [1, 2, 3, 4],
        "short": "GS"
    },
    "hauptschule": {
        "name": "Hauptschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "HS"
    },
    "realschule": {
        "name": "Realschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "RS"
    },
    "gymnasium": {
        "name": "Gymnasium",
        "grades": [5, 6, 7, 8, 9, 10, 11, 12, 13],
        "short": "GYM"
    },
    "gesamtschule": {
        "name": "Gesamtschule",
        "grades": [5, 6, 7, 8, 9, 10, 11, 12, 13],
        "short": "IGS"
    },
    "oberschule": {
        "name": "Oberschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "OBS"
    },
    "sekundarschule": {
        "name": "Sekundarschule",
        "grades": [5, 6, 7, 8, 9, 10],
        "short": "SEK"
    },
    "foerderschule": {
        "name": "Foerderschule",
        "grades": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "short": "FS"
    },
    "berufsschule": {
        "name": "Berufsschule",
        "grades": [10, 11, 12, 13],
        "short": "BS"
    },
    "fachoberschule": {
        "name": "Fachoberschule",
        "grades": [11, 12, 13],
        "short": "FOS"
    }
}
# Subjects with their standard German names and abbreviations
FAECHER = {
    "mathematik": {"name": "Mathematik", "short": "Ma"},
    "deutsch": {"name": "Deutsch", "short": "De"},
    "englisch": {"name": "Englisch", "short": "En"},
    "franzoesisch": {"name": "Franzoesisch", "short": "Fr"},
    "spanisch": {"name": "Spanisch", "short": "Sp"},
    "latein": {"name": "Latein", "short": "La"},
    "physik": {"name": "Physik", "short": "Ph"},
    "chemie": {"name": "Chemie", "short": "Ch"},
    "biologie": {"name": "Biologie", "short": "Bio"},
    "geschichte": {"name": "Geschichte", "short": "Ge"},
    "erdkunde": {"name": "Erdkunde", "short": "Ek"},
    "politik": {"name": "Politik", "short": "Po"},
    "wirtschaft": {"name": "Wirtschaft", "short": "Wi"},
    "kunst": {"name": "Kunst", "short": "Ku"},
    "musik": {"name": "Musik", "short": "Mu"},
    "sport": {"name": "Sport", "short": "Sp"},
    "religion": {"name": "Religion", "short": "Re"},
    "ethik": {"name": "Ethik", "short": "Et"},
    "informatik": {"name": "Informatik", "short": "If"},
    "sachunterricht": {"name": "Sachunterricht", "short": "SU"}
}
# ============================================================================
# DATA CLASSES
# ============================================================================
@dataclass
class School:
    """School master data (no personal data)."""
    id: str
    name: str
    bundesland: str    # federal-state code, e.g. "NI"
    schulform: str     # school-type key, e.g. "grundschule"
    address: Optional[str] = None
    city: Optional[str] = None
@dataclass
class SchoolClass:
    """A school class."""
    id: str
    school_id: str
    name: str # e.g. "3a"
    grade_level: int # e.g. 3
    school_year: str # e.g. "2025/2026"
    teacher_id: str
    student_count: int = 0
@dataclass
class Student:
    """Student (master data; no PII in the exam-correction context)."""
    id: str
    class_id: str
    first_name: str
    last_name: str
    student_number: Optional[str] = None
@dataclass
class DetectedClassInfo:
    """Class information detected from scanned exams."""
    class_name: str # e.g. "3a"
    grade_level: Optional[int] = None # e.g. 3
    subject: Optional[str] = None
    date: Optional[str] = None
    students: List[Dict[str, str]] = field(default_factory=list)   # dicts with firstName/lastName keys
    confidence: float = 0.0
@dataclass
class SchoolContext:
    """Complete school context for one teacher."""
    teacher_id: str
    school: Optional[School] = None
    classes: List[SchoolClass] = field(default_factory=list)
    current_school_year: str = "2025/2026"
# ============================================================================
# SCHOOL RESOLVER
# ============================================================================
class SchoolResolver:
    """
    Manages school and class context.
    Talks to the Go school service over HTTP; on any service failure it falls
    back to local in-memory objects so the teacher workflow keeps working.
    Example:
        resolver = SchoolResolver()
        # School cascade
        schools = await resolver.search_schools("Niedersachsen", "Grundschule", "Jever")
        # Auto-create a class
        class_obj = await resolver.auto_create_class(
            teacher_id="teacher-123",
            school_id="school-456",
            detected_info=DetectedClassInfo(
                class_name="3a",
                students=[{"firstName": "Max"}, {"firstName": "Anna"}]
            )
        )
    """
    def __init__(self):
        self.school_service_url = os.getenv(
            "SCHOOL_SERVICE_URL",
            "http://school-service:8084"
        )
        # Fallback to local data when the service is unreachable
        # (in-memory only; lost on restart)
        self._local_schools: Dict[str, School] = {}
        self._local_classes: Dict[str, SchoolClass] = {}
    # =========================================================================
    # BUNDESLAND / SCHULFORM LOOKUP
    # =========================================================================
    def get_bundeslaender(self) -> Dict[str, str]:
        """Return all German federal states (code -> name)."""
        return BUNDESLAENDER
    def get_schulformen(self) -> Dict[str, Dict]:
        """Return all school types."""
        return SCHULFORMEN
    def get_faecher(self) -> Dict[str, Dict]:
        """Return all subjects."""
        return FAECHER
    def get_grades_for_schulform(self, schulform: str) -> List[int]:
        """Return the grade levels for a school type (all levels if unknown)."""
        if schulform in SCHULFORMEN:
            return SCHULFORMEN[schulform]["grades"]
        return list(range(1, 14)) # Default: all grade levels
    def detect_grade_from_class_name(self, class_name: str) -> Optional[int]:
        """
        Detect the grade level from a class name.
        Examples:
            - "3a" -> 3
            - "10b" -> 10
            - "Q1" -> 11
            - "EF" -> 10
        Returns None when the name matches no known format.
        """
        import re
        # Standard format: number + optional letter
        match = re.match(r'^(\d{1,2})[a-zA-Z]?$', class_name)
        if match:
            return int(match.group(1))
        # Upper-secondary formats (EF/Q1/Q2 etc.)
        upper_grades = {
            'ef': 10, 'e': 10,
            'q1': 11, 'q2': 12,
            'k1': 11, 'k2': 12,
            '11': 11, '12': 12, '13': 13
        }
        class_lower = class_name.lower()
        if class_lower in upper_grades:
            return upper_grades[class_lower]
        return None
    def normalize_subject(self, detected_subject: str) -> Optional[str]:
        """
        Normalize a detected subject name to a FAECHER key.
        Example: "Mathe" -> "mathematik"; returns None when unrecognized.
        """
        subject_lower = detected_subject.lower().strip()
        # Direct matches
        if subject_lower in FAECHER:
            return subject_lower
        # Abbreviations and variants
        subject_aliases = {
            'mathe': 'mathematik',
            'bio': 'biologie',
            'phy': 'physik',
            'che': 'chemie',
            'geo': 'erdkunde',
            'geographie': 'erdkunde',
            'powi': 'politik',
            'sowi': 'politik',
            'reli': 'religion',
            'info': 'informatik',
            'su': 'sachunterricht'
        }
        if subject_lower in subject_aliases:
            return subject_aliases[subject_lower]
        # Substring match (either direction, first hit wins)
        for key in FAECHER:
            if key.startswith(subject_lower) or subject_lower.startswith(key[:3]):
                return key
        return None
    # =========================================================================
    # SCHOOL SERVICE INTEGRATION
    # =========================================================================
    async def search_schools(
        self,
        bundesland: Optional[str] = None,
        schulform: Optional[str] = None,
        name_query: Optional[str] = None,
        limit: int = 20
    ) -> List[School]:
        """
        Search schools in the school service.
        Returns an empty list on any service error (school can then be
        created manually).
        Args:
            bundesland: federal-state code (e.g. "NI")
            schulform: school-type key (e.g. "grundschule")
            name_query: search term for the school name
            limit: max number of results
        """
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                params = {}
                if bundesland:
                    params['state'] = bundesland
                if schulform:
                    params['type'] = schulform
                if name_query:
                    params['q'] = name_query
                params['limit'] = limit
                response = await client.get(
                    f"{self.school_service_url}/api/schools",
                    params=params
                )
                if response.status_code == 200:
                    data = response.json()
                    return [
                        School(
                            id=s['id'],
                            name=s['name'],
                            bundesland=s.get('state', bundesland or ''),
                            schulform=s.get('type', schulform or ''),
                            address=s.get('address'),
                            city=s.get('city')
                        )
                        for s in data.get('schools', [])
                    ]
        except Exception as e:
            # NOTE(review): consider logging instead of print for consistency
            # with the other services.
            print(f"[SchoolResolver] Service error: {e}")
        # Fallback: empty list (school can be created manually)
        return []
    async def get_or_create_school(
        self,
        teacher_id: str,
        bundesland: str,
        schulform: str,
        school_name: str,
        city: Optional[str] = None
    ) -> School:
        """
        Get or create a school.
        If a matching school exists it is returned; otherwise it is created
        via the service, or locally in memory when the service fails.
        """
        # Search first
        existing = await self.search_schools(
            bundesland=bundesland,
            schulform=schulform,
            name_query=school_name,
            limit=1
        )
        if existing:
            return existing[0]
        # Create a new one via the service
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/schools",
                    json={
                        "name": school_name,
                        "state": bundesland,
                        "type": schulform,
                        "city": city,
                        "created_by": teacher_id
                    }
                )
                if response.status_code in (200, 201):
                    data = response.json()
                    return School(
                        id=data['id'],
                        name=school_name,
                        bundesland=bundesland,
                        schulform=schulform,
                        city=city
                    )
        except Exception as e:
            print(f"[SchoolResolver] Create school error: {e}")
        # Fallback: create a local in-memory school
        import uuid
        school_id = str(uuid.uuid4())
        school = School(
            id=school_id,
            name=school_name,
            bundesland=bundesland,
            schulform=schulform,
            city=city
        )
        self._local_schools[school_id] = school
        return school
    # =========================================================================
    # CLASS MANAGEMENT
    # =========================================================================
    async def get_classes_for_teacher(
        self,
        teacher_id: str,
        school_id: Optional[str] = None
    ) -> List[SchoolClass]:
        """Get all classes of a teacher (local fallback on service error)."""
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                params = {"teacher_id": teacher_id}
                if school_id:
                    params["school_id"] = school_id
                response = await client.get(
                    f"{self.school_service_url}/api/classes",
                    params=params
                )
                if response.status_code == 200:
                    data = response.json()
                    return [
                        SchoolClass(
                            id=c['id'],
                            school_id=c.get('school_id', ''),
                            name=c['name'],
                            grade_level=c.get('grade_level', 0),
                            school_year=c.get('school_year', '2025/2026'),
                            teacher_id=teacher_id,
                            student_count=c.get('student_count', 0)
                        )
                        for c in data.get('classes', [])
                    ]
        except Exception as e:
            print(f"[SchoolResolver] Get classes error: {e}")
        # Fallback returns ALL locally created classes (not filtered by teacher)
        return list(self._local_classes.values())
    async def auto_create_class(
        self,
        teacher_id: str,
        school_id: str,
        detected_info: DetectedClassInfo,
        school_year: str = "2025/2026"
    ) -> SchoolClass:
        """
        Automatically create a class from detected data.
        Students are bulk-created in the service when available; on service
        failure the class exists only in local memory.
        Args:
            teacher_id: teacher ID
            school_id: school ID
            detected_info: information detected from exams
            school_year: school year
        """
        grade_level = detected_info.grade_level or self.detect_grade_from_class_name(
            detected_info.class_name
        ) or 0
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/classes",
                    json={
                        "school_id": school_id,
                        "name": detected_info.class_name,
                        "grade_level": grade_level,
                        "school_year": school_year,
                        "teacher_id": teacher_id
                    }
                )
                if response.status_code in (200, 201):
                    data = response.json()
                    class_id = data['id']
                    # Add students
                    if detected_info.students:
                        await self._bulk_create_students(
                            class_id,
                            detected_info.students
                        )
                    return SchoolClass(
                        id=class_id,
                        school_id=school_id,
                        name=detected_info.class_name,
                        grade_level=grade_level,
                        school_year=school_year,
                        teacher_id=teacher_id,
                        student_count=len(detected_info.students)
                    )
        except Exception as e:
            print(f"[SchoolResolver] Create class error: {e}")
        # Fallback: local in-memory class (students are NOT stored locally)
        import uuid
        class_id = str(uuid.uuid4())
        school_class = SchoolClass(
            id=class_id,
            school_id=school_id,
            name=detected_info.class_name,
            grade_level=grade_level,
            school_year=school_year,
            teacher_id=teacher_id,
            student_count=len(detected_info.students)
        )
        self._local_classes[class_id] = school_class
        return school_class
    async def _bulk_create_students(
        self,
        class_id: str,
        students: List[Dict[str, str]]
    ) -> List[Student]:
        """Create several students in one service call; [] on failure."""
        created = []
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.post(
                    f"{self.school_service_url}/api/classes/{class_id}/students/bulk",
                    json={
                        "students": [
                            {
                                # Accept both camelCase and snake_case input keys
                                "first_name": s.get("firstName", s.get("first_name", "")),
                                "last_name": s.get("lastName", s.get("last_name", ""))
                            }
                            for s in students
                        ]
                    }
                )
                if response.status_code in (200, 201):
                    data = response.json()
                    created = [
                        Student(
                            id=s['id'],
                            class_id=class_id,
                            first_name=s['first_name'],
                            last_name=s.get('last_name', '')
                        )
                        for s in data.get('students', [])
                    ]
        except Exception as e:
            print(f"[SchoolResolver] Bulk create students error: {e}")
        return created
    # =========================================================================
    # CONTEXT MANAGEMENT
    # =========================================================================
    async def get_teacher_context(self, teacher_id: str) -> SchoolContext:
        """
        Get the full school context for a teacher.
        Includes school, classes, and the current school year. The school is
        derived from the first class by scanning an unfiltered school search.
        """
        context = SchoolContext(teacher_id=teacher_id)
        # Load classes
        classes = await self.get_classes_for_teacher(teacher_id)
        context.classes = classes
        # Derive the school from the first class
        if classes and classes[0].school_id:
            schools = await self.search_schools()
            for school in schools:
                if school.id == classes[0].school_id:
                    context.school = school
                    break
        return context
# Module-level singleton; created lazily on first access.
_school_resolver: Optional[SchoolResolver] = None
def get_school_resolver() -> SchoolResolver:
    """Return the process-wide SchoolResolver, creating it on first use."""
    global _school_resolver
    if _school_resolver is not None:
        return _school_resolver
    _school_resolver = SchoolResolver()
    return _school_resolver

View File

@@ -0,0 +1,197 @@
"""
Storage Service for Klausur Documents.
PRIVACY BY DESIGN:
- Documents stored with doc_token as identifier (not student names)
- Organized by session_id/doc_token for teacher isolation
- Auto-cleanup when retention period expires
"""
import os
import io
import logging
from typing import Optional, BinaryIO
from pathlib import Path
from minio import Minio
from minio.error import S3Error
logger = logging.getLogger(__name__)
class KlausurStorageService:
    """
    MinIO/S3 storage service for exam documents.

    Bucket layout:
        klausur-exams/
            {session_id}/
                {doc_token}.{ext}
                {doc_token}_redacted.{ext}   # after header redaction

    Privacy by design: objects are addressed by pseudonymous doc tokens
    (never student names) and grouped per session for teacher isolation.
    """

    # Supported extensions and their MIME types; anything else is stored
    # as a generic binary stream.
    _CONTENT_TYPES = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "pdf": "application/pdf",
    }

    def __init__(self):
        # Connection settings come from the environment so the same code
        # runs against dev and production MinIO instances.
        self.endpoint = os.getenv("MINIO_ENDPOINT", "minio:9000")
        self.access_key = os.getenv("MINIO_ROOT_USER", "breakpilot_dev")
        self.secret_key = os.getenv("MINIO_ROOT_PASSWORD", "breakpilot_dev_123")
        self.secure = os.getenv("MINIO_SECURE", "false").lower() == "true"
        self.bucket_name = os.getenv("KLAUSUR_BUCKET", "klausur-exams")
        self._client: Optional[Minio] = None

    @staticmethod
    def _object_key(
        session_id: str,
        doc_token: str,
        file_extension: str,
        is_redacted: bool
    ) -> str:
        """Build the storage path for a (session, token) pair."""
        suffix = "_redacted" if is_redacted else ""
        return f"{session_id}/{doc_token}{suffix}.{file_extension}"

    @property
    def client(self) -> Minio:
        """Lazily connect to MinIO; ensures the bucket exists on first use."""
        if self._client is None:
            self._client = Minio(
                self.endpoint,
                access_key=self.access_key,
                secret_key=self.secret_key,
                secure=self.secure
            )
            self._ensure_bucket()
        return self._client

    def _ensure_bucket(self):
        """Create the exam bucket if missing; a failed check is only logged."""
        try:
            if not self._client.bucket_exists(self.bucket_name):
                self._client.make_bucket(self.bucket_name)
                logger.info(f"Created Klausur bucket: {self.bucket_name}")
        except S3Error as e:
            logger.warning(f"MinIO bucket check failed: {e}")

    def upload_document(
        self,
        session_id: str,
        doc_token: str,
        file_data: bytes,
        file_extension: str = "png",
        is_redacted: bool = False
    ) -> str:
        """
        Upload exam document to storage.

        Args:
            session_id: Exam session ID
            doc_token: Pseudonymized document token
            file_data: Document binary data
            file_extension: File extension (png, jpg, pdf)
            is_redacted: Whether this is the redacted version

        Returns:
            Object path in storage

        Raises:
            S3Error: If the upload fails.
        """
        object_name = self._object_key(session_id, doc_token, file_extension, is_redacted)
        content_type = self._CONTENT_TYPES.get(
            file_extension.lower(), "application/octet-stream"
        )
        try:
            self.client.put_object(
                bucket_name=self.bucket_name,
                object_name=object_name,
                data=io.BytesIO(file_data),
                length=len(file_data),
                content_type=content_type
            )
        except S3Error as e:
            logger.error(f"Failed to upload document: {e}")
            raise
        logger.info(f"Uploaded document: {object_name}")
        return object_name

    def get_document(
        self,
        session_id: str,
        doc_token: str,
        file_extension: str = "png",
        is_redacted: bool = False
    ) -> Optional[bytes]:
        """
        Download exam document from storage.

        Args:
            session_id: Exam session ID
            doc_token: Pseudonymized document token
            file_extension: File extension
            is_redacted: Whether to get the redacted version

        Returns:
            Document binary data, or None when the object does not exist.

        Raises:
            S3Error: For storage errors other than a missing key.
        """
        object_name = self._object_key(session_id, doc_token, file_extension, is_redacted)
        try:
            response = self.client.get_object(self.bucket_name, object_name)
            try:
                return response.read()
            finally:
                response.close()
                response.release_conn()
        except S3Error as e:
            if e.code == "NoSuchKey":
                logger.warning(f"Document not found: {object_name}")
                return None
            logger.error(f"Failed to get document: {e}")
            raise

    def delete_session_documents(self, session_id: str) -> int:
        """
        Delete all documents for a session.

        Args:
            session_id: Exam session ID

        Returns:
            Number of deleted objects

        Raises:
            S3Error: If listing or deletion fails.
        """
        prefix = f"{session_id}/"
        deleted_count = 0
        try:
            for obj in self.client.list_objects(self.bucket_name, prefix=prefix):
                self.client.remove_object(self.bucket_name, obj.object_name)
                deleted_count += 1
                logger.debug(f"Deleted: {obj.object_name}")
            logger.info(f"Deleted {deleted_count} documents for session {session_id}")
        except S3Error as e:
            logger.error(f"Failed to delete session documents: {e}")
            raise
        return deleted_count

    def document_exists(
        self,
        session_id: str,
        doc_token: str,
        file_extension: str = "png"
    ) -> bool:
        """Check if the (non-redacted) document exists in storage."""
        object_name = self._object_key(session_id, doc_token, file_extension, False)
        try:
            self.client.stat_object(self.bucket_name, object_name)
        except S3Error:
            return False
        return True
# Module-level singleton; created lazily on first access.
_storage_service: Optional[KlausurStorageService] = None
def get_storage_service() -> KlausurStorageService:
    """Return the process-wide storage service, creating it on first use."""
    global _storage_service
    if _storage_service is not None:
        return _storage_service
    _storage_service = KlausurStorageService()
    return _storage_service

View File

@@ -0,0 +1,214 @@
"""
TrOCR Client - Connects to external TrOCR service (Mac Mini).
This client forwards OCR requests to the TrOCR service running on
the Mac Mini, enabling handwriting recognition without requiring
local GPU/ML dependencies.
Privacy: Images are sent over the local network only - no cloud.
"""
import os
import httpx
import logging
from typing import Optional, List, Dict
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# Base URL of the TrOCR service running on the Mac Mini. Overridable via
# the TROCR_SERVICE_URL env var; requests stay inside the local network.
TROCR_SERVICE_URL = os.environ.get(
    "TROCR_SERVICE_URL",
    "http://192.168.178.163:8084"
)
@dataclass
class OCRResult:
    """Result from TrOCR extraction."""
    text: str  # extracted text (may be empty on failure)
    confidence: float  # 0.0-1.0 confidence as reported by the service
    processing_time_ms: int  # processing time reported by the service
    device: str = "remote"  # backend device label; "remote" for this client
class TrOCRClient:
    """
    Async HTTP client for the external TrOCR service.

    Usage:
        client = TrOCRClient()
        # Check if service is available
        if await client.is_available():
            result = await client.extract_text(image_bytes)
            print(result.text)
    """

    def __init__(self, base_url: Optional[str] = None):
        self.base_url = base_url or TROCR_SERVICE_URL
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, (re)creating it when closed."""
        if self._client is None or self._client.is_closed:
            # Generous timeout: the remote side may still be loading the model.
            self._client = httpx.AsyncClient(
                base_url=self.base_url,
                timeout=300.0
            )
        return self._client

    async def close(self):
        """Release the underlying HTTP connection pool."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    async def is_available(self) -> bool:
        """Return True when the TrOCR service answers its health check."""
        try:
            http = await self._get_client()
            health = await http.get("/health", timeout=5.0)
            return health.status_code == 200
        except Exception as e:
            logger.warning(f"TrOCR service not available: {e}")
            return False

    async def get_status(self) -> Dict:
        """Fetch the remote service's status document (or an error dict)."""
        try:
            http = await self._get_client()
            response = await http.get("/api/v1/status")
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Failed to get TrOCR status: {e}")
            return {"status": "unavailable", "error": str(e)}

    async def extract_text(
        self,
        image_data: bytes,
        filename: str = "image.png",
        detect_lines: bool = True
    ) -> OCRResult:
        """
        Extract text from an image using TrOCR.

        Args:
            image_data: Raw image bytes
            filename: Original filename (forwarded as form metadata)
            detect_lines: Whether to detect individual lines

        Returns:
            OCRResult with extracted text

        Raises:
            httpx.TimeoutException: If the service does not answer in time.
            Exception: Any other transport or HTTP error.
        """
        try:
            http = await self._get_client()
            response = await http.post(
                "/api/v1/extract",
                files={"file": (filename, image_data, "image/png")},
                params={"detect_lines": str(detect_lines).lower()}
            )
            response.raise_for_status()
            payload = response.json()
            return OCRResult(
                text=payload.get("text", ""),
                confidence=payload.get("confidence", 0.0),
                processing_time_ms=payload.get("processing_time_ms", 0),
                device=payload.get("device", "remote")
            )
        except httpx.TimeoutException:
            logger.error("TrOCR request timed out (model may be loading)")
            raise
        except Exception as e:
            logger.error(f"TrOCR extraction failed: {e}")
            raise

    async def batch_extract(
        self,
        images: List[bytes],
        filenames: Optional[List[str]] = None,
        detect_lines: bool = True
    ) -> List[OCRResult]:
        """
        Extract text from multiple images in a single request.

        Args:
            images: List of image bytes
            filenames: Optional list of filenames (auto-generated if omitted)
            detect_lines: Accepted for API symmetry; note it is not
                forwarded to the batch endpoint by this implementation.

        Returns:
            List of OCRResult
        """
        if filenames is None:
            filenames = [f"image_{i}.png" for i in range(len(images))]
        try:
            http = await self._get_client()
            multipart = [
                ("files", (name, img, "image/png"))
                for name, img in zip(filenames, images)
            ]
            response = await http.post("/api/v1/batch-extract", files=multipart)
            response.raise_for_status()
            payload = response.json()
            # Batch responses carry no per-item timing; report 0 ms.
            return [
                OCRResult(
                    text=item.get("text", ""),
                    confidence=item.get("confidence", 0.85),
                    processing_time_ms=0,
                    device="remote"
                )
                for item in payload.get("results", [])
            ]
        except Exception as e:
            logger.error(f"TrOCR batch extraction failed: {e}")
            raise
# Module-level singleton; created lazily on first access.
_trocr_client: Optional[TrOCRClient] = None
def get_trocr_client() -> TrOCRClient:
    """Return the process-wide TrOCR client, creating it on first use."""
    global _trocr_client
    if _trocr_client is not None:
        return _trocr_client
    _trocr_client = TrOCRClient()
    return _trocr_client
async def extract_text_from_image(
    image_data: bytes,
    filename: str = "image.png"
) -> OCRResult:
    """
    Convenience wrapper: run OCR on one image via the shared client.

    Args:
        image_data: Raw image bytes
        filename: Original filename

    Returns:
        OCRResult with extracted text
    """
    return await get_trocr_client().extract_text(image_data, filename)

View File

@@ -0,0 +1,577 @@
"""
TrOCR Service for Handwriting Recognition.
Uses Microsoft's TrOCR model for extracting handwritten text from exam images.
Supports fine-tuning with teacher corrections via LoRA adapters.
PRIVACY BY DESIGN:
- All processing happens locally
- No data sent to external services
- Fine-tuning data stays on-premise
"""
import logging
import os
from pathlib import Path
from typing import Optional, List, Dict, Tuple
from dataclasses import dataclass
from io import BytesIO
import json
logger = logging.getLogger(__name__)
# Filesystem locations for cached model weights, LoRA adapters, and
# collected training examples; each overridable via environment variables.
MODEL_CACHE_DIR = Path(os.environ.get("TROCR_CACHE_DIR", "/app/models/trocr"))
LORA_ADAPTERS_DIR = Path(os.environ.get("TROCR_LORA_DIR", "/app/models/trocr/lora"))
TRAINING_DATA_DIR = Path(os.environ.get("TROCR_TRAINING_DIR", "/app/data/trocr_training"))
@dataclass
class OCRResult:
    """Result from TrOCR extraction."""
    text: str  # extracted text; lines joined with "\n" when line detection ran
    confidence: float  # 0.0-1.0 estimate derived from generation scores
    bounding_boxes: List[Dict]  # [{"x": 0, "y": 0, "w": 100, "h": 20, "text": "..."}]
    processing_time_ms: int  # wall-clock extraction time
@dataclass
class TrainingExample:
    """A single training example for fine-tuning."""
    image_path: str  # path of the saved example image on disk
    ground_truth: str  # teacher-provided correct transcription
    teacher_id: str  # ID of the teacher who supplied the correction
    created_at: str  # ISO-format UTC timestamp
class TrOCRService:
    """
    Handwriting recognition service using TrOCR.

    Features:
    - Line-by-line handwriting extraction
    - Confidence scoring
    - LoRA fine-tuning support
    - Batch processing

    The model, processor, and optional LoRA adapter are lazy-loaded on
    first use; all processing and training data stay on-premise.
    """
    # Available models (from smallest to largest)
    MODELS = {
        "trocr-small": "microsoft/trocr-small-handwritten",
        "trocr-base": "microsoft/trocr-base-handwritten",  # Recommended
        "trocr-large": "microsoft/trocr-large-handwritten",
    }
    def __init__(self, model_name: str = "trocr-base", device: str = "auto"):
        """
        Initialize TrOCR service.

        Args:
            model_name: One of "trocr-small", "trocr-base", "trocr-large"
            device: "cpu", "cuda", "mps" (Apple Silicon), or "auto"
        """
        self.model_name = model_name
        # Unknown names silently fall back to the recommended base model.
        self.model_id = self.MODELS.get(model_name, self.MODELS["trocr-base"])
        self.device = self._get_device(device)
        # Lazy-loaded on first extraction (see _load_model).
        self._processor = None
        self._model = None
        self._lora_adapter = None
        # Create directories
        MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        LORA_ADAPTERS_DIR.mkdir(parents=True, exist_ok=True)
        TRAINING_DATA_DIR.mkdir(parents=True, exist_ok=True)
        logger.info(f"TrOCR Service initialized: model={model_name}, device={self.device}")
    def _get_device(self, device: str) -> str:
        """Determine the best device for inference (explicit choice wins)."""
        if device != "auto":
            return device
        try:
            import torch
            if torch.cuda.is_available():
                return "cuda"
            elif torch.backends.mps.is_available():
                return "mps"
            return "cpu"
        except ImportError:
            # torch missing: fall back to CPU so the service stays importable.
            return "cpu"
    def _load_model(self):
        """Lazy-load the TrOCR model (no-op when already loaded)."""
        if self._model is not None:
            return
        try:
            from transformers import TrOCRProcessor, VisionEncoderDecoderModel
            import torch
            logger.info(f"Loading TrOCR model: {self.model_id}")
            self._processor = TrOCRProcessor.from_pretrained(
                self.model_id,
                cache_dir=str(MODEL_CACHE_DIR)
            )
            self._model = VisionEncoderDecoderModel.from_pretrained(
                self.model_id,
                cache_dir=str(MODEL_CACHE_DIR)
            )
            # Move to device
            if self.device == "cuda":
                self._model = self._model.cuda()
            elif self.device == "mps":
                self._model = self._model.to("mps")
            # Load LoRA adapter if exists
            adapter_path = LORA_ADAPTERS_DIR / f"{self.model_name}_adapter"
            if adapter_path.exists():
                self._load_lora_adapter(adapter_path)
            logger.info(f"TrOCR model loaded successfully on {self.device}")
        except ImportError as e:
            logger.error(f"Missing dependencies: {e}")
            logger.error("Install with: pip install transformers torch pillow")
            raise
        except Exception as e:
            logger.error(f"Failed to load TrOCR model: {e}")
            raise
    def _load_lora_adapter(self, adapter_path: Path):
        """Load a LoRA adapter for fine-tuned model (best-effort)."""
        try:
            from peft import PeftModel
            logger.info(f"Loading LoRA adapter from {adapter_path}")
            self._model = PeftModel.from_pretrained(self._model, str(adapter_path))
            self._lora_adapter = str(adapter_path)
            logger.info("LoRA adapter loaded successfully")
        except ImportError:
            logger.warning("peft not installed, skipping LoRA adapter")
        except Exception as e:
            # Adapter failures are non-fatal: the base model still works.
            logger.warning(f"Failed to load LoRA adapter: {e}")
    async def extract_text(
        self,
        image_data: bytes,
        detect_lines: bool = True
    ) -> OCRResult:
        """
        Extract handwritten text from an image.

        Args:
            image_data: Raw image bytes (PNG, JPG, etc.)
            detect_lines: If True, detect and process individual lines

        Returns:
            OCRResult with extracted text and confidence; on failure an
            empty result (text="", confidence=0.0) is returned rather
            than raising.
        """
        import time
        start_time = time.time()
        self._load_model()
        try:
            from PIL import Image
            import torch
            # Load image
            image = Image.open(BytesIO(image_data)).convert("RGB")
            if detect_lines:
                # Detect text lines and process each
                lines, bboxes = await self._detect_and_extract_lines(image)
                text = "\n".join(lines)
                confidence = 0.85  # Average confidence estimate
            else:
                # Process whole image
                text, confidence = await self._extract_single(image)
                bboxes = []
            processing_time_ms = int((time.time() - start_time) * 1000)
            return OCRResult(
                text=text,
                confidence=confidence,
                bounding_boxes=bboxes,
                processing_time_ms=processing_time_ms
            )
        except Exception as e:
            logger.error(f"OCR extraction failed: {e}")
            return OCRResult(
                text="",
                confidence=0.0,
                bounding_boxes=[],
                processing_time_ms=int((time.time() - start_time) * 1000)
            )
    async def _extract_single(self, image) -> Tuple[str, float]:
        """Extract text from a single image (no line detection)."""
        import torch
        # Preprocess
        pixel_values = self._processor(
            images=image,
            return_tensors="pt"
        ).pixel_values
        if self.device == "cuda":
            pixel_values = pixel_values.cuda()
        elif self.device == "mps":
            pixel_values = pixel_values.to("mps")
        # Generate
        with torch.no_grad():
            generated_ids = self._model.generate(
                pixel_values,
                max_length=128,
                num_beams=4,
                return_dict_in_generate=True,
                output_scores=True
            )
        # Decode
        text = self._processor.batch_decode(
            generated_ids.sequences,
            skip_special_tokens=True
        )[0]
        # Estimate confidence from generation scores
        confidence = self._estimate_confidence(generated_ids)
        return text.strip(), confidence
    async def _detect_and_extract_lines(self, image) -> Tuple[List[str], List[Dict]]:
        """Detect text lines and extract each separately.

        Returns a tuple of (line texts, bounding boxes). Falls back to
        whole-image extraction (with no boxes) when no lines are found.
        """
        from PIL import Image
        import numpy as np
        # Convert to numpy for line detection
        img_array = np.array(image.convert("L"))  # Grayscale
        # Simple horizontal projection for line detection
        lines_y = self._detect_line_positions(img_array)
        if not lines_y:
            # Fallback: process whole image
            text, _ = await self._extract_single(image)
            return [text], []
        # Extract each line
        results = []
        bboxes = []
        width = image.width
        for i, (y_start, y_end) in enumerate(lines_y):
            # Crop line
            line_img = image.crop((0, y_start, width, y_end))
            # Ensure minimum height
            if line_img.height < 20:
                continue
            # Extract text
            text, conf = await self._extract_single(line_img)
            if text.strip():
                results.append(text)
                bboxes.append({
                    "x": 0,
                    "y": y_start,
                    "w": width,
                    "h": y_end - y_start,
                    "text": text,
                    "confidence": conf
                })
        return results, bboxes
    def _detect_line_positions(self, img_array) -> List[Tuple[int, int]]:
        """Detect horizontal text line positions using projection profile.

        Args:
            img_array: 2-D grayscale numpy array (0 = black, 255 = white).

        Returns:
            List of (start_row, end_row) pairs, padded by a few pixels.
        """
        import numpy as np
        # Horizontal projection (sum of pixels per row)
        projection = np.sum(255 - img_array, axis=1)
        # Threshold to find text rows
        threshold = np.max(projection) * 0.1
        text_rows = projection > threshold
        # Find line boundaries
        lines = []
        in_line = False
        line_start = 0
        for i, is_text in enumerate(text_rows):
            if is_text and not in_line:
                in_line = True
                line_start = max(0, i - 5)  # Add padding
            elif not is_text and in_line:
                in_line = False
                line_end = min(len(text_rows) - 1, i + 5)  # Add padding
                if line_end - line_start > 15:  # Minimum line height
                    lines.append((line_start, line_end))
        # Handle last line
        if in_line:
            lines.append((line_start, len(text_rows) - 1))
        return lines
    def _estimate_confidence(self, generated_output) -> float:
        """Estimate confidence from generation scores.

        Averages the max softmax probability of each generated token;
        returns a 0.75 default when scores are unavailable.
        """
        try:
            import torch
            if hasattr(generated_output, 'scores') and generated_output.scores:
                # Average probability of selected tokens
                probs = []
                for score in generated_output.scores:
                    prob = torch.softmax(score, dim=-1).max().item()
                    probs.append(prob)
                return sum(probs) / len(probs) if probs else 0.5
            return 0.75  # Default confidence
        except Exception:
            return 0.75
    async def batch_extract(
        self,
        images: List[bytes],
        detect_lines: bool = True
    ) -> List[OCRResult]:
        """
        Extract text from multiple images (processed sequentially).

        Args:
            images: List of image bytes
            detect_lines: If True, detect lines in each image

        Returns:
            List of OCRResult
        """
        results = []
        for img_data in images:
            result = await self.extract_text(img_data, detect_lines)
            results.append(result)
        return results
    # ==========================================
    # FINE-TUNING SUPPORT
    # ==========================================
    def add_training_example(
        self,
        image_data: bytes,
        ground_truth: str,
        teacher_id: str
    ) -> str:
        """
        Add a training example for fine-tuning.

        The image and a JSON metadata file are stored side by side in
        TRAINING_DATA_DIR under a fresh UUID.

        Args:
            image_data: Image bytes
            ground_truth: Correct text (teacher-provided)
            teacher_id: ID of the teacher providing correction

        Returns:
            Example ID
        """
        import uuid
        from datetime import datetime
        example_id = str(uuid.uuid4())
        # Save image
        image_path = TRAINING_DATA_DIR / f"{example_id}.png"
        with open(image_path, "wb") as f:
            f.write(image_data)
        # Save metadata
        example = TrainingExample(
            image_path=str(image_path),
            ground_truth=ground_truth,
            teacher_id=teacher_id,
            created_at=datetime.utcnow().isoformat()
        )
        meta_path = TRAINING_DATA_DIR / f"{example_id}.json"
        with open(meta_path, "w") as f:
            json.dump(example.__dict__, f, indent=2)
        logger.info(f"Training example added: {example_id}")
        return example_id
    def get_training_examples(self, teacher_id: Optional[str] = None) -> List[TrainingExample]:
        """Get all training examples, optionally filtered by teacher."""
        examples = []
        for meta_file in TRAINING_DATA_DIR.glob("*.json"):
            with open(meta_file) as f:
                data = json.load(f)
            example = TrainingExample(**data)
            if teacher_id is None or example.teacher_id == teacher_id:
                examples.append(example)
        return examples
    async def fine_tune(
        self,
        teacher_id: Optional[str] = None,
        epochs: int = 3,
        learning_rate: float = 5e-5
    ) -> Dict:
        """
        Fine-tune the model with collected training examples.

        Uses LoRA for efficient fine-tuning. Requires at least 10
        examples; on success the adapter is saved and the model is
        reloaded with it. Errors are reported in the returned dict,
        never raised.

        Args:
            teacher_id: If provided, only use examples from this teacher
            epochs: Number of training epochs
            learning_rate: Learning rate for fine-tuning

        Returns:
            Training statistics (or {"status": "error", ...})
        """
        examples = self.get_training_examples(teacher_id)
        if len(examples) < 10:
            return {
                "status": "error",
                "message": f"Need at least 10 examples, have {len(examples)}"
            }
        try:
            from peft import LoraConfig, get_peft_model, TaskType
            from transformers import Trainer, TrainingArguments
            from PIL import Image
            import torch
            self._load_model()
            logger.info(f"Starting fine-tuning with {len(examples)} examples")
            # Configure LoRA
            lora_config = LoraConfig(
                task_type=TaskType.SEQ_2_SEQ_LM,
                r=16,  # LoRA rank
                lora_alpha=32,
                lora_dropout=0.1,
                target_modules=["q_proj", "v_proj"]  # Attention layers
            )
            # Apply LoRA
            model = get_peft_model(self._model, lora_config)
            # Prepare dataset
            class OCRDataset(torch.utils.data.Dataset):
                def __init__(self, examples, processor):
                    self.examples = examples
                    self.processor = processor
                def __len__(self):
                    return len(self.examples)
                def __getitem__(self, idx):
                    ex = self.examples[idx]
                    image = Image.open(ex.image_path).convert("RGB")
                    pixel_values = self.processor(
                        images=image, return_tensors="pt"
                    ).pixel_values.squeeze()
                    labels = self.processor.tokenizer(
                        ex.ground_truth,
                        return_tensors="pt",
                        padding="max_length",
                        max_length=128
                    ).input_ids.squeeze()
                    return {
                        "pixel_values": pixel_values,
                        "labels": labels
                    }
            dataset = OCRDataset(examples, self._processor)
            # Training arguments
            output_dir = LORA_ADAPTERS_DIR / f"{self.model_name}_adapter"
            training_args = TrainingArguments(
                output_dir=str(output_dir),
                num_train_epochs=epochs,
                per_device_train_batch_size=4,
                learning_rate=learning_rate,
                save_strategy="epoch",
                logging_steps=10,
                remove_unused_columns=False,
            )
            # Train
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=dataset,
            )
            train_result = trainer.train()
            # Save adapter
            model.save_pretrained(str(output_dir))
            # Reload model with new adapter
            self._model = None
            self._load_model()
            return {
                "status": "success",
                "examples_used": len(examples),
                "epochs": epochs,
                "adapter_path": str(output_dir),
                "train_loss": train_result.training_loss
            }
        except ImportError as e:
            logger.error(f"Missing dependencies for fine-tuning: {e}")
            return {
                "status": "error",
                "message": f"Missing dependencies: {e}. Install with: pip install peft"
            }
        except Exception as e:
            logger.error(f"Fine-tuning failed: {e}")
            return {
                "status": "error",
                "message": str(e)
            }
    def get_model_info(self) -> Dict:
        """Get information about the loaded model."""
        adapter_path = LORA_ADAPTERS_DIR / f"{self.model_name}_adapter"
        return {
            "model_name": self.model_name,
            "model_id": self.model_id,
            "device": self.device,
            "is_loaded": self._model is not None,
            "has_lora_adapter": adapter_path.exists(),
            "lora_adapter_path": str(adapter_path) if adapter_path.exists() else None,
            "training_examples_count": len(list(TRAINING_DATA_DIR.glob("*.json"))),
        }
# Module-level singleton; replaced when a different model name is requested.
_trocr_service: Optional[TrOCRService] = None
def get_trocr_service(model_name: str = "trocr-base") -> TrOCRService:
    """Return the shared TrOCRService, recreating it on model change."""
    global _trocr_service
    needs_new = _trocr_service is None or _trocr_service.model_name != model_name
    if needs_new:
        _trocr_service = TrOCRService(model_name=model_name)
    return _trocr_service

View File

@@ -0,0 +1,309 @@
"""
Vision-OCR Service - Handschrifterkennung mit Llama 3.2 Vision.
DATENSCHUTZ/PRIVACY BY DESIGN:
- Alle Verarbeitung erfolgt lokal auf dem Mac Mini
- Keine Daten verlassen das lokale Netzwerk
- Keine Cloud-APIs beteiligt
- Perfekt für DSGVO-konforme Schulumgebungen
Verwendet llama3.2-vision:11b über Ollama für OCR/Handschrifterkennung.
Dies ist eine Alternative zu TrOCR mit besserer Handschrifterkennung.
"""
import os
import base64
import httpx
import logging
import time
from typing import Optional
from dataclasses import dataclass
from llm_gateway.config import get_config
logger = logging.getLogger(__name__)
@dataclass
class VisionOCRResult:
    """Result from Vision-LLM OCR extraction."""
    text: str  # transcribed text (stripped)
    confidence: float  # heuristic 0.0-1.0 estimate (not a model probability)
    processing_time_ms: int  # wall-clock request time
    model: str = "llama3.2-vision:11b"  # Ollama model that produced the text
    device: str = "local-ollama"  # marker that processing stayed on-premise
# System prompt for handwriting OCR (kept in German — the exams are German).
# It forbids inventing text, preserves line structure, and asks the model
# to mark illegible spans with "[unleserlich]" or "[?]".
HANDWRITING_OCR_PROMPT = """Du bist ein Experte für Handschrifterkennung (OCR).
AUFGABE: Extrahiere den handschriftlichen Text aus dem Bild so genau wie möglich.
WICHTIGE REGELN:
1. Transkribiere NUR den sichtbaren Text - erfinde nichts dazu
2. Behalte die Zeilenstruktur bei (jede Zeile auf einer neuen Zeile)
3. Bei unleserlichen Stellen: [unleserlich] oder [?] verwenden
4. Ignoriere Linien, Kästchen und andere Formatierungen
5. Korrigiere KEINE Rechtschreibfehler - transkribiere exakt was da steht
6. Bei Aufzählungen: Nummern/Punkte beibehalten (1., 2., a), b), etc.)
AUSGABE: Nur der transkribierte Text, keine Erklärungen oder Kommentare."""
# Alternative prompt for printed (non-handwritten) text.
PRINTED_OCR_PROMPT = """Extrahiere den gesamten Text aus diesem Bild.
Behalte die Struktur bei (Absätze, Listen, etc.).
Gib nur den extrahierten Text zurück, ohne Kommentare."""
class VisionOCRService:
    """
    OCR service backed by Llama 3.2 Vision via a local Ollama instance.

    Everything runs on the local Mac Mini — no cloud connection needed,
    which keeps the pipeline suitable for GDPR-compliant exam correction
    in schools.

    Usage:
        service = VisionOCRService()
        if await service.is_available():
            result = await service.extract_text(image_bytes)
            print(result.text)
    """

    def __init__(self, ollama_url: Optional[str] = None, model: Optional[str] = None):
        """
        Initialize Vision OCR Service.

        Args:
            ollama_url: Ollama API URL (default: from config)
            model: Vision model to use (default: the config's vision_model,
                typically llama3.2-vision:11b)
        """
        config = get_config()
        # Fall back to the standard local Ollama port when the gateway
        # config carries no Ollama section.
        self.ollama_url = ollama_url or (config.ollama.base_url if config.ollama else "http://localhost:11434")
        self.model = model or config.vision_model
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, recreating it if closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=300.0  # 5 min timeout for large images
            )
        return self._client

    async def close(self):
        """Close the HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    async def is_available(self) -> bool:
        """Check if Ollama is reachable and has a vision-capable model."""
        try:
            client = await self._get_client()
            # Check Ollama health
            response = await client.get(
                f"{self.ollama_url}/api/tags",
                timeout=5.0
            )
            if response.status_code != 200:
                return False
            # Check if a vision model is installed (llava also qualifies)
            data = response.json()
            models = [m.get("name", "") for m in data.get("models", [])]
            has_vision = any(
                "vision" in m.lower() or "llava" in m.lower()
                for m in models
            )
            if not has_vision:
                logger.warning(f"No vision model found. Available: {models}")
                return False
            return True
        except Exception as e:
            logger.warning(f"Vision OCR service not available: {e}")
            return False

    async def get_status(self) -> dict:
        """Return a status dict describing the Ollama host and its models."""
        try:
            client = await self._get_client()
            response = await client.get(f"{self.ollama_url}/api/tags")
            if response.status_code == 200:
                data = response.json()
                models = data.get("models", [])
                vision_models = [
                    m for m in models
                    if "vision" in m.get("name", "").lower() or "llava" in m.get("name", "").lower()
                ]
                return {
                    "status": "available",
                    "ollama_url": self.ollama_url,
                    "configured_model": self.model,
                    "vision_models": [m.get("name") for m in vision_models],
                    "total_models": len(models)
                }
            else:
                return {
                    "status": "unavailable",
                    "error": f"HTTP {response.status_code}"
                }
        except Exception as e:
            return {
                "status": "unavailable",
                "error": str(e)
            }

    async def extract_text(
        self,
        image_data: bytes,
        filename: str = "image.png",
        is_handwriting: bool = True
    ) -> VisionOCRResult:
        """
        Extract text from an image using the Vision LLM.

        Args:
            image_data: Raw image bytes (PNG, JPG, etc.)
            filename: Original filename (used for logging)
            is_handwriting: True for handwriting, False for printed text

        Returns:
            VisionOCRResult with extracted text

        Raises:
            httpx.TimeoutException: If Ollama does not answer in time.
            Exception: Any other transport or HTTP error.
        """
        start_time = time.time()
        try:
            client = await self._get_client()
            # Ollama expects the image base64-encoded inside the chat message.
            image_base64 = base64.b64encode(image_data).decode("utf-8")
            # Select appropriate prompt
            prompt = HANDWRITING_OCR_PROMPT if is_handwriting else PRINTED_OCR_PROMPT
            # Ollama Vision API request
            payload = {
                "model": self.model,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt,
                        "images": [image_base64]
                    }
                ],
                "stream": False,
                "options": {
                    "temperature": 0.1,  # Low temperature for consistent OCR
                    "num_predict": 2048,  # Max tokens for extracted text
                }
            }
            # BUGFIX: log messages previously contained the literal text
            # "(unknown)" and ignored the filename parameter entirely.
            logger.info(f"Sending image to Vision OCR: {filename} ({len(image_data)} bytes)")
            response = await client.post(
                f"{self.ollama_url}/api/chat",
                json=payload,
                timeout=180.0  # 3 min timeout
            )
            response.raise_for_status()
            data = response.json()
            extracted_text = data.get("message", {}).get("content", "")
            processing_time_ms = int((time.time() - start_time) * 1000)
            # Estimate confidence based on response quality
            confidence = self._estimate_confidence(extracted_text)
            logger.info(
                f"Vision OCR completed for {filename}: "
                f"{len(extracted_text)} chars in {processing_time_ms}ms"
            )
            return VisionOCRResult(
                text=extracted_text.strip(),
                confidence=confidence,
                processing_time_ms=processing_time_ms,
                model=self.model,
                device="local-ollama"
            )
        except httpx.TimeoutException:
            logger.error(f"Vision OCR timed out for {filename}")
            raise
        except Exception as e:
            logger.error(f"Vision OCR failed for {filename}: {e}")
            raise

    def _estimate_confidence(self, text: str) -> float:
        """
        Estimate OCR confidence based on text quality.

        This is a heuristic — real confidence would need model output.
        Empty output scores 0.0; otherwise start at 0.85 and subtract
        0.05 per uncertainty marker (capped at 0.3) and 0.1 for very
        short results, with a floor of 0.1.
        """
        if not text:
            return 0.0
        # Count the markers the prompt asks the model to emit for
        # illegible spans.
        uncertain_markers = text.count("[unleserlich]") + text.count("[?]")
        # Length of the actual transcription without the markers
        text_length = len(text.replace("[unleserlich]", "").replace("[?]", ""))
        if text_length == 0:
            return 0.1
        # Base confidence
        confidence = 0.85
        # Reduce for uncertain markers
        confidence -= min(uncertain_markers * 0.05, 0.3)
        # Very short text might be incomplete
        if text_length < 20:
            confidence -= 0.1
        return max(confidence, 0.1)
# Module-level singleton; created lazily on first access.
_vision_ocr_service: Optional[VisionOCRService] = None
def get_vision_ocr_service() -> VisionOCRService:
    """Return the shared Vision OCR service, creating it on first use."""
    global _vision_ocr_service
    if _vision_ocr_service is not None:
        return _vision_ocr_service
    _vision_ocr_service = VisionOCRService()
    return _vision_ocr_service
async def extract_handwriting(
    image_data: bytes,
    filename: str = "image.png"
) -> VisionOCRResult:
    """
    Convenience wrapper: extract handwriting from one image.

    Uses Llama 3.2 Vision locally via Ollama; all processing stays on
    the local Mac Mini (GDPR-compliant, no cloud).

    Args:
        image_data: Raw image bytes
        filename: Original filename

    Returns:
        VisionOCRResult with extracted text
    """
    return await get_vision_ocr_service().extract_text(
        image_data, filename, is_handwriting=True
    )

View File

@@ -0,0 +1,9 @@
"""
Tests for Klausurkorrektur Module.
Tests cover:
- Database models and repository
- Pseudonymization service
- API routes
- Privacy guarantees
"""

View File

@@ -0,0 +1,455 @@
"""
Tests for Magic Onboarding functionality.
Tests cover:
- OnboardingSession lifecycle
- Student detection and confirmation
- Roster parsing
- School resolution
- Module linking
"""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from datetime import datetime
# Import models
from klausur.db_models import (
OnboardingSession, DetectedStudent, ModuleLink,
OnboardingStatus, ModuleLinkType
)
# Import services
from klausur.services.roster_parser import RosterParser, RosterEntry, NameMatch
from klausur.services.school_resolver import SchoolResolver, BUNDESLAENDER, SCHULFORMEN
from klausur.services.module_linker import (
ModuleLinker, CorrectionResult, MeetingUrgency, ParentMeetingSuggestion
)
# =============================================================================
# ROSTER PARSER TESTS
# =============================================================================
class TestRosterParser:
    """Unit tests for the RosterParser name-matching service."""

    def test_match_first_names_exact_match(self):
        """An exactly spelled first name matches with full confidence."""
        parser = RosterParser()
        entries = [
            RosterEntry(first_name="Max", last_name="Mueller"),
            RosterEntry(first_name="Anna", last_name="Schmidt"),
            RosterEntry(first_name="Tim", last_name="Weber"),
        ]
        matches = parser.match_first_names(["Max", "Anna", "Tim"], entries)
        # Every detected name should produce a match record.
        assert len(matches) == 3
        max_match = next(m for m in matches if m.detected_name == "Max")
        assert max_match.matched_entry is not None
        assert max_match.matched_entry.last_name == "Mueller"
        assert max_match.match_type == "exact"
        assert max_match.confidence == 1.0

    def test_match_first_names_fuzzy_match(self):
        """A shortened name may fuzzy-match its longer roster counterpart."""
        parser = RosterParser()
        entries = [
            RosterEntry(first_name="Maximilian", last_name="Mueller"),
            RosterEntry(first_name="Anna-Lena", last_name="Schmidt"),
        ]
        # "Max" is a prefix of "Maximilian", so fuzzy matching may apply.
        matches = parser.match_first_names(["Max"], entries)
        assert len(matches) == 1
        candidate = matches[0]
        # Only check match_type when the parser actually found a match.
        if candidate.matched_entry is not None:
            assert candidate.match_type in ("first_name", "fuzzy")

    def test_match_first_names_no_match(self):
        """Names absent from the roster come back without a matched entry."""
        parser = RosterParser()
        entries = [RosterEntry(first_name="Max", last_name="Mueller")]
        matches = parser.match_first_names(["Sophie", "Lisa"], entries)
        assert len(matches) == 2
        assert all(m.matched_entry is None for m in matches)
        assert all(m.match_type == "none" for m in matches)

    def test_roster_entry_creation(self):
        """RosterEntry stores all optional contact fields."""
        entry = RosterEntry(
            first_name="Max",
            last_name="Mueller",
            student_number="12345",
            parent_email="eltern@example.com",
            parent_phone="+49123456789"
        )
        assert entry.first_name == "Max"
        assert entry.last_name == "Mueller"
        assert entry.parent_email == "eltern@example.com"

    def test_name_match_dataclass(self):
        """NameMatch links a detected name to its roster entry."""
        entry = RosterEntry(first_name="Max", last_name="Mueller")
        match = NameMatch(
            detected_name="Max",
            matched_entry=entry,
            confidence=1.0,
            match_type="exact"
        )
        assert match.detected_name == "Max"
        assert match.matched_entry.last_name == "Mueller"
        assert match.confidence == 1.0
# =============================================================================
# SCHOOL RESOLVER TESTS
# =============================================================================
class TestSchoolResolver:
    """Unit tests for the SchoolResolver service and its static tables."""

    def test_bundeslaender_completeness(self):
        """All 16 German federal states must be present, keyed by code."""
        assert len(BUNDESLAENDER) == 16
        # Spot-check a few state codes.
        for code in ("NI", "BY", "BE"):
            assert code in BUNDESLAENDER
        # And a full name mapping.
        assert BUNDESLAENDER["NI"] == "Niedersachsen"

    def test_schulformen_have_grades(self):
        """Every school form carries a non-empty list of grades."""
        for info in SCHULFORMEN.values():
            assert "grades" in info
            assert isinstance(info["grades"], list)
            assert info["grades"]

    def test_detect_grade_from_class_name(self):
        """Grade numbers are extracted from common class-name formats."""
        resolver = SchoolResolver()
        expectations = {"3a": 3, "10b": 10, "Q1": 11, "Q2": 12, "12": 12}
        for class_name, grade in expectations.items():
            assert resolver.detect_grade_from_class_name(class_name) == grade

    def test_detect_grade_returns_none_for_invalid(self):
        """Unparseable class names yield None instead of raising."""
        resolver = SchoolResolver()
        assert resolver.detect_grade_from_class_name("abc") is None
        assert resolver.detect_grade_from_class_name("") is None

    def test_local_storage_initialization(self):
        """A freshly built resolver starts with empty local caches."""
        resolver = SchoolResolver()
        assert resolver._local_schools == {}
        assert resolver._local_classes == {}
# =============================================================================
# MODULE LINKER TESTS
# =============================================================================
class TestModuleLinker:
    """Tests for ModuleLinker service (parent-meeting suggestions and grade stats)."""
    def test_suggest_elternabend_for_weak_students(self):
        """Parent-meeting suggestions are produced for grades at/below the threshold."""
        linker = ModuleLinker()
        # Mixed class: grades 5, 2, 5-, and 6 — only the 2 is above threshold.
        results = [
            CorrectionResult(
                doc_token="token1", score=25, max_score=100,
                grade="5", feedback=""
            ),
            CorrectionResult(
                doc_token="token2", score=85, max_score=100,
                grade="2", feedback=""
            ),
            CorrectionResult(
                doc_token="token3", score=30, max_score=100,
                grade="5-", feedback=""
            ),
            CorrectionResult(
                doc_token="token4", score=20, max_score=100,
                grade="6", feedback=""
            ),
        ]
        suggestions = linker.suggest_elternabend(
            results, subject="Mathematik", threshold_grade="4"
        )
        # Should suggest meetings for students with grades 4 or worse
        # Grades 5, 5-, and 6 should trigger meetings
        assert len(suggestions) == 3
        # Verify suggestions use doc_tokens (privacy)
        for suggestion in suggestions:
            assert suggestion.doc_token in ["token1", "token3", "token4"]
    def test_suggest_elternabend_empty_for_good_class(self):
        """No suggestions are emitted when every grade beats the threshold."""
        linker = ModuleLinker()
        results = [
            CorrectionResult(
                doc_token="token1", score=95, max_score=100,
                grade="1", feedback=""
            ),
            CorrectionResult(
                doc_token="token2", score=85, max_score=100,
                grade="2", feedback=""
            ),
            CorrectionResult(
                doc_token="token3", score=78, max_score=100,
                grade="3", feedback=""
            ),
        ]
        suggestions = linker.suggest_elternabend(
            results, subject="Deutsch", threshold_grade="4"
        )
        assert len(suggestions) == 0
    def test_calculate_grade_statistics(self):
        """Grade distribution and pass/fail counts are aggregated correctly."""
        linker = ModuleLinker()
        # Six results covering grades 1-5, with grade 2 appearing twice.
        results = [
            CorrectionResult(doc_token="t1", score=95, max_score=100, grade="1", feedback=""),
            CorrectionResult(doc_token="t2", score=85, max_score=100, grade="2", feedback=""),
            CorrectionResult(doc_token="t3", score=85, max_score=100, grade="2", feedback=""),
            CorrectionResult(doc_token="t4", score=75, max_score=100, grade="3", feedback=""),
            CorrectionResult(doc_token="t5", score=55, max_score=100, grade="4", feedback=""),
            CorrectionResult(doc_token="t6", score=25, max_score=100, grade="5", feedback=""),
        ]
        stats = linker.calculate_grade_statistics(results)
        assert isinstance(stats, dict)
        assert stats["count"] == 6
        # Check grade distribution
        assert stats["distribution"].get("1", 0) == 1
        assert stats["distribution"].get("2", 0) == 2
        assert stats["distribution"].get("3", 0) == 1
        # Check passing/failing counts
        assert stats["passing_count"] == 5 # Grades 1-4 pass
        assert stats["failing_count"] == 1 # Grade 5 fails
    def test_calculate_statistics_empty_results(self):
        """An empty result list yields an empty stats dict (not an error)."""
        linker = ModuleLinker()
        stats = linker.calculate_grade_statistics([])
        assert stats == {}
    def test_correction_result_creation(self):
        """CorrectionResult accepts per-question detail alongside the totals."""
        result = CorrectionResult(
            doc_token="abc-123",
            score=87,
            max_score=100,
            grade="2+",
            feedback="Gut geloest",
            question_results=[{"aufgabe": 1, "punkte": 10}]
        )
        assert result.doc_token == "abc-123"
        assert result.score == 87
        assert result.grade == "2+"
# =============================================================================
# DB MODEL TESTS
# =============================================================================
class TestOnboardingModels:
    """Tests for the Magic Onboarding database models and enums."""

    def test_onboarding_status_enum_values(self):
        """OnboardingStatus exposes every stage of the onboarding flow."""
        expected = {
            OnboardingStatus.ANALYZING: "analyzing",
            OnboardingStatus.CONFIRMING: "confirming",
            OnboardingStatus.PROCESSING: "processing",
            OnboardingStatus.LINKING: "linking",
            OnboardingStatus.COMPLETE: "complete",
        }
        for member, value in expected.items():
            assert member.value == value

    def test_module_link_type_enum_values(self):
        """ModuleLinkType covers every linkable target module."""
        expected = {
            ModuleLinkType.NOTENBUCH: "notenbuch",
            ModuleLinkType.ELTERNABEND: "elternabend",
            ModuleLinkType.ZEUGNIS: "zeugnis",
            ModuleLinkType.CALENDAR: "calendar",
            ModuleLinkType.KLASSENBUCH: "klassenbuch",
        }
        for member, value in expected.items():
            assert member.value == value

    def test_onboarding_session_repr(self):
        """repr() of a session surfaces the id prefix, class, and status."""
        session = OnboardingSession(
            id="12345678-1234-1234-1234-123456789abc",
            teacher_id="teacher-1",
            detected_class="3a",
            status=OnboardingStatus.ANALYZING
        )
        text = repr(session)
        for fragment in ("12345678", "3a", "analyzing"):
            assert fragment in text

    def test_detected_student_repr(self):
        """repr() of a detected student includes the first name."""
        student = DetectedStudent(
            id="12345678-1234-1234-1234-123456789abc",
            detected_first_name="Max"
        )
        assert "Max" in repr(student)

    def test_module_link_repr(self):
        """repr() of a module link includes link type and target module."""
        link = ModuleLink(
            id="12345678-1234-1234-1234-123456789abc",
            klausur_session_id="session-1",
            link_type=ModuleLinkType.NOTENBUCH,
            target_module="school"
        )
        text = repr(link)
        assert "notenbuch" in text
        assert "school" in text
# =============================================================================
# PRIVACY TESTS
# =============================================================================
class TestPrivacyInMagicOnboarding:
    """Checks that Magic Onboarding never handles more PII than necessary."""

    def test_detected_student_no_full_last_name_in_detection(self):
        """Detection stores only a last-name hint until the teacher confirms."""
        student = DetectedStudent(
            id="12345678-1234-1234-1234-123456789abc",
            detected_first_name="Max",
            detected_last_name_hint="M." # Only initial/hint, not full name
        )
        # During detection only the hint exists...
        assert student.detected_last_name_hint == "M."
        # ...and the confirmed full name stays unset until teacher approval.
        assert student.confirmed_last_name is None

    def test_module_link_uses_doc_tokens_not_names(self):
        """Module-link suggestions carry pseudonymized tokens, never names."""
        linker = ModuleLinker()
        weak_result = CorrectionResult(
            doc_token="uuid-token-1", score=45, max_score=100,
            grade="4", feedback=""
        )
        suggestions = linker.suggest_elternabend(
            [weak_result], subject="Deutsch", threshold_grade="4"
        )
        for suggestion in suggestions:
            # Suggestions reference doc_tokens, not student names.
            assert hasattr(suggestion, 'doc_token')
            assert suggestion.doc_token == "uuid-token-1"
# =============================================================================
# INTEGRATION FLOW TESTS
# =============================================================================
class TestMagicOnboardingFlow:
    """Flow-level tests for the complete Magic Onboarding process."""

    def test_onboarding_status_progression(self):
        """The first five enum members mirror the flow's progression order."""
        statuses = list(OnboardingStatus)
        assert statuses[:5] == [
            OnboardingStatus.ANALYZING,
            OnboardingStatus.CONFIRMING,
            OnboardingStatus.PROCESSING,
            OnboardingStatus.LINKING,
            OnboardingStatus.COMPLETE,
        ]

    def test_grade_conversion_german_scale(self):
        """German grades 1-4 count as passing; 5 and 6 count as failing."""
        linker = ModuleLinker()
        rows = [("t1", 95, "1"), ("t2", 80, "2"), ("t3", 65, "3"),
                ("t4", 50, "4"), ("t5", 30, "5"), ("t6", 15, "6")]
        results = [
            CorrectionResult(doc_token=t, score=s, max_score=100, grade=g, feedback="")
            for t, s, g in rows
        ]
        stats = linker.calculate_grade_statistics(results)
        # 4 passing (grades 1-4), 2 failing (grades 5, 6).
        assert stats["passing_count"] == 4
        assert stats["failing_count"] == 2

    def test_meeting_urgency_levels(self):
        """Each threshold-or-worse grade gets a suggestion; grade 6 is HIGH."""
        linker = ModuleLinker()
        rows = [("t1", 55, "4"), ("t2", 30, "5"), ("t3", 15, "6")]
        results = [
            CorrectionResult(doc_token=t, score=s, max_score=100, grade=g, feedback="")
            for t, s, g in rows
        ]
        suggestions = linker.suggest_elternabend(
            results, subject="Mathe", threshold_grade="4"
        )
        urgencies = [s.urgency for s in suggestions]
        assert len(urgencies) == 3
        # The worst grade must be flagged as high urgency.
        worst = next(s for s in suggestions if s.grade == "6")
        assert worst.urgency == MeetingUrgency.HIGH

View File

@@ -0,0 +1,209 @@
"""
Tests for PseudonymizationService.
Verifies that:
- doc_tokens are cryptographically random
- QR codes are generated correctly
- Header redaction works as expected
- No personal data leaks through pseudonymization
"""
import pytest
import uuid
from unittest.mock import patch, MagicMock
from klausur.services.pseudonymizer import (
PseudonymizationService,
get_pseudonymizer,
RedactionResult,
QRDetectionResult,
)
class TestDocTokenGeneration:
    """Tests for pseudonymous doc_token generation."""

    def test_generate_doc_token_returns_valid_uuid(self):
        """A doc_token must parse as a version-4 (random) UUID."""
        token = PseudonymizationService().generate_doc_token()
        assert uuid.UUID(token).version == 4

    def test_generate_doc_token_is_unique(self):
        """1000 consecutive tokens must all be distinct."""
        service = PseudonymizationService()
        tokens = {service.generate_doc_token() for _ in range(1000)}
        assert len(tokens) == 1000

    def test_generate_batch_tokens_correct_count(self):
        """Batch generation yields exactly the requested number of unique tokens."""
        batch = PseudonymizationService().generate_batch_tokens(25)
        assert len(batch) == 25
        assert len(set(batch)) == 25

    def test_token_no_correlation_to_index(self):
        """Separate batches never overlap, so order carries no information."""
        service = PseudonymizationService()
        first = service.generate_batch_tokens(10)
        second = service.generate_batch_tokens(10)
        assert set(first).isdisjoint(second)
class TestQRCodeGeneration:
    """Tests for QR code PNG generation."""

    def test_generate_qr_code_returns_bytes(self):
        """QR generation returns bytes that start with the PNG signature."""
        service = PseudonymizationService()
        token = service.generate_doc_token()
        try:
            payload = service.generate_qr_code(token)
        except RuntimeError:
            # The optional qrcode dependency is absent in this environment.
            pytest.skip("qrcode library not installed")
        assert isinstance(payload, bytes)
        assert payload.startswith(b'\x89PNG\r\n\x1a\n')

    def test_generate_qr_code_custom_size(self):
        """The size parameter is honoured: a bigger size yields a bigger PNG."""
        service = PseudonymizationService()
        token = service.generate_doc_token()
        try:
            compact = service.generate_qr_code(token, size=100)
            big = service.generate_qr_code(token, size=400)
        except RuntimeError:
            pytest.skip("qrcode library not installed")
        signature = b'\x89PNG\r\n\x1a\n'
        assert compact.startswith(signature)
        assert big.startswith(signature)
        assert len(big) > len(compact)
class TestHeaderRedaction:
    """Tests for header redaction (blacking out the name field of a scan)."""
    def test_redact_header_returns_redaction_result(self):
        """A valid image yields a RedactionResult with redacted image bytes."""
        service = PseudonymizationService()
        # Create a simple test image (1x1 white pixel PNG)
        # This is a minimal valid PNG
        test_png = (
            b'\x89PNG\r\n\x1a\n' # PNG signature
            b'\x00\x00\x00\rIHDR' # IHDR chunk
            b'\x00\x00\x00\x01' # Width: 1
            b'\x00\x00\x00\x01' # Height: 1
            b'\x08\x02' # Bit depth: 8, Color type: RGB
            b'\x00\x00\x00' # Compression, Filter, Interlace
            b'\x90wS\xde' # CRC
            b'\x00\x00\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18\xd8N' # IDAT
            b'\x00\x00\x00\x00IEND\xaeB`\x82' # IEND
        )
        result = service.redact_header(test_png)
        assert isinstance(result, RedactionResult)
        assert isinstance(result.redacted_image, bytes)
    def test_redact_header_with_invalid_image_returns_original(self):
        """Undecodable input falls back to the original bytes, unredacted."""
        service = PseudonymizationService()
        invalid_data = b'not an image'
        result = service.redact_header(invalid_data)
        # Fallback keeps the bytes intact and reports that nothing was redacted.
        assert result.redacted_image == invalid_data
        assert result.redaction_applied is False
class TestQRDetection:
    """Tests for QR code detection on uploaded images."""

    def test_detect_qr_code_no_qr_returns_none(self):
        """Input without a QR code yields a None token at zero confidence."""
        outcome = PseudonymizationService().detect_qr_code(b'not an image with qr')
        assert outcome.doc_token is None
        assert outcome.confidence == 0.0
class TestSingleton:
    """Tests for the module-level pseudonymizer singleton."""

    def test_get_pseudonymizer_returns_same_instance(self):
        """Repeated calls hand back the identical object."""
        assert get_pseudonymizer() is get_pseudonymizer()

    def test_pseudonymizer_is_service_instance(self):
        """The singleton is an actual PseudonymizationService."""
        assert isinstance(get_pseudonymizer(), PseudonymizationService)
class TestPrivacyGuarantees:
    """Tests verifying that tokens cannot leak personal data."""

    def test_token_cannot_be_reversed_to_name(self):
        """Generated tokens contain no fragment of any student name."""
        service = PseudonymizationService()
        names = ["Max Mustermann", "Anna Schmidt", "Tim Mueller"]
        for token in service.generate_batch_tokens(len(names)):
            lowered = token.lower()
            for name in names:
                assert name.lower() not in lowered
                for fragment in name.split():
                    assert fragment.lower() not in lowered

    def test_token_generation_is_not_deterministic(self):
        """Equal-sized batches never reproduce the same tokens."""
        service = PseudonymizationService()
        assert set(service.generate_batch_tokens(5)).isdisjoint(
            service.generate_batch_tokens(5)
        )

    def test_token_entropy(self):
        """Tokens follow the UUID shape and use a broad hex alphabet."""
        tokens = PseudonymizationService().generate_batch_tokens(100)
        for token in tokens:
            # UUID textual form: 8-4-4-4-12 hex groups, 36 chars in total.
            assert len(token) == 36
            assert token.count('-') == 4
        # Rough entropy check: across 100 tokens all hex digits should appear.
        hex_chars = set(''.join(t.replace('-', '') for t in tokens))
        assert len(hex_chars) >= 10

View File

@@ -0,0 +1,248 @@
"""
Tests for KlausurRepository.
Verifies:
- Teacher isolation (critical for privacy)
- CRUD operations
- Data retention cleanup
"""
import pytest
from datetime import datetime, timedelta
from unittest.mock import MagicMock, patch
from sqlalchemy.orm import Session
from klausur.repository import KlausurRepository
from klausur.db_models import (
ExamSession, PseudonymizedDocument, QRBatchJob,
SessionStatus, DocumentStatus
)
@pytest.fixture
def mock_db():
    """Create a mock database session (MagicMock constrained to the Session API)."""
    return MagicMock(spec=Session)
@pytest.fixture
def repo(mock_db):
    """Create a KlausurRepository wired to the mocked DB session."""
    return KlausurRepository(mock_db)
class TestTeacherIsolation:
    """Tests for teacher namespace isolation (CRITICAL for privacy)."""
    def test_get_session_requires_teacher_id(self, repo, mock_db):
        """Getting a session must require teacher_id."""
        # Setup mock: the fluent query chain returns itself so any
        # filter/first sequence resolves without a real database.
        mock_query = MagicMock()
        mock_db.query.return_value = mock_query
        mock_query.filter.return_value = mock_query
        mock_query.first.return_value = None
        # Attempt to get session
        result = repo.get_session("session-123", "teacher-A")
        # Verify filter was called (teacher isolation)
        mock_db.query.assert_called_with(ExamSession)
        mock_query.filter.assert_called()
    def test_list_sessions_only_returns_teacher_sessions(self, repo, mock_db):
        """Listing sessions must filter by teacher_id."""
        # Chain every query-builder step back onto the same mock.
        mock_query = MagicMock()
        mock_db.query.return_value = mock_query
        mock_query.filter.return_value = mock_query
        mock_query.order_by.return_value = mock_query
        mock_query.offset.return_value = mock_query
        mock_query.limit.return_value = mock_query
        mock_query.all.return_value = []
        result = repo.list_sessions("teacher-A")
        # Verify query chain
        mock_db.query.assert_called_with(ExamSession)
    def test_get_document_verifies_teacher_ownership(self, repo, mock_db):
        """Getting a document must verify teacher owns the session."""
        mock_query = MagicMock()
        mock_db.query.return_value = mock_query
        mock_query.join.return_value = mock_query
        mock_query.filter.return_value = mock_query
        mock_query.first.return_value = None
        result = repo.get_document("doc-token-123", "teacher-A")
        # Must join with ExamSession to verify teacher_id
        mock_query.join.assert_called()
    def test_different_teachers_cannot_see_each_others_sessions(self, repo, mock_db):
        """Teacher A cannot access Teacher B's sessions."""
        # Create mock session owned by teacher-B
        session_b = MagicMock(spec=ExamSession)
        session_b.teacher_id = "teacher-B"
        session_b.id = "session-123"
        mock_query = MagicMock()
        mock_db.query.return_value = mock_query
        mock_query.filter.return_value = mock_query
        # Return None because filter should exclude teacher-B's session
        mock_query.first.return_value = None
        # Teacher A tries to access
        result = repo.get_session("session-123", "teacher-A")
        assert result is None
class TestSessionOperations:
    """Tests for session CRUD operations."""
    def test_create_session_sets_teacher_id(self, repo, mock_db):
        """Creating a session must set the teacher_id."""
        repo.create_session(
            teacher_id="teacher-123",
            name="Mathe Klausur",
            subject="Mathematik"
        )
        # Verify session was added with teacher_id.
        # call_args[0][0] is the first positional argument of db.add().
        mock_db.add.assert_called_once()
        added_session = mock_db.add.call_args[0][0]
        assert added_session.teacher_id == "teacher-123"
        assert added_session.name == "Mathe Klausur"
    def test_create_session_sets_retention_date(self, repo, mock_db):
        """Sessions must have a retention date for auto-deletion."""
        repo.create_session(
            teacher_id="teacher-123",
            name="Test",
            retention_days=30
        )
        added_session = mock_db.add.call_args[0][0]
        assert added_session.retention_until is not None
        # Should be approximately 30 days in the future
        # (60-second tolerance absorbs test-runtime clock drift).
        expected = datetime.utcnow() + timedelta(days=30)
        diff = abs((added_session.retention_until - expected).total_seconds())
        assert diff < 60 # Within 1 minute
    def test_delete_session_soft_delete_by_default(self, repo, mock_db):
        """Deleting should soft-delete by default."""
        mock_session = MagicMock(spec=ExamSession)
        mock_session.status = SessionStatus.CREATED
        mock_query = MagicMock()
        mock_db.query.return_value = mock_query
        mock_query.filter.return_value = mock_query
        mock_query.first.return_value = mock_session
        result = repo.delete_session("session-123", "teacher-A")
        # Should set status to DELETED, not actually delete
        assert mock_session.status == SessionStatus.DELETED
        mock_db.delete.assert_not_called()
    def test_delete_session_hard_delete_when_requested(self, repo, mock_db):
        """Hard delete should actually delete the record."""
        mock_session = MagicMock(spec=ExamSession)
        mock_query = MagicMock()
        mock_db.query.return_value = mock_query
        mock_query.filter.return_value = mock_query
        mock_query.first.return_value = mock_session
        result = repo.delete_session("session-123", "teacher-A", hard_delete=True)
        # hard_delete=True must translate into a real DB delete.
        mock_db.delete.assert_called_once_with(mock_session)
class TestDocumentOperations:
    """Tests for document CRUD operations."""

    def test_create_document_requires_valid_session(self, repo, mock_db):
        """Creation returns None when the teacher owns no such session."""
        query = MagicMock()
        mock_db.query.return_value = query
        query.filter.return_value = query
        # Session lookup misses (wrong teacher, or it does not exist).
        query.first.return_value = None
        created = repo.create_document(
            session_id="session-123",
            teacher_id="teacher-A"
        )
        assert created is None

    def test_update_document_ocr_changes_status(self, repo, mock_db):
        """Storing OCR output also advances the document status."""
        doc = MagicMock(spec=PseudonymizedDocument)
        doc.status = DocumentStatus.UPLOADED
        # Bypass the DB lookup so only the update logic is under test.
        with patch.object(repo, 'get_document', return_value=doc):
            repo.update_document_ocr(
                doc_token="doc-123",
                teacher_id="teacher-A",
                ocr_text="Student answer text",
                confidence=95
            )
        assert doc.ocr_text == "Student answer text"
        assert doc.ocr_confidence == 95
        assert doc.status == DocumentStatus.OCR_COMPLETED
class TestDataRetention:
    """Tests for retention-driven cleanup of expired sessions."""

    def test_cleanup_expired_sessions(self, repo, mock_db):
        """Expired sessions are soft-deleted and their identity map wiped."""
        # A completed session whose retention window ended yesterday.
        stale = MagicMock(spec=ExamSession)
        stale.retention_until = datetime.utcnow() - timedelta(days=1)
        stale.status = SessionStatus.COMPLETED
        stale.encrypted_identity_map = b"encrypted_data"
        query = MagicMock()
        mock_db.query.return_value = query
        query.filter.return_value = query
        query.all.return_value = [stale]
        assert repo.cleanup_expired_sessions() == 1
        assert stale.status == SessionStatus.DELETED
        # The encrypted name-to-token map must not survive past retention.
        assert stale.encrypted_identity_map is None
class TestStatistics:
    """Tests for anonymized statistics."""
    def test_get_session_stats_returns_anonymized_data(self, repo, mock_db):
        """Statistics should not contain any PII."""
        mock_session = MagicMock(spec=ExamSession)
        mock_session.document_count = 25
        mock_session.processed_count = 20
        mock_query = MagicMock()
        mock_db.query.return_value = mock_query
        mock_query.filter.return_value = mock_query
        # first() is called twice: once for status counts and once for score stats
        # Return a tuple for score_stats that can be subscripted
        mock_query.first.return_value = (85.0, 60, 100) # avg, min, max scores
        mock_query.group_by.return_value = mock_query
        mock_query.all.return_value = []
        # Bypass the session lookup so only the stats aggregation is tested.
        with patch.object(repo, 'get_session', return_value=mock_session):
            stats = repo.get_session_stats("session-123", "teacher-A")
        # Stats should contain only aggregate data, no PII
        assert "session_id" in stats
        assert "total_documents" in stats
        # Should NOT contain student names or tokens
        assert "student_names" not in stats
        assert "doc_tokens" not in stats

View File

@@ -0,0 +1,346 @@
"""
Tests for Klausur API Routes.
Verifies:
- API endpoint behavior
- Request validation
- Response format
- Privacy guarantees at API level
"""
import pytest
from unittest.mock import MagicMock, patch, AsyncMock
from fastapi.testclient import TestClient
from fastapi import FastAPI
from klausur.routes import router
from klausur.db_models import SessionStatus, DocumentStatus
@pytest.fixture
def app():
    """Create a minimal FastAPI app with the klausur router mounted under /api."""
    app = FastAPI()
    app.include_router(router, prefix="/api")
    return app
@pytest.fixture
def client(app):
    """Create a synchronous TestClient bound to the test app."""
    return TestClient(app)
class TestSessionEndpoints:
    """Tests for session-related endpoints."""
    # NOTE: @patch decorators apply bottom-up, so the bottom patch (get_db)
    # maps to the first mock parameter and KlausurRepository to the second.
    @patch('klausur.routes.KlausurRepository')
    @patch('klausur.routes.get_db')
    def test_create_session_returns_201(self, mock_get_db, mock_repo_class, client):
        """Creating a session should return 201."""
        # Setup mocks: get_db is a generator dependency, hence iter([...]).
        mock_db = MagicMock()
        mock_get_db.return_value = iter([mock_db])
        mock_repo = MagicMock()
        mock_repo_class.return_value = mock_repo
        # Fully populated session mock so response serialization succeeds.
        mock_session = MagicMock()
        mock_session.id = "session-123"
        mock_session.name = "Test Klausur"
        mock_session.subject = "Mathe"
        mock_session.class_name = "10a"
        mock_session.total_points = 100
        mock_session.status = SessionStatus.CREATED
        mock_session.document_count = 0
        mock_session.processed_count = 0
        mock_session.created_at = "2024-01-15T10:00:00"
        mock_session.completed_at = None
        mock_session.retention_until = "2024-02-15T10:00:00"
        mock_repo.create_session.return_value = mock_session
        response = client.post("/api/klausur/sessions", json={
            "name": "Test Klausur",
            "subject": "Mathe",
            "class_name": "10a"
        })
        assert response.status_code == 201
        data = response.json()
        assert data["name"] == "Test Klausur"
        assert data["status"] == "created"
    @patch('klausur.routes.KlausurRepository')
    @patch('klausur.routes.get_db')
    def test_create_session_validates_name(self, mock_get_db, mock_repo_class, client):
        """Session name is required and must not be empty."""
        response = client.post("/api/klausur/sessions", json={
            "name": "", # Empty name
            "subject": "Mathe"
        })
        assert response.status_code == 422 # Validation error
    @patch('klausur.routes.KlausurRepository')
    @patch('klausur.routes.get_db')
    def test_list_sessions_returns_array(self, mock_get_db, mock_repo_class, client):
        """Listing sessions should return an array."""
        mock_db = MagicMock()
        mock_get_db.return_value = iter([mock_db])
        mock_repo = MagicMock()
        mock_repo_class.return_value = mock_repo
        mock_repo.list_sessions.return_value = []
        response = client.get("/api/klausur/sessions")
        assert response.status_code == 200
        data = response.json()
        assert "sessions" in data
        assert isinstance(data["sessions"], list)
    @patch('klausur.routes.KlausurRepository')
    @patch('klausur.routes.get_db')
    def test_get_session_404_when_not_found(self, mock_get_db, mock_repo_class, client):
        """Getting non-existent session should return 404."""
        mock_db = MagicMock()
        mock_get_db.return_value = iter([mock_db])
        mock_repo = MagicMock()
        mock_repo_class.return_value = mock_repo
        mock_repo.get_session.return_value = None
        response = client.get("/api/klausur/sessions/nonexistent-123")
        assert response.status_code == 404
class TestQREndpoints:
    """Tests for QR code generation endpoints."""
    # NOTE: @patch decorators apply bottom-up — the mock parameters mirror
    # the decorator list in reverse order.
    @patch('klausur.routes.KlausurRepository')
    @patch('klausur.routes.get_pseudonymizer')
    @patch('klausur.routes.get_db')
    def test_generate_qr_batch_creates_tokens(
        self, mock_get_db, mock_get_pseudonymizer, mock_repo_class, client
    ):
        """QR batch generation should create correct number of tokens."""
        mock_db = MagicMock()
        mock_get_db.return_value = iter([mock_db])
        mock_repo = MagicMock()
        mock_repo_class.return_value = mock_repo
        mock_session = MagicMock()
        mock_repo.get_session.return_value = mock_session
        mock_batch = MagicMock()
        mock_batch.id = "batch-123"
        mock_batch.student_count = 5
        mock_repo.create_qr_batch.return_value = mock_batch
        # Pseudonymizer returns a fixed batch of five tokens.
        mock_pseudonymizer = MagicMock()
        mock_pseudonymizer.generate_batch_tokens.return_value = [
            "token-1", "token-2", "token-3", "token-4", "token-5"
        ]
        mock_get_pseudonymizer.return_value = mock_pseudonymizer
        response = client.post("/api/klausur/sessions/session-123/qr-batch", json={
            "student_count": 5
        })
        assert response.status_code == 200
        data = response.json()
        assert len(data["generated_tokens"]) == 5
    @patch('klausur.routes.KlausurRepository')
    @patch('klausur.routes.get_db')
    def test_qr_batch_validates_student_count(self, mock_get_db, mock_repo_class, client):
        """Student count must be within valid range."""
        # Too many students
        response = client.post("/api/klausur/sessions/session-123/qr-batch", json={
            "student_count": 200 # Max is 100
        })
        assert response.status_code == 422
class TestUploadEndpoints:
    """Tests for document upload endpoints."""
    # NOTE: @patch decorators apply bottom-up — the mock parameters mirror
    # the decorator list in reverse order.
    @patch('klausur.routes.KlausurRepository')
    @patch('klausur.routes.get_pseudonymizer')
    @patch('klausur.routes.get_db')
    def test_upload_applies_redaction_by_default(
        self, mock_get_db, mock_get_pseudonymizer, mock_repo_class, client
    ):
        """Upload should apply header redaction by default."""
        mock_db = MagicMock()
        mock_get_db.return_value = iter([mock_db])
        mock_repo = MagicMock()
        mock_repo_class.return_value = mock_repo
        mock_session = MagicMock()
        mock_repo.get_session.return_value = mock_session
        # Fully populated document mock so the response model serializes.
        mock_doc = MagicMock()
        mock_doc.doc_token = "doc-token-123"
        mock_doc.session_id = "session-123"
        mock_doc.status = DocumentStatus.UPLOADED
        mock_doc.page_number = 1
        mock_doc.total_pages = 1
        mock_doc.ocr_confidence = 0
        mock_doc.ai_score = None
        mock_doc.ai_grade = None
        mock_doc.ai_feedback = None
        mock_doc.created_at = "2024-01-15T10:00:00"
        mock_doc.processing_completed_at = None
        mock_repo.create_document.return_value = mock_doc
        # No QR in the upload, so a fresh token is generated and the
        # header region is redacted.
        mock_pseudonymizer = MagicMock()
        mock_pseudonymizer.detect_qr_code.return_value = MagicMock(doc_token=None)
        mock_pseudonymizer.generate_doc_token.return_value = "doc-token-123"
        mock_pseudonymizer.smart_redact_header.return_value = MagicMock(
            redaction_applied=True,
            redacted_image=b"redacted",
            redacted_height=300
        )
        mock_get_pseudonymizer.return_value = mock_pseudonymizer
        # Create a minimal file upload
        response = client.post(
            "/api/klausur/sessions/session-123/upload",
            files={"file": ("test.png", b"fake image data", "image/png")}
        )
        # Verify redaction was called
        mock_pseudonymizer.smart_redact_header.assert_called_once()
class TestResultsEndpoints:
    """Tests for results endpoints."""

    @patch('klausur.routes.KlausurRepository')
    @patch('klausur.routes.get_db')
    def test_results_only_return_pseudonymized_data(
        self, mock_get_db, mock_repo_class, client
    ):
        """Results should only contain doc_tokens, not names."""
        db = MagicMock()
        mock_get_db.return_value = iter([db])
        repo = MagicMock()
        mock_repo_class.return_value = repo

        session = MagicMock()
        session.total_points = 100
        repo.get_session.return_value = session

        # A completed correction identified solely by its pseudonymous token.
        completed_doc = MagicMock()
        completed_doc.doc_token = "anonymous-token-123"
        completed_doc.status = DocumentStatus.COMPLETED
        completed_doc.ai_score = 85
        completed_doc.ai_grade = "2+"
        completed_doc.ai_feedback = "Good work"
        completed_doc.ai_details = {}
        repo.list_documents.return_value = [completed_doc]

        response = client.get("/api/klausur/sessions/session-123/results")
        assert response.status_code == 200

        body = response.json()
        assert len(body) == 1
        entry = body[0]
        # Only the pseudonymous token may identify a document -- never a name.
        assert "doc_token" in entry
        assert "student_name" not in entry
        assert "name" not in entry
class TestIdentityMapEndpoints:
    """Tests for identity map (vault) endpoints."""

    @patch('klausur.routes.KlausurRepository')
    @patch('klausur.routes.get_db')
    def test_store_identity_map_accepts_encrypted_data(
        self, mock_get_db, mock_repo_class, client
    ):
        """Identity map endpoint should accept encrypted data."""
        import base64

        db = MagicMock()
        mock_get_db.return_value = iter([db])
        repo = MagicMock()
        mock_repo_class.return_value = repo
        repo.update_session_identity_map.return_value = MagicMock()

        # Simulate a client-side encrypted blob (base64 transport encoding).
        ciphertext = base64.b64encode(b"encrypted identity map").decode()
        response = client.post(
            "/api/klausur/sessions/session-123/identity-map",
            json={"encrypted_data": ciphertext, "iv": "base64iv=="},
        )
        assert response.status_code == 204

    @patch('klausur.routes.KlausurRepository')
    @patch('klausur.routes.get_db')
    def test_get_identity_map_returns_encrypted_blob(
        self, mock_get_db, mock_repo_class, client
    ):
        """Getting identity map should return encrypted blob."""
        db = MagicMock()
        mock_get_db.return_value = iter([db])
        repo = MagicMock()
        mock_repo_class.return_value = repo

        session = MagicMock()
        session.encrypted_identity_map = b"encrypted data"
        session.identity_map_iv = "ivvalue"
        repo.get_session.return_value = session

        response = client.get("/api/klausur/sessions/session-123/identity-map")
        assert response.status_code == 200

        payload = response.json()
        # The server must hand back only the ciphertext plus its IV.
        assert "encrypted_data" in payload
        assert "iv" in payload
class TestPrivacyAtAPILevel:
"""Tests verifying privacy guarantees at API level."""
def test_no_student_names_in_any_response_schema(self):
    """Verify response schemas don't include student names."""
    from klausur.routes import (
        SessionResponse, DocumentResponse, CorrectionResultResponse
    )

    # Collect every declared field name across all response models.
    models = (SessionResponse, DocumentResponse, CorrectionResultResponse)
    all_fields = [name for model in models for name in model.model_fields.keys()]

    # Exact-match blocklist of PII-carrying field names.
    # A bare "name" stays allowed: it denotes the exam/session title
    # (e.g. "Mathe Klausur"), not a person.
    forbidden = {"student_name", "schueler_name", "student", "pupil", "schueler"}
    for field_name in all_fields:
        assert field_name.lower() not in forbidden, (
            f"Field '{field_name}' may contain PII"
        )
def test_identity_map_request_requires_encryption(self):
    """Identity map must be encrypted before storage."""
    from klausur.routes import IdentityMapUpdate

    schema_fields = set(IdentityMapUpdate.model_fields.keys())
    # The API accepts only ciphertext -- plaintext name fields must not exist.
    assert "encrypted_data" in schema_fields
    assert "names" not in schema_fields
    assert "student_names" not in schema_fields