""" Klausurkorrektur API Routes. Privacy-by-Design exam correction with QR-code based pseudonymization. All endpoints are teacher-scoped - no cross-teacher data access possible. DSGVO Compliance: - No student names stored in backend - Only doc_tokens (pseudonymized IDs) used - Identity mapping encrypted client-side - All data auto-deleted after retention period """ import uuid import logging import re import json from datetime import datetime, timedelta from typing import Optional, List from io import BytesIO from fastapi import APIRouter, HTTPException, Query, Depends, UploadFile, File, Response, BackgroundTasks from fastapi.responses import StreamingResponse from sqlalchemy.orm import Session from pydantic import BaseModel, Field from .database import get_db from .db_models import ( ExamSession, PseudonymizedDocument, QRBatchJob, SessionStatus, DocumentStatus ) from .repository import KlausurRepository from .services.pseudonymizer import get_pseudonymizer from .services.correction_service import get_correction_service, QuestionRubric from .services.storage_service import get_storage_service from .services.processing_service import get_processing_service logger = logging.getLogger(__name__) router = APIRouter(prefix="/klausur", tags=["Klausurkorrektur"]) # ============================================================================= # Pydantic Schemas # ============================================================================= class SessionCreate(BaseModel): """Request to create a new exam session.""" name: str = Field(..., min_length=1, max_length=200, description="Session name (e.g., 'Mathe 10a - Klausur 1')") subject: str = Field(default="", max_length=100) class_name: str = Field(default="", max_length=100, description="Class name (e.g., '10a')") total_points: int = Field(default=100, ge=1, le=1000) rubric: str = Field(default="", description="General grading criteria") questions: List[dict] = Field(default=[], description="Question definitions with rubrics") retention_days: int = Field(default=30, ge=1, le=365, description="Auto-delete after N days") class SessionResponse(BaseModel): """Response for an exam session.""" id: str name: str subject: str class_name: str total_points: int status: str document_count: int processed_count: int created_at: datetime completed_at: Optional[datetime] = None retention_until: Optional[datetime] = None class Config: from_attributes = True class SessionListResponse(BaseModel): """List of exam sessions.""" sessions: List[SessionResponse] total: int class DocumentResponse(BaseModel): """Response for a pseudonymized document.""" doc_token: str session_id: str status: str page_number: int total_pages: int ocr_confidence: int ai_score: Optional[int] = None ai_grade: Optional[str] = None ai_feedback: Optional[str] = None created_at: datetime processing_completed_at: Optional[datetime] = None class Config: from_attributes = True class DocumentListResponse(BaseModel): """List of documents in a session.""" documents: List[DocumentResponse] total: int class QRBatchRequest(BaseModel): """Request to generate QR code batch.""" student_count: int = Field(..., ge=1, le=100, description="Number of QR codes to generate") labels: Optional[List[str]] = Field(default=None, description="Optional labels (numbers only, NO names!)") class QRBatchResponse(BaseModel): """Response with generated QR batch.""" batch_id: str session_id: str student_count: int generated_tokens: List[str] class IdentityMapUpdate(BaseModel): """Request to store encrypted identity map.""" encrypted_data: str = Field(..., description="Base64-encoded encrypted identity map") iv: str = Field(..., description="Initialization vector for decryption") class ProcessingStats(BaseModel): """Processing statistics for a session.""" session_id: str total_documents: int processed_documents: int status_breakdown: dict score_average: Optional[float] = None score_min: Optional[int] = None score_max: Optional[int] = None class CorrectionResultResponse(BaseModel): """AI correction result (pseudonymized).""" doc_token: str total_score: int max_score: int grade: str overall_feedback: str question_results: List[dict] # ============================================================================= # Helper Functions # ============================================================================= def get_teacher_id(request=None) -> str: """ Get teacher ID from request context. In production, this should extract the teacher ID from JWT token. For now, we use a placeholder that should be replaced with actual auth. """ # TODO: Implement proper JWT extraction # return request.state.teacher_id return "default_teacher" # ============================================================================= # Session Endpoints # ============================================================================= @router.post("/sessions", response_model=SessionResponse, status_code=201) async def create_session( data: SessionCreate, db: Session = Depends(get_db) ): """ Create a new exam correction session. This initializes a workspace for pseudonymized exam correction. No student data is stored at this point. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) session = repo.create_session( teacher_id=teacher_id, name=data.name, subject=data.subject, class_name=data.class_name, total_points=data.total_points, rubric=data.rubric, questions=data.questions, retention_days=data.retention_days ) return SessionResponse( id=session.id, name=session.name, subject=session.subject, class_name=session.class_name, total_points=session.total_points, status=session.status.value, document_count=session.document_count, processed_count=session.processed_count, created_at=session.created_at, completed_at=session.completed_at, retention_until=session.retention_until ) @router.get("/sessions", response_model=SessionListResponse) async def list_sessions( include_archived: bool = Query(False, description="Include archived sessions"), limit: int = Query(50, ge=1, le=100), offset: int = Query(0, ge=0), db: Session = Depends(get_db) ): """List all exam sessions for the current teacher.""" teacher_id = get_teacher_id() repo = KlausurRepository(db) sessions = repo.list_sessions( teacher_id=teacher_id, include_archived=include_archived, limit=limit, offset=offset ) return SessionListResponse( sessions=[SessionResponse( id=s.id, name=s.name, subject=s.subject, class_name=s.class_name, total_points=s.total_points, status=s.status.value, document_count=s.document_count, processed_count=s.processed_count, created_at=s.created_at, completed_at=s.completed_at, retention_until=s.retention_until ) for s in sessions], total=len(sessions) ) @router.get("/sessions/{session_id}", response_model=SessionResponse) async def get_session( session_id: str, db: Session = Depends(get_db) ): """Get details of a specific session.""" teacher_id = get_teacher_id() repo = KlausurRepository(db) session = repo.get_session(session_id, teacher_id) if not session: raise HTTPException(status_code=404, detail="Session not found") return SessionResponse( id=session.id, name=session.name, subject=session.subject, class_name=session.class_name, total_points=session.total_points, status=session.status.value, document_count=session.document_count, processed_count=session.processed_count, created_at=session.created_at, completed_at=session.completed_at, retention_until=session.retention_until ) @router.delete("/sessions/{session_id}", status_code=204) async def delete_session( session_id: str, hard_delete: bool = Query(False, description="Permanently delete (vs soft delete)"), db: Session = Depends(get_db) ): """Delete an exam session and all associated documents.""" teacher_id = get_teacher_id() repo = KlausurRepository(db) success = repo.delete_session(session_id, teacher_id, hard_delete=hard_delete) if not success: raise HTTPException(status_code=404, detail="Session not found") return Response(status_code=204) # ============================================================================= # QR Code Generation Endpoints # ============================================================================= @router.post("/sessions/{session_id}/qr-batch", response_model=QRBatchResponse) async def generate_qr_batch( session_id: str, data: QRBatchRequest, db: Session = Depends(get_db) ): """ Generate QR codes for exam pseudonymization. Each QR code contains a random doc_token that will be used to track the exam through the correction process WITHOUT revealing the student's identity. IMPORTANT: Labels should be numbers only (e.g., "Nr. 1", "Nr. 2"), NOT student names! """ teacher_id = get_teacher_id() repo = KlausurRepository(db) session = repo.get_session(session_id, teacher_id) if not session: raise HTTPException(status_code=404, detail="Session not found") # Generate random tokens pseudonymizer = get_pseudonymizer() tokens = pseudonymizer.generate_batch_tokens(data.student_count) # Create batch record batch = repo.create_qr_batch( session_id=session_id, teacher_id=teacher_id, student_count=data.student_count, generated_tokens=tokens ) return QRBatchResponse( batch_id=batch.id, session_id=session_id, student_count=data.student_count, generated_tokens=tokens ) @router.get("/sessions/{session_id}/qr-sheet") async def download_qr_sheet( session_id: str, batch_id: Optional[str] = Query(None), db: Session = Depends(get_db) ): """ Download printable QR code sheet as PNG. The sheet contains QR codes with doc_tokens that students will attach to their exams for pseudonymized tracking. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) session = repo.get_session(session_id, teacher_id) if not session: raise HTTPException(status_code=404, detail="Session not found") # Get the batch (or create one if not specified) if batch_id: batch = repo.get_qr_batch(batch_id, teacher_id) if not batch: raise HTTPException(status_code=404, detail="QR batch not found") tokens = batch.generated_tokens else: # Get all tokens from documents docs = repo.list_documents(session_id, teacher_id) tokens = [d.doc_token for d in docs] if not tokens: raise HTTPException(status_code=400, detail="No documents or QR batch found") # Generate QR sheet pseudonymizer = get_pseudonymizer() try: sheet_bytes = pseudonymizer.generate_qr_sheet(tokens) except RuntimeError as e: raise HTTPException(status_code=500, detail=str(e)) return StreamingResponse( BytesIO(sheet_bytes), media_type="image/png", headers={ "Content-Disposition": f"attachment; filename=qr_sheet_{session_id[:8]}.png" } ) # ============================================================================= # Document Upload & Processing Endpoints # ============================================================================= @router.post("/sessions/{session_id}/upload", response_model=DocumentResponse) async def upload_document( session_id: str, file: UploadFile = File(...), auto_redact: bool = Query(True, description="Automatically redact header area"), db: Session = Depends(get_db) ): """ Upload a scanned exam page. The document will be: 1. Scanned for QR code to extract doc_token 2. Header area redacted to remove personal data (if auto_redact=True) 3. Stored for OCR processing PRIVACY: Header redaction removes student name/class before storage. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) session = repo.get_session(session_id, teacher_id) if not session: raise HTTPException(status_code=404, detail="Session not found") # Read file content content = await file.read() pseudonymizer = get_pseudonymizer() # Try to detect QR code qr_result = pseudonymizer.detect_qr_code(content) doc_token = qr_result.doc_token if not doc_token: # Generate new token if QR not found doc_token = pseudonymizer.generate_doc_token() logger.warning(f"No QR code found in upload, generated new token: {doc_token[:8]}") # Redact header if requested if auto_redact: redaction_result = pseudonymizer.smart_redact_header(content, preserve_qr=True) if redaction_result.redaction_applied: content = redaction_result.redacted_image logger.info(f"Redacted {redaction_result.redacted_height}px header from document") # Create document record doc = repo.create_document( session_id=session_id, teacher_id=teacher_id, doc_token=doc_token ) if not doc: raise HTTPException(status_code=500, detail="Failed to create document") # Store content in MinIO storage try: storage = get_storage_service() file_ext = file.filename.split(".")[-1] if file.filename else "png" storage.upload_document( session_id=session_id, doc_token=doc_token, file_data=content, file_extension=file_ext, is_redacted=auto_redact ) logger.info(f"Stored document {doc_token[:8]} in MinIO") except Exception as e: logger.warning(f"Failed to store document in MinIO (continuing anyway): {e}") return DocumentResponse( doc_token=doc.doc_token, session_id=doc.session_id, status=doc.status.value, page_number=doc.page_number, total_pages=doc.total_pages, ocr_confidence=doc.ocr_confidence, ai_score=doc.ai_score, ai_grade=doc.ai_grade, ai_feedback=doc.ai_feedback, created_at=doc.created_at, processing_completed_at=doc.processing_completed_at ) @router.get("/sessions/{session_id}/documents", response_model=DocumentListResponse) async def list_documents( session_id: str, db: Session = Depends(get_db) ): """List all documents in a session (pseudonymized).""" teacher_id = get_teacher_id() repo = KlausurRepository(db) session = repo.get_session(session_id, teacher_id) if not session: raise HTTPException(status_code=404, detail="Session not found") docs = repo.list_documents(session_id, teacher_id) return DocumentListResponse( documents=[DocumentResponse( doc_token=d.doc_token, session_id=d.session_id, status=d.status.value, page_number=d.page_number, total_pages=d.total_pages, ocr_confidence=d.ocr_confidence, ai_score=d.ai_score, ai_grade=d.ai_grade, ai_feedback=d.ai_feedback, created_at=d.created_at, processing_completed_at=d.processing_completed_at ) for d in docs], total=len(docs) ) @router.get("/documents/{doc_token}", response_model=DocumentResponse) async def get_document( doc_token: str, db: Session = Depends(get_db) ): """Get details of a specific document by token.""" teacher_id = get_teacher_id() repo = KlausurRepository(db) doc = repo.get_document(doc_token, teacher_id) if not doc: raise HTTPException(status_code=404, detail="Document not found") return DocumentResponse( doc_token=doc.doc_token, session_id=doc.session_id, status=doc.status.value, page_number=doc.page_number, total_pages=doc.total_pages, ocr_confidence=doc.ocr_confidence, ai_score=doc.ai_score, ai_grade=doc.ai_grade, ai_feedback=doc.ai_feedback, created_at=doc.created_at, processing_completed_at=doc.processing_completed_at ) # ============================================================================= # Processing & Correction Endpoints # ============================================================================= @router.post("/sessions/{session_id}/process", status_code=202) async def start_processing( session_id: str, background_tasks: BackgroundTasks, use_ai: bool = Query(default=True, description="Run AI correction (requires LLM)"), db: Session = Depends(get_db) ): """ Start OCR and AI correction for all uploaded documents. This triggers background processing: 1. OCR extraction of student answers (via TrOCR on Mac Mini) 2. AI-assisted correction using self-hosted LLM 3. Grade calculation PRIVACY: Only pseudonymized text is sent to LLM. No student names or personal data. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) session = repo.get_session(session_id, teacher_id) if not session: raise HTTPException(status_code=404, detail="Session not found") if session.document_count == 0: raise HTTPException(status_code=400, detail="No documents to process") if session.status == SessionStatus.PROCESSING: raise HTTPException(status_code=409, detail="Session is already processing") # Update session status repo.update_session_status(session_id, teacher_id, SessionStatus.PROCESSING) # Start background processing task async def run_processing(): """Background task wrapper.""" from .database import SessionLocal db_session = SessionLocal() try: service = get_processing_service(db_session) await service.process_session( session_id=session_id, teacher_id=teacher_id, use_ai_correction=use_ai ) except Exception as e: logger.error(f"Background processing failed: {e}") # Mark session as failed try: repo_err = KlausurRepository(db_session) repo_err.update_session_status(session_id, teacher_id, SessionStatus.CREATED) except Exception: pass finally: db_session.close() # Add to background tasks background_tasks.add_task(run_processing) logger.info(f"Started background processing for session {session_id} with {session.document_count} documents") return { "status": "processing", "message": "Background processing started", "session_id": session_id, "document_count": session.document_count, "use_ai_correction": use_ai } @router.get("/sessions/{session_id}/stats", response_model=ProcessingStats) async def get_processing_stats( session_id: str, db: Session = Depends(get_db) ): """Get anonymized processing statistics for a session.""" teacher_id = get_teacher_id() repo = KlausurRepository(db) stats = repo.get_session_stats(session_id, teacher_id) if not stats: raise HTTPException(status_code=404, detail="Session not found") return ProcessingStats(**stats) @router.get("/sessions/{session_id}/results", response_model=List[CorrectionResultResponse]) async def get_correction_results( session_id: str, db: Session = Depends(get_db) ): """ Get AI correction results (pseudonymized). Returns doc_token + scores/grades WITHOUT student names. The teacher's client can rejoin these with the encrypted identity map to reveal which student each result belongs to. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) session = repo.get_session(session_id, teacher_id) if not session: raise HTTPException(status_code=404, detail="Session not found") docs = repo.list_documents(session_id, teacher_id) results = [] for doc in docs: if doc.status == DocumentStatus.COMPLETED: results.append(CorrectionResultResponse( doc_token=doc.doc_token, total_score=doc.ai_score or 0, max_score=session.total_points, grade=doc.ai_grade or "", overall_feedback=doc.ai_feedback or "", question_results=doc.ai_details.get("question_results", []) if doc.ai_details else [] )) return results # ============================================================================= # Identity Map (Client-Side Encryption) Endpoints # ============================================================================= @router.post("/sessions/{session_id}/identity-map", status_code=204) async def store_identity_map( session_id: str, data: IdentityMapUpdate, db: Session = Depends(get_db) ): """ Store encrypted identity map for a session. PRIVACY DESIGN: - The identity map (doc_token → student name) is encrypted with the teacher's password BEFORE being sent to server - Server stores only the encrypted blob - Server CANNOT decrypt the mapping - Only the teacher (with their password) can rejoin results This is zero-knowledge storage. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) import base64 try: encrypted_bytes = base64.b64decode(data.encrypted_data) except Exception: raise HTTPException(status_code=400, detail="Invalid base64 data") result = repo.update_session_identity_map( session_id=session_id, teacher_id=teacher_id, encrypted_map=encrypted_bytes, iv=data.iv ) if not result: raise HTTPException(status_code=404, detail="Session not found") return Response(status_code=204) @router.get("/sessions/{session_id}/identity-map") async def get_identity_map( session_id: str, db: Session = Depends(get_db) ): """ Retrieve encrypted identity map. Returns the encrypted blob that the teacher's client can decrypt locally to rejoin results with student names. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) session = repo.get_session(session_id, teacher_id) if not session: raise HTTPException(status_code=404, detail="Session not found") if not session.encrypted_identity_map: raise HTTPException(status_code=404, detail="No identity map stored") import base64 return { "encrypted_data": base64.b64encode(session.encrypted_identity_map).decode(), "iv": session.identity_map_iv } # ============================================================================= # Data Retention Endpoint # ============================================================================= @router.post("/maintenance/cleanup", status_code=200) async def cleanup_expired_data( db: Session = Depends(get_db) ): """ Clean up expired sessions (data retention). This should be called periodically (e.g., daily cron job). Deletes sessions past their retention_until date. """ repo = KlausurRepository(db) deleted_count = repo.cleanup_expired_sessions() return { "status": "ok", "deleted_sessions": deleted_count, "timestamp": datetime.utcnow().isoformat() } # ============================================================================= # Magic Onboarding Endpoints # ============================================================================= # Import additional models for Magic Onboarding from .db_models import OnboardingSession, DetectedStudent, ModuleLink, OnboardingStatus, ModuleLinkType from .services.roster_parser import get_roster_parser from .services.school_resolver import get_school_resolver, BUNDESLAENDER, SCHULFORMEN, FAECHER from .services.module_linker import get_module_linker, CorrectionResult class MagicAnalysisRequest(BaseModel): """Request for magic header analysis (client-side results).""" detected_class: Optional[str] = None detected_subject: Optional[str] = None detected_date: Optional[str] = None students: List[dict] = Field(default=[]) # [{firstName, lastNameHint, confidence}] confidence: float = Field(default=0.0, ge=0.0, le=1.0) class MagicAnalysisResponse(BaseModel): """Response after magic analysis.""" onboarding_id: str detected_class: Optional[str] detected_subject: Optional[str] detected_date: Optional[str] student_count: int confidence: float bundeslaender: dict # For school cascade schulformen: dict existing_classes: List[dict] # Teacher's existing classes class OnboardingConfirmRequest(BaseModel): """Request to confirm onboarding data.""" onboarding_id: str # School context bundesland: str schulform: str school_name: str # Class info class_name: str subject: str # Students (confirmed) students: List[dict] # [{firstName, lastName, parentEmail?, parentPhone?}] # Options create_class: bool = Field(default=True) link_to_existing_class_id: Optional[str] = None class OnboardingConfirmResponse(BaseModel): """Response after confirmation.""" session_id: str onboarding_id: str class_id: Optional[str] student_count: int ready_for_correction: bool class RosterUploadResponse(BaseModel): """Response after roster upload.""" parsed_count: int matched_count: int entries: List[dict] # [{firstName, lastName, parentEmail?, matched: bool}] warnings: List[str] class MagicCorrectionRequest(BaseModel): """Request to start magic correction.""" onboarding_id: str rubric: str = Field(default="") questions: List[dict] = Field(default=[]) class ResultsWithLinksResponse(BaseModel): """Results with module links.""" results: List[CorrectionResultResponse] statistics: dict module_links: List[dict] parent_meeting_suggestions: List[dict] class FileExtractionRequest(BaseModel): """Request to extract info from uploaded exam files.""" filenames: List[str] = Field(default=[], description="Original filenames for metadata extraction") use_llm: bool = Field(default=True, description="Use LLM for intelligent extraction") class ExamExtractionResult(BaseModel): """Extracted information from an exam file.""" filename: str detected_student_name: Optional[str] = None detected_last_name_hint: Optional[str] = None detected_class: Optional[str] = None detected_subject: Optional[str] = None detected_date: Optional[str] = None detected_grade: Optional[str] = None detected_score: Optional[int] = None detected_max_score: Optional[int] = None is_nachschreiben: bool = False is_separate_page: bool = False page_number: Optional[int] = None question_scores: List[dict] = Field(default=[]) # [{question: 1, score: 5, max: 10}] raw_text: Optional[str] = None confidence: float = 0.0 class FileExtractionResponse(BaseModel): """Response with extracted exam information.""" results: List[ExamExtractionResult] detected_class: Optional[str] = None detected_subject: Optional[str] = None detected_date: Optional[str] = None student_count: int = 0 overall_confidence: float = 0.0 @router.post("/magic-onboarding/extract", response_model=FileExtractionResponse) async def extract_exam_info( files: List[UploadFile] = File(...), db: Session = Depends(get_db) ): """ Server-side extraction of exam information using OCR and LLM. Extracts: - Student names from headers - Class and subject from context - Grades and scores if already corrected - Question-level scores Uses: 1. Filename parsing for initial metadata 2. OCR for text extraction 3. Ollama/Qwen for intelligent parsing (if available) """ import re import httpx results = [] class_votes = {} subject_votes = {} date_votes = {} for file in files: filename = file.filename or "" content = await file.read() # Parse filename for metadata filename_info = _parse_exam_filename(filename) result = ExamExtractionResult( filename=filename, detected_class=filename_info.get('class'), detected_subject=filename_info.get('subject'), detected_date=filename_info.get('date'), is_nachschreiben=filename_info.get('nachschreiben', False), is_separate_page=filename_info.get('separate_page', False), page_number=filename_info.get('page_number'), confidence=0.5 # Base confidence from filename ) # Try to extract student name from filename if filename_info.get('student_name'): result.detected_student_name = filename_info['student_name'] result.confidence = 0.7 # Vote for class/subject if result.detected_class: class_votes[result.detected_class] = class_votes.get(result.detected_class, 0) + 1 if result.detected_subject: subject_votes[result.detected_subject] = subject_votes.get(result.detected_subject, 0) + 1 if result.detected_date: date_votes[result.detected_date] = date_votes.get(result.detected_date, 0) + 1 # Try LLM extraction if Ollama is available try: llm_result = await _extract_with_ollama(content, filename) if llm_result: result.detected_student_name = llm_result.get('student_name') or result.detected_student_name result.detected_last_name_hint = llm_result.get('last_name_hint') result.detected_grade = llm_result.get('grade') result.detected_score = llm_result.get('score') result.detected_max_score = llm_result.get('max_score') result.question_scores = llm_result.get('question_scores', []) result.raw_text = llm_result.get('raw_text', '')[:500] # Truncate for response result.confidence = max(result.confidence, llm_result.get('confidence', 0.0)) except Exception as e: logger.warning(f"LLM extraction failed for {filename}: {e}") results.append(result) # Determine overall detected values detected_class = max(class_votes.items(), key=lambda x: x[1])[0] if class_votes else None detected_subject = max(subject_votes.items(), key=lambda x: x[1])[0] if subject_votes else None detected_date = max(date_votes.items(), key=lambda x: x[1])[0] if date_votes else None overall_confidence = sum(r.confidence for r in results) / len(results) if results else 0.0 return FileExtractionResponse( results=results, detected_class=detected_class, detected_subject=detected_subject, detected_date=detected_date, student_count=len(results), overall_confidence=overall_confidence ) def _parse_exam_filename(filename: str) -> dict: """ Parse exam filename for metadata. Expected patterns: - 20260119_103820_Mathe_Klasse_3-1_2026-01-15_085630.pdf - Mathe_Klasse_3_Nachschreiben_2026-01-15_090901.pdf - Mathe_Klasse_3-2_Miguel_Seite_2_2026-01-15_090620.pdf """ import re result = { 'class': None, 'subject': None, 'date': None, 'nachschreiben': False, 'separate_page': False, 'page_number': None, 'student_name': None } # Remove extension name = filename.rsplit('.', 1)[0] if '.' in filename else filename # Detect subject (common German subjects) subjects = ['Mathe', 'Mathematik', 'Deutsch', 'Englisch', 'Physik', 'Chemie', 'Bio', 'Biologie', 'Geschichte', 'Erdkunde', 'Geographie', 'Kunst', 'Musik', 'Sport', 'Informatik', 'Französisch', 'Latein', 'Spanisch', 'Religion', 'Ethik', 'Politik', 'Wirtschaft'] for subject in subjects: if subject.lower() in name.lower(): result['subject'] = subject break # Detect class (e.g., Klasse_3-1, 3a, 10b, Q1) class_patterns = [ r'Klasse[_\s]*(\d+[-a-zA-Z0-9]*)', # Klasse_3-1, Klasse 10a r'(\d{1,2}[a-zA-Z])', # 3a, 10b r'(Q[12])', # Q1, Q2 (Oberstufe) r'(E[PF])', # EP, EF (Einführungsphase) ] for pattern in class_patterns: match = re.search(pattern, name, re.IGNORECASE) if match: result['class'] = match.group(1) break # Detect date (YYYY-MM-DD or DD.MM.YYYY) date_patterns = [ r'(\d{4}-\d{2}-\d{2})', # 2026-01-15 r'(\d{2}\.\d{2}\.\d{4})', # 15.01.2026 ] for pattern in date_patterns: match = re.search(pattern, name) if match: result['date'] = match.group(1) break # Detect Nachschreiben if 'nachschreib' in name.lower(): result['nachschreiben'] = True # Detect separate page (Seite_2) page_match = re.search(r'Seite[_\s]*(\d+)', name, re.IGNORECASE) if page_match: result['separate_page'] = True result['page_number'] = int(page_match.group(1)) # Try to extract student name (usually after class, before date) # Pattern: ...Klasse_3-2_Miguel_Seite... name_match = re.search(r'Klasse[_\s]*\d+[-a-zA-Z0-9]*[_\s]+([A-Z][a-z]+)(?:[_\s]|$)', name) if name_match: potential_name = name_match.group(1) # Exclude common non-name words if potential_name not in ['Seite', 'Nachschreiben', 'Teil', 'Aufgabe']: result['student_name'] = potential_name return result async def _extract_with_ollama(content: bytes, filename: str) -> Optional[dict]: """ Use Ollama (local or Mac Mini) to extract information from exam content. Tries local Ollama first, then Mac Mini if configured. """ import httpx import base64 # Ollama endpoints to try ollama_endpoints = [ "http://localhost:11434", # Local "http://192.168.178.163:11434", # Mac Mini ] # Convert PDF first page to image if needed image_data = None if filename.lower().endswith('.pdf'): try: # Try to extract first page as image # This requires pdf2image or PyMuPDF image_data = await _pdf_to_image(content) except Exception as e: logger.warning(f"PDF conversion failed: {e}") return None elif filename.lower().endswith(('.png', '.jpg', '.jpeg')): image_data = content if not image_data: return None # Create prompt for extraction prompt = """Analysiere dieses Bild einer Klausur/Klassenarbeit und extrahiere folgende Informationen im JSON-Format: { "student_name": "Vorname des Schülers (falls sichtbar)", "last_name_hint": "Anfangsbuchstabe des Nachnamens (z.B. 'M.' falls sichtbar)", "grade": "Note falls eingetragen (z.B. '2+', '3', '5-')", "score": Punktzahl als Zahl (falls vorhanden), "max_score": Maximale Punktzahl als Zahl (falls vorhanden), "question_scores": [{"question": 1, "score": 5, "max": 10}], "confidence": Konfidenz 0.0-1.0 } Antworte NUR mit dem JSON, kein zusätzlicher Text.""" # Try each endpoint for endpoint in ollama_endpoints: try: async with httpx.AsyncClient(timeout=30.0) as client: # Check if vision model is available response = await client.get(f"{endpoint}/api/tags") if response.status_code != 200: continue models = response.json().get('models', []) # Prefer vision models: llava, bakllava, moondream, qwen2-vl vision_model = None for m in models: name = m.get('name', '').lower() if any(vm in name for vm in ['llava', 'moondream', 'qwen', 'vision']): vision_model = m['name'] break # Fall back to text model with OCR model = vision_model or (models[0]['name'] if models else None) if not model: continue # Call Ollama request_data = { "model": model, "prompt": prompt, "stream": False } if vision_model and image_data: request_data["images"] = [base64.b64encode(image_data).decode()] response = await client.post( f"{endpoint}/api/generate", json=request_data ) if response.status_code == 200: result_text = response.json().get('response', '') # Parse JSON from response import json try: # Extract JSON from response json_match = re.search(r'\{[^}]+\}', result_text, re.DOTALL) if json_match: return json.loads(json_match.group()) except json.JSONDecodeError: logger.warning(f"Failed to parse LLM response as JSON") return None except Exception as e: logger.debug(f"Ollama endpoint {endpoint} failed: {e}") continue return None async def _pdf_to_image(content: bytes) -> Optional[bytes]: """Convert first page of PDF to PNG image.""" try: import fitz # PyMuPDF doc = fitz.open(stream=content, filetype="pdf") page = doc[0] pix = page.get_pixmap(dpi=150) return pix.tobytes("png") except ImportError: pass try: from pdf2image import convert_from_bytes images = convert_from_bytes(content, first_page=1, last_page=1, dpi=150) if images: from io import BytesIO buffer = BytesIO() images[0].save(buffer, format='PNG') return buffer.getvalue() except ImportError: pass return None @router.post("/magic-onboarding/analyze", response_model=MagicAnalysisResponse) async def magic_analyze( data: MagicAnalysisRequest, db: Session = Depends(get_db) ): """ Phase 1: Store client-side analysis results and prepare for confirmation. The actual header extraction happens client-side using the local LLM. This endpoint stores the results and provides school cascade data. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) resolver = get_school_resolver() # Create onboarding session onboarding = OnboardingSession( teacher_id=teacher_id, detected_class=data.detected_class, detected_subject=data.detected_subject, detected_student_count=len(data.students), detection_confidence=int(data.confidence * 100), status=OnboardingStatus.CONFIRMING ) onboarding.analysis_completed_at = datetime.utcnow() db.add(onboarding) # Store detected students for student_data in data.students: student = DetectedStudent( onboarding_session_id=onboarding.id, detected_first_name=student_data.get('firstName'), detected_last_name_hint=student_data.get('lastNameHint'), confidence=int(student_data.get('confidence', 0) * 100) ) db.add(student) db.commit() db.refresh(onboarding) # Get teacher's existing classes existing_classes = await resolver.get_classes_for_teacher(teacher_id) return MagicAnalysisResponse( onboarding_id=onboarding.id, detected_class=onboarding.detected_class, detected_subject=onboarding.detected_subject, detected_date=data.detected_date, student_count=onboarding.detected_student_count, confidence=data.confidence, bundeslaender=BUNDESLAENDER, schulformen={k: v['name'] for k, v in SCHULFORMEN.items()}, existing_classes=[{ 'id': c.id, 'name': c.name, 'grade_level': c.grade_level } for c in existing_classes] ) @router.post("/magic-onboarding/upload-roster", response_model=RosterUploadResponse) async def upload_roster( onboarding_id: str = Query(...), file: UploadFile = File(...), db: Session = Depends(get_db) ): """ Phase 2a: Upload Klassenbuch photo or roster file. Parses the uploaded file and matches names to detected students. """ teacher_id = get_teacher_id() parser = get_roster_parser() # Get onboarding session onboarding = db.query(OnboardingSession).filter( OnboardingSession.id == onboarding_id, OnboardingSession.teacher_id == teacher_id ).first() if not onboarding: raise HTTPException(status_code=404, detail="Onboarding session not found") # Read file content = await file.read() filename = file.filename.lower() # Parse based on file type if filename.endswith(('.png', '.jpg', '.jpeg')): roster = parser.parse_klassenbuch_image(content) elif filename.endswith('.pdf'): roster = parser.parse_pdf_roster(content) elif filename.endswith('.csv'): roster = parser.parse_csv_roster(content.decode('utf-8')) else: raise HTTPException(status_code=400, detail="Unsupported file format") # Get detected students detected_students = db.query(DetectedStudent).filter( DetectedStudent.onboarding_session_id == onboarding_id ).all() detected_names = [s.detected_first_name for s in detected_students if s.detected_first_name] # Match names matches = parser.match_first_names(detected_names, roster.entries) # Update detected students with matched data matched_count = 0 for match in matches: if match.matched_entry and match.confidence > 0.7: for student in detected_students: if student.detected_first_name == match.detected_name: student.confirmed_first_name = match.matched_entry.first_name student.confirmed_last_name = match.matched_entry.last_name student.parent_email = match.matched_entry.parent_email student.parent_phone = match.matched_entry.parent_phone matched_count += 1 break db.commit() return RosterUploadResponse( parsed_count=len(roster.entries), matched_count=matched_count, entries=[{ 'firstName': e.first_name, 'lastName': e.last_name, 'parentEmail': e.parent_email, 'parentPhone': e.parent_phone, 'matched': any( m.matched_entry and m.matched_entry.first_name == e.first_name for m in matches ) } for e in roster.entries], warnings=roster.warnings ) @router.post("/magic-onboarding/confirm", response_model=OnboardingConfirmResponse) async def confirm_onboarding( data: OnboardingConfirmRequest, db: Session = Depends(get_db) ): """ Phase 2b: Confirm onboarding data and create class/session. Creates the school class (if requested) and exam session. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) resolver = get_school_resolver() # Get onboarding session onboarding = db.query(OnboardingSession).filter( OnboardingSession.id == data.onboarding_id, OnboardingSession.teacher_id == teacher_id ).first() if not onboarding: raise HTTPException(status_code=404, detail="Onboarding session not found") # Update school context onboarding.bundesland = data.bundesland onboarding.schulform = data.schulform onboarding.school_name = data.school_name onboarding.confirmed_class = data.class_name onboarding.confirmed_subject = data.subject onboarding.confirmation_completed_at = datetime.utcnow() class_id = data.link_to_existing_class_id # Create class if requested if data.create_class and not class_id: from .services.school_resolver import DetectedClassInfo # Get or create school school = await resolver.get_or_create_school( teacher_id=teacher_id, bundesland=data.bundesland, schulform=data.schulform, school_name=data.school_name ) onboarding.linked_school_id = school.id # Create class class_info = DetectedClassInfo( class_name=data.class_name, students=data.students ) school_class = await resolver.auto_create_class( teacher_id=teacher_id, school_id=school.id, detected_info=class_info ) class_id = school_class.id onboarding.linked_class_id = class_id # Create exam session session = repo.create_session( teacher_id=teacher_id, name=f"{data.subject} - {data.class_name}", subject=data.subject, class_name=data.class_name, total_points=100 ) session.linked_school_class_id = class_id onboarding.klausur_session_id = session.id onboarding.status = OnboardingStatus.PROCESSING # Update detected students with confirmed data for student_data in data.students: # Update or create detected student first_name = student_data.get('firstName') if first_name: student = db.query(DetectedStudent).filter( DetectedStudent.onboarding_session_id == data.onboarding_id, DetectedStudent.detected_first_name == first_name ).first() if student: student.confirmed_first_name = first_name student.confirmed_last_name = student_data.get('lastName', '') student.parent_email = student_data.get('parentEmail') student.parent_phone = student_data.get('parentPhone') db.commit() return OnboardingConfirmResponse( session_id=session.id, onboarding_id=onboarding.id, class_id=class_id, student_count=len(data.students), ready_for_correction=True ) @router.post("/magic-onboarding/start-correction") async def start_magic_correction( data: MagicCorrectionRequest, db: Session = Depends(get_db) ): """ Phase 3: Start background correction. Triggers the AI correction process for all uploaded documents. """ teacher_id = get_teacher_id() # Get onboarding session onboarding = db.query(OnboardingSession).filter( OnboardingSession.id == data.onboarding_id, OnboardingSession.teacher_id == teacher_id ).first() if not onboarding: raise HTTPException(status_code=404, detail="Onboarding session not found") if not onboarding.klausur_session_id: raise HTTPException(status_code=400, detail="Session not confirmed yet") onboarding.processing_started_at = datetime.utcnow() db.commit() # The actual correction is triggered via the existing /sessions/{id}/process endpoint return { "status": "started", "session_id": onboarding.klausur_session_id, "onboarding_id": onboarding.id, "message": "Korrektur gestartet. Verwende /sessions/{id}/progress-stream fuer Updates." } @router.get("/sessions/{session_id}/results-with-links", response_model=ResultsWithLinksResponse) async def get_results_with_links( session_id: str, db: Session = Depends(get_db) ): """ Phase 4: Get results with module links. Returns correction results along with suggestions for module linking. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) linker = get_module_linker() session = repo.get_session(session_id, teacher_id) if not session: raise HTTPException(status_code=404, detail="Session not found") # Get documents documents = repo.list_documents(session_id, teacher_id) completed_docs = [d for d in documents if d.status == DocumentStatus.COMPLETED] # Build correction results results = [] correction_results = [] # For linker for doc in completed_docs: result = CorrectionResultResponse( doc_token=doc.doc_token, total_score=doc.ai_score or 0, max_score=session.total_points, grade=doc.ai_grade or "", overall_feedback=doc.ai_feedback or "", question_results=doc.ai_details.get('question_results', []) if doc.ai_details else [] ) results.append(result) correction_results.append(CorrectionResult( doc_token=doc.doc_token, score=float(doc.ai_score or 0), max_score=float(session.total_points), grade=doc.ai_grade or "", feedback=doc.ai_feedback or "" )) # Calculate statistics stats = linker.calculate_grade_statistics(correction_results) # Get existing module links links = db.query(ModuleLink).filter( ModuleLink.klausur_session_id == session_id ).all() # Generate parent meeting suggestions meeting_suggestions = linker.suggest_elternabend( results=correction_results, subject=session.subject ) return ResultsWithLinksResponse( results=results, statistics=stats, module_links=[{ 'id': link.id, 'type': link.link_type.value, 'module': link.target_module, 'url': link.target_url } for link in links], parent_meeting_suggestions=[{ 'doc_token': s.doc_token, 'reason': s.reason, 'urgency': s.urgency.value, 'grade': s.grade, 'topics': s.suggested_topics } for s in meeting_suggestions] ) @router.post("/sessions/{session_id}/link-to-module") async def create_module_link( session_id: str, link_type: str = Query(..., description="notenbuch, elternabend, zeugnis, calendar"), db: Session = Depends(get_db) ): """ Phase 4: Create a link to another module. Creates the actual connection to Notenbuch, Elternabend, etc. """ teacher_id = get_teacher_id() repo = KlausurRepository(db) linker = get_module_linker() session = repo.get_session(session_id, teacher_id) if not session: raise HTTPException(status_code=404, detail="Session not found") # Get documents documents = repo.list_documents(session_id, teacher_id) completed_docs = [d for d in documents if d.status == DocumentStatus.COMPLETED] # Build correction results correction_results = [ CorrectionResult( doc_token=doc.doc_token, score=float(doc.ai_score or 0), max_score=float(session.total_points), grade=doc.ai_grade or "", feedback=doc.ai_feedback or "" ) for doc in completed_docs ] result = None if link_type == "notenbuch": result = await linker.link_to_notenbuch( session_id=session_id, class_id=session.linked_school_class_id or "", subject=session.subject, results=correction_results, exam_name=session.name, exam_date=session.created_at.strftime("%Y-%m-%d") ) elif link_type == "elternabend": suggestions = linker.suggest_elternabend( results=correction_results, subject=session.subject ) result = await linker.create_elternabend_link( session_id=session_id, suggestions=suggestions, teacher_id=teacher_id ) elif link_type == "zeugnis": grades = {r.doc_token: r.grade for r in correction_results} result = await linker.update_zeugnis( class_id=session.linked_school_class_id or "", subject=session.subject, grades=grades ) elif link_type == "calendar": suggestions = linker.suggest_elternabend( results=correction_results, subject=session.subject ) events = await linker.create_calendar_events( teacher_id=teacher_id, meetings=suggestions ) result = type('obj', (object,), { 'success': len(events) > 0, 'message': f"{len(events)} Kalendereintraege erstellt" })() else: raise HTTPException(status_code=400, detail=f"Unknown link type: {link_type}") if result and result.success: # Store the link link = ModuleLink( klausur_session_id=session_id, link_type=ModuleLinkType(link_type), target_module=link_type, target_entity_id=getattr(result, 'link', {}).target_entity_id if hasattr(result, 'link') and result.link else "", target_url=getattr(result, 'target_url', None) ) db.add(link) db.commit() return { "success": result.success if result else False, "message": result.message if result else "Unknown error", "target_url": getattr(result, 'target_url', None) if result else None } @router.get("/school-data/bundeslaender") async def get_bundeslaender(): """Get list of German federal states.""" return {"bundeslaender": BUNDESLAENDER} @router.get("/school-data/schulformen") async def get_schulformen(): """Get list of school types.""" return {"schulformen": {k: v['name'] for k, v in SCHULFORMEN.items()}} @router.get("/school-data/faecher") async def get_faecher(): """Get list of subjects.""" return {"faecher": {k: v['name'] for k, v in FAECHER.items()}} # ============================================================================= # TrOCR HANDWRITING RECOGNITION ENDPOINTS # ============================================================================= class TrOCRExtractRequest(BaseModel): """Request for TrOCR text extraction.""" detect_lines: bool = Field(default=True, description="Detect and process text lines separately") class TrOCRTrainingRequest(BaseModel): """Request to add a training example.""" ground_truth: str = Field(..., min_length=1, description="Correct text for the image") class TrOCRFineTuneRequest(BaseModel): """Request to start fine-tuning.""" epochs: int = Field(default=3, ge=1, le=10) learning_rate: float = Field(default=5e-5, gt=0, lt=1) @router.post("/trocr/extract") async def trocr_extract( file: UploadFile = File(...), detect_lines: bool = Query(default=True), teacher_id: str = Query(default="teacher_1") ): """ Extract handwritten text from an image using TrOCR. This endpoint uses Microsoft's TrOCR model optimized for handwriting. Processing happens on Mac Mini TrOCR service - no cloud, only local network. Args: file: Image file (PNG, JPG) detect_lines: If True, detect individual text lines teacher_id: Teacher ID for logging Returns: Extracted text with confidence scores """ # Try remote TrOCR client first (Mac Mini) try: from .services.trocr_client import get_trocr_client client = get_trocr_client() if await client.is_available(): content = await file.read() result = await client.extract_text( content, filename=file.filename or "image.png", detect_lines=detect_lines ) return { "text": result.text, "confidence": result.confidence, "bounding_boxes": [], "processing_time_ms": result.processing_time_ms, "model": "trocr-base-handwritten", "device": result.device, "service": "mac-mini" } except Exception as e: logger.warning(f"Remote TrOCR client failed: {e}") # Fallback to local TrOCR service try: from .services.trocr_service import get_trocr_service service = get_trocr_service() content = await file.read() result = await service.extract_text(content, detect_lines=detect_lines) return { "text": result.text, "confidence": result.confidence, "bounding_boxes": result.bounding_boxes, "processing_time_ms": result.processing_time_ms, "model": service.model_name, "has_lora_adapter": service._lora_adapter is not None, "service": "local" } except ImportError as e: logger.error(f"TrOCR not available locally or remotely: {e}") raise HTTPException( status_code=503, detail="TrOCR not available. Mac Mini service unreachable and local dependencies missing." ) except Exception as e: logger.error(f"TrOCR extraction failed: {e}") raise HTTPException(status_code=500, detail=str(e)) @router.post("/trocr/batch-extract") async def trocr_batch_extract( files: List[UploadFile] = File(...), detect_lines: bool = Query(default=True), teacher_id: str = Query(default="teacher_1") ): """ Extract handwritten text from multiple images. Args: files: List of image files detect_lines: If True, detect individual text lines teacher_id: Teacher ID for logging Returns: List of extraction results """ try: from .services.trocr_service import get_trocr_service service = get_trocr_service() # Read all files images = [await f.read() for f in files] # Extract from all results = await service.batch_extract(images, detect_lines=detect_lines) return { "results": [ { "filename": files[i].filename, "text": r.text, "confidence": r.confidence, "processing_time_ms": r.processing_time_ms } for i, r in enumerate(results) ], "total_files": len(files), "model": service.model_name } except ImportError as e: raise HTTPException(status_code=503, detail=f"TrOCR not available: {e}") except Exception as e: logger.error(f"TrOCR batch extraction failed: {e}") raise HTTPException(status_code=500, detail=str(e)) @router.post("/trocr/training/add") async def trocr_add_training_example( file: UploadFile = File(...), ground_truth: str = Query(..., min_length=1), teacher_id: str = Query(default="teacher_1") ): """ Add a training example for TrOCR fine-tuning. When a teacher corrects OCR output, submit the correction here to improve future recognition accuracy. Args: file: Image file with handwritten text ground_truth: The correct text (teacher-corrected) teacher_id: Teacher ID (for tracking) Returns: Example ID """ try: from .services.trocr_service import get_trocr_service service = get_trocr_service() # Read file content = await file.read() # Add training example example_id = service.add_training_example( image_data=content, ground_truth=ground_truth, teacher_id=teacher_id ) info = service.get_model_info() return { "example_id": example_id, "ground_truth": ground_truth, "teacher_id": teacher_id, "total_examples": info["training_examples_count"], "message": "Training example added successfully" } except Exception as e: logger.error(f"Failed to add training example: {e}") raise HTTPException(status_code=500, detail=str(e)) @router.post("/trocr/training/fine-tune") async def trocr_fine_tune( request: TrOCRFineTuneRequest, teacher_id: str = Query(default=None) ): """ Start fine-tuning TrOCR with collected training examples. Uses LoRA for efficient fine-tuning. Requires at least 10 training examples. Args: request: Fine-tuning parameters teacher_id: If provided, only use examples from this teacher Returns: Training results """ try: from .services.trocr_service import get_trocr_service service = get_trocr_service() # Run fine-tuning result = await service.fine_tune( teacher_id=teacher_id, epochs=request.epochs, learning_rate=request.learning_rate ) return result except ImportError as e: raise HTTPException( status_code=503, detail=f"Fine-tuning dependencies not installed: {e}. Install with: pip install peft" ) except Exception as e: logger.error(f"Fine-tuning failed: {e}") raise HTTPException(status_code=500, detail=str(e)) @router.get("/trocr/training/examples") async def trocr_list_training_examples( teacher_id: str = Query(default=None) ): """ List training examples. Args: teacher_id: If provided, filter by teacher Returns: List of training examples """ try: from .services.trocr_service import get_trocr_service service = get_trocr_service() examples = service.get_training_examples(teacher_id) return { "examples": [ { "image_path": e.image_path, "ground_truth": e.ground_truth[:100] + "..." if len(e.ground_truth) > 100 else e.ground_truth, "teacher_id": e.teacher_id, "created_at": e.created_at } for e in examples ], "total": len(examples) } except Exception as e: logger.error(f"Failed to list training examples: {e}") raise HTTPException(status_code=500, detail=str(e)) @router.get("/trocr/status") async def trocr_status(): """ Get TrOCR model status and info. Returns: Model information including device, adapter status, etc. """ result = { "status": "unavailable", "services": {} } # Check Mac Mini TrOCR service try: from .services.trocr_client import get_trocr_client client = get_trocr_client() if await client.is_available(): remote_status = await client.get_status() result["services"]["mac_mini"] = { "status": "available", **remote_status } result["status"] = "available" result["primary_service"] = "mac_mini" except Exception as e: result["services"]["mac_mini"] = { "status": "error", "error": str(e) } # Check local TrOCR service try: from .services.trocr_service import get_trocr_service service = get_trocr_service() info = service.get_model_info() result["services"]["local"] = { "status": "available", **info } if result["status"] != "available": result["status"] = "available" result["primary_service"] = "local" except ImportError as e: result["services"]["local"] = { "status": "not_installed", "error": str(e) } except Exception as e: result["services"]["local"] = { "status": "error", "error": str(e) } return result