"""
Background Processing Service for Klausur Correction.

Orchestrates the complete correction pipeline:
1. Load documents from storage
2. Run OCR (Vision-LLM or TrOCR) for text extraction
3. Run AI correction for grading
4. Save results to database

PRIVACY BY DESIGN:
- Only pseudonymized doc_tokens used throughout
- No student names in processing pipeline
- All data stays on self-hosted infrastructure
"""

import asyncio
import logging
from datetime import datetime
from typing import Optional, List, Callable
from dataclasses import dataclass

from sqlalchemy.orm import Session

from ..db_models import (
    ExamSession, PseudonymizedDocument,
    SessionStatus, DocumentStatus
)
from ..repository import KlausurRepository
from .trocr_client import get_trocr_client, TrOCRClient
from .vision_ocr_service import get_vision_ocr_service, VisionOCRService
from .correction_service import (
    get_correction_service, ExamCorrectionService,
    QuestionRubric, CorrectionResult
)
from .storage_service import get_storage_service, KlausurStorageService

logger = logging.getLogger(__name__)


@dataclass
class ProcessingProgress:
    """Progress update for SSE streaming."""

    session_id: str
    total_documents: int
    processed_documents: int
    current_document: Optional[str] = None
    current_step: str = "idle"  # ocr, correction, saving
    error: Optional[str] = None

    @property
    def percentage(self) -> int:
        """Return the completed share as a whole percentage (rounded down).

        Returns 0 when there are no documents. Integer arithmetic is used on
        purpose: ``int(29 / 100 * 100)`` evaluates to 28 because of binary
        floating-point rounding, whereas ``29 * 100 // 100`` is exactly 29.
        """
        if self.total_documents <= 0:
            return 0
        return self.processed_documents * 100 // self.total_documents


class ProcessingService:
    """
    Background service for exam correction processing.

    Usage:
        service = ProcessingService(db_session)
        await service.process_session(session_id, teacher_id)
    """

    def __init__(
        self,
        db: Session,
        trocr_client: Optional[TrOCRClient] = None,
        vision_ocr_service: Optional[VisionOCRService] = None,
        correction_service: Optional[ExamCorrectionService] = None,
        storage_service: Optional[KlausurStorageService] = None,
        prefer_vision_ocr: bool = True  # Vision-LLM as primary engine for handwriting
    ):
        """
        Wire up the service with its collaborators.

        Args:
            db: SQLAlchemy session used for all reads and commits.
            trocr_client: TrOCR client; falls back to get_trocr_client().
            vision_ocr_service: Vision OCR service; falls back to
                get_vision_ocr_service().
            correction_service: AI correction service; falls back to
                get_correction_service().
            storage_service: Document storage; falls back to
                get_storage_service().
            prefer_vision_ocr: Prefer the Vision-LLM over TrOCR when both
                are available.
        """
        self.db = db
        self.repo = KlausurRepository(db)
        self.trocr = trocr_client or get_trocr_client()
        self.vision_ocr = vision_ocr_service or get_vision_ocr_service()
        self.correction = correction_service or get_correction_service()
        self.storage = storage_service or get_storage_service()
        self.prefer_vision_ocr = prefer_vision_ocr

        # Progress callback for SSE streaming
        self._progress_callback: Optional[Callable[[ProcessingProgress], None]] = None

    def set_progress_callback(self, callback: Callable[[ProcessingProgress], None]):
        """Set callback for progress updates (SSE streaming)."""
        self._progress_callback = callback

    def _notify_progress(self, progress: ProcessingProgress):
        """Notify progress to callback if set.

        Exceptions raised by the callback are logged and swallowed so that a
        broken listener cannot abort document processing.
        """
        if self._progress_callback:
            try:
                self._progress_callback(progress)
            except Exception as e:
                logger.warning(f"Progress callback failed: {e}")

    async def process_session(
        self,
        session_id: str,
        teacher_id: str,
        use_ai_correction: bool = True
    ) -> bool:
        """
        Process all documents in a session.

        Args:
            session_id: Exam session ID
            teacher_id: Teacher ID for isolation
            use_ai_correction: Whether to run AI correction (requires LLM)

        Returns:
            False if the session does not exist or contains no documents.
            True once every document has been attempted; individual document
            failures are recorded on the document and do not change the
            return value.
        """
        # Get session
        session = self.repo.get_session(session_id, teacher_id)
        if not session:
            logger.error(f"Session not found: {session_id}")
            return False

        # Get documents
        documents = self.repo.list_documents(session_id, teacher_id)
        if not documents:
            logger.warning(f"No documents in session: {session_id}")
            return False

        total = len(documents)
        processed = 0

        logger.info(f"Starting processing for session {session_id}: {total} documents")

        # Check OCR service availability (Vision-LLM preferred for handwriting)
        vision_ocr_available = await self.vision_ocr.is_available()
        trocr_available = await self.trocr.is_available()

        if vision_ocr_available and self.prefer_vision_ocr:
            logger.info("Using Vision-LLM (llama3.2-vision) for OCR - optimal for handwriting")
            use_vision_ocr = True
        elif trocr_available:
            logger.info("Using TrOCR for OCR")
            use_vision_ocr = False
        elif vision_ocr_available:
            logger.info("TrOCR not available, falling back to Vision-LLM")
            use_vision_ocr = True
        else:
            logger.warning("No OCR service available - OCR will be skipped")
            use_vision_ocr = False
            trocr_available = False

        # Process each document
        for doc in documents:
            progress = ProcessingProgress(
                session_id=session_id,
                total_documents=total,
                processed_documents=processed,
                current_document=doc.doc_token[:8],
                current_step="ocr"
            )
            self._notify_progress(progress)

            try:
                # Step 1: OCR extraction (Vision-LLM or TrOCR).
                # Only freshly uploaded documents are OCR'd.
                if (vision_ocr_available or trocr_available) and doc.status == DocumentStatus.UPLOADED:
                    await self._process_ocr(session_id, doc, teacher_id, use_vision_ocr=use_vision_ocr)

                # Step 2: AI correction
                progress.current_step = "correction"
                self._notify_progress(progress)

                if use_ai_correction and doc.ocr_text:
                    await self._process_correction(session, doc, teacher_id)
                else:
                    # Just mark as completed without AI
                    self._mark_document_completed(doc, teacher_id)

                processed += 1

            except Exception as e:
                logger.error(f"Failed to process document {doc.doc_token}: {e}")
                # A failed flush/commit leaves the SQLAlchemy session unusable
                # until it is rolled back; without this the commit inside
                # _mark_document_failed would raise again and abort the loop.
                self.db.rollback()
                self._mark_document_failed(doc, str(e), teacher_id)

        # Update session status
        self.repo.update_session_status(session_id, teacher_id, SessionStatus.COMPLETED)

        # Final progress
        progress = ProcessingProgress(
            session_id=session_id,
            total_documents=total,
            processed_documents=processed,
            current_step="complete"
        )
        self._notify_progress(progress)

        logger.info(f"Completed processing session {session_id}: {processed}/{total} documents")
        return True

    async def _process_ocr(
        self,
        session_id: str,
        doc: PseudonymizedDocument,
        teacher_id: str,
        use_vision_ocr: bool = True
    ):
        """
        Run OCR on a document.

        The document is looked up in storage (redacted version first) under
        the extensions png, jpg, jpeg and pdf. If nothing is found, or if the
        OCR call raises, a placeholder text with confidence 0 is stored and
        the document is still marked OCR_COMPLETED so the pipeline continues.

        Args:
            session_id: Session ID
            doc: Document to process
            teacher_id: Teacher ID
            use_vision_ocr: True to use Vision-LLM (llama3.2-vision), False for TrOCR
        """
        # Update status
        doc.status = DocumentStatus.OCR_PROCESSING
        doc.processing_started_at = datetime.utcnow()
        self.db.commit()

        # Try to get document from storage (check both redacted and original)
        image_data = None
        found_ext = None
        for is_redacted in (True, False):  # Prefer redacted version
            for ext in ("png", "jpg", "jpeg", "pdf"):
                image_data = self.storage.get_document(
                    session_id, doc.doc_token, ext, is_redacted=is_redacted
                )
                if image_data:
                    found_ext = ext
                    logger.debug(f"Found document: {doc.doc_token[:8]}.{ext} (redacted={is_redacted})")
                    break
            if image_data:
                break

        if not image_data:
            logger.warning(f"No image found for document {doc.doc_token}")
            # Use placeholder OCR text for testing
            doc.ocr_text = "[Kein Bild gefunden - Manuelle Eingabe erforderlich]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED
            self.db.commit()
            return

        # Pass the extension the file was actually stored under instead of a
        # hard-coded ".png", so the filename matches the payload.
        filename = f"{doc.doc_token}.{found_ext}"

        # Call OCR service (Vision-LLM or TrOCR)
        try:
            if use_vision_ocr:
                # Use Vision-LLM (llama3.2-vision) - better for handwriting
                result = await self.vision_ocr.extract_text(
                    image_data,
                    filename=filename,
                    is_handwriting=True  # Assume handwriting for exams
                )
                ocr_method = "Vision-LLM"
            else:
                # Use TrOCR
                result = await self.trocr.extract_text(
                    image_data,
                    filename=filename,
                    detect_lines=True
                )
                ocr_method = "TrOCR"

            doc.ocr_text = result.text
            # Round rather than truncate: int(0.57 * 100) would store 56.
            doc.ocr_confidence = int(round(result.confidence * 100))
            doc.status = DocumentStatus.OCR_COMPLETED

            logger.info(
                f"OCR completed ({ocr_method}) for {doc.doc_token[:8]}: "
                f"{len(result.text)} chars, {result.confidence:.0%} confidence"
            )

        except Exception as e:
            logger.error(f"OCR failed for {doc.doc_token}: {e}")
            doc.ocr_text = f"[OCR Fehler: {str(e)[:100]}]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED  # Continue to AI anyway

        self.db.commit()

    async def _process_correction(
        self,
        session: ExamSession,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Run AI correction on a document.

        Without rubrics the document is completed with a note recommending
        manual correction. If the correction service raises, the error is
        stored in ai_feedback and the document is still marked COMPLETED.
        In every case the session's processed_count is incremented.
        """
        doc.status = DocumentStatus.AI_PROCESSING
        self.db.commit()

        # Build rubrics from session questions
        rubrics = self._build_rubrics(session)

        if not rubrics:
            # No rubrics defined - use simple scoring
            doc.ai_feedback = "Keine Bewertungskriterien definiert. Manuelle Korrektur empfohlen."
            doc.ai_score = None
            doc.ai_grade = None
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            self.db.commit()

            # Update session stats
            session.processed_count += 1
            self.db.commit()
            return

        try:
            # Run AI correction
            result = await self.correction.correct_exam(
                doc_token=doc.doc_token,
                ocr_text=doc.ocr_text,
                rubrics=rubrics,
                subject=session.subject or "Allgemein"
            )

            # Save results
            doc.ai_feedback = result.overall_feedback
            doc.ai_score = result.total_score
            doc.ai_grade = result.grade
            doc.ai_details = {
                "max_score": result.max_score,
                "processing_time_ms": result.processing_time_ms,
                "questions": [
                    {
                        "number": q.question_number,
                        "points": q.points_awarded,
                        "max_points": q.max_points,
                        "feedback": q.feedback,
                        "strengths": q.strengths,
                        "improvements": q.improvements
                    }
                    for q in result.question_results
                ]
            }
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()

            logger.info(
                f"Correction completed for {doc.doc_token[:8]}: "
                f"{result.total_score}/{result.max_score} ({result.grade})"
            )

        except Exception as e:
            logger.error(f"AI correction failed for {doc.doc_token}: {e}")
            doc.ai_feedback = f"KI-Korrektur fehlgeschlagen: {str(e)[:200]}"
            doc.status = DocumentStatus.COMPLETED  # Mark complete anyway
            doc.processing_completed_at = datetime.utcnow()

        # Update session stats
        session.processed_count += 1
        self.db.commit()

    def _build_rubrics(self, session: ExamSession) -> List[QuestionRubric]:
        """Build QuestionRubric list from session questions.

        Missing keys fall back to defaults: the 1-based position as question
        number, "Frage <n>" as text, 10 points, an empty expected answer and
        the session-level rubric as grading criteria. Returns an empty list
        when the session has no questions.
        """
        rubrics = []

        if not session.questions:
            return rubrics

        for i, q in enumerate(session.questions):
            rubric = QuestionRubric(
                question_number=q.get("number", i + 1),
                question_text=q.get("text", f"Frage {i + 1}"),
                max_points=q.get("points", 10),
                expected_answer=q.get("expected_answer", ""),
                grading_criteria=q.get("rubric", session.rubric or "")
            )
            rubrics.append(rubric)

        return rubrics

    def _mark_document_completed(
        self,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Mark document as completed without AI correction.

        Existing ai_feedback is preserved; a default note is only written
        when none is present. The owning session's processed_count is
        incremented when the document has a session attached.
        """
        doc.status = DocumentStatus.COMPLETED
        doc.processing_completed_at = datetime.utcnow()
        if not doc.ai_feedback:
            doc.ai_feedback = "Verarbeitung abgeschlossen (ohne KI-Korrektur)"
        self.db.commit()

        # Update session stats
        if doc.session:
            doc.session.processed_count += 1
            self.db.commit()

    def _mark_document_failed(
        self,
        doc: PseudonymizedDocument,
        error: str,
        teacher_id: str
    ):
        """Mark document as failed, storing at most 500 characters of the error."""
        doc.status = DocumentStatus.FAILED
        doc.processing_error = error[:500]
        doc.processing_completed_at = datetime.utcnow()
        self.db.commit()


# Background task function for FastAPI
async def process_session_background(
    session_id: str,
    teacher_id: str,
    db_url: str
):
    """
    Background task for session processing.

    This function creates its own DB session for use in background tasks
    and always closes it afterwards.

    Args:
        session_id: Exam session ID
        teacher_id: Teacher ID for isolation
        db_url: Currently unused; the session comes from SessionLocal.
    """
    from ..database import SessionLocal

    db = SessionLocal()
    try:
        service = ProcessingService(db)
        await service.process_session(session_id, teacher_id)
    finally:
        db.close()


# Singleton for main service
# NOTE(review): never assigned in this module; get_processing_service builds
# a new instance per call.
_processing_service: Optional[ProcessingService] = None


def get_processing_service(db: Session) -> ProcessingService:
    """Get processing service instance (a new one bound to *db* on each call)."""
    return ProcessingService(db)