# Restored from pre-rebase ref 98933f5e.
#
# A previous `git pull --rebase origin main` dropped 177 local commits,
# losing 3400+ files across admin-v2, backend, studio-v2, website,
# klausur-service, and many other services. The partial restore attempt
# (660295e2) only recovered some files. The restoring commit brought back
# all missing files while preserving post-rebase additions (night-scheduler,
# night-mode UI, NightModeWidget dashboard integration).
#
# Restored features include:
# - AI Module Sidebar (FAB), OCR Labeling, OCR Compare
# - GPU Dashboard, RAG Pipeline, Magic Help
# - Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
# - Companion, Zeugnisse-Crawler, Screen Flow
# - Full backend, studio-v2, website, klausur-service
# - All compliance SDKs, agent-core, voice-service
# - CI/CD configs, documentation, scripts
"""
|
|
Background Processing Service for Klausur Correction.
|
|
|
|
Orchestrates the complete correction pipeline:
|
|
1. Load documents from storage
|
|
2. Run TrOCR for text extraction
|
|
3. Run AI correction for grading
|
|
4. Save results to database
|
|
|
|
PRIVACY BY DESIGN:
|
|
- Only pseudonymized doc_tokens used throughout
|
|
- No student names in processing pipeline
|
|
- All data stays on self-hosted infrastructure
|
|
"""
|
|
import asyncio
|
|
import logging
|
|
from datetime import datetime
|
|
from typing import Optional, List, Callable
|
|
from dataclasses import dataclass
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
from ..db_models import (
|
|
ExamSession, PseudonymizedDocument,
|
|
SessionStatus, DocumentStatus
|
|
)
|
|
from ..repository import KlausurRepository
|
|
from .trocr_client import get_trocr_client, TrOCRClient
|
|
from .vision_ocr_service import get_vision_ocr_service, VisionOCRService
|
|
from .correction_service import (
|
|
get_correction_service, ExamCorrectionService,
|
|
QuestionRubric, CorrectionResult
|
|
)
|
|
from .storage_service import get_storage_service, KlausurStorageService
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ProcessingProgress:
    """Snapshot of pipeline progress, pushed to SSE subscribers.

    Fields:
        session_id: Exam session being processed.
        total_documents: Number of documents in the session.
        processed_documents: Documents finished so far.
        current_document: Short token of the document in flight, if any.
        current_step: Pipeline stage label ("idle", "ocr", "correction",
            "saving", "complete").
        error: Error description when something went wrong.
    """
    session_id: str
    total_documents: int
    processed_documents: int
    current_document: Optional[str] = None
    current_step: str = "idle"  # one of: ocr, correction, saving
    error: Optional[str] = None

    @property
    def percentage(self) -> int:
        """Completion as a truncated integer percent; 0 when the session is empty."""
        total = self.total_documents
        return int(self.processed_documents / total * 100) if total else 0


class ProcessingService:
    """
    Background service for exam correction processing.

    Drives each document of an exam session through OCR (Vision-LLM or
    TrOCR) and optional AI correction, persisting status transitions and
    results through the shared SQLAlchemy session. Progress can be
    streamed to a caller-supplied callback (used for SSE).

    Usage:
        service = ProcessingService(db_session)
        await service.process_session(session_id, teacher_id)
    """

    def __init__(
        self,
        db: Session,
        trocr_client: Optional[TrOCRClient] = None,
        vision_ocr_service: Optional[VisionOCRService] = None,
        correction_service: Optional[ExamCorrectionService] = None,
        storage_service: Optional[KlausurStorageService] = None,
        prefer_vision_ocr: bool = True  # Vision-LLM as primary OCR for handwriting
    ):
        # Each dependency may be injected for testing; otherwise the
        # module-level factory/singleton accessors are used.
        self.db = db
        self.repo = KlausurRepository(db)
        self.trocr = trocr_client or get_trocr_client()
        self.vision_ocr = vision_ocr_service or get_vision_ocr_service()
        self.correction = correction_service or get_correction_service()
        self.storage = storage_service or get_storage_service()
        self.prefer_vision_ocr = prefer_vision_ocr

        # Progress callback for SSE streaming (set via set_progress_callback).
        self._progress_callback: Optional[Callable[[ProcessingProgress], None]] = None

    def set_progress_callback(self, callback: Callable[[ProcessingProgress], None]):
        """Set callback for progress updates (SSE streaming)."""
        self._progress_callback = callback

    def _notify_progress(self, progress: ProcessingProgress):
        """Notify progress to callback if set.

        Callback failures are logged and swallowed so a broken SSE
        subscriber cannot abort document processing.
        """
        if self._progress_callback:
            try:
                self._progress_callback(progress)
            except Exception as e:
                logger.warning(f"Progress callback failed: {e}")

    async def process_session(
        self,
        session_id: str,
        teacher_id: str,
        use_ai_correction: bool = True
    ) -> bool:
        """
        Process all documents in a session.

        Args:
            session_id: Exam session ID
            teacher_id: Teacher ID for isolation
            use_ai_correction: Whether to run AI correction (requires LLM)

        Returns:
            True if processing completed successfully
        """
        # Get session (teacher-scoped lookup enforces tenant isolation).
        session = self.repo.get_session(session_id, teacher_id)
        if not session:
            logger.error(f"Session not found: {session_id}")
            return False

        # Get documents
        documents = self.repo.list_documents(session_id, teacher_id)
        if not documents:
            logger.warning(f"No documents in session: {session_id}")
            return False

        total = len(documents)
        processed = 0

        logger.info(f"Starting processing for session {session_id}: {total} documents")

        # Check OCR service availability (Vision-LLM preferred for handwriting)
        vision_ocr_available = await self.vision_ocr.is_available()
        trocr_available = await self.trocr.is_available()

        # Decide once, up front, which OCR backend to use for the whole
        # session: Vision-LLM when preferred and reachable, else TrOCR,
        # else Vision-LLM as fallback, else skip OCR entirely.
        if vision_ocr_available and self.prefer_vision_ocr:
            logger.info("Using Vision-LLM (llama3.2-vision) for OCR - optimal for handwriting")
            use_vision_ocr = True
        elif trocr_available:
            logger.info("Using TrOCR for OCR")
            use_vision_ocr = False
        elif vision_ocr_available:
            logger.info("TrOCR not available, falling back to Vision-LLM")
            use_vision_ocr = True
        else:
            logger.warning("No OCR service available - OCR will be skipped")
            use_vision_ocr = False
            trocr_available = False

        # Process each document independently; one failure does not stop
        # the remaining documents.
        for doc in documents:
            progress = ProcessingProgress(
                session_id=session_id,
                total_documents=total,
                processed_documents=processed,
                # Only a short token prefix is exposed (privacy by design).
                current_document=doc.doc_token[:8],
                current_step="ocr"
            )
            self._notify_progress(progress)

            try:
                # Step 1: OCR extraction (Vision-LLM or TrOCR) — only for
                # documents still in the UPLOADED state (idempotent reruns).
                if (vision_ocr_available or trocr_available) and doc.status == DocumentStatus.UPLOADED:
                    await self._process_ocr(session_id, doc, teacher_id, use_vision_ocr=use_vision_ocr)

                # Step 2: AI correction
                progress.current_step = "correction"
                self._notify_progress(progress)

                if use_ai_correction and doc.ocr_text:
                    await self._process_correction(session, doc, teacher_id)
                else:
                    # Just mark as completed without AI
                    self._mark_document_completed(doc, teacher_id)

                processed += 1

            except Exception as e:
                logger.error(f"Failed to process document {doc.doc_token}: {e}")
                self._mark_document_failed(doc, str(e), teacher_id)

        # Update session status.
        # NOTE(review): the session is marked COMPLETED even when some or
        # all documents failed — confirm this is the intended semantics.
        self.repo.update_session_status(session_id, teacher_id, SessionStatus.COMPLETED)

        # Final progress
        progress = ProcessingProgress(
            session_id=session_id,
            total_documents=total,
            processed_documents=processed,
            current_step="complete"
        )
        self._notify_progress(progress)

        logger.info(f"Completed processing session {session_id}: {processed}/{total} documents")
        return True

    async def _process_ocr(
        self,
        session_id: str,
        doc: PseudonymizedDocument,
        teacher_id: str,
        use_vision_ocr: bool = True
    ):
        """
        Run OCR on a document.

        Args:
            session_id: Session ID
            doc: Document to process
            teacher_id: Teacher ID
            use_vision_ocr: True to use Vision-LLM (llama3.2-vision), False for TrOCR
        """
        # Update status before the (potentially slow) OCR call so other
        # readers see the document as in-flight.
        doc.status = DocumentStatus.OCR_PROCESSING
        # NOTE(review): utcnow() returns a naive datetime — confirm the DB
        # columns expect naive UTC.
        doc.processing_started_at = datetime.utcnow()
        self.db.commit()

        # Try to get document from storage (check both redacted and original)
        image_data = None
        for is_redacted in [True, False]:  # Prefer redacted version
            for ext in ["png", "jpg", "jpeg", "pdf"]:
                image_data = self.storage.get_document(
                    session_id, doc.doc_token, ext, is_redacted=is_redacted
                )
                if image_data:
                    logger.debug(f"Found document: {doc.doc_token[:8]}.{ext} (redacted={is_redacted})")
                    break
            if image_data:
                break

        if not image_data:
            logger.warning(f"No image found for document {doc.doc_token}")
            # Use placeholder OCR text for testing; the document is still
            # advanced to OCR_COMPLETED so the pipeline can proceed.
            doc.ocr_text = "[Kein Bild gefunden - Manuelle Eingabe erforderlich]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED
            self.db.commit()
            return

        # Call OCR service (Vision-LLM or TrOCR)
        try:
            if use_vision_ocr:
                # Use Vision-LLM (llama3.2-vision) - better for handwriting
                result = await self.vision_ocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    is_handwriting=True  # Assume handwriting for exams
                )
                ocr_method = "Vision-LLM"
            else:
                # Use TrOCR
                result = await self.trocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    detect_lines=True
                )
                ocr_method = "TrOCR"

            doc.ocr_text = result.text
            # Confidence is stored as an integer percentage (0-100).
            doc.ocr_confidence = int(result.confidence * 100)
            doc.status = DocumentStatus.OCR_COMPLETED

            logger.info(
                f"OCR completed ({ocr_method}) for {doc.doc_token[:8]}: "
                f"{len(result.text)} chars, {result.confidence:.0%} confidence"
            )

        except Exception as e:
            logger.error(f"OCR failed for {doc.doc_token}: {e}")
            # Record the failure in the OCR text itself and keep going —
            # the teacher can still enter/correct text manually.
            doc.ocr_text = f"[OCR Fehler: {str(e)[:100]}]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED  # Continue to AI anyway

        self.db.commit()

    async def _process_correction(
        self,
        session: ExamSession,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Run AI correction on a document.

        Builds rubrics from the session's question definitions, calls the
        correction service, and persists score/grade/feedback. Failures
        are recorded in the feedback field; the document is always moved
        to COMPLETED and counted in the session's processed_count.
        """
        doc.status = DocumentStatus.AI_PROCESSING
        self.db.commit()

        # Build rubrics from session questions
        rubrics = self._build_rubrics(session)

        if not rubrics:
            # No rubrics defined - use simple scoring
            doc.ai_feedback = "Keine Bewertungskriterien definiert. Manuelle Korrektur empfohlen."
            doc.ai_score = None
            doc.ai_grade = None
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            self.db.commit()

            # Update session stats
            session.processed_count += 1
            self.db.commit()
            return

        try:
            # Run AI correction
            result = await self.correction.correct_exam(
                doc_token=doc.doc_token,
                ocr_text=doc.ocr_text,
                rubrics=rubrics,
                subject=session.subject or "Allgemein"
            )

            # Save results (per-question breakdown goes into the JSON
            # details column for the review UI).
            doc.ai_feedback = result.overall_feedback
            doc.ai_score = result.total_score
            doc.ai_grade = result.grade
            doc.ai_details = {
                "max_score": result.max_score,
                "processing_time_ms": result.processing_time_ms,
                "questions": [
                    {
                        "number": q.question_number,
                        "points": q.points_awarded,
                        "max_points": q.max_points,
                        "feedback": q.feedback,
                        "strengths": q.strengths,
                        "improvements": q.improvements
                    }
                    for q in result.question_results
                ]
            }
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()

            logger.info(
                f"Correction completed for {doc.doc_token[:8]}: "
                f"{result.total_score}/{result.max_score} ({result.grade})"
            )

        except Exception as e:
            logger.error(f"AI correction failed for {doc.doc_token}: {e}")
            doc.ai_feedback = f"KI-Korrektur fehlgeschlagen: {str(e)[:200]}"
            doc.status = DocumentStatus.COMPLETED  # Mark complete anyway

            doc.processing_completed_at = datetime.utcnow()

        # Update session stats
        session.processed_count += 1
        self.db.commit()

    def _build_rubrics(self, session: ExamSession) -> List[QuestionRubric]:
        """Build QuestionRubric list from session questions.

        Each question dict may carry "number", "text", "points",
        "expected_answer" and "rubric" keys; sensible defaults are used
        for missing keys, with the session-level rubric as fallback
        grading criteria. Returns an empty list when the session has no
        questions.
        """
        rubrics = []

        if not session.questions:
            return rubrics

        for i, q in enumerate(session.questions):
            rubric = QuestionRubric(
                question_number=q.get("number", i + 1),
                question_text=q.get("text", f"Frage {i + 1}"),
                max_points=q.get("points", 10),
                expected_answer=q.get("expected_answer", ""),
                grading_criteria=q.get("rubric", session.rubric or "")
            )
            rubrics.append(rubric)

        return rubrics

    def _mark_document_completed(
        self,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Mark document as completed without AI correction."""
        doc.status = DocumentStatus.COMPLETED
        doc.processing_completed_at = datetime.utcnow()
        # Preserve any feedback already present (e.g. from a prior run).
        if not doc.ai_feedback:
            doc.ai_feedback = "Verarbeitung abgeschlossen (ohne KI-Korrektur)"
        self.db.commit()

        # Update session stats via the document's relationship, when loaded.
        if doc.session:
            doc.session.processed_count += 1
            self.db.commit()

    def _mark_document_failed(
        self,
        doc: PseudonymizedDocument,
        error: str,
        teacher_id: str
    ):
        """Mark document as failed, truncating the error to the column size."""
        doc.status = DocumentStatus.FAILED
        doc.processing_error = error[:500]
        doc.processing_completed_at = datetime.utcnow()
        self.db.commit()


# Background task function for FastAPI
async def process_session_background(
    session_id: str,
    teacher_id: str,
    db_url: str
):
    """
    Background task for session processing.

    This function creates its own DB session for use in background tasks.

    Args:
        session_id: Exam session to process.
        teacher_id: Teacher ID for tenant isolation.
        db_url: Currently unused — the session comes from the module-level
            SessionLocal factory instead. NOTE(review): either wire this
            up or document why callers must still pass it.
    """
    from ..database import SessionLocal

    db = SessionLocal()
    try:
        service = ProcessingService(db)
        await service.process_session(session_id, teacher_id)
    finally:
        # Always release the connection, even when processing raises.
        db.close()


# NOTE(review): despite the "singleton" intent, this holder is never
# assigned or read in this module — get_processing_service() constructs
# a fresh instance per call, since each request carries its own DB session.
_processing_service: Optional[ProcessingService] = None


def get_processing_service(db: Session) -> ProcessingService:
    """Return a ProcessingService bound to the given DB session (new instance per call)."""
    return ProcessingService(db)