This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/backend/klausur/services/processing_service.py
Benjamin Admin bfdaf63ba9 fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

425 lines
14 KiB
Python

"""
Background Processing Service for Klausur Correction.
Orchestrates the complete correction pipeline:
1. Load documents from storage
2. Run OCR (Vision-LLM or TrOCR) for text extraction
3. Run AI correction for grading
4. Save results to database
PRIVACY BY DESIGN:
- Only pseudonymized doc_tokens used throughout
- No student names in processing pipeline
- All data stays on self-hosted infrastructure
"""
import asyncio
import logging
from datetime import datetime
from typing import Optional, List, Callable
from dataclasses import dataclass
from sqlalchemy.orm import Session
from ..db_models import (
ExamSession, PseudonymizedDocument,
SessionStatus, DocumentStatus
)
from ..repository import KlausurRepository
from .trocr_client import get_trocr_client, TrOCRClient
from .vision_ocr_service import get_vision_ocr_service, VisionOCRService
from .correction_service import (
get_correction_service, ExamCorrectionService,
QuestionRubric, CorrectionResult
)
from .storage_service import get_storage_service, KlausurStorageService
logger = logging.getLogger(__name__)
@dataclass
class ProcessingProgress:
    """
    Progress update for SSE streaming.

    Carries a snapshot of how far a session's processing has advanced so
    that a callback can forward it to a client.
    """
    session_id: str
    total_documents: int
    processed_documents: int
    current_document: Optional[str] = None
    current_step: str = "idle"  # ocr, correction, saving, complete
    error: Optional[str] = None

    @property
    def percentage(self) -> int:
        """
        Return the completed share as a whole number of percent.

        Returns:
            0 when there are no documents, otherwise
            floor(processed_documents * 100 / total_documents).
        """
        if self.total_documents <= 0:
            return 0
        # Integer arithmetic on purpose: int(29 / 100 * 100) evaluates to 28
        # because 0.29 * 100 is 28.999999999999996 in binary floating point.
        return self.processed_documents * 100 // self.total_documents
class ProcessingService:
    """
    Background service for exam correction processing.

    For every document of an exam session the service runs OCR
    (Vision-LLM or TrOCR), optionally runs the AI correction, and writes
    the outcome back through the SQLAlchemy session it was built with.

    Usage:
        service = ProcessingService(db_session)
        await service.process_session(session_id, teacher_id)
    """

    # File extensions probed in storage, in lookup order.
    _DOCUMENT_EXTENSIONS = ("png", "jpg", "jpeg", "pdf")

    def __init__(
        self,
        db: Session,
        trocr_client: Optional[TrOCRClient] = None,
        vision_ocr_service: Optional[VisionOCRService] = None,
        correction_service: Optional[ExamCorrectionService] = None,
        storage_service: Optional[KlausurStorageService] = None,
        prefer_vision_ocr: bool = True  # Vision-LLM as primary engine for handwriting
    ):
        """
        Wire up the collaborators used by the pipeline.

        Args:
            db: SQLAlchemy session used for all reads and commits.
            trocr_client: TrOCR client; falls back to get_trocr_client().
            vision_ocr_service: Vision OCR service; falls back to
                get_vision_ocr_service().
            correction_service: AI correction service; falls back to
                get_correction_service().
            storage_service: Document storage; falls back to
                get_storage_service().
            prefer_vision_ocr: Prefer the Vision-LLM over TrOCR when both
                are available.
        """
        self.db = db
        self.repo = KlausurRepository(db)
        self.trocr = trocr_client or get_trocr_client()
        self.vision_ocr = vision_ocr_service or get_vision_ocr_service()
        self.correction = correction_service or get_correction_service()
        self.storage = storage_service or get_storage_service()
        self.prefer_vision_ocr = prefer_vision_ocr
        # Progress callback for SSE streaming
        self._progress_callback: Optional[Callable[[ProcessingProgress], None]] = None

    def set_progress_callback(self, callback: Callable[[ProcessingProgress], None]):
        """Set callback for progress updates (SSE streaming)."""
        self._progress_callback = callback

    def _notify_progress(self, progress: ProcessingProgress):
        """Notify progress to callback if set; callback errors are only logged."""
        if self._progress_callback:
            try:
                self._progress_callback(progress)
            except Exception as e:
                # Best effort: a broken listener must not stop processing.
                logger.warning(f"Progress callback failed: {e}")

    async def process_session(
        self,
        session_id: str,
        teacher_id: str,
        use_ai_correction: bool = True
    ) -> bool:
        """
        Process all documents in a session.

        Args:
            session_id: Exam session ID
            teacher_id: Teacher ID for isolation
            use_ai_correction: Whether to run AI correction (requires LLM)

        Returns:
            False if the session is not found or has no documents.
            True once every document has been attempted; a document that
            raised is marked FAILED and reported through the progress
            callback, but does not change the return value.
        """
        # Get session
        session = self.repo.get_session(session_id, teacher_id)
        if not session:
            logger.error(f"Session not found: {session_id}")
            return False

        # Get documents
        documents = self.repo.list_documents(session_id, teacher_id)
        if not documents:
            logger.warning(f"No documents in session: {session_id}")
            return False

        total = len(documents)
        processed = 0
        logger.info(f"Starting processing for session {session_id}: {total} documents")

        # Check OCR service availability (Vision-LLM preferred for handwriting)
        vision_ocr_available = await self.vision_ocr.is_available()
        trocr_available = await self.trocr.is_available()
        ocr_available = bool(vision_ocr_available or trocr_available)

        if vision_ocr_available and self.prefer_vision_ocr:
            logger.info("Using Vision-LLM (llama3.2-vision) for OCR - optimal for handwriting")
            use_vision_ocr = True
        elif trocr_available:
            logger.info("Using TrOCR for OCR")
            use_vision_ocr = False
        elif vision_ocr_available:
            logger.info("TrOCR not available, falling back to Vision-LLM")
            use_vision_ocr = True
        else:
            logger.warning("No OCR service available - OCR will be skipped")
            use_vision_ocr = False

        # Process each document
        for doc in documents:
            progress = ProcessingProgress(
                session_id=session_id,
                total_documents=total,
                processed_documents=processed,
                current_document=doc.doc_token[:8],
                current_step="ocr"
            )
            self._notify_progress(progress)

            try:
                # Step 1: OCR extraction (Vision-LLM or TrOCR)
                if ocr_available and doc.status == DocumentStatus.UPLOADED:
                    await self._process_ocr(session_id, doc, teacher_id, use_vision_ocr=use_vision_ocr)

                # Step 2: AI correction
                progress.current_step = "correction"
                self._notify_progress(progress)

                if use_ai_correction and doc.ocr_text:
                    await self._process_correction(session, doc, teacher_id)
                else:
                    # Just mark as completed without AI
                    self._mark_document_completed(doc, teacher_id)

                processed += 1
            except Exception as e:
                # Per-document boundary: record the failure and continue
                # with the remaining documents.
                logger.exception(f"Failed to process document {doc.doc_token}: {e}")
                self._mark_document_failed(doc, str(e), teacher_id)
                # Surface the failure to listeners as well.
                progress.error = str(e)
                self._notify_progress(progress)

        # Update session status
        self.repo.update_session_status(session_id, teacher_id, SessionStatus.COMPLETED)

        # Final progress
        progress = ProcessingProgress(
            session_id=session_id,
            total_documents=total,
            processed_documents=processed,
            current_step="complete"
        )
        self._notify_progress(progress)

        logger.info(f"Completed processing session {session_id}: {processed}/{total} documents")
        return True

    def _load_document_image(self, session_id: str, doc: PseudonymizedDocument):
        """
        Fetch the stored file for a document.

        Probes the redacted variant first, then the original, trying each
        extension in _DOCUMENT_EXTENSIONS.

        Returns:
            A (data, extension) pair for the first hit, or (None, None)
            when storage returns nothing for every combination.
        """
        for is_redacted in (True, False):  # Prefer redacted version
            for ext in self._DOCUMENT_EXTENSIONS:
                data = self.storage.get_document(
                    session_id, doc.doc_token, ext, is_redacted=is_redacted
                )
                if data:
                    logger.debug(f"Found document: {doc.doc_token[:8]}.{ext} (redacted={is_redacted})")
                    return data, ext
        return None, None

    async def _process_ocr(
        self,
        session_id: str,
        doc: PseudonymizedDocument,
        teacher_id: str,
        use_vision_ocr: bool = True
    ):
        """
        Run OCR on a document.

        OCR errors do not propagate: the document receives a placeholder
        text with confidence 0 and still ends in OCR_COMPLETED.

        Args:
            session_id: Session ID
            doc: Document to process
            teacher_id: Teacher ID (not used by this method)
            use_vision_ocr: True to use Vision-LLM (llama3.2-vision), False for TrOCR
        """
        # Update status
        doc.status = DocumentStatus.OCR_PROCESSING
        doc.processing_started_at = datetime.utcnow()
        self.db.commit()

        # Try to get document from storage (check both redacted and original)
        image_data, file_ext = self._load_document_image(session_id, doc)
        if not image_data:
            logger.warning(f"No image found for document {doc.doc_token}")
            # Use placeholder OCR text for testing
            doc.ocr_text = "[Kein Bild gefunden - Manuelle Eingabe erforderlich]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED
            self.db.commit()
            return

        # Pass the extension that was actually found, so a JPEG or PDF is
        # not announced to the OCR service as a PNG.
        filename = f"{doc.doc_token}.{file_ext}"

        # Call OCR service (Vision-LLM or TrOCR)
        try:
            if use_vision_ocr:
                # Use Vision-LLM (llama3.2-vision) - better for handwriting
                result = await self.vision_ocr.extract_text(
                    image_data,
                    filename=filename,
                    is_handwriting=True  # Assume handwriting for exams
                )
                ocr_method = "Vision-LLM"
            else:
                # Use TrOCR
                result = await self.trocr.extract_text(
                    image_data,
                    filename=filename,
                    detect_lines=True
                )
                ocr_method = "TrOCR"

            doc.ocr_text = result.text
            # round() instead of int(): 0.29 * 100 is 28.999999999999996
            # and would be truncated to 28.
            doc.ocr_confidence = round(result.confidence * 100)
            doc.status = DocumentStatus.OCR_COMPLETED
            logger.info(
                f"OCR completed ({ocr_method}) for {doc.doc_token[:8]}: "
                f"{len(result.text)} chars, {result.confidence:.0%} confidence"
            )
        except Exception as e:
            logger.error(f"OCR failed for {doc.doc_token}: {e}")
            # NOTE(review): this placeholder is non-empty, so the caller
            # will still send it to AI correction - confirm that is intended.
            doc.ocr_text = f"[OCR Fehler: {str(e)[:100]}]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED  # Continue to AI anyway

        self.db.commit()

    @staticmethod
    def _increment_processed_count(session: ExamSession):
        """Add one to session.processed_count, treating None as 0."""
        session.processed_count = (session.processed_count or 0) + 1

    async def _process_correction(
        self,
        session: ExamSession,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """
        Run AI correction on a document.

        The document always ends in COMPLETED: without rubrics or after a
        correction error only a feedback text is stored.

        Args:
            session: Exam session supplying questions, rubric and subject
            doc: Document whose ocr_text is corrected
            teacher_id: Teacher ID (not used by this method)
        """
        doc.status = DocumentStatus.AI_PROCESSING
        self.db.commit()

        # Build rubrics from session questions
        rubrics = self._build_rubrics(session)
        if not rubrics:
            # No rubrics defined - nothing to score against
            doc.ai_feedback = "Keine Bewertungskriterien definiert. Manuelle Korrektur empfohlen."
            doc.ai_score = None
            doc.ai_grade = None
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            # Update session stats
            self._increment_processed_count(session)
            self.db.commit()
            return

        try:
            # Run AI correction
            result = await self.correction.correct_exam(
                doc_token=doc.doc_token,
                ocr_text=doc.ocr_text,
                rubrics=rubrics,
                subject=session.subject or "Allgemein"
            )

            # Save results
            doc.ai_feedback = result.overall_feedback
            doc.ai_score = result.total_score
            doc.ai_grade = result.grade
            doc.ai_details = {
                "max_score": result.max_score,
                "processing_time_ms": result.processing_time_ms,
                "questions": [
                    {
                        "number": q.question_number,
                        "points": q.points_awarded,
                        "max_points": q.max_points,
                        "feedback": q.feedback,
                        "strengths": q.strengths,
                        "improvements": q.improvements
                    }
                    for q in result.question_results
                ]
            }
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            logger.info(
                f"Correction completed for {doc.doc_token[:8]}: "
                f"{result.total_score}/{result.max_score} ({result.grade})"
            )
        except Exception as e:
            logger.error(f"AI correction failed for {doc.doc_token}: {e}")
            doc.ai_feedback = f"KI-Korrektur fehlgeschlagen: {str(e)[:200]}"
            doc.status = DocumentStatus.COMPLETED  # Mark complete anyway
            doc.processing_completed_at = datetime.utcnow()

        # Update session stats
        self._increment_processed_count(session)
        self.db.commit()

    def _build_rubrics(self, session: ExamSession) -> List[QuestionRubric]:
        """
        Build QuestionRubric list from session questions.

        Missing keys fall back to: the 1-based position as number,
        "Frage <position>" as text, 10 points, an empty expected answer
        and the session-level rubric (or "") as grading criteria.

        Returns:
            One rubric per question; an empty list when the session has
            no questions.
        """
        if not session.questions:
            return []
        return [
            QuestionRubric(
                question_number=q.get("number", position),
                question_text=q.get("text", f"Frage {position}"),
                max_points=q.get("points", 10),
                expected_answer=q.get("expected_answer", ""),
                grading_criteria=q.get("rubric", session.rubric or "")
            )
            for position, q in enumerate(session.questions, start=1)
        ]

    def _mark_document_completed(
        self,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """
        Mark document as completed without AI correction.

        Existing ai_feedback is kept; otherwise a default text is stored.
        teacher_id is not used by this method.
        """
        doc.status = DocumentStatus.COMPLETED
        doc.processing_completed_at = datetime.utcnow()
        if not doc.ai_feedback:
            doc.ai_feedback = "Verarbeitung abgeschlossen (ohne KI-Korrektur)"
        # Update session stats
        if doc.session:
            self._increment_processed_count(doc.session)
        self.db.commit()

    def _mark_document_failed(
        self,
        doc: PseudonymizedDocument,
        error: str,
        teacher_id: str
    ):
        """
        Mark document as failed.

        Args:
            doc: Document to flag
            error: Error text; stored truncated to 500 characters
            teacher_id: Teacher ID (not used by this method)
        """
        # If the failure came from a flush/commit, the session rejects
        # further work until it is rolled back. Uncommitted changes from
        # the failed step are discarded, which is intended.
        self.db.rollback()
        doc.status = DocumentStatus.FAILED
        doc.processing_error = error[:500]
        doc.processing_completed_at = datetime.utcnow()
        self.db.commit()
# Entry point intended for FastAPI background tasks
async def process_session_background(
    session_id: str,
    teacher_id: str,
    db_url: str
):
    """
    Process one exam session with a DB session owned by this call.

    A fresh session is obtained from SessionLocal, handed to a new
    ProcessingService, and closed again whether or not processing raises.

    Args:
        session_id: Exam session ID
        teacher_id: Teacher ID for isolation
        db_url: Accepted for the caller's signature; not read here
    """
    from ..database import SessionLocal

    background_db = SessionLocal()
    try:
        await ProcessingService(background_db).process_session(session_id, teacher_id)
    finally:
        background_db.close()
# Module-level slot for a shared service instance; nothing in this module assigns it.
_processing_service: Optional[ProcessingService] = None


def get_processing_service(db: Session) -> ProcessingService:
    """
    Build a ProcessingService bound to the given DB session.

    A new instance is created on every call.
    """
    service = ProcessingService(db)
    return service