fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
424
backend/klausur/services/processing_service.py
Normal file
424
backend/klausur/services/processing_service.py
Normal file
@@ -0,0 +1,424 @@
|
||||
"""
|
||||
Background Processing Service for Klausur Correction.
|
||||
|
||||
Orchestrates the complete correction pipeline:
|
||||
1. Load documents from storage
|
||||
2. Run TrOCR for text extraction
|
||||
3. Run AI correction for grading
|
||||
4. Save results to database
|
||||
|
||||
PRIVACY BY DESIGN:
|
||||
- Only pseudonymized doc_tokens used throughout
|
||||
- No student names in processing pipeline
|
||||
- All data stays on self-hosted infrastructure
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Callable
|
||||
from dataclasses import dataclass
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ..db_models import (
|
||||
ExamSession, PseudonymizedDocument,
|
||||
SessionStatus, DocumentStatus
|
||||
)
|
||||
from ..repository import KlausurRepository
|
||||
from .trocr_client import get_trocr_client, TrOCRClient
|
||||
from .vision_ocr_service import get_vision_ocr_service, VisionOCRService
|
||||
from .correction_service import (
|
||||
get_correction_service, ExamCorrectionService,
|
||||
QuestionRubric, CorrectionResult
|
||||
)
|
||||
from .storage_service import get_storage_service, KlausurStorageService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ProcessingProgress:
    """Snapshot of pipeline progress, streamed to clients over SSE."""
    session_id: str
    total_documents: int
    processed_documents: int
    current_document: Optional[str] = None
    current_step: str = "idle"  # ocr, correction, saving
    error: Optional[str] = None

    @property
    def percentage(self) -> int:
        """Completed fraction as a whole-number percent; 0 for an empty session."""
        done, total = self.processed_documents, self.total_documents
        # Same truncating arithmetic as int(done / total * 100).
        return int(done / total * 100) if total else 0
|
||||
|
||||
|
||||
class ProcessingService:
    """
    Background service for exam correction processing.

    Pipeline per document: OCR extraction (Vision-LLM preferred for
    handwriting, TrOCR as fallback) -> optional AI correction -> result
    persistence via KlausurRepository. Progress snapshots are pushed to an
    optional callback for SSE streaming.

    PRIVACY: only pseudonymized doc_tokens flow through this service; no
    student names appear anywhere in the pipeline.

    Usage:
        service = ProcessingService(db_session)
        await service.process_session(session_id, teacher_id)
    """

    def __init__(
        self,
        db: Session,
        trocr_client: Optional[TrOCRClient] = None,
        vision_ocr_service: Optional[VisionOCRService] = None,
        correction_service: Optional[ExamCorrectionService] = None,
        storage_service: Optional[KlausurStorageService] = None,
        prefer_vision_ocr: bool = True  # Vision-LLM as primary engine for handwriting
    ):
        """
        Args:
            db: Active SQLAlchemy session; this service commits on it directly.
            trocr_client: TrOCR client override (default: module singleton).
            vision_ocr_service: Vision-OCR override (default: module singleton).
            correction_service: AI correction override (default: module singleton).
            storage_service: Storage override (default: module singleton).
            prefer_vision_ocr: Prefer Vision-LLM over TrOCR when both are available.
        """
        self.db = db
        self.repo = KlausurRepository(db)
        self.trocr = trocr_client or get_trocr_client()
        self.vision_ocr = vision_ocr_service or get_vision_ocr_service()
        self.correction = correction_service or get_correction_service()
        self.storage = storage_service or get_storage_service()
        self.prefer_vision_ocr = prefer_vision_ocr

        # Progress callback for SSE streaming; None until a consumer registers.
        self._progress_callback: Optional[Callable[[ProcessingProgress], None]] = None

    def set_progress_callback(self, callback: Callable[[ProcessingProgress], None]):
        """Set callback for progress updates (SSE streaming)."""
        self._progress_callback = callback

    def _notify_progress(self, progress: ProcessingProgress):
        """Forward a progress snapshot to the registered callback, if any.

        A failing callback (e.g. a disconnected SSE consumer) is logged and
        swallowed so it can never abort document processing.
        """
        if self._progress_callback:
            try:
                self._progress_callback(progress)
            except Exception as e:
                logger.warning(f"Progress callback failed: {e}")

    async def process_session(
        self,
        session_id: str,
        teacher_id: str,
        use_ai_correction: bool = True
    ) -> bool:
        """
        Process all documents in a session.

        Args:
            session_id: Exam session ID
            teacher_id: Teacher ID for isolation
            use_ai_correction: Whether to run AI correction (requires LLM)

        Returns:
            True if processing completed successfully; False when the session
            does not exist (for this teacher) or contains no documents.
        """
        # Guard: session must exist and belong to this teacher.
        session = self.repo.get_session(session_id, teacher_id)
        if not session:
            logger.error(f"Session not found: {session_id}")
            return False

        # Guard: nothing to do without documents.
        documents = self.repo.list_documents(session_id, teacher_id)
        if not documents:
            logger.warning(f"No documents in session: {session_id}")
            return False

        total = len(documents)
        processed = 0

        logger.info(f"Starting processing for session {session_id}: {total} documents")

        # Probe OCR backends once up front (Vision-LLM preferred for handwriting).
        vision_ocr_available = await self.vision_ocr.is_available()
        trocr_available = await self.trocr.is_available()

        if vision_ocr_available and self.prefer_vision_ocr:
            logger.info("Using Vision-LLM (llama3.2-vision) for OCR - optimal for handwriting")
            use_vision_ocr = True
        elif trocr_available:
            logger.info("Using TrOCR for OCR")
            use_vision_ocr = False
        elif vision_ocr_available:
            logger.info("TrOCR not available, falling back to Vision-LLM")
            use_vision_ocr = True
        else:
            # Neither backend reachable: the OCR step is skipped per-document
            # below because both availability flags are False here.
            logger.warning("No OCR service available - OCR will be skipped")
            use_vision_ocr = False

        # Process each document independently; one failure must not stop the rest.
        for doc in documents:
            progress = ProcessingProgress(
                session_id=session_id,
                total_documents=total,
                processed_documents=processed,
                current_document=doc.doc_token[:8],
                current_step="ocr"
            )
            self._notify_progress(progress)

            try:
                # Step 1: OCR extraction (Vision-LLM or TrOCR), only for
                # freshly uploaded documents.
                if (vision_ocr_available or trocr_available) and doc.status == DocumentStatus.UPLOADED:
                    await self._process_ocr(session_id, doc, teacher_id, use_vision_ocr=use_vision_ocr)

                # Step 2: AI correction (requires OCR text to grade).
                progress.current_step = "correction"
                self._notify_progress(progress)

                if use_ai_correction and doc.ocr_text:
                    await self._process_correction(session, doc, teacher_id)
                else:
                    # No AI pass: just mark the document as done.
                    self._mark_document_completed(doc, teacher_id)

                processed += 1

            except Exception as e:
                logger.error(f"Failed to process document {doc.doc_token}: {e}")
                self._mark_document_failed(doc, str(e), teacher_id)

        # NOTE(review): the session is marked COMPLETED even when every
        # document failed (processed == 0) — confirm whether a FAILED session
        # status is desired in that case before changing.
        self.repo.update_session_status(session_id, teacher_id, SessionStatus.COMPLETED)

        # Final progress snapshot.
        progress = ProcessingProgress(
            session_id=session_id,
            total_documents=total,
            processed_documents=processed,
            current_step="complete"
        )
        self._notify_progress(progress)

        logger.info(f"Completed processing session {session_id}: {processed}/{total} documents")
        return True

    async def _process_ocr(
        self,
        session_id: str,
        doc: PseudonymizedDocument,
        teacher_id: str,
        use_vision_ocr: bool = True
    ):
        """
        Run OCR on a document.

        Args:
            session_id: Session ID
            doc: Document to process
            teacher_id: Teacher ID
            use_vision_ocr: True to use Vision-LLM (llama3.2-vision), False for TrOCR

        Never raises on OCR failure: a placeholder text with confidence 0 is
        stored instead so the pipeline can continue to the correction step.
        """
        doc.status = DocumentStatus.OCR_PROCESSING
        # Naive UTC timestamp, consistent with the rest of this module
        # (datetime.utcnow is deprecated in 3.12+ — migrate all call sites together).
        doc.processing_started_at = datetime.utcnow()
        self.db.commit()

        # Locate the image in storage: prefer the redacted variant, then try
        # every supported extension.
        image_data = None
        for is_redacted in [True, False]:
            for ext in ["png", "jpg", "jpeg", "pdf"]:
                image_data = self.storage.get_document(
                    session_id, doc.doc_token, ext, is_redacted=is_redacted
                )
                if image_data:
                    logger.debug(f"Found document: {doc.doc_token[:8]}.{ext} (redacted={is_redacted})")
                    break
            if image_data:
                break

        if not image_data:
            logger.warning(f"No image found for document {doc.doc_token}")
            # Placeholder so downstream steps have something to show.
            doc.ocr_text = "[Kein Bild gefunden - Manuelle Eingabe erforderlich]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED
            self.db.commit()
            return

        # Call the selected OCR backend.
        try:
            if use_vision_ocr:
                # Vision-LLM (llama3.2-vision) - better for handwriting.
                result = await self.vision_ocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    is_handwriting=True  # exams are assumed handwritten
                )
                ocr_method = "Vision-LLM"
            else:
                result = await self.trocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    detect_lines=True
                )
                ocr_method = "TrOCR"

            doc.ocr_text = result.text
            # Confidence is stored as an integer percentage (0-100).
            doc.ocr_confidence = int(result.confidence * 100)
            doc.status = DocumentStatus.OCR_COMPLETED

            logger.info(
                f"OCR completed ({ocr_method}) for {doc.doc_token[:8]}: "
                f"{len(result.text)} chars, {result.confidence:.0%} confidence"
            )

        except Exception as e:
            logger.error(f"OCR failed for {doc.doc_token}: {e}")
            doc.ocr_text = f"[OCR Fehler: {str(e)[:100]}]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED  # continue to AI anyway

        self.db.commit()

    async def _process_correction(
        self,
        session: ExamSession,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Run AI correction on a document.

        Without rubrics the document is completed with a manual-review hint;
        a failing AI call stores the error as feedback but still completes the
        document so the session can finish.
        """
        doc.status = DocumentStatus.AI_PROCESSING
        self.db.commit()

        # Build rubrics from the session's question definitions.
        rubrics = self._build_rubrics(session)

        if not rubrics:
            # No grading criteria defined - recommend manual correction.
            doc.ai_feedback = "Keine Bewertungskriterien definiert. Manuelle Korrektur empfohlen."
            doc.ai_score = None
            doc.ai_grade = None
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            self.db.commit()

            # Update session stats.
            session.processed_count += 1
            self.db.commit()
            return

        try:
            result = await self.correction.correct_exam(
                doc_token=doc.doc_token,
                ocr_text=doc.ocr_text,
                rubrics=rubrics,
                subject=session.subject or "Allgemein"
            )

            # Persist grading results, including per-question details.
            doc.ai_feedback = result.overall_feedback
            doc.ai_score = result.total_score
            doc.ai_grade = result.grade
            doc.ai_details = {
                "max_score": result.max_score,
                "processing_time_ms": result.processing_time_ms,
                "questions": [
                    {
                        "number": q.question_number,
                        "points": q.points_awarded,
                        "max_points": q.max_points,
                        "feedback": q.feedback,
                        "strengths": q.strengths,
                        "improvements": q.improvements
                    }
                    for q in result.question_results
                ]
            }
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()

            logger.info(
                f"Correction completed for {doc.doc_token[:8]}: "
                f"{result.total_score}/{result.max_score} ({result.grade})"
            )

        except Exception as e:
            logger.error(f"AI correction failed for {doc.doc_token}: {e}")
            doc.ai_feedback = f"KI-Korrektur fehlgeschlagen: {str(e)[:200]}"
            doc.status = DocumentStatus.COMPLETED  # mark complete anyway
            doc.processing_completed_at = datetime.utcnow()

        # Update session stats.
        session.processed_count += 1
        self.db.commit()

    def _build_rubrics(self, session: ExamSession) -> List[QuestionRubric]:
        """Build the QuestionRubric list from the session's question dicts.

        Missing per-question fields fall back to sensible defaults
        (sequential numbering, 10 points, session-level rubric text).
        Returns an empty list when the session has no questions.
        """
        return [
            QuestionRubric(
                question_number=q.get("number", i + 1),
                question_text=q.get("text", f"Frage {i + 1}"),
                max_points=q.get("points", 10),
                expected_answer=q.get("expected_answer", ""),
                grading_criteria=q.get("rubric", session.rubric or "")
            )
            for i, q in enumerate(session.questions or [])
        ]

    def _mark_document_completed(
        self,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """Mark document as completed without AI correction."""
        doc.status = DocumentStatus.COMPLETED
        doc.processing_completed_at = datetime.utcnow()
        if not doc.ai_feedback:
            doc.ai_feedback = "Verarbeitung abgeschlossen (ohne KI-Korrektur)"
        self.db.commit()

        # Update session stats (only when the relationship is loaded).
        if doc.session:
            doc.session.processed_count += 1
            self.db.commit()

    def _mark_document_failed(
        self,
        doc: PseudonymizedDocument,
        error: str,
        teacher_id: str
    ):
        """Mark document as failed, storing a truncated error message."""
        doc.status = DocumentStatus.FAILED
        doc.processing_error = error[:500]  # fit the DB column
        doc.processing_completed_at = datetime.utcnow()
        self.db.commit()
|
||||
|
||||
|
||||
# Background task function for FastAPI
|
||||
async def process_session_background(
    session_id: str,
    teacher_id: str,
    db_url: str
):
    """
    Background task for session processing.

    Opens a dedicated DB session (background tasks must not reuse the
    request-scoped one) and guarantees it is closed when processing ends.
    """
    from ..database import SessionLocal

    db = SessionLocal()
    try:
        await ProcessingService(db).process_session(session_id, teacher_id)
    finally:
        db.close()
|
||||
|
||||
|
||||
# Singleton for main service
# NOTE(review): this slot is never read or written by get_processing_service()
# below — it looks vestigial; confirm no other module touches it before removal.
_processing_service: Optional[ProcessingService] = None


def get_processing_service(db: Session) -> ProcessingService:
    """Get processing service instance.

    Returns a fresh ProcessingService bound to the given DB session —
    one instance per call, despite the unused singleton slot above.
    """
    return ProcessingService(db)
|
||||
Reference in New Issue
Block a user