# breakpilot-pwa/backend/klausur/services/processing_service.py

"""
Background Processing Service for Klausur Correction.

Orchestrates the complete correction pipeline:
1. Load documents from storage
2. Run OCR (Vision-LLM or TrOCR) for text extraction
3. Run AI correction for grading
4. Save results to database

PRIVACY BY DESIGN:
- Only pseudonymized doc_tokens used throughout
- No student names in processing pipeline
- All data stays on self-hosted infrastructure
"""
import asyncio
import logging
from datetime import datetime
from typing import Optional, List, Callable
from dataclasses import dataclass

from sqlalchemy.orm import Session

from ..db_models import (
    ExamSession, PseudonymizedDocument,
    SessionStatus, DocumentStatus
)
from ..repository import KlausurRepository
from .trocr_client import get_trocr_client, TrOCRClient
from .vision_ocr_service import get_vision_ocr_service, VisionOCRService
from .correction_service import (
    get_correction_service, ExamCorrectionService,
    QuestionRubric, CorrectionResult
)
from .storage_service import get_storage_service, KlausurStorageService

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


@dataclass
class ProcessingProgress:
    """
    Progress snapshot handed to the progress callback (SSE streaming).

    Attributes are plain values so that a consumer can serialize a snapshot
    without touching the database.
    """
    session_id: str
    total_documents: int
    processed_documents: int
    # Identifier of the document currently being worked on, if any.
    current_document: Optional[str] = None
    current_step: str = "idle"  # ocr, correction, saving
    # Optional error description for the consumer.
    error: Optional[str] = None

    @property
    def percentage(self) -> int:
        """
        Return the share of processed documents as a whole percent.

        Returns:
            0 when there are no documents at all; otherwise
            ``processed_documents * 100 / total_documents`` rounded down.
        """
        if self.total_documents == 0:
            return 0
        # Pure integer arithmetic: the float round-trip
        # ``int(processed / total * 100)`` loses a point for inputs such as
        # 29/100 (28.999999999999996 -> 28).
        return self.processed_documents * 100 // self.total_documents


class ProcessingService:
    """
    Background service for exam correction processing.

    Usage:
        service = ProcessingService(db_session)
        await service.process_session(session_id, teacher_id)
    """

    def __init__(
        self,
        db: Session,
        trocr_client: Optional[TrOCRClient] = None,
        vision_ocr_service: Optional[VisionOCRService] = None,
        correction_service: Optional[ExamCorrectionService] = None,
        storage_service: Optional[KlausurStorageService] = None,
        prefer_vision_ocr: bool = True  # Vision-LLM as primary engine for handwriting
    ):
        """
        Wire up the database handle and the collaborating services.

        Args:
            db: Database session; also wrapped in a KlausurRepository.
            trocr_client: TrOCR client; get_trocr_client() is used when falsy.
            vision_ocr_service: Vision OCR service; get_vision_ocr_service()
                is used when falsy.
            correction_service: Correction service; get_correction_service()
                is used when falsy.
            storage_service: Storage service; get_storage_service() is used
                when falsy.
            prefer_vision_ocr: Whether Vision-LLM is chosen over TrOCR when
                both are available.
        """
        # No progress listener until one is registered (SSE streaming).
        self._progress_callback: Optional[Callable[[ProcessingProgress], None]] = None
        self.prefer_vision_ocr = prefer_vision_ocr

        self.db = db
        self.repo = KlausurRepository(db)

        # Injected collaborators win; anything falsy falls back to the factory.
        self.trocr = trocr_client if trocr_client else get_trocr_client()
        self.vision_ocr = vision_ocr_service if vision_ocr_service else get_vision_ocr_service()
        self.correction = correction_service if correction_service else get_correction_service()
        self.storage = storage_service if storage_service else get_storage_service()

    def set_progress_callback(self, callback: Callable[[ProcessingProgress], None]):
        """
        Set callback for progress updates (SSE streaming).

        Args:
            callback: Callable invoked with a ProcessingProgress snapshot;
                replaces any callback stored earlier.
        """
        self._progress_callback = callback

    def _notify_progress(self, progress: ProcessingProgress):
        """
        Forward a progress snapshot to the registered callback, if any.

        Exceptions raised by the callback are logged as warnings and not
        propagated, so a broken listener cannot interrupt processing.
        """
        listener = self._progress_callback
        if not listener:
            return

        try:
            listener(progress)
        except Exception as exc:
            logger.warning(f"Progress callback failed: {exc}")

    async def process_session(
        self,
        session_id: str,
        teacher_id: str,
        use_ai_correction: bool = True
    ) -> bool:
        """
        Run OCR and, optionally, AI correction for every document of a session.

        Documents are handled one after another. When a document raises, the
        error is logged, the document is marked as failed and the loop moves
        on to the next document.

        Args:
            session_id: Exam session ID
            teacher_id: Teacher ID for isolation
            use_ai_correction: Whether to run AI correction (requires LLM)

        Returns:
            False when the session does not exist or contains no documents.
            True once the loop has finished and the session status was set to
            COMPLETED, even if individual documents failed.
        """
        session = self.repo.get_session(session_id, teacher_id)
        if not session:
            logger.error(f"Session not found: {session_id}")
            return False

        documents = self.repo.list_documents(session_id, teacher_id)
        if not documents:
            logger.warning(f"No documents in session: {session_id}")
            return False

        total = len(documents)
        logger.info(f"Starting processing for session {session_id}: {total} documents")

        # Probe both OCR backends up front (Vision-LLM first, then TrOCR).
        vision_ready = await self.vision_ocr.is_available()
        trocr_ready = await self.trocr.is_available()
        ocr_ready = vision_ready or trocr_ready

        # Pick the engine: Vision-LLM when preferred or when it is the only
        # one reachable, TrOCR otherwise.
        if not ocr_ready:
            logger.warning("No OCR service available - OCR will be skipped")
            use_vision_ocr = False
        elif vision_ready and (self.prefer_vision_ocr or not trocr_ready):
            use_vision_ocr = True
            if self.prefer_vision_ocr:
                logger.info("Using Vision-LLM (llama3.2-vision) for OCR - optimal for handwriting")
            else:
                logger.info("TrOCR not available, falling back to Vision-LLM")
        else:
            logger.info("Using TrOCR for OCR")
            use_vision_ocr = False

        done = 0
        for document in documents:
            update = ProcessingProgress(
                session_id=session_id,
                total_documents=total,
                processed_documents=done,
                current_document=document.doc_token[:8],
                current_step="ocr"
            )
            self._notify_progress(update)

            try:
                # Step 1: OCR, only for documents still in the UPLOADED state.
                if ocr_ready and document.status == DocumentStatus.UPLOADED:
                    await self._process_ocr(
                        session_id, document, teacher_id, use_vision_ocr=use_vision_ocr
                    )

                # Step 2: AI correction, or plain completion without it.
                update.current_step = "correction"
                self._notify_progress(update)

                if not (use_ai_correction and document.ocr_text):
                    self._mark_document_completed(document, teacher_id)
                else:
                    await self._process_correction(session, document, teacher_id)

                done += 1

            except Exception as exc:
                logger.error(f"Failed to process document {document.doc_token}: {exc}")
                self._mark_document_failed(document, str(exc), teacher_id)

        self.repo.update_session_status(session_id, teacher_id, SessionStatus.COMPLETED)

        # Final snapshot without a current document.
        self._notify_progress(
            ProcessingProgress(
                session_id=session_id,
                total_documents=total,
                processed_documents=done,
                current_step="complete"
            )
        )

        logger.info(f"Completed processing session {session_id}: {done}/{total} documents")
        return True

    async def _process_ocr(
        self,
        session_id: str,
        doc: PseudonymizedDocument,
        teacher_id: str,
        use_vision_ocr: bool = True
    ):
        """
        Run OCR on a document and store text and confidence on it.

        The document is looked up in storage, redacted variant first, trying
        the extensions png, jpg, jpeg and pdf in that order. Whatever happens
        after the lookup, the document ends in OCR_COMPLETED: with the
        recognized text on success, or with a bracketed placeholder text and
        confidence 0 when no file is found or the OCR call raises.

        Args:
            session_id: Session ID
            doc: Document to process
            teacher_id: Teacher ID (not used by this method)
            use_vision_ocr: True to use Vision-LLM (llama3.2-vision), False for TrOCR
        """
        # Update status
        doc.status = DocumentStatus.OCR_PROCESSING
        doc.processing_started_at = datetime.utcnow()
        self.db.commit()

        # Try to get document from storage (check both redacted and original)
        image_data = None
        for is_redacted in (True, False):  # Prefer redacted version
            for ext in ("png", "jpg", "jpeg", "pdf"):
                image_data = self.storage.get_document(
                    session_id, doc.doc_token, ext, is_redacted=is_redacted
                )
                if image_data:
                    logger.debug(f"Found document: {doc.doc_token[:8]}.{ext} (redacted={is_redacted})")
                    break
            if image_data:
                break

        if not image_data:
            logger.warning(f"No image found for document {doc.doc_token}")
            # Placeholder text so later steps still see a non-empty ocr_text.
            doc.ocr_text = "[Kein Bild gefunden - Manuelle Eingabe erforderlich]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED
            self.db.commit()
            return

        # Call OCR service (Vision-LLM or TrOCR)
        # NOTE(review): the filename passed on is always "<token>.png", even
        # when the stored file was found under another extension - confirm
        # the OCR services do not rely on the filename suffix.
        try:
            if use_vision_ocr:
                # Use Vision-LLM (llama3.2-vision) - better for handwriting
                result = await self.vision_ocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    is_handwriting=True  # Assume handwriting for exams
                )
                ocr_method = "Vision-LLM"
            else:
                # Use TrOCR
                result = await self.trocr.extract_text(
                    image_data,
                    filename=f"{doc.doc_token}.png",
                    detect_lines=True
                )
                ocr_method = "TrOCR"

            doc.ocr_text = result.text
            # Round instead of truncating: int(0.57 * 100) would store 56
            # while the log line below reports 57%.
            doc.ocr_confidence = int(round(result.confidence * 100))
            doc.status = DocumentStatus.OCR_COMPLETED

            logger.info(
                f"OCR completed ({ocr_method}) for {doc.doc_token[:8]}: "
                f"{len(result.text)} chars, {result.confidence:.0%} confidence"
            )

        except Exception as e:
            logger.error(f"OCR failed for {doc.doc_token}: {e}")
            doc.ocr_text = f"[OCR Fehler: {str(e)[:100]}]"
            doc.ocr_confidence = 0
            doc.status = DocumentStatus.OCR_COMPLETED  # Continue to AI anyway

        self.db.commit()

    async def _process_correction(
        self,
        session: ExamSession,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """
        Grade one document with the AI correction service.

        The document always ends in COMPLETED: with score, grade and details
        on success, with a hint to correct manually when the session defines
        no rubrics, or with an error note in ai_feedback when the correction
        call raises. In every case the session's processed_count is raised
        by one.

        Args:
            session: Exam session providing questions, rubric and subject
            doc: Document to correct
            teacher_id: Teacher ID (not used by this method)
        """
        doc.status = DocumentStatus.AI_PROCESSING
        self.db.commit()

        rubrics = self._build_rubrics(session)

        if rubrics:
            try:
                outcome = await self.correction.correct_exam(
                    doc_token=doc.doc_token,
                    ocr_text=doc.ocr_text,
                    rubrics=rubrics,
                    subject=session.subject or "Allgemein"
                )

                doc.ai_feedback = outcome.overall_feedback
                doc.ai_score = outcome.total_score
                doc.ai_grade = outcome.grade

                # Per-question breakdown stored alongside the totals.
                details = {
                    "max_score": outcome.max_score,
                    "processing_time_ms": outcome.processing_time_ms,
                }
                per_question = []
                for item in outcome.question_results:
                    per_question.append({
                        "number": item.question_number,
                        "points": item.points_awarded,
                        "max_points": item.max_points,
                        "feedback": item.feedback,
                        "strengths": item.strengths,
                        "improvements": item.improvements
                    })
                details["questions"] = per_question
                doc.ai_details = details

                doc.status = DocumentStatus.COMPLETED
                doc.processing_completed_at = datetime.utcnow()

                logger.info(
                    f"Correction completed for {doc.doc_token[:8]}: "
                    f"{outcome.total_score}/{outcome.max_score} ({outcome.grade})"
                )

            except Exception as exc:
                logger.error(f"AI correction failed for {doc.doc_token}: {exc}")
                doc.ai_feedback = f"KI-Korrektur fehlgeschlagen: {str(exc)[:200]}"
                # The document is still closed out so the session can finish.
                doc.status = DocumentStatus.COMPLETED
                doc.processing_completed_at = datetime.utcnow()
        else:
            # Nothing to grade against: leave score and grade empty.
            doc.ai_feedback = "Keine Bewertungskriterien definiert. Manuelle Korrektur empfohlen."
            doc.ai_score = None
            doc.ai_grade = None
            doc.status = DocumentStatus.COMPLETED
            doc.processing_completed_at = datetime.utcnow()
            self.db.commit()

        # Session statistics are updated on every path.
        session.processed_count += 1
        self.db.commit()

    def _build_rubrics(self, session: ExamSession) -> List[QuestionRubric]:
        """
        Translate the session's question entries into QuestionRubric objects.

        Missing keys fall back to defaults: the 1-based position as number,
        "Frage <position>" as text, 10 points, an empty expected answer and
        the session-level rubric (or "") as grading criteria.

        Returns:
            One rubric per question; an empty list when the session has no
            questions.
        """
        questions = session.questions
        if not questions:
            return []

        return [
            QuestionRubric(
                question_number=question.get("number", position),
                question_text=question.get("text", f"Frage {position}"),
                max_points=question.get("points", 10),
                expected_answer=question.get("expected_answer", ""),
                grading_criteria=question.get("rubric", session.rubric or "")
            )
            for position, question in enumerate(questions, start=1)
        ]

    def _mark_document_completed(
        self,
        doc: PseudonymizedDocument,
        teacher_id: str
    ):
        """
        Close out a document that did not go through AI correction.

        Sets status and completion time, fills in a default feedback text
        when none is present, and raises the owning session's processed_count
        when the document has a session attached.

        Args:
            doc: Document to mark as completed
            teacher_id: Teacher ID (not used by this method)
        """
        doc.processing_completed_at = datetime.utcnow()
        doc.status = DocumentStatus.COMPLETED
        if not doc.ai_feedback:
            doc.ai_feedback = "Verarbeitung abgeschlossen (ohne KI-Korrektur)"
        self.db.commit()

        owning_session = doc.session
        if not owning_session:
            return

        # Session statistics are committed separately from the document.
        owning_session.processed_count += 1
        self.db.commit()

    def _mark_document_failed(
        self,
        doc: PseudonymizedDocument,
        error: str,
        teacher_id: str
    ):
        """
        Mark document as failed and persist the error text.

        Args:
            doc: Document to mark as failed
            error: Error description; only the first 500 characters are stored
            teacher_id: Teacher ID (not used by this method)
        """
        # This runs from an exception handler. If the failure came from a
        # flush/commit, the session refuses further work until it is rolled
        # back, and the commit below would raise as well. Rolling back first
        # also drops half-applied changes of the failed step.
        self.db.rollback()

        doc.status = DocumentStatus.FAILED
        doc.processing_error = error[:500]
        doc.processing_completed_at = datetime.utcnow()
        self.db.commit()


# Background task function for FastAPI
async def process_session_background(
    session_id: str,
    teacher_id: str,
    db_url: str
):
    """
    Process a session from a background task with a dedicated DB session.

    A fresh database session is opened via SessionLocal and closed again
    when processing ends, whether it succeeded or raised.

    Args:
        session_id: Exam session ID
        teacher_id: Teacher ID for isolation
        db_url: Accepted for the caller's convenience; not used in this body
    """
    from ..database import SessionLocal

    own_db = SessionLocal()
    try:
        await ProcessingService(own_db).process_session(session_id, teacher_id)
    finally:
        own_db.close()


# Module-level slot for a shared service instance.
# NOTE(review): nothing in the visible part of this file reads or assigns it;
# get_processing_service() builds a new instance per call - confirm whether
# this variable is still needed.
_processing_service: Optional[ProcessingService] = None


def get_processing_service(db: Session) -> ProcessingService:
    """
    Build a ProcessingService bound to the given database session.

    A new instance is created on every call.
    """
    return ProcessingService(db=db)