This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/backend/klausur/services/correction_service.py
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

380 lines
12 KiB
Python

"""
Exam Correction Service using Self-Hosted LLM.
PRIVACY BY DESIGN:
- Only pseudonymized text (doc_token + OCR content) is sent to LLM
- No student names or personal data in prompts
- All processing happens on self-hosted infrastructure (SysEleven)
- No data sent to external APIs (unless explicitly configured)
This service generates AI-assisted corrections and feedback for exam answers.
"""
import json
import logging
import re
import time
from dataclasses import dataclass
from typing import List, Optional

from llm_gateway.config import get_config
from llm_gateway.models.chat import ChatCompletionRequest, ChatMessage
from llm_gateway.services.inference import get_inference_service, InferenceResult
logger = logging.getLogger(__name__)
@dataclass
class QuestionRubric:
    """Rubric for a single exam question.

    One instance per question; passed to the LLM prompt verbatim, so the
    text fields must already be free of personal data.
    """
    question_number: int  # 1-based question index on the exam sheet
    question_text: str  # the question as posed to the student
    max_points: int  # maximum points achievable for this question
    expected_answer: str  # reference answer the grader compares against
    grading_criteria: str  # free-text criteria the LLM is told to apply
@dataclass
class QuestionResult:
    """AI correction result for a single question."""
    question_number: int  # mirrors QuestionRubric.question_number
    points_awarded: int  # points given by the LLM (capped at max_points)
    max_points: int  # copied from the rubric for convenient reporting
    feedback: str  # short per-question justification from the LLM
    strengths: List[str]  # things the answer did well
    improvements: List[str]  # concrete suggestions for the student
@dataclass
class CorrectionResult:
    """Complete correction result for an exam."""
    doc_token: str  # pseudonymized identifier -- never a student name
    total_score: int  # sum of points_awarded across all questions
    max_score: int  # sum of max_points across all questions
    grade: str  # German grade string derived via calculate_grade()
    overall_feedback: str  # motivating whole-exam summary text
    question_results: List[QuestionResult]  # per-question details
    processing_time_ms: int  # wall-clock duration of the correction run
# German grading scale (can be customized).
# Ordered best-to-worst; the first threshold the percentage meets or
# exceeds determines the grade.
GERMAN_GRADES = [
    (95, "1+"),  # sehr gut plus
    (90, "1"),   # sehr gut
    (85, "1-"),  # sehr gut minus
    (80, "2+"),  # gut plus
    (75, "2"),   # gut
    (70, "2-"),  # gut minus
    (65, "3+"),  # befriedigend plus
    (60, "3"),   # befriedigend
    (55, "3-"),  # befriedigend minus
    (50, "4+"),  # ausreichend plus
    (45, "4"),   # ausreichend
    (40, "4-"),  # ausreichend minus
    (33, "5+"),  # mangelhaft plus
    (27, "5"),   # mangelhaft
    (20, "5-"),  # mangelhaft minus
    (0, "6"),    # ungenuegend
]


def calculate_grade(percentage: float) -> str:
    """Map a percentage score onto the German grade scale.

    Returns the grade of the first (i.e. highest) threshold that
    ``percentage`` meets or exceeds; "6" only as a defensive fallback
    for inputs below every threshold (negative percentages).
    """
    return next(
        (grade for threshold, grade in GERMAN_GRADES if percentage >= threshold),
        "6",
    )
class ExamCorrectionService:
    """
    Service for AI-assisted exam correction.

    PRIVACY GUARANTEES:
    1. Prompts contain NO personal data
    2. Only doc_token is used as reference
    3. Processing on self-hosted LLM
    4. Results stored with pseudonymized identifiers
    """

    # System prompt for exam correction (German). Instructs the model to
    # grade only the factual content and to reply with a bare JSON object.
    CORRECTION_SYSTEM_PROMPT = """Du bist ein erfahrener Lehrer und korrigierst Schuelerantworten.
WICHTIGE REGELN:
1. Bewerte NUR den fachlichen Inhalt der Antwort
2. Ignoriere Rechtschreibfehler (ausser bei Deutschklausuren)
3. Gib konstruktives, ermutigendes Feedback
4. Beziehe dich auf die Bewertungskriterien
5. Sei fair und konsistent
AUSGABEFORMAT (JSON):
{
"points": <Punktzahl>,
"feedback": "<Kurze Begruendung der Bewertung>",
"strengths": ["<Staerke 1>", "<Staerke 2>"],
"improvements": ["<Verbesserungsvorschlag 1>"]
}
Antworte NUR mit dem JSON-Objekt, ohne weitere Erklaerungen."""

    # Prompt template for the whole-exam summary (plain text, no JSON).
    OVERALL_FEEDBACK_PROMPT = """Basierend auf den einzelnen Bewertungen, erstelle eine Gesamtrueckmeldung.
Einzelbewertungen:
{question_results}
Gesamtpunktzahl: {total_score}/{max_score} ({percentage}%)
Note: {grade}
Erstelle eine motivierende Gesamtrueckmeldung (2-3 Saetze), die:
1. Die Staerken hervorhebt
2. Konstruktive Verbesserungsvorschlaege macht
3. Ermutigt und motiviert
Antworte nur mit dem Feedback-Text, ohne JSON-Formatierung."""

    def __init__(self, model: Optional[str] = None):
        """
        Initialize the correction service.

        Args:
            model: LLM model to use (default: qwen2.5:14b from config)

        PRIVACY:
            The model runs locally via Ollama on self-hosted hardware.
            No data is sent to external servers.
        """
        config = get_config()
        # Use configured correction model (default: qwen2.5:14b)
        self.model = model or config.correction_model
        self.inference = get_inference_service()
        # Lazy %-style args avoid formatting when the level is disabled.
        logger.info("Correction service initialized with model: %s", self.model)

    @staticmethod
    def _clamp_points(raw: object, max_points: int) -> int:
        """Coerce an LLM-provided score into an int within [0, max_points].

        Tolerates ints, floats and numeric strings (e.g. "7.5"); anything
        unparseable yields 0 rather than raising, so a malformed score
        never aborts the whole question.
        """
        try:
            points = int(float(raw))  # float() accepts "7" and "7.5" alike
        except (TypeError, ValueError):
            points = 0
        # Clamp: a hallucinated score can neither exceed the maximum nor
        # go negative.
        return max(0, min(points, max_points))

    async def correct_question(
        self,
        student_answer: str,
        rubric: QuestionRubric,
        subject: str = "Allgemein"
    ) -> QuestionResult:
        """
        Correct a single question answer.

        Args:
            student_answer: The student's OCR-extracted answer (pseudonymized)
            rubric: Grading rubric for this question
            subject: Subject for context

        Returns:
            QuestionResult with points and feedback. Never raises: on LLM
            failure a 0-point result flagged for manual review is returned.
        """
        # Build prompt with NO personal data
        user_prompt = f"""Fach: {subject}
Frage {rubric.question_number}: {rubric.question_text}
Maximale Punktzahl: {rubric.max_points}
Erwartete Antwort:
{rubric.expected_answer}
Bewertungskriterien:
{rubric.grading_criteria}
---
Schuelerantwort:
{student_answer}
---
Bewerte diese Antwort nach den Kriterien."""
        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="system", content=self.CORRECTION_SYSTEM_PROMPT),
                ChatMessage(role="user", content=user_prompt),
            ],
            temperature=0.3,  # Lower temperature for consistent grading
            max_tokens=500,
        )
        try:
            response = await self.inference.complete(request)
            content = response.choices[0].message.content or "{}"
            # Parse JSON response
            try:
                result = json.loads(content)
            except json.JSONDecodeError:
                # Fallback: award half points and flag for manual review.
                logger.warning("Failed to parse LLM response as JSON: %s", content[:100])
                result = {
                    "points": rubric.max_points // 2,
                    "feedback": content[:200],
                    "strengths": [],
                    "improvements": ["Automatische Bewertung fehlgeschlagen - manuelle Pruefung erforderlich"]
                }
            points = self._clamp_points(result.get("points", 0), rubric.max_points)
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=points,
                max_points=rubric.max_points,
                feedback=result.get("feedback", ""),
                strengths=result.get("strengths", []),
                improvements=result.get("improvements", []),
            )
        except Exception as e:
            logger.error("Correction failed for question %s: %s", rubric.question_number, e)
            return QuestionResult(
                question_number=rubric.question_number,
                points_awarded=0,
                max_points=rubric.max_points,
                feedback=f"Automatische Bewertung fehlgeschlagen: {str(e)}",
                strengths=[],
                improvements=["Manuelle Korrektur erforderlich"],
            )

    async def correct_exam(
        self,
        doc_token: str,
        ocr_text: str,
        rubrics: List[QuestionRubric],
        subject: str = "Allgemein"
    ) -> CorrectionResult:
        """
        Correct a complete exam with multiple questions.

        Args:
            doc_token: Pseudonymized document identifier
            ocr_text: Full OCR text of the exam (already redacted)
            rubrics: List of question rubrics
            subject: Subject name

        Returns:
            CorrectionResult with all scores and feedback
        """
        start_time = time.time()
        # Split OCR text into per-question answers (simple positional
        # heuristic; see _extract_answers).
        answers = self._extract_answers(ocr_text, len(rubrics))
        # Correct each question sequentially.
        question_results = []
        for i, rubric in enumerate(rubrics):
            answer = answers[i] if i < len(answers) else ""
            result = await self.correct_question(answer, rubric, subject)
            question_results.append(result)
        # Calculate totals; guard against a zero-point exam.
        total_score = sum(r.points_awarded for r in question_results)
        max_score = sum(r.max_points for r in question_results)
        percentage = (total_score / max_score * 100) if max_score > 0 else 0
        grade = calculate_grade(percentage)
        # Generate overall feedback
        overall_feedback = await self._generate_overall_feedback(
            question_results, total_score, max_score, percentage, grade
        )
        processing_time_ms = int((time.time() - start_time) * 1000)
        return CorrectionResult(
            doc_token=doc_token,
            total_score=total_score,
            max_score=max_score,
            grade=grade,
            overall_feedback=overall_feedback,
            question_results=question_results,
            processing_time_ms=processing_time_ms,
        )

    async def _generate_overall_feedback(
        self,
        question_results: List[QuestionResult],
        total_score: int,
        max_score: int,
        percentage: float,
        grade: str
    ) -> str:
        """Generate motivating overall feedback via the LLM.

        Falls back to a plain score summary if the LLM call fails, so
        exam correction still completes.
        """
        # Summarize question results (per-question feedback truncated to
        # 100 chars to keep the prompt small).
        results_summary = "\n".join([
            f"Frage {r.question_number}: {r.points_awarded}/{r.max_points} Punkte - {r.feedback[:100]}"
            for r in question_results
        ])
        prompt = self.OVERALL_FEEDBACK_PROMPT.format(
            question_results=results_summary,
            total_score=total_score,
            max_score=max_score,
            percentage=f"{percentage:.1f}",
            grade=grade,
        )
        request = ChatCompletionRequest(
            model=self.model,
            messages=[
                ChatMessage(role="user", content=prompt),
            ],
            temperature=0.5,
            max_tokens=200,
        )
        try:
            response = await self.inference.complete(request)
            return response.choices[0].message.content or "Gute Arbeit! Weiter so."
        except Exception as e:
            logger.error("Failed to generate overall feedback: %s", e)
            return f"Gesamtergebnis: {total_score}/{max_score} Punkte ({grade})"

    def _extract_answers(self, ocr_text: str, num_questions: int) -> List[str]:
        """
        Extract individual answers from OCR text.

        Simple heuristic: split by question markers ("1.", "2)", ...).
        Answers are assigned to rubrics by POSITION, not by the parsed
        number. More sophisticated extraction can be implemented.

        Args:
            ocr_text: Redacted OCR text of the whole exam.
            num_questions: Number of rubrics; the result always has
                exactly this length (padded with "" or truncated).

        Returns:
            List of per-question answer strings.
        """
        # A marker is a line-leading number followed by "." or ")".
        pattern = r'(?:^|\n)\s*(\d+)[.\)]\s*'
        # re.split with one capture group yields
        # [preamble, num, answer, num, answer, ...] -- always odd length.
        parts = re.split(pattern, ocr_text)
        # parts[0] (text before the first marker) is discarded; pair each
        # captured number with the text that follows it.
        answers = [parts[i + 1].strip() for i in range(1, len(parts) - 1, 2)]
        # Pad with empty answers if fewer markers were found than expected.
        while len(answers) < num_questions:
            answers.append("")
        return answers[:num_questions]
# Module-level singleton holding the shared correction service.
_correction_service: Optional[ExamCorrectionService] = None


def get_correction_service(model: Optional[str] = None) -> ExamCorrectionService:
    """
    Get or create the correction service singleton.

    Args:
        model: Optional model override. If None, uses
            config.correction_model (qwen2.5:14b).

    Returns:
        ExamCorrectionService instance

    PRIVACY: All processing runs locally via Ollama - no cloud API.
    """
    global _correction_service
    # Rebuild only when nothing is cached yet, or when the caller
    # explicitly requests a different model than the cached instance uses.
    model_switch_requested = (
        _correction_service is not None
        and bool(model)
        and _correction_service.model != model
    )
    if _correction_service is None or model_switch_requested:
        _correction_service = ExamCorrectionService(model=model)
    return _correction_service