# breakpilot-pwa/backend/generators/cloze_generator.py

"""
Cloze Generator - Erstellt Lückentexte aus Quelltexten.

Generiert:
- Lückentexte mit ausgeblendeten Schlüsselwörtern
- Verschiedene Schwierigkeitsgrade
- Hinweise und Erklärungen
"""

import logging
import json
import re
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from enum import Enum

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class ClozeType(str, Enum):
    """Kinds of cloze (gap-text) exercises.

    The ``str`` mixin makes every member a string equal to its value.
    """
    FILL_IN = "fill_in"         # Free-text entry
    DRAG_DROP = "drag_drop"     # Drag & drop
    DROPDOWN = "dropdown"       # Dropdown selection


@dataclass
class ClozeGap:
    """A single gap in a cloze text."""
    position: int           # 0-based gap index; the generators in this file use (placeholder number - 1)
    answer: str             # Correct answer
    alternatives: List[str] # Further answers that are also accepted
    hint: Optional[str]     # Hint for the gap, if any
    distractors: List[str]  # Wrong options (for dropdown / drag & drop)


@dataclass
class ClozeText:
    """A complete cloze text: gapped text, original text and gap metadata."""
    text_with_gaps: str     # Text with "[_N_]" placeholders in place of the answers
    original_text: str      # Unmodified source text
    gaps: List[ClozeGap]    # The gaps that were created
    cloze_type: ClozeType   # Kind of cloze exercise
    topic: Optional[str]    # Topic, if one was given
    difficulty: str         # "easy", "medium" or "hard"


class ClozeGenerator:
    """
    Generiert Lückentexte aus Quelltexten.

    Unterstützt verschiedene Modi:
    - Automatische Erkennung wichtiger Begriffe
    - LLM-basierte intelligente Auswahl
    - Manuelle Vorgabe von Lücken
    """

    def __init__(self, llm_client=None):
        """
        Set up the generator.

        Args:
            llm_client: Optional LLM client. It is stored as-is; no call is
                made to it during construction.
        """
        # Parts of speech that tend to make good gaps: nouns, verbs, adjectives
        self._important_pos = {"NOUN", "VERB", "ADJ"}
        self.llm_client = llm_client

        logger.info("ClozeGenerator initialized")

    def generate(
        self,
        source_text: str,
        num_gaps: int = 5,
        difficulty: str = "medium",
        cloze_type: ClozeType = ClozeType.FILL_IN,
        topic: Optional[str] = None
    ) -> ClozeText:
        """
        Build a cloze text from a source text.

        A source text that is empty or shorter than 50 characters (after
        stripping whitespace) yields an empty cloze without gaps. Otherwise
        the LLM path is used when an LLM client is configured, and the
        heuristic path when it is not.

        Args:
            source_text: Text to turn into a cloze exercise
            num_gaps: Number of gaps to create
            difficulty: Difficulty level (easy, medium, hard)
            cloze_type: Kind of cloze exercise
            topic: Optional topic

        Returns:
            ClozeText object
        """
        logger.info(f"Generating cloze text with {num_gaps} gaps (difficulty: {difficulty})")

        long_enough = bool(source_text) and len(source_text.strip()) >= 50
        if not long_enough:
            logger.warning("Source text too short")
            return self._empty_cloze(source_text, cloze_type)

        # Pick the strategy once, then call it with the shared argument list.
        if self.llm_client:
            strategy = self._generate_with_llm
        else:
            strategy = self._generate_automatic
        return strategy(source_text, num_gaps, difficulty, cloze_type, topic)

    def _generate_with_llm(
        self,
        source_text: str,
        num_gaps: int,
        difficulty: str,
        cloze_type: ClozeType,
        topic: Optional[str]
    ) -> ClozeText:
        """
        Build a cloze text by asking the LLM which terms to blank out.

        The prompt asks for a JSON object with a "gaps" list. The reply is
        parsed as JSON; if that fails and the reply is a string, parsing is
        retried on the span from the first "{" to the last "}" so that replies
        wrapped in Markdown fences or surrounding prose are still usable.

        Any exception (client error, unparsable reply, unexpected structure)
        is logged and answered with the heuristic generator instead.

        Args:
            source_text: Text to turn into a cloze exercise
            num_gaps: Number of gaps requested from the LLM
            difficulty: Difficulty level passed into the prompt
            cloze_type: Kind of cloze exercise
            topic: Optional topic, added to the prompt when set

        Returns:
            ClozeText object
        """
        prompt = f"""
Erstelle einen Lückentext auf Deutsch basierend auf folgendem Text.
Ersetze {num_gaps} wichtige Begriffe durch Lücken.
Schwierigkeitsgrad: {difficulty}
{f'Thema: {topic}' if topic else ''}

Originaltext:
{source_text}

Wähle {num_gaps} wichtige Begriffe (Substantive, Verben, Fachbegriffe) aus.
Für jeden Begriff gib an:
- Das Wort, das ausgeblendet wird
- Alternative Schreibweisen (falls vorhanden)
- Einen Hinweis
- 3 ähnliche aber falsche Wörter (Distraktoren)

Antworte im JSON-Format:
{{
  "gaps": [
    {{
      "word": "Photosynthese",
      "alternatives": ["Fotosynthese"],
      "hint": "Prozess bei dem Pflanzen Licht nutzen",
      "distractors": ["Zellatmung", "Osmose", "Diffusion"]
    }}
  ]
}}
"""

        try:
            response = self.llm_client.generate(prompt)
            try:
                data = json.loads(response)
            except json.JSONDecodeError:
                # Retry on the outermost {...} span; re-raise when there is
                # nothing sensible to retry on so the fallback below kicks in.
                if not isinstance(response, str):
                    raise
                start = response.find("{")
                end = response.rfind("}")
                if start == -1 or end <= start:
                    raise
                data = json.loads(response[start:end + 1])
            return self._create_cloze_from_llm(
                source_text, data, difficulty, cloze_type, topic
            )
        except Exception as e:
            # Deliberate best effort: never fail the request because of the LLM.
            logger.error(f"Error generating with LLM: {e}")
            return self._generate_automatic(
                source_text, num_gaps, difficulty, cloze_type, topic
            )

    def _generate_automatic(
        self,
        source_text: str,
        num_gaps: int,
        difficulty: str,
        cloze_type: ClozeType,
        topic: Optional[str]
    ) -> ClozeText:
        """
        Build a cloze text heuristically, without an LLM.

        Candidate words are ranked by ``_find_important_words`` and narrowed
        down by ``_select_words_by_difficulty``. For every selected word the
        first whole-word occurrence in the text is replaced by a placeholder
        ``[_N_]``.

        N is derived from the number of gaps created so far, so placeholder
        numbers are always contiguous and ``gap.position == N - 1``.

        Args:
            source_text: Text to turn into a cloze exercise
            num_gaps: Number of gaps to create
            difficulty: Difficulty level (easy, medium, hard)
            cloze_type: Kind of cloze exercise
            topic: Optional topic

        Returns:
            ClozeText object
        """
        candidates = self._find_important_words(source_text)
        selected = self._select_words_by_difficulty(candidates, num_gaps, difficulty)

        gaps = []
        text_with_gaps = source_text

        for word, _score in selected:
            # Locate the word in the current (already partially gapped) text
            match = re.search(r'\b' + re.escape(word) + r'\b', text_with_gaps)
            if not match:
                continue

            index = len(gaps)
            placeholder = f"[_{index + 1}_]"
            text_with_gaps = (
                text_with_gaps[:match.start()] + placeholder + text_with_gaps[match.end():]
            )

            # Case variants only; a variant identical to the answer adds nothing.
            alternatives = [
                variant for variant in (word.lower(), word.upper()) if variant != word
            ]

            gaps.append(ClozeGap(
                position=index,
                answer=word,
                alternatives=alternatives,
                hint=self._generate_hint(word, source_text),
                distractors=self._generate_distractors(word, candidates)
            ))

        return ClozeText(
            text_with_gaps=text_with_gaps,
            original_text=source_text,
            gaps=gaps,
            cloze_type=cloze_type,
            topic=topic,
            difficulty=difficulty
        )

    def _find_important_words(self, text: str) -> List[tuple]:
        """Findet wichtige Wörter im Text."""
        # Einfache Heuristik: Längere Wörter sind oft wichtiger
        words = re.findall(r'\b[A-Za-zäöüÄÖÜß]{4,}\b', text)

        # Zähle Häufigkeit
        word_count = {}
        for word in words:
            word_lower = word.lower()
            word_count[word_lower] = word_count.get(word_lower, 0) + 1

        # Sortiere nach Länge und Häufigkeit
        unique_words = list(set(words))
        scored = []
        for word in unique_words:
            score = len(word) + word_count[word.lower()] * 2
            # Bevorzuge Wörter mit Großbuchstaben (Substantive)
            if word[0].isupper():
                score += 3
            scored.append((word, score))

        scored.sort(key=lambda x: x[1], reverse=True)
        return [(w, s) for w, s in scored]

    def _select_words_by_difficulty(
        self,
        words: List[tuple],
        num_gaps: int,
        difficulty: str
    ) -> List[tuple]:
        """Wählt Wörter basierend auf Schwierigkeit."""
        if difficulty == "easy":
            # Einfach: Häufige, wichtige Wörter
            return words[:num_gaps]
        elif difficulty == "hard":
            # Schwer: Weniger häufige Wörter
            return words[num_gaps:num_gaps*2] if len(words) > num_gaps else words[:num_gaps]
        else:
            # Medium: Mischung
            return words[:num_gaps]

    def _generate_hint(self, word: str, text: str) -> str:
        """Generiert einen Hinweis für ein Wort."""
        # Einfacher Hinweis basierend auf Kontext
        sentences = text.split('.')
        for sentence in sentences:
            if word in sentence:
                # Extrahiere Kontext
                words_in_sentence = sentence.split()
                if len(words_in_sentence) > 5:
                    return f"Beginnt mit '{word[0]}' ({len(word)} Buchstaben)"
        return f"Beginnt mit '{word[0]}'"

    def _generate_distractors(self, word: str, all_words: List[tuple]) -> List[str]:
        """Generiert Distraktoren (falsche Optionen)."""
        distractors = []
        word_len = len(word)

        # Finde ähnlich lange Wörter
        for w, _ in all_words:
            if w.lower() != word.lower():
                if abs(len(w) - word_len) <= 2:
                    distractors.append(w)
                    if len(distractors) >= 3:
                        break

        # Falls nicht genug, füge generische hinzu
        while len(distractors) < 3:
            distractors.append(f"[Option {len(distractors)+1}]")

        return distractors[:3]

    def _create_cloze_from_llm(
        self,
        source_text: str,
        data: Dict[str, Any],
        difficulty: str,
        cloze_type: ClozeType,
        topic: Optional[str]
    ) -> ClozeText:
        """
        Build a ClozeText from the parsed LLM reply.

        For every entry in ``data["gaps"]`` that has a non-empty "word", the
        first whole-word occurrence in the text is replaced by a placeholder
        ``[_N_]``. Entries whose word does not occur in the text are skipped,
        so every returned gap has a placeholder in ``text_with_gaps``.

        N is derived from the number of gaps created so far; placeholder
        numbers are therefore contiguous and ``gap.position == N - 1`` even
        when entries were skipped.

        Args:
            source_text: Original text
            data: Parsed LLM reply, expected to hold a "gaps" list of dicts
            difficulty: Difficulty level stored on the result
            cloze_type: Kind of cloze exercise
            topic: Optional topic

        Returns:
            ClozeText object
        """
        text_with_gaps = source_text
        gaps = []

        for gap_data in data.get("gaps", []):
            word = gap_data.get("word", "")
            if not word:
                continue

            index = len(gaps)
            placeholder = f"[_{index + 1}_]"
            pattern = r'\b' + re.escape(word) + r'\b'
            text_with_gaps, replaced = re.subn(
                pattern, placeholder, text_with_gaps, count=1
            )
            if not replaced:
                # The LLM named a word that is not in the text (or only in an
                # inflected form); a gap without a placeholder would be unusable.
                logger.warning("LLM gap word %r not found in source text; skipping", word)
                continue

            gaps.append(ClozeGap(
                position=index,
                answer=word,
                # "or []" also covers an explicit JSON null
                alternatives=gap_data.get("alternatives") or [],
                hint=gap_data.get("hint"),
                distractors=gap_data.get("distractors") or []
            ))

        return ClozeText(
            text_with_gaps=text_with_gaps,
            original_text=source_text,
            gaps=gaps,
            cloze_type=cloze_type,
            topic=topic,
            difficulty=difficulty
        )

    def _empty_cloze(self, text: str, cloze_type: ClozeType) -> ClozeText:
        """
        Build a ClozeText without gaps, used when no exercise can be created.

        A ``None`` text is stored as an empty string so that both text fields
        always hold a ``str``, as declared on ClozeText. Topic is ``None`` and
        difficulty is "medium".

        Args:
            text: Source text; may be empty or ``None``
            cloze_type: Kind of cloze exercise

        Returns:
            ClozeText object with an empty gap list
        """
        safe_text = "" if text is None else text
        return ClozeText(
            text_with_gaps=safe_text,
            original_text=safe_text,
            gaps=[],
            cloze_type=cloze_type,
            topic=None,
            difficulty="medium"
        )

    def to_h5p_format(self, cloze: ClozeText) -> Dict[str, Any]:
        """
        Convert a cloze text into an H5P "Blanks" style dictionary.

        Every placeholder ``[_N_]`` is replaced by ``*answer/alt1/alt2*``.
        For dropdown clozes the distractors are appended to the same
        slash-separated list.

        The placeholder number of a gap is taken from ``gap.position + 1``,
        which is how the generators in this class number their placeholders;
        the list index is used only when ``position`` is not an integer.
        All placeholders are substituted in a single pass, so inserted answer
        text is never scanned for placeholders again.

        NOTE(review): H5P.Blanks' own content schema appears to expect the
        blank markup under "questions" rather than "text" - confirm against
        the consumer of this payload before changing the shape.

        Args:
            cloze: ClozeText object

        Returns:
            H5P-compatible dict
        """
        with_distractors = cloze.cloze_type == ClozeType.DROPDOWN

        markup_by_placeholder = {}
        for index, gap in enumerate(cloze.gaps):
            options = [gap.answer] + (gap.alternatives or [])
            if with_distractors:
                options = options + (gap.distractors or [])

            number = gap.position + 1 if isinstance(gap.position, int) else index + 1
            # setdefault: on duplicate numbers the first gap wins
            markup_by_placeholder.setdefault(
                f"[_{number}_]", "*" + "/".join(options) + "*"
            )

        h5p_text = cloze.text_with_gaps
        if markup_by_placeholder and h5p_text:
            h5p_text = re.sub(
                r"\[_\d+_\]",
                # Unknown placeholders are left untouched
                lambda found: markup_by_placeholder.get(found.group(0), found.group(0)),
                h5p_text
            )

        return {
            "library": "H5P.Blanks",
            "params": {
                "text": h5p_text,
                "behaviour": {
                    "enableRetry": True,
                    "enableSolutionsButton": True,
                    "caseSensitive": False,
                    "showSolutionsRequiresInput": True
                }
            }
        }

    def to_dict(self, cloze: ClozeText) -> Dict[str, Any]:
        """Konvertiert ClozeText zu Dictionary-Format."""
        return {
            "text_with_gaps": cloze.text_with_gaps,
            "original_text": cloze.original_text,
            "gaps": [
                {
                    "position": gap.position,
                    "answer": gap.answer,
                    "alternatives": gap.alternatives,
                    "hint": gap.hint,
                    "distractors": gap.distractors
                }
                for gap in cloze.gaps
            ],
            "cloze_type": cloze.cloze_type.value,
            "topic": cloze.topic,
            "difficulty": cloze.difficulty
        }