# breakpilot-pwa/backend/ai_processing/qa_generator.py

"""
AI Processing - Q&A Generator.

Generiert Frage-Antwort-Paare mit Leitner-System-Vorbereitung.
"""

from pathlib import Path
import json
import os
import requests
import logging

from .core import (
    get_openai_api_key,
    get_vision_api,
    BEREINIGT_DIR,
)

logger = logging.getLogger(__name__)


def _generate_qa_with_openai(analysis_data: dict, num_questions: int = 8) -> dict:
    """
    Generate question/answer pairs from a worksheet analysis via the OpenAI API.

    Didactic requirements encoded in the prompts:
    - Questions stay almost verbatim to the existing material
    - Only minimal rephrasing is allowed
    - Keywords / technical terms are marked as important
    - Difficulty matches the original (grade_level)

    Args:
        analysis_data: The analysis JSON of the worksheet. The keys read here
            are "title", "subject", "grade_level", "canonical_text",
            "printed_blocks" (items with "text") and "tasks" (items with
            "description" and "text_with_gaps").
        num_questions: Number of questions to request (default: 8).

    Returns:
        Dict with "qa_items" and "metadata". If the analysis contains no
        text, no request is sent and the result has an empty "qa_items" list
        plus an error note in "metadata".

    Raises:
        RuntimeError: If the API reply lacks the expected structure or its
            message content is not a JSON object.
        requests.RequestException: On transport failures, timeouts, or an
            error HTTP status (via raise_for_status).
    """
    api_key = get_openai_api_key()

    # Extract the relevant content; `or` also replaces explicit null values.
    title = analysis_data.get("title") or "Arbeitsblatt"
    subject = analysis_data.get("subject") or "Allgemein"
    grade_level = analysis_data.get("grade_level") or "unbekannt"
    canonical_text = analysis_data.get("canonical_text") or ""
    printed_blocks = analysis_data.get("printed_blocks") or []
    tasks = analysis_data.get("tasks") or []

    # Assemble the text content.
    content_parts = []
    if canonical_text:
        content_parts.append(canonical_text)
    for block in printed_blocks:
        # `.get(key) or ""` instead of `.get(key, "")`: a key that is present
        # with a null value would otherwise crash on `.strip()`.
        text = (block.get("text") or "").strip()
        if text and text not in content_parts:
            content_parts.append(text)

    # Add the task texts.
    for task in tasks:
        desc = (task.get("description") or "").strip()
        text = (task.get("text_with_gaps") or "").strip()
        if desc:
            content_parts.append(f"Aufgabe: {desc}")
        if text:
            content_parts.append(text)

    worksheet_content = "\n\n".join(content_parts)

    if not worksheet_content.strip():
        logger.warning("Kein Textinhalt für Q&A-Generierung gefunden")
        return {"qa_items": [], "metadata": {"error": "Kein Textinhalt gefunden"}}

    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

    system_prompt = f"""Du bist ein erfahrener Pädagoge, der Frage-Antwort-Paare für Schüler erstellt.

WICHTIGE REGELN:

1. INHALTE NUR AUS DEM TEXT:
   - Verwende FAST WÖRTLICH den vorhandenen Stoff
   - Du darfst nur minimal umformulieren (z.B. "Beschreibe..." → "Erkläre in eigenen Worten...")
   - KEINE neuen Fakten oder Inhalte einführen!
   - Alles muss aus dem gegebenen Text ableitbar sein

2. SCHWIERIGKEITSGRAD:
   - Niveau muss exakt "{grade_level}" entsprechen
   - Fragen altersgerecht formulieren

3. SCHLÜSSELWÖRTER MARKIEREN:
   - Identifiziere wichtige Fachbegriffe als "key_terms"
   - Diese Begriffe sind besonders wichtig für die Wiederholung
   - Beispiele: Netzhaut, Linse, Pupille (beim Thema Auge)

4. FRAGETYPEN:
   - Wissensfragen: "Was ist...?", "Nenne..."
   - Verständnisfragen: "Erkläre...", "Beschreibe..."
   - Anwendungsfragen: "Warum...?", "Was passiert, wenn...?"

5. ANTWORT-FORMAT:
   - Kurze, präzise Antworten (1-3 Sätze)
   - Die Antwort muss direkt aus dem Text stammen

6. AUSGABE: Nur gültiges JSON, kein Markdown."""

    user_prompt = f"""Erstelle {num_questions} Frage-Antwort-Paare aus diesem Arbeitsblatt:

TITEL: {title}
FACH: {subject}
KLASSENSTUFE: {grade_level}

TEXT:
{worksheet_content}

Gib das Ergebnis als JSON zurück:

{{
  "qa_items": [
    {{
      "id": "qa1",
      "question": "Die Frage hier (fast wörtlich aus dem Text)",
      "answer": "Die korrekte Antwort (direkt aus dem Text)",
      "question_type": "knowledge" | "understanding" | "application",
      "key_terms": ["wichtiger Begriff 1", "wichtiger Begriff 2"],
      "difficulty": 1-3,
      "source_hint": "Kurzer Hinweis, wo im Text die Antwort steht",
      "leitner_box": 0
    }}
  ],
  "metadata": {{
    "subject": "{subject}",
    "grade_level": "{grade_level}",
    "source_title": "{title}",
    "total_questions": {num_questions},
    "key_terms_summary": ["alle", "wichtigen", "Fachbegriffe", "gesammelt"]
  }}
}}

WICHTIG:
- Alle Antworten müssen aus dem Text ableitbar sein!
- "leitner_box": 0 bedeutet "neu" (noch nicht gelernt)
- "difficulty": 1=leicht, 2=mittel, 3=schwer (passend zu {grade_level})
- "key_terms" sind die wichtigsten Wörter, die der Schüler lernen soll"""

    payload = {
        "model": "gpt-4o-mini",
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "max_tokens": 3000,
        "temperature": 0.5,
    }

    # requests never times out by default; without an explicit timeout a
    # stalled connection would block the caller indefinitely.
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    data = response.json()

    try:
        content = data["choices"][0]["message"]["content"]
        qa_data = json.loads(content)
    except (KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
        # IndexError: empty "choices"; TypeError: content is None or the
        # payload has an unexpected shape.
        raise RuntimeError(f"Fehler bei Q&A-Generierung: {e}") from e

    if not isinstance(qa_data, dict):
        raise RuntimeError(
            "Fehler bei Q&A-Generierung: JSON-Objekt erwartet, "
            f"{type(qa_data).__name__} erhalten"
        )

    # Initialise the Leitner bookkeeping fields on every item without
    # overwriting values the model already supplied.
    for item in qa_data.get("qa_items") or []:
        if not isinstance(item, dict):
            continue
        item.setdefault("leitner_box", 0)  # 0 = new, 1 = learned, 2 = consolidated
        item.setdefault("correct_count", 0)
        item.setdefault("incorrect_count", 0)
        item.setdefault("last_seen", None)
        item.setdefault("next_review", None)

    return qa_data


def _generate_qa_with_claude(analysis_data: dict, num_questions: int = 8) -> dict:
    """
    Generate question/answer pairs from a worksheet analysis via the Claude API.

    Args:
        analysis_data: The analysis JSON of the worksheet. The keys read here
            are "title", "subject", "grade_level", "canonical_text",
            "printed_blocks" (items with "text") and "tasks" (items with
            "description" and "text_with_gaps").
        num_questions: Number of questions to request (default: 8).

    Returns:
        Dict with "qa_items" and "metadata". If the analysis contains no
        text, no request is sent and the result has an empty "qa_items" list
        plus an error note in "metadata".

    Raises:
        RuntimeError: If ANTHROPIC_API_KEY is not set, the reply carries no
            text, or the text is not a JSON object.
    """
    # Imported lazily so the module can be loaded without the SDK installed.
    import anthropic

    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise RuntimeError("ANTHROPIC_API_KEY ist nicht gesetzt.")

    client = anthropic.Anthropic(api_key=api_key)

    # Extract the relevant content; `or` also replaces explicit null values.
    title = analysis_data.get("title") or "Arbeitsblatt"
    subject = analysis_data.get("subject") or "Allgemein"
    grade_level = analysis_data.get("grade_level") or "unbekannt"
    canonical_text = analysis_data.get("canonical_text") or ""
    printed_blocks = analysis_data.get("printed_blocks") or []
    tasks = analysis_data.get("tasks") or []

    content_parts = []
    if canonical_text:
        content_parts.append(canonical_text)
    for block in printed_blocks:
        # `.get(key) or ""` instead of `.get(key, "")`: a key that is present
        # with a null value would otherwise crash on `.strip()`.
        text = (block.get("text") or "").strip()
        if text and text not in content_parts:
            content_parts.append(text)
    for task in tasks:
        desc = (task.get("description") or "").strip()
        # Gap texts are included so this backend sees the same worksheet
        # content as the OpenAI generator.
        text = (task.get("text_with_gaps") or "").strip()
        if desc:
            content_parts.append(f"Aufgabe: {desc}")
        if text:
            content_parts.append(text)

    worksheet_content = "\n\n".join(content_parts)

    if not worksheet_content.strip():
        logger.warning("Kein Textinhalt für Q&A-Generierung gefunden")
        return {"qa_items": [], "metadata": {"error": "Kein Textinhalt gefunden"}}

    prompt = f"""Erstelle {num_questions} Frage-Antwort-Paare aus diesem Arbeitsblatt.

WICHTIGE REGELN:
1. Verwende FAST WÖRTLICH den vorhandenen Stoff - KEINE neuen Fakten!
2. Schwierigkeitsgrad: exakt "{grade_level}"
3. Markiere wichtige Fachbegriffe als "key_terms"

TITEL: {title}
FACH: {subject}
KLASSENSTUFE: {grade_level}

TEXT:
{worksheet_content}

Antworte NUR mit diesem JSON:
{{
  "qa_items": [
    {{
      "id": "qa1",
      "question": "Frage (fast wörtlich aus Text)",
      "answer": "Antwort (direkt aus Text)",
      "question_type": "knowledge",
      "key_terms": ["Begriff1", "Begriff2"],
      "difficulty": 1,
      "source_hint": "Wo im Text",
      "leitner_box": 0
    }}
  ],
  "metadata": {{
    "subject": "{subject}",
    "grade_level": "{grade_level}",
    "source_title": "{title}",
    "total_questions": {num_questions},
    "key_terms_summary": ["alle", "Fachbegriffe"]
  }}
}}"""

    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=3000,
        messages=[{"role": "user", "content": prompt}]
    )

    try:
        content = message.content[0].text
    except (IndexError, AttributeError, TypeError) as e:
        # Empty content list, or a first block that carries no text.
        raise RuntimeError(f"Claude hat keine Textantwort geliefert: {e}") from e

    try:
        # The model may wrap the JSON in a Markdown code fence; unwrap it.
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0]
        elif "```" in content:
            content = content.split("```")[1].split("```")[0]
        qa_data = json.loads(content.strip())
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Claude hat ungültiges JSON geliefert: {e}") from e

    if not isinstance(qa_data, dict):
        raise RuntimeError(
            "Claude hat ungültiges JSON geliefert: JSON-Objekt erwartet, "
            f"{type(qa_data).__name__} erhalten"
        )

    # Initialise the Leitner bookkeeping fields on every item without
    # overwriting values the model already supplied.
    for item in qa_data.get("qa_items") or []:
        if not isinstance(item, dict):
            continue
        item.setdefault("leitner_box", 0)
        item.setdefault("correct_count", 0)
        item.setdefault("incorrect_count", 0)
        item.setdefault("last_seen", None)
        item.setdefault("next_review", None)

    return qa_data


def generate_qa_from_analysis(analysis_path: Path, num_questions: int = 8) -> Path:
    """
    Generate question/answer pairs from an analysis JSON file.

    The analysis file is loaded, passed to the backend selected by
    get_vision_api() ("claude" uses Claude and falls back to OpenAI on any
    error; every other value uses OpenAI), and the result is written as
    JSON into BEREINIGT_DIR.

    Args:
        analysis_path: Path to the *_analyse.json file (a plain string path
            is accepted as well).
        num_questions: Number of questions to generate.

    Returns:
        Path to the generated *_qa.json file.

    Raises:
        FileNotFoundError: If analysis_path does not exist.
        RuntimeError: If the file is not valid JSON or its top-level value
            is not a JSON object.
    """
    analysis_path = Path(analysis_path)
    if not analysis_path.exists():
        raise FileNotFoundError(f"Analysedatei nicht gefunden: {analysis_path}")

    try:
        analysis_data = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Ungültige Analyse-JSON: {e}") from e

    # The generators call .get() on the analysis; reject other JSON types
    # here with a clear error instead of an AttributeError further down.
    if not isinstance(analysis_data, dict):
        raise RuntimeError(
            "Ungültige Analyse-JSON: JSON-Objekt erwartet, "
            f"{type(analysis_data).__name__} erhalten"
        )

    logger.info("Generiere Q&A-Paare für: %s", analysis_path.name)

    vision_api = get_vision_api()

    # Generate the Q&A data with the configured API.
    if vision_api == "claude":
        try:
            qa_data = _generate_qa_with_claude(analysis_data, num_questions)
        except Exception as e:
            # Deliberate best-effort: any Claude failure falls back to OpenAI.
            logger.warning("Claude Q&A-Generierung fehlgeschlagen, nutze OpenAI: %s", e)
            qa_data = _generate_qa_with_openai(analysis_data, num_questions)
    else:
        qa_data = _generate_qa_with_openai(analysis_data, num_questions)

    # Store the Q&A data next to the other cleaned-up artefacts.
    out_name = analysis_path.stem.replace("_analyse", "") + "_qa.json"
    out_path = BEREINIGT_DIR / out_name
    # Make sure the target directory exists before writing.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(qa_data, ensure_ascii=False, indent=2), encoding="utf-8")

    logger.info("Q&A-Paare gespeichert: %s", out_path.name)
    return out_path