"""
AI Processor - Cloze Text Generator

Generate cloze (fill-in-the-blank) texts from worksheet analysis.
"""

from pathlib import Path
import json
import logging
import os

import requests

from ..config import VISION_API, BEREINIGT_DIR, get_openai_api_key

logger = logging.getLogger(__name__)

# Language codes to names
LANGUAGE_NAMES = {
    "tr": "Tuerkisch",
    "ar": "Arabisch",
    "ru": "Russisch",
    "en": "Englisch",
    "fr": "Franzoesisch",
    "es": "Spanisch",
    "pl": "Polnisch",
    "uk": "Ukrainisch",
}

# Language name used in the prompt when the requested code is not in
# LANGUAGE_NAMES.
_DEFAULT_LANGUAGE_NAME = "Tuerkisch"

# (connect, read) timeout in seconds for the OpenAI HTTP call. Without a
# timeout, requests waits indefinitely on a stalled connection.
_OPENAI_TIMEOUT = (10, 120)


def _target_language_name(target_language: str) -> str:
    """Return the language name used in prompts for *target_language*.

    Unknown codes fall back to the default name; the fallback is logged
    because the language code written to the metadata will then not match
    the language the model is asked to translate into.
    """
    name = LANGUAGE_NAMES.get(target_language)
    if name is None:
        logger.warning(
            "Unbekannter Sprachcode '%s', verwende %s",
            target_language,
            _DEFAULT_LANGUAGE_NAME,
        )
        return _DEFAULT_LANGUAGE_NAME
    return name


def _collect_worksheet_content(analysis_data: dict) -> str:
    """Join the canonical text and all distinct printed-block texts.

    Blocks that are not dicts, or whose "text" is missing, ``None`` or not a
    string, are skipped instead of raising. Parts are separated by a blank
    line; duplicates (exact match) are dropped.
    """
    content_parts = []

    canonical_text = analysis_data.get("canonical_text") or ""
    if isinstance(canonical_text, str) and canonical_text:
        content_parts.append(canonical_text)

    seen = set(content_parts)
    for block in analysis_data.get("printed_blocks") or []:
        if not isinstance(block, dict):
            continue
        raw_text = block.get("text")
        if not isinstance(raw_text, str):
            continue
        text = raw_text.strip()
        if text and text not in seen:
            seen.add(text)
            content_parts.append(text)

    return "\n\n".join(content_parts)


def _no_content_result() -> dict:
    """Log and return the result used when the worksheet has no text."""
    logger.warning("Kein Textinhalt fuer Lueckentext-Generierung gefunden")
    return {"cloze_items": [], "metadata": {"error": "Kein Textinhalt gefunden"}}


def _update_total_gaps(cloze_data) -> dict:
    """Write the real gap count into ``metadata["total_gaps"]``.

    The model is asked to emit ``total_gaps`` itself, but the value is
    recomputed here from the actual items. Metadata is only updated when it
    is present and is a dict.

    Raises:
        RuntimeError: If the parsed response is not a JSON object.
    """
    if not isinstance(cloze_data, dict):
        raise RuntimeError("Lueckentext-Antwort ist kein JSON-Objekt")

    total_gaps = sum(
        len(item.get("gaps") or [])
        for item in cloze_data.get("cloze_items") or []
        if isinstance(item, dict)
    )
    metadata = cloze_data.get("metadata")
    if isinstance(metadata, dict):
        metadata["total_gaps"] = total_gaps
    return cloze_data


def _strip_markdown_fence(content: str) -> str:
    """Return the payload of the first Markdown code fence, if any.

    Text without a fence is returned unchanged apart from stripping
    surrounding whitespace.
    """
    if "```json" in content:
        content = content.split("```json")[1].split("```")[0]
    elif "```" in content:
        content = content.split("```")[1]
    return content.strip()


def _generate_cloze_with_openai(analysis_data: dict, target_language: str = "tr") -> dict:
    """
    Generate cloze texts based on worksheet analysis via the OpenAI API.

    Important didactic requirements:
    - Multiple meaningful gaps per sentence (not just one!)
    - Difficulty level matches the original
    - Translation with the same gaps

    Args:
        analysis_data: The analysis JSON of the worksheet
        target_language: Target language for translation (default: "tr" for Turkish)

    Returns:
        Dict with cloze_items and metadata

    Raises:
        RuntimeError: If the response has an unexpected shape or is not valid JSON.
        requests.RequestException: On network errors, timeouts or HTTP error status.
    """
    api_key = get_openai_api_key()

    title = analysis_data.get("title") or "Arbeitsblatt"
    subject = analysis_data.get("subject") or "Allgemein"
    grade_level = analysis_data.get("grade_level") or "unbekannt"

    worksheet_content = _collect_worksheet_content(analysis_data)
    if not worksheet_content.strip():
        return _no_content_result()

    target_lang_name = _target_language_name(target_language)

    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

    system_prompt = f"""Du bist ein erfahrener Paedagoge, der Lueckentexte fuer Schueler erstellt.

WICHTIGE REGELN FUER LUECKENTEXTE:

1. MEHRERE LUECKEN PRO SATZ:
   - Erstelle IMMER mehrere sinnvolle Luecken pro Satz
   - Beispiel: "Ich habe gestern meine Hausaufgaben gemacht."
     → Luecken: "habe" UND "gemacht" (nicht nur eine!)

2. SCHWIERIGKEITSGRAD:
   - Niveau muss exakt "{grade_level}" entsprechen

3. SINNVOLLE LUECKENWOERTER:
   - Verben (konjugiert)
   - Wichtige Nomen
   - Adjektive
   - KEINE Artikel oder Praepositionen allein

4. UEBERSETZUNG:
   - Uebersetze den VOLLSTAENDIGEN Satz auf {target_lang_name}
   - Die GLEICHEN Woerter muessen als Luecken markiert sein

5. AUSGABE: Nur gueltiges JSON, kein Markdown."""

    user_prompt = f"""Erstelle Lueckentexte aus diesem Arbeitsblatt:

TITEL: {title}
FACH: {subject}
KLASSENSTUFE: {grade_level}

TEXT:
{worksheet_content}

Erstelle 5-8 Saetze mit Luecken. Gib das Ergebnis als JSON zurueck:

{{
  "cloze_items": [
    {{
      "id": "c1",
      "original_sentence": "Der vollstaendige Originalsatz ohne Luecken",
      "sentence_with_gaps": "Der Satz mit ___ fuer jede Luecke",
      "gaps": [
        {{
          "id": "g1",
          "word": "das fehlende Wort",
          "position": 0,
          "hint": "optionaler Hinweis"
        }}
      ],
      "translation": {{
        "language": "{target_language}",
        "language_name": "{target_lang_name}",
        "full_sentence": "Vollstaendige Uebersetzung",
        "sentence_with_gaps": "Uebersetzung mit ___ an gleichen Stellen"
      }}
    }}
  ],
  "metadata": {{
    "subject": "{subject}",
    "grade_level": "{grade_level}",
    "source_title": "{title}",
    "target_language": "{target_language}",
    "total_gaps": 0
  }}
}}

WICHTIG:
- Jeder Satz MUSS mindestens 2 Luecken haben!
- Position ist der Index des Wortes im Satz (0-basiert)"""

    payload = {
        "model": "gpt-4o-mini",
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "max_tokens": 3000,
        "temperature": 0.7,
    }

    response = requests.post(url, headers=headers, json=payload, timeout=_OPENAI_TIMEOUT)
    response.raise_for_status()
    data = response.json()

    try:
        content = data["choices"][0]["message"]["content"]
        cloze_data = json.loads(content)
    except (KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
        # IndexError: empty "choices"; TypeError: content is null or the
        # payload is not shaped like a chat completion.
        raise RuntimeError(f"Fehler bei Lueckentext-Generierung: {e}") from e

    return _update_total_gaps(cloze_data)


def _generate_cloze_with_claude(analysis_data: dict, target_language: str = "tr") -> dict:
    """Generate cloze texts with the Claude API.

    Args:
        analysis_data: The analysis JSON of the worksheet
        target_language: Target language for translation (default: "tr" for Turkish)

    Returns:
        Dict with cloze_items and metadata

    Raises:
        RuntimeError: If ANTHROPIC_API_KEY is not set, the reply contains no
            text, or the reply is not a valid JSON object.
    """
    # Imported lazily so the module stays importable without the SDK.
    import anthropic

    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise RuntimeError("ANTHROPIC_API_KEY ist nicht gesetzt.")

    client = anthropic.Anthropic(api_key=api_key)

    title = analysis_data.get("title") or "Arbeitsblatt"
    subject = analysis_data.get("subject") or "Allgemein"
    grade_level = analysis_data.get("grade_level") or "unbekannt"

    worksheet_content = _collect_worksheet_content(analysis_data)
    if not worksheet_content.strip():
        return _no_content_result()

    target_lang_name = _target_language_name(target_language)

    prompt = f"""Erstelle Lueckentexte aus diesem Arbeitsblatt.

WICHTIGE REGELN:
1. MEHRERE LUECKEN PRO SATZ (mindestens 2!)
   Beispiel: "Ich habe gestern Hausaufgaben gemacht" → Luecken: "habe" UND "gemacht"
2. Schwierigkeitsgrad: exakt "{grade_level}"
3. Uebersetzung auf {target_lang_name} mit gleichen Luecken

TITEL: {title}
FACH: {subject}
KLASSENSTUFE: {grade_level}

TEXT:
{worksheet_content}

Antworte NUR mit diesem JSON (5-8 Saetze):
{{
  "cloze_items": [
    {{
      "id": "c1",
      "original_sentence": "Vollstaendiger Satz",
      "sentence_with_gaps": "Satz mit ___ fuer Luecken",
      "gaps": [
        {{"id": "g1", "word": "Lueckenwort", "position": 0, "hint": "Hinweis"}}
      ],
      "translation": {{
        "language": "{target_language}",
        "language_name": "{target_lang_name}",
        "full_sentence": "Uebersetzung",
        "sentence_with_gaps": "Uebersetzung mit ___"
      }}
    }}
  ],
  "metadata": {{
    "subject": "{subject}",
    "grade_level": "{grade_level}",
    "source_title": "{title}",
    "target_language": "{target_language}",
    "total_gaps": 0
  }}
}}"""

    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=3000,
        messages=[{"role": "user", "content": prompt}],
    )

    try:
        content = message.content[0].text
    except (IndexError, AttributeError) as e:
        # Empty content list, or a first block that carries no text.
        raise RuntimeError("Claude hat keine Textantwort geliefert") from e

    try:
        cloze_data = json.loads(_strip_markdown_fence(content))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Claude hat ungueltiges JSON geliefert: {e}") from e

    return _update_total_gaps(cloze_data)


def generate_cloze_from_analysis(analysis_path: Path, target_language: str = "tr") -> Path:
    """
    Generate cloze texts from an analysis JSON file.

    The cloze texts will:
    - Have multiple meaningful gaps per sentence
    - Match the difficulty level of the original
    - Include translation to target language

    When VISION_API is "claude", Claude is tried first and OpenAI is used as
    a fallback if the Claude call fails for any reason; otherwise OpenAI is
    used directly.

    Args:
        analysis_path: Path to *_analyse.json file
        target_language: Language code for translation (default: "tr" for Turkish)

    Returns:
        Path to generated *_cloze.json file

    Raises:
        FileNotFoundError: If the analysis file does not exist.
        RuntimeError: If the analysis file is not a valid JSON object, or the
            generation backend returns an unusable response.
    """
    # Accept plain string paths as well as Path objects.
    analysis_path = Path(analysis_path)
    if not analysis_path.exists():
        raise FileNotFoundError(f"Analysedatei nicht gefunden: {analysis_path}")

    try:
        analysis_data = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Ungueltige Analyse-JSON: {e}") from e
    if not isinstance(analysis_data, dict):
        # The generators read the analysis via .get(); fail early and clearly.
        raise RuntimeError("Ungueltige Analyse-JSON: JSON-Objekt erwartet")

    logger.info("Generiere Lueckentexte fuer: %s", analysis_path.name)

    # Generate cloze texts (use configured API)
    if VISION_API == "claude":
        try:
            cloze_data = _generate_cloze_with_claude(analysis_data, target_language)
        except Exception as e:
            # Deliberately broad: any Claude failure (missing SDK, missing
            # key, API error, bad JSON) falls back to OpenAI.
            logger.warning(
                "Claude Lueckentext-Generierung fehlgeschlagen, nutze OpenAI: %s", e
            )
            cloze_data = _generate_cloze_with_openai(analysis_data, target_language)
    else:
        cloze_data = _generate_cloze_with_openai(analysis_data, target_language)

    # Save cloze data. Note: every "_analyse" occurrence in the stem is
    # removed, not only a trailing one.
    out_name = analysis_path.stem.replace("_analyse", "") + "_cloze.json"
    out_path = BEREINIGT_DIR / out_name
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(cloze_data, ensure_ascii=False, indent=2), encoding="utf-8")

    logger.info("Lueckentexte gespeichert: %s", out_path.name)
    return out_path