fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,106 @@
"""
AI Processor Module
A modular AI-powered worksheet processing system for:
- Vision-based analysis
- Content generation (MC, Cloze, Q&A)
- Print version export
- Mindmap visualization
Usage:
from ai_processor import analyze_scan_structure_with_ai, generate_mc_from_analysis
"""
# Configuration
from .config import (
BASE_DIR,
EINGANG_DIR,
BEREINIGT_DIR,
VISION_API,
get_openai_api_key,
get_anthropic_api_key,
ensure_directories,
)
# Utilities
from .utils import (
encode_image_to_data_url,
encode_image_to_base64,
dummy_process_scan,
get_media_type,
)
# Vision - Scan Analysis
from .vision import (
analyze_scan_structure_with_ai,
describe_scan_with_ai,
remove_handwriting_from_scan,
build_clean_html_from_analysis,
)
# Generators
from .generators import (
generate_mc_from_analysis,
generate_cloze_from_analysis,
generate_qa_from_analysis,
update_leitner_progress,
get_next_review_items,
)
# Export - Print Versions
from .export import (
generate_print_version_qa,
generate_print_version_cloze,
generate_print_version_mc,
generate_print_version_worksheet,
)
# Visualization - Mindmap
from .visualization import (
generate_mindmap_data,
generate_mindmap_html,
save_mindmap_for_worksheet,
)
# Legacy aliases for backward compatibility
# (old private names from before the package was split into submodules;
# kept so existing call sites keep working)
_get_api_key = get_openai_api_key
_encode_image_to_data_url = encode_image_to_data_url

# Explicit public API of the ai_processor package, grouped by submodule.
__all__ = [
    # Config
    "BASE_DIR",
    "EINGANG_DIR",
    "BEREINIGT_DIR",
    "VISION_API",
    "get_openai_api_key",
    "get_anthropic_api_key",
    "ensure_directories",
    # Utils
    "encode_image_to_data_url",
    "encode_image_to_base64",
    "dummy_process_scan",
    "get_media_type",
    # Vision
    "analyze_scan_structure_with_ai",
    "describe_scan_with_ai",
    "remove_handwriting_from_scan",
    "build_clean_html_from_analysis",
    # Generators
    "generate_mc_from_analysis",
    "generate_cloze_from_analysis",
    "generate_qa_from_analysis",
    "update_leitner_progress",
    "get_next_review_items",
    # Export
    "generate_print_version_qa",
    "generate_print_version_cloze",
    "generate_print_version_mc",
    "generate_print_version_worksheet",
    # Visualization
    "generate_mindmap_data",
    "generate_mindmap_html",
    "save_mindmap_for_worksheet",
    # Legacy aliases
    "_get_api_key",
    "_encode_image_to_data_url",
]

View File

@@ -0,0 +1,43 @@
"""
AI Processor - Configuration
API keys, constants, and directory paths.
"""
from pathlib import Path
import os
import logging
logger = logging.getLogger(__name__)
# Directory Configuration
# All worksheet data lives under ~/Arbeitsblaetter: incoming scans in
# Eingang, cleaned/generated output in Bereinigt (created by ensure_directories).
BASE_DIR = Path.home() / "Arbeitsblaetter"
EINGANG_DIR = BASE_DIR / "Eingang"
BEREINIGT_DIR = BASE_DIR / "Bereinigt"
# Vision API Configuration
# Set VISION_API environment variable to "openai" or "claude" (default: claude)
VISION_API = os.getenv("VISION_API", "claude").lower()
def get_openai_api_key() -> str:
    """Return the OpenAI API key from the OPENAI_API_KEY environment variable.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    key = os.environ.get("OPENAI_API_KEY")
    if key:
        return key
    raise RuntimeError("OPENAI_API_KEY ist nicht gesetzt. Bitte API-Schluessel als Umgebungsvariable setzen.")
def get_anthropic_api_key() -> str:
    """Return the Anthropic API key from the ANTHROPIC_API_KEY environment variable.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    key = os.environ.get("ANTHROPIC_API_KEY")
    if key:
        return key
    raise RuntimeError("ANTHROPIC_API_KEY ist nicht gesetzt.")
# Ensure directories exist
def ensure_directories():
    """Create the input (Eingang) and output (Bereinigt) directories if missing."""
    for directory in (EINGANG_DIR, BEREINIGT_DIR):
        directory.mkdir(parents=True, exist_ok=True)

View File

@@ -0,0 +1,19 @@
"""
AI Processor - Export Module
Print version generation and worksheet export.
"""
from .print_versions import (
generate_print_version_qa,
generate_print_version_cloze,
generate_print_version_mc,
)
from .worksheet import generate_print_version_worksheet
# Explicit public API of the export subpackage.
__all__ = [
    "generate_print_version_qa",
    "generate_print_version_cloze",
    "generate_print_version_mc",
    "generate_print_version_worksheet",
]

View File

@@ -0,0 +1,508 @@
"""
AI Processor - Print Version Generators
Generate printable HTML versions for Q&A, Cloze, and Multiple Choice.
"""
from pathlib import Path
import json
import logging
import random
from ..config import BEREINIGT_DIR
logger = logging.getLogger(__name__)
def generate_print_version_qa(qa_path: Path, include_answers: bool = False) -> Path:
    """
    Generate a printable HTML version of the Q&A pairs.

    Args:
        qa_path: Path to *_qa.json file
        include_answers: True for solution sheet (for parents)

    Returns:
        Path to generated HTML file

    Raises:
        FileNotFoundError: if qa_path does not exist.
    """
    if not qa_path.exists():
        raise FileNotFoundError(f"Q&A-Datei nicht gefunden: {qa_path}")
    qa_data = json.loads(qa_path.read_text(encoding="utf-8"))
    items = qa_data.get("qa_items", [])
    metadata = qa_data.get("metadata", {})
    title = metadata.get("source_title", "Arbeitsblatt")
    subject = metadata.get("subject", "")
    grade = metadata.get("grade_level", "")
    html_parts = []
    html_parts.append(_get_qa_html_header(title))
    # Header: sheet type depends on whether answers are included
    version_text = "Loesungsblatt" if include_answers else "Fragenblatt"
    html_parts.append(f"<h1>{title} - {version_text}</h1>")
    meta_parts = []
    if subject:
        meta_parts.append(f"Fach: {subject}")
    if grade:
        meta_parts.append(f"Klasse: {grade}")
    meta_parts.append(f"Anzahl Fragen: {len(items)}")
    html_parts.append(f"<div class='meta'>{' | '.join(meta_parts)}</div>")
    # Questions
    for idx, item in enumerate(items, 1):
        html_parts.append("<div class='question-block'>")
        html_parts.append(f"<div class='question-number'>Frage {idx}</div>")
        html_parts.append(f"<div class='question-text'>{item.get('question', '')}</div>")
        if include_answers:
            # Solution sheet: printed answer plus highlighted key terms
            html_parts.append(f"<div class='answer'><strong>Antwort:</strong> {item.get('answer', '')}</div>")
            key_terms = item.get("key_terms", [])
            if key_terms:
                terms_html = " ".join([f"<span>{term}</span>" for term in key_terms])
                html_parts.append(f"<div class='key-terms'>Wichtige Begriffe: {terms_html}</div>")
        else:
            # Question sheet: three ruled lines for a handwritten answer
            html_parts.append("<div class='answer-lines'>")
            for _ in range(3):
                html_parts.append("<div class='answer-line'></div>")
            html_parts.append("</div>")
        html_parts.append("</div>")
    html_parts.append("</body></html>")
    # Save next to the other generated artifacts in BEREINIGT_DIR
    suffix = "_qa_solutions.html" if include_answers else "_qa_print.html"
    out_name = qa_path.stem.replace("_qa", "") + suffix
    out_path = BEREINIGT_DIR / out_name
    out_path.write_text("\n".join(html_parts), encoding="utf-8")
    # Lazy %-formatting: message is only built if INFO logging is enabled
    logger.info("Print-Version gespeichert: %s", out_path.name)
    return out_path
def generate_print_version_cloze(cloze_path: Path, include_answers: bool = False) -> Path:
    """
    Generate a printable HTML version of the cloze texts.

    Args:
        cloze_path: Path to *_cloze.json file
        include_answers: True for solution sheet (for parents)

    Returns:
        Path to generated HTML file

    Raises:
        FileNotFoundError: if cloze_path does not exist.
    """
    if not cloze_path.exists():
        raise FileNotFoundError(f"Cloze-Datei nicht gefunden: {cloze_path}")
    cloze_data = json.loads(cloze_path.read_text(encoding="utf-8"))
    items = cloze_data.get("cloze_items", [])
    metadata = cloze_data.get("metadata", {})
    title = metadata.get("source_title", "Arbeitsblatt")
    subject = metadata.get("subject", "")
    grade = metadata.get("grade_level", "")
    total_gaps = metadata.get("total_gaps", 0)
    html_parts = []
    html_parts.append(_get_cloze_html_header(title))
    # Header
    version_text = "Loesungsblatt" if include_answers else "Lueckentext"
    html_parts.append(f"<h1>{title} - {version_text}</h1>")
    meta_parts = []
    if subject:
        meta_parts.append(f"Fach: {subject}")
    if grade:
        meta_parts.append(f"Klasse: {grade}")
    meta_parts.append(f"Luecken gesamt: {total_gaps}")
    html_parts.append(f"<div class='meta'>{' | '.join(meta_parts)}</div>")
    # Collect all gap words for word bank
    all_words = []
    # Cloze texts
    for idx, item in enumerate(items, 1):
        html_parts.append("<div class='cloze-item'>")
        html_parts.append(f"<div class='cloze-number'>{idx}.</div>")
        gaps = item.get("gaps", [])
        sentence = item.get("sentence_with_gaps", "")
        if include_answers:
            # Solution sheet: fill gaps with answers, one "___" per gap in order
            for gap in gaps:
                word = gap.get("word", "")
                sentence = sentence.replace("___", f"<span class='gap-filled'>{word}</span>", 1)
        else:
            # Question sheet: gaps as lines
            sentence = sentence.replace("___", "<span class='gap'>&nbsp;</span>")
            for gap in gaps:
                all_words.append(gap.get("word", ""))
        html_parts.append(f"<div class='cloze-sentence'>{sentence}</div>")
        # Show translation
        translation = item.get("translation", {})
        if translation:
            lang_name = translation.get("language_name", "Uebersetzung")
            full_sentence = translation.get("full_sentence", "")
            if full_sentence:
                html_parts.append("<div class='translation'>")
                html_parts.append(f"<div class='translation-label'>{lang_name}:</div>")
                html_parts.append(full_sentence)
                html_parts.append("</div>")
        html_parts.append("</div>")
    # Word bank (only for question sheet); shuffled so the order gives no hints
    if not include_answers and all_words:
        random.shuffle(all_words)
        html_parts.append("<div class='word-bank'>")
        html_parts.append("<div class='word-bank-title'>Wortbank (diese Woerter fehlen):</div>")
        for word in all_words:
            html_parts.append(f"<span class='word'>{word}</span>")
        html_parts.append("</div>")
    html_parts.append("</body></html>")
    # Save
    suffix = "_cloze_solutions.html" if include_answers else "_cloze_print.html"
    out_name = cloze_path.stem.replace("_cloze", "") + suffix
    out_path = BEREINIGT_DIR / out_name
    out_path.write_text("\n".join(html_parts), encoding="utf-8")
    # Lazy %-formatting: message is only built if INFO logging is enabled
    logger.info("Cloze Print-Version gespeichert: %s", out_path.name)
    return out_path
def generate_print_version_mc(mc_path: Path, include_answers: bool = False) -> str:
    """
    Generate a printable HTML version of the multiple choice questions.

    Unlike the Q&A/cloze variants this does not write a file; it returns
    the finished HTML so the caller can deliver it directly.

    Args:
        mc_path: Path to *_mc.json file
        include_answers: True for solution sheet with marked correct answers

    Returns:
        HTML string (for direct delivery)

    Raises:
        FileNotFoundError: if mc_path does not exist.
    """
    if not mc_path.exists():
        raise FileNotFoundError(f"MC-Datei nicht gefunden: {mc_path}")
    mc_data = json.loads(mc_path.read_text(encoding="utf-8"))
    questions = mc_data.get("questions", [])
    metadata = mc_data.get("metadata", {})
    title = metadata.get("source_title", "Arbeitsblatt")
    subject = metadata.get("subject", "")
    grade = metadata.get("grade_level", "")
    html_parts = []
    html_parts.append(_get_mc_html_header(title))
    # Header
    version_text = "Loesungsblatt" if include_answers else "Multiple Choice Test"
    html_parts.append(f"<h1>{title}</h1>")
    html_parts.append(f"<div class='meta'><strong>{version_text}</strong>")
    if subject:
        html_parts.append(f" | Fach: {subject}")
    if grade:
        html_parts.append(f" | Klasse: {grade}")
    html_parts.append(f" | Anzahl Fragen: {len(questions)}</div>")
    # Instructions only on the question sheet
    if not include_answers:
        html_parts.append("<div class='instructions'>")
        html_parts.append("<strong>Anleitung:</strong> Kreuze bei jeder Frage die richtige Antwort an. ")
        html_parts.append("Es ist immer nur eine Antwort richtig.")
        html_parts.append("</div>")
    # Questions
    for idx, q in enumerate(questions, 1):
        html_parts.append("<div class='question-block'>")
        html_parts.append(f"<div class='question-number'>Frage {idx}</div>")
        html_parts.append(f"<div class='question-text'>{q.get('question', '')}</div>")
        html_parts.append("<div class='options'>")
        correct_answer = q.get("correct_answer", "")
        for opt in q.get("options", []):
            opt_id = opt.get("id", "")
            is_correct = opt_id == correct_answer
            opt_class = "option"
            checkbox_class = "option-checkbox"
            # Only the solution sheet highlights the correct option
            if include_answers and is_correct:
                opt_class += " option-correct"
                checkbox_class += " checked"
            html_parts.append(f"<div class='{opt_class}'>")
            html_parts.append(f"<div class='{checkbox_class}'></div>")
            html_parts.append(f"<span class='option-label'>{opt_id})</span>")
            html_parts.append(f"<span class='option-text'>{opt.get('text', '')}</span>")
            html_parts.append("</div>")
        html_parts.append("</div>")
        # Explanation only for solution sheet
        if include_answers and q.get("explanation"):
            html_parts.append(f"<div class='explanation'><strong>Erklaerung:</strong> {q.get('explanation')}</div>")
        html_parts.append("</div>")
    # Answer key (compact) - only for solution sheet
    if include_answers:
        html_parts.append("<div class='answer-key'>")
        html_parts.append("<div class='answer-key-title'>Loesungsschluessel</div>")
        html_parts.append("<div class='answer-key-grid'>")
        for idx, q in enumerate(questions, 1):
            html_parts.append("<div class='answer-key-item'>")
            html_parts.append(f"<span class='answer-key-q'>{idx}.</span> ")
            html_parts.append(f"<span class='answer-key-a'>{q.get('correct_answer', '')}</span>")
            html_parts.append("</div>")
        html_parts.append("</div>")
        html_parts.append("</div>")
    html_parts.append("</body></html>")
    return "\n".join(html_parts)
def _get_qa_html_header(title: str) -> str:
    """Get HTML header for Q&A print version.

    Returns the document opening (doctype, <head> with embedded print-friendly
    CSS, and the opening <body> tag). The caller appends the content and the
    closing </body></html> tags.
    """
    return f"""<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<title>{title} - Fragen</title>
<style>
@media print {{
.no-print {{ display: none; }}
.page-break {{ page-break-before: always; }}
}}
body {{
font-family: Arial, sans-serif;
max-width: 800px;
margin: 40px auto;
padding: 20px;
line-height: 1.6;
}}
h1 {{ font-size: 24px; margin-bottom: 8px; }}
.meta {{ color: #666; margin-bottom: 24px; }}
.question-block {{
margin-bottom: 32px;
padding-bottom: 16px;
border-bottom: 1px dashed #ccc;
}}
.question-number {{ font-weight: bold; color: #333; }}
.question-text {{ font-size: 16px; margin: 8px 0; }}
.answer-lines {{ margin-top: 12px; }}
.answer-line {{ border-bottom: 1px solid #999; height: 28px; }}
.answer {{
margin-top: 8px;
padding: 8px;
background: #e8f5e9;
border-left: 3px solid #4caf50;
}}
.key-terms {{ font-size: 12px; color: #666; margin-top: 8px; }}
.key-terms span {{
background: #fff3e0;
padding: 2px 6px;
border-radius: 3px;
margin-right: 4px;
}}
</style>
</head>
<body>
"""
def _get_cloze_html_header(title: str) -> str:
    """Get HTML header for cloze print version.

    Returns the document opening (doctype, <head> with embedded print-friendly
    CSS, and the opening <body> tag). The caller appends the content and the
    closing </body></html> tags.
    """
    return f"""<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<title>{title} - Lueckentext</title>
<style>
@media print {{
.no-print {{ display: none; }}
.page-break {{ page-break-before: always; }}
}}
body {{
font-family: Arial, sans-serif;
max-width: 800px;
margin: 40px auto;
padding: 20px;
line-height: 1.8;
}}
h1 {{ font-size: 24px; margin-bottom: 8px; }}
.meta {{ color: #666; margin-bottom: 24px; }}
.cloze-item {{
margin-bottom: 24px;
padding: 16px;
background: #f9f9f9;
border-radius: 8px;
}}
.cloze-number {{ font-weight: bold; color: #333; margin-bottom: 8px; }}
.cloze-sentence {{ font-size: 16px; line-height: 2; }}
.gap {{
display: inline-block;
min-width: 80px;
border-bottom: 2px solid #333;
margin: 0 4px;
text-align: center;
}}
.gap-filled {{
display: inline-block;
padding: 2px 8px;
background: #e8f5e9;
border: 1px solid #4caf50;
border-radius: 4px;
font-weight: bold;
}}
.translation {{
margin-top: 12px;
padding: 8px;
background: #e3f2fd;
border-left: 3px solid #2196f3;
font-size: 14px;
color: #555;
}}
.translation-label {{ font-size: 12px; color: #777; margin-bottom: 4px; }}
.word-bank {{
margin-top: 32px;
padding: 16px;
background: #fff3e0;
border-radius: 8px;
}}
.word-bank-title {{ font-weight: bold; margin-bottom: 12px; }}
.word {{
display: inline-block;
padding: 4px 12px;
margin: 4px;
background: white;
border: 1px solid #ddd;
border-radius: 4px;
}}
</style>
</head>
<body>
"""
def _get_mc_html_header(title: str) -> str:
"""Get HTML header for MC print version."""
return f"""<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<title>{title} - Multiple Choice</title>
<style>
@media print {{
.no-print {{ display: none; }}
.page-break {{ page-break-before: always; }}
body {{ font-size: 14pt; }}
}}
body {{
font-family: Arial, Helvetica, sans-serif;
max-width: 800px;
margin: 40px auto;
padding: 20px;
line-height: 1.6;
color: #000;
}}
h1 {{
font-size: 28px;
margin-bottom: 8px;
border-bottom: 2px solid #000;
padding-bottom: 8px;
}}
.meta {{ color: #333; margin-bottom: 32px; font-size: 14px; }}
.instructions {{
background: #f5f5f5;
padding: 12px 16px;
border-radius: 4px;
margin-bottom: 24px;
font-size: 14px;
}}
.question-block {{
margin-bottom: 28px;
padding-bottom: 16px;
border-bottom: 1px solid #ddd;
}}
.question-number {{ font-weight: bold; font-size: 18px; color: #000; margin-bottom: 8px; }}
.question-text {{ font-size: 16px; margin: 8px 0 16px 0; line-height: 1.5; }}
.options {{ margin-left: 20px; }}
.option {{
display: flex;
align-items: flex-start;
margin-bottom: 12px;
padding: 8px 12px;
border: 1px solid #ccc;
border-radius: 4px;
background: #fff;
}}
.option-correct {{
background: #e8f5e9;
border-color: #4caf50;
border-width: 2px;
}}
.option-checkbox {{
width: 20px;
height: 20px;
border: 2px solid #333;
border-radius: 50%;
margin-right: 12px;
flex-shrink: 0;
display: flex;
align-items: center;
justify-content: center;
}}
.option-checkbox.checked::after {{
content: "";
font-weight: bold;
color: #4caf50;
}}
.option-label {{ font-weight: bold; margin-right: 8px; min-width: 24px; }}
.option-text {{ flex: 1; }}
.explanation {{
margin-top: 8px;
padding: 8px 12px;
background: #e3f2fd;
border-left: 3px solid #2196f3;
font-size: 13px;
color: #333;
}}
.answer-key {{
margin-top: 40px;
padding: 16px;
background: #f5f5f5;
border-radius: 8px;
}}
.answer-key-title {{
font-weight: bold;
font-size: 18px;
margin-bottom: 12px;
border-bottom: 1px solid #999;
padding-bottom: 8px;
}}
.answer-key-grid {{
display: grid;
grid-template-columns: repeat(5, 1fr);
gap: 8px;
}}
.answer-key-item {{
padding: 8px;
text-align: center;
background: white;
border: 1px solid #ddd;
border-radius: 4px;
}}
.answer-key-q {{ font-weight: bold; }}
.answer-key-a {{ color: #4caf50; font-weight: bold; }}
</style>
</head>
<body>
"""

View File

@@ -0,0 +1,286 @@
"""
AI Processor - Worksheet Export
Generate printable worksheet versions.
"""
from pathlib import Path
import json
import logging
logger = logging.getLogger(__name__)
def generate_print_version_worksheet(analysis_path: Path) -> str:
    """
    Generate a print-optimized HTML version of the worksheet.

    Features:
    - Large, readable font (16pt)
    - Black and white / grayscale compatible
    - Clear structure for printing
    - No interactive elements

    Args:
        analysis_path: Path to *_analyse.json file

    Returns:
        HTML string for direct delivery

    Raises:
        FileNotFoundError: if analysis_path does not exist.
        RuntimeError: if the analysis file is not valid JSON.
    """
    if not analysis_path.exists():
        raise FileNotFoundError(f"Analysedatei nicht gefunden: {analysis_path}")
    try:
        data = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Analyse-Datei enthaelt kein gueltiges JSON: {analysis_path}\n{e}") from e
    title = data.get("title") or "Arbeitsblatt"
    subject = data.get("subject") or ""
    grade_level = data.get("grade_level") or ""
    instructions = data.get("instructions") or ""
    tasks = data.get("tasks", []) or []
    canonical_text = data.get("canonical_text") or ""
    printed_blocks = data.get("printed_blocks") or []
    html_parts = []
    html_parts.append(_get_worksheet_html_header(title))
    # Print button (hidden by the .no-print CSS rule when actually printing)
    html_parts.append('<button class="print-button no-print" onclick="window.print()">🖨️ Drucken</button>')
    # Title
    html_parts.append(f"<h1>{title}</h1>")
    # Meta information
    meta_parts = []
    if subject:
        meta_parts.append(f"<span><strong>Fach:</strong> {subject}</span>")
    if grade_level:
        meta_parts.append(f"<span><strong>Klasse:</strong> {grade_level}</span>")
    if meta_parts:
        html_parts.append(f"<div class='meta'>{''.join(meta_parts)}</div>")
    # Instructions
    if instructions:
        html_parts.append("<div class='instructions'>")
        html_parts.append("<div class='instructions-label'>Arbeitsanweisung:</div>")
        html_parts.append(f"<div>{instructions}</div>")
        html_parts.append("</div>")
    # Main text: prefer structured printed_blocks, fall back to canonical_text.
    # (A dead `has_text_content` flag was removed; it was set but never read.)
    if printed_blocks:
        html_parts.append("<section class='text-section'>")
        for block in printed_blocks:
            role = (block.get("role") or "body").lower()
            text = (block.get("text") or "").strip()
            if not text:
                continue
            if role == "title":
                html_parts.append(f"<div class='text-block'><div class='text-block-title'>{text}</div></div>")
            else:
                html_parts.append(f"<div class='text-block'>{text}</div>")
        html_parts.append("</section>")
    elif canonical_text:
        html_parts.append("<section class='text-section'>")
        # Normalize Windows line endings, then split on blank lines into paragraphs
        paragraphs = [
            p.strip()
            for p in canonical_text.replace("\r\n", "\n").split("\n\n")
            if p.strip()
        ]
        for p in paragraphs:
            html_parts.append(f"<div class='text-block'>{p}</div>")
        html_parts.append("</section>")
    # Tasks
    if tasks:
        html_parts.append("<section class='task-section'>")
        html_parts.append("<h2>Aufgaben</h2>")
        # Analysis task types -> German display labels (hoisted out of the loop;
        # unknown types fall back to the raw type string via dict.get)
        type_labels = {
            "fill_in_blank": "Lueckentext",
            "multiple_choice": "Multiple Choice",
            "free_text": "Freitext",
            "matching": "Zuordnung",
            "labeling": "Beschriftung",
            "calculation": "Rechnung",
            "other": "Aufgabe"
        }
        for idx, task in enumerate(tasks, start=1):
            t_type = task.get("type") or "Aufgabe"
            desc = task.get("description") or ""
            text_with_gaps = task.get("text_with_gaps")
            html_parts.append("<div class='task'>")
            # Task header
            type_label = type_labels.get(t_type, t_type)
            html_parts.append(f"<div class='task-header'>Aufgabe {idx}: {type_label}</div>")
            if desc:
                html_parts.append(f"<div class='task-content'>{desc}</div>")
            if text_with_gaps:
                rendered = text_with_gaps.replace("___", "<span class='gap-line'>&nbsp;</span>")
                html_parts.append(f"<div class='task-content' style='margin-top:12px;'>{rendered}</div>")
            # Answer lines for free text tasks (or tasks without any content)
            if t_type in ["free_text", "other"] or (not text_with_gaps and not desc):
                html_parts.append("<div class='answer-lines'>")
                for _ in range(3):
                    html_parts.append("<div class='answer-line'></div>")
                html_parts.append("</div>")
            html_parts.append("</div>")
        html_parts.append("</section>")
    # Footer
    html_parts.append("<div class='footer'>")
    html_parts.append("Dieses Arbeitsblatt wurde automatisch aus einem Scan rekonstruiert.")
    html_parts.append("</div>")
    html_parts.append("</body></html>")
    return "\n".join(html_parts)
def _get_worksheet_html_header(title: str) -> str:
    """Get HTML header for worksheet print version.

    Returns the document opening (doctype, <head> with embedded A4/print CSS,
    and the opening <body> tag). The caller appends the content and the
    closing </body></html> tags.
    """
    return f"""<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<title>{title}</title>
<style>
@page {{
size: A4;
margin: 20mm;
}}
@media print {{
body {{
font-size: 14pt !important;
-webkit-print-color-adjust: exact;
print-color-adjust: exact;
}}
.no-print {{ display: none !important; }}
.page-break {{ page-break-before: always; }}
}}
* {{ box-sizing: border-box; }}
body {{
font-family: Arial, "Helvetica Neue", sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 30px;
line-height: 1.7;
font-size: 16px;
color: #000;
background: #fff;
}}
h1 {{
font-size: 28px;
margin: 0 0 8px 0;
padding-bottom: 8px;
border-bottom: 3px solid #000;
}}
h2 {{
font-size: 20px;
margin: 28px 0 12px 0;
padding-bottom: 4px;
border-bottom: 1px solid #666;
}}
.meta {{
font-size: 14px;
color: #333;
margin-bottom: 20px;
padding: 8px 0;
}}
.meta span {{
margin-right: 20px;
}}
.instructions {{
margin: 20px 0;
padding: 16px;
border: 2px solid #333;
background: #f5f5f5;
font-size: 15px;
}}
.instructions-label {{
font-weight: bold;
margin-bottom: 8px;
}}
.text-section {{
margin: 24px 0;
}}
.text-block {{
margin-bottom: 16px;
text-align: justify;
}}
.text-block-title {{
font-weight: bold;
font-size: 17px;
margin-bottom: 8px;
}}
.task-section {{
margin-top: 32px;
}}
.task {{
margin-bottom: 24px;
padding: 16px;
border: 1px solid #999;
background: #fafafa;
}}
.task-header {{
font-weight: bold;
font-size: 16px;
margin-bottom: 12px;
padding-bottom: 8px;
border-bottom: 1px dashed #666;
}}
.task-content {{
font-size: 15px;
}}
.gap-line {{
display: inline-block;
border-bottom: 2px solid #000;
min-width: 100px;
margin: 0 6px;
}}
.answer-lines {{
margin-top: 16px;
}}
.answer-line {{
border-bottom: 1px solid #333;
height: 36px;
margin-bottom: 4px;
}}
.footer {{
margin-top: 40px;
padding-top: 16px;
border-top: 1px solid #ccc;
font-size: 11px;
color: #666;
text-align: center;
}}
.print-button {{
position: fixed;
top: 20px;
right: 20px;
padding: 12px 24px;
background: #333;
color: #fff;
border: none;
border-radius: 6px;
cursor: pointer;
font-size: 14px;
}}
.print-button:hover {{
background: #555;
}}
</style>
</head>
<body>
"""

View File

@@ -0,0 +1,21 @@
"""
AI Processor - Generators Module
Content generation for multiple choice, cloze, and Q&A.
"""
from .multiple_choice import generate_mc_from_analysis
from .cloze import generate_cloze_from_analysis
from .qa import (
generate_qa_from_analysis,
update_leitner_progress,
get_next_review_items,
)
# Explicit public API of the generators subpackage.
__all__ = [
    "generate_mc_from_analysis",
    "generate_cloze_from_analysis",
    "generate_qa_from_analysis",
    "update_leitner_progress",
    "get_next_review_items",
]

View File

@@ -0,0 +1,312 @@
"""
AI Processor - Cloze Text Generator
Generate cloze (fill-in-the-blank) texts from worksheet analysis.
"""
from pathlib import Path
import json
import logging
import os
import requests
from ..config import VISION_API, BEREINIGT_DIR, get_openai_api_key
logger = logging.getLogger(__name__)
# Language codes to names
# ISO-639-1 codes mapped to German language names (ASCII-transliterated,
# matching the rest of the file); used to phrase the translation request in
# the generation prompts. Unknown codes fall back to "Tuerkisch" at the
# lookup sites.
LANGUAGE_NAMES = {
    "tr": "Tuerkisch",
    "ar": "Arabisch",
    "ru": "Russisch",
    "en": "Englisch",
    "fr": "Franzoesisch",
    "es": "Spanisch",
    "pl": "Polnisch",
    "uk": "Ukrainisch",
}
def _generate_cloze_with_openai(analysis_data: dict, target_language: str = "tr") -> dict:
    """
    Generate cloze texts based on worksheet analysis.

    Important didactic requirements:
    - Multiple meaningful gaps per sentence (not just one!)
    - Difficulty level matches the original
    - Translation with the same gaps

    Args:
        analysis_data: The analysis JSON of the worksheet
        target_language: Target language for translation (default: "tr" for Turkish)

    Returns:
        Dict with cloze_items and metadata

    Raises:
        RuntimeError: if the API response is missing fields or is not valid JSON.
        requests.HTTPError: if the API returns an error status.
    """
    api_key = get_openai_api_key()
    title = analysis_data.get("title") or "Arbeitsblatt"
    subject = analysis_data.get("subject") or "Allgemein"
    grade_level = analysis_data.get("grade_level") or "unbekannt"
    canonical_text = analysis_data.get("canonical_text") or ""
    printed_blocks = analysis_data.get("printed_blocks") or []
    # Combine canonical text with any printed blocks not already contained
    content_parts = []
    if canonical_text:
        content_parts.append(canonical_text)
    for block in printed_blocks:
        text = block.get("text", "").strip()
        if text and text not in content_parts:
            content_parts.append(text)
    worksheet_content = "\n\n".join(content_parts)
    if not worksheet_content.strip():
        logger.warning("Kein Textinhalt fuer Lueckentext-Generierung gefunden")
        return {"cloze_items": [], "metadata": {"error": "Kein Textinhalt gefunden"}}
    target_lang_name = LANGUAGE_NAMES.get(target_language, "Tuerkisch")
    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    system_prompt = f"""Du bist ein erfahrener Paedagoge, der Lueckentexte fuer Schueler erstellt.
WICHTIGE REGELN FUER LUECKENTEXTE:
1. MEHRERE LUECKEN PRO SATZ:
- Erstelle IMMER mehrere sinnvolle Luecken pro Satz
- Beispiel: "Ich habe gestern meine Hausaufgaben gemacht."
→ Luecken: "habe" UND "gemacht" (nicht nur eine!)
2. SCHWIERIGKEITSGRAD:
- Niveau muss exakt "{grade_level}" entsprechen
3. SINNVOLLE LUECKENWOERTER:
- Verben (konjugiert)
- Wichtige Nomen
- Adjektive
- KEINE Artikel oder Praepositionen allein
4. UEBERSETZUNG:
- Uebersetze den VOLLSTAENDIGEN Satz auf {target_lang_name}
- Die GLEICHEN Woerter muessen als Luecken markiert sein
5. AUSGABE: Nur gueltiges JSON, kein Markdown."""
    user_prompt = f"""Erstelle Lueckentexte aus diesem Arbeitsblatt:
TITEL: {title}
FACH: {subject}
KLASSENSTUFE: {grade_level}
TEXT:
{worksheet_content}
Erstelle 5-8 Saetze mit Luecken. Gib das Ergebnis als JSON zurueck:
{{
"cloze_items": [
{{
"id": "c1",
"original_sentence": "Der vollstaendige Originalsatz ohne Luecken",
"sentence_with_gaps": "Der Satz mit ___ fuer jede Luecke",
"gaps": [
{{
"id": "g1",
"word": "das fehlende Wort",
"position": 0,
"hint": "optionaler Hinweis"
}}
],
"translation": {{
"language": "{target_language}",
"language_name": "{target_lang_name}",
"full_sentence": "Vollstaendige Uebersetzung",
"sentence_with_gaps": "Uebersetzung mit ___ an gleichen Stellen"
}}
}}
],
"metadata": {{
"subject": "{subject}",
"grade_level": "{grade_level}",
"source_title": "{title}",
"target_language": "{target_language}",
"total_gaps": 0
}}
}}
WICHTIG:
- Jeder Satz MUSS mindestens 2 Luecken haben!
- Position ist der Index des Wortes im Satz (0-basiert)"""
    payload = {
        "model": "gpt-4o-mini",
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "max_tokens": 3000,
        "temperature": 0.7,
    }
    # Explicit timeout: without one, requests can block forever on a stuck
    # connection and hang the whole processing pipeline.
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    data = response.json()
    try:
        content = data["choices"][0]["message"]["content"]
        cloze_data = json.loads(content)
    except (KeyError, json.JSONDecodeError) as e:
        # Chain the cause so the original parsing error stays in the traceback
        raise RuntimeError(f"Fehler bei Lueckentext-Generierung: {e}") from e
    # Calculate total number of gaps
    total_gaps = sum(len(item.get("gaps", [])) for item in cloze_data.get("cloze_items", []))
    if "metadata" in cloze_data:
        cloze_data["metadata"]["total_gaps"] = total_gaps
    return cloze_data
def _generate_cloze_with_claude(analysis_data: dict, target_language: str = "tr") -> dict:
    """Generate cloze texts with Claude API.

    Mirrors _generate_cloze_with_openai; the anthropic SDK is imported lazily
    so the dependency stays optional for OpenAI-only deployments.

    Args:
        analysis_data: The analysis JSON of the worksheet
        target_language: Target language for translation (default: "tr")

    Returns:
        Dict with cloze_items and metadata

    Raises:
        RuntimeError: if ANTHROPIC_API_KEY is unset or the model returns
            invalid JSON.
    """
    import anthropic
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise RuntimeError("ANTHROPIC_API_KEY ist nicht gesetzt.")
    client = anthropic.Anthropic(api_key=api_key)
    title = analysis_data.get("title") or "Arbeitsblatt"
    subject = analysis_data.get("subject") or "Allgemein"
    grade_level = analysis_data.get("grade_level") or "unbekannt"
    canonical_text = analysis_data.get("canonical_text") or ""
    printed_blocks = analysis_data.get("printed_blocks") or []
    # Combine canonical text with any printed blocks not already contained
    content_parts = []
    if canonical_text:
        content_parts.append(canonical_text)
    for block in printed_blocks:
        text = block.get("text", "").strip()
        if text and text not in content_parts:
            content_parts.append(text)
    worksheet_content = "\n\n".join(content_parts)
    if not worksheet_content.strip():
        return {"cloze_items": [], "metadata": {"error": "Kein Textinhalt gefunden"}}
    target_lang_name = LANGUAGE_NAMES.get(target_language, "Tuerkisch")
    prompt = f"""Erstelle Lueckentexte aus diesem Arbeitsblatt.
WICHTIGE REGELN:
1. MEHRERE LUECKEN PRO SATZ (mindestens 2!)
Beispiel: "Ich habe gestern Hausaufgaben gemacht" → Luecken: "habe" UND "gemacht"
2. Schwierigkeitsgrad: exakt "{grade_level}"
3. Uebersetzung auf {target_lang_name} mit gleichen Luecken
TITEL: {title}
FACH: {subject}
KLASSENSTUFE: {grade_level}
TEXT:
{worksheet_content}
Antworte NUR mit diesem JSON (5-8 Saetze):
{{
"cloze_items": [
{{
"id": "c1",
"original_sentence": "Vollstaendiger Satz",
"sentence_with_gaps": "Satz mit ___ fuer Luecken",
"gaps": [
{{"id": "g1", "word": "Lueckenwort", "position": 0, "hint": "Hinweis"}}
],
"translation": {{
"language": "{target_language}",
"language_name": "{target_lang_name}",
"full_sentence": "Uebersetzung",
"sentence_with_gaps": "Uebersetzung mit ___"
}}
}}
],
"metadata": {{
"subject": "{subject}",
"grade_level": "{grade_level}",
"source_title": "{title}",
"target_language": "{target_language}",
"total_gaps": 0
}}
}}"""
    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=3000,
        messages=[{"role": "user", "content": prompt}]
    )
    content = message.content[0].text
    try:
        # Strip an optional markdown code fence around the JSON answer
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0]
        elif "```" in content:
            content = content.split("```")[1].split("```")[0]
        cloze_data = json.loads(content.strip())
    except json.JSONDecodeError as e:
        # Chain the cause so the original parsing error stays in the traceback
        raise RuntimeError(f"Claude hat ungueltiges JSON geliefert: {e}") from e
    # Calculate total number of gaps
    total_gaps = sum(len(item.get("gaps", [])) for item in cloze_data.get("cloze_items", []))
    if "metadata" in cloze_data:
        cloze_data["metadata"]["total_gaps"] = total_gaps
    return cloze_data
def generate_cloze_from_analysis(analysis_path: Path, target_language: str = "tr") -> Path:
    """Generate cloze texts from an analysis JSON file.

    The generated cloze texts have multiple meaningful gaps per sentence,
    match the difficulty level of the original worksheet, and include a
    translation into the target language.

    Args:
        analysis_path: Path to a *_analyse.json file.
        target_language: Language code for the translation (default "tr"
            for Turkish).

    Returns:
        Path to the generated *_cloze.json file.

    Raises:
        FileNotFoundError: If the analysis file does not exist.
        RuntimeError: If the analysis file is not valid JSON.
    """
    if not analysis_path.exists():
        raise FileNotFoundError(f"Analysedatei nicht gefunden: {analysis_path}")
    try:
        worksheet_analysis = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Ungueltige Analyse-JSON: {e}")
    logger.info(f"Generiere Lueckentexte fuer: {analysis_path.name}")

    # Claude is preferred when configured; OpenAI serves as the fallback.
    if VISION_API == "claude":
        try:
            cloze_data = _generate_cloze_with_claude(worksheet_analysis, target_language)
        except Exception as e:
            logger.warning(f"Claude Lueckentext-Generierung fehlgeschlagen, nutze OpenAI: {e}")
            cloze_data = _generate_cloze_with_openai(worksheet_analysis, target_language)
    else:
        cloze_data = _generate_cloze_with_openai(worksheet_analysis, target_language)

    # Persist the result next to the other cleaned artifacts.
    target_file = BEREINIGT_DIR / (analysis_path.stem.replace("_analyse", "") + "_cloze.json")
    target_file.write_text(json.dumps(cloze_data, ensure_ascii=False, indent=2), encoding="utf-8")
    logger.info(f"Lueckentexte gespeichert: {target_file.name}")
    return target_file

View File

@@ -0,0 +1,291 @@
"""
AI Processor - Multiple Choice Generator
Generate multiple choice questions from worksheet analysis.
"""
from pathlib import Path
import json
import logging
import random
import os
import requests
from ..config import VISION_API, BEREINIGT_DIR, get_openai_api_key
logger = logging.getLogger(__name__)
def _generate_mc_with_openai(analysis_data: dict, num_questions: int = 5) -> dict:
    """Generate multiple choice questions based on worksheet analysis.

    Uses OpenAI gpt-4o-mini; the difficulty level matches the original
    worksheet's grade_level taken from the analysis.

    Args:
        analysis_data: Parsed *_analyse.json content (title, subject,
            grade_level, canonical_text, printed_blocks).
        num_questions: Number of questions to request (default 5).

    Returns:
        Dict with "questions" and "metadata". If the analysis contains no
        usable text, returns an empty question list with an error marker.

    Raises:
        RuntimeError: If the API response is not the expected JSON.
    """
    api_key = get_openai_api_key()
    # Missing analysis fields fall back to neutral defaults.
    title = analysis_data.get("title") or "Arbeitsblatt"
    subject = analysis_data.get("subject") or "Allgemein"
    grade_level = analysis_data.get("grade_level") or "unbekannt"
    canonical_text = analysis_data.get("canonical_text") or ""
    printed_blocks = analysis_data.get("printed_blocks") or []
    # Assemble unique text fragments: canonical text first, then printed blocks.
    content_parts = []
    if canonical_text:
        content_parts.append(canonical_text)
    for block in printed_blocks:
        text = block.get("text", "").strip()
        if text and text not in content_parts:
            content_parts.append(text)
    worksheet_content = "\n\n".join(content_parts)
    if not worksheet_content.strip():
        logger.warning("Kein Textinhalt fuer MC-Generierung gefunden")
        return {"questions": [], "metadata": {"error": "Kein Textinhalt gefunden"}}
    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    system_prompt = f"""Du bist ein erfahrener Paedagoge, der Multiple-Choice-Fragen fuer Schueler erstellt.
WICHTIGE REGELN:
1. SCHWIERIGKEITSGRAD: Die Fragen muessen exakt dem Niveau "{grade_level}" entsprechen.
2. INHALTSTREUE: Alle Fragen muessen sich direkt auf den gegebenen Text beziehen.
3. QUALITAET DER DISTRAKTOREN: Muessen plausibel klingen, nicht offensichtlich falsch.
4. AUSGABEFORMAT: Gib deine Antwort AUSSCHLIESSLICH als gueltiges JSON zurueck."""
    user_prompt = f"""Erstelle {num_questions} Multiple-Choice-Fragen basierend auf diesem Arbeitsblatt:
TITEL: {title}
FACH: {subject}
KLASSENSTUFE: {grade_level}
INHALT DES ARBEITSBLATTS:
{worksheet_content}
Gib die Fragen als JSON zurueck:
{{
"questions": [
{{
"id": "q1",
"question": "Die Fragestellung hier",
"options": [
{{"id": "a", "text": "Antwort A"}},
{{"id": "b", "text": "Antwort B"}},
{{"id": "c", "text": "Antwort C"}},
{{"id": "d", "text": "Antwort D"}}
],
"correct_answer": "a",
"explanation": "Kurze Erklaerung warum diese Antwort richtig ist"
}}
],
"metadata": {{
"subject": "{subject}",
"grade_level": "{grade_level}",
"source_title": "{title}",
"num_questions": {num_questions}
}}
}}"""
    payload = {
        "model": "gpt-4o-mini",
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "max_tokens": 2000,
        "temperature": 0.7,
    }
    # Fix: explicit timeout — without one, requests.post can block forever
    # on a stalled connection.
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    data = response.json()
    try:
        content = data["choices"][0]["message"]["content"]
        mc_data = json.loads(content)
    except (KeyError, json.JSONDecodeError) as e:
        raise RuntimeError(f"Fehler bei MC-Generierung: {e}")
    return mc_data
def _generate_mc_with_claude(analysis_data: dict, num_questions: int = 5) -> dict:
    """Generate multiple choice questions with the Claude API.

    Builds one user prompt from the worksheet analysis (canonical text plus
    printed blocks) and asks Claude for a JSON payload of multiple-choice
    questions at the worksheet's grade level.

    Args:
        analysis_data: Parsed *_analyse.json content (title, subject,
            grade_level, canonical_text, printed_blocks).
        num_questions: Number of questions to request.

    Returns:
        Dict with "questions" and "metadata" keys; an empty question list
        with an error marker when the analysis holds no usable text.

    Raises:
        RuntimeError: If ANTHROPIC_API_KEY is unset or Claude returns
            invalid JSON.
    """
    # Imported lazily so the module loads even without the anthropic package.
    import anthropic
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise RuntimeError("ANTHROPIC_API_KEY ist nicht gesetzt.")
    client = anthropic.Anthropic(api_key=api_key)
    # Missing analysis fields fall back to neutral defaults.
    title = analysis_data.get("title") or "Arbeitsblatt"
    subject = analysis_data.get("subject") or "Allgemein"
    grade_level = analysis_data.get("grade_level") or "unbekannt"
    canonical_text = analysis_data.get("canonical_text") or ""
    printed_blocks = analysis_data.get("printed_blocks") or []
    # Collect unique text fragments: canonical text first, then printed blocks.
    content_parts = []
    if canonical_text:
        content_parts.append(canonical_text)
    for block in printed_blocks:
        text = block.get("text", "").strip()
        if text and text not in content_parts:
            content_parts.append(text)
    worksheet_content = "\n\n".join(content_parts)
    if not worksheet_content.strip():
        return {"questions": [], "metadata": {"error": "Kein Textinhalt gefunden"}}
    prompt = f"""Erstelle {num_questions} Multiple-Choice-Fragen basierend auf diesem Arbeitsblatt.
WICHTIGE REGELN:
1. SCHWIERIGKEITSGRAD: Exakt Niveau "{grade_level}" - nicht leichter, nicht schwerer
2. INHALTSTREUE: Nur Fragen zum gegebenen Text
3. QUALITAET: Plausible Distraktoren (falsche Antworten)
TITEL: {title}
FACH: {subject}
KLASSENSTUFE: {grade_level}
INHALT:
{worksheet_content}
Antworte NUR mit diesem JSON-Format:
{{
"questions": [
{{
"id": "q1",
"question": "Fragestellung",
"options": [
{{"id": "a", "text": "Antwort A"}},
{{"id": "b", "text": "Antwort B"}},
{{"id": "c", "text": "Antwort C"}},
{{"id": "d", "text": "Antwort D"}}
],
"correct_answer": "a",
"explanation": "Erklaerung"
}}
],
"metadata": {{
"subject": "{subject}",
"grade_level": "{grade_level}",
"source_title": "{title}",
"num_questions": {num_questions}
}}
}}"""
    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=2000,
        messages=[{"role": "user", "content": prompt}]
    )
    content = message.content[0].text
    try:
        # Claude sometimes wraps JSON in Markdown code fences; strip them
        # before parsing.
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0]
        elif "```" in content:
            content = content.split("```")[1].split("```")[0]
        mc_data = json.loads(content.strip())
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Claude hat ungueltiges JSON geliefert: {e}")
    return mc_data
def _shuffle_mc_options(mc_data: dict) -> dict:
"""
Shuffle the answer options for each question randomly.
Also updates correct_answer accordingly.
"""
if "questions" not in mc_data:
return mc_data
for question in mc_data["questions"]:
options = question.get("options", [])
correct_id = question.get("correct_answer")
if not options or not correct_id:
continue
# Find the text of the correct answer
correct_text = None
for opt in options:
if opt.get("id") == correct_id:
correct_text = opt.get("text")
break
# Shuffle the options
random.shuffle(options)
# Assign new IDs and find new position of correct answer
new_ids = ["a", "b", "c", "d"]
new_correct = None
for i, opt in enumerate(options):
if i < len(new_ids):
if opt.get("text") == correct_text:
new_correct = new_ids[i]
opt["id"] = new_ids[i]
if new_correct:
question["correct_answer"] = new_correct
question["options"] = options
return mc_data
def generate_mc_from_analysis(analysis_path: Path, num_questions: int = 5) -> Path:
    """Generate multiple choice questions from an analysis JSON file.

    The questions are based on the extracted text, match the difficulty
    level of the original worksheet, and get randomly arranged answers.

    Args:
        analysis_path: Path to a *_analyse.json file.
        num_questions: Number of questions to generate (default 5).

    Returns:
        Path to the generated *_mc.json file.

    Raises:
        FileNotFoundError: If the analysis file does not exist.
        RuntimeError: If the analysis file is not valid JSON.
    """
    if not analysis_path.exists():
        raise FileNotFoundError(f"Analysedatei nicht gefunden: {analysis_path}")
    try:
        worksheet_analysis = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Ungueltige Analyse-JSON: {e}")
    logger.info(f"Generiere MC-Fragen fuer: {analysis_path.name}")

    # Claude is preferred when configured; OpenAI serves as the fallback.
    if VISION_API == "claude":
        try:
            mc_data = _generate_mc_with_claude(worksheet_analysis, num_questions)
        except Exception as e:
            logger.warning(f"Claude MC-Generierung fehlgeschlagen, nutze OpenAI: {e}")
            mc_data = _generate_mc_with_openai(worksheet_analysis, num_questions)
    else:
        mc_data = _generate_mc_with_openai(worksheet_analysis, num_questions)

    # Randomize the position of the correct answer in every question.
    mc_data = _shuffle_mc_options(mc_data)

    target_file = BEREINIGT_DIR / (analysis_path.stem.replace("_analyse", "") + "_mc.json")
    target_file.write_text(json.dumps(mc_data, ensure_ascii=False, indent=2), encoding="utf-8")
    logger.info(f"MC-Fragen gespeichert: {target_file.name}")
    return target_file

View File

@@ -0,0 +1,458 @@
"""
AI Processor - Q&A Generator
Generate question-answer pairs with Leitner system for spaced repetition.
"""
from pathlib import Path
from datetime import datetime, timedelta
import json
import logging
import os
import requests
from ..config import VISION_API, BEREINIGT_DIR, get_openai_api_key
logger = logging.getLogger(__name__)
def _generate_qa_with_openai(analysis_data: dict, num_questions: int = 8) -> dict:
    """Generate question-answer pairs based on worksheet analysis.

    Didactic requirements enforced via the prompt:
    - Questions stick almost verbatim to the existing material
    - Only minimal rephrasing is allowed
    - Key terms / technical terms are marked as important
    - Difficulty level matches the original (grade_level)

    Args:
        analysis_data: Parsed *_analyse.json content.
        num_questions: Number of questions to generate (default 8).

    Returns:
        Dict with "qa_items" and "metadata"; an empty qa_items list with an
        error marker when the analysis contains no usable text.

    Raises:
        RuntimeError: If the OpenAI response is not the expected JSON.
    """
    api_key = get_openai_api_key()
    # Missing analysis fields fall back to neutral defaults.
    title = analysis_data.get("title") or "Arbeitsblatt"
    subject = analysis_data.get("subject") or "Allgemein"
    grade_level = analysis_data.get("grade_level") or "unbekannt"
    canonical_text = analysis_data.get("canonical_text") or ""
    printed_blocks = analysis_data.get("printed_blocks") or []
    tasks = analysis_data.get("tasks") or []
    # Assemble the worksheet text: canonical text, printed blocks, tasks.
    content_parts = []
    if canonical_text:
        content_parts.append(canonical_text)
    for block in printed_blocks:
        text = block.get("text", "").strip()
        if text and text not in content_parts:
            content_parts.append(text)
    for task in tasks:
        desc = task.get("description", "").strip()
        text = task.get("text_with_gaps", "").strip()
        if desc:
            content_parts.append(f"Aufgabe: {desc}")
        if text:
            content_parts.append(text)
    worksheet_content = "\n\n".join(content_parts)
    if not worksheet_content.strip():
        logger.warning("Kein Textinhalt fuer Q&A-Generierung gefunden")
        return {"qa_items": [], "metadata": {"error": "Kein Textinhalt gefunden"}}
    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    system_prompt = f"""Du bist ein erfahrener Paedagoge, der Frage-Antwort-Paare fuer Schueler erstellt.
WICHTIGE REGELN:
1. INHALTE NUR AUS DEM TEXT:
- Verwende FAST WOERTLICH den vorhandenen Stoff
- KEINE neuen Fakten oder Inhalte einfuehren!
- Alles muss aus dem gegebenen Text ableitbar sein
2. SCHWIERIGKEITSGRAD:
- Niveau muss exakt "{grade_level}" entsprechen
3. SCHLUESSELWOERTER MARKIEREN:
- Identifiziere wichtige Fachbegriffe als "key_terms"
4. FRAGETYPEN:
- Wissensfragen: "Was ist...?", "Nenne..."
- Verstaendnisfragen: "Erklaere...", "Beschreibe..."
- Anwendungsfragen: "Warum...?", "Was passiert, wenn...?"
5. ANTWORT-FORMAT:
- Kurze, praezise Antworten (1-3 Saetze)
6. AUSGABE: Nur gueltiges JSON, kein Markdown."""
    user_prompt = f"""Erstelle {num_questions} Frage-Antwort-Paare aus diesem Arbeitsblatt:
TITEL: {title}
FACH: {subject}
KLASSENSTUFE: {grade_level}
TEXT:
{worksheet_content}
Gib das Ergebnis als JSON zurueck:
{{
"qa_items": [
{{
"id": "qa1",
"question": "Die Frage hier (fast woertlich aus dem Text)",
"answer": "Die korrekte Antwort (direkt aus dem Text)",
"question_type": "knowledge" | "understanding" | "application",
"key_terms": ["wichtiger Begriff 1", "wichtiger Begriff 2"],
"difficulty": 1-3,
"source_hint": "Kurzer Hinweis, wo im Text die Antwort steht",
"leitner_box": 0
}}
],
"metadata": {{
"subject": "{subject}",
"grade_level": "{grade_level}",
"source_title": "{title}",
"total_questions": {num_questions},
"key_terms_summary": ["alle", "wichtigen", "Fachbegriffe", "gesammelt"]
}}
}}
WICHTIG:
- Alle Antworten muessen aus dem Text ableitbar sein!
- "leitner_box": 0 bedeutet "neu" (noch nicht gelernt)
- "difficulty": 1=leicht, 2=mittel, 3=schwer"""
    payload = {
        "model": "gpt-4o-mini",
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "max_tokens": 3000,
        "temperature": 0.5,
    }
    # Fix: explicit timeout — without one, requests.post can block forever
    # on a stalled connection.
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    data = response.json()
    try:
        content = data["choices"][0]["message"]["content"]
        qa_data = json.loads(content)
    except (KeyError, json.JSONDecodeError) as e:
        raise RuntimeError(f"Fehler bei Q&A-Generierung: {e}")
    # Initialize Leitner-Box fields for all items
    _initialize_leitner_fields(qa_data)
    return qa_data
def _generate_qa_with_claude(analysis_data: dict, num_questions: int = 8) -> dict:
    """Generate question-answer pairs with the Claude API.

    Mirrors :func:`_generate_qa_with_openai`, but note one difference: this
    variant only includes task *descriptions* in the prompt, not
    ``text_with_gaps`` — presumably intentional, TODO confirm.

    Args:
        analysis_data: Parsed *_analyse.json content.
        num_questions: Number of questions to request.

    Returns:
        Dict with "qa_items" and "metadata"; an empty qa_items list with an
        error marker when the analysis holds no usable text.

    Raises:
        RuntimeError: If ANTHROPIC_API_KEY is unset or Claude returns
            invalid JSON.
    """
    # Imported lazily so the module loads even without the anthropic package.
    import anthropic
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        raise RuntimeError("ANTHROPIC_API_KEY ist nicht gesetzt.")
    client = anthropic.Anthropic(api_key=api_key)
    # Missing analysis fields fall back to neutral defaults.
    title = analysis_data.get("title") or "Arbeitsblatt"
    subject = analysis_data.get("subject") or "Allgemein"
    grade_level = analysis_data.get("grade_level") or "unbekannt"
    canonical_text = analysis_data.get("canonical_text") or ""
    printed_blocks = analysis_data.get("printed_blocks") or []
    tasks = analysis_data.get("tasks") or []
    # Collect unique text fragments: canonical text, printed blocks, task
    # descriptions.
    content_parts = []
    if canonical_text:
        content_parts.append(canonical_text)
    for block in printed_blocks:
        text = block.get("text", "").strip()
        if text and text not in content_parts:
            content_parts.append(text)
    for task in tasks:
        desc = task.get("description", "").strip()
        if desc:
            content_parts.append(f"Aufgabe: {desc}")
    worksheet_content = "\n\n".join(content_parts)
    if not worksheet_content.strip():
        return {"qa_items": [], "metadata": {"error": "Kein Textinhalt gefunden"}}
    prompt = f"""Erstelle {num_questions} Frage-Antwort-Paare aus diesem Arbeitsblatt.
WICHTIGE REGELN:
1. Verwende FAST WOERTLICH den vorhandenen Stoff - KEINE neuen Fakten!
2. Schwierigkeitsgrad: exakt "{grade_level}"
3. Markiere wichtige Fachbegriffe als "key_terms"
TITEL: {title}
FACH: {subject}
KLASSENSTUFE: {grade_level}
TEXT:
{worksheet_content}
Antworte NUR mit diesem JSON:
{{
"qa_items": [
{{
"id": "qa1",
"question": "Frage (fast woertlich aus Text)",
"answer": "Antwort (direkt aus Text)",
"question_type": "knowledge",
"key_terms": ["Begriff1", "Begriff2"],
"difficulty": 1,
"source_hint": "Wo im Text",
"leitner_box": 0
}}
],
"metadata": {{
"subject": "{subject}",
"grade_level": "{grade_level}",
"source_title": "{title}",
"total_questions": {num_questions},
"key_terms_summary": ["alle", "Fachbegriffe"]
}}
}}"""
    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=3000,
        messages=[{"role": "user", "content": prompt}]
    )
    content = message.content[0].text
    try:
        # Claude sometimes wraps JSON in Markdown code fences; strip them
        # before parsing.
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0]
        elif "```" in content:
            content = content.split("```")[1].split("```")[0]
        qa_data = json.loads(content.strip())
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Claude hat ungueltiges JSON geliefert: {e}")
    # Initialize Leitner-Box fields
    _initialize_leitner_fields(qa_data)
    return qa_data
def _initialize_leitner_fields(qa_data: dict) -> None:
"""Initialize Leitner-Box fields for all Q&A items."""
for item in qa_data.get("qa_items", []):
if "leitner_box" not in item:
item["leitner_box"] = 0
if "correct_count" not in item:
item["correct_count"] = 0
if "incorrect_count" not in item:
item["incorrect_count"] = 0
if "last_seen" not in item:
item["last_seen"] = None
if "next_review" not in item:
item["next_review"] = None
def generate_qa_from_analysis(analysis_path: Path, num_questions: int = 8) -> Path:
    """Generate question-answer pairs from an analysis JSON file.

    The Q&A pairs are based almost verbatim on the original text, are
    prepared with Leitner-box fields for spaced repetition, and mark key
    terms for reinforcement.

    Args:
        analysis_path: Path to a *_analyse.json file.
        num_questions: Number of questions to generate.

    Returns:
        Path to the generated *_qa.json file.

    Raises:
        FileNotFoundError: If the analysis file does not exist.
        RuntimeError: If the analysis file is not valid JSON.
    """
    if not analysis_path.exists():
        raise FileNotFoundError(f"Analysedatei nicht gefunden: {analysis_path}")
    try:
        worksheet_analysis = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Ungueltige Analyse-JSON: {e}")
    logger.info(f"Generiere Q&A-Paare fuer: {analysis_path.name}")

    # Claude is preferred when configured; OpenAI serves as the fallback.
    if VISION_API == "claude":
        try:
            qa_data = _generate_qa_with_claude(worksheet_analysis, num_questions)
        except Exception as e:
            logger.warning(f"Claude Q&A-Generierung fehlgeschlagen, nutze OpenAI: {e}")
            qa_data = _generate_qa_with_openai(worksheet_analysis, num_questions)
    else:
        qa_data = _generate_qa_with_openai(worksheet_analysis, num_questions)

    # Persist the result next to the other cleaned artifacts.
    target_file = BEREINIGT_DIR / (analysis_path.stem.replace("_analyse", "") + "_qa.json")
    target_file.write_text(json.dumps(qa_data, ensure_ascii=False, indent=2), encoding="utf-8")
    logger.info(f"Q&A-Paare gespeichert: {target_file.name}")
    return target_file
# ---------------------------------------------------------------------------
# Leitner-Box System for Spaced Repetition
# ---------------------------------------------------------------------------
def update_leitner_progress(qa_path: Path, item_id: str, correct: bool) -> dict:
    """Update the learning progress of a Q&A item using the Leitner system.

    Leitner boxes:
        Box 0: new (not yet learned)
        Box 1: learned (on error -> back to box 0)
        Box 2: consolidated (on error -> back to box 1)

    A correct answer moves the item up one box (max 2) and schedules the
    next review further in the future (1/3/7 days by box); a wrong answer
    moves it down one box (min 0) and schedules a review in 4 hours.

    Fixes over the previous implementation: both counters are normalized
    up front, so answering an uninitialized item no longer raises KeyError
    when the result dict reads the counter the other branch never set, and
    an out-of-range stored box can no longer cause an IndexError on the
    interval table.

    Args:
        qa_path: Path to a *_qa.json file.
        item_id: Id of the Q&A item to update.
        correct: True if the item was answered correctly.

    Returns:
        Dict with the update status and the item's new statistics.

    Raises:
        FileNotFoundError: If qa_path does not exist.
    """
    if not qa_path.exists():
        raise FileNotFoundError(f"Q&A-Datei nicht gefunden: {qa_path}")
    qa_data = json.loads(qa_path.read_text(encoding="utf-8"))
    # Find the requested item.
    item = None
    for qa_item in qa_data.get("qa_items", []):
        if qa_item.get("id") == item_id:
            item = qa_item
            break
    if not item:
        return {"status": "NOT_FOUND", "message": f"Item {item_id} nicht gefunden"}
    # Normalize counters and box so items that skipped
    # _initialize_leitner_fields (or hold corrupted values) are safe below.
    item["correct_count"] = item.get("correct_count", 0)
    item["incorrect_count"] = item.get("incorrect_count", 0)
    current_box = min(max(item.get("leitner_box", 0), 0), 2)
    item["last_seen"] = datetime.now().isoformat()
    if correct:
        item["correct_count"] += 1
        item["leitner_box"] = min(current_box + 1, 2)
        # Review interval grows with the box: 1 / 3 / 7 days.
        days = [1, 3, 7][item["leitner_box"]]
        item["next_review"] = (datetime.now() + timedelta(days=days)).isoformat()
    else:
        item["incorrect_count"] += 1
        item["leitner_box"] = max(current_box - 1, 0)
        # On error: review again soon.
        item["next_review"] = (datetime.now() + timedelta(hours=4)).isoformat()
    # Persist the updated progress.
    qa_path.write_text(json.dumps(qa_data, ensure_ascii=False, indent=2), encoding="utf-8")
    box_names = ["Neu", "Gelernt", "Gefestigt"]
    return {
        "status": "OK",
        "item_id": item_id,
        "correct": correct,
        "new_box": item["leitner_box"],
        "box_name": box_names[item["leitner_box"]],
        "correct_count": item["correct_count"],
        "incorrect_count": item["incorrect_count"],
        "next_review": item["next_review"]
    }
def get_next_review_items(qa_path: Path, limit: int = 5) -> list:
    """Return the next Q&A items that are due for review.

    Prioritization (most urgent first):
      1. Items in box 0 (new / recently wrong) are always included.
      2. Items whose scheduled review time has passed; the longer overdue,
         the more urgent.
      3. Frequently-missed items receive an extra urgency boost.

    Args:
        qa_path: Path to a *_qa.json file.
        limit: Maximum number of items to return.

    Returns:
        List of item dicts sorted by urgency, internal fields stripped.
        Empty list when the file does not exist.
    """
    if not qa_path.exists():
        return []
    qa_data = json.loads(qa_path.read_text(encoding="utf-8"))
    now = datetime.now()
    candidates = []
    for entry in qa_data.get("qa_items", []):
        box = entry.get("leitner_box", 0)
        scheduled = entry.get("next_review")
        misses = entry.get("incorrect_count", 0)
        # Lower score == more urgent; box 0 is most urgent, each recorded
        # miss boosts urgency.
        score = box * 10 - misses * 2
        due = True
        if scheduled:
            try:
                scheduled_at = datetime.fromisoformat(scheduled)
                due = now >= scheduled_at
                if due:
                    # Overdue items climb further up the queue.
                    score -= (now - scheduled_at).total_seconds() / 3600
            except (ValueError, TypeError):
                due = True
        # New items (box 0) are always candidates.
        if box == 0 or due:
            candidates.append({**entry, "_priority": score, "_is_due": due})
    candidates.sort(key=lambda c: c["_priority"])
    # Strip internal bookkeeping fields and apply the limit.
    return [
        {key: value for key, value in c.items() if not key.startswith("_")}
        for c in candidates[:limit]
    ]

View File

@@ -0,0 +1,83 @@
"""
AI Processor - Utility Functions
Image encoding and helper functions.
"""
from pathlib import Path
import base64
import shutil
import logging
from .config import BEREINIGT_DIR
logger = logging.getLogger(__name__)
def encode_image_to_data_url(input_path: Path) -> str:
    """Encode an image file as a data URL for vision API requests.

    The MIME type is derived from the file extension via get_media_type()
    (previously every file was labelled image/jpeg, which mislabelled
    PNG/GIF/WebP scans in the data URL prefix).

    Args:
        input_path: Path to the image file.

    Returns:
        Data URL string, e.g. "data:image/png;base64,...".
    """
    media_type = get_media_type(input_path)
    image_b64 = base64.b64encode(input_path.read_bytes()).decode("utf-8")
    return f"data:{media_type};base64,{image_b64}"
def encode_image_to_base64(input_path: Path) -> str:
    """Encode an image file as a plain base64 string.

    Args:
        input_path: Path to the image file.

    Returns:
        Base64-encoded file contents as an ASCII string.
    """
    raw = input_path.read_bytes()
    return base64.b64encode(raw).decode("utf-8")
def dummy_process_scan(input_path: Path) -> Path:
    """Fallback processing: copy the scan unchanged into the Bereinigt folder.

    Args:
        input_path: Path to the input file.

    Returns:
        Path of the copy inside BEREINIGT_DIR (suffix "_bereinigt" added
        to the stem).

    Raises:
        FileNotFoundError: If the input file does not exist.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")
    target = BEREINIGT_DIR / f"{input_path.stem}_bereinigt{input_path.suffix}"
    shutil.copy2(input_path, target)
    return target
def get_media_type(input_path: Path) -> str:
    """Determine the MIME type of a file from its extension.

    Args:
        input_path: Path to the file.

    Returns:
        MIME type string; unknown extensions fall back to "image/jpeg"
        (the scanner's default format).
    """
    known_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".pdf": "application/pdf",
    }
    extension = input_path.suffix.lower()
    return known_types.get(extension, "image/jpeg")

View File

@@ -0,0 +1,19 @@
"""
AI Processor - Vision Module
Scan analysis and HTML generation.
"""
from .scan_analyzer import (
analyze_scan_structure_with_ai,
describe_scan_with_ai,
remove_handwriting_from_scan,
)
from .html_builder import build_clean_html_from_analysis
__all__ = [
"analyze_scan_structure_with_ai",
"describe_scan_with_ai",
"remove_handwriting_from_scan",
"build_clean_html_from_analysis",
]

View File

@@ -0,0 +1,218 @@
"""
AI Processor - HTML Builder
Build clean HTML worksheets from analysis data.
"""
from pathlib import Path
import json
import logging
from ..config import BEREINIGT_DIR
logger = logging.getLogger(__name__)
def build_clean_html_from_analysis(analysis_path: Path) -> Path:
    """Build a clean HTML worksheet from an analysis JSON file.

    Features:
    - Focuses on printed text (canonical_text / printed_blocks)
    - Handwritten entries and crossed-out words are NOT included
    - Uses an open-source font stack (Inter / Noto Sans)
    - AI-extracted text is HTML-escaped before embedding, so stray '<',
      '>' or '&' characters in the scan text can no longer break the
      markup or inject tags (previous version interpolated raw text)

    Args:
        analysis_path: Path to a *_analyse.json file.

    Returns:
        Path to the generated HTML file.

    Raises:
        FileNotFoundError: If the analysis file does not exist.
        RuntimeError: If the analysis file is not valid JSON.
    """
    from html import escape  # stdlib; local import keeps module imports unchanged

    if not analysis_path.exists():
        raise FileNotFoundError(f"Analysedatei nicht gefunden: {analysis_path}")
    try:
        data = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Analyse-Datei enthaelt kein gueltiges JSON: {analysis_path}\n{e}") from e
    title = data.get("title") or "Arbeitsblatt"
    subject = data.get("subject") or ""
    grade_level = data.get("grade_level") or ""
    instructions = data.get("instructions") or ""
    tasks = data.get("tasks", []) or []
    canonical_text = data.get("canonical_text") or ""
    printed_blocks = data.get("printed_blocks") or []
    struck = data.get("struck_through_words") or []
    html_parts = []
    html_parts.append("<!DOCTYPE html>")
    html_parts.append("<html lang='de'>")
    html_parts.append("<head>")
    html_parts.append("<meta charset='UTF-8'>")
    html_parts.append(f"<title>{escape(title)}</title>")
    html_parts.append(_get_html_styles())
    html_parts.append("</head>")
    html_parts.append("<body>")
    html_parts.append("<div class='page'>")
    # Header section
    html_parts.append(f"<h1>{escape(title)}</h1>")
    meta_bits = []
    if subject:
        meta_bits.append(f"Fach: {escape(subject)}")
    if grade_level:
        meta_bits.append(f"Klassenstufe: {escape(grade_level)}")
    if meta_bits:
        html_parts.append(f"<div class='meta'>{' | '.join(meta_bits)}</div>")
    if instructions:
        html_parts.append(
            f"<div class='instructions'><strong>Arbeitsanweisung:</strong> {escape(instructions)}</div>"
        )
    # Main text / printed blocks
    html_parts.append("<section class='text-blocks'>")
    if printed_blocks:
        for block in printed_blocks:
            role = (block.get("role") or "body").lower()
            text = (block.get("text") or "").strip()
            if not text:
                continue
            html_parts.append("<div class='text-block'>")
            if role == "title":
                html_parts.append(f"<div class='text-block-title'>{escape(text)}</div>")
            else:
                html_parts.append(f"<div>{escape(text)}</div>")
            html_parts.append("</div>")
    elif canonical_text:
        # Fallback: split canonical_text into paragraphs
        paragraphs = [
            p.strip()
            for p in canonical_text.replace("\r\n", "\n").split("\n\n")
            if p.strip()
        ]
        for p in paragraphs:
            html_parts.append(f"<div class='text-block'>{escape(p)}</div>")
    html_parts.append("</section>")
    # Tasks section
    if tasks:
        html_parts.append("<h2>Aufgaben</h2>")
        html_parts.append("<div class='task-list'>")
        for idx, task in enumerate(tasks, start=1):
            t_type = task.get("type") or "other"
            desc = task.get("description") or ""
            text_with_gaps = task.get("text_with_gaps")
            html_parts.append("<div class='task'>")
            html_parts.append(
                f"<div class='task-title'>Aufgabe {idx} ({escape(t_type)}): {escape(desc)}</div>"
            )
            if text_with_gaps:
                # Escape first, then turn the "___" gap markers into styled
                # underline spans (escaping does not alter underscores).
                rendered = escape(text_with_gaps).replace("___", "<span class='gap-line'>&nbsp;</span>")
                html_parts.append(f"<div>{rendered}</div>")
            html_parts.append("</div>")
        html_parts.append("</div>")
    # Footer note
    if struck:
        html_parts.append(
            "<div class='footnote'>Hinweis: Einige im Original durchgestrichene Woerter wurden "
            "von der KI erkannt und NICHT in dieses saubere Arbeitsblatt uebernommen.</div>"
        )
    else:
        html_parts.append(
            "<div class='footnote'>Dieses Arbeitsblatt wurde automatisch aus einem Scan rekonstruiert "
            "und von handschriftlichen Eintragungen bereinigt.</div>"
        )
    html_parts.append("</div>")  # .page
    html_parts.append("</body></html>")
    html_content = "\n".join(html_parts)
    out_name = analysis_path.stem.replace("_analyse", "") + "_clean.html"
    out_path = BEREINIGT_DIR / out_name
    out_path.write_text(html_content, encoding="utf-8")
    return out_path
def _get_html_styles() -> str:
    """Return the inline <style> block for the clean HTML worksheet.

    Uses an open-source font stack (Inter / Noto Sans with system
    fallbacks) and defines the layout classes referenced by
    build_clean_html_from_analysis: .page, .meta, .instructions,
    .text-block(s), .task(-list/-title), .gap-line and .footnote.
    """
    return """
<style>
:root {
--font-main: "Inter", "Noto Sans", system-ui, -apple-system, BlinkMacSystemFont, sans-serif;
}
* { box-sizing: border-box; }
body {
font-family: var(--font-main);
margin: 32px;
line-height: 1.5;
font-size: 14px;
color: #111827;
}
.page {
max-width: 800px;
margin: 0 auto;
}
h1 {
font-size: 24px;
margin-bottom: 4px;
}
h2 {
font-size: 18px;
margin-top: 24px;
}
.meta {
font-size: 12px;
color: #6b7280;
margin-bottom: 16px;
}
.instructions {
margin-bottom: 20px;
padding: 8px 10px;
border-radius: 8px;
background: #eff6ff;
border: 1px solid #bfdbfe;
font-size: 13px;
}
.text-blocks {
margin-bottom: 24px;
}
.text-block {
margin-bottom: 8px;
}
.text-block-title {
font-weight: 600;
margin-bottom: 4px;
}
.task-list {
margin-top: 8px;
}
.task {
margin-bottom: 14px;
padding-bottom: 8px;
border-bottom: 1px dashed #e5e7eb;
}
.task-title {
font-weight: 600;
margin-bottom: 4px;
}
.gap-line {
display: inline-block;
border-bottom: 1px solid #000;
min-width: 80px;
margin: 0 4px;
}
.footnote {
margin-top: 24px;
font-size: 11px;
color: #9ca3af;
}
</style>
"""

View File

@@ -0,0 +1,307 @@
"""
AI Processor - Scan Analyzer
Vision-based analysis of worksheets using OpenAI and Claude APIs.
"""
from pathlib import Path
import json
import logging
import shutil
import requests
from ..config import (
VISION_API,
BEREINIGT_DIR,
get_openai_api_key,
)
from ..utils import encode_image_to_data_url
logger = logging.getLogger(__name__)
def describe_scan_with_ai(input_path: Path) -> Path:
    """Have the vision model write a short description of a worksheet scan.

    Sends the image to OpenAI's gpt-4o-mini and stores the returned German
    description (topic, task types, rough content) as a text file.

    Args:
        input_path: Path to the input image.

    Returns:
        Path to the generated *_beschreibung.txt description file.

    Raises:
        FileNotFoundError: If the input image does not exist.
        RuntimeError: If the API response has an unexpected structure.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")
    api_key = get_openai_api_key()
    image_data_url = encode_image_to_data_url(input_path)
    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "gpt-4o-mini",
        "messages": [
            {
                "role": "system",
                "content": "Du bist ein hilfreicher Assistent, der Schul-Arbeitsblaetter knapp beschreibt.",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Beschreibe dieses Arbeitsblatt knapp: Thema, Art der Aufgaben "
                            "(z.B. Lueckentext, Multiple Choice, Rechenaufgaben) und groben Inhalt."
                        ),
                    },
                    {"type": "image_url", "image_url": {"url": image_data_url}},
                ],
            },
        ],
        "max_tokens": 400,
    }
    # Fix: explicit timeout — without one, requests.post can block forever
    # on a stalled connection.
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    data = response.json()
    try:
        description = data["choices"][0]["message"]["content"]
    except Exception as e:
        raise RuntimeError(f"Unerwartete Antwortstruktur von der KI: {e}\nAntwort: {data}") from e
    out_name = input_path.stem + "_beschreibung.txt"
    out_path = BEREINIGT_DIR / out_name
    out_path.write_text(description, encoding="utf-8")
    return out_path
def _analyze_with_openai(input_path: Path) -> Path:
"""
Structured JSON analysis of the worksheet using OpenAI.
Features:
- canonical_text: complete corrected text without handwriting
- printed_blocks: structured blocks of printed text
- handwritten_annotations: student handwritten notes
- struck_through_words: crossed out words
"""
if not input_path.exists():
raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")
api_key = get_openai_api_key()
image_data_url = encode_image_to_data_url(input_path)
url = "https://api.openai.com/v1/chat/completions"
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
system_prompt = (
"Du bist ein Experte fuer die Analyse von Schul-Arbeitsblaettern.\n\n"
"HAUPTAUFGABEN:\n"
"1. Erkenne ALLE gedruckten Elemente: Text, Ueberschriften, Tabellen, Linien, Kaestchen, Diagramme, Illustrationen\n"
"2. Identifiziere ALLE handschriftlichen Ergaenzungen: Antworten, Zahlen, Buchstaben, Notizen, Zeichnungen\n"
"3. Bestimme praezise Positionen (Bounding Boxes in Pixeln) fuer JEDES Element\n\n"
"KRITISCH - DIAGRAMME & ILLUSTRATIONEN:\n"
"- Suche aktiv nach: anatomischen Zeichnungen, beschrifteten Diagrammen, Grafiken, Tabellen, Skizzen\n"
"- Wenn du irgendeine bildliche Darstellung siehst (z.B. Auge, Pflanze, Karte, Schaubild), setze 'has_diagram: true'\n"
"- Fuer JEDES visuelle Element: Erstelle einen Eintrag in 'diagram_elements' mit genauer Position\n"
"- Beschrifte-Linien (von Beschriftung zu Bildteil) gehoeren zum Diagramm!\n\n"
"HANDSCHRIFT ERKENNUNG:\n"
"- Unterscheide gedruckt vs. handgeschrieben anhand der Schriftart\n"
"- Klassifiziere Farbe: blau/schwarz/rot/pencil (Bleistift)\n"
"- Durchgestrichene Woerter separat auflisten\n\n"
"AUSGABE: Gib deine Antwort AUSSCHLIESSLICH als gueltiges JSON zurueck (kein Markdown, keine Code-Bloecke)."
)
user_text = _get_analysis_user_prompt()
payload = {
"model": "gpt-4o-mini",
"response_format": {"type": "json_object"},
"messages": [
{"role": "system", "content": system_prompt},
{
"role": "user",
"content": [
{"type": "text", "text": user_text},
{"type": "image_url", "image_url": {"url": image_data_url}},
],
},
],
"max_tokens": 2500,
"temperature": 0.15,
}
response = requests.post(url, headers=headers, json=payload)
response.raise_for_status()
data = response.json()
try:
content = data["choices"][0]["message"]["content"]
except Exception as e:
raise RuntimeError(f"Unerwartete Antwortstruktur von der KI: {e}\nAntwort: {data}") from e
try:
obj = json.loads(content)
except json.JSONDecodeError as e:
raise RuntimeError(f"Modell hat ungueltiges JSON geliefert: {e}\nInhalt: {content}") from e
out_name = input_path.stem + "_analyse.json"
out_path = BEREINIGT_DIR / out_name
out_path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
return out_path
def _analyze_with_claude(input_path: Path) -> Path:
    """
    Run the structured worksheet analysis through the Claude Vision API.

    Delegates the vision call to ``claude_vision`` (Claude 3.5 Sonnet, better
    OCR and layout detection) and persists the result as
    ``<stem>_analyse.json`` in BEREINIGT_DIR.

    Raises:
        FileNotFoundError: If the input image does not exist.
        Exception: Re-raises any failure from the Claude call or file write
            after logging it.
    """
    # Imported lazily so the OpenAI-only code path works without this module.
    from claude_vision import analyze_worksheet_with_claude

    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")

    logger.info(f"Analyzing with Claude Vision: {input_path.name}")
    try:
        result = analyze_worksheet_with_claude(input_path, max_tokens=2500)
        target = BEREINIGT_DIR / (input_path.stem + "_analyse.json")
        target.write_text(
            json.dumps(result, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
    except Exception as e:
        logger.error(f"Claude analysis failed: {e}")
        raise
    logger.info(f"Claude analysis saved: {target.name}")
    return target
def analyze_scan_structure_with_ai(input_path: Path) -> Path:
    """
    Structured JSON analysis of the worksheet (hybrid mode).

    The backend is selected via the VISION_API setting (environment variable
    ``VISION_API``):
      - "claude" (default): Claude 3.5 Sonnet - better OCR and layout
        detection, with an automatic OpenAI fallback on failure.
      - "openai": OpenAI GPT-4o-mini - cheaper and faster.

    Any other value logs a warning and uses Claude without a fallback.

    Returns:
        Path to the analysis JSON file.
    """
    logger.info(f"Using Vision API: {VISION_API}")

    # Explicit OpenAI selection: no fallback needed.
    if VISION_API == "openai":
        return _analyze_with_openai(input_path)

    # Unrecognized setting: warn and use the Claude default (no fallback).
    if VISION_API != "claude":
        logger.warning(f"Unknown VISION_API '{VISION_API}', using Claude as default")
        return _analyze_with_claude(input_path)

    # Claude path with graceful OpenAI fallback on any failure.
    try:
        return _analyze_with_claude(input_path)
    except Exception as e:
        logger.warning(f"Claude failed, falling back to OpenAI: {e}")
        return _analyze_with_openai(input_path)
def remove_handwriting_from_scan(input_path: Path) -> Path:
    """
    Produce a handwriting-free copy of a worksheet scan.

    Uses the Stage-1 analysis JSON to guide the multi-strategy cleaning in
    WorksheetCleaner while preserving diagrams and printed content. If the
    analysis is missing it is generated first; if it is unreadable an empty
    layout is substituted. On any cleaning failure the original image is
    copied through unchanged so downstream stages always receive a file.

    Returns:
        Path to the cleaned image (``<stem>_clean<suffix>``) in BEREINIGT_DIR.

    Raises:
        FileNotFoundError: If the input image does not exist.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Eingabedatei nicht gefunden: {input_path}")

    from image_cleaner import WorksheetCleaner

    # Locate (or lazily create) the Stage-1 analysis for this scan.
    analysis_path = BEREINIGT_DIR / (input_path.stem + "_analyse.json")
    if not analysis_path.exists():
        logger.info(f"Analysis not found for {input_path.name}, running analysis first")
        analysis_path = analyze_scan_structure_with_ai(input_path)

    try:
        analysis_data = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        logger.error(f"Invalid analysis JSON: {analysis_path}\n{e}")
        # Substitute an empty layout so the cleaner can still run.
        analysis_data = {
            "layout": {"text_regions": [], "diagram_elements": []},
            "handwriting_regions": []
        }

    output_path = BEREINIGT_DIR / (input_path.stem + "_clean" + input_path.suffix)

    cleaner = WorksheetCleaner(debug_mode=False)
    try:
        result_path = cleaner.clean_worksheet(input_path, analysis_data, output_path)
    except Exception as e:
        # Best-effort fallback: pass the original image through unchanged.
        logger.error(f"Cleaning failed for {input_path.name}, using original: {e}")
        shutil.copy2(input_path, output_path)
        return output_path

    logger.info(f"Successfully cleaned {input_path.name}")
    return result_path
def _get_analysis_user_prompt() -> str:
    """
    Build the user prompt for the structured worksheet analysis.

    The returned German text spells out the exact JSON schema the vision
    model must produce; _analyze_with_openai pairs it with
    ``response_format: {"type": "json_object"}``. Keep this schema in sync
    with downstream readers of the ``*_analyse.json`` files.
    """
    # NOTE(review): this literal is the model-facing contract - do not
    # reformat or translate it. The exact inner whitespace of the schema
    # sketch should be confirmed against the pre-rebase original.
    return (
        "Analysiere dieses Arbeitsblatt und gib ein JSON mit folgendem Aufbau zurueck:\n\n"
        "{\n"
        '  "title": string | null,\n'
        '  "subject": string | null,\n'
        '  "grade_level": string | null,\n'
        '  "instructions": string | null,\n'
        '  "canonical_text": string | null,\n'
        '  "printed_blocks": [\n'
        "    {\n"
        '      "id": string,\n'
        '      "role": "title" | "instructions" | "body" | "other",\n'
        '      "text": string\n'
        "    }\n"
        "  ],\n"
        '  "layout": {\n'
        '    "page_structure": {\n'
        '      "has_diagram": boolean,\n'
        '      "orientation": "portrait" | "landscape"\n'
        "    },\n"
        '    "text_regions": [...],\n'
        '    "diagram_elements": [...]\n'
        "  },\n"
        '  "handwriting_regions": [...],\n'
        '  "handwritten_annotations": [...],\n'
        '  "struck_through_words": [...],\n'
        '  "tasks": [...]\n'
        "}\n\n"
        "WICHTIG - BITTE GENAU BEACHTEN:\n"
        "1. CANONICAL TEXT: Nur gedruckter Text, OHNE Handschrift\n"
        "2. DIAGRAMME: Bei JEDER Zeichnung/Grafik has_diagram: true setzen\n"
        "3. HANDSCHRIFT: Mit Farb-Klassifizierung und Bounding Boxes\n"
        "4. Bei Unsicherheit: null oder leeres Array"
    )

View File

@@ -0,0 +1,17 @@
"""
AI Processor - Visualization Module
Mindmap generation for learning posters.
"""
from .mindmap import (
generate_mindmap_data,
generate_mindmap_html,
save_mindmap_for_worksheet,
)
__all__ = [
"generate_mindmap_data",
"generate_mindmap_html",
"save_mindmap_for_worksheet",
]

View File

@@ -0,0 +1,471 @@
"""
AI Processor - Mindmap Generator
Generate mindmaps for learning posters.
"""
from pathlib import Path
import json
import logging
import math
import os
import requests
from ..config import BEREINIGT_DIR, get_openai_api_key
logger = logging.getLogger(__name__)
def generate_mindmap_data(analysis_path: Path) -> dict:
    """
    Extract technical terms from an analysis file and group them for a mindmap.

    Prefers the Claude API (when ANTHROPIC_API_KEY is set) and falls back to
    OpenAI otherwise. Any generation failure degrades gracefully to an empty
    category list so callers can still render a placeholder poster.

    Args:
        analysis_path: Path to a *_analyse.json file.

    Returns:
        Dictionary with mindmap structure:
        {
            "topic": "Main topic",
            "subject": "Subject",
            "categories": [
                {
                    "name": "Category",
                    "color": "#hexcolor",
                    "emoji": "🔬",
                    "terms": [
                        {"term": "Term", "explanation": "Short explanation"}
                    ]
                }
            ]
        }

    Raises:
        FileNotFoundError: If analysis_path does not exist.
        RuntimeError: If the analysis file is not valid JSON.
    """
    if not analysis_path.exists():
        raise FileNotFoundError(f"Analysedatei nicht gefunden: {analysis_path}")
    try:
        data = json.loads(analysis_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Analyse-Datei enthaelt kein gueltiges JSON: {analysis_path}\n{e}") from e

    title = data.get("title") or "Arbeitsblatt"
    subject = data.get("subject") or ""
    canonical_text = data.get("canonical_text") or ""
    tasks = data.get("tasks", []) or []

    # Collect all text for analysis
    all_text = canonical_text
    for task in tasks:
        if task.get("description"):
            all_text += "\n" + task.get("description")
        if task.get("text_with_gaps"):
            all_text += "\n" + task.get("text_with_gaps")

    # Nothing to extract from: return an empty but well-formed structure.
    if not all_text.strip():
        return {
            "topic": title,
            "subject": subject,
            "categories": []
        }

    # AI-based extraction of technical terms
    prompt = f"""Analysiere diesen Schultext und extrahiere alle Fachbegriffe fuer eine kindgerechte Lern-Mindmap.
THEMA: {title}
FACH: {subject}
TEXT:
{all_text[:3000]}
AUFGABE:
1. Identifiziere das Hauptthema (ein einzelnes Wort oder kurzer Begriff)
2. Finde ALLE Fachbegriffe und gruppiere sie in 3-6 sinnvolle Kategorien
3. Gib fuer jeden Begriff eine kurze, kindgerechte Erklaerung (max 10 Woerter)
4. Waehle fuer jede Kategorie ein passendes Emoji und eine Farbe
Antworte NUR mit diesem JSON-Format:
{{
  "topic": "Hauptthema (z.B. 'Das Auge')",
  "categories": [
    {{
      "name": "Kategoriename",
      "emoji": "passendes Emoji",
      "color": "#Hexfarbe (bunt, kindgerecht)",
      "terms": [
        {{"term": "Fachbegriff", "explanation": "Kurze Erklaerung"}}
      ]
    }}
  ]
}}
WICHTIG:
- Verwende kindgerechte, einfache Sprache
- Bunte, froehliche Farben: #FF6B6B, #4ECDC4, #45B7D1, #96CEB4, #FFEAA7, #DDA0DD, #98D8C8
- Passende Emojis fuer jede Kategorie
- Mindestens 3 Begriffe pro Kategorie wenn moeglich
- Maximal 6 Kategorien"""

    try:
        # Try Claude first
        claude_key = os.environ.get("ANTHROPIC_API_KEY")
        if claude_key:
            import anthropic
            client = anthropic.Anthropic(api_key=claude_key)
            response = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=2000,
                messages=[{"role": "user", "content": prompt}]
            )
            result_text = response.content[0].text
        else:
            # Fallback to OpenAI
            logger.info("Claude Mindmap-Generierung fehlgeschlagen, nutze OpenAI: ANTHROPIC_API_KEY ist nicht gesetzt.")
            # Fix: fetch the OpenAI key only on this path. Previously it was
            # read unconditionally before the branch, so a missing OpenAI key
            # could break mindmap generation even when Claude was configured.
            api_key = get_openai_api_key()
            url = "https://api.openai.com/v1/chat/completions"
            headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
            payload = {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": "Du bist ein Experte fuer kindgerechte Lernmaterialien."},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": 2000,
                "temperature": 0.7
            }
            resp = requests.post(url, headers=headers, json=payload, timeout=60)
            resp.raise_for_status()
            result_text = resp.json()["choices"][0]["message"]["content"]

        # Strip a possible Markdown code fence (```json ... ```) before parsing.
        result_text = result_text.strip()
        if result_text.startswith("```"):
            result_text = result_text.split("```")[1]
        if result_text.startswith("json"):
            result_text = result_text[4:]
        result_text = result_text.strip()

        mindmap_data = json.loads(result_text)
        mindmap_data["subject"] = subject
        return mindmap_data
    except Exception as e:
        # Degrade gracefully: an API/parsing failure yields an empty mindmap.
        logger.error(f"Mindmap-Generierung fehlgeschlagen: {e}")
        return {
            "topic": title,
            "subject": subject,
            "categories": []
        }
def generate_mindmap_html(mindmap_data: dict, format: str = "a3") -> str:
    """
    Generate a child-friendly HTML/SVG mindmap poster.

    Lays the main topic in the center of an SVG canvas, places categories on
    a circle around it, and fans each category's terms out on a local arc.
    A legend with the first terms of each category is rendered at the bottom.

    Args:
        mindmap_data: Dictionary from generate_mindmap_data().
        format: "a3" for A3 poster (default) or "a4" for A4 view.

    Returns:
        HTML string with an embedded SVG mindmap.
    """
    topic = mindmap_data.get("topic", "Thema")
    subject = mindmap_data.get("subject", "")
    categories = mindmap_data.get("categories", [])

    # Format-specific settings (page size, SVG canvas, category circle radius)
    if format.lower() == "a4":
        page_size = "A4 landscape"
        svg_width = 1100
        svg_height = 780
        radius = 250
    else:  # a3 (default)
        page_size = "A3 landscape"
        svg_width = 1400
        svg_height = 990
        radius = 320

    # If no categories, show placeholder
    if not categories:
        return f"""<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<title>Mindmap - {topic}</title>
<style>
body {{ font-family: 'Comic Sans MS', cursive, sans-serif; text-align: center; padding: 50px; }}
h1 {{ color: #FF6B6B; }}
</style>
</head>
<body>
<h1>🧠 Mindmap: {topic}</h1>
<p>Noch keine Daten vorhanden. Bitte zuerst das Arbeitsblatt analysieren.</p>
</body>
</html>"""

    num_categories = len(categories)
    center_x = svg_width // 2
    center_y = svg_height // 2

    # Calculate positions of categories in a circle
    category_positions = []
    for i, cat in enumerate(categories):
        angle = (2 * math.pi * i / num_categories) - (math.pi / 2)  # Start at top
        x = center_x + radius * math.cos(angle)
        y = center_y + radius * math.sin(angle)
        category_positions.append({
            "x": x,
            "y": y,
            "angle": angle,
            "data": cat
        })

    # Header supplies the opening <html>/<svg> markup; the closing tags are
    # appended at the end of this function - keep both in sync.
    html = _get_mindmap_html_header(topic, subject, page_size, svg_width, svg_height)

    # Draw connection lines (quadratic curves from center to each category)
    for pos in category_positions:
        color = pos["data"].get("color", "#4ECDC4")
        html += f"""    <path d="M {center_x} {center_y} Q {(center_x + pos['x'])/2 + 30} {(center_y + pos['y'])/2 - 30} {pos['x']} {pos['y']}"
          stroke="{color}" stroke-width="4" fill="none" stroke-linecap="round" opacity="0.6"/>
"""

    # Center (main topic)
    html += f"""
    <!-- Center: Main Topic -->
    <g filter="url(#glow)">
        <circle cx="{center_x}" cy="{center_y}" r="85" fill="url(#centerGradient)"/>
        <defs>
            <radialGradient id="centerGradient" cx="30%" cy="30%">
                <stop offset="0%" stop-color="#FFD93D"/>
                <stop offset="100%" stop-color="#FF6B6B"/>
            </radialGradient>
        </defs>
        <text x="{center_x}" y="{center_y - 10}" text-anchor="middle" font-size="28" font-weight="bold" fill="white">🌟</text>
        <text x="{center_x}" y="{center_y + 25}" text-anchor="middle" font-size="22" font-weight="bold" fill="white">{topic}</text>
    </g>
"""

    # Draw categories with their terms
    for i, pos in enumerate(category_positions):
        cat = pos["data"]
        cat_x = pos["x"]
        cat_y = pos["y"]
        color = cat.get("color", "#4ECDC4")
        emoji = cat.get("emoji", "📚")
        name = cat.get("name", "Kategorie")
        terms = cat.get("terms", [])

        # Category bubble
        html += f"""
    <!-- Category: {name} -->
    <g class="category-group" transform="translate({cat_x}, {cat_y})">
        <ellipse cx="0" cy="0" rx="75" ry="45" fill="{color}" filter="url(#shadow)"/>
        <text x="0" y="-8" text-anchor="middle" font-size="20">{emoji}</text>
        <text x="0" y="18" text-anchor="middle" font-size="14" font-weight="bold" fill="white">{name}</text>
"""
        # Terms around the category
        term_radius = 110
        num_terms = len(terms)
        for j, term_data in enumerate(terms[:8]):  # Max 8 terms per category
            term = term_data.get("term", "")
            # Calculate position relative to category
            base_angle = pos["angle"]
            spread = math.pi * 0.8  # 80% of a half circle
            if num_terms > 1:
                term_angle = base_angle - spread/2 + (spread * j / (num_terms - 1))
            else:
                term_angle = base_angle
            # Coordinates are relative to the category's <g> transform.
            term_x = term_radius * math.cos(term_angle - base_angle)
            term_y = term_radius * math.sin(term_angle - base_angle)
            # Small connection line
            html += f"""        <line x1="0" y1="0" x2="{term_x * 0.6}" y2="{term_y * 0.6}" stroke="{color}" stroke-width="2" opacity="0.5"/>
"""
            # Term bubble (width grows with the term's length)
            bubble_width = max(70, len(term) * 8 + 20)
            html += f"""        <g class="term-bubble" transform="translate({term_x}, {term_y})">
            <rect x="{-bubble_width/2}" y="-22" width="{bubble_width}" height="44" rx="22" fill="white" stroke="{color}" stroke-width="2" filter="url(#shadow)"/>
            <text x="0" y="5" text-anchor="middle" font-size="12" font-weight="bold" fill="#333">{term}</text>
        </g>
"""
        html += "    </g>\n"

    # Legend with explanations (bottom)
    html += f"""
    <!-- Legend -->
    <g transform="translate(50, {svg_height - 80})">
        <text x="0" y="0" font-size="14" font-weight="bold" fill="#666">📖 Begriffe zum Lernen:</text>
"""
    legend_x = 0
    for i, pos in enumerate(category_positions):
        cat = pos["data"]
        color = cat.get("color", "#4ECDC4")
        emoji = cat.get("emoji", "📚")
        name = cat.get("name", "")
        terms = cat.get("terms", [])
        # Show at most the first three terms per category in the legend.
        terms_text = ", ".join([t.get("term", "") for t in terms[:3]])
        if len(terms) > 3:
            terms_text += "..."
        html += f"""        <g transform="translate({legend_x}, 25)">
            <circle cx="8" cy="0" r="8" fill="{color}"/>
            <text x="22" y="4" font-size="11" fill="#444"><tspan font-weight="bold">{emoji} {name}:</tspan> {terms_text}</text>
        </g>
"""
        legend_x += 220

    # Close the markup opened by _get_mindmap_html_header.
    html += """    </g>
</svg>
</div>
</body>
</html>"""
    return html
def save_mindmap_for_worksheet(analysis_path: Path, mindmap_data: dict = None) -> Path:
    """
    Persist mindmap data for a worksheet next to its analysis artifacts.

    Args:
        analysis_path: Path to the *_analyse.json file.
        mindmap_data: Pre-built mindmap structure; generated on demand via
            generate_mindmap_data() when omitted.

    Returns:
        Path to the written *_mindmap.json file in BEREINIGT_DIR.
    """
    payload = mindmap_data if mindmap_data is not None else generate_mindmap_data(analysis_path)

    # Derive "<scan-stem>_mindmap.json" from the analysis file name.
    target = BEREINIGT_DIR / (analysis_path.stem.replace("_analyse", "") + "_mindmap.json")
    target.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    logger.info(f"Mindmap-Daten gespeichert: {target.name}")
    return target
def _get_mindmap_html_header(topic: str, subject: str, page_size: str, svg_width: int, svg_height: int) -> str:
"""Get HTML header for mindmap."""
return f"""<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<title>Lernposter - {topic}</title>
<style>
@page {{
size: {page_size};
margin: 10mm;
}}
@media print {{
body {{ -webkit-print-color-adjust: exact; print-color-adjust: exact; }}
.no-print {{ display: none !important; }}
}}
* {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{
font-family: 'Comic Sans MS', 'Chalkboard SE', 'Comic Neue', cursive, sans-serif;
background: linear-gradient(135deg, #f5f7fa 0%, #e4e8f0 100%);
min-height: 100vh;
padding: 20px;
}}
.poster-container {{
width: 100%;
max-width: 1400px;
margin: 0 auto;
background: white;
border-radius: 20px;
box-shadow: 0 10px 40px rgba(0,0,0,0.1);
overflow: hidden;
}}
.poster-header {{
background: linear-gradient(90deg, #FF6B6B, #4ECDC4);
padding: 15px 30px;
display: flex;
justify-content: space-between;
align-items: center;
}}
.poster-title {{
color: white;
font-size: 24px;
text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
}}
.poster-subject {{
color: white;
font-size: 16px;
opacity: 0.9;
}}
.mindmap-svg {{
width: 100%;
height: auto;
}}
.print-btn {{
position: fixed;
top: 20px;
right: 20px;
padding: 12px 24px;
background: #4ECDC4;
color: white;
border: none;
border-radius: 25px;
font-size: 16px;
cursor: pointer;
box-shadow: 0 4px 15px rgba(78, 205, 196, 0.4);
font-family: inherit;
}}
.print-btn:hover {{
transform: scale(1.05);
background: #45B7D1;
}}
.category-group:hover {{
transform: scale(1.02);
cursor: pointer;
}}
.term-bubble:hover {{
transform: scale(1.1);
filter: brightness(1.1);
}}
</style>
</head>
<body>
<button class="print-btn no-print" onclick="window.print()">🖨️ Als A3 drucken</button>
<div class="poster-container">
<div class="poster-header">
<div class="poster-title">🧠 Lernposter: {topic}</div>
<div class="poster-subject">{subject}</div>
</div>
<svg class="mindmap-svg" viewBox="0 0 {svg_width} {svg_height}" xmlns="http://www.w3.org/2000/svg">
<defs>
<!-- Shadow for bubbles -->
<filter id="shadow" x="-20%" y="-20%" width="140%" height="140%">
<feDropShadow dx="2" dy="4" stdDeviation="4" flood-opacity="0.2"/>
</filter>
<!-- Glow effect for center -->
<filter id="glow">
<feGaussianBlur stdDeviation="8" result="coloredBlur"/>
<feMerge>
<feMergeNode in="coloredBlur"/>
<feMergeNode in="SourceGraphic"/>
</feMerge>
</filter>
</defs>
<!-- Background pattern (subtle dots) -->
<pattern id="dots" x="0" y="0" width="30" height="30" patternUnits="userSpaceOnUse">
<circle cx="15" cy="15" r="1.5" fill="#e0e0e0"/>
</pattern>
<rect width="100%" height="100%" fill="url(#dots)"/>
<!-- Connection lines from center to categories -->
"""