Restructure: Move final 16 root files into packages (backend-lehrer)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 37s
CI / test-go-edu-search (push) Successful in 35s
CI / test-python-klausur (push) Failing after 2m41s
CI / test-python-agent-core (push) Successful in 30s
CI / test-nodejs-website (push) Successful in 38s

classroom/  (+2): state_engine_api, state_engine_models
vocabulary/ (+2): api, db
worksheets/ (+2): api, models
services/   (+6): audio, email, translation, claude_vision, ai_processor, story_generator
api/        (+4): school, klausur_proxy, progress, user_language

Only main.py + config.py remain at root. 16 shims added.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Benjamin Admin
Date:   2026-04-25 22:50:37 +02:00
parent 6be555fb7c
commit cba877c65a
36 changed files with 3712 additions and 3564 deletions
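
The "16 shims added" mentioned in the message are presumably thin re-export modules left behind at the old root paths so that existing imports (for example "from audio_service import get_or_generate_audio" in vocabulary/api.py below) keep working. A minimal sketch of one such shim, assuming the moved module is services/audio.py; the names are illustrative, not taken from the diff:

# audio_service.py (repo root) — hypothetical backward-compatibility shim.
# Forwards the old import path to the module's new home under services/, so
# "from audio_service import get_or_generate_audio" keeps working until
# callers are migrated to "from services.audio import ...".
from services.audio import *                       # noqa: F401,F403
from services.audio import get_or_generate_audio  # noqa: F401  (explicit re-export)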

vocabulary/__init__.py

@@ -0,0 +1,33 @@
# Vocabulary Module
# vocabulary/api.py — API router (search, browse, import, translate)
# vocabulary/db.py — PostgreSQL storage for vocabulary word catalog
from .api import router
from .db import (
VocabularyWord,
get_pool,
init_vocabulary_tables,
search_words,
get_word,
browse_words,
insert_word,
insert_words_bulk,
count_words,
get_all_tags,
get_all_pos,
)
__all__ = [
"router",
"VocabularyWord",
"get_pool",
"init_vocabulary_tables",
"search_words",
"get_word",
"browse_words",
"insert_word",
"insert_words_bulk",
"count_words",
"get_all_tags",
"get_all_pos",
]
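
With the package __init__ re-exporting both the router and the storage helpers, the remaining root main.py only needs a few lines to wire the module in. A minimal sketch, assuming a standard FastAPI app object (the startup hook and names are illustrative, not taken from main.py):

# Hypothetical wiring in main.py after the restructure
from fastapi import FastAPI
from vocabulary import router as vocabulary_router, init_vocabulary_tables

app = FastAPI()
app.include_router(vocabulary_router)   # mounts all /vocabulary/* endpoints

@app.on_event("startup")
async def init_vocabulary() -> None:
    # Idempotent: the table and indexes use CREATE ... IF NOT EXISTS
    await init_vocabulary_tables()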

vocabulary/api.py

@@ -0,0 +1,352 @@
"""
Vocabulary API — Search, browse, and build learning units from the word catalog.
Endpoints for teachers to find words and create learning units,
and for students to access word details with audio/images/syllables.
"""
import logging
import json
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from .db import (
search_words,
get_word,
browse_words,
insert_word,
count_words,
get_all_tags,
get_all_pos,
VocabularyWord,
)
from learning_units import (
LearningUnitCreate,
create_learning_unit,
get_learning_unit,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/vocabulary", tags=["vocabulary"])
# ---------------------------------------------------------------------------
# Search & Browse
# ---------------------------------------------------------------------------
@router.get("/search")
async def api_search_words(
q: str = Query("", description="Search query"),
lang: str = Query("en", pattern="^(en|de)$"),
limit: int = Query(20, ge=1, le=100),
offset: int = Query(0, ge=0),
):
"""Full-text search for vocabulary words."""
if not q.strip():
return {"words": [], "query": q, "total": 0}
words = await search_words(q.strip(), lang=lang, limit=limit, offset=offset)
return {
"words": [w.to_dict() for w in words],
"query": q,
"total": len(words),
}
@router.get("/browse")
async def api_browse_words(
pos: str = Query("", description="Part of speech filter"),
difficulty: int = Query(0, ge=0, le=5, description="Difficulty 1-5, 0=all"),
tag: str = Query("", description="Tag filter"),
limit: int = Query(50, ge=1, le=200),
offset: int = Query(0, ge=0),
):
"""Browse vocabulary words with filters."""
words = await browse_words(
pos=pos, difficulty=difficulty, tag=tag,
limit=limit, offset=offset,
)
return {
"words": [w.to_dict() for w in words],
"filters": {"pos": pos, "difficulty": difficulty, "tag": tag},
"total": len(words),
}
@router.get("/word/{word_id}")
async def api_get_word(word_id: str):
"""Get a single word with all details."""
word = await get_word(word_id)
if not word:
raise HTTPException(status_code=404, detail="Wort nicht gefunden")
return word.to_dict()
@router.get("/filters")
async def api_get_filters():
"""Get available filter options (tags, parts of speech, word count)."""
tags = await get_all_tags()
pos_list = await get_all_pos()
total = await count_words()
return {
"tags": tags,
"parts_of_speech": pos_list,
"total_words": total,
}
# ---------------------------------------------------------------------------
# Audio TTS for Words
# ---------------------------------------------------------------------------
@router.get("/word/{word_id}/audio/{lang}")
async def api_get_word_audio(word_id: str, lang: str = "en"):
"""Get or generate TTS audio for a vocabulary word.
Returns MP3 audio. Generated on first request, cached after.
Uses Piper TTS (MIT license) with Thorsten (DE) and Lessac (EN) voices.
"""
from fastapi.responses import Response as FastAPIResponse
word = await get_word(word_id)
if not word:
raise HTTPException(status_code=404, detail="Wort nicht gefunden")
text = word.english if lang == "en" else word.german
if not text:
raise HTTPException(status_code=400, detail=f"Kein Text fuer Sprache '{lang}'")
from audio_service import get_or_generate_audio
audio_bytes = await get_or_generate_audio(text, language=lang, word_id=word_id)
if not audio_bytes:
raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
@router.get("/word/{word_id}/audio-syllables/{lang}")
async def api_get_syllable_audio(word_id: str, lang: str = "en"):
"""Get TTS audio with slow syllable pronunciation.
Generates audio like "ap ... ple" with pauses between syllables.
"""
from fastapi.responses import Response as FastAPIResponse
word = await get_word(word_id)
if not word:
raise HTTPException(status_code=404, detail="Wort nicht gefunden")
syllables = word.syllables_en if lang == "en" else word.syllables_de
if not syllables:
# Fallback to full word
text = word.english if lang == "en" else word.german
syllables = [text]
# Join syllables with pauses (Piper handles "..." as pause)
slow_text = " ... ".join(syllables)
from audio_service import get_or_generate_audio
cache_key = f"{word_id}_syl_{lang}"
audio_bytes = await get_or_generate_audio(slow_text, language=lang, word_id=cache_key)
if not audio_bytes:
raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
# ---------------------------------------------------------------------------
# Learning Unit Creation from Word Selection
# ---------------------------------------------------------------------------
class CreateUnitFromWordsPayload(BaseModel):
title: str
word_ids: List[str]
grade: Optional[str] = None
language: Optional[str] = "de"
@router.post("/units")
async def api_create_unit_from_words(payload: CreateUnitFromWordsPayload):
"""Create a learning unit from selected vocabulary word IDs.
Fetches full word details, creates a LearningUnit in the
learning_units system, and stores the vocabulary data.
"""
if not payload.word_ids:
raise HTTPException(status_code=400, detail="Keine Woerter ausgewaehlt")
# Fetch all selected words
words = []
for wid in payload.word_ids:
word = await get_word(wid)
if word:
words.append(word)
if not words:
raise HTTPException(status_code=404, detail="Keine der Woerter gefunden")
# Create learning unit
lu = create_learning_unit(LearningUnitCreate(
title=payload.title,
topic="Vocabulary",
grade_level=payload.grade or "5-8",
language=payload.language or "de",
status="raw",
))
# Save vocabulary data as analysis JSON for generators
import os
analysis_dir = os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten")
os.makedirs(analysis_dir, exist_ok=True)
vocab_data = [w.to_dict() for w in words]
analysis_path = os.path.join(analysis_dir, f"{lu.id}_vocab.json")
with open(analysis_path, "w", encoding="utf-8") as f:
json.dump({"words": vocab_data, "title": payload.title}, f, ensure_ascii=False, indent=2)
# Also save as QA items for flashcards/type trainer
qa_items = []
for i, w in enumerate(words):
qa_items.append({
"id": f"qa_{i+1}",
"question": w.english,
"answer": w.german,
"question_type": "knowledge",
"key_terms": [w.english],
"difficulty": w.difficulty,
"source_hint": w.part_of_speech,
"leitner_box": 0,
"correct_count": 0,
"incorrect_count": 0,
"last_seen": None,
"next_review": None,
# Extra fields for enhanced flashcards
"ipa_en": w.ipa_en,
"ipa_de": w.ipa_de,
"syllables_en": w.syllables_en,
"syllables_de": w.syllables_de,
"example_en": w.example_en,
"example_de": w.example_de,
"image_url": w.image_url,
"audio_url_en": w.audio_url_en,
"audio_url_de": w.audio_url_de,
"part_of_speech": w.part_of_speech,
"translations": w.translations,
})
qa_path = os.path.join(analysis_dir, f"{lu.id}_qa.json")
with open(qa_path, "w", encoding="utf-8") as f:
json.dump({
"qa_items": qa_items,
"metadata": {
"subject": "English Vocabulary",
"grade_level": payload.grade or "5-8",
"source_title": payload.title,
"total_questions": len(qa_items),
},
}, f, ensure_ascii=False, indent=2)
logger.info(f"Created vocab unit {lu.id} with {len(words)} words")
return {
"unit_id": lu.id,
"title": payload.title,
"word_count": len(words),
"status": "created",
}
@router.get("/units/{unit_id}")
async def api_get_unit_words(unit_id: str):
"""Get all words for a learning unit."""
import os
vocab_path = os.path.join(
os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten"),
f"{unit_id}_vocab.json",
)
if not os.path.exists(vocab_path):
raise HTTPException(status_code=404, detail="Unit nicht gefunden")
with open(vocab_path, "r", encoding="utf-8") as f:
data = json.load(f)
return {
"unit_id": unit_id,
"title": data.get("title", ""),
"words": data.get("words", []),
}
# ---------------------------------------------------------------------------
# Bulk Import (for seeding the dictionary)
# ---------------------------------------------------------------------------
class BulkImportPayload(BaseModel):
words: List[Dict[str, Any]]
@router.post("/import")
async def api_bulk_import(payload: BulkImportPayload):
"""Bulk import vocabulary words (for seeding the dictionary).
Each word dict should have at minimum: english, german.
Optional: ipa_en, ipa_de, part_of_speech, syllables_en, syllables_de,
example_en, example_de, difficulty, tags, translations.
"""
from .db import insert_words_bulk
words = []
for w in payload.words:
words.append(VocabularyWord(
english=w.get("english", ""),
german=w.get("german", ""),
ipa_en=w.get("ipa_en", ""),
ipa_de=w.get("ipa_de", ""),
part_of_speech=w.get("part_of_speech", ""),
syllables_en=w.get("syllables_en", []),
syllables_de=w.get("syllables_de", []),
example_en=w.get("example_en", ""),
example_de=w.get("example_de", ""),
difficulty=w.get("difficulty", 1),
tags=w.get("tags", []),
translations=w.get("translations", {}),
))
count = await insert_words_bulk(words)
logger.info(f"Bulk imported {count} vocabulary words")
return {"imported": count}
# ---------------------------------------------------------------------------
# Translation Generation
# ---------------------------------------------------------------------------
class TranslateRequest(BaseModel):
word_ids: List[str]
target_language: str
@router.post("/translate")
async def api_translate_words(payload: TranslateRequest):
"""Generate translations for vocabulary words into a target language.
Uses local LLM (Ollama) for translation. Results are cached in the
vocabulary_words.translations JSONB field.
"""
from translation_service import translate_and_store
if payload.target_language not in {"tr", "ar", "uk", "ru", "pl", "fr", "es"}:
raise HTTPException(status_code=400, detail=f"Sprache '{payload.target_language}' nicht unterstuetzt")
count = await translate_and_store(payload.word_ids, payload.target_language)
return {"translated": count, "target_language": payload.target_language}

vocabulary/db.py

@@ -0,0 +1,296 @@
"""
Vocabulary Database — PostgreSQL storage for the vocabulary word catalog.
Stores 160k+ words with translations, IPA, syllables, examples, images, audio.
Uses asyncpg for async PostgreSQL access (same pattern as game/database.py).
Schema: lehrer.vocabulary_words (search_path set in main.py)
"""
import logging
import os
import uuid
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
_RAW_DB_URL = os.getenv(
"DATABASE_URL",
"postgresql://breakpilot:breakpilot@postgres:5432/breakpilot",
)
# Strip SQLAlchemy dialect prefix (asyncpg needs plain postgresql://)
DATABASE_URL = _RAW_DB_URL.replace("postgresql+asyncpg://", "postgresql://")
# Strip any ?options=... query string (search_path is applied via server_settings on connect)
if "?" in DATABASE_URL and "options=" in DATABASE_URL.split("?", 1)[1]:
    DATABASE_URL = DATABASE_URL.split("?", 1)[0]
_pool = None
async def get_pool():
"""Get or create the asyncpg connection pool."""
global _pool
if _pool is None:
import asyncpg
_pool = await asyncpg.create_pool(
DATABASE_URL, min_size=2, max_size=10,
server_settings={"search_path": "lehrer,core,public"},
)
return _pool
async def init_vocabulary_tables():
"""Create vocabulary tables if they don't exist."""
pool = await get_pool()
async with pool.acquire() as conn:
await conn.execute("""
CREATE TABLE IF NOT EXISTS vocabulary_words (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
english TEXT NOT NULL,
german TEXT NOT NULL DEFAULT '',
ipa_en TEXT NOT NULL DEFAULT '',
ipa_de TEXT NOT NULL DEFAULT '',
part_of_speech TEXT NOT NULL DEFAULT '',
syllables_en TEXT[] NOT NULL DEFAULT '{}',
syllables_de TEXT[] NOT NULL DEFAULT '{}',
example_en TEXT NOT NULL DEFAULT '',
example_de TEXT NOT NULL DEFAULT '',
image_url TEXT NOT NULL DEFAULT '',
audio_url_en TEXT NOT NULL DEFAULT '',
audio_url_de TEXT NOT NULL DEFAULT '',
difficulty INT NOT NULL DEFAULT 1,
tags TEXT[] NOT NULL DEFAULT '{}',
translations JSONB NOT NULL DEFAULT '{}',
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_vocab_english
ON vocabulary_words (lower(english));
CREATE INDEX IF NOT EXISTS idx_vocab_german
ON vocabulary_words (lower(german));
CREATE INDEX IF NOT EXISTS idx_vocab_pos
ON vocabulary_words (part_of_speech);
CREATE INDEX IF NOT EXISTS idx_vocab_difficulty
ON vocabulary_words (difficulty);
CREATE INDEX IF NOT EXISTS idx_vocab_tags
ON vocabulary_words USING GIN (tags);
""")
# Enable trigram extension for fuzzy search (optional)
try:
await conn.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_vocab_english_trgm
ON vocabulary_words USING GIN (english gin_trgm_ops);
""")
except Exception:
logger.info("pg_trgm not available — trigram search disabled, using LIKE fallback")
logger.info("vocabulary_words table initialized")
@dataclass
class VocabularyWord:
"""A single vocabulary word with all metadata."""
id: str = ""
english: str = ""
german: str = ""
ipa_en: str = ""
ipa_de: str = ""
part_of_speech: str = ""
syllables_en: List[str] = field(default_factory=list)
syllables_de: List[str] = field(default_factory=list)
example_en: str = ""
example_de: str = ""
image_url: str = ""
audio_url_en: str = ""
audio_url_de: str = ""
difficulty: int = 1
tags: List[str] = field(default_factory=list)
translations: Dict[str, str] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
return asdict(self)
def _row_to_word(row) -> VocabularyWord:
"""Convert an asyncpg Record to VocabularyWord."""
import json
translations = row["translations"]
if isinstance(translations, str):
translations = json.loads(translations)
return VocabularyWord(
id=str(row["id"]),
english=row["english"],
german=row["german"],
ipa_en=row["ipa_en"],
ipa_de=row["ipa_de"],
part_of_speech=row["part_of_speech"],
syllables_en=list(row["syllables_en"] or []),
syllables_de=list(row["syllables_de"] or []),
example_en=row["example_en"],
example_de=row["example_de"],
image_url=row["image_url"],
audio_url_en=row["audio_url_en"],
audio_url_de=row["audio_url_de"],
difficulty=row["difficulty"],
tags=list(row["tags"] or []),
translations=translations or {},
)
async def search_words(
query: str, lang: str = "en", limit: int = 20, offset: int = 0,
) -> List[VocabularyWord]:
"""Full-text search for words. Uses trigram similarity if available, else ILIKE."""
pool = await get_pool()
col = "english" if lang == "en" else "german"
async with pool.acquire() as conn:
# Try trigram search first, fall back to ILIKE
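# ("%" is pg_trgm's similarity operator; without the extension the query raises and we drop to the ILIKE branch)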
try:
rows = await conn.fetch(
f"""
SELECT * FROM vocabulary_words
WHERE lower({col}) LIKE $1 OR {col} % $2
ORDER BY similarity({col}, $2) DESC, lower({col})
LIMIT $3 OFFSET $4
""",
f"%{query.lower()}%", query, limit, offset,
)
except Exception:
rows = await conn.fetch(
f"""
SELECT * FROM vocabulary_words
WHERE lower({col}) LIKE $1
ORDER BY lower({col})
LIMIT $2 OFFSET $3
""",
f"%{query.lower()}%", limit, offset,
)
return [_row_to_word(r) for r in rows]
async def get_word(word_id: str) -> Optional[VocabularyWord]:
"""Get a single word by ID."""
pool = await get_pool()
async with pool.acquire() as conn:
row = await conn.fetchrow(
"SELECT * FROM vocabulary_words WHERE id = $1", uuid.UUID(word_id),
)
return _row_to_word(row) if row else None
async def browse_words(
pos: str = "", difficulty: int = 0, tag: str = "",
limit: int = 50, offset: int = 0,
) -> List[VocabularyWord]:
"""Browse words with filters."""
pool = await get_pool()
conditions = []
params: List[Any] = []
idx = 1
if pos:
conditions.append(f"part_of_speech = ${idx}")
params.append(pos)
idx += 1
if difficulty > 0:
conditions.append(f"difficulty = ${idx}")
params.append(difficulty)
idx += 1
if tag:
conditions.append(f"${idx} = ANY(tags)")
params.append(tag)
idx += 1
where = "WHERE " + " AND ".join(conditions) if conditions else ""
params.extend([limit, offset])
async with pool.acquire() as conn:
rows = await conn.fetch(
f"SELECT * FROM vocabulary_words {where} ORDER BY english LIMIT ${idx} OFFSET ${idx+1}",
*params,
)
return [_row_to_word(r) for r in rows]
async def insert_word(word: VocabularyWord) -> str:
"""Insert a new word, returns the ID."""
pool = await get_pool()
import json
word_id = word.id or str(uuid.uuid4())
async with pool.acquire() as conn:
await conn.execute(
"""
INSERT INTO vocabulary_words
(id, english, german, ipa_en, ipa_de, part_of_speech,
syllables_en, syllables_de, example_en, example_de,
image_url, audio_url_en, audio_url_de, difficulty, tags, translations)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)
ON CONFLICT (id) DO NOTHING
""",
uuid.UUID(word_id), word.english, word.german,
word.ipa_en, word.ipa_de, word.part_of_speech,
word.syllables_en, word.syllables_de,
word.example_en, word.example_de,
word.image_url, word.audio_url_en, word.audio_url_de,
word.difficulty, word.tags, json.dumps(word.translations),
)
return word_id
async def insert_words_bulk(words: List[VocabularyWord]) -> int:
"""Bulk insert words. Returns count of inserted rows."""
pool = await get_pool()
import json
records = []
for w in words:
wid = w.id or str(uuid.uuid4())
records.append((
uuid.UUID(wid), w.english, w.german,
w.ipa_en, w.ipa_de, w.part_of_speech,
w.syllables_en, w.syllables_de,
w.example_en, w.example_de,
w.image_url, w.audio_url_en, w.audio_url_de,
w.difficulty, w.tags, json.dumps(w.translations),
))
async with pool.acquire() as conn:
await conn.executemany(
"""
INSERT INTO vocabulary_words
(id, english, german, ipa_en, ipa_de, part_of_speech,
syllables_en, syllables_de, example_en, example_de,
image_url, audio_url_en, audio_url_de, difficulty, tags, translations)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)
ON CONFLICT (id) DO NOTHING
""",
records,
)
return len(records)
async def count_words() -> int:
"""Count total words in the database."""
pool = await get_pool()
async with pool.acquire() as conn:
return await conn.fetchval("SELECT COUNT(*) FROM vocabulary_words")
async def get_all_tags() -> List[str]:
"""Get all unique tags."""
pool = await get_pool()
async with pool.acquire() as conn:
rows = await conn.fetch(
"SELECT DISTINCT unnest(tags) AS tag FROM vocabulary_words ORDER BY tag"
)
return [r["tag"] for r in rows]
async def get_all_pos() -> List[str]:
"""Get all unique parts of speech."""
pool = await get_pool()
async with pool.acquire() as conn:
rows = await conn.fetch(
"SELECT DISTINCT part_of_speech FROM vocabulary_words WHERE part_of_speech != '' ORDER BY part_of_speech"
)
return [r["part_of_speech"] for r in rows]