Restructure: Move final 16 root files into packages (backend-lehrer)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 37s
CI / test-go-edu-search (push) Successful in 35s
CI / test-python-klausur (push) Failing after 2m41s
CI / test-python-agent-core (push) Successful in 30s
CI / test-nodejs-website (push) Successful in 38s

classroom/  (+2): state_engine_api, state_engine_models
vocabulary/ (+2): api, db
worksheets/ (+2): api, models
services/   (+6): audio, email, translation, claude_vision, ai_processor, story_generator
api/        (+4): school, klausur_proxy, progress, user_language

Only main.py + config.py remain at root. 16 shims added.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Benjamin Admin
Date:   2026-04-25 22:50:37 +02:00
parent 6be555fb7c
commit cba877c65a
36 changed files with 3712 additions and 3564 deletions
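
The "16 shims added" mentioned in the message are presumably thin re-export modules left behind at the old root paths so that existing imports (for example "from audio_service import get_or_generate_audio" in vocabulary/api.py below) keep working. A minimal sketch of one such shim, assuming the moved module is services/audio.py; the names are illustrative, not taken from the diff:

# audio_service.py (repo root) — hypothetical backward-compatibility shim.
# Forwards the old import path to the module's new home under services/, so
# "from audio_service import get_or_generate_audio" keeps working until
# callers are migrated to "from services.audio import ...".
from services.audio import *                       # noqa: F401,F403
from services.audio import get_or_generate_audio  # noqa: F401  (explicit re-export)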

vocabulary/__init__.py

@@ -0,0 +1,33 @@
# Vocabulary Module
# vocabulary/api.py — API router (search, browse, import, translate)
# vocabulary/db.py — PostgreSQL storage for vocabulary word catalog
from .api import router
from .db import (
VocabularyWord,
get_pool,
init_vocabulary_tables,
search_words,
get_word,
browse_words,
insert_word,
insert_words_bulk,
count_words,
get_all_tags,
get_all_pos,
)
__all__ = [
"router",
"VocabularyWord",
"get_pool",
"init_vocabulary_tables",
"search_words",
"get_word",
"browse_words",
"insert_word",
"insert_words_bulk",
"count_words",
"get_all_tags",
"get_all_pos",
]
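
With the package __init__ re-exporting both the router and the storage helpers, the remaining root main.py only needs a few lines to wire the module in. A minimal sketch, assuming a standard FastAPI app object (the startup hook and names are illustrative, not taken from main.py):

# Hypothetical wiring in main.py after the restructure
from fastapi import FastAPI
from vocabulary import router as vocabulary_router, init_vocabulary_tables

app = FastAPI()
app.include_router(vocabulary_router)   # mounts all /vocabulary/* endpoints

@app.on_event("startup")
async def init_vocabulary() -> None:
    # Idempotent: the table and indexes use CREATE ... IF NOT EXISTS
    await init_vocabulary_tables()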

vocabulary/api.py

@@ -0,0 +1,352 @@
"""
Vocabulary API — Search, browse, and build learning units from the word catalog.
Endpoints for teachers to find words and create learning units,
and for students to access word details with audio/images/syllables.
"""
import logging
import json
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from .db import (
search_words,
get_word,
browse_words,
insert_word,
count_words,
get_all_tags,
get_all_pos,
VocabularyWord,
)
from learning_units import (
LearningUnitCreate,
create_learning_unit,
get_learning_unit,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/vocabulary", tags=["vocabulary"])
# ---------------------------------------------------------------------------
# Search & Browse
# ---------------------------------------------------------------------------
@router.get("/search")
async def api_search_words(
q: str = Query("", description="Search query"),
lang: str = Query("en", pattern="^(en|de)$"),
limit: int = Query(20, ge=1, le=100),
offset: int = Query(0, ge=0),
):
"""Full-text search for vocabulary words."""
if not q.strip():
return {"words": [], "query": q, "total": 0}
words = await search_words(q.strip(), lang=lang, limit=limit, offset=offset)
return {
"words": [w.to_dict() for w in words],
"query": q,
"total": len(words),
}
@router.get("/browse")
async def api_browse_words(
pos: str = Query("", description="Part of speech filter"),
difficulty: int = Query(0, ge=0, le=5, description="Difficulty 1-5, 0=all"),
tag: str = Query("", description="Tag filter"),
limit: int = Query(50, ge=1, le=200),
offset: int = Query(0, ge=0),
):
"""Browse vocabulary words with filters."""
words = await browse_words(
pos=pos, difficulty=difficulty, tag=tag,
limit=limit, offset=offset,
)
return {
"words": [w.to_dict() for w in words],
"filters": {"pos": pos, "difficulty": difficulty, "tag": tag},
"total": len(words),
}
@router.get("/word/{word_id}")
async def api_get_word(word_id: str):
"""Get a single word with all details."""
word = await get_word(word_id)
if not word:
raise HTTPException(status_code=404, detail="Wort nicht gefunden")
return word.to_dict()
@router.get("/filters")
async def api_get_filters():
"""Get available filter options (tags, parts of speech, word count)."""
tags = await get_all_tags()
pos_list = await get_all_pos()
total = await count_words()
return {
"tags": tags,
"parts_of_speech": pos_list,
"total_words": total,
}
# ---------------------------------------------------------------------------
# Audio TTS for Words
# ---------------------------------------------------------------------------
@router.get("/word/{word_id}/audio/{lang}")
async def api_get_word_audio(word_id: str, lang: str = "en"):
"""Get or generate TTS audio for a vocabulary word.
Returns MP3 audio. Generated on first request, cached after.
Uses Piper TTS (MIT license) with Thorsten (DE) and Lessac (EN) voices.
"""
from fastapi.responses import Response as FastAPIResponse
word = await get_word(word_id)
if not word:
raise HTTPException(status_code=404, detail="Wort nicht gefunden")
text = word.english if lang == "en" else word.german
if not text:
raise HTTPException(status_code=400, detail=f"Kein Text fuer Sprache '{lang}'")
from audio_service import get_or_generate_audio
audio_bytes = await get_or_generate_audio(text, language=lang, word_id=word_id)
if not audio_bytes:
raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
@router.get("/word/{word_id}/audio-syllables/{lang}")
async def api_get_syllable_audio(word_id: str, lang: str = "en"):
"""Get TTS audio with slow syllable pronunciation.
Generates audio like "ap ... ple" with pauses between syllables.
"""
from fastapi.responses import Response as FastAPIResponse
word = await get_word(word_id)
if not word:
raise HTTPException(status_code=404, detail="Wort nicht gefunden")
syllables = word.syllables_en if lang == "en" else word.syllables_de
if not syllables:
# Fallback to full word
text = word.english if lang == "en" else word.german
syllables = [text]
# Join syllables with pauses (Piper handles "..." as pause)
slow_text = " ... ".join(syllables)
from audio_service import get_or_generate_audio
cache_key = f"{word_id}_syl_{lang}"
audio_bytes = await get_or_generate_audio(slow_text, language=lang, word_id=cache_key)
if not audio_bytes:
raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
# ---------------------------------------------------------------------------
# Learning Unit Creation from Word Selection
# ---------------------------------------------------------------------------
class CreateUnitFromWordsPayload(BaseModel):
title: str
word_ids: List[str]
grade: Optional[str] = None
language: Optional[str] = "de"
@router.post("/units")
async def api_create_unit_from_words(payload: CreateUnitFromWordsPayload):
"""Create a learning unit from selected vocabulary word IDs.
Fetches full word details, creates a LearningUnit in the
learning_units system, and stores the vocabulary data.
"""
if not payload.word_ids:
raise HTTPException(status_code=400, detail="Keine Woerter ausgewaehlt")
# Fetch all selected words
words = []
for wid in payload.word_ids:
word = await get_word(wid)
if word:
words.append(word)
if not words:
raise HTTPException(status_code=404, detail="Keine der Woerter gefunden")
# Create learning unit
lu = create_learning_unit(LearningUnitCreate(
title=payload.title,
topic="Vocabulary",
grade_level=payload.grade or "5-8",
language=payload.language or "de",
status="raw",
))
# Save vocabulary data as analysis JSON for generators
import os
analysis_dir = os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten")
os.makedirs(analysis_dir, exist_ok=True)
vocab_data = [w.to_dict() for w in words]
analysis_path = os.path.join(analysis_dir, f"{lu.id}_vocab.json")
with open(analysis_path, "w", encoding="utf-8") as f:
json.dump({"words": vocab_data, "title": payload.title}, f, ensure_ascii=False, indent=2)
# Also save as QA items for flashcards/type trainer
qa_items = []
for i, w in enumerate(words):
qa_items.append({
"id": f"qa_{i+1}",
"question": w.english,
"answer": w.german,
"question_type": "knowledge",
"key_terms": [w.english],
"difficulty": w.difficulty,
"source_hint": w.part_of_speech,
"leitner_box": 0,
"correct_count": 0,
"incorrect_count": 0,
"last_seen": None,
"next_review": None,
# Extra fields for enhanced flashcards
"ipa_en": w.ipa_en,
"ipa_de": w.ipa_de,
"syllables_en": w.syllables_en,
"syllables_de": w.syllables_de,
"example_en": w.example_en,
"example_de": w.example_de,
"image_url": w.image_url,
"audio_url_en": w.audio_url_en,
"audio_url_de": w.audio_url_de,
"part_of_speech": w.part_of_speech,
"translations": w.translations,
})
qa_path = os.path.join(analysis_dir, f"{lu.id}_qa.json")
with open(qa_path, "w", encoding="utf-8") as f:
json.dump({
"qa_items": qa_items,
"metadata": {
"subject": "English Vocabulary",
"grade_level": payload.grade or "5-8",
"source_title": payload.title,
"total_questions": len(qa_items),
},
}, f, ensure_ascii=False, indent=2)
logger.info(f"Created vocab unit {lu.id} with {len(words)} words")
return {
"unit_id": lu.id,
"title": payload.title,
"word_count": len(words),
"status": "created",
}
@router.get("/units/{unit_id}")
async def api_get_unit_words(unit_id: str):
"""Get all words for a learning unit."""
import os
vocab_path = os.path.join(
os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten"),
f"{unit_id}_vocab.json",
)
if not os.path.exists(vocab_path):
raise HTTPException(status_code=404, detail="Unit nicht gefunden")
with open(vocab_path, "r", encoding="utf-8") as f:
data = json.load(f)
return {
"unit_id": unit_id,
"title": data.get("title", ""),
"words": data.get("words", []),
}
# ---------------------------------------------------------------------------
# Bulk Import (for seeding the dictionary)
# ---------------------------------------------------------------------------
class BulkImportPayload(BaseModel):
words: List[Dict[str, Any]]
@router.post("/import")
async def api_bulk_import(payload: BulkImportPayload):
"""Bulk import vocabulary words (for seeding the dictionary).
Each word dict should have at minimum: english, german.
Optional: ipa_en, ipa_de, part_of_speech, syllables_en, syllables_de,
example_en, example_de, difficulty, tags, translations.
"""
from .db import insert_words_bulk
words = []
for w in payload.words:
words.append(VocabularyWord(
english=w.get("english", ""),
german=w.get("german", ""),
ipa_en=w.get("ipa_en", ""),
ipa_de=w.get("ipa_de", ""),
part_of_speech=w.get("part_of_speech", ""),
syllables_en=w.get("syllables_en", []),
syllables_de=w.get("syllables_de", []),
example_en=w.get("example_en", ""),
example_de=w.get("example_de", ""),
difficulty=w.get("difficulty", 1),
tags=w.get("tags", []),
translations=w.get("translations", {}),
))
count = await insert_words_bulk(words)
logger.info(f"Bulk imported {count} vocabulary words")
return {"imported": count}
# ---------------------------------------------------------------------------
# Translation Generation
# ---------------------------------------------------------------------------
class TranslateRequest(BaseModel):
word_ids: List[str]
target_language: str
@router.post("/translate")
async def api_translate_words(payload: TranslateRequest):
"""Generate translations for vocabulary words into a target language.
Uses local LLM (Ollama) for translation. Results are cached in the
vocabulary_words.translations JSONB field.
"""
from translation_service import translate_and_store
if payload.target_language not in {"tr", "ar", "uk", "ru", "pl", "fr", "es"}:
raise HTTPException(status_code=400, detail=f"Sprache '{payload.target_language}' nicht unterstuetzt")
count = await translate_and_store(payload.word_ids, payload.target_language)
return {"translated": count, "target_language": payload.target_language}

vocabulary/db.py

@@ -0,0 +1,296 @@
"""
Vocabulary Database — PostgreSQL storage for the vocabulary word catalog.
Stores 160k+ words with translations, IPA, syllables, examples, images, audio.
Uses asyncpg for async PostgreSQL access (same pattern as game/database.py).
Schema: lehrer.vocabulary_words (search_path set in main.py)
"""
import logging
import os
import uuid
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
_RAW_DB_URL = os.getenv(
"DATABASE_URL",
"postgresql://breakpilot:breakpilot@postgres:5432/breakpilot",
)
# Strip SQLAlchemy dialect prefix (asyncpg needs plain postgresql://)
DATABASE_URL = _RAW_DB_URL.replace("postgresql+asyncpg://", "postgresql://")
# Strip any ?options=... query string (search_path is applied via server_settings on connect)
if "?" in DATABASE_URL and "options=" in DATABASE_URL.split("?", 1)[1]:
    DATABASE_URL = DATABASE_URL.split("?", 1)[0]
_pool = None
async def get_pool():
"""Get or create the asyncpg connection pool."""
global _pool
if _pool is None:
import asyncpg
_pool = await asyncpg.create_pool(
DATABASE_URL, min_size=2, max_size=10,
server_settings={"search_path": "lehrer,core,public"},
)
return _pool
async def init_vocabulary_tables():
"""Create vocabulary tables if they don't exist."""
pool = await get_pool()
async with pool.acquire() as conn:
await conn.execute("""
CREATE TABLE IF NOT EXISTS vocabulary_words (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
english TEXT NOT NULL,
german TEXT NOT NULL DEFAULT '',
ipa_en TEXT NOT NULL DEFAULT '',
ipa_de TEXT NOT NULL DEFAULT '',
part_of_speech TEXT NOT NULL DEFAULT '',
syllables_en TEXT[] NOT NULL DEFAULT '{}',
syllables_de TEXT[] NOT NULL DEFAULT '{}',
example_en TEXT NOT NULL DEFAULT '',
example_de TEXT NOT NULL DEFAULT '',
image_url TEXT NOT NULL DEFAULT '',
audio_url_en TEXT NOT NULL DEFAULT '',
audio_url_de TEXT NOT NULL DEFAULT '',
difficulty INT NOT NULL DEFAULT 1,
tags TEXT[] NOT NULL DEFAULT '{}',
translations JSONB NOT NULL DEFAULT '{}',
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_vocab_english
ON vocabulary_words (lower(english));
CREATE INDEX IF NOT EXISTS idx_vocab_german
ON vocabulary_words (lower(german));
CREATE INDEX IF NOT EXISTS idx_vocab_pos
ON vocabulary_words (part_of_speech);
CREATE INDEX IF NOT EXISTS idx_vocab_difficulty
ON vocabulary_words (difficulty);
CREATE INDEX IF NOT EXISTS idx_vocab_tags
ON vocabulary_words USING GIN (tags);
""")
# Enable trigram extension for fuzzy search (optional)
try:
await conn.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_vocab_english_trgm
ON vocabulary_words USING GIN (english gin_trgm_ops);
""")
except Exception:
logger.info("pg_trgm not available — trigram search disabled, using LIKE fallback")
logger.info("vocabulary_words table initialized")
@dataclass
class VocabularyWord:
"""A single vocabulary word with all metadata."""
id: str = ""
english: str = ""
german: str = ""
ipa_en: str = ""
ipa_de: str = ""
part_of_speech: str = ""
syllables_en: List[str] = field(default_factory=list)
syllables_de: List[str] = field(default_factory=list)
example_en: str = ""
example_de: str = ""
image_url: str = ""
audio_url_en: str = ""
audio_url_de: str = ""
difficulty: int = 1
tags: List[str] = field(default_factory=list)
translations: Dict[str, str] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
return asdict(self)
def _row_to_word(row) -> VocabularyWord:
"""Convert an asyncpg Record to VocabularyWord."""
import json
translations = row["translations"]
if isinstance(translations, str):
translations = json.loads(translations)
return VocabularyWord(
id=str(row["id"]),
english=row["english"],
german=row["german"],
ipa_en=row["ipa_en"],
ipa_de=row["ipa_de"],
part_of_speech=row["part_of_speech"],
syllables_en=list(row["syllables_en"] or []),
syllables_de=list(row["syllables_de"] or []),
example_en=row["example_en"],
example_de=row["example_de"],
image_url=row["image_url"],
audio_url_en=row["audio_url_en"],
audio_url_de=row["audio_url_de"],
difficulty=row["difficulty"],
tags=list(row["tags"] or []),
translations=translations or {},
)
async def search_words(
query: str, lang: str = "en", limit: int = 20, offset: int = 0,
) -> List[VocabularyWord]:
"""Full-text search for words. Uses trigram similarity if available, else ILIKE."""
pool = await get_pool()
col = "english" if lang == "en" else "german"
async with pool.acquire() as conn:
# Try trigram search first, fall back to ILIKE
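# ("%" is pg_trgm's similarity operator; without the extension the query raises and we drop to the ILIKE branch)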
try:
rows = await conn.fetch(
f"""
SELECT * FROM vocabulary_words
WHERE lower({col}) LIKE $1 OR {col} % $2
ORDER BY similarity({col}, $2) DESC, lower({col})
LIMIT $3 OFFSET $4
""",
f"%{query.lower()}%", query, limit, offset,
)
except Exception:
rows = await conn.fetch(
f"""
SELECT * FROM vocabulary_words
WHERE lower({col}) LIKE $1
ORDER BY lower({col})
LIMIT $2 OFFSET $3
""",
f"%{query.lower()}%", limit, offset,
)
return [_row_to_word(r) for r in rows]
async def get_word(word_id: str) -> Optional[VocabularyWord]:
"""Get a single word by ID."""
pool = await get_pool()
async with pool.acquire() as conn:
row = await conn.fetchrow(
"SELECT * FROM vocabulary_words WHERE id = $1", uuid.UUID(word_id),
)
return _row_to_word(row) if row else None
async def browse_words(
pos: str = "", difficulty: int = 0, tag: str = "",
limit: int = 50, offset: int = 0,
) -> List[VocabularyWord]:
"""Browse words with filters."""
pool = await get_pool()
conditions = []
params: List[Any] = []
idx = 1
if pos:
conditions.append(f"part_of_speech = ${idx}")
params.append(pos)
idx += 1
if difficulty > 0:
conditions.append(f"difficulty = ${idx}")
params.append(difficulty)
idx += 1
if tag:
conditions.append(f"${idx} = ANY(tags)")
params.append(tag)
idx += 1
where = "WHERE " + " AND ".join(conditions) if conditions else ""
params.extend([limit, offset])
async with pool.acquire() as conn:
rows = await conn.fetch(
f"SELECT * FROM vocabulary_words {where} ORDER BY english LIMIT ${idx} OFFSET ${idx+1}",
*params,
)
return [_row_to_word(r) for r in rows]
async def insert_word(word: VocabularyWord) -> str:
"""Insert a new word, returns the ID."""
pool = await get_pool()
import json
word_id = word.id or str(uuid.uuid4())
async with pool.acquire() as conn:
await conn.execute(
"""
INSERT INTO vocabulary_words
(id, english, german, ipa_en, ipa_de, part_of_speech,
syllables_en, syllables_de, example_en, example_de,
image_url, audio_url_en, audio_url_de, difficulty, tags, translations)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)
ON CONFLICT (id) DO NOTHING
""",
uuid.UUID(word_id), word.english, word.german,
word.ipa_en, word.ipa_de, word.part_of_speech,
word.syllables_en, word.syllables_de,
word.example_en, word.example_de,
word.image_url, word.audio_url_en, word.audio_url_de,
word.difficulty, word.tags, json.dumps(word.translations),
)
return word_id
async def insert_words_bulk(words: List[VocabularyWord]) -> int:
"""Bulk insert words. Returns count of inserted rows."""
pool = await get_pool()
import json
records = []
for w in words:
wid = w.id or str(uuid.uuid4())
records.append((
uuid.UUID(wid), w.english, w.german,
w.ipa_en, w.ipa_de, w.part_of_speech,
w.syllables_en, w.syllables_de,
w.example_en, w.example_de,
w.image_url, w.audio_url_en, w.audio_url_de,
w.difficulty, w.tags, json.dumps(w.translations),
))
async with pool.acquire() as conn:
await conn.executemany(
"""
INSERT INTO vocabulary_words
(id, english, german, ipa_en, ipa_de, part_of_speech,
syllables_en, syllables_de, example_en, example_de,
image_url, audio_url_en, audio_url_de, difficulty, tags, translations)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)
ON CONFLICT (id) DO NOTHING
""",
records,
)
return len(records)
async def count_words() -> int:
"""Count total words in the database."""
pool = await get_pool()
async with pool.acquire() as conn:
return await conn.fetchval("SELECT COUNT(*) FROM vocabulary_words")
async def get_all_tags() -> List[str]:
"""Get all unique tags."""
pool = await get_pool()
async with pool.acquire() as conn:
rows = await conn.fetch(
"SELECT DISTINCT unnest(tags) AS tag FROM vocabulary_words ORDER BY tag"
)
return [r["tag"] for r in rows]
async def get_all_pos() -> List[str]:
"""Get all unique parts of speech."""
pool = await get_pool()
async with pool.acquire() as conn:
rows = await conn.fetch(
"SELECT DISTINCT part_of_speech FROM vocabulary_words WHERE part_of_speech != '' ORDER BY part_of_speech"
)
return [r["part_of_speech"] for r in rows]