Files
breakpilot-lehrer/backend-lehrer/vocabulary/api.py
Benjamin Admin 52a15b24fe
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 31s
CI / test-python-klausur (push) Failing after 2m29s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 22s
Add custom word entry + language pair support for learning units
- New UnitBuilder component with language pair selector (DE⇄EN, ES, FR, etc.)
- Manual word entry form with auto-suggest from Kaikki dictionary (6M words)
- "No results" prompt to add multi-word terms (e.g. "schottisches Hochland")
- New backend endpoint GET /vocabulary/lookup-translation (any→any via EN hub)
- Updated POST /vocabulary/units: accepts custom_words + source_lang/target_lang
- Split unit endpoints into vocabulary/unit_api.py (500 LOC budget)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-29 15:24:13 +02:00

382 lines
13 KiB
Python

"""
Vocabulary API — Search, browse, and build learning units from the word catalog.
Endpoints for teachers to find words and create learning units,
and for students to access word details with audio/images/syllables.
"""
import logging
import json
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from .db import (
search_words,
get_word,
browse_words,
insert_word,
count_words,
get_all_tags,
get_all_pos,
VocabularyWord,
)
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
# Router carrying every endpoint declared below; all paths are served under
# the "/vocabulary" prefix and grouped under the "vocabulary" tag.
router = APIRouter(prefix="/vocabulary", tags=["vocabulary"])
# ---------------------------------------------------------------------------
# Search & Browse
# ---------------------------------------------------------------------------
@router.get("/search")
async def api_search_words(
    q: str = Query("", description="Search query"),
    lang: str = Query("en"),
    limit: int = Query(20, ge=1, le=100),
    offset: int = Query(0, ge=0),
    source: str = Query("kaikki", description="Source: kaikki (6M words) or manual (27 words)"),
):
    """Search vocabulary words by text.

    With ``source=kaikki`` the query is delegated to the Kaikki/Wiktionary
    table search; any other ``source`` value searches the manually curated
    words through ``search_words``.

    Returns a dict with ``words``, the ``query`` and ``total``. ``total`` is
    the number of words in the returned page, not the overall match count.
    A blank query short-circuits to an empty result without touching the
    database.
    """
    term = q.strip()
    if term == "":
        return {"words": [], "query": q, "total": 0}

    if source != "kaikki":
        matches = await search_words(term, lang=lang, limit=limit, offset=offset)
        serialized = [match.to_dict() for match in matches]
        # The manual branch echoes the query exactly as received (unstripped).
        return {"words": serialized, "query": q, "total": len(matches)}

    return await _search_kaikki(term, lang, limit, offset)
async def _search_kaikki(q: str, lang: str, limit: int, offset: int):
    """Prefix-search the vocabulary_kaikki table and shape rows as word dicts.

    Args:
        q: Search text; matched case-insensitively as a literal prefix.
        lang: Language code the rows must have.
        limit: Maximum number of rows to return.
        offset: Number of rows to skip.

    Returns:
        Dict with ``words`` (one dict per row, using the same keys as the
        manual vocabulary words plus ``word``/``lang``), the ``query``,
        ``total`` (size of this page) and ``source`` = ``"kaikki"``.
    """
    from vocabulary.db import get_pool

    # Escape LIKE metacharacters so user input such as "50%" or "a_b" is
    # matched literally. Backslash is PostgreSQL's default LIKE escape
    # character, so it has to be doubled first.
    literal = (
        q.lower()
        .replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )

    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT id, word, lang, pos, ipa, translations, example
            FROM vocabulary_kaikki
            WHERE lang = $1 AND lower(word) LIKE $2
            ORDER BY length(word), lower(word)
            LIMIT $3 OFFSET $4
            """,
            lang, f"{literal}%", limit, offset,
        )

    words = []
    for r in rows:
        tr = r["translations"]
        # The column may arrive as a JSON string depending on driver codecs.
        if isinstance(tr, str):
            tr = json.loads(tr)

        is_en = r["lang"] == "en"
        is_de = r["lang"] == "de"

        if is_en:
            # Tolerate NULL / non-object translations instead of crashing
            # the whole search with an AttributeError.
            de_entry = tr.get("de") if isinstance(tr, dict) else None
            german = de_entry.get("text", "") if isinstance(de_entry, dict) else ""
        elif is_de:
            german = r["word"]
        else:
            german = ""

        words.append({
            "id": str(r["id"]),
            "english": r["word"] if is_en else "",
            "german": german,
            "word": r["word"],
            "lang": r["lang"],
            "ipa_en": r["ipa"] if is_en else "",
            "ipa_de": r["ipa"] if is_de else "",
            "part_of_speech": r["pos"],
            "syllables_en": [],
            "syllables_de": [],
            "example_en": r["example"] if is_en else "",
            "example_de": r["example"] if is_de else "",
            "image_url": "",
            "audio_url_en": "",
            "audio_url_de": "",
            "difficulty": 0,
            "tags": [],
            "translations": tr,
        })
    return {"words": words, "query": q, "total": len(words), "source": "kaikki"}
@router.get("/browse")
async def api_browse_words(
    pos: str = Query("", description="Part of speech filter"),
    difficulty: int = Query(0, ge=0, le=5, description="Difficulty 1-5, 0=all"),
    tag: str = Query("", description="Tag filter"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
):
    """List vocabulary words, optionally narrowed by filters.

    The filter values are forwarded unchanged to ``browse_words`` and echoed
    back under ``filters``. ``total`` is the number of words in the returned
    page.
    """
    active_filters = {"pos": pos, "difficulty": difficulty, "tag": tag}
    page = await browse_words(**active_filters, limit=limit, offset=offset)
    serialized = [entry.to_dict() for entry in page]
    return {
        "words": serialized,
        "filters": active_filters,
        "total": len(page),
    }
@router.get("/word/{word_id}")
async def api_get_word(word_id: str):
    """Return the full detail dict for one word.

    Raises:
        HTTPException: 404 when ``get_word`` yields nothing for ``word_id``.
    """
    found = await get_word(word_id)
    if found:
        return found.to_dict()
    raise HTTPException(status_code=404, detail="Wort nicht gefunden")
@router.get("/filters")
async def api_get_filters():
    """Return the available filter options and word counts.

    Tags, parts of speech and the word count come from the database helpers.
    The Kaikki figures are fixed constants (hardcoded to avoid a slow COUNT
    on the 6M-row table).
    """
    options = {"tags": await get_all_tags()}
    options["parts_of_speech"] = await get_all_pos()
    options["total_words"] = await count_words()
    # Static Kaikki statistics; not queried at request time.
    options["kaikki_total"] = 6_271_749
    options["kaikki_languages"] = 24
    return options
# ---------------------------------------------------------------------------
# Audio TTS for Words
# ---------------------------------------------------------------------------
@router.get("/word/{word_id}/audio/{lang}")
async def api_get_word_audio(word_id: str, lang: str = "en"):
    """Serve TTS audio for a vocabulary word.

    The spoken text is the word's English form when ``lang`` is ``"en"`` and
    its German form for every other value. Audio is obtained through
    ``services.audio.get_or_generate_audio`` and returned as ``audio/mpeg``.
    Per the original notes: generated on first request and cached afterwards,
    using Piper TTS with the Thorsten (DE) and Lessac (EN) voices.

    Raises:
        HTTPException: 404 if the word does not exist, 400 if it has no text
            for the requested language, 503 if no audio could be produced.
    """
    from fastapi.responses import Response as FastAPIResponse

    entry = await get_word(word_id)
    if not entry:
        raise HTTPException(status_code=404, detail="Wort nicht gefunden")

    if lang == "en":
        spoken = entry.english
    else:
        spoken = entry.german
    if not spoken:
        raise HTTPException(status_code=400, detail=f"Kein Text fuer Sprache '{lang}'")

    from services.audio import get_or_generate_audio

    mp3 = await get_or_generate_audio(spoken, language=lang, word_id=word_id)
    if mp3:
        return FastAPIResponse(content=mp3, media_type="audio/mpeg")
    raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
@router.get("/word/{word_id}/audio-syllables/{lang}")
async def api_get_syllable_audio(word_id: str, lang: str = "en"):
    """Serve TTS audio with slow, syllable-by-syllable pronunciation.

    Syllables are joined with " ... " (e.g. "ap ... ple") so the TTS engine
    inserts pauses between them. English syllables are used when ``lang`` is
    ``"en"``, German ones otherwise. If the word has no syllables stored, the
    full word is spoken instead.

    Raises:
        HTTPException: 404 if the word does not exist, 400 if it has neither
            syllables nor text for the requested language, 503 if no audio
            could be produced.
    """
    from fastapi.responses import Response as FastAPIResponse

    word = await get_word(word_id)
    if not word:
        raise HTTPException(status_code=404, detail="Wort nicht gefunden")

    syllables = word.syllables_en if lang == "en" else word.syllables_de
    if not syllables:
        # Fallback to the full word.
        text = word.english if lang == "en" else word.german
        if not text:
            # Same contract as the plain audio endpoint: without any text
            # there is nothing to speak, so fail with 400 instead of joining
            # None (TypeError) or sending an empty string to the TTS service.
            raise HTTPException(status_code=400, detail=f"Kein Text fuer Sprache '{lang}'")
        syllables = [text]

    # Join syllables with pauses (Piper handles "..." as pause).
    slow_text = " ... ".join(syllables)

    from services.audio import get_or_generate_audio

    # Separate cache key so the slow rendering does not collide with the
    # normal-speed audio of the same word.
    cache_key = f"{word_id}_syl_{lang}"
    audio_bytes = await get_or_generate_audio(slow_text, language=lang, word_id=cache_key)
    if not audio_bytes:
        raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
    return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
@router.get("/tts")
async def api_tts(text: str = Query("", min_length=1), lang: str = Query("de")):
    """Text-to-speech for arbitrary text, returned as ``audio/mpeg``.

    Per the original notes: uses Piper TTS (Thorsten DE / Lessac EN) and is
    cached by text and language.

    Raises:
        HTTPException: 400 if ``text`` is missing or blank, 503 if no audio
            could be produced.
    """
    # The declared default "" is not checked against min_length, so a request
    # without a ``text`` parameter would otherwise reach the TTS service with
    # nothing to speak.
    if not text.strip():
        raise HTTPException(status_code=400, detail="Kein Text angegeben")

    from fastapi.responses import Response as FastAPIResponse
    from services.audio import get_or_generate_audio

    audio_bytes = await get_or_generate_audio(text, language=lang)
    if not audio_bytes:
        raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")
    return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
# ---------------------------------------------------------------------------
# Learning Unit Creation from Word Selection
# ---------------------------------------------------------------------------
# Unit creation and translation lookup moved to vocabulary/unit_api.py
# ---------------------------------------------------------------------------
# Bulk Import (for seeding the dictionary)
# ---------------------------------------------------------------------------
class BulkImportPayload(BaseModel):
    """Request body for the bulk import endpoint."""
    # Free-form word dicts; the import endpoint reads the keys it knows via
    # .get() with defaults, so no per-word schema is enforced here.
    words: List[Dict[str, Any]]
@router.post("/import")
async def api_bulk_import(payload: BulkImportPayload):
    """Bulk-insert vocabulary words (used for seeding the dictionary).

    Every entry in ``payload.words`` is converted to a ``VocabularyWord``.
    Recognised keys: english, german, ipa_en, ipa_de, part_of_speech,
    syllables_en, syllables_de, example_en, example_de, difficulty, tags,
    translations. Missing keys fall back to empty values (difficulty: 1).

    Returns:
        ``{"imported": <count reported by insert_words_bulk>}``.
    """
    from .db import insert_words_bulk

    prepared = [
        VocabularyWord(
            english=entry.get("english", ""),
            german=entry.get("german", ""),
            ipa_en=entry.get("ipa_en", ""),
            ipa_de=entry.get("ipa_de", ""),
            part_of_speech=entry.get("part_of_speech", ""),
            syllables_en=entry.get("syllables_en", []),
            syllables_de=entry.get("syllables_de", []),
            example_en=entry.get("example_en", ""),
            example_de=entry.get("example_de", ""),
            difficulty=entry.get("difficulty", 1),
            tags=entry.get("tags", []),
            translations=entry.get("translations", {}),
        )
        for entry in payload.words
    ]

    count = await insert_words_bulk(prepared)
    logger.info(f"Bulk imported {count} vocabulary words")
    return {"imported": count}
# ---------------------------------------------------------------------------
# Image Enrichment, Topic Lookup & Translation Generation
# ---------------------------------------------------------------------------
@router.post("/enrich-images")
async def api_enrich_images(word_ids: Optional[List[str]] = None):
    """Fetch and store images for vocabulary words (Wikipedia + emoji fallback).

    Args:
        word_ids: IDs of the words to enrich. When omitted or empty, every
            row of ``vocabulary_words`` whose ``image_url`` is empty or NULL
            is selected instead.

    Returns:
        ``{"enriched": count, "total": number_of_ids}``, or
        ``{"enriched": 0, "message": ...}`` when there is nothing to do.
    """
    from services.image_service import enrich_words_with_images
    from vocabulary.db import get_pool

    if not word_ids:
        # No explicit selection: pick up all words still lacking an image.
        pool = await get_pool()
        async with pool.acquire() as conn:
            rows = await conn.fetch("SELECT id FROM vocabulary_words WHERE image_url = '' OR image_url IS NULL")
        word_ids = [str(r["id"]) for r in rows]

    if not word_ids:
        return {"enriched": 0, "message": "All words already have images"}

    count = await enrich_words_with_images(word_ids)
    return {"enriched": count, "total": len(word_ids)}
async def _lookup_kaikki_translation(conn, word: str, lang: str, cache: Dict[str, Any]):
    """Return the Kaikki translation text of an English word, or "" if none.

    Results are memoised in ``cache`` (keyed by the lowercased word) so a word
    that appears in several topics is only queried once per request.
    """
    key = word.lower()
    if key in cache:
        return cache[key]

    row = await conn.fetchrow(
        "SELECT translations FROM vocabulary_kaikki WHERE lang = 'en' AND lower(word) = $1 LIMIT 1",
        key,
    )
    raw = row["translations"] if row else None
    # The column may arrive as a JSON string depending on driver codecs.
    if raw and isinstance(raw, str):
        raw = json.loads(raw)
    # Tolerate NULL / non-object values instead of failing the whole request.
    entry = raw.get(lang) if isinstance(raw, dict) else None
    text = entry.get("text", "") if isinstance(entry, dict) else ""

    cache[key] = text
    return text


@router.get("/topics")
async def api_get_topics(
    q: str = Query("", description="Search topic or word"),
    lang: str = Query("en", description="Display language for word labels"),
):
    """Find topics matching a search term and return their word lists.

    With a blank ``q`` the first 50 topics (ordered by name) are returned.
    Otherwise a topic matches when its name contains ``q``
    (case-insensitive, literal match) or when the lowercased ``q`` equals one
    of its words; matches are ordered by word count, largest first.

    Each topic carries ``words`` (as stored) and ``display_words``. When
    ``lang`` is not ``"en"``, the first 20 display words are replaced by
    their Kaikki translation where one exists; the rest stay untranslated to
    bound the number of lookups.
    """
    from vocabulary.db import get_pool

    term = q.strip().lower()
    translation_cache: Dict[str, Any] = {}
    topics = []

    pool = await get_pool()
    async with pool.acquire() as conn:
        if not term:
            rows = await conn.fetch("SELECT topic, words, word_count FROM vocabulary_topics ORDER BY topic LIMIT 50")
        else:
            # Escape LIKE metacharacters so "%" / "_" typed by the user are
            # matched literally (backslash is PostgreSQL's default escape).
            literal = (
                term.replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )
            rows = await conn.fetch("""
                SELECT topic, words, word_count FROM vocabulary_topics
                WHERE lower(topic) LIKE $1 OR $2 = ANY(words)
                ORDER BY word_count DESC
            """, f"%{literal}%", term)

        for r in rows:
            en_words = list(r["words"])
            display_words = en_words
            if lang != "en":
                translated = []
                for w in en_words[:20]:  # Limit to 20 for speed
                    tr_text = await _lookup_kaikki_translation(conn, w, lang, translation_cache)
                    # Fall back to the English word when no translation exists.
                    translated.append(tr_text if tr_text else w)
                display_words = translated + en_words[20:]
            topics.append({
                "topic": r["topic"],
                "words": en_words,
                "display_words": display_words,
                "word_count": r["word_count"],
            })

    return {"topics": topics, "query": q, "lang": lang}
class TranslateRequest(BaseModel):
    """Request body for the translate endpoint."""
    # IDs of the vocabulary words to translate.
    word_ids: List[str]
    # Target language code; the endpoint rejects codes outside its allow-list.
    target_language: str
@router.post("/translate")
async def api_translate_words(payload: TranslateRequest):
    """Translate vocabulary words into a supported target language.

    The work is delegated to ``services.translation.translate_and_store``.
    Per the original notes, translation runs on a local LLM (Ollama) and the
    results are cached in the ``vocabulary_words.translations`` JSONB field.

    Raises:
        HTTPException: 400 when the target language is not one of
            tr, ar, uk, ru, pl, fr, es.
    """
    from services.translation import translate_and_store

    target = payload.target_language
    supported = ("tr", "ar", "uk", "ru", "pl", "fr", "es")
    if target in supported:
        translated = await translate_and_store(payload.word_ids, target)
        return {"translated": translated, "target_language": payload.target_language}
    raise HTTPException(status_code=400, detail=f"Sprache '{payload.target_language}' nicht unterstuetzt")