31 curated topics with 683 words (Fruit, Animals, Body, Eye, Sports, School, Family, Weather, etc.). When a user types a word that belongs to a topic, the topic appears as a suggestion with an "Alle laden" button. Clicking "Alle laden" fetches all words from that topic via Kaikki and displays them for easy selection into a learning unit. New endpoint: GET /api/vocabulary/topics?q=banana New table: vocabulary_topics (topic, words[], word_count) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
479 lines · 16 KiB · Python
"""
|
|
Vocabulary API — Search, browse, and build learning units from the word catalog.
|
|
|
|
Endpoints for teachers to find words and create learning units,
|
|
and for students to access word details with audio/images/syllables.
|
|
"""
|
|
|
|
import logging
|
|
import json
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from fastapi import APIRouter, HTTPException, Query
|
|
from pydantic import BaseModel
|
|
|
|
from .db import (
|
|
search_words,
|
|
get_word,
|
|
browse_words,
|
|
insert_word,
|
|
count_words,
|
|
get_all_tags,
|
|
get_all_pos,
|
|
VocabularyWord,
|
|
)
|
|
from units.learning import (
|
|
LearningUnitCreate,
|
|
create_learning_unit,
|
|
get_learning_unit,
|
|
)
|
|
|
|
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# All endpoints below are mounted under /vocabulary and grouped under the
# "vocabulary" tag in the generated OpenAPI docs.
router = APIRouter(prefix="/vocabulary", tags=["vocabulary"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Search & Browse
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@router.get("/search")
async def api_search_words(
    q: str = Query("", description="Search query"),
    lang: str = Query("en"),
    limit: int = Query(20, ge=1, le=100),
    offset: int = Query(0, ge=0),
    source: str = Query("kaikki", description="Source: kaikki (6M words) or manual (27 words)"),
):
    """Full-text search for vocabulary words.

    source=kaikki searches the 6.27M Kaikki/Wiktionary dictionary.
    source=manual searches the manually curated vocabulary_words table.
    """
    term = q.strip()
    # A blank query returns an empty page rather than matching everything.
    if not term:
        return {"words": [], "query": q, "total": 0}

    # Large Wiktionary-derived table gets its own query path.
    if source == "kaikki":
        return await _search_kaikki(term, lang, limit, offset)

    # Manually curated table: delegate to the db-layer full-text helper.
    matches = await search_words(term, lang=lang, limit=limit, offset=offset)
    serialized = [m.to_dict() for m in matches]
    return {"words": serialized, "query": q, "total": len(serialized)}
|
|
|
|
|
|
async def _search_kaikki(q: str, lang: str, limit: int, offset: int):
    """Search the vocabulary_kaikki table (6.27M Wiktionary entries).

    Prefix search on the lowercased word, shortest matches first.

    Args:
        q: Search term (already stripped by the caller).
        lang: Language code filter, e.g. "en" or "de".
        limit: Maximum rows for this page.
        offset: Pagination offset.

    Returns:
        Dict with "words" (shaped like the manual-table word dicts), "query",
        "total" (size of THIS page, not a global count), and "source".
    """
    from vocabulary.db import get_pool

    # Escape LIKE wildcards so a literal '%' or '_' typed by the user matches
    # itself instead of acting as a pattern (backslash is PostgreSQL's
    # default LIKE escape character).
    pattern = (
        q.lower()
        .replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )

    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT id, word, lang, pos, ipa, translations, example
            FROM vocabulary_kaikki
            WHERE lang = $1 AND lower(word) LIKE $2
            ORDER BY length(word), lower(word)
            LIMIT $3 OFFSET $4
            """,
            lang, f"{pattern}%", limit, offset,
        )

    words = []
    for r in rows:
        tr = r["translations"]
        if isinstance(tr, str):
            # asyncpg may return JSONB as text depending on codec setup;
            # use the module-level json import (the local re-import was redundant).
            tr = json.loads(tr)
        is_en = r["lang"] == "en"
        is_de = r["lang"] == "de"
        words.append({
            "id": str(r["id"]),
            "english": r["word"] if is_en else "",
            "german": tr.get("de", {}).get("text", "") if is_en else r["word"] if is_de else "",
            "word": r["word"],
            "lang": r["lang"],
            "ipa_en": r["ipa"] if is_en else "",
            "ipa_de": r["ipa"] if is_de else "",
            "part_of_speech": r["pos"],
            # Kaikki rows carry no syllable/media/difficulty data; emit the
            # same empty fields the manual table would.
            "syllables_en": [],
            "syllables_de": [],
            "example_en": r["example"] if is_en else "",
            "example_de": r["example"] if is_de else "",
            "image_url": "",
            "audio_url_en": "",
            "audio_url_de": "",
            "difficulty": 0,
            "tags": [],
            "translations": tr,
        })

    return {"words": words, "query": q, "total": len(words), "source": "kaikki"}
|
|
|
|
|
|
@router.get("/browse")
async def api_browse_words(
    pos: str = Query("", description="Part of speech filter"),
    difficulty: int = Query(0, ge=0, le=5, description="Difficulty 1-5, 0=all"),
    tag: str = Query("", description="Tag filter"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
):
    """Browse vocabulary words with filters."""
    # Delegate filtering/pagination to the db layer.
    matches = await browse_words(
        pos=pos,
        difficulty=difficulty,
        tag=tag,
        limit=limit,
        offset=offset,
    )
    serialized = [m.to_dict() for m in matches]
    return {
        "words": serialized,
        "filters": {"pos": pos, "difficulty": difficulty, "tag": tag},
        "total": len(serialized),
    }
|
|
|
|
|
|
@router.get("/word/{word_id}")
async def api_get_word(word_id: str):
    """Return a single vocabulary word with all of its details."""
    record = await get_word(word_id)
    if not record:
        # Unknown id -> 404 with a German user-facing message.
        raise HTTPException(status_code=404, detail="Wort nicht gefunden")
    return record.to_dict()
|
|
|
|
|
|
@router.get("/filters")
async def api_get_filters():
    """Get available filter options (tags, parts of speech, word count)."""
    # Kaikki stats (hardcoded to avoid slow COUNT on 6M rows).
    # Dict values are awaited in order: tags, POS list, manual word count.
    return {
        "tags": await get_all_tags(),
        "parts_of_speech": await get_all_pos(),
        "total_words": await count_words(),
        "kaikki_total": 6271749,
        "kaikki_languages": 24,
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Audio TTS for Words
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@router.get("/word/{word_id}/audio/{lang}")
async def api_get_word_audio(word_id: str, lang: str = "en"):
    """Get or generate TTS audio for a vocabulary word.

    Returns MP3 audio. Generated on first request, cached after.
    Uses Piper TTS (MIT license) with Thorsten (DE) and Lessac (EN) voices.
    """
    from fastapi.responses import Response as FastAPIResponse

    record = await get_word(word_id)
    if not record:
        raise HTTPException(status_code=404, detail="Wort nicht gefunden")

    # Choose the text matching the requested language.
    text = record.english if lang == "en" else record.german
    if not text:
        raise HTTPException(status_code=400, detail=f"Kein Text fuer Sprache '{lang}'")

    # Imported lazily, same as the original, so module import stays cheap.
    from services.audio import get_or_generate_audio

    audio_bytes = await get_or_generate_audio(text, language=lang, word_id=word_id)
    if not audio_bytes:
        raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")

    return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
|
|
|
|
|
|
@router.get("/word/{word_id}/audio-syllables/{lang}")
async def api_get_syllable_audio(word_id: str, lang: str = "en"):
    """Get TTS audio with slow syllable pronunciation.

    Generates audio like "ap ... ple" with pauses between syllables.
    """
    from fastapi.responses import Response as FastAPIResponse

    record = await get_word(word_id)
    if not record:
        raise HTTPException(status_code=404, detail="Wort nicht gefunden")

    syllables = record.syllables_en if lang == "en" else record.syllables_de
    if not syllables:
        # No syllable data: fall back to pronouncing the whole word once.
        whole_word = record.english if lang == "en" else record.german
        syllables = [whole_word]

    # Piper treats "..." as a pause, so joining yields slow pronunciation.
    slow_text = " ... ".join(syllables)

    from services.audio import get_or_generate_audio

    # Separate cache key so the syllable audio never collides with the
    # normal-speed audio for the same word.
    cache_key = f"{word_id}_syl_{lang}"
    audio_bytes = await get_or_generate_audio(slow_text, language=lang, word_id=cache_key)
    if not audio_bytes:
        raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")

    return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
|
|
|
|
|
|
@router.get("/tts")
async def api_tts(text: str = Query("", min_length=1), lang: str = Query("de")):
    """Text-to-Speech endpoint. Returns MP3 audio for any text.

    Uses Piper TTS (Thorsten DE / Lessac EN). Cached by text+lang.

    Raises:
        HTTPException: 400 for blank text, 503 if the TTS backend fails.
    """
    from fastapi.responses import Response as FastAPIResponse
    from services.audio import get_or_generate_audio

    # min_length=1 only validates client-supplied values; the "" default is
    # not validated, so blank input must be rejected explicitly before it
    # reaches the TTS backend.
    if not text.strip():
        raise HTTPException(status_code=400, detail="Kein Text angegeben")

    audio_bytes = await get_or_generate_audio(text, language=lang)
    if not audio_bytes:
        raise HTTPException(status_code=503, detail="TTS Service nicht verfuegbar")

    return FastAPIResponse(content=audio_bytes, media_type="audio/mpeg")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Learning Unit Creation from Word Selection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class CreateUnitFromWordsPayload(BaseModel):
    """Request body for POST /vocabulary/units: build a unit from word ids."""

    title: str  # display title of the new learning unit
    word_ids: List[str]  # ids of vocabulary words to include
    grade: Optional[str] = None  # grade label; handlers default this to "5-8"
    language: Optional[str] = "de"  # unit language; handlers default to "de"
|
|
|
|
|
|
def _qa_item_from_word(index: int, w) -> Dict[str, Any]:
    """Build one flashcard/type-trainer QA item dict from a vocabulary word."""
    return {
        "id": f"qa_{index + 1}",
        "question": w.english,
        "answer": w.german,
        "question_type": "knowledge",
        "key_terms": [w.english],
        "difficulty": w.difficulty,
        "source_hint": w.part_of_speech,
        # Fresh Leitner scheduling state: box 0, no review history yet.
        "leitner_box": 0,
        "correct_count": 0,
        "incorrect_count": 0,
        "last_seen": None,
        "next_review": None,
        # Extra fields for enhanced flashcards
        "ipa_en": w.ipa_en,
        "ipa_de": w.ipa_de,
        "syllables_en": w.syllables_en,
        "syllables_de": w.syllables_de,
        "example_en": w.example_en,
        "example_de": w.example_de,
        "image_url": w.image_url,
        "audio_url_en": w.audio_url_en,
        "audio_url_de": w.audio_url_de,
        "part_of_speech": w.part_of_speech,
        "translations": w.translations,
    }


@router.post("/units")
async def api_create_unit_from_words(payload: CreateUnitFromWordsPayload):
    """Create a learning unit from selected vocabulary word IDs.

    Fetches full word details, creates a LearningUnit in the
    learning_units system, and stores the vocabulary data.

    Raises:
        HTTPException: 400 if no word ids were supplied, 404 if none of
            the supplied ids resolve to an existing word.
    """
    if not payload.word_ids:
        raise HTTPException(status_code=400, detail="Keine Woerter ausgewaehlt")

    # Resolve ids to word records; unknown ids are silently skipped.
    words = []
    for wid in payload.word_ids:
        word = await get_word(wid)
        if word:
            words.append(word)

    if not words:
        raise HTTPException(status_code=404, detail="Keine der Woerter gefunden")

    # Create the learning-unit shell in the learning_units system.
    lu = create_learning_unit(LearningUnitCreate(
        title=payload.title,
        topic="Vocabulary",
        grade_level=payload.grade or "5-8",
        language=payload.language or "de",
        status="raw",
    ))

    import os
    analysis_dir = os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten")
    os.makedirs(analysis_dir, exist_ok=True)

    # Save the full vocabulary data as analysis JSON for the generators.
    vocab_data = [w.to_dict() for w in words]
    analysis_path = os.path.join(analysis_dir, f"{lu.id}_vocab.json")
    with open(analysis_path, "w", encoding="utf-8") as f:
        json.dump({"words": vocab_data, "title": payload.title}, f, ensure_ascii=False, indent=2)

    # Also save as QA items for flashcards/type trainer.
    qa_items = [_qa_item_from_word(i, w) for i, w in enumerate(words)]
    qa_path = os.path.join(analysis_dir, f"{lu.id}_qa.json")
    with open(qa_path, "w", encoding="utf-8") as f:
        json.dump({
            "qa_items": qa_items,
            "metadata": {
                "subject": "English Vocabulary",
                "grade_level": payload.grade or "5-8",
                "source_title": payload.title,
                "total_questions": len(qa_items),
            },
        }, f, ensure_ascii=False, indent=2)

    # Auto-enrich words with images (Wikipedia + emoji fallback). This is
    # deliberately best-effort: a failure must not break unit creation.
    try:
        from services.image_service import enrich_words_with_images
        await enrich_words_with_images(payload.word_ids)
    except Exception as e:
        logger.warning(f"Image enrichment failed (non-critical): {e}")

    logger.info(f"Created vocab unit {lu.id} with {len(words)} words")

    return {
        "unit_id": lu.id,
        "title": payload.title,
        "word_count": len(words),
        "status": "created",
    }
|
|
|
|
|
|
@router.get("/units/{unit_id}")
async def api_get_unit_words(unit_id: str):
    """Get all words for a learning unit.

    Reads the <unit_id>_vocab.json file written at unit-creation time.

    Raises:
        HTTPException: 404 if no vocab file exists for this unit id.
    """
    from pathlib import Path

    # NOTE(review): unit_id is taken from the URL as-is. FastAPI path params
    # do not match "/", which limits traversal, but ids are not otherwise
    # validated here — confirm ids are always server-generated.
    vocab_path = (
        Path("~/Arbeitsblaetter/Lerneinheiten").expanduser()
        / f"{unit_id}_vocab.json"
    )
    if not vocab_path.exists():
        raise HTTPException(status_code=404, detail="Unit nicht gefunden")

    data = json.loads(vocab_path.read_text(encoding="utf-8"))

    return {
        "unit_id": unit_id,
        "title": data.get("title", ""),
        "words": data.get("words", []),
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Bulk Import (for seeding the dictionary)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class BulkImportPayload(BaseModel):
    """Request body for POST /vocabulary/import: raw word dicts to seed."""

    # Each dict needs at least "english" and "german"; other keys optional.
    words: List[Dict[str, Any]]
|
|
|
|
|
@router.post("/import")
async def api_bulk_import(payload: BulkImportPayload):
    """Bulk import vocabulary words (for seeding the dictionary).

    Each word dict should have at minimum: english, german.
    Optional: ipa_en, ipa_de, part_of_speech, syllables_en, syllables_de,
    example_en, example_de, difficulty, tags, translations.
    """
    from .db import insert_words_bulk

    # Map each raw dict onto a VocabularyWord, filling sane defaults for
    # anything the caller omitted.
    records = [
        VocabularyWord(
            english=entry.get("english", ""),
            german=entry.get("german", ""),
            ipa_en=entry.get("ipa_en", ""),
            ipa_de=entry.get("ipa_de", ""),
            part_of_speech=entry.get("part_of_speech", ""),
            syllables_en=entry.get("syllables_en", []),
            syllables_de=entry.get("syllables_de", []),
            example_en=entry.get("example_en", ""),
            example_de=entry.get("example_de", ""),
            difficulty=entry.get("difficulty", 1),
            tags=entry.get("tags", []),
            translations=entry.get("translations", {}),
        )
        for entry in payload.words
    ]

    count = await insert_words_bulk(records)
    logger.info(f"Bulk imported {count} vocabulary words")
    return {"imported": count}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Translation Generation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@router.post("/enrich-images")
async def api_enrich_images(word_ids: Optional[List[str]] = None):
    """Fetch and store images for vocabulary words (Wikipedia + emoji fallback).

    Args:
        word_ids: Specific word ids to enrich. If omitted (or empty), every
            word that still lacks an image is enriched.

    Returns:
        Dict with the number of enriched words, plus either the total
        attempted or a nothing-to-do message.
    """
    from services.image_service import enrich_words_with_images
    from vocabulary.db import get_pool

    if not word_ids:
        # No explicit selection: target every word still missing an image.
        pool = await get_pool()
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                "SELECT id FROM vocabulary_words WHERE image_url = '' OR image_url IS NULL"
            )
        word_ids = [str(r["id"]) for r in rows]

    if not word_ids:
        return {"enriched": 0, "message": "All words already have images"}

    count = await enrich_words_with_images(word_ids)
    return {"enriched": count, "total": len(word_ids)}
|
|
|
|
|
|
@router.get("/topics")
async def api_get_topics(q: str = Query("", description="Search topic or word")):
    """Find topics matching a search word. Returns related word lists.

    If q matches a topic name → returns that topic.
    If q matches a word in any topic → returns all topics containing that word.
    """
    from vocabulary.db import get_pool

    pool = await get_pool()
    async with pool.acquire() as conn:
        if not q.strip():
            # No query: first 50 topics alphabetically.
            rows = await conn.fetch(
                "SELECT topic, words, word_count FROM vocabulary_topics ORDER BY topic LIMIT 50"
            )
        else:
            q_lower = q.strip().lower()
            # Escape LIKE wildcards so a literal '%' or '_' in the query
            # matches itself (backslash is PostgreSQL's default escape).
            like_term = (
                q_lower
                .replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )
            rows = await conn.fetch(
                """
                SELECT topic, words, word_count FROM vocabulary_topics
                WHERE lower(topic) LIKE $1 OR $2 = ANY(words)
                ORDER BY word_count DESC
                """,
                f"%{like_term}%", q_lower,
            )

    return {
        "topics": [
            {"topic": r["topic"], "words": list(r["words"]), "word_count": r["word_count"]}
            for r in rows
        ],
        "query": q,
    }
|
|
|
|
|
|
class TranslateRequest(BaseModel):
    """Request body for POST /vocabulary/translate."""

    word_ids: List[str]  # vocabulary word ids to translate
    target_language: str  # language code; must be in the handler's supported set
|
|
|
|
|
|
@router.post("/translate")
async def api_translate_words(payload: TranslateRequest):
    """Generate translations for vocabulary words into a target language.

    Uses local LLM (Ollama) for translation. Results are cached in the
    vocabulary_words.translations JSONB field.
    """
    from services.translation import translate_and_store

    # Languages the translation backend is known to handle.
    supported = {"tr", "ar", "uk", "ru", "pl", "fr", "es"}
    target = payload.target_language
    if target not in supported:
        raise HTTPException(status_code=400, detail=f"Sprache '{target}' nicht unterstuetzt")

    count = await translate_and_store(payload.word_ids, target)
    return {"translated": count, "target_language": target}
|