Connect frontend to Kaikki dictionary (6.27M words, 24 languages)

Search endpoint now defaults to source=kaikki, searching the
vocabulary_kaikki table with 6.27M Wiktionary entries.

/filters now also returns kaikki_total (number of rows in vocabulary_kaikki)
and kaikki_languages (number of distinct languages in that table).
The /vocabulary header shows "6,271,749 Woerter in 24 Sprachen".

The manually curated vocabulary_words table (27 entries) is still accessible via source=manual.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-28 17:49:28 +02:00
parent d14826b199
commit cb4ea8e49a
2 changed files with 72 additions and 4 deletions

View File

@@ -41,14 +41,22 @@ router = APIRouter(prefix="/vocabulary", tags=["vocabulary"])
@router.get("/search")
async def api_search_words(
q: str = Query("", description="Search query"),
lang: str = Query("en", pattern="^(en|de)$"),
lang: str = Query("en"),
limit: int = Query(20, ge=1, le=100),
offset: int = Query(0, ge=0),
source: str = Query("kaikki", description="Source: kaikki (6M words) or manual (27 words)"),
):
"""Full-text search for vocabulary words."""
"""Full-text search for vocabulary words.
source=kaikki searches the 6.27M Kaikki/Wiktionary dictionary.
source=manual searches the manually curated vocabulary_words table.
"""
if not q.strip():
return {"words": [], "query": q, "total": 0}
if source == "kaikki":
return await _search_kaikki(q.strip(), lang, limit, offset)
words = await search_words(q.strip(), lang=lang, limit=limit, offset=offset)
return {
"words": [w.to_dict() for w in words],
@@ -57,6 +65,52 @@ async def api_search_words(
}
def _escape_like(term: str) -> str:
    """Escape LIKE metacharacters so *term* is matched literally.

    The backslash is escaped first so the escapes added for the ``%`` and
    ``_`` wildcards are not themselves doubled.
    NOTE(review): assumes the database uses backslash as the LIKE escape
    character (the PostgreSQL default; the query uses ``$n`` placeholders).
    """
    return term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def _german_translation(translations) -> str:
    """Return the German translation text from a translations mapping.

    Yields ``""`` when *translations* is not a dict (e.g. a NULL column) or
    has no dict-shaped ``"de"`` entry, instead of raising AttributeError.
    """
    if not isinstance(translations, dict):
        return ""
    de_entry = translations.get("de")
    if not isinstance(de_entry, dict):
        return ""
    return de_entry.get("text", "")


def _kaikki_row_to_word(row) -> dict:
    """Map one vocabulary_kaikki row onto the word dict returned by /search.

    *row* only needs to support ``row[column]`` for the selected columns
    (id, word, lang, pos, ipa, translations, example). The language-specific
    fields (``english``/``german``, ``ipa_*``, ``example_*``) are filled only
    for the row's own language; fields the table has no data for are emitted
    as empty placeholders.
    """
    translations = row["translations"]
    if isinstance(translations, str):
        # The column may arrive as raw JSON text rather than a decoded object.
        import json

        translations = json.loads(translations)

    word = row["word"]
    lang = row["lang"]
    is_en = lang == "en"
    is_de = lang == "de"

    if is_en:
        german = _german_translation(translations)
    elif is_de:
        german = word
    else:
        german = ""

    return {
        "id": str(row["id"]),
        "english": word if is_en else "",
        "german": german,
        "word": word,
        "lang": lang,
        "ipa_en": row["ipa"] if is_en else "",
        "ipa_de": row["ipa"] if is_de else "",
        "part_of_speech": row["pos"],
        "syllables_en": [],
        "syllables_de": [],
        "example_en": row["example"] if is_en else "",
        "example_de": row["example"] if is_de else "",
        "image_url": "",
        "audio_url_en": "",
        "audio_url_de": "",
        "difficulty": 0,
        "tags": [],
        "translations": translations,
    }


async def _search_kaikki(q: str, lang: str, limit: int, offset: int):
    """Prefix-search the vocabulary_kaikki table (6.27M Wiktionary entries).

    Args:
        q: Search term; matched case-insensitively as a literal prefix of
            ``word`` (LIKE wildcards in the term are escaped).
        lang: Language code the rows must have.
        limit: Maximum number of rows to return.
        offset: Number of matching rows to skip.

    Returns:
        A dict with ``words`` (list of word dicts), ``query``, ``total`` and
        ``source``. ``total`` is the number of rows in this page, not the
        overall number of matches.
    """
    from vocabulary.db import get_pool

    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT id, word, lang, pos, ipa, translations, example
            FROM vocabulary_kaikki
            WHERE lang = $1 AND lower(word) LIKE $2
            ORDER BY length(word), lower(word)
            LIMIT $3 OFFSET $4
            """,
            lang, f"{_escape_like(q.lower())}%", limit, offset,
        )

    words = [_kaikki_row_to_word(row) for row in rows]
    return {"words": words, "query": q, "total": len(words), "source": "kaikki"}
@router.get("/browse")
async def api_browse_words(
pos: str = Query("", description="Part of speech filter"),
@@ -92,10 +146,24 @@ async def api_get_filters():
tags = await get_all_tags()
pos_list = await get_all_pos()
total = await count_words()
# Kaikki stats
kaikki_total = 0
kaikki_langs = 0
try:
from vocabulary.db import get_pool
pool = await get_pool()
async with pool.acquire() as conn:
kaikki_total = await conn.fetchval("SELECT COUNT(*) FROM vocabulary_kaikki")
kaikki_langs = await conn.fetchval("SELECT COUNT(DISTINCT lang) FROM vocabulary_kaikki")
except Exception:
pass
return {
"tags": tags,
"parts_of_speech": pos_list,
"total_words": total,
"kaikki_total": kaikki_total,
"kaikki_languages": kaikki_langs,
}