Add Vocabulary Learning Platform (Phase 1: DB + API + Editor)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 59s
CI / test-go-edu-search (push) Successful in 45s
CI / test-python-klausur (push) Failing after 3m7s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 31s

Strategic pivot: Studio-v2 becomes a language learning platform.
Compliance guardrail added to CLAUDE.md — no scan/OCR of third-party
content in customer frontend. Upload of OWN materials remains allowed.

Phase 1.1 — vocabulary_db.py: PostgreSQL model for 160k+ words
with english, german, IPA, syllables, examples, images, audio,
difficulty, tags, translations (multilingual). Trigram search index.

Phase 1.2 — vocabulary_api.py: Search, browse, filters, bulk import,
learning unit creation from word selection. Creates QA items with
enhanced fields (IPA, syllables, image, audio) for flashcards.

Phase 1.3 — /vocabulary page: Search bar with POS/difficulty filters,
word cards with audio buttons, unit builder sidebar. Teacher selects
words → creates learning unit → redirects to flashcards.

Sidebar: Added "Woerterbuch" (/vocabulary) and "Lernmodule" (/learn).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 13:36:28 +02:00
parent 7fc5464df7
commit 7ff9860c69
6 changed files with 900 additions and 1 deletion

View File

@@ -48,6 +48,12 @@ ALERTS_AGENT_ENABLED = os.getenv("ALERTS_AGENT_ENABLED", "false").lower() == "tr
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: startup work before `yield`, shutdown after."""
    logger.info("Backend-Lehrer starting up (DB search_path=lehrer,core,public)")
    # Initialize vocabulary tables.
    # Best-effort: the rest of the backend must still come up even if the
    # vocabulary schema cannot be created (e.g. DB not reachable yet).
    try:
        from vocabulary_db import init_vocabulary_tables
        await init_vocabulary_tables()
    except Exception as e:
        logger.warning(f"Vocabulary tables init failed (non-critical): {e}")
    yield
    logger.info("Backend-Lehrer shutting down")
@@ -109,6 +115,10 @@ app.include_router(learning_units_router, prefix="/api")
# Router registration: each router is mounted under /api unless it already
# carries its own prefix.
from progress_api import router as progress_router
app.include_router(progress_router, prefix="/api")
# --- 4c. Vocabulary Catalog ---
from vocabulary_api import router as vocabulary_router
app.include_router(vocabulary_router, prefix="/api")
from unit_api import router as unit_router
app.include_router(unit_router)  # Already has /api/units prefix

View File

@@ -0,0 +1,264 @@
"""
Vocabulary API — Search, browse, and build learning units from the word catalog.
Endpoints for teachers to find words and create learning units,
and for students to access word details with audio/images/syllables.
"""
import logging
import json
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from vocabulary_db import (
search_words,
get_word,
browse_words,
insert_word,
count_words,
get_all_tags,
get_all_pos,
VocabularyWord,
)
from learning_units import (
LearningUnitCreate,
create_learning_unit,
get_learning_unit,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/vocabulary", tags=["vocabulary"])
# ---------------------------------------------------------------------------
# Search & Browse
# ---------------------------------------------------------------------------
@router.get("/search")
async def api_search_words(
    q: str = Query("", description="Search query"),
    lang: str = Query("en", pattern="^(en|de)$"),
    limit: int = Query(20, ge=1, le=100),
    offset: int = Query(0, ge=0),
):
    """Full-text search for vocabulary words.

    Blank/whitespace-only queries short-circuit to an empty result set
    without touching the database.
    """
    term = q.strip()
    if not term:
        return {"words": [], "query": q, "total": 0}
    hits = await search_words(term, lang=lang, limit=limit, offset=offset)
    # NOTE(review): "total" is the size of this page (capped by `limit`),
    # not the overall match count — confirm the frontend expects that.
    return {
        "words": [hit.to_dict() for hit in hits],
        "query": q,
        "total": len(hits),
    }
@router.get("/browse")
async def api_browse_words(
    pos: str = Query("", description="Part of speech filter"),
    difficulty: int = Query(0, ge=0, le=5, description="Difficulty 1-5, 0=all"),
    tag: str = Query("", description="Tag filter"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
):
    """Browse vocabulary words, optionally filtered by POS/difficulty/tag."""
    matches = await browse_words(
        pos=pos,
        difficulty=difficulty,
        tag=tag,
        limit=limit,
        offset=offset,
    )
    # NOTE(review): "total" reflects this page only, not the full filtered count.
    return {
        "words": [match.to_dict() for match in matches],
        "filters": {"pos": pos, "difficulty": difficulty, "tag": tag},
        "total": len(matches),
    }
@router.get("/word/{word_id}")
async def api_get_word(word_id: str):
    """Return full details for one vocabulary word, or 404 if unknown."""
    found = await get_word(word_id)
    if found is None:
        raise HTTPException(status_code=404, detail="Wort nicht gefunden")
    return found.to_dict()
@router.get("/filters")
async def api_get_filters():
    """Return the available filter options: tags, parts of speech, word count."""
    # Sequential awaits, same order as before: tags, POS list, total count.
    return {
        "tags": await get_all_tags(),
        "parts_of_speech": await get_all_pos(),
        "total_words": await count_words(),
    }
# ---------------------------------------------------------------------------
# Learning Unit Creation from Word Selection
# ---------------------------------------------------------------------------
class CreateUnitFromWordsPayload(BaseModel):
    # Request body for POST /vocabulary/units.
    title: str                      # Display title for the new learning unit
    word_ids: List[str]             # UUID strings of the selected words
    grade: Optional[str] = None     # Grade label; defaults to "5-8" downstream
    language: Optional[str] = "de"  # Unit language code
@router.post("/units")
async def api_create_unit_from_words(payload: CreateUnitFromWordsPayload):
    """Create a learning unit from selected vocabulary word IDs.

    Fetches full word details, creates a LearningUnit in the
    learning_units system, and stores the vocabulary data.

    Raises:
        HTTPException 400: no word IDs were supplied.
        HTTPException 404: none of the supplied IDs resolved to a word.
    """
    if not payload.word_ids:
        raise HTTPException(status_code=400, detail="Keine Woerter ausgewaehlt")
    # Fetch all selected words — one query per ID; unknown IDs are silently
    # skipped so a partially stale selection still produces a unit.
    words = []
    for wid in payload.word_ids:
        word = await get_word(wid)
        if word:
            words.append(word)
    if not words:
        raise HTTPException(status_code=404, detail="Keine der Woerter gefunden")
    # Create learning unit (synchronous call into the learning_units system).
    lu = create_learning_unit(LearningUnitCreate(
        title=payload.title,
        topic="Vocabulary",
        grade_level=payload.grade or "5-8",
        language=payload.language or "de",
        status="raw",
    ))
    # Save vocabulary data as analysis JSON for generators.
    # NOTE(review): blocking file I/O in an async handler — acceptable for
    # small JSON payloads, but worth offloading under load.
    import os
    analysis_dir = os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten")
    os.makedirs(analysis_dir, exist_ok=True)
    vocab_data = [w.to_dict() for w in words]
    analysis_path = os.path.join(analysis_dir, f"{lu.id}_vocab.json")
    with open(analysis_path, "w", encoding="utf-8") as f:
        json.dump({"words": vocab_data, "title": payload.title}, f, ensure_ascii=False, indent=2)
    # Also save as QA items for flashcards/type trainer.
    qa_items = []
    for i, w in enumerate(words):
        qa_items.append({
            "id": f"qa_{i+1}",
            "question": w.english,
            "answer": w.german,
            "question_type": "knowledge",
            "key_terms": [w.english],
            "difficulty": w.difficulty,
            "source_hint": w.part_of_speech,
            # Spaced-repetition bookkeeping starts zeroed/unset.
            "leitner_box": 0,
            "correct_count": 0,
            "incorrect_count": 0,
            "last_seen": None,
            "next_review": None,
            # Extra fields for enhanced flashcards
            "ipa_en": w.ipa_en,
            "ipa_de": w.ipa_de,
            "syllables_en": w.syllables_en,
            "syllables_de": w.syllables_de,
            "example_en": w.example_en,
            "example_de": w.example_de,
            "image_url": w.image_url,
            "audio_url_en": w.audio_url_en,
            "audio_url_de": w.audio_url_de,
            "part_of_speech": w.part_of_speech,
            "translations": w.translations,
        })
    qa_path = os.path.join(analysis_dir, f"{lu.id}_qa.json")
    with open(qa_path, "w", encoding="utf-8") as f:
        json.dump({
            "qa_items": qa_items,
            "metadata": {
                "subject": "English Vocabulary",
                "grade_level": payload.grade or "5-8",
                "source_title": payload.title,
                "total_questions": len(qa_items),
            },
        }, f, ensure_ascii=False, indent=2)
    logger.info(f"Created vocab unit {lu.id} with {len(words)} words")
    return {
        "unit_id": lu.id,
        "title": payload.title,
        "word_count": len(words),
        "status": "created",
    }
@router.get("/units/{unit_id}")
async def api_get_unit_words(unit_id: str):
    """Get all words for a learning unit.

    Reads the `<unit_id>_vocab.json` file written by
    `api_create_unit_from_words`.

    Raises:
        HTTPException 404: unit_id is malformed or no vocab file exists.
    """
    import os
    import re
    # Security fix: unit_id comes straight from the URL and was used to
    # build a filesystem path unvalidated, allowing traversal such as
    # unit_id="../../tmp/evil". Restrict it to a safe character set.
    if not re.fullmatch(r"[A-Za-z0-9_-]+", unit_id):
        raise HTTPException(status_code=404, detail="Unit nicht gefunden")
    vocab_path = os.path.join(
        os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten"),
        f"{unit_id}_vocab.json",
    )
    if not os.path.exists(vocab_path):
        raise HTTPException(status_code=404, detail="Unit nicht gefunden")
    with open(vocab_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return {
        "unit_id": unit_id,
        "title": data.get("title", ""),
        "words": data.get("words", []),
    }
# ---------------------------------------------------------------------------
# Bulk Import (for seeding the dictionary)
# ---------------------------------------------------------------------------
class BulkImportPayload(BaseModel):
    # Request body for POST /vocabulary/import: a list of raw word dicts,
    # each needing at least "english" and "german" keys.
    words: List[Dict[str, Any]]
@router.post("/import")
async def api_bulk_import(payload: BulkImportPayload):
    """Bulk import vocabulary words (for seeding the dictionary).

    Each word dict should have at minimum: english, german.
    Optional: ipa_en, ipa_de, part_of_speech, syllables_en, syllables_de,
    example_en, example_de, image_url, audio_url_en, audio_url_de,
    difficulty, tags, translations.

    Returns:
        {"imported": <number of rows submitted>}
    """
    from vocabulary_db import insert_words_bulk
    words = []
    for w in payload.words:
        words.append(VocabularyWord(
            english=w.get("english", ""),
            german=w.get("german", ""),
            ipa_en=w.get("ipa_en", ""),
            ipa_de=w.get("ipa_de", ""),
            part_of_speech=w.get("part_of_speech", ""),
            syllables_en=w.get("syllables_en", []),
            syllables_de=w.get("syllables_de", []),
            example_en=w.get("example_en", ""),
            example_de=w.get("example_de", ""),
            # Fix: media fields were previously dropped during bulk import
            # even though the DB model supports them.
            image_url=w.get("image_url", ""),
            audio_url_en=w.get("audio_url_en", ""),
            audio_url_de=w.get("audio_url_de", ""),
            difficulty=w.get("difficulty", 1),
            tags=w.get("tags", []),
            translations=w.get("translations", {}),
        ))
    count = await insert_words_bulk(words)
    logger.info(f"Bulk imported {count} vocabulary words")
    return {"imported": count}

View File

@@ -0,0 +1,274 @@
"""
Vocabulary Database — PostgreSQL storage for the vocabulary word catalog.
Stores 160k+ words with translations, IPA, syllables, examples, images, audio.
Uses asyncpg for async PostgreSQL access (same pattern as game/database.py).
Schema: lehrer.vocabulary_words (search_path set in main.py)
"""
import logging
import os
import uuid
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
DATABASE_URL = os.getenv(
"DATABASE_URL",
"postgresql://breakpilot:breakpilot@postgres:5432/breakpilot",
)
_pool = None
async def get_pool():
    """Return the shared asyncpg connection pool, creating it lazily."""
    global _pool
    if _pool is not None:
        return _pool
    # Imported here so the module loads even where asyncpg is absent.
    import asyncpg
    _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
    return _pool
async def init_vocabulary_tables():
    """Create the vocabulary_words table and its indexes if they don't exist.

    Fix: the pg_trgm extension is now created *before* the trigram index.
    Previously the `gin_trgm_ops` index DDL ran in the same implicit
    transaction before `CREATE EXTENSION pg_trgm`, so on a fresh database
    the index statement failed and rolled back the table creation too.
    The trigram index is also issued separately so a missing extension
    degrades fuzzy search instead of breaking initialization.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Enable trigram extension for fuzzy search first (may require
        # elevated DB privileges; tolerated if unavailable).
        try:
            await conn.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;")
        except Exception:
            logger.info("pg_trgm extension already exists or cannot be created")
        await conn.execute("""
            CREATE TABLE IF NOT EXISTS vocabulary_words (
                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                english TEXT NOT NULL,
                german TEXT NOT NULL DEFAULT '',
                ipa_en TEXT NOT NULL DEFAULT '',
                ipa_de TEXT NOT NULL DEFAULT '',
                part_of_speech TEXT NOT NULL DEFAULT '',
                syllables_en TEXT[] NOT NULL DEFAULT '{}',
                syllables_de TEXT[] NOT NULL DEFAULT '{}',
                example_en TEXT NOT NULL DEFAULT '',
                example_de TEXT NOT NULL DEFAULT '',
                image_url TEXT NOT NULL DEFAULT '',
                audio_url_en TEXT NOT NULL DEFAULT '',
                audio_url_de TEXT NOT NULL DEFAULT '',
                difficulty INT NOT NULL DEFAULT 1,
                tags TEXT[] NOT NULL DEFAULT '{}',
                translations JSONB NOT NULL DEFAULT '{}',
                created_at TIMESTAMPTZ NOT NULL DEFAULT now()
            );
            CREATE INDEX IF NOT EXISTS idx_vocab_english
                ON vocabulary_words (lower(english));
            CREATE INDEX IF NOT EXISTS idx_vocab_german
                ON vocabulary_words (lower(german));
            CREATE INDEX IF NOT EXISTS idx_vocab_pos
                ON vocabulary_words (part_of_speech);
            CREATE INDEX IF NOT EXISTS idx_vocab_difficulty
                ON vocabulary_words (difficulty);
            CREATE INDEX IF NOT EXISTS idx_vocab_tags
                ON vocabulary_words USING GIN (tags);
        """)
        # Trigram index separately: if pg_trgm could not be created above,
        # this fails without affecting the table/index creation.
        try:
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_vocab_english_trgm
                    ON vocabulary_words USING GIN (english gin_trgm_ops);
            """)
        except Exception as e:
            logger.warning(f"Trigram index not created (pg_trgm missing?): {e}")
        logger.info("vocabulary_words table initialized")
@dataclass
class VocabularyWord:
    """A single vocabulary word with all metadata.

    Defaults mirror the table's NOT NULL DEFAULT columns: strings default
    to '' and collections to empty containers.
    """
    id: str = ""                  # UUID string; empty means "not yet persisted"
    english: str = ""
    german: str = ""
    ipa_en: str = ""              # IPA pronunciation (English)
    ipa_de: str = ""              # IPA pronunciation (German)
    part_of_speech: str = ""
    syllables_en: List[str] = field(default_factory=list)
    syllables_de: List[str] = field(default_factory=list)
    example_en: str = ""          # Example sentence (English)
    example_de: str = ""          # Example sentence (German)
    image_url: str = ""
    audio_url_en: str = ""
    audio_url_de: str = ""
    difficulty: int = 1           # 1-5 scale used by the API filters
    tags: List[str] = field(default_factory=list)
    translations: Dict[str, str] = field(default_factory=dict)  # lang code -> text
    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict (asdict deep-copies nested containers)."""
        return asdict(self)
def _row_to_word(row) -> VocabularyWord:
    """Convert an asyncpg Record (or any mapping) into a VocabularyWord."""
    import json
    # JSONB may come back as text depending on codec configuration.
    raw = row["translations"]
    translations = json.loads(raw) if isinstance(raw, str) else raw
    return VocabularyWord(
        id=str(row["id"]),
        english=row["english"],
        german=row["german"],
        ipa_en=row["ipa_en"],
        ipa_de=row["ipa_de"],
        part_of_speech=row["part_of_speech"],
        syllables_en=list(row["syllables_en"] or []),
        syllables_de=list(row["syllables_de"] or []),
        example_en=row["example_en"],
        example_de=row["example_de"],
        image_url=row["image_url"],
        audio_url_en=row["audio_url_en"],
        audio_url_de=row["audio_url_de"],
        difficulty=row["difficulty"],
        tags=list(row["tags"] or []),
        translations=translations or {},
    )
async def search_words(
    query: str, lang: str = "en", limit: int = 20, offset: int = 0,
) -> "List[VocabularyWord]":
    """Search words by substring match and trigram similarity.

    Args:
        query: User-supplied search term.
        lang: "en" searches the english column, anything else german.
        limit/offset: result paging.

    Returns:
        Matching words ordered by trigram similarity (best first).
    """
    pool = await get_pool()
    # `col` is chosen from a fixed whitelist, so interpolating it into the
    # SQL below cannot be injected through `lang`.
    col = "english" if lang == "en" else "german"
    # Fix: escape LIKE metacharacters so a query like "100%" or "a_b"
    # matches literally instead of acting as a wildcard pattern.
    # (Backslash is PostgreSQL's default LIKE escape character.)
    escaped = (
        query.lower()
        .replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            f"""
            SELECT * FROM vocabulary_words
            WHERE lower({col}) LIKE $1 OR {col} % $2
            ORDER BY similarity({col}, $2) DESC, lower({col})
            LIMIT $3 OFFSET $4
            """,
            f"%{escaped}%", query, limit, offset,
        )
    return [_row_to_word(r) for r in rows]
async def get_word(word_id: str) -> "Optional[VocabularyWord]":
    """Fetch a single word by its UUID string.

    Returns:
        The word, or None when no row matches *or* when `word_id` is not
        a valid UUID. (Fix: previously a malformed ID raised ValueError,
        which surfaced as HTTP 500 in the API instead of the intended 404.)
    """
    try:
        uid = uuid.UUID(word_id)
    except (ValueError, TypeError):
        return None
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM vocabulary_words WHERE id = $1", uid,
        )
    return _row_to_word(row) if row else None
async def browse_words(
    pos: str = "", difficulty: int = 0, tag: str = "",
    limit: int = 50, offset: int = 0,
) -> "List[VocabularyWord]":
    """Browse words with optional POS, difficulty, and tag filters."""
    pool = await get_pool()
    # Build the WHERE clause dynamically. Only numbered placeholders are
    # interpolated into the SQL text; every value goes in as a bind param.
    clauses: List[str] = []
    values: List[Any] = []
    if pos:
        values.append(pos)
        clauses.append(f"part_of_speech = ${len(values)}")
    if difficulty > 0:
        values.append(difficulty)
        clauses.append(f"difficulty = ${len(values)}")
    if tag:
        values.append(tag)
        clauses.append(f"${len(values)} = ANY(tags)")
    where = "WHERE " + " AND ".join(clauses) if clauses else ""
    limit_idx = len(values) + 1
    offset_idx = len(values) + 2
    values.extend([limit, offset])
    sql = (
        f"SELECT * FROM vocabulary_words {where} "
        f"ORDER BY english LIMIT ${limit_idx} OFFSET ${offset_idx}"
    )
    async with pool.acquire() as conn:
        rows = await conn.fetch(sql, *values)
    return [_row_to_word(r) for r in rows]
async def insert_word(word: VocabularyWord) -> str:
    """Insert one word row and return its ID.

    Rows whose id already exists are left untouched (ON CONFLICT DO NOTHING).
    """
    import json
    pool = await get_pool()
    word_id = word.id or str(uuid.uuid4())
    values = (
        uuid.UUID(word_id), word.english, word.german,
        word.ipa_en, word.ipa_de, word.part_of_speech,
        word.syllables_en, word.syllables_de,
        word.example_en, word.example_de,
        word.image_url, word.audio_url_en, word.audio_url_de,
        word.difficulty, word.tags, json.dumps(word.translations),
    )
    async with pool.acquire() as conn:
        await conn.execute(
            """
            INSERT INTO vocabulary_words
            (id, english, german, ipa_en, ipa_de, part_of_speech,
             syllables_en, syllables_de, example_en, example_de,
             image_url, audio_url_en, audio_url_de, difficulty, tags, translations)
            VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)
            ON CONFLICT (id) DO NOTHING
            """,
            *values,
        )
    return word_id
async def insert_words_bulk(words: "List[VocabularyWord]") -> int:
    """Bulk insert words via executemany.

    Returns:
        The number of rows *submitted*. Rows whose id already exists are
        skipped by ON CONFLICT DO NOTHING but are still counted.
    """
    import json
    pool = await get_pool()
    rows = [
        (
            uuid.UUID(w.id or str(uuid.uuid4())), w.english, w.german,
            w.ipa_en, w.ipa_de, w.part_of_speech,
            w.syllables_en, w.syllables_de,
            w.example_en, w.example_de,
            w.image_url, w.audio_url_en, w.audio_url_de,
            w.difficulty, w.tags, json.dumps(w.translations),
        )
        for w in words
    ]
    async with pool.acquire() as conn:
        await conn.executemany(
            """
            INSERT INTO vocabulary_words
            (id, english, german, ipa_en, ipa_de, part_of_speech,
             syllables_en, syllables_de, example_en, example_de,
             image_url, audio_url_en, audio_url_de, difficulty, tags, translations)
            VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)
            ON CONFLICT (id) DO NOTHING
            """,
            rows,
        )
    return len(rows)
async def count_words() -> int:
    """Return the total number of rows in vocabulary_words."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        total = await conn.fetchval("SELECT COUNT(*) FROM vocabulary_words")
    return total
async def get_all_tags() -> "List[str]":
    """Return every distinct tag used by any word, sorted alphabetically."""
    pool = await get_pool()
    sql = "SELECT DISTINCT unnest(tags) AS tag FROM vocabulary_words ORDER BY tag"
    async with pool.acquire() as conn:
        records = await conn.fetch(sql)
    return [record["tag"] for record in records]
async def get_all_pos() -> "List[str]":
    """Return every distinct non-empty part of speech, sorted alphabetically."""
    pool = await get_pool()
    sql = (
        "SELECT DISTINCT part_of_speech FROM vocabulary_words "
        "WHERE part_of_speech != '' ORDER BY part_of_speech"
    )
    async with pool.acquire() as conn:
        records = await conn.fetch(sql)
    return [record["part_of_speech"] for record in records]