Add Vocabulary Learning Platform (Phase 1: DB + API + Editor)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 59s
CI / test-go-edu-search (push) Successful in 45s
CI / test-python-klausur (push) Failing after 3m7s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 31s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 59s
CI / test-go-edu-search (push) Successful in 45s
CI / test-python-klausur (push) Failing after 3m7s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 31s
Strategic pivot: Studio-v2 becomes a language learning platform. Compliance guardrail added to CLAUDE.md — no scan/OCR of third-party content in customer frontend. Upload of OWN materials remains allowed. Phase 1.1 — vocabulary_db.py: PostgreSQL model for 160k+ words with english, german, IPA, syllables, examples, images, audio, difficulty, tags, translations (multilingual). Trigram search index. Phase 1.2 — vocabulary_api.py: Search, browse, filters, bulk import, learning unit creation from word selection. Creates QA items with enhanced fields (IPA, syllables, image, audio) for flashcards. Phase 1.3 — /vocabulary page: Search bar with POS/difficulty filters, word cards with audio buttons, unit builder sidebar. Teacher selects words → creates learning unit → redirects to flashcards. Sidebar: Added "Woerterbuch" (/vocabulary) and "Lernmodule" (/learn). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
274
backend-lehrer/vocabulary_db.py
Normal file
274
backend-lehrer/vocabulary_db.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""
|
||||
Vocabulary Database — PostgreSQL storage for the vocabulary word catalog.
|
||||
|
||||
Stores 160k+ words with translations, IPA, syllables, examples, images, audio.
|
||||
Uses asyncpg for async PostgreSQL access (same pattern as game/database.py).
|
||||
|
||||
Schema: lehrer.vocabulary_words (search_path set in main.py)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
# Module-level logger; handlers/level are configured by the application.
logger = logging.getLogger(__name__)

# Connection string for asyncpg; overridable via the DATABASE_URL env var.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot",
)

# Lazily-created asyncpg pool shared by every function in this module.
_pool = None
|
||||
|
||||
|
||||
async def get_pool():
    """Return the shared asyncpg connection pool, creating it on first use.

    asyncpg is imported lazily so this module can be imported in
    environments where the driver is not installed.
    """
    global _pool
    if _pool is not None:
        return _pool
    import asyncpg

    _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
    return _pool
|
||||
|
||||
|
||||
async def init_vocabulary_tables():
    """Create the vocabulary_words table, its indexes, and the pg_trgm extension.

    Idempotent: every statement uses IF NOT EXISTS, so this is safe to run
    on every application start-up.

    Bug fix: the trigram index uses ``gin_trgm_ops``, which only exists once
    the ``pg_trgm`` extension is installed. The original code created the
    index *before* enabling the extension, so the whole DDL batch failed on
    a fresh database. The extension is now created first (best-effort — it
    may require superuser rights), and the trigram index is created in a
    separate best-effort statement so the core table/indexes always succeed.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Must run BEFORE the trigram index below; may fail without
        # sufficient privileges, in which case we log and continue.
        try:
            await conn.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;")
        except Exception:
            logger.info("pg_trgm extension already exists or cannot be created")

        await conn.execute("""
            CREATE TABLE IF NOT EXISTS vocabulary_words (
                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                english TEXT NOT NULL,
                german TEXT NOT NULL DEFAULT '',
                ipa_en TEXT NOT NULL DEFAULT '',
                ipa_de TEXT NOT NULL DEFAULT '',
                part_of_speech TEXT NOT NULL DEFAULT '',
                syllables_en TEXT[] NOT NULL DEFAULT '{}',
                syllables_de TEXT[] NOT NULL DEFAULT '{}',
                example_en TEXT NOT NULL DEFAULT '',
                example_de TEXT NOT NULL DEFAULT '',
                image_url TEXT NOT NULL DEFAULT '',
                audio_url_en TEXT NOT NULL DEFAULT '',
                audio_url_de TEXT NOT NULL DEFAULT '',
                difficulty INT NOT NULL DEFAULT 1,
                tags TEXT[] NOT NULL DEFAULT '{}',
                translations JSONB NOT NULL DEFAULT '{}',
                created_at TIMESTAMPTZ NOT NULL DEFAULT now()
            );

            CREATE INDEX IF NOT EXISTS idx_vocab_english
                ON vocabulary_words (lower(english));
            CREATE INDEX IF NOT EXISTS idx_vocab_german
                ON vocabulary_words (lower(german));
            CREATE INDEX IF NOT EXISTS idx_vocab_pos
                ON vocabulary_words (part_of_speech);
            CREATE INDEX IF NOT EXISTS idx_vocab_difficulty
                ON vocabulary_words (difficulty);
            CREATE INDEX IF NOT EXISTS idx_vocab_tags
                ON vocabulary_words USING GIN (tags);
        """)

        # Separate statement: depends on pg_trgm; if the extension could
        # not be installed, skip the index instead of failing start-up.
        try:
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_vocab_english_trgm
                    ON vocabulary_words USING GIN (english gin_trgm_ops);
            """)
        except Exception:
            logger.warning(
                "Could not create trigram index idx_vocab_english_trgm; "
                "fuzzy search will fall back to sequential scans"
            )

    logger.info("vocabulary_words table initialized")
|
||||
|
||||
|
||||
@dataclass
class VocabularyWord:
    """A single vocabulary word with all metadata.

    Mirrors one row of the ``vocabulary_words`` table: ``_row_to_word``
    builds instances from asyncpg records, and ``insert_word`` /
    ``insert_words_bulk`` write instances back.
    """
    # UUID as a string; empty means "not yet persisted" (insert generates one).
    id: str = ""
    english: str = ""
    german: str = ""
    # IPA pronunciation transcription per language.
    ipa_en: str = ""
    ipa_de: str = ""
    part_of_speech: str = ""
    # Syllable split of the word per language (TEXT[] in the DB).
    syllables_en: List[str] = field(default_factory=list)
    syllables_de: List[str] = field(default_factory=list)
    example_en: str = ""
    example_de: str = ""
    image_url: str = ""
    audio_url_en: str = ""
    audio_url_de: str = ""
    # Difficulty level; DB default is 1 (the scale is not defined in this module).
    difficulty: int = 1
    tags: List[str] = field(default_factory=list)
    # Extra translations (JSONB in the DB); presumably keyed by language
    # code — TODO confirm against the import pipeline.
    translations: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (JSON-serializable)."""
        return asdict(self)
|
||||
|
||||
|
||||
def _row_to_word(row) -> VocabularyWord:
    """Build a VocabularyWord from an asyncpg Record (any mapping works).

    ``translations`` may arrive as a JSON string depending on codec
    configuration, so it is decoded defensively; array columns may be
    None and are normalized to lists.
    """
    import json

    translations = row["translations"]
    if isinstance(translations, str):
        translations = json.loads(translations)

    # Plain text columns map 1:1 onto dataclass fields of the same name.
    text_columns = (
        "english", "german", "ipa_en", "ipa_de", "part_of_speech",
        "example_en", "example_de", "image_url", "audio_url_en", "audio_url_de",
    )
    text_values = {name: row[name] for name in text_columns}

    return VocabularyWord(
        id=str(row["id"]),
        syllables_en=list(row["syllables_en"] or []),
        syllables_de=list(row["syllables_de"] or []),
        difficulty=row["difficulty"],
        tags=list(row["tags"] or []),
        translations=translations or {},
        **text_values,
    )
|
||||
|
||||
|
||||
async def search_words(
    query: str, lang: str = "en", limit: int = 20, offset: int = 0,
) -> List[VocabularyWord]:
    """Fuzzy-search the catalog in English ("en") or German (any other lang).

    Combines a case-insensitive substring match with pg_trgm's "%"
    similarity operator, ranked by trigram similarity. ``lang`` only
    selects one of two hard-coded column names, so the f-string cannot
    inject SQL; user input goes through bind parameters.
    """
    pool = await get_pool()
    column = "english" if lang == "en" else "german"
    like_pattern = f"%{query.lower()}%"
    sql = f"""
            SELECT * FROM vocabulary_words
            WHERE lower({column}) LIKE $1 OR {column} % $2
            ORDER BY similarity({column}, $2) DESC, lower({column})
            LIMIT $3 OFFSET $4
            """
    async with pool.acquire() as conn:
        records = await conn.fetch(sql, like_pattern, query, limit, offset)
    return [_row_to_word(record) for record in records]
|
||||
|
||||
|
||||
async def get_word(word_id: str) -> Optional[VocabularyWord]:
    """Fetch a single word by its UUID string.

    Returns None when the ID does not exist OR is not a valid UUID.
    (Previously a malformed ``word_id`` raised ValueError/TypeError from
    ``uuid.UUID`` before the query even ran; for a lookup API, treating
    garbage IDs as "not found" is safer. Validation now also happens
    before acquiring a pool connection.)
    """
    try:
        key = uuid.UUID(word_id)
    except (TypeError, ValueError, AttributeError):
        # Malformed or non-string ID: report "not found" instead of raising.
        return None
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM vocabulary_words WHERE id = $1", key,
        )
    return _row_to_word(row) if row else None
|
||||
|
||||
|
||||
async def browse_words(
    pos: str = "", difficulty: int = 0, tag: str = "",
    limit: int = 50, offset: int = 0,
) -> List[VocabularyWord]:
    """Browse the catalog with optional POS / difficulty / tag filters.

    Empty string / 0 disables the corresponding filter. Filter values are
    bound as parameters; only the placeholder numbers are interpolated.
    """
    pool = await get_pool()

    clauses: List[str] = []
    params: List[Any] = []
    # Append the value first, then use len(params) as its 1-based $N index.
    if pos:
        params.append(pos)
        clauses.append(f"part_of_speech = ${len(params)}")
    if difficulty > 0:
        params.append(difficulty)
        clauses.append(f"difficulty = ${len(params)}")
    if tag:
        params.append(tag)
        clauses.append(f"${len(params)} = ANY(tags)")

    where = "WHERE " + " AND ".join(clauses) if clauses else ""
    limit_idx, offset_idx = len(params) + 1, len(params) + 2
    params.extend([limit, offset])

    async with pool.acquire() as conn:
        records = await conn.fetch(
            f"SELECT * FROM vocabulary_words {where} ORDER BY english LIMIT ${limit_idx} OFFSET ${offset_idx}",
            *params,
        )
    return [_row_to_word(record) for record in records]
|
||||
|
||||
|
||||
async def insert_word(word: VocabularyWord) -> str:
    """Persist a single word and return its ID.

    A missing ``word.id`` gets a fresh UUID. An existing row with the same
    ID is left untouched (ON CONFLICT DO NOTHING); the ID is returned
    either way.
    """
    import json

    pool = await get_pool()
    word_id = word.id or str(uuid.uuid4())
    # Column order matches the INSERT column list below.
    values = (
        uuid.UUID(word_id), word.english, word.german,
        word.ipa_en, word.ipa_de, word.part_of_speech,
        word.syllables_en, word.syllables_de,
        word.example_en, word.example_de,
        word.image_url, word.audio_url_en, word.audio_url_de,
        word.difficulty, word.tags, json.dumps(word.translations),
    )
    async with pool.acquire() as conn:
        await conn.execute(
            """
            INSERT INTO vocabulary_words
                (id, english, german, ipa_en, ipa_de, part_of_speech,
                 syllables_en, syllables_de, example_en, example_de,
                 image_url, audio_url_en, audio_url_de, difficulty, tags, translations)
            VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)
            ON CONFLICT (id) DO NOTHING
            """,
            *values,
        )
    return word_id
|
||||
|
||||
|
||||
async def insert_words_bulk(words: List[VocabularyWord]) -> int:
    """Bulk-insert words via a single executemany round-trip.

    Returns the number of words *submitted*. Rows whose ID already exists
    are silently skipped by ON CONFLICT DO NOTHING, so the number of rows
    actually inserted may be lower (executemany does not report counts).
    """
    import json

    def _as_record(w: VocabularyWord):
        # Missing IDs get a fresh UUID, mirroring insert_word().
        return (
            uuid.UUID(w.id or str(uuid.uuid4())), w.english, w.german,
            w.ipa_en, w.ipa_de, w.part_of_speech,
            w.syllables_en, w.syllables_de,
            w.example_en, w.example_de,
            w.image_url, w.audio_url_en, w.audio_url_de,
            w.difficulty, w.tags, json.dumps(w.translations),
        )

    records = [_as_record(w) for w in words]
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.executemany(
            """
            INSERT INTO vocabulary_words
                (id, english, german, ipa_en, ipa_de, part_of_speech,
                 syllables_en, syllables_de, example_en, example_de,
                 image_url, audio_url_en, audio_url_de, difficulty, tags, translations)
            VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)
            ON CONFLICT (id) DO NOTHING
            """,
            records,
        )
    return len(records)
|
||||
|
||||
|
||||
async def count_words() -> int:
    """Return the total number of rows in vocabulary_words."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        total = await conn.fetchval("SELECT COUNT(*) FROM vocabulary_words")
    return total
|
||||
|
||||
|
||||
async def get_all_tags() -> List[str]:
    """Return every distinct tag used in the catalog, sorted ascending."""
    pool = await get_pool()
    sql = "SELECT DISTINCT unnest(tags) AS tag FROM vocabulary_words ORDER BY tag"
    async with pool.acquire() as conn:
        records = await conn.fetch(sql)
    return [record["tag"] for record in records]
|
||||
|
||||
|
||||
async def get_all_pos() -> List[str]:
    """Return every distinct non-empty part of speech, sorted ascending."""
    pool = await get_pool()
    sql = (
        "SELECT DISTINCT part_of_speech FROM vocabulary_words "
        "WHERE part_of_speech != '' ORDER BY part_of_speech"
    )
    async with pool.acquire() as conn:
        records = await conn.fetch(sql)
    return [record["part_of_speech"] for record in records]
|
||||
Reference in New Issue
Block a user