Files
breakpilot-lehrer/backend-lehrer/vocabulary_db.py
Benjamin Admin 0dbfa87058
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 1m9s
CI / test-go-edu-search (push) Successful in 1m4s
CI / test-python-klausur (push) Failing after 2m59s
CI / test-python-agent-core (push) Successful in 33s
CI / test-nodejs-website (push) Successful in 28s
Fix: pg_trgm optional, table creation no longer fails without it
Trigram extension and index are now created in a separate try/catch
so table creation succeeds even without pg_trgm. Search falls back
to ILIKE when trigram functions are not available.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 13:51:09 +02:00

297 lines
10 KiB
Python

"""
Vocabulary Database — PostgreSQL storage for the vocabulary word catalog.
Stores 160k+ words with translations, IPA, syllables, examples, images, audio.
Uses asyncpg for async PostgreSQL access (same pattern as game/database.py).
Schema: lehrer.vocabulary_words (search_path set in main.py)
"""
import logging
import os
import uuid
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
_RAW_DB_URL = os.getenv(
"DATABASE_URL",
"postgresql://breakpilot:breakpilot@postgres:5432/breakpilot",
)
# Strip SQLAlchemy dialect prefix (asyncpg needs plain postgresql://)
DATABASE_URL = _RAW_DB_URL.replace("postgresql+asyncpg://", "postgresql://")
# Strip search_path options (set via SET after connect)
if "options=" in DATABASE_URL:
DATABASE_URL = DATABASE_URL.split("?")[0] if "options=" in DATABASE_URL.split("?")[-1] else DATABASE_URL
_pool = None
async def get_pool():
    """Return the shared asyncpg connection pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    import asyncpg  # deferred import, only needed once a pool is requested
    _pool = await asyncpg.create_pool(
        DATABASE_URL, min_size=2, max_size=10,
        server_settings={"search_path": "lehrer,core,public"},
    )
    return _pool
async def init_vocabulary_tables():
    """Create vocabulary tables if they don't exist.

    Idempotent: every statement uses IF NOT EXISTS, so this is safe to run
    on every startup. The pg_trgm extension and its trigram index are
    attempted separately so that table creation still succeeds on servers
    where the extension is unavailable (search then falls back to LIKE,
    see search_words).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Core table plus btree/GIN indexes — needs no optional extensions.
        await conn.execute("""
            CREATE TABLE IF NOT EXISTS vocabulary_words (
                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                english TEXT NOT NULL,
                german TEXT NOT NULL DEFAULT '',
                ipa_en TEXT NOT NULL DEFAULT '',
                ipa_de TEXT NOT NULL DEFAULT '',
                part_of_speech TEXT NOT NULL DEFAULT '',
                syllables_en TEXT[] NOT NULL DEFAULT '{}',
                syllables_de TEXT[] NOT NULL DEFAULT '{}',
                example_en TEXT NOT NULL DEFAULT '',
                example_de TEXT NOT NULL DEFAULT '',
                image_url TEXT NOT NULL DEFAULT '',
                audio_url_en TEXT NOT NULL DEFAULT '',
                audio_url_de TEXT NOT NULL DEFAULT '',
                difficulty INT NOT NULL DEFAULT 1,
                tags TEXT[] NOT NULL DEFAULT '{}',
                translations JSONB NOT NULL DEFAULT '{}',
                created_at TIMESTAMPTZ NOT NULL DEFAULT now()
            );
            CREATE INDEX IF NOT EXISTS idx_vocab_english
                ON vocabulary_words (lower(english));
            CREATE INDEX IF NOT EXISTS idx_vocab_german
                ON vocabulary_words (lower(german));
            CREATE INDEX IF NOT EXISTS idx_vocab_pos
                ON vocabulary_words (part_of_speech);
            CREATE INDEX IF NOT EXISTS idx_vocab_difficulty
                ON vocabulary_words (difficulty);
            CREATE INDEX IF NOT EXISTS idx_vocab_tags
                ON vocabulary_words USING GIN (tags);
        """)
        # Enable trigram extension for fuzzy search (optional).
        # NOTE(review): CREATE EXTENSION typically needs elevated database
        # privileges — failure here is deliberately non-fatal.
        try:
            await conn.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;")
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_vocab_english_trgm
                ON vocabulary_words USING GIN (english gin_trgm_ops);
            """)
        except Exception:
            logger.info("pg_trgm not available — trigram search disabled, using LIKE fallback")
    logger.info("vocabulary_words table initialized")
@dataclass
class VocabularyWord:
    """A single vocabulary word with all metadata.

    Mirrors one row of the vocabulary_words table defined in
    init_vocabulary_tables (same field names and defaults).
    """
    id: str = ""              # row UUID as a string ("" until assigned)
    english: str = ""         # English headword
    german: str = ""          # German translation
    ipa_en: str = ""          # IPA transcription, English
    ipa_de: str = ""          # IPA transcription, German
    part_of_speech: str = ""  # e.g. noun/verb — free text, "" when unknown
    syllables_en: List[str] = field(default_factory=list)  # English syllable split
    syllables_de: List[str] = field(default_factory=list)  # German syllable split
    example_en: str = ""      # example sentence, English
    example_de: str = ""      # example sentence, German
    image_url: str = ""       # illustration URL ("" when none)
    audio_url_en: str = ""    # pronunciation audio URL, English
    audio_url_de: str = ""    # pronunciation audio URL, German
    difficulty: int = 1       # difficulty level, defaults to 1
    tags: List[str] = field(default_factory=list)  # free-form category tags
    # Extra translations; presumably keyed by language code — TODO confirm.
    translations: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (asdict copies nested lists/dicts)."""
        return asdict(self)
def _row_to_word(row) -> VocabularyWord:
    """Convert an asyncpg Record (or any mapping by column name) to VocabularyWord."""
    import json

    # The translations column may arrive as an already-decoded dict or as a
    # raw JSON string — normalize to a dict either way.
    raw_translations = row["translations"]
    if isinstance(raw_translations, str):
        raw_translations = json.loads(raw_translations)

    # Plain text columns map 1:1 onto dataclass fields of the same name.
    text_columns = (
        "english", "german", "ipa_en", "ipa_de", "part_of_speech",
        "example_en", "example_de", "image_url", "audio_url_en", "audio_url_de",
    )
    kwargs = {name: row[name] for name in text_columns}
    return VocabularyWord(
        id=str(row["id"]),
        syllables_en=list(row["syllables_en"] or []),
        syllables_de=list(row["syllables_de"] or []),
        difficulty=row["difficulty"],
        tags=list(row["tags"] or []),
        translations=raw_translations or {},
        **kwargs,
    )
async def search_words(
    query: str, lang: str = "en", limit: int = 20, offset: int = 0,
) -> List[VocabularyWord]:
    """Full-text search for words. Uses trigram similarity if available, else ILIKE.

    Args:
        query: Search text. NOTE(review): LIKE wildcards (``%``/``_``) in
            the query are not escaped, so they act as wildcards — confirm
            this is intended.
        lang: "en" searches the english column; any other value searches german.
        limit: Maximum rows returned.
        offset: Paging offset.

    Returns:
        Matching words — ordered by trigram similarity when pg_trgm is
        installed, otherwise alphabetically by the searched column.
    """
    pool = await get_pool()
    # Column name comes from a fixed two-value whitelist, so interpolating
    # it into the SQL f-strings below is safe.
    col = "english" if lang == "en" else "german"
    async with pool.acquire() as conn:
        # Try trigram search first, fall back to ILIKE
        try:
            # The `%` operator and similarity() only exist with pg_trgm;
            # without the extension this raises and we fall through.
            rows = await conn.fetch(
                f"""
                SELECT * FROM vocabulary_words
                WHERE lower({col}) LIKE $1 OR {col} % $2
                ORDER BY similarity({col}, $2) DESC, lower({col})
                LIMIT $3 OFFSET $4
                """,
                f"%{query.lower()}%", query, limit, offset,
            )
        except Exception:
            # Case-insensitive substring match: lower() + LIKE on a lowered
            # pattern is equivalent to ILIKE.
            rows = await conn.fetch(
                f"""
                SELECT * FROM vocabulary_words
                WHERE lower({col}) LIKE $1
                ORDER BY lower({col})
                LIMIT $2 OFFSET $3
                """,
                f"%{query.lower()}%", limit, offset,
            )
    return [_row_to_word(r) for r in rows]
async def get_word(word_id: str) -> Optional["VocabularyWord"]:
    """Get a single word by ID.

    Args:
        word_id: The word's UUID as a string.

    Returns:
        The word, or None when the ID is unknown — or not a valid UUID at
        all, so callers get a uniform "not found" instead of a ValueError
        bubbling out of uuid.UUID().
    """
    try:
        key = uuid.UUID(word_id)
    except (ValueError, AttributeError, TypeError):
        # Malformed/None input can never match a row; treat as not found.
        return None
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM vocabulary_words WHERE id = $1", key,
        )
    return _row_to_word(row) if row else None
async def browse_words(
    pos: str = "", difficulty: int = 0, tag: str = "",
    limit: int = 50, offset: int = 0,
) -> List[VocabularyWord]:
    """Browse words, optionally filtered by part of speech, difficulty, and tag.

    Empty-string / zero filter values mean "no filter". Results are ordered
    alphabetically by the english column and paged via limit/offset.
    """
    pool = await get_pool()
    filters: List[str] = []
    args: List[Any] = []
    # Placeholder numbers are derived from how many args are already bound,
    # so $1..$n always line up with the args list.
    if pos:
        filters.append(f"part_of_speech = ${len(args) + 1}")
        args.append(pos)
    if difficulty > 0:
        filters.append(f"difficulty = ${len(args) + 1}")
        args.append(difficulty)
    if tag:
        filters.append(f"${len(args) + 1} = ANY(tags)")
        args.append(tag)
    where = "WHERE " + " AND ".join(filters) if filters else ""
    limit_ph, offset_ph = len(args) + 1, len(args) + 2
    args.extend([limit, offset])
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            f"SELECT * FROM vocabulary_words {where} ORDER BY english LIMIT ${limit_ph} OFFSET ${offset_ph}",
            *args,
        )
    return [_row_to_word(r) for r in rows]
async def insert_word(word: VocabularyWord) -> str:
    """Insert a new word and return its ID.

    When word.id is empty a fresh UUID is generated. An existing ID is left
    untouched (ON CONFLICT DO NOTHING), but its value is still returned.
    """
    import json

    pool = await get_pool()
    word_id = word.id or str(uuid.uuid4())
    # Column order must match the INSERT column list below.
    values = (
        uuid.UUID(word_id), word.english, word.german,
        word.ipa_en, word.ipa_de, word.part_of_speech,
        word.syllables_en, word.syllables_de,
        word.example_en, word.example_de,
        word.image_url, word.audio_url_en, word.audio_url_de,
        word.difficulty, word.tags, json.dumps(word.translations),
    )
    async with pool.acquire() as conn:
        await conn.execute(
            """
            INSERT INTO vocabulary_words
            (id, english, german, ipa_en, ipa_de, part_of_speech,
             syllables_en, syllables_de, example_en, example_de,
             image_url, audio_url_en, audio_url_de, difficulty, tags, translations)
            VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)
            ON CONFLICT (id) DO NOTHING
            """,
            *values,
        )
    return word_id
async def insert_words_bulk(words: List["VocabularyWord"]) -> int:
    """Bulk insert words, skipping IDs that already exist.

    Args:
        words: Words to insert; any word with an empty id gets a fresh UUID.

    Returns:
        The number of rows submitted. Because of ON CONFLICT DO NOTHING,
        rows whose ID already exists are silently skipped, so the count
        actually inserted may be lower (the old docstring claimed this was
        the inserted count, which was misleading).
    """
    import json

    if not words:
        # Nothing to do — avoid acquiring a connection for an empty batch.
        return 0
    # Tuple order must match the INSERT column list below.
    records = [
        (
            uuid.UUID(w.id or str(uuid.uuid4())), w.english, w.german,
            w.ipa_en, w.ipa_de, w.part_of_speech,
            w.syllables_en, w.syllables_de,
            w.example_en, w.example_de,
            w.image_url, w.audio_url_en, w.audio_url_de,
            w.difficulty, w.tags, json.dumps(w.translations),
        )
        for w in words
    ]
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.executemany(
            """
            INSERT INTO vocabulary_words
            (id, english, german, ipa_en, ipa_de, part_of_speech,
             syllables_en, syllables_de, example_en, example_de,
             image_url, audio_url_en, audio_url_de, difficulty, tags, translations)
            VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)
            ON CONFLICT (id) DO NOTHING
            """,
            records,
        )
    return len(records)
async def count_words() -> int:
    """Return the total number of rows in vocabulary_words."""
    pool = await get_pool()
    async with pool.acquire() as connection:
        total = await connection.fetchval("SELECT COUNT(*) FROM vocabulary_words")
    return total
async def get_all_tags() -> List[str]:
    """Return every distinct tag across all words, ordered by tag."""
    pool = await get_pool()
    async with pool.acquire() as connection:
        records = await connection.fetch(
            "SELECT DISTINCT unnest(tags) AS tag FROM vocabulary_words ORDER BY tag"
        )
    return [record["tag"] for record in records]
async def get_all_pos() -> List[str]:
    """Return every distinct non-empty part of speech, ordered by value."""
    pool = await get_pool()
    async with pool.acquire() as connection:
        records = await connection.fetch(
            "SELECT DISTINCT part_of_speech FROM vocabulary_words WHERE part_of_speech != '' ORDER BY part_of_speech"
        )
    return [record["part_of_speech"] for record in records]