# breakpilot-lehrer/klausur-service/backend/vocab_session_store.py

"""
Vocabulary Session Store - PostgreSQL persistence for vocab extraction sessions.

Replaces in-memory storage with database persistence.
See migrations/001_vocab_sessions.sql for schema.
"""

import os
import uuid
import logging
import json
from typing import Optional, List, Dict, Any
from datetime import datetime

import asyncpg

logger = logging.getLogger(__name__)

# Database configuration
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db"
)

# Connection pool (initialized lazily)
_pool: Optional[asyncpg.Pool] = None


async def get_pool() -> asyncpg.Pool:
    """Return the shared asyncpg pool, creating it on first use.

    The pool is kept in the module-level ``_pool`` and is built from
    ``DATABASE_URL`` with ``min_size=2`` and ``max_size=10``.
    """
    global _pool
    if _pool is not None:
        return _pool
    # NOTE(review): nothing guards the check above against a second caller
    # arriving while create_pool() is still awaiting -- confirm whether
    # startup code guarantees a single first caller.
    _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
    return _pool


async def init_vocab_tables():
    """
    Create the vocab tables when ``vocab_sessions`` does not exist yet.

    Intended to be called at startup. If ``information_schema.tables`` has no
    ``vocab_sessions`` entry, the SQL in ``migrations/001_vocab_sessions.sql``
    (resolved relative to this module) is read and executed on a pooled
    connection. A missing migration file is logged as a warning and nothing
    is executed.

    NOTE(review): the existence check filters on the table name only, not on
    the schema, so a ``vocab_sessions`` table in any schema counts as present.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Check if tables exist
        tables_exist = await conn.fetchval("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_name = 'vocab_sessions'
            )
        """)

        if tables_exist:
            logger.debug("Vocab tables already exist")
            return

        logger.info("Creating vocab tables...")
        migration_path = os.path.join(
            os.path.dirname(__file__),
            "migrations/001_vocab_sessions.sql"
        )
        if not os.path.exists(migration_path):
            logger.warning("Migration file not found: %s", migration_path)
            return

        # Decode explicitly as UTF-8 so the migration reads the same way
        # regardless of the locale the service happens to run under.
        with open(migration_path, "r", encoding="utf-8") as f:
            sql = f.read()
        await conn.execute(sql)
        logger.info("Vocab tables created successfully")

# =============================================================================
# SESSION OPERATIONS
# =============================================================================

async def create_session_db(
    session_id: str,
    name: str,
    description: str = "",
    source_language: str = "en",
    target_language: str = "de"
) -> Dict[str, Any]:
    """Insert a new row into ``vocab_sessions`` and return it as a dict.

    The row is created with status ``'pending'`` and a vocabulary count of 0.
    ``session_id`` is parsed with ``uuid.UUID``, so it must be a valid UUID
    string. The inserted row is returned after conversion by ``_row_to_dict``.
    """
    insert_sql = """
            INSERT INTO vocab_sessions (
                id, name, description, source_language, target_language,
                status, vocabulary_count
            ) VALUES ($1, $2, $3, $4, $5, 'pending', 0)
            RETURNING *
        """
    pool = await get_pool()
    async with pool.acquire() as connection:
        params = (
            uuid.UUID(session_id),
            name,
            description,
            source_language,
            target_language,
        )
        record = await connection.fetchrow(insert_sql, *params)

    return _row_to_dict(record)


async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one session row by its UUID string.

    Returns the row converted by ``_row_to_dict``, or ``None`` when no row
    has that ID.
    """
    pool = await get_pool()
    async with pool.acquire() as connection:
        record = await connection.fetchrow("""
            SELECT * FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))

    return _row_to_dict(record) if record else None


async def list_sessions_db(
    limit: int = 50,
    offset: int = 0,
    status: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Return a page of sessions, newest first.

    When ``status`` is truthy only sessions with that status are returned;
    ``None`` or an empty string disables the filter. ``limit`` and ``offset``
    are passed straight to the query's LIMIT/OFFSET.
    """
    # Pick the statement and its positional arguments up front so the
    # database round trip below is a single code path.
    if status:
        query = """
                SELECT * FROM vocab_sessions
                WHERE status = $1
                ORDER BY created_at DESC
                LIMIT $2 OFFSET $3
            """
        args = (status, limit, offset)
    else:
        query = """
                SELECT * FROM vocab_sessions
                ORDER BY created_at DESC
                LIMIT $1 OFFSET $2
            """
        args = (limit, offset)

    pool = await get_pool()
    async with pool.acquire() as connection:
        records = await connection.fetch(query, *args)

    return list(map(_row_to_dict, records))


async def update_session_db(
    session_id: str,
    **kwargs
) -> Optional[Dict[str, Any]]:
    """Update selected columns of a session and return the updated row.

    Only keyword arguments whose names are in the allow-list below are
    written; anything else is silently ignored. Because column names are
    interpolated into the SQL text, the allow-list is what keeps the
    statement safe -- values themselves always travel as bound parameters.

    Values for the JSONB columns (``ocr_prompts``, ``processed_pages``,
    ``successful_pages``, ``failed_pages``) are serialized with
    ``json.dumps``. Only ``None`` is stored as NULL; empty containers such
    as ``[]`` or ``{}`` are stored as-is rather than being collapsed to NULL.

    Returns:
        The updated row as a dict, the current row when no allowed field was
        supplied, or ``None`` when no session has the given ID.
    """
    allowed_fields = (
        'name', 'description', 'status', 'vocabulary_count',
        'extraction_confidence', 'image_path', 'pdf_path', 'pdf_page_count',
        'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages'
    )
    json_fields = ('ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages')

    # Build the dynamic SET clause; placeholder numbers follow the order in
    # which values are appended.
    assignments = []
    values = []
    for key, value in kwargs.items():
        if key not in allowed_fields:
            continue
        if key in json_fields and value is not None:
            # Test against None, not truthiness: an empty list/dict is a
            # legitimate value and must not be turned into NULL.
            value = json.dumps(value)
        values.append(value)
        assignments.append(f"{key} = ${len(values)}")

    if not assignments:
        return await get_session_db(session_id)

    # The session ID is always the last bound parameter.
    values.append(uuid.UUID(session_id))

    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(f"""
            UPDATE vocab_sessions
            SET {', '.join(assignments)}
            WHERE id = ${len(values)}
            RETURNING *
        """, *values)

    return _row_to_dict(row) if row else None


async def delete_session_db(session_id: str) -> bool:
    """Delete the session with the given ID.

    Returns ``True`` only when the command status reports exactly one
    deleted row. Removal of related data is left to the database
    (presumably ON DELETE CASCADE in the schema -- verify in the migration).
    """
    pool = await get_pool()
    async with pool.acquire() as connection:
        status_tag = await connection.execute("""
            DELETE FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))

    return status_tag == "DELETE 1"


# =============================================================================
# VOCABULARY OPERATIONS
# =============================================================================

async def add_vocabulary_db(
    session_id: str,
    vocab_list: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Insert vocabulary entries for a session and refresh its entry count.

    Each item of ``vocab_list`` is a dict; ``english`` and ``german`` default
    to an empty string when missing, while ``example_sentence``,
    ``example_sentence_gap``, ``word_type`` and ``source_page`` default to
    ``None``. Every entry gets a freshly generated UUID.

    All inserts and the ``vocabulary_count`` refresh run inside a single
    transaction, so a failure part-way through leaves neither partial
    entries nor a stale count behind.

    Returns:
        The inserted rows as dicts, in input order; an empty list when
        ``vocab_list`` is empty.
    """
    if not vocab_list:
        return []

    insert_sql = """
                INSERT INTO vocab_entries (
                    id, session_id, english, german, example_sentence,
                    example_sentence_gap, word_type, source_page
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                RETURNING *
            """
    recount_sql = """
            UPDATE vocab_sessions
            SET vocabulary_count = (
                SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
            )
            WHERE id = $1
        """

    pool = await get_pool()
    results = []

    async with pool.acquire() as conn:
        session_uuid = uuid.UUID(session_id)
        # One transaction for the whole batch keeps entries and the
        # denormalized count consistent with each other.
        async with conn.transaction():
            for vocab in vocab_list:
                row = await conn.fetchrow(
                    insert_sql,
                    uuid.uuid4(),
                    session_uuid,
                    vocab.get('english', ''),
                    vocab.get('german', ''),
                    vocab.get('example_sentence'),
                    vocab.get('example_sentence_gap'),
                    vocab.get('word_type'),
                    vocab.get('source_page')
                )
                results.append(_row_to_dict(row))

            # Update vocabulary count
            await conn.execute(recount_sql, session_uuid)

    return results


async def get_vocabulary_db(
    session_id: str,
    source_page: Optional[int] = None
) -> List[Dict[str, Any]]:
    """Return the vocabulary entries of a session as dicts.

    With ``source_page`` given, only entries from that page are returned,
    ordered by creation time. Without it, all entries are returned ordered
    by page (entries without a page last) and then by creation time.
    """
    pool = await get_pool()
    async with pool.acquire() as connection:
        session_uuid = uuid.UUID(session_id)
        if source_page is None:
            records = await connection.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1
                ORDER BY source_page NULLS LAST, created_at
            """, session_uuid)
        else:
            records = await connection.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1 AND source_page = $2
                ORDER BY created_at
            """, session_uuid, source_page)

    return list(map(_row_to_dict, records))


async def update_vocabulary_db(
    entry_id: str,
    **kwargs
) -> Optional[Dict[str, Any]]:
    """Update selected columns of one vocabulary entry.

    Only keyword arguments named in the allow-list below are written; other
    names are ignored. Returns the updated row as a dict. Returns ``None``
    both when no allowed field was supplied and when no entry has the given
    ID.
    """
    pool = await get_pool()

    editable = (
        'english', 'german', 'example_sentence', 'example_sentence_gap',
        'word_type', 'source_page'
    )
    updates = {column: value for column, value in kwargs.items() if column in editable}
    if not updates:
        return None

    # Placeholders $1..$n carry the new values; the entry ID takes the
    # next number after them.
    set_clauses = [
        f"{column} = ${position}"
        for position, column in enumerate(updates, start=1)
    ]
    values = [*updates.values(), uuid.UUID(entry_id)]
    id_placeholder = len(values)

    async with pool.acquire() as connection:
        record = await connection.fetchrow(f"""
            UPDATE vocab_entries
            SET {', '.join(set_clauses)}
            WHERE id = ${id_placeholder}
            RETURNING *
        """, *values)

    return _row_to_dict(record) if record else None


async def clear_page_vocabulary_db(session_id: str, page: int) -> int:
    """Delete all vocabulary entries of one page and refresh the count.

    The delete and the ``vocabulary_count`` refresh run in one transaction,
    so the stored count cannot drift from the actual entries if the second
    statement fails.

    Returns:
        The number of deleted entries, taken from the trailing number of the
        DELETE command status.
    """
    delete_sql = """
            DELETE FROM vocab_entries
            WHERE session_id = $1 AND source_page = $2
        """
    recount_sql = """
            UPDATE vocab_sessions
            SET vocabulary_count = (
                SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
            )
            WHERE id = $1
        """

    pool = await get_pool()
    async with pool.acquire() as conn:
        session_uuid = uuid.UUID(session_id)
        async with conn.transaction():
            result = await conn.execute(delete_sql, session_uuid, page)

            # Update vocabulary count
            await conn.execute(recount_sql, session_uuid)

    # Return count of deleted rows
    return int(result.split()[-1]) if result else 0


# =============================================================================
# WORKSHEET OPERATIONS
# =============================================================================

async def create_worksheet_db(
    session_id: str,
    worksheet_types: List[str],
    pdf_path: Optional[str] = None,
    solution_path: Optional[str] = None
) -> Dict[str, Any]:
    """Insert a worksheet row for a session and return it as a dict.

    A new UUID is generated for the worksheet. ``worksheet_types`` is
    serialized with ``json.dumps`` before being bound to the statement.
    """
    insert_sql = """
            INSERT INTO vocab_worksheets (
                id, session_id, worksheet_types, pdf_path, solution_path
            ) VALUES ($1, $2, $3, $4, $5)
            RETURNING *
        """
    pool = await get_pool()
    new_id = uuid.uuid4()

    async with pool.acquire() as connection:
        record = await connection.fetchrow(
            insert_sql,
            new_id,
            uuid.UUID(session_id),
            json.dumps(worksheet_types),
            pdf_path,
            solution_path,
        )

    return _row_to_dict(record)


async def get_worksheet_db(worksheet_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one worksheet row by its UUID string.

    Returns the row converted by ``_row_to_dict``, or ``None`` when no
    worksheet has that ID.
    """
    pool = await get_pool()
    async with pool.acquire() as connection:
        record = await connection.fetchrow("""
            SELECT * FROM vocab_worksheets WHERE id = $1
        """, uuid.UUID(worksheet_id))

    return _row_to_dict(record) if record else None


async def delete_worksheets_for_session_db(session_id: str) -> int:
    """Delete every worksheet row belonging to a session.

    Returns the number of deleted rows, read from the trailing number of the
    DELETE command status (0 when the status is empty).
    """
    pool = await get_pool()
    async with pool.acquire() as connection:
        status_tag = await connection.execute("""
            DELETE FROM vocab_worksheets WHERE session_id = $1
        """, uuid.UUID(session_id))

    if not status_tag:
        return 0
    return int(status_tag.split()[-1])


# =============================================================================
# PDF CACHE OPERATIONS
# =============================================================================

# Process-local store of PDF bytes keyed by worksheet ID. Entries remain
# until clear_cached_pdf_data() removes them.
_pdf_cache: Dict[str, bytes] = {}


def cache_pdf_data(worksheet_id: str, pdf_data: bytes) -> None:
    """Store ``pdf_data`` under ``worksheet_id``, replacing any earlier entry."""
    _pdf_cache.update({worksheet_id: pdf_data})


def get_cached_pdf_data(worksheet_id: str) -> Optional[bytes]:
    """Return the cached PDF bytes for ``worksheet_id``, or ``None`` if absent."""
    try:
        return _pdf_cache[worksheet_id]
    except KeyError:
        return None


def clear_cached_pdf_data(worksheet_id: str) -> None:
    """Remove the cached PDF for ``worksheet_id``; a missing entry is ignored."""
    if worksheet_id in _pdf_cache:
        del _pdf_cache[worksheet_id]


# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
    """Convert asyncpg Record to dict with proper type handling."""
    if row is None:
        return {}

    result = dict(row)

    # Convert UUIDs to strings
    for key in ['id', 'session_id']:
        if key in result and result[key] is not None:
            result[key] = str(result[key])

    # Convert datetimes to ISO strings
    for key in ['created_at', 'updated_at', 'generated_at']:
        if key in result and result[key] is not None:
            result[key] = result[key].isoformat()

    # Parse JSONB fields back to dicts/lists
    for key in ['ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages', 'worksheet_types']:
        if key in result and result[key] is not None:
            if isinstance(result[key], str):
                result[key] = json.loads(result[key])

    return result