"""
Vocabulary Session Store - PostgreSQL persistence for vocab extraction sessions.

Replaces in-memory storage with database persistence.
See migrations/001_vocab_sessions.sql for schema.
"""

import os
import uuid
import logging
import json
from typing import Optional, List, Dict, Any, Tuple
from datetime import datetime

import asyncpg

logger = logging.getLogger(__name__)

# Database configuration
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db"
)

# Connection pool (initialized lazily)
_pool: Optional[asyncpg.Pool] = None

# Columns callers may change through update_session_db(). Column names are
# interpolated into the SQL text, so this allow-list is what keeps the
# dynamically built UPDATE statement safe.
_SESSION_UPDATE_FIELDS = frozenset({
    'name', 'description', 'status', 'vocabulary_count',
    'extraction_confidence', 'image_path', 'pdf_path', 'pdf_page_count',
    'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages',
})

# Subset of the session columns whose values are serialized with json.dumps
# before being sent to the database.
_SESSION_JSON_FIELDS = frozenset({
    'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages',
})

# Columns callers may change through update_vocabulary_db().
_VOCAB_UPDATE_FIELDS = frozenset({
    'english', 'german', 'example_sentence', 'example_sentence_gap',
    'word_type', 'source_page',
})


async def get_pool() -> asyncpg.Pool:
    """Return the module-wide connection pool, creating it on first use."""
    global _pool
    if _pool is None:
        _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
    return _pool


async def init_vocab_tables():
    """
    Initialize vocab tables if they don't exist.

    Intended to be called at startup. Checks information_schema for a table
    named 'vocab_sessions'; when it is missing, executes the SQL found in
    migrations/001_vocab_sessions.sql next to this module. A missing
    migration file is logged as a warning and otherwise ignored.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Check if tables exist
        tables_exist = await conn.fetchval("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_name = 'vocab_sessions'
            )
        """)

        if tables_exist:
            logger.debug("Vocab tables already exist")
            return

        logger.info("Creating vocab tables...")
        migration_path = os.path.join(
            os.path.dirname(__file__),
            "migrations/001_vocab_sessions.sql"
        )
        if not os.path.exists(migration_path):
            logger.warning("Migration file not found: %s", migration_path)
            return

        # Read with an explicit encoding so the result does not depend on
        # the process locale.
        with open(migration_path, "r", encoding="utf-8") as f:
            sql = f.read()
        await conn.execute(sql)
        logger.info("Vocab tables created successfully")


# =============================================================================
# INTERNAL QUERY HELPERS
# =============================================================================

def _build_set_clause(
    updates: Dict[str, Any],
    allowed: frozenset,
    json_fields: frozenset = frozenset()
) -> Tuple[str, List[Any]]:
    """
    Build the SET part of an UPDATE statement from keyword updates.

    Keys not present in `allowed` are silently dropped. Values of keys in
    `json_fields` are serialized with json.dumps unless they are None.

    Returns a tuple of (comma-separated "col = $n" assignments, positional
    values). Placeholders are numbered from $1 in the order the keys appear
    in `updates`; both parts are empty when no key was accepted.
    """
    assignments: List[str] = []
    values: List[Any] = []
    for key, value in updates.items():
        if key not in allowed:
            continue
        # Only None maps to SQL NULL. An explicit empty list/dict is a real
        # value and must be stored as '[]' / '{}' rather than dropped.
        if key in json_fields and value is not None:
            value = json.dumps(value)
        values.append(value)
        assignments.append(f"{key} = ${len(values)}")
    return ", ".join(assignments), values


def _affected_rows(status: Optional[str]) -> int:
    """Extract the trailing integer from a command status such as 'DELETE 3'."""
    return int(status.split()[-1]) if status else 0


async def _refresh_vocabulary_count(conn, session_uuid: uuid.UUID) -> None:
    """Set vocab_sessions.vocabulary_count to the number of entries of the session."""
    await conn.execute("""
        UPDATE vocab_sessions
        SET vocabulary_count = (
            SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
        )
        WHERE id = $1
    """, session_uuid)


# =============================================================================
# SESSION OPERATIONS
# =============================================================================

async def create_session_db(
    session_id: str,
    name: str,
    description: str = "",
    source_language: str = "en",
    target_language: str = "de"
) -> Dict[str, Any]:
    """
    Create a new vocabulary session in the database.

    The row is inserted with status 'pending' and vocabulary_count 0.
    `session_id` must be a valid UUID string (uuid.UUID raises ValueError
    otherwise). Returns the inserted row as a dict.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow("""
            INSERT INTO vocab_sessions (
                id, name, description, source_language, target_language,
                status, vocabulary_count
            ) VALUES ($1, $2, $3, $4, $5, 'pending', 0)
            RETURNING *
        """, uuid.UUID(session_id), name, description, source_language, target_language)

        return _row_to_dict(row)


async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
    """Get a session by ID; returns None when no row matches."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow("""
            SELECT * FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))

        if row:
            return _row_to_dict(row)
        return None


async def list_sessions_db(
    limit: int = 50,
    offset: int = 0,
    status: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    List sessions, newest first, with optional filtering by status.

    A falsy `status` (None or empty string) disables the filter.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        if status:
            rows = await conn.fetch("""
                SELECT * FROM vocab_sessions
                WHERE status = $1
                ORDER BY created_at DESC
                LIMIT $2 OFFSET $3
            """, status, limit, offset)
        else:
            rows = await conn.fetch("""
                SELECT * FROM vocab_sessions
                ORDER BY created_at DESC
                LIMIT $1 OFFSET $2
            """, limit, offset)

        return [_row_to_dict(row) for row in rows]


async def update_session_db(
    session_id: str,
    **kwargs
) -> Optional[Dict[str, Any]]:
    """
    Update a session with the given fields.

    Only the columns in _SESSION_UPDATE_FIELDS are accepted; other keyword
    arguments are ignored. Values for the JSON columns are serialized with
    json.dumps (None is stored as NULL).

    Returns the updated session, the current session when no accepted field
    was supplied, or None when no row matches `session_id`.
    """
    set_clause, values = _build_set_clause(
        kwargs, _SESSION_UPDATE_FIELDS, _SESSION_JSON_FIELDS
    )
    if not values:
        return await get_session_db(session_id)

    # The session id is always the last positional parameter.
    values.append(uuid.UUID(session_id))

    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(f"""
            UPDATE vocab_sessions
            SET {set_clause}
            WHERE id = ${len(values)}
            RETURNING *
        """, *values)

        if row:
            return _row_to_dict(row)
        return None


async def delete_session_db(session_id: str) -> bool:
    """
    Delete a session; returns True when exactly one row was deleted.

    NOTE(review): related entries/worksheets are presumably removed by
    ON DELETE CASCADE in the schema - confirm against the migration.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        result = await conn.execute("""
            DELETE FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))

        return result == "DELETE 1"


# =============================================================================
# VOCABULARY OPERATIONS
# =============================================================================

async def add_vocabulary_db(
    session_id: str,
    vocab_list: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Add vocabulary entries to a session.

    Each item of `vocab_list` is read with .get(); 'english' and 'german'
    default to '' and the remaining fields to None. A fresh UUID is
    generated for every entry. All inserts and the refresh of the session's
    vocabulary_count run in a single transaction, so a failure part-way
    through leaves neither partial entries nor a stale count.

    Returns the inserted rows as dicts (an empty list for empty input).
    """
    if not vocab_list:
        return []

    session_uuid = uuid.UUID(session_id)
    results = []

    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            for vocab in vocab_list:
                row = await conn.fetchrow("""
                    INSERT INTO vocab_entries (
                        id, session_id, english, german, example_sentence,
                        example_sentence_gap, word_type, source_page
                    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                    RETURNING *
                """,
                    uuid.uuid4(),
                    session_uuid,
                    vocab.get('english', ''),
                    vocab.get('german', ''),
                    vocab.get('example_sentence'),
                    vocab.get('example_sentence_gap'),
                    vocab.get('word_type'),
                    vocab.get('source_page')
                )
                results.append(_row_to_dict(row))

            # Update vocabulary count
            await _refresh_vocabulary_count(conn, session_uuid)

    return results


async def get_vocabulary_db(
    session_id: str,
    source_page: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Get vocabulary entries for a session.

    With `source_page` given, only entries of that page are returned,
    ordered by created_at. Otherwise all entries are returned ordered by
    source_page (NULLs last) and then created_at.
    """
    session_uuid = uuid.UUID(session_id)

    pool = await get_pool()
    async with pool.acquire() as conn:
        if source_page is not None:
            rows = await conn.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1 AND source_page = $2
                ORDER BY created_at
            """, session_uuid, source_page)
        else:
            rows = await conn.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1
                ORDER BY source_page NULLS LAST, created_at
            """, session_uuid)

        return [_row_to_dict(row) for row in rows]


async def update_vocabulary_db(
    entry_id: str,
    **kwargs
) -> Optional[Dict[str, Any]]:
    """
    Update a single vocabulary entry.

    Only the columns in _VOCAB_UPDATE_FIELDS are accepted; other keyword
    arguments are ignored.

    Returns the updated entry, or None when no accepted field was supplied
    or no row matches `entry_id`.
    """
    set_clause, values = _build_set_clause(kwargs, _VOCAB_UPDATE_FIELDS)
    if not values:
        return None

    # The entry id is always the last positional parameter.
    values.append(uuid.UUID(entry_id))

    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(f"""
            UPDATE vocab_entries
            SET {set_clause}
            WHERE id = ${len(values)}
            RETURNING *
        """, *values)

        if row:
            return _row_to_dict(row)
        return None


async def clear_page_vocabulary_db(session_id: str, page: int) -> int:
    """
    Clear all vocabulary for a specific page of a session.

    The delete and the refresh of the session's vocabulary_count run in one
    transaction so the stored count cannot drift from the actual entries.

    Returns the number of deleted entries.
    """
    session_uuid = uuid.UUID(session_id)

    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            result = await conn.execute("""
                DELETE FROM vocab_entries
                WHERE session_id = $1 AND source_page = $2
            """, session_uuid, page)

            # Update vocabulary count
            await _refresh_vocabulary_count(conn, session_uuid)

    # Return count of deleted rows
    return _affected_rows(result)


# =============================================================================
# WORKSHEET OPERATIONS
# =============================================================================

async def create_worksheet_db(
    session_id: str,
    worksheet_types: List[str],
    pdf_path: Optional[str] = None,
    solution_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Create a worksheet record for a session.

    A fresh UUID is generated for the worksheet and `worksheet_types` is
    stored JSON-encoded. Returns the inserted row as a dict.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow("""
            INSERT INTO vocab_worksheets (
                id, session_id, worksheet_types, pdf_path, solution_path
            ) VALUES ($1, $2, $3, $4, $5)
            RETURNING *
        """,
            uuid.uuid4(),
            uuid.UUID(session_id),
            json.dumps(worksheet_types),
            pdf_path,
            solution_path
        )

        return _row_to_dict(row)


async def get_worksheet_db(worksheet_id: str) -> Optional[Dict[str, Any]]:
    """Get a worksheet by ID; returns None when no row matches."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow("""
            SELECT * FROM vocab_worksheets WHERE id = $1
        """, uuid.UUID(worksheet_id))

        if row:
            return _row_to_dict(row)
        return None


async def delete_worksheets_for_session_db(session_id: str) -> int:
    """Delete all worksheets for a session and return how many were removed."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        result = await conn.execute("""
            DELETE FROM vocab_worksheets WHERE session_id = $1
        """, uuid.UUID(session_id))

        return _affected_rows(result)


# =============================================================================
# PDF CACHE OPERATIONS
# =============================================================================

# Simple in-memory cache for PDF data (temporary until served).
# NOTE(review): entries are only removed by clear_cached_pdf_data(); nothing
# here bounds the cache size, so callers must clear entries after serving.
_pdf_cache: Dict[str, bytes] = {}


def cache_pdf_data(worksheet_id: str, pdf_data: bytes) -> None:
    """Cache PDF data temporarily for download, keyed by worksheet ID."""
    _pdf_cache[worksheet_id] = pdf_data


def get_cached_pdf_data(worksheet_id: str) -> Optional[bytes]:
    """Get cached PDF data, or None when nothing is cached for the ID."""
    return _pdf_cache.get(worksheet_id)


def clear_cached_pdf_data(worksheet_id: str) -> None:
    """Remove cached PDF data for the ID; a missing entry is not an error."""
    _pdf_cache.pop(worksheet_id, None)


# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def _row_to_dict(row: Optional[asyncpg.Record]) -> Dict[str, Any]:
    """
    Convert an asyncpg Record to a plain dict with serializable values.

    - 'id' and 'session_id' are converted to strings.
    - 'created_at', 'updated_at' and 'generated_at' are converted with
      .isoformat().
    - JSON columns that arrive as strings are parsed with json.loads.

    None values are left untouched; a None row yields an empty dict.
    """
    if row is None:
        return {}

    result = dict(row)

    # Convert UUIDs to strings
    for key in ('id', 'session_id'):
        if result.get(key) is not None:
            result[key] = str(result[key])

    # Convert datetimes to ISO strings
    for key in ('created_at', 'updated_at', 'generated_at'):
        if result.get(key) is not None:
            result[key] = result[key].isoformat()

    # Parse JSONB fields back to dicts/lists. Values that are already
    # decoded (not str) are passed through unchanged.
    for key in ('ocr_prompts', 'processed_pages', 'successful_pages',
                'failed_pages', 'worksheet_types'):
        if isinstance(result.get(key), str):
            result[key] = json.loads(result[key])

    return result