feat(klausur-service): Add Tesseract OCR, DSFA RAG, TrOCR, grid detection and vocab session store
New modules: - tesseract_vocab_extractor.py: Bounding-box OCR with multi-PSM pipeline - grid_detection_service.py: CV-based grid/table detection for worksheets - vocab_session_store.py: PostgreSQL persistence for vocab sessions - trocr_api.py: TrOCR handwriting recognition endpoint - dsfa_rag_api.py + dsfa_corpus_ingestion.py: DSFA RAG corpus search Changes: - Dockerfile: Install tesseract-ocr + deu/eng language packs - requirements.txt: Add PyMuPDF, pytesseract, Pillow - main.py: Register new routers, init DB pools + Qdrant collections Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
428
klausur-service/backend/vocab_session_store.py
Normal file
428
klausur-service/backend/vocab_session_store.py
Normal file
@@ -0,0 +1,428 @@
|
||||
"""
|
||||
Vocabulary Session Store - PostgreSQL persistence for vocab extraction sessions.
|
||||
|
||||
Replaces in-memory storage with database persistence.
|
||||
See migrations/001_vocab_sessions.sql for schema.
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import logging
|
||||
import json
|
||||
from typing import Optional, List, Dict, Any
|
||||
from datetime import datetime
|
||||
|
||||
import asyncpg
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Database configuration
# NOTE(review): the fallback DSN embeds credentials ("breakpilot:breakpilot") —
# fine for local docker-compose, but confirm production always sets DATABASE_URL.
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db"
)

# Connection pool (initialized lazily)
# Module-level singleton; created on first call to get_pool().
_pool: Optional[asyncpg.Pool] = None
|
||||
|
||||
|
||||
async def get_pool() -> asyncpg.Pool:
    """Return the shared asyncpg connection pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    # First caller pays the connection cost; subsequent calls reuse the pool.
    _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
    return _pool
|
||||
|
||||
|
||||
async def init_vocab_tables():
    """Create the vocab tables at startup when they do not exist yet.

    Checks information_schema for 'vocab_sessions' and, if absent, executes
    the SQL migration file shipped next to this module.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        tables_exist = await conn.fetchval("""
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_name = 'vocab_sessions'
            )
        """)

        if tables_exist:
            logger.debug("Vocab tables already exist")
            return

        logger.info("Creating vocab tables...")
        # Schema lives in a migration file rather than inline SQL.
        migration_path = os.path.join(
            os.path.dirname(__file__),
            "migrations/001_vocab_sessions.sql"
        )
        if not os.path.exists(migration_path):
            logger.warning(f"Migration file not found: {migration_path}")
            return

        with open(migration_path, "r") as f:
            sql = f.read()
        await conn.execute(sql)
        logger.info("Vocab tables created successfully")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# SESSION OPERATIONS
|
||||
# =============================================================================
|
||||
|
||||
async def create_session_db(
    session_id: str,
    name: str,
    description: str = "",
    source_language: str = "en",
    target_language: str = "de"
) -> Dict[str, Any]:
    """Insert a new vocab session (status 'pending', zero entries) and return it."""
    insert_sql = """
            INSERT INTO vocab_sessions (
                id, name, description, source_language, target_language,
                status, vocabulary_count
            ) VALUES ($1, $2, $3, $4, $5, 'pending', 0)
            RETURNING *
        """
    pool = await get_pool()
    async with pool.acquire() as conn:
        inserted = await conn.fetchrow(
            insert_sql,
            uuid.UUID(session_id),
            name,
            description,
            source_language,
            target_language,
        )
    return _row_to_dict(inserted)
|
||||
|
||||
|
||||
async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
    """Fetch a single session by its UUID string; None when not found."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow("""
            SELECT * FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))
    return _row_to_dict(record) if record else None
|
||||
|
||||
|
||||
async def list_sessions_db(
    limit: int = 50,
    offset: int = 0,
    status: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Return sessions newest-first, optionally filtered by status, paginated."""
    # Pick the query/parameter pair up front; a truthy status adds the filter.
    if status:
        query = """
                SELECT * FROM vocab_sessions
                WHERE status = $1
                ORDER BY created_at DESC
                LIMIT $2 OFFSET $3
            """
        params = (status, limit, offset)
    else:
        query = """
                SELECT * FROM vocab_sessions
                ORDER BY created_at DESC
                LIMIT $1 OFFSET $2
            """
        params = (limit, offset)

    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(query, *params)
    return [_row_to_dict(rec) for rec in records]
|
||||
|
||||
|
||||
async def update_session_db(
    session_id: str,
    **kwargs
) -> Optional[Dict[str, Any]]:
    """Update a session with the given fields.

    Only keys in the allow-list are applied; unknown kwargs are silently
    ignored. JSONB columns are serialized with json.dumps.

    Returns:
        The updated row as a dict, the current row unchanged when no
        updatable fields were supplied, or None when the session is missing.
    """
    pool = await get_pool()

    # Build dynamic UPDATE query. Column names come only from the allow-list
    # below (never from caller input), so interpolating them is SQL-safe;
    # all values go through $n placeholders.
    fields = []
    values = []
    param_idx = 1

    allowed_fields = [
        'name', 'description', 'status', 'vocabulary_count',
        'extraction_confidence', 'image_path', 'pdf_path', 'pdf_page_count',
        'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages'
    ]
    # Columns stored as JSONB that need explicit serialization.
    jsonb_fields = {'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages'}

    for key, value in kwargs.items():
        if key not in allowed_fields:
            continue
        fields.append(f"{key} = ${param_idx}")
        if key in jsonb_fields:
            # BUGFIX: was `json.dumps(value) if value else None`, which turned
            # empty lists/dicts (falsy) into SQL NULL. Preserve empty
            # containers; only an actual None maps to NULL.
            value = json.dumps(value) if value is not None else None
        values.append(value)
        param_idx += 1

    if not fields:
        # Nothing to update — hand back the current state.
        return await get_session_db(session_id)

    values.append(uuid.UUID(session_id))

    async with pool.acquire() as conn:
        row = await conn.fetchrow(f"""
            UPDATE vocab_sessions
            SET {', '.join(fields)}
            WHERE id = ${param_idx}
            RETURNING *
        """, *values)

    if row:
        return _row_to_dict(row)
    return None
|
||||
|
||||
|
||||
async def delete_session_db(session_id: str) -> bool:
    """Delete a session; related rows are removed via ON DELETE CASCADE.

    Returns True when exactly one row was deleted.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        status_tag = await conn.execute("""
            DELETE FROM vocab_sessions WHERE id = $1
        """, uuid.UUID(session_id))
    # asyncpg returns a command tag such as "DELETE 1".
    return status_tag == "DELETE 1"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# VOCABULARY OPERATIONS
|
||||
# =============================================================================
|
||||
|
||||
async def add_vocabulary_db(
    session_id: str,
    vocab_list: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Insert vocabulary entries for a session and refresh its cached count.

    Each entry dict may carry: english, german, example_sentence,
    example_sentence_gap, word_type, source_page (missing text fields
    default to '' / NULL as before).

    Returns the inserted rows as dicts; an empty input yields [].

    The whole batch runs inside one transaction so a failure mid-insert
    cannot leave partial rows with a stale vocabulary_count.
    """
    if not vocab_list:
        return []

    pool = await get_pool()
    results = []
    session_uuid = uuid.UUID(session_id)

    async with pool.acquire() as conn:
        # BUGFIX: previously each INSERT was committed individually, so an
        # error partway through left some entries persisted and the count
        # never updated. The transaction makes the batch all-or-nothing.
        async with conn.transaction():
            for vocab in vocab_list:
                row = await conn.fetchrow("""
                    INSERT INTO vocab_entries (
                        id, session_id, english, german, example_sentence,
                        example_sentence_gap, word_type, source_page
                    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                    RETURNING *
                """,
                    uuid.uuid4(),
                    session_uuid,
                    vocab.get('english', ''),
                    vocab.get('german', ''),
                    vocab.get('example_sentence'),
                    vocab.get('example_sentence_gap'),
                    vocab.get('word_type'),
                    vocab.get('source_page')
                )
                results.append(_row_to_dict(row))

            # Keep the denormalized counter in sync with vocab_entries.
            await conn.execute("""
                UPDATE vocab_sessions
                SET vocabulary_count = (
                    SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
                )
                WHERE id = $1
            """, session_uuid)

    return results
|
||||
|
||||
|
||||
async def get_vocabulary_db(
    session_id: str,
    source_page: Optional[int] = None
) -> List[Dict[str, Any]]:
    """Return a session's vocab entries, optionally restricted to one page."""
    sid = uuid.UUID(session_id)
    pool = await get_pool()
    async with pool.acquire() as conn:
        if source_page is None:
            # Whole session: page order first, unpaged entries last.
            records = await conn.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1
                ORDER BY source_page NULLS LAST, created_at
            """, sid)
        else:
            records = await conn.fetch("""
                SELECT * FROM vocab_entries
                WHERE session_id = $1 AND source_page = $2
                ORDER BY created_at
            """, sid, source_page)
    return [_row_to_dict(rec) for rec in records]
|
||||
|
||||
|
||||
async def update_vocabulary_db(
    entry_id: str,
    **kwargs
) -> Optional[Dict[str, Any]]:
    """Apply allow-listed field updates to one vocab entry.

    Returns the updated row as a dict, or None when no updatable field was
    supplied or the entry does not exist.
    """
    allowed_fields = [
        'english', 'german', 'example_sentence', 'example_sentence_gap',
        'word_type', 'source_page'
    ]
    # Keep only recognized columns, preserving caller ordering.
    updates = [(k, v) for k, v in kwargs.items() if k in allowed_fields]
    if not updates:
        return None

    # Column names come from the allow-list, so the SET clause is SQL-safe.
    set_parts = [f"{key} = ${i}" for i, (key, _) in enumerate(updates, start=1)]
    args = [v for _, v in updates]
    id_param = len(updates) + 1
    args.append(uuid.UUID(entry_id))

    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(f"""
            UPDATE vocab_entries
            SET {', '.join(set_parts)}
            WHERE id = ${id_param}
            RETURNING *
        """, *args)

    return _row_to_dict(record) if record else None
|
||||
|
||||
|
||||
async def clear_page_vocabulary_db(session_id: str, page: int) -> int:
    """Delete every vocab entry on one page and resync the session count.

    Returns the number of rows deleted.
    """
    sid = uuid.UUID(session_id)
    pool = await get_pool()
    async with pool.acquire() as conn:
        status_tag = await conn.execute("""
            DELETE FROM vocab_entries
            WHERE session_id = $1 AND source_page = $2
        """, sid, page)

        # Keep the denormalized counter in step with vocab_entries.
        await conn.execute("""
            UPDATE vocab_sessions
            SET vocabulary_count = (
                SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1
            )
            WHERE id = $1
        """, sid)

    # Command tag looks like "DELETE <n>" — report how many rows went away.
    return int(status_tag.split()[-1]) if status_tag else 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# WORKSHEET OPERATIONS
|
||||
# =============================================================================
|
||||
|
||||
async def create_worksheet_db(
    session_id: str,
    worksheet_types: List[str],
    pdf_path: Optional[str] = None,
    solution_path: Optional[str] = None
) -> Dict[str, Any]:
    """Persist a generated worksheet record and return it as a dict."""
    new_id = uuid.uuid4()
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow("""
            INSERT INTO vocab_worksheets (
                id, session_id, worksheet_types, pdf_path, solution_path
            ) VALUES ($1, $2, $3, $4, $5)
            RETURNING *
        """,
            new_id,
            uuid.UUID(session_id),
            json.dumps(worksheet_types),  # JSONB column
            pdf_path,
            solution_path,
        )
    return _row_to_dict(record)
|
||||
|
||||
|
||||
async def get_worksheet_db(worksheet_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one worksheet by its UUID string; None when not found."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow("""
            SELECT * FROM vocab_worksheets WHERE id = $1
        """, uuid.UUID(worksheet_id))
    return _row_to_dict(record) if record else None
|
||||
|
||||
|
||||
async def delete_worksheets_for_session_db(session_id: str) -> int:
    """Remove every worksheet belonging to a session; returns rows deleted."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        status_tag = await conn.execute("""
            DELETE FROM vocab_worksheets WHERE session_id = $1
        """, uuid.UUID(session_id))
    # Command tag looks like "DELETE <n>".
    return int(status_tag.split()[-1]) if status_tag else 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# PDF CACHE OPERATIONS
|
||||
# =============================================================================
|
||||
|
||||
# Simple in-memory cache for PDF data (temporary until served)
# NOTE(review): process-local only — entries are lost on restart and not
# shared between workers; confirm single-worker deployment or move to disk.
_pdf_cache: Dict[str, bytes] = {}


def cache_pdf_data(worksheet_id: str, pdf_data: bytes) -> None:
    """Stash generated PDF bytes under the worksheet id until downloaded."""
    _pdf_cache.update({worksheet_id: pdf_data})


def get_cached_pdf_data(worksheet_id: str) -> Optional[bytes]:
    """Return the cached PDF bytes, or None when nothing is cached."""
    return _pdf_cache[worksheet_id] if worksheet_id in _pdf_cache else None


def clear_cached_pdf_data(worksheet_id: str) -> None:
    """Evict the cached PDF for this id; a missing id is a no-op."""
    if worksheet_id in _pdf_cache:
        del _pdf_cache[worksheet_id]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# HELPER FUNCTIONS
|
||||
# =============================================================================
|
||||
|
||||
def _row_to_dict(row: "Optional[asyncpg.Record]") -> Dict[str, Any]:
    """Convert an asyncpg Record (or None) into a JSON-friendly dict.

    - UUID columns ('id', 'session_id') become strings.
    - datetime columns become ISO-8601 strings via isoformat().
    - JSONB columns that arrive as raw JSON text are parsed back into
      Python lists/dicts (already-decoded values pass through untouched).

    A None row yields {}.
    """
    # Annotation fixed: the original declared a non-optional Record even
    # though the body explicitly handles None. The annotation is quoted so
    # it stays a lazy forward reference.
    if row is None:
        return {}

    result = dict(row)

    # Convert UUIDs to strings
    for key in ['id', 'session_id']:
        if key in result and result[key] is not None:
            result[key] = str(result[key])

    # Convert datetimes to ISO strings
    for key in ['created_at', 'updated_at', 'generated_at']:
        if key in result and result[key] is not None:
            result[key] = result[key].isoformat()

    # Parse JSONB fields back to dicts/lists
    for key in ['ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages', 'worksheet_types']:
        if key in result and result[key] is not None:
            if isinstance(result[key], str):
                result[key] = json.loads(result[key])

    return result
|
||||
Reference in New Issue
Block a user