""" Vocabulary Worksheet API — core CRUD routes for sessions, uploads, vocabulary editing, worksheet generation, and PDF downloads. Sub-routers (included at bottom): - vocab_worksheet_upload_api: PDF upload, thumbnails, page processing - vocab_worksheet_analysis_api: OCR compare, grid analysis, ground truth """ from fastapi import APIRouter, HTTPException, UploadFile, File, Query from fastapi.responses import StreamingResponse from typing import List, Dict, Any from datetime import datetime import uuid import os import io import logging logger = logging.getLogger(__name__) # --- Imports from extracted sub-modules --- from vocab_worksheet_models import ( WorksheetType, SessionStatus, VocabularyEntry, SessionCreate, SessionResponse, VocabularyResponse, VocabularyUpdate, WorksheetGenerateRequest, WorksheetResponse, ) from vocab_worksheet_extraction import extract_vocabulary_from_image from vocab_worksheet_generation import ( generate_worksheet_html, generate_worksheet_pdf, convert_pdf_page_to_image, ) # --- Database integration (used by main.py lifespan) --- try: from vocab_session_store import ( DATABASE_URL, get_pool, init_vocab_tables, list_sessions_db, get_session_db, ) except ImportError: DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db") get_pool = None init_vocab_tables = None list_sessions_db = None get_session_db = None _db_pool = None def set_db_pool(pool): """Set the database connection pool (called from main.py lifespan).""" global _db_pool _db_pool = pool async def _init_vocab_table(): """Initialize vocab tables in database.""" if init_vocab_tables: try: await init_vocab_tables() logger.info("vocab_session_cache table ready") except Exception as e: logger.warning(f"Failed to init vocab tables: {e}") else: logger.info("vocab_session_cache table ready") async def _load_all_sessions(): """Load all vocab sessions from database into memory cache.""" if not list_sessions_db: logger.info("Loaded 0 vocab sessions from database") return try: sessions = await list_sessions_db(limit=500) count = 0 for s in sessions: sid = s.get("id") or s.get("session_id") if sid and sid not in _sessions: _sessions[sid] = { "id": sid, "name": s.get("name", ""), "description": s.get("description", ""), "status": s.get("status", "created"), "vocabulary_count": s.get("vocabulary_count", 0), "source_language": s.get("source_language", "en"), "target_language": s.get("target_language", "de"), "created_at": str(s.get("created_at", "")), } count += 1 logger.info(f"Loaded {count} vocab sessions from database") except Exception as e: logger.warning(f"Failed to load sessions from database: {e}") # --- Router & module-level state --- router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"]) LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets") _sessions: Dict[str, Dict[str, Any]] = {} _worksheets: Dict[str, Dict[str, Any]] = {} @router.post("/sessions", response_model=SessionResponse) async def create_session(session: SessionCreate): """Create a new vocabulary extraction session.""" session_id = str(uuid.uuid4()) session_data = { "id": session_id, "name": session.name, "description": session.description, "source_language": session.source_language, "target_language": session.target_language, "status": SessionStatus.PENDING.value, "vocabulary": [], "vocabulary_count": 0, "image_path": None, "extraction_confidence": None, "created_at": datetime.utcnow(), } _sessions[session_id] = session_data # Create storage directory session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id) os.makedirs(session_dir, exist_ok=True) return SessionResponse( id=session_id, name=session.name, description=session.description, source_language=session.source_language, target_language=session.target_language, status=SessionStatus.PENDING.value, vocabulary_count=0, image_path=None, created_at=session_data["created_at"], ) @router.get("/sessions", response_model=List[SessionResponse]) async def list_sessions(limit: int = Query(50, ge=1, le=100)): """List all vocabulary sessions.""" sessions = sorted( _sessions.values(), key=lambda x: x["created_at"], reverse=True )[:limit] return [ SessionResponse( id=s["id"], name=s["name"], description=s.get("description"), source_language=s["source_language"], target_language=s["target_language"], status=s["status"], vocabulary_count=s.get("vocabulary_count", 0), image_path=s.get("image_path"), created_at=s["created_at"], ) for s in sessions ] @router.get("/sessions/{session_id}", response_model=SessionResponse) async def get_session(session_id: str): """Get a specific session.""" if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") s = _sessions[session_id] return SessionResponse( id=s["id"], name=s["name"], description=s.get("description"), source_language=s["source_language"], target_language=s["target_language"], status=s["status"], vocabulary_count=s.get("vocabulary_count", 0), image_path=s.get("image_path"), created_at=s["created_at"], ) @router.post("/sessions/{session_id}/upload") async def upload_image( session_id: str, file: UploadFile = File(...), ): """ Upload a textbook page image or PDF and extract vocabulary. Supported formats: PNG, JPG, JPEG, PDF """ logger.info(f"Upload request for session {session_id}") logger.info(f"File: filename={file.filename}, content_type={file.content_type}") if session_id not in _sessions: logger.error(f"Session {session_id} not found") raise HTTPException(status_code=404, detail="Session not found") session = _sessions[session_id] # Validate file type - check both extension and content type extension = file.filename.split('.')[-1].lower() if file.filename else '' content_type = file.content_type or '' # Accept images and PDFs valid_image_extensions = ['png', 'jpg', 'jpeg'] valid_image_content_types = ['image/png', 'image/jpeg', 'image/jpg'] is_pdf = extension == 'pdf' or content_type == 'application/pdf' is_image = extension in valid_image_extensions or content_type in valid_image_content_types if not is_pdf and not is_image: logger.error(f"Invalid file type: extension={extension}, content_type={content_type}") raise HTTPException( status_code=400, detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}" ) # Determine final extension for saving if is_pdf: save_extension = 'png' # PDFs will be converted to PNG elif extension in valid_image_extensions: save_extension = extension elif content_type == 'image/png': save_extension = 'png' else: save_extension = 'jpg' # Read file content content = await file.read() logger.info(f"Read {len(content)} bytes from uploaded file") # Convert PDF to image if needed if is_pdf: logger.info("Converting PDF to image...") content = await convert_pdf_page_to_image(content, page_number=0) logger.info(f"PDF converted, image size: {len(content)} bytes") # Save image session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id) os.makedirs(session_dir, exist_ok=True) image_path = os.path.join(session_dir, f"source.{save_extension}") with open(image_path, 'wb') as f: f.write(content) # Update session status session["status"] = SessionStatus.PROCESSING.value session["image_path"] = image_path # Extract vocabulary using Vision LLM vocabulary, confidence, error = await extract_vocabulary_from_image(content, file.filename or "image.png", page_number=0) # Update session with extracted vocabulary session["vocabulary"] = [v.dict() for v in vocabulary] session["vocabulary_count"] = len(vocabulary) session["extraction_confidence"] = confidence session["status"] = SessionStatus.EXTRACTED.value result = { "session_id": session_id, "filename": file.filename, "image_path": image_path, "vocabulary_count": len(vocabulary), "extraction_confidence": confidence, "status": SessionStatus.EXTRACTED.value, } if error: result["error"] = error return result @router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse) async def get_vocabulary(session_id: str): """Get extracted vocabulary for a session.""" if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") session = _sessions[session_id] vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])] return VocabularyResponse( session_id=session_id, vocabulary=vocabulary, extraction_confidence=session.get("extraction_confidence"), ) @router.put("/sessions/{session_id}/vocabulary") async def update_vocabulary(session_id: str, update: VocabularyUpdate): """Update vocabulary entries (for manual corrections).""" if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") session = _sessions[session_id] session["vocabulary"] = [v.dict() for v in update.vocabulary] session["vocabulary_count"] = len(update.vocabulary) return { "session_id": session_id, "vocabulary_count": len(update.vocabulary), "message": "Vocabulary updated successfully", } @router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse) async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest): """Generate worksheet PDF(s) from extracted vocabulary.""" if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") session = _sessions[session_id] vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])] if not vocabulary: raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from") worksheet_id = str(uuid.uuid4()) title = request.title or session["name"] # Generate HTML for each worksheet type combined_html = "" for wtype in request.worksheet_types: html = generate_worksheet_html( vocabulary=vocabulary, worksheet_type=wtype, title=f"{title} - {wtype.value}", show_solutions=False, repetitions=request.repetitions, line_height=request.line_height, ) combined_html += html + '
' # Generate PDF try: pdf_bytes = await generate_worksheet_pdf(combined_html) except Exception as e: raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}") # Save PDF session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id) pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf") with open(pdf_path, 'wb') as f: f.write(pdf_bytes) # Generate solution PDF if requested solution_path = None if request.include_solutions: solution_html = "" for wtype in request.worksheet_types: html = generate_worksheet_html( vocabulary=vocabulary, worksheet_type=wtype, title=f"{title} - {wtype.value} (Loesung)", show_solutions=True, repetitions=request.repetitions, line_height=request.line_height, ) solution_html += html + '
' solution_bytes = await generate_worksheet_pdf(solution_html) solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf") with open(solution_path, 'wb') as f: f.write(solution_bytes) # Store worksheet info worksheet_data = { "id": worksheet_id, "session_id": session_id, "worksheet_types": [wt.value for wt in request.worksheet_types], "pdf_path": pdf_path, "solution_path": solution_path, "generated_at": datetime.utcnow(), } _worksheets[worksheet_id] = worksheet_data # Update session status session["status"] = SessionStatus.COMPLETED.value return WorksheetResponse( id=worksheet_id, session_id=session_id, worksheet_types=worksheet_data["worksheet_types"], pdf_path=pdf_path, solution_path=solution_path, generated_at=worksheet_data["generated_at"], ) @router.get("/worksheets/{worksheet_id}/pdf") async def download_worksheet_pdf(worksheet_id: str): """Download the generated worksheet PDF.""" if worksheet_id not in _worksheets: raise HTTPException(status_code=404, detail="Worksheet not found") worksheet = _worksheets[worksheet_id] pdf_path = worksheet["pdf_path"] if not os.path.exists(pdf_path): raise HTTPException(status_code=404, detail="PDF file not found") with open(pdf_path, 'rb') as f: pdf_bytes = f.read() return StreamingResponse( io.BytesIO(pdf_bytes), media_type="application/pdf", headers={"Content-Disposition": f"attachment; filename=worksheet_{worksheet_id}.pdf"} ) @router.get("/worksheets/{worksheet_id}/solution") async def download_solution_pdf(worksheet_id: str): """Download the solution PDF.""" if worksheet_id not in _worksheets: raise HTTPException(status_code=404, detail="Worksheet not found") worksheet = _worksheets[worksheet_id] solution_path = worksheet.get("solution_path") if not solution_path or not os.path.exists(solution_path): raise HTTPException(status_code=404, detail="Solution PDF not found") with open(solution_path, 'rb') as f: pdf_bytes = f.read() return StreamingResponse( io.BytesIO(pdf_bytes), media_type="application/pdf", headers={"Content-Disposition": f"attachment; filename=solution_{worksheet_id}.pdf"} ) @router.get("/sessions/{session_id}/image") async def get_session_image(session_id: str): """Get the uploaded source image for a session.""" if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") session = _sessions[session_id] image_path = session.get("image_path") if not image_path or not os.path.exists(image_path): raise HTTPException(status_code=404, detail="Image not found") # Determine content type extension = image_path.split('.')[-1].lower() content_type = { 'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg', }.get(extension, 'application/octet-stream') with open(image_path, 'rb') as f: image_bytes = f.read() return StreamingResponse( io.BytesIO(image_bytes), media_type=content_type, ) @router.delete("/sessions/{session_id}") async def delete_session(session_id: str): """Delete a vocabulary session and all associated files.""" if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") # Delete session directory session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id) if os.path.exists(session_dir): import shutil shutil.rmtree(session_dir) # Remove from storage del _sessions[session_id] # Remove associated worksheets for wid, ws in list(_worksheets.items()): if ws["session_id"] == session_id: del _worksheets[wid] return {"message": "Session deleted successfully", "session_id": session_id} # --- Include sub-routers --- from vocab_worksheet_upload_api import upload_router from vocab_worksheet_analysis_api import analysis_router router.include_router(upload_router) router.include_router(analysis_router)