# breakpilot-lehrer/klausur-service/backend/vocab_worksheet_api.py

"""
Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
vocabulary editing, worksheet generation, and PDF downloads.

Sub-routers (included at bottom):
- vocab_worksheet_upload_api: PDF upload, thumbnails, page processing
- vocab_worksheet_analysis_api: OCR compare, grid analysis, ground truth
"""

from fastapi import APIRouter, HTTPException, UploadFile, File, Query
from fastapi.responses import StreamingResponse
from typing import List, Dict, Any
from datetime import datetime
import uuid
import os
import io
import logging

logger = logging.getLogger(__name__)

# --- Imports from extracted sub-modules ---
from vocab_worksheet_models import (
    WorksheetType,
    SessionStatus,
    VocabularyEntry,
    SessionCreate,
    SessionResponse,
    VocabularyResponse,
    VocabularyUpdate,
    WorksheetGenerateRequest,
    WorksheetResponse,
)
from vocab_worksheet_extraction import extract_vocabulary_from_image
from vocab_worksheet_generation import (
    generate_worksheet_html, generate_worksheet_pdf,
    convert_pdf_page_to_image,
)

# --- Database integration (used by main.py lifespan) ---
# The session store is optional. When it cannot be imported, its helpers are
# bound to None; _init_vocab_table and _load_all_sessions test for that
# before calling init_vocab_tables / list_sessions_db.
try:
    from vocab_session_store import (
        DATABASE_URL, get_pool, init_vocab_tables,
        list_sessions_db, get_session_db,
    )
except ImportError:
    # Fall back to the DATABASE_URL environment variable, or to the
    # hard-coded default connection string when it is unset.
    DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
    get_pool = None
    init_vocab_tables = None
    list_sessions_db = None
    get_session_db = None

# Connection pool injected through set_db_pool(); None until that is called.
_db_pool = None


def set_db_pool(pool):
    """Remember *pool* as this module's database connection pool.

    The value is stored in the module-level ``_db_pool`` variable and
    replaces whatever was stored before. Meant to be invoked from the
    main.py lifespan handler.
    """
    global _db_pool
    _db_pool = pool


async def _init_vocab_table():
    """Run the vocab table initialisation when the session store is available.

    A failure of the initialiser is logged as a warning and not re-raised.
    When no initialiser was imported, only the "ready" message is logged.
    """
    if not init_vocab_tables:
        # No session store imported: nothing to initialise.
        logger.info("vocab_session_cache table ready")
        return

    try:
        await init_vocab_tables()
        logger.info("vocab_session_cache table ready")
    except Exception as e:
        logger.warning(f"Failed to init vocab tables: {e}")


async def _load_all_sessions():
    """Populate the in-memory session cache from the database.

    Requests up to 500 rows from ``list_sessions_db``. Rows whose id is
    already cached are left untouched. Any error raised while loading is
    logged as a warning instead of being propagated. When no listing function
    was imported, only a "Loaded 0" message is logged.
    """
    if not list_sessions_db:
        logger.info("Loaded 0 vocab sessions from database")
        return

    # Cached field name -> value used when the database row lacks that field.
    field_defaults = {
        "name": "",
        "description": "",
        "status": "created",
        "vocabulary_count": 0,
        "source_language": "en",
        "target_language": "de",
    }

    try:
        rows = await list_sessions_db(limit=500)
        loaded = 0
        for row in rows:
            # Rows may expose their key as either "id" or "session_id".
            sid = row.get("id") or row.get("session_id")
            if not sid or sid in _sessions:
                continue

            cached = {"id": sid}
            for field, default in field_defaults.items():
                cached[field] = row.get(field, default)
            # The timestamp is kept in its string form in the cache.
            cached["created_at"] = str(row.get("created_at", ""))

            _sessions[sid] = cached
            loaded += 1
        logger.info(f"Loaded {loaded} vocab sessions from database")
    except Exception as e:
        logger.warning(f"Failed to load sessions from database: {e}")


# --- Router & module-level state ---
# Every route declared on this router lives under the /api/v1/vocab prefix.
router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
# Root directory for per-session files (uploaded source images, generated
# PDFs); can be overridden with the VOCAB_STORAGE_PATH environment variable.
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
# In-memory session records, keyed by session id.
_sessions: Dict[str, Dict[str, Any]] = {}
# In-memory metadata about generated worksheets, keyed by worksheet id.
_worksheets: Dict[str, Dict[str, Any]] = {}


@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Register a new, empty vocabulary session.

    The session gets a fresh UUID, starts in the PENDING status with no
    vocabulary, is stored in the in-memory session cache, and receives its
    own storage directory below ``LOCAL_STORAGE_PATH``.

    Returns:
        SessionResponse describing the newly created session.
    """
    new_id = str(uuid.uuid4())
    initial_status = SessionStatus.PENDING.value

    record = {
        "id": new_id,
        "name": session.name,
        "description": session.description,
        "source_language": session.source_language,
        "target_language": session.target_language,
        "status": initial_status,
        "vocabulary": [],
        "vocabulary_count": 0,
        "image_path": None,
        "extraction_confidence": None,
        "created_at": datetime.utcnow(),
    }
    _sessions[new_id] = record

    # Each session keeps its files in a directory named after its id.
    os.makedirs(os.path.join(LOCAL_STORAGE_PATH, new_id), exist_ok=True)

    # Only these fields of the stored record are part of the response.
    response_fields = (
        "id",
        "name",
        "description",
        "source_language",
        "target_language",
        "status",
        "vocabulary_count",
        "image_path",
        "created_at",
    )
    return SessionResponse(**{field: record[field] for field in response_fields})


@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List vocabulary sessions, newest first.

    Args:
        limit: Maximum number of sessions to return (1-100, default 50).

    Returns:
        One SessionResponse per cached session, ordered by creation time
        in descending order and truncated to ``limit`` entries.
    """
    # Sessions created through create_session store ``created_at`` as a
    # datetime, while sessions restored by _load_all_sessions store it as a
    # string. Comparing the two types raises TypeError, so sort on the string
    # form instead. For datetimes this yields the same order as comparing the
    # datetimes themselves ("YYYY-MM-DD HH:MM:SS[.ffffff]" sorts
    # chronologically).
    # NOTE(review): assumes restored timestamps use that same layout — confirm
    # against the session store.
    newest_first = sorted(
        _sessions.values(),
        key=lambda entry: str(entry["created_at"]),
        reverse=True,
    )[:limit]

    return [
        SessionResponse(
            id=s["id"],
            name=s["name"],
            description=s.get("description"),
            source_language=s["source_language"],
            target_language=s["target_language"],
            status=s["status"],
            vocabulary_count=s.get("vocabulary_count", 0),
            image_path=s.get("image_path"),
            created_at=s["created_at"],
        )
        for s in newest_first
    ]


@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Return the stored metadata of one session.

    Raises:
        HTTPException: 404 when no session with this id is cached.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Optional fields fall back to None / 0 when the record lacks them.
    return SessionResponse(
        id=record["id"],
        name=record["name"],
        description=record.get("description"),
        source_language=record["source_language"],
        target_language=record["target_language"],
        status=record["status"],
        vocabulary_count=record.get("vocabulary_count", 0),
        image_path=record.get("image_path"),
        created_at=record["created_at"],
    )


@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Store an uploaded page (image or PDF) for a session and extract vocabulary.

    Accepted formats: PNG, JPG, JPEG, PDF. A file is accepted when either its
    extension or its content type matches. For PDFs, page 0 is converted to an
    image and saved as PNG. The result is written to ``source.<ext>`` in the
    session directory, and the extracted vocabulary is stored on the session.

    Returns:
        dict with session id, filename, image path, vocabulary count,
        extraction confidence and status, plus an ``error`` entry when the
        extraction reported one.

    Raises:
        HTTPException: 404 for an unknown session, 400 for an unsupported
            file type.
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    session = _sessions.get(session_id)
    if session is None:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")

    # Classify the upload by file extension and by declared content type.
    extension = file.filename.rsplit('.', 1)[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    image_extensions = ('png', 'jpg', 'jpeg')
    image_content_types = ('image/png', 'image/jpeg', 'image/jpg')
    is_pdf = 'pdf' == extension or 'application/pdf' == content_type
    is_image = extension in image_extensions or content_type in image_content_types

    if not (is_pdf or is_image):
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Choose the extension of the stored file.
    if is_pdf:
        # PDFs are stored as the PNG produced by the conversion below.
        save_extension = 'png'
    elif extension in image_extensions:
        save_extension = extension
    else:
        # Accepted through the content type only.
        save_extension = 'png' if content_type == 'image/png' else 'jpg'

    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")

    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Persist the (possibly converted) image in the session directory.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")
    with open(image_path, 'wb') as target:
        target.write(content)

    session.update(
        status=SessionStatus.PROCESSING.value,
        image_path=image_path,
    )

    # Run the extraction on the image bytes; it yields entries, a confidence
    # value and an optional error description.
    vocabulary, confidence, error = await extract_vocabulary_from_image(
        content,
        file.filename or "image.png",
        page_number=0,
    )

    extracted_count = len(vocabulary)
    final_status = SessionStatus.EXTRACTED.value
    session.update(
        vocabulary=[entry.dict() for entry in vocabulary],
        vocabulary_count=extracted_count,
        extraction_confidence=confidence,
        status=final_status,
    )

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": extracted_count,
        "extraction_confidence": confidence,
        "status": final_status,
    }
    if error:
        result["error"] = error
    return result


@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return the vocabulary currently stored for a session.

    Raises:
        HTTPException: 404 when no session with this id is cached.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Stored entries are plain dicts; rebuild the model objects for the response.
    stored_entries = record.get("vocabulary", [])
    return VocabularyResponse(
        session_id=session_id,
        vocabulary=[VocabularyEntry(**entry) for entry in stored_entries],
        extraction_confidence=record.get("extraction_confidence"),
    )


@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace a session's vocabulary with the submitted entries.

    Used for manual corrections: the stored list is overwritten as a whole
    and the vocabulary count is updated to match.

    Raises:
        HTTPException: 404 when no session with this id is cached.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    entry_count = len(update.vocabulary)
    record["vocabulary"] = [entry.dict() for entry in update.vocabulary]
    record["vocabulary_count"] = entry_count

    return {
        "session_id": session_id,
        "vocabulary_count": entry_count,
        "message": "Vocabulary updated successfully",
    }


@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from a session's vocabulary.

    One HTML section is rendered per requested worksheet type, each followed
    by a page break, and the combined HTML is converted into a single PDF.
    When ``request.include_solutions`` is set, a second PDF with solutions is
    produced the same way. Both files are written to the session's storage
    directory and the worksheet is registered in the in-memory worksheet
    cache; the session status becomes COMPLETED.

    Returns:
        WorksheetResponse with the worksheet id and the stored file paths.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if it has no
            vocabulary, 500 if rendering either PDF fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session["name"]
    page_break = '<div style="page-break-after: always;"></div>'

    def render_html(show_solutions: bool) -> str:
        """Concatenate the HTML of every requested worksheet type."""
        title_suffix = " (Loesung)" if show_solutions else ""
        return "".join(
            generate_worksheet_html(
                vocabulary=vocabulary,
                worksheet_type=wtype,
                title=f"{title} - {wtype.value}{title_suffix}",
                show_solutions=show_solutions,
                repetitions=request.repetitions,
                line_height=request.line_height,
            ) + page_break
            for wtype in request.worksheet_types
        )

    combined_html = render_html(show_solutions=False)
    solution_html = render_html(show_solutions=True) if request.include_solutions else None

    # Render both PDFs before touching the disk, so a failure in the solution
    # PDF is reported like a failure in the main PDF and does not leave an
    # unregistered worksheet file behind.
    try:
        pdf_bytes = await generate_worksheet_pdf(combined_html)
        solution_bytes = None
        if solution_html is not None:
            solution_bytes = await generate_worksheet_pdf(solution_html)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}") from e

    # The session directory may be missing (e.g. for sessions that were not
    # created through create_session), so ensure it exists as upload_image does.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)

    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    solution_path = None
    if solution_bytes is not None:
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Register the worksheet so the download endpoints can find its files.
    worksheet_data = {
        "id": worksheet_id,
        "session_id": session_id,
        "worksheet_types": [wt.value for wt in request.worksheet_types],
        "pdf_path": pdf_path,
        "solution_path": solution_path,
        "generated_at": datetime.utcnow(),
    }
    _worksheets[worksheet_id] = worksheet_data

    session["status"] = SessionStatus.COMPLETED.value

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet_data["worksheet_types"],
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet_data["generated_at"],
    )


@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Serve a generated worksheet PDF as a file download.

    Raises:
        HTTPException: 404 when the worksheet id is unknown or its PDF file
            does not exist on disk.
    """
    record = _worksheets.get(worksheet_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    stored_path = record["pdf_path"]
    if not os.path.exists(stored_path):
        raise HTTPException(status_code=404, detail="PDF file not found")

    # The whole file is read into memory and streamed from that buffer.
    with open(stored_path, 'rb') as handle:
        payload = io.BytesIO(handle.read())

    disposition = f"attachment; filename=worksheet_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )


@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Serve the solution PDF of a generated worksheet as a file download.

    Raises:
        HTTPException: 404 when the worksheet id is unknown, when no solution
            path was recorded for it, or when the file does not exist on disk.
    """
    record = _worksheets.get(worksheet_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    stored_path = record.get("solution_path")
    solution_available = bool(stored_path) and os.path.exists(stored_path)
    if not solution_available:
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    # The whole file is read into memory and streamed from that buffer.
    with open(stored_path, 'rb') as handle:
        payload = io.BytesIO(handle.read())

    disposition = f"attachment; filename=solution_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )


@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Return the source image that was stored for a session.

    The response media type is derived from the stored file's extension and
    falls back to ``application/octet-stream`` for unknown extensions.

    Raises:
        HTTPException: 404 when the session is unknown, has no image path,
            or the image file does not exist on disk.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    stored_path = record.get("image_path")
    image_available = bool(stored_path) and os.path.exists(stored_path)
    if not image_available:
        raise HTTPException(status_code=404, detail="Image not found")

    media_type_by_extension = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
    }
    suffix = stored_path.rsplit('.', 1)[-1].lower()
    media_type = media_type_by_extension.get(suffix, 'application/octet-stream')

    # The whole file is read into memory and streamed from that buffer.
    with open(stored_path, 'rb') as handle:
        payload = io.BytesIO(handle.read())

    return StreamingResponse(
        payload,
        media_type=media_type,
    )


@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Remove a session, its storage directory and its worksheet records.

    Raises:
        HTTPException: 404 when no session with this id is cached.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Files first: drop the whole per-session directory if it exists.
    storage_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(storage_dir):
        import shutil
        shutil.rmtree(storage_dir)

    # Then the in-memory session record.
    _sessions.pop(session_id)

    # Finally every worksheet record that belongs to this session. Iterate
    # over a snapshot because entries are removed during the loop.
    for worksheet_id, meta in tuple(_worksheets.items()):
        if meta["session_id"] != session_id:
            continue
        _worksheets.pop(worksheet_id)

    return {"message": "Session deleted successfully", "session_id": session_id}


# --- Include sub-routers ---
# NOTE(review): these imports sit at the bottom of the module, presumably
# because the sub-modules import names from this file and a top-of-file
# import would be circular — confirm before moving them.
from vocab_worksheet_upload_api import upload_router
from vocab_worksheet_analysis_api import analysis_router

# The sub-routers are nested into this router, so their routes are served
# together with the ones defined above.
router.include_router(upload_router)
router.include_router(analysis_router)