Restructure: Move grid_* + vocab_* into packages (klausur-service)

grid/ package (16 files):
  grid/build/ — core, zones, cleanup, text_ops, cell_ops, finalize
  grid/editor/ — api, helpers, columns, filters, headers, zones

vocab/ package (10 files):
  vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare
  vocab/ — session_store, learn_bridge

26 backward-compat shims. Internal imports relative. RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 21:30:20 +02:00
parent 098a2ff092
commit 59c400b9aa
58 changed files with 8803 additions and 8659 deletions
--- a/klausur-service/backend/vocab_worksheet_api.py
+++ b/klausur-service/backend/vocab_worksheet_api.py
@@ -1,499 +1,4 @@
-"""
-Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
-vocabulary editing, worksheet generation, and PDF downloads.
-
-Sub-routers (included at bottom):
- vocab_worksheet_upload_api: PDF upload, thumbnails, page processing
- vocab_worksheet_analysis_api: OCR compare, grid analysis, ground truth
-"""
-
-from fastapi import APIRouter, HTTPException, UploadFile, File, Query
-from fastapi.responses import StreamingResponse
-from typing import List, Dict, Any
-from datetime import datetime
-import uuid
-import os
-import io
-import logging
-
-logger = logging.getLogger(__name__)
-
-# --- Imports from extracted sub-modules ---
-from vocab_worksheet_models import (
-    WorksheetType,
-    SessionStatus,
-    VocabularyEntry,
-    SessionCreate,
-    SessionResponse,
-    VocabularyResponse,
-    VocabularyUpdate,
-    WorksheetGenerateRequest,
-    WorksheetResponse,
-)
-from vocab_worksheet_extraction import extract_vocabulary_from_image
-from vocab_worksheet_generation import (
-    generate_worksheet_html, generate_worksheet_pdf,
-    convert_pdf_page_to_image,
-)
-
-# --- Database integration (used by main.py lifespan) ---
-try:
-    from vocab_session_store import (
-        DATABASE_URL, get_pool, init_vocab_tables,
-        list_sessions_db, get_session_db,
-    )
-except ImportError:
-    DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
-    get_pool = None
-    init_vocab_tables = None
-    list_sessions_db = None
-    get_session_db = None
-
-_db_pool = None
-
-
-def set_db_pool(pool):
-    """Set the database connection pool (called from main.py lifespan)."""
-    global _db_pool
-    _db_pool = pool
-
-
-async def _init_vocab_table():
-    """Initialize vocab tables in database."""
-    if init_vocab_tables:
-        try:
-            await init_vocab_tables()
-            logger.info("vocab_session_cache table ready")
-        except Exception as e:
-            logger.warning(f"Failed to init vocab tables: {e}")
-    else:
-        logger.info("vocab_session_cache table ready")
-
-
-async def _load_all_sessions():
-    """Load all vocab sessions from database into memory cache."""
-    if not list_sessions_db:
-        logger.info("Loaded 0 vocab sessions from database")
-        return
-
-    try:
-        sessions = await list_sessions_db(limit=500)
-        count = 0
-        for s in sessions:
-            sid = s.get("id") or s.get("session_id")
-            if sid and sid not in _sessions:
-                _sessions[sid] = {
-                    "id": sid,
-                    "name": s.get("name", ""),
-                    "description": s.get("description", ""),
-                    "status": s.get("status", "created"),
-                    "vocabulary_count": s.get("vocabulary_count", 0),
-                    "source_language": s.get("source_language", "en"),
-                    "target_language": s.get("target_language", "de"),
-                    "created_at": str(s.get("created_at", "")),
-                }
-                count += 1
-        logger.info(f"Loaded {count} vocab sessions from database")
-    except Exception as e:
-        logger.warning(f"Failed to load sessions from database: {e}")
-
-
-# --- Router & module-level state ---
-router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
-LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
-_sessions: Dict[str, Dict[str, Any]] = {}
-_worksheets: Dict[str, Dict[str, Any]] = {}
-
-
-@router.post("/sessions", response_model=SessionResponse)
-async def create_session(session: SessionCreate):
-    """Create a new vocabulary extraction session."""
-    session_id = str(uuid.uuid4())
-
-    session_data = {
-        "id": session_id,
-        "name": session.name,
-        "description": session.description,
-        "source_language": session.source_language,
-        "target_language": session.target_language,
-        "status": SessionStatus.PENDING.value,
-        "vocabulary": [],
-        "vocabulary_count": 0,
-        "image_path": None,
-        "extraction_confidence": None,
-        "created_at": datetime.utcnow(),
-    }
-
-    _sessions[session_id] = session_data
-
-    # Create storage directory
-    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
-    os.makedirs(session_dir, exist_ok=True)
-
-    return SessionResponse(
-        id=session_id,
-        name=session.name,
-        description=session.description,
-        source_language=session.source_language,
-        target_language=session.target_language,
-        status=SessionStatus.PENDING.value,
-        vocabulary_count=0,
-        image_path=None,
-        created_at=session_data["created_at"],
-    )
-
-
-@router.get("/sessions", response_model=List[SessionResponse])
-async def list_sessions(limit: int = Query(50, ge=1, le=100)):
-    """List all vocabulary sessions."""
-    sessions = sorted(
-        _sessions.values(),
-        key=lambda x: x["created_at"],
-        reverse=True
-    )[:limit]
-
-    return [
-        SessionResponse(
-            id=s["id"],
-            name=s["name"],
-            description=s.get("description"),
-            source_language=s["source_language"],
-            target_language=s["target_language"],
-            status=s["status"],
-            vocabulary_count=s.get("vocabulary_count", 0),
-            image_path=s.get("image_path"),
-            created_at=s["created_at"],
-        )
-        for s in sessions
-    ]
-
-
-@router.get("/sessions/{session_id}", response_model=SessionResponse)
-async def get_session(session_id: str):
-    """Get a specific session."""
-    if session_id not in _sessions:
-        raise HTTPException(status_code=404, detail="Session not found")
-
-    s = _sessions[session_id]
-    return SessionResponse(
-        id=s["id"],
-        name=s["name"],
-        description=s.get("description"),
-        source_language=s["source_language"],
-        target_language=s["target_language"],
-        status=s["status"],
-        vocabulary_count=s.get("vocabulary_count", 0),
-        image_path=s.get("image_path"),
-        created_at=s["created_at"],
-    )
-
-
-@router.post("/sessions/{session_id}/upload")
-async def upload_image(
-    session_id: str,
-    file: UploadFile = File(...),
-):
-    """
-    Upload a textbook page image or PDF and extract vocabulary.
-
-    Supported formats: PNG, JPG, JPEG, PDF
-    """
-    logger.info(f"Upload request for session {session_id}")
-    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")
-
-    if session_id not in _sessions:
-        logger.error(f"Session {session_id} not found")
-        raise HTTPException(status_code=404, detail="Session not found")
-
-    session = _sessions[session_id]
-
-    # Validate file type - check both extension and content type
-    extension = file.filename.split('.')[-1].lower() if file.filename else ''
-    content_type = file.content_type or ''
-
-    # Accept images and PDFs
-    valid_image_extensions = ['png', 'jpg', 'jpeg']
-    valid_image_content_types = ['image/png', 'image/jpeg', 'image/jpg']
-    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
-    is_image = extension in valid_image_extensions or content_type in valid_image_content_types
-
-    if not is_pdf and not is_image:
-        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
-        raise HTTPException(
-            status_code=400,
-            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
-        )
-
-    # Determine final extension for saving
-    if is_pdf:
-        save_extension = 'png'  # PDFs will be converted to PNG
-    elif extension in valid_image_extensions:
-        save_extension = extension
-    elif content_type == 'image/png':
-        save_extension = 'png'
-    else:
-        save_extension = 'jpg'
-
-    # Read file content
-    content = await file.read()
-    logger.info(f"Read {len(content)} bytes from uploaded file")
-
-    # Convert PDF to image if needed
-    if is_pdf:
-        logger.info("Converting PDF to image...")
-        content = await convert_pdf_page_to_image(content, page_number=0)
-        logger.info(f"PDF converted, image size: {len(content)} bytes")
-
-    # Save image
-    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
-    os.makedirs(session_dir, exist_ok=True)
-    image_path = os.path.join(session_dir, f"source.{save_extension}")
-
-    with open(image_path, 'wb') as f:
-        f.write(content)
-
-    # Update session status
-    session["status"] = SessionStatus.PROCESSING.value
-    session["image_path"] = image_path
-
-    # Extract vocabulary using Vision LLM
-    vocabulary, confidence, error = await extract_vocabulary_from_image(content, file.filename or "image.png", page_number=0)
-
-    # Update session with extracted vocabulary
-    session["vocabulary"] = [v.dict() for v in vocabulary]
-    session["vocabulary_count"] = len(vocabulary)
-    session["extraction_confidence"] = confidence
-    session["status"] = SessionStatus.EXTRACTED.value
-
-    result = {
-        "session_id": session_id,
-        "filename": file.filename,
-        "image_path": image_path,
-        "vocabulary_count": len(vocabulary),
-        "extraction_confidence": confidence,
-        "status": SessionStatus.EXTRACTED.value,
-    }
-
-    if error:
-        result["error"] = error
-
-    return result
-
-
-@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
-async def get_vocabulary(session_id: str):
-    """Get extracted vocabulary for a session."""
-    if session_id not in _sessions:
-        raise HTTPException(status_code=404, detail="Session not found")
-    session = _sessions[session_id]
-    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
-    return VocabularyResponse(
-        session_id=session_id,
-        vocabulary=vocabulary,
-        extraction_confidence=session.get("extraction_confidence"),
-    )
-
-
-@router.put("/sessions/{session_id}/vocabulary")
-async def update_vocabulary(session_id: str, update: VocabularyUpdate):
-    """Update vocabulary entries (for manual corrections)."""
-    if session_id not in _sessions:
-        raise HTTPException(status_code=404, detail="Session not found")
-
-    session = _sessions[session_id]
-    session["vocabulary"] = [v.dict() for v in update.vocabulary]
-    session["vocabulary_count"] = len(update.vocabulary)
-
-    return {
-        "session_id": session_id,
-        "vocabulary_count": len(update.vocabulary),
-        "message": "Vocabulary updated successfully",
-    }
-
-
-@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
-async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
-    """Generate worksheet PDF(s) from extracted vocabulary."""
-    if session_id not in _sessions:
-        raise HTTPException(status_code=404, detail="Session not found")
-
-    session = _sessions[session_id]
-    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
-
-    if not vocabulary:
-        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")
-
-    worksheet_id = str(uuid.uuid4())
-    title = request.title or session["name"]
-
-    # Generate HTML for each worksheet type
-    combined_html = ""
-    for wtype in request.worksheet_types:
-        html = generate_worksheet_html(
-            vocabulary=vocabulary,
-            worksheet_type=wtype,
-            title=f"{title} - {wtype.value}",
-            show_solutions=False,
-            repetitions=request.repetitions,
-            line_height=request.line_height,
-        )
-        combined_html += html + '<div style="page-break-after: always;"></div>'
-
-    # Generate PDF
-    try:
-        pdf_bytes = await generate_worksheet_pdf(combined_html)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}")
-
-    # Save PDF
-    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
-    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
-    with open(pdf_path, 'wb') as f:
-        f.write(pdf_bytes)
-
-    # Generate solution PDF if requested
-    solution_path = None
-    if request.include_solutions:
-        solution_html = ""
-        for wtype in request.worksheet_types:
-            html = generate_worksheet_html(
-                vocabulary=vocabulary,
-                worksheet_type=wtype,
-                title=f"{title} - {wtype.value} (Loesung)",
-                show_solutions=True,
-                repetitions=request.repetitions,
-                line_height=request.line_height,
-            )
-            solution_html += html + '<div style="page-break-after: always;"></div>'
-
-        solution_bytes = await generate_worksheet_pdf(solution_html)
-        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
-        with open(solution_path, 'wb') as f:
-            f.write(solution_bytes)
-
-    # Store worksheet info
-    worksheet_data = {
-        "id": worksheet_id,
-        "session_id": session_id,
-        "worksheet_types": [wt.value for wt in request.worksheet_types],
-        "pdf_path": pdf_path,
-        "solution_path": solution_path,
-        "generated_at": datetime.utcnow(),
-    }
-    _worksheets[worksheet_id] = worksheet_data
-
-    # Update session status
-    session["status"] = SessionStatus.COMPLETED.value
-
-    return WorksheetResponse(
-        id=worksheet_id,
-        session_id=session_id,
-        worksheet_types=worksheet_data["worksheet_types"],
-        pdf_path=pdf_path,
-        solution_path=solution_path,
-        generated_at=worksheet_data["generated_at"],
-    )
-
-
-@router.get("/worksheets/{worksheet_id}/pdf")
-async def download_worksheet_pdf(worksheet_id: str):
-    """Download the generated worksheet PDF."""
-    if worksheet_id not in _worksheets:
-        raise HTTPException(status_code=404, detail="Worksheet not found")
-
-    worksheet = _worksheets[worksheet_id]
-    pdf_path = worksheet["pdf_path"]
-
-    if not os.path.exists(pdf_path):
-        raise HTTPException(status_code=404, detail="PDF file not found")
-
-    with open(pdf_path, 'rb') as f:
-        pdf_bytes = f.read()
-
-    return StreamingResponse(
-        io.BytesIO(pdf_bytes),
-        media_type="application/pdf",
-        headers={"Content-Disposition": f"attachment; filename=worksheet_{worksheet_id}.pdf"}
-    )
-
-
-@router.get("/worksheets/{worksheet_id}/solution")
-async def download_solution_pdf(worksheet_id: str):
-    """Download the solution PDF."""
-    if worksheet_id not in _worksheets:
-        raise HTTPException(status_code=404, detail="Worksheet not found")
-
-    worksheet = _worksheets[worksheet_id]
-    solution_path = worksheet.get("solution_path")
-
-    if not solution_path or not os.path.exists(solution_path):
-        raise HTTPException(status_code=404, detail="Solution PDF not found")
-
-    with open(solution_path, 'rb') as f:
-        pdf_bytes = f.read()
-
-    return StreamingResponse(
-        io.BytesIO(pdf_bytes),
-        media_type="application/pdf",
-        headers={"Content-Disposition": f"attachment; filename=solution_{worksheet_id}.pdf"}
-    )
-
-
-@router.get("/sessions/{session_id}/image")
-async def get_session_image(session_id: str):
-    """Get the uploaded source image for a session."""
-    if session_id not in _sessions:
-        raise HTTPException(status_code=404, detail="Session not found")
-
-    session = _sessions[session_id]
-    image_path = session.get("image_path")
-
-    if not image_path or not os.path.exists(image_path):
-        raise HTTPException(status_code=404, detail="Image not found")
-
-    # Determine content type
-    extension = image_path.split('.')[-1].lower()
-    content_type = {
-        'png': 'image/png',
-        'jpg': 'image/jpeg',
-        'jpeg': 'image/jpeg',
-    }.get(extension, 'application/octet-stream')
-
-    with open(image_path, 'rb') as f:
-        image_bytes = f.read()
-
-    return StreamingResponse(
-        io.BytesIO(image_bytes),
-        media_type=content_type,
-    )
-
-
-@router.delete("/sessions/{session_id}")
-async def delete_session(session_id: str):
-    """Delete a vocabulary session and all associated files."""
-    if session_id not in _sessions:
-        raise HTTPException(status_code=404, detail="Session not found")
-
-    # Delete session directory
-    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
-    if os.path.exists(session_dir):
-        import shutil
-        shutil.rmtree(session_dir)
-
-    # Remove from storage
-    del _sessions[session_id]
-
-    # Remove associated worksheets
-    for wid, ws in list(_worksheets.items()):
-        if ws["session_id"] == session_id:
-            del _worksheets[wid]
-
-    return {"message": "Session deleted successfully", "session_id": session_id}
-
-
-# --- Include sub-routers ---
-from vocab_worksheet_upload_api import upload_router
-from vocab_worksheet_analysis_api import analysis_router
-
-router.include_router(upload_router)
-router.include_router(analysis_router)
+# Backward-compat shim -- module moved to vocab/worksheet/api.py
+import importlib as _importlib
+import sys as _sys
+_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.api")