Restructure: Move grid_* + vocab_* into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
grid/ package (16 files):
  grid/build/ — core, zones, cleanup, text_ops, cell_ops, finalize
  grid/editor/ — api, helpers, columns, filters, headers, zones

vocab/ package (10 files):
  vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare
  vocab/ — session_store, learn_bridge

26 backward-compat shims. Internal imports are relative. RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
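The 26 backward-compat shims mentioned above are presumably thin modules left at the old flat paths that re-export the public names from their new package homes, so existing imports keep working. A minimal sketch of one such shim (the old module name vocab_worksheet_api.py is an assumption; router, set_db_pool, and LOCAL_STORAGE_PATH are names actually defined in the new vocab/worksheet/api.py shown below):

# klausur-service/backend/vocab_worksheet_api.py  (hypothetical old-path shim)
# Backward-compat shim: re-export the public API from its new package location
# so existing `from vocab_worksheet_api import router` imports keep working.
from vocab.worksheet.api import router, set_db_pool, LOCAL_STORAGE_PATH  # noqa: F401

__all__ = ["router", "set_db_pool", "LOCAL_STORAGE_PATH"]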
5
klausur-service/backend/vocab/worksheet/__init__.py
Normal file
@@ -0,0 +1,5 @@
"""
Vocab worksheet sub-package.

Main entry point: ``from vocab.worksheet.api import router``
"""
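The package docstring above names the entry point; a minimal sketch of how a service's app module might mount it (the FastAPI app shown here is an assumption based on the set_db_pool docstring further down, which says the pool is set from main.py; the /api/v1/vocab prefix comes from the APIRouter defined in api.py):

from fastapi import FastAPI
from vocab.worksheet.api import router as vocab_router

app = FastAPI()
# All vocab worksheet routes are served under /api/v1/vocab (prefix set on the router itself).
app.include_router(vocab_router)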
472
klausur-service/backend/vocab/worksheet/analysis_api.py
Normal file
@@ -0,0 +1,472 @@
|
||||
"""
|
||||
Vocabulary Worksheet Analysis API - OCR export, ground truth labeling,
|
||||
extract-with-boxes, deskewed images, and learning unit generation.
|
||||
|
||||
The two large handlers (compare_ocr_methods, analyze_grid) live in
|
||||
compare_api.py and are included via compare_router.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, Body, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, Dict, Any
|
||||
from datetime import datetime
|
||||
import os
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
|
||||
def _get_sessions():
|
||||
from .api import _sessions
|
||||
return _sessions
|
||||
|
||||
def _get_local_storage_path():
|
||||
from .api import LOCAL_STORAGE_PATH
|
||||
return LOCAL_STORAGE_PATH
|
||||
from .generation import convert_pdf_page_to_image
|
||||
|
||||
# Try to import Tesseract extractor
|
||||
try:
|
||||
from tesseract_vocab_extractor import (
|
||||
extract_bounding_boxes, TESSERACT_AVAILABLE,
|
||||
)
|
||||
except ImportError:
|
||||
TESSERACT_AVAILABLE = False
|
||||
|
||||
# Try to import Grid Detection Service
|
||||
try:
|
||||
from services.grid_detection_service import GridDetectionService
|
||||
GRID_SERVICE_AVAILABLE = True
|
||||
except ImportError:
|
||||
GRID_SERVICE_AVAILABLE = False
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
analysis_router = APIRouter()
|
||||
|
||||
def _ocr_export_dir():
|
||||
return os.path.join(_get_local_storage_path(), "ocr-exports")
|
||||
|
||||
def _ground_truth_dir():
|
||||
return os.path.join(_get_local_storage_path(), "ground-truth")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OCR Export Endpoints (for cross-app OCR data sharing)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}")
|
||||
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
|
||||
"""
|
||||
Save OCR export data for cross-app sharing (admin-v2 -> studio-v2).
|
||||
|
||||
Both apps proxy to klausur-service via /klausur-api/, so this endpoint
|
||||
serves as shared storage accessible from both ports.
|
||||
"""
|
||||
|
||||
logger.info(f"Saving OCR export for session {session_id}, page {page_number}")
|
||||
|
||||
os.makedirs(_ocr_export_dir(), exist_ok=True)
|
||||
|
||||
# Save the export data
|
||||
export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
|
||||
with open(export_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(data, f, ensure_ascii=False, indent=2)
|
||||
|
||||
# Update latest pointer
|
||||
latest_path = os.path.join(_ocr_export_dir(), "latest.json")
|
||||
with open(latest_path, 'w', encoding='utf-8') as f:
|
||||
json.dump({
|
||||
"session_id": session_id,
|
||||
"page_number": page_number,
|
||||
"saved_at": datetime.utcnow().isoformat(),
|
||||
}, f, ensure_ascii=False, indent=2)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"session_id": session_id,
|
||||
"page_number": page_number,
|
||||
"message": "OCR export saved successfully",
|
||||
}
|
||||
|
||||
|
||||
@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}")
|
||||
async def load_ocr_export(session_id: str, page_number: int):
|
||||
"""Load a specific OCR export by session and page number."""
|
||||
|
||||
export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
|
||||
|
||||
if not os.path.exists(export_path):
|
||||
raise HTTPException(status_code=404, detail="OCR export not found")
|
||||
|
||||
with open(export_path, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
@analysis_router.get("/ocr-export/latest")
|
||||
async def load_latest_ocr_export():
|
||||
"""Load the most recently saved OCR export data."""
|
||||
|
||||
latest_path = os.path.join(_ocr_export_dir(), "latest.json")
|
||||
|
||||
if not os.path.exists(latest_path):
|
||||
raise HTTPException(status_code=404, detail="No OCR exports found")
|
||||
|
||||
with open(latest_path, 'r', encoding='utf-8') as f:
|
||||
pointer = json.load(f)
|
||||
|
||||
session_id = pointer.get("session_id")
|
||||
page_number = pointer.get("page_number")
|
||||
|
||||
export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
|
||||
|
||||
if not os.path.exists(export_path):
|
||||
raise HTTPException(status_code=404, detail="Latest OCR export file not found")
|
||||
|
||||
with open(export_path, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Extract with Boxes & Deskewed Image
|
||||
# =============================================================================
|
||||
|
||||
|
||||
async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
|
||||
"""Extract vocabulary entries with bounding boxes using Tesseract + GridDetectionService.
|
||||
|
||||
Returns dict with 'entries' list and 'image_width'/'image_height'.
|
||||
Each entry has row_index, english, german, example, confidence, bbox, bbox_en, bbox_de, bbox_ex.
|
||||
All bbox coordinates are in percent (0-100).
|
||||
"""
|
||||
if not TESSERACT_AVAILABLE:
|
||||
raise HTTPException(status_code=500, detail="Tesseract not available")
|
||||
if not GRID_SERVICE_AVAILABLE:
|
||||
raise HTTPException(status_code=500, detail="GridDetectionService not available")
|
||||
|
||||
# Step 1: Tesseract word-level bounding boxes
|
||||
tess_result = await extract_bounding_boxes(image_bytes, lang=lang)
|
||||
words = tess_result.get("words", [])
|
||||
img_w = tess_result.get("image_width", 0)
|
||||
img_h = tess_result.get("image_height", 0)
|
||||
|
||||
if not words or img_w == 0 or img_h == 0:
|
||||
return {"entries": [], "image_width": img_w, "image_height": img_h}
|
||||
|
||||
# Step 2: Convert to OCR regions (percentage-based)
|
||||
service = GridDetectionService()
|
||||
regions = service.convert_tesseract_regions(words, img_w, img_h)
|
||||
|
||||
if not regions:
|
||||
return {"entries": [], "image_width": img_w, "image_height": img_h}
|
||||
|
||||
# Step 3: Detect grid
|
||||
grid_result = service.detect_grid(regions)
|
||||
|
||||
if not grid_result.cells:
|
||||
return {"entries": [], "image_width": img_w, "image_height": img_h}
|
||||
|
||||
# Step 4: Group cells by logical_row and column_type
|
||||
from services.grid_detection_service import ColumnType
|
||||
|
||||
entries = []
|
||||
for row_idx, row_cells in enumerate(grid_result.cells):
|
||||
en_text = ""
|
||||
de_text = ""
|
||||
ex_text = ""
|
||||
en_bbox = None
|
||||
de_bbox = None
|
||||
ex_bbox = None
|
||||
row_conf_sum = 0.0
|
||||
row_conf_count = 0
|
||||
|
||||
for cell in row_cells:
|
||||
cell_bbox = {"x": round(cell.x, 2), "y": round(cell.y, 2),
|
||||
"w": round(cell.width, 2), "h": round(cell.height, 2)}
|
||||
|
||||
if cell.column_type == ColumnType.ENGLISH:
|
||||
en_text = cell.text.strip()
|
||||
en_bbox = cell_bbox
|
||||
elif cell.column_type == ColumnType.GERMAN:
|
||||
de_text = cell.text.strip()
|
||||
de_bbox = cell_bbox
|
||||
elif cell.column_type == ColumnType.EXAMPLE:
|
||||
ex_text = cell.text.strip()
|
||||
ex_bbox = cell_bbox
|
||||
|
||||
if cell.text.strip():
|
||||
row_conf_sum += cell.confidence
|
||||
row_conf_count += 1
|
||||
|
||||
# Skip completely empty rows
|
||||
if not en_text and not de_text and not ex_text:
|
||||
continue
|
||||
|
||||
# Calculate whole-row bounding box
|
||||
all_bboxes = [b for b in [en_bbox, de_bbox, ex_bbox] if b is not None]
|
||||
if all_bboxes:
|
||||
row_x = min(b["x"] for b in all_bboxes)
|
||||
row_y = min(b["y"] for b in all_bboxes)
|
||||
row_right = max(b["x"] + b["w"] for b in all_bboxes)
|
||||
row_bottom = max(b["y"] + b["h"] for b in all_bboxes)
|
||||
row_bbox = {"x": round(row_x, 2), "y": round(row_y, 2),
|
||||
"w": round(row_right - row_x, 2), "h": round(row_bottom - row_y, 2)}
|
||||
else:
|
||||
row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3}
|
||||
|
||||
avg_conf = round((row_conf_sum / row_conf_count * 100) if row_conf_count > 0 else 0, 1)
|
||||
|
||||
entries.append({
|
||||
"row_index": row_idx,
|
||||
"english": en_text,
|
||||
"german": de_text,
|
||||
"example": ex_text,
|
||||
"confidence": avg_conf,
|
||||
"bbox": row_bbox,
|
||||
"bbox_en": en_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
|
||||
"bbox_de": de_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
|
||||
"bbox_ex": ex_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
|
||||
})
|
||||
|
||||
return {"entries": entries, "image_width": img_w, "image_height": img_h}
|
||||
|
||||
|
||||
@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
|
||||
async def extract_with_boxes(session_id: str, page_number: int):
|
||||
"""Extract vocabulary entries with bounding boxes for ground truth labeling.
|
||||
|
||||
Uses Tesseract + GridDetectionService for spatial positioning.
|
||||
page_number is 0-indexed.
|
||||
"""
|
||||
logger.info(f"Extract with boxes for session {session_id}, page {page_number}")
|
||||
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
pdf_data = session.get("pdf_data")
|
||||
|
||||
if not pdf_data:
|
||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||
|
||||
page_count = session.get("pdf_page_count", 1)
|
||||
if page_number < 0 or page_number >= page_count:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
||||
|
||||
# Convert page to hires image
|
||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||
|
||||
# Deskew image before OCR
|
||||
deskew_angle = 0.0
|
||||
try:
|
||||
from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
|
||||
if CV2_AVAILABLE:
|
||||
image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
|
||||
logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Deskew failed for page {page_number}: {e}")
|
||||
|
||||
# Cache deskewed image in session for later serving
|
||||
if "deskewed_images" not in session:
|
||||
session["deskewed_images"] = {}
|
||||
session["deskewed_images"][str(page_number)] = image_data
|
||||
|
||||
# Extract entries with boxes (now on deskewed image)
|
||||
result = await extract_entries_with_boxes(image_data)
|
||||
|
||||
# Cache in session
|
||||
if "gt_entries" not in session:
|
||||
session["gt_entries"] = {}
|
||||
session["gt_entries"][str(page_number)] = result["entries"]
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"entries": result["entries"],
|
||||
"entry_count": len(result["entries"]),
|
||||
"image_width": result["image_width"],
|
||||
"image_height": result["image_height"],
|
||||
"deskew_angle": round(deskew_angle, 2),
|
||||
"deskewed": abs(deskew_angle) > 0.05,
|
||||
}
|
||||
|
||||
|
||||
@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}")
|
||||
async def get_deskewed_image(session_id: str, page_number: int):
|
||||
"""Return the deskewed page image as PNG.
|
||||
|
||||
Falls back to the original hires image if no deskewed version is cached.
|
||||
"""
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
deskewed = session.get("deskewed_images", {}).get(str(page_number))
|
||||
|
||||
if deskewed:
|
||||
return StreamingResponse(io.BytesIO(deskewed), media_type="image/png")
|
||||
|
||||
# Fallback: render original hires image
|
||||
pdf_data = session.get("pdf_data")
|
||||
if not pdf_data:
|
||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||
|
||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||
return StreamingResponse(io.BytesIO(image_data), media_type="image/png")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Ground Truth Labeling
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}")
|
||||
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
|
||||
"""Save ground truth labels for a page.
|
||||
|
||||
Expects body with 'entries' list - each entry has english, german, example,
|
||||
status ('confirmed' | 'edited' | 'skipped'), and bbox fields.
|
||||
"""
|
||||
logger.info(f"Save ground truth for session {session_id}, page {page_number}")
|
||||
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
entries = data.get("entries", [])
|
||||
if not entries:
|
||||
raise HTTPException(status_code=400, detail="No entries provided")
|
||||
|
||||
# Save in session
|
||||
session = _get_sessions()[session_id]
|
||||
if "ground_truth" not in session:
|
||||
session["ground_truth"] = {}
|
||||
session["ground_truth"][str(page_number)] = entries
|
||||
|
||||
# Also save to disk
|
||||
os.makedirs(_ground_truth_dir(), exist_ok=True)
|
||||
gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
|
||||
gt_data = {
|
||||
"session_id": session_id,
|
||||
"page_number": page_number,
|
||||
"saved_at": datetime.now().isoformat(),
|
||||
"entry_count": len(entries),
|
||||
"entries": entries,
|
||||
}
|
||||
with open(gt_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(gt_data, f, ensure_ascii=False, indent=2)
|
||||
|
||||
logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")
|
||||
|
||||
confirmed = sum(1 for e in entries if e.get("status") == "confirmed")
|
||||
edited = sum(1 for e in entries if e.get("status") == "edited")
|
||||
skipped = sum(1 for e in entries if e.get("status") == "skipped")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"saved_count": len(entries),
|
||||
"confirmed": confirmed,
|
||||
"edited": edited,
|
||||
"skipped": skipped,
|
||||
"file_path": gt_path,
|
||||
}
|
||||
|
||||
|
||||
@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}")
|
||||
async def load_ground_truth(session_id: str, page_number: int):
|
||||
"""Load saved ground truth for a page."""
|
||||
logger.info(f"Load ground truth for session {session_id}, page {page_number}")
|
||||
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
# Try session cache first
|
||||
session = _get_sessions()[session_id]
|
||||
cached = session.get("ground_truth", {}).get(str(page_number))
|
||||
if cached:
|
||||
return {"success": True, "entries": cached, "source": "cache"}
|
||||
|
||||
# Try disk
|
||||
gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
|
||||
if not os.path.exists(gt_path):
|
||||
raise HTTPException(status_code=404, detail="No ground truth found for this page")
|
||||
|
||||
with open(gt_path, 'r', encoding='utf-8') as f:
|
||||
gt_data = json.load(f)
|
||||
|
||||
return {"success": True, "entries": gt_data.get("entries", []), "source": "disk"}
|
||||
|
||||
|
||||
# ─── Learning Module Generation ─────────────────────────────────────────────
|
||||
|
||||
|
||||
class GenerateLearningUnitRequest(BaseModel):
|
||||
grade: Optional[str] = None
|
||||
generate_modules: bool = True
|
||||
|
||||
|
||||
@analysis_router.post("/sessions/{session_id}/generate-learning-unit")
|
||||
async def generate_learning_unit_endpoint(session_id: str, request: GenerateLearningUnitRequest = None):
|
||||
"""
|
||||
Create a Learning Unit from the vocabulary in this session.
|
||||
|
||||
1. Takes vocabulary from the session
|
||||
2. Creates a Learning Unit in backend-lehrer
|
||||
3. Optionally triggers MC/Cloze/QA generation
|
||||
|
||||
Returns the created unit info and generation status.
|
||||
"""
|
||||
if request is None:
|
||||
request = GenerateLearningUnitRequest()
|
||||
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
vocabulary = session.get("vocabulary", [])
|
||||
|
||||
if not vocabulary:
|
||||
raise HTTPException(status_code=400, detail="No vocabulary in this session")
|
||||
|
||||
try:
|
||||
from vocab.learn_bridge import create_learning_unit, generate_learning_modules
|
||||
|
||||
# Step 1: Create Learning Unit
|
||||
result = await create_learning_unit(
|
||||
session_name=session["name"],
|
||||
vocabulary=vocabulary,
|
||||
grade=request.grade,
|
||||
)
|
||||
|
||||
# Step 2: Generate modules if requested
|
||||
if request.generate_modules:
|
||||
try:
|
||||
gen_result = await generate_learning_modules(
|
||||
unit_id=result["unit_id"],
|
||||
analysis_path=result["analysis_path"],
|
||||
)
|
||||
result["generation"] = gen_result
|
||||
except Exception as e:
|
||||
logger.warning(f"Module generation failed (unit created): {e}")
|
||||
result["generation"] = {"status": "error", "reason": str(e)}
|
||||
|
||||
return result
|
||||
|
||||
except ImportError:
|
||||
raise HTTPException(status_code=501, detail="vocab.learn_bridge module not available")
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
except RuntimeError as e:
|
||||
raise HTTPException(status_code=502, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Include compare_ocr_methods & analyze_grid from companion module
|
||||
# =============================================================================
|
||||
|
||||
from .compare_api import compare_router # noqa: E402
|
||||
|
||||
analysis_router.include_router(compare_router)
|
||||
498
klausur-service/backend/vocab/worksheet/api.py
Normal file
@@ -0,0 +1,498 @@
|
||||
"""
|
||||
Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
|
||||
vocabulary editing, worksheet generation, and PDF downloads.
|
||||
|
||||
Sub-routers (included at bottom):
|
||||
- upload_api: PDF upload, thumbnails, page processing
|
||||
- analysis_api: OCR compare, grid analysis, ground truth
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
import os
|
||||
import io
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Imports from extracted sub-modules ---
|
||||
from .models import (
|
||||
SessionStatus,
|
||||
VocabularyEntry,
|
||||
SessionCreate,
|
||||
SessionResponse,
|
||||
VocabularyResponse,
|
||||
VocabularyUpdate,
|
||||
WorksheetGenerateRequest,
|
||||
WorksheetResponse,
|
||||
)
|
||||
from .extraction import extract_vocabulary_from_image
|
||||
from .generation import (
|
||||
generate_worksheet_html, generate_worksheet_pdf,
|
||||
convert_pdf_page_to_image,
|
||||
)
|
||||
|
||||
# --- Database integration (used by main.py lifespan) ---
|
||||
try:
|
||||
from vocab.session_store import (
|
||||
DATABASE_URL, get_pool, init_vocab_tables,
|
||||
list_sessions_db, get_session_db,
|
||||
)
|
||||
except ImportError:
|
||||
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
|
||||
get_pool = None
|
||||
init_vocab_tables = None
|
||||
list_sessions_db = None
|
||||
get_session_db = None
|
||||
|
||||
_db_pool = None
|
||||
|
||||
|
||||
def set_db_pool(pool):
|
||||
"""Set the database connection pool (called from main.py lifespan)."""
|
||||
global _db_pool
|
||||
_db_pool = pool
|
||||
|
||||
|
||||
async def _init_vocab_table():
|
||||
"""Initialize vocab tables in database."""
|
||||
if init_vocab_tables:
|
||||
try:
|
||||
await init_vocab_tables()
|
||||
logger.info("vocab_session_cache table ready")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to init vocab tables: {e}")
|
||||
else:
|
||||
logger.info("vocab_session_cache table ready")
|
||||
|
||||
|
||||
async def _load_all_sessions():
|
||||
"""Load all vocab sessions from database into memory cache."""
|
||||
if not list_sessions_db:
|
||||
logger.info("Loaded 0 vocab sessions from database")
|
||||
return
|
||||
|
||||
try:
|
||||
sessions = await list_sessions_db(limit=500)
|
||||
count = 0
|
||||
for s in sessions:
|
||||
sid = s.get("id") or s.get("session_id")
|
||||
if sid and sid not in _sessions:
|
||||
_sessions[sid] = {
|
||||
"id": sid,
|
||||
"name": s.get("name", ""),
|
||||
"description": s.get("description", ""),
|
||||
"status": s.get("status", "created"),
|
||||
"vocabulary_count": s.get("vocabulary_count", 0),
|
||||
"source_language": s.get("source_language", "en"),
|
||||
"target_language": s.get("target_language", "de"),
|
||||
"created_at": str(s.get("created_at", "")),
|
||||
}
|
||||
count += 1
|
||||
logger.info(f"Loaded {count} vocab sessions from database")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load sessions from database: {e}")
|
||||
|
||||
|
||||
# --- Router & module-level state ---
|
||||
router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
|
||||
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
|
||||
_sessions: Dict[str, Dict[str, Any]] = {}
|
||||
_worksheets: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
|
||||
@router.post("/sessions", response_model=SessionResponse)
|
||||
async def create_session(session: SessionCreate):
|
||||
"""Create a new vocabulary extraction session."""
|
||||
session_id = str(uuid.uuid4())
|
||||
|
||||
session_data = {
|
||||
"id": session_id,
|
||||
"name": session.name,
|
||||
"description": session.description,
|
||||
"source_language": session.source_language,
|
||||
"target_language": session.target_language,
|
||||
"status": SessionStatus.PENDING.value,
|
||||
"vocabulary": [],
|
||||
"vocabulary_count": 0,
|
||||
"image_path": None,
|
||||
"extraction_confidence": None,
|
||||
"created_at": datetime.utcnow(),
|
||||
}
|
||||
|
||||
_sessions[session_id] = session_data
|
||||
|
||||
# Create storage directory
|
||||
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
|
||||
os.makedirs(session_dir, exist_ok=True)
|
||||
|
||||
return SessionResponse(
|
||||
id=session_id,
|
||||
name=session.name,
|
||||
description=session.description,
|
||||
source_language=session.source_language,
|
||||
target_language=session.target_language,
|
||||
status=SessionStatus.PENDING.value,
|
||||
vocabulary_count=0,
|
||||
image_path=None,
|
||||
created_at=session_data["created_at"],
|
||||
)
|
||||
|
||||
|
||||
@router.get("/sessions", response_model=List[SessionResponse])
|
||||
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
|
||||
"""List all vocabulary sessions."""
|
||||
sessions = sorted(
|
||||
_sessions.values(),
|
||||
key=lambda x: x["created_at"],
|
||||
reverse=True
|
||||
)[:limit]
|
||||
|
||||
return [
|
||||
SessionResponse(
|
||||
id=s["id"],
|
||||
name=s["name"],
|
||||
description=s.get("description"),
|
||||
source_language=s["source_language"],
|
||||
target_language=s["target_language"],
|
||||
status=s["status"],
|
||||
vocabulary_count=s.get("vocabulary_count", 0),
|
||||
image_path=s.get("image_path"),
|
||||
created_at=s["created_at"],
|
||||
)
|
||||
for s in sessions
|
||||
]
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}", response_model=SessionResponse)
|
||||
async def get_session(session_id: str):
|
||||
"""Get a specific session."""
|
||||
if session_id not in _sessions:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
s = _sessions[session_id]
|
||||
return SessionResponse(
|
||||
id=s["id"],
|
||||
name=s["name"],
|
||||
description=s.get("description"),
|
||||
source_language=s["source_language"],
|
||||
target_language=s["target_language"],
|
||||
status=s["status"],
|
||||
vocabulary_count=s.get("vocabulary_count", 0),
|
||||
image_path=s.get("image_path"),
|
||||
created_at=s["created_at"],
|
||||
)
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/upload")
|
||||
async def upload_image(
|
||||
session_id: str,
|
||||
file: UploadFile = File(...),
|
||||
):
|
||||
"""
|
||||
Upload a textbook page image or PDF and extract vocabulary.
|
||||
|
||||
Supported formats: PNG, JPG, JPEG, PDF
|
||||
"""
|
||||
logger.info(f"Upload request for session {session_id}")
|
||||
logger.info(f"File: filename={file.filename}, content_type={file.content_type}")
|
||||
|
||||
if session_id not in _sessions:
|
||||
logger.error(f"Session {session_id} not found")
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _sessions[session_id]
|
||||
|
||||
# Validate file type - check both extension and content type
|
||||
extension = file.filename.split('.')[-1].lower() if file.filename else ''
|
||||
content_type = file.content_type or ''
|
||||
|
||||
# Accept images and PDFs
|
||||
valid_image_extensions = ['png', 'jpg', 'jpeg']
|
||||
valid_image_content_types = ['image/png', 'image/jpeg', 'image/jpg']
|
||||
is_pdf = extension == 'pdf' or content_type == 'application/pdf'
|
||||
is_image = extension in valid_image_extensions or content_type in valid_image_content_types
|
||||
|
||||
if not is_pdf and not is_image:
|
||||
logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
|
||||
)
|
||||
|
||||
# Determine final extension for saving
|
||||
if is_pdf:
|
||||
save_extension = 'png' # PDFs will be converted to PNG
|
||||
elif extension in valid_image_extensions:
|
||||
save_extension = extension
|
||||
elif content_type == 'image/png':
|
||||
save_extension = 'png'
|
||||
else:
|
||||
save_extension = 'jpg'
|
||||
|
||||
# Read file content
|
||||
content = await file.read()
|
||||
logger.info(f"Read {len(content)} bytes from uploaded file")
|
||||
|
||||
# Convert PDF to image if needed
|
||||
if is_pdf:
|
||||
logger.info("Converting PDF to image...")
|
||||
content = await convert_pdf_page_to_image(content, page_number=0)
|
||||
logger.info(f"PDF converted, image size: {len(content)} bytes")
|
||||
|
||||
# Save image
|
||||
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
|
||||
os.makedirs(session_dir, exist_ok=True)
|
||||
image_path = os.path.join(session_dir, f"source.{save_extension}")
|
||||
|
||||
with open(image_path, 'wb') as f:
|
||||
f.write(content)
|
||||
|
||||
# Update session status
|
||||
session["status"] = SessionStatus.PROCESSING.value
|
||||
session["image_path"] = image_path
|
||||
|
||||
# Extract vocabulary using Vision LLM
|
||||
vocabulary, confidence, error = await extract_vocabulary_from_image(content, file.filename or "image.png", page_number=0)
|
||||
|
||||
# Update session with extracted vocabulary
|
||||
session["vocabulary"] = [v.dict() for v in vocabulary]
|
||||
session["vocabulary_count"] = len(vocabulary)
|
||||
session["extraction_confidence"] = confidence
|
||||
session["status"] = SessionStatus.EXTRACTED.value
|
||||
|
||||
result = {
|
||||
"session_id": session_id,
|
||||
"filename": file.filename,
|
||||
"image_path": image_path,
|
||||
"vocabulary_count": len(vocabulary),
|
||||
"extraction_confidence": confidence,
|
||||
"status": SessionStatus.EXTRACTED.value,
|
||||
}
|
||||
|
||||
if error:
|
||||
result["error"] = error
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
|
||||
async def get_vocabulary(session_id: str):
|
||||
"""Get extracted vocabulary for a session."""
|
||||
if session_id not in _sessions:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
session = _sessions[session_id]
|
||||
vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
|
||||
return VocabularyResponse(
|
||||
session_id=session_id,
|
||||
vocabulary=vocabulary,
|
||||
extraction_confidence=session.get("extraction_confidence"),
|
||||
)
|
||||
|
||||
|
||||
@router.put("/sessions/{session_id}/vocabulary")
|
||||
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
|
||||
"""Update vocabulary entries (for manual corrections)."""
|
||||
if session_id not in _sessions:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _sessions[session_id]
|
||||
session["vocabulary"] = [v.dict() for v in update.vocabulary]
|
||||
session["vocabulary_count"] = len(update.vocabulary)
|
||||
|
||||
return {
|
||||
"session_id": session_id,
|
||||
"vocabulary_count": len(update.vocabulary),
|
||||
"message": "Vocabulary updated successfully",
|
||||
}
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
|
||||
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
|
||||
"""Generate worksheet PDF(s) from extracted vocabulary."""
|
||||
if session_id not in _sessions:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _sessions[session_id]
|
||||
vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
|
||||
|
||||
if not vocabulary:
|
||||
raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")
|
||||
|
||||
worksheet_id = str(uuid.uuid4())
|
||||
title = request.title or session["name"]
|
||||
|
||||
# Generate HTML for each worksheet type
|
||||
combined_html = ""
|
||||
for wtype in request.worksheet_types:
|
||||
html = generate_worksheet_html(
|
||||
vocabulary=vocabulary,
|
||||
worksheet_type=wtype,
|
||||
title=f"{title} - {wtype.value}",
|
||||
show_solutions=False,
|
||||
repetitions=request.repetitions,
|
||||
line_height=request.line_height,
|
||||
)
|
||||
combined_html += html + '<div style="page-break-after: always;"></div>'
|
||||
|
||||
# Generate PDF
|
||||
try:
|
||||
pdf_bytes = await generate_worksheet_pdf(combined_html)
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}")
|
||||
|
||||
# Save PDF
|
||||
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
|
||||
pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
|
||||
with open(pdf_path, 'wb') as f:
|
||||
f.write(pdf_bytes)
|
||||
|
||||
# Generate solution PDF if requested
|
||||
solution_path = None
|
||||
if request.include_solutions:
|
||||
solution_html = ""
|
||||
for wtype in request.worksheet_types:
|
||||
html = generate_worksheet_html(
|
||||
vocabulary=vocabulary,
|
||||
worksheet_type=wtype,
|
||||
title=f"{title} - {wtype.value} (Loesung)",
|
||||
show_solutions=True,
|
||||
repetitions=request.repetitions,
|
||||
line_height=request.line_height,
|
||||
)
|
||||
solution_html += html + '<div style="page-break-after: always;"></div>'
|
||||
|
||||
solution_bytes = await generate_worksheet_pdf(solution_html)
|
||||
solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
|
||||
with open(solution_path, 'wb') as f:
|
||||
f.write(solution_bytes)
|
||||
|
||||
# Store worksheet info
|
||||
worksheet_data = {
|
||||
"id": worksheet_id,
|
||||
"session_id": session_id,
|
||||
"worksheet_types": [wt.value for wt in request.worksheet_types],
|
||||
"pdf_path": pdf_path,
|
||||
"solution_path": solution_path,
|
||||
"generated_at": datetime.utcnow(),
|
||||
}
|
||||
_worksheets[worksheet_id] = worksheet_data
|
||||
|
||||
# Update session status
|
||||
session["status"] = SessionStatus.COMPLETED.value
|
||||
|
||||
return WorksheetResponse(
|
||||
id=worksheet_id,
|
||||
session_id=session_id,
|
||||
worksheet_types=worksheet_data["worksheet_types"],
|
||||
pdf_path=pdf_path,
|
||||
solution_path=solution_path,
|
||||
generated_at=worksheet_data["generated_at"],
|
||||
)
|
||||
|
||||
|
||||
@router.get("/worksheets/{worksheet_id}/pdf")
|
||||
async def download_worksheet_pdf(worksheet_id: str):
|
||||
"""Download the generated worksheet PDF."""
|
||||
if worksheet_id not in _worksheets:
|
||||
raise HTTPException(status_code=404, detail="Worksheet not found")
|
||||
|
||||
worksheet = _worksheets[worksheet_id]
|
||||
pdf_path = worksheet["pdf_path"]
|
||||
|
||||
if not os.path.exists(pdf_path):
|
||||
raise HTTPException(status_code=404, detail="PDF file not found")
|
||||
|
||||
with open(pdf_path, 'rb') as f:
|
||||
pdf_bytes = f.read()
|
||||
|
||||
return StreamingResponse(
|
||||
io.BytesIO(pdf_bytes),
|
||||
media_type="application/pdf",
|
||||
headers={"Content-Disposition": f"attachment; filename=worksheet_{worksheet_id}.pdf"}
|
||||
)
|
||||
|
||||
|
||||
@router.get("/worksheets/{worksheet_id}/solution")
|
||||
async def download_solution_pdf(worksheet_id: str):
|
||||
"""Download the solution PDF."""
|
||||
if worksheet_id not in _worksheets:
|
||||
raise HTTPException(status_code=404, detail="Worksheet not found")
|
||||
|
||||
worksheet = _worksheets[worksheet_id]
|
||||
solution_path = worksheet.get("solution_path")
|
||||
|
||||
if not solution_path or not os.path.exists(solution_path):
|
||||
raise HTTPException(status_code=404, detail="Solution PDF not found")
|
||||
|
||||
with open(solution_path, 'rb') as f:
|
||||
pdf_bytes = f.read()
|
||||
|
||||
return StreamingResponse(
|
||||
io.BytesIO(pdf_bytes),
|
||||
media_type="application/pdf",
|
||||
headers={"Content-Disposition": f"attachment; filename=solution_{worksheet_id}.pdf"}
|
||||
)
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/image")
|
||||
async def get_session_image(session_id: str):
|
||||
"""Get the uploaded source image for a session."""
|
||||
if session_id not in _sessions:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _sessions[session_id]
|
||||
image_path = session.get("image_path")
|
||||
|
||||
if not image_path or not os.path.exists(image_path):
|
||||
raise HTTPException(status_code=404, detail="Image not found")
|
||||
|
||||
# Determine content type
|
||||
extension = image_path.split('.')[-1].lower()
|
||||
content_type = {
|
||||
'png': 'image/png',
|
||||
'jpg': 'image/jpeg',
|
||||
'jpeg': 'image/jpeg',
|
||||
}.get(extension, 'application/octet-stream')
|
||||
|
||||
with open(image_path, 'rb') as f:
|
||||
image_bytes = f.read()
|
||||
|
||||
return StreamingResponse(
|
||||
io.BytesIO(image_bytes),
|
||||
media_type=content_type,
|
||||
)
|
||||
|
||||
|
||||
@router.delete("/sessions/{session_id}")
|
||||
async def delete_session(session_id: str):
|
||||
"""Delete a vocabulary session and all associated files."""
|
||||
if session_id not in _sessions:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
# Delete session directory
|
||||
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
|
||||
if os.path.exists(session_dir):
|
||||
import shutil
|
||||
shutil.rmtree(session_dir)
|
||||
|
||||
# Remove from storage
|
||||
del _sessions[session_id]
|
||||
|
||||
# Remove associated worksheets
|
||||
for wid, ws in list(_worksheets.items()):
|
||||
if ws["session_id"] == session_id:
|
||||
del _worksheets[wid]
|
||||
|
||||
return {"message": "Session deleted successfully", "session_id": session_id}
|
||||
|
||||
|
||||
# --- Include sub-routers ---
|
||||
from .upload_api import upload_router
|
||||
from .analysis_api import analysis_router
|
||||
|
||||
router.include_router(upload_router)
|
||||
router.include_router(analysis_router)
|
||||
542
klausur-service/backend/vocab/worksheet/compare_api.py
Normal file
@@ -0,0 +1,542 @@
|
||||
"""
|
||||
Vocabulary Worksheet Compare & Grid Analysis API.
|
||||
|
||||
Split from vocab_worksheet_analysis_api.py — contains the two largest
|
||||
route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
||||
from .extraction import extract_vocabulary_from_image
|
||||
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||
VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")
|
||||
|
||||
def _get_sessions():
|
||||
from .api import _sessions
|
||||
return _sessions
|
||||
from .generation import convert_pdf_page_to_image
|
||||
|
||||
# Try to import Tesseract extractor
|
||||
try:
|
||||
from tesseract_vocab_extractor import (
|
||||
run_tesseract_pipeline,
|
||||
match_positions_to_vocab, TESSERACT_AVAILABLE,
|
||||
)
|
||||
except ImportError:
|
||||
TESSERACT_AVAILABLE = False
|
||||
|
||||
# Try to import CV Pipeline
|
||||
try:
|
||||
from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
|
||||
except ImportError:
|
||||
CV_PIPELINE_AVAILABLE = False
|
||||
|
||||
# Try to import Grid Detection Service
|
||||
try:
|
||||
from services.grid_detection_service import GridDetectionService
|
||||
GRID_SERVICE_AVAILABLE = True
|
||||
except ImportError:
|
||||
GRID_SERVICE_AVAILABLE = False
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
compare_router = APIRouter()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OCR Compare & Grid Analysis Endpoints
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
|
||||
async def compare_ocr_methods(session_id: str, page_number: int):
|
||||
"""
|
||||
Run multiple OCR methods on a page and compare results.
|
||||
|
||||
This endpoint:
|
||||
1. Gets the page image from the session's uploaded PDF
|
||||
2. Runs Vision LLM extraction (primary method)
|
||||
3. Optionally runs Tesseract extraction
|
||||
4. Compares found vocabulary across methods
|
||||
5. Returns structured comparison results
|
||||
|
||||
page_number is 0-indexed.
|
||||
"""
|
||||
import time
|
||||
|
||||
logger.info(f"Compare OCR for session {session_id}, page {page_number}")
|
||||
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
pdf_data = session.get("pdf_data")
|
||||
|
||||
if not pdf_data:
|
||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||
|
||||
page_count = session.get("pdf_page_count", 1)
|
||||
if page_number < 0 or page_number >= page_count:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
||||
|
||||
# Convert page to image
|
||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||
|
||||
methods_results = {}
|
||||
all_vocab_sets = {}
|
||||
|
||||
# --- Method: Vision LLM ---
|
||||
try:
|
||||
start = time.time()
|
||||
vocab, confidence, error = await extract_vocabulary_from_image(
|
||||
image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
|
||||
)
|
||||
duration = time.time() - start
|
||||
|
||||
vocab_list = []
|
||||
for v in vocab:
|
||||
entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v))
|
||||
vocab_list.append({
|
||||
"english": entry.get("english", ""),
|
||||
"german": entry.get("german", ""),
|
||||
"example": entry.get("example_sentence", ""),
|
||||
})
|
||||
|
||||
methods_results["vision_llm"] = {
|
||||
"name": "Vision LLM",
|
||||
"model": VISION_MODEL,
|
||||
"duration_seconds": round(duration, 1),
|
||||
"vocabulary_count": len(vocab_list),
|
||||
"vocabulary": vocab_list,
|
||||
"confidence": confidence,
|
||||
"success": len(vocab_list) > 0 and not error,
|
||||
"error": error if error else None,
|
||||
}
|
||||
all_vocab_sets["vision_llm"] = {(v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list if v["english"] and v["german"]}
|
||||
except Exception as e:
|
||||
logger.error(f"Vision LLM failed: {e}")
|
||||
methods_results["vision_llm"] = {
|
||||
"name": "Vision LLM",
|
||||
"model": VISION_MODEL,
|
||||
"duration_seconds": 0,
|
||||
"vocabulary_count": 0,
|
||||
"vocabulary": [],
|
||||
"confidence": 0,
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
}
|
||||
all_vocab_sets["vision_llm"] = set()
|
||||
|
||||
# --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
|
||||
if TESSERACT_AVAILABLE:
|
||||
try:
|
||||
start = time.time()
|
||||
tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
|
||||
duration = time.time() - start
|
||||
|
||||
tess_vocab = tess_result.get("vocabulary", [])
|
||||
tess_words = tess_result.get("words", [])
|
||||
|
||||
# Store Tesseract words in session for later use (grid analysis, position matching)
|
||||
session["tesseract_words"] = tess_words
|
||||
session["tesseract_image_width"] = tess_result.get("image_width", 0)
|
||||
session["tesseract_image_height"] = tess_result.get("image_height", 0)
|
||||
session[f"tesseract_page_{page_number}"] = tess_result
|
||||
|
||||
vocab_list_tess = []
|
||||
for v in tess_vocab:
|
||||
vocab_list_tess.append({
|
||||
"english": v.get("english", ""),
|
||||
"german": v.get("german", ""),
|
||||
"example": v.get("example", ""),
|
||||
})
|
||||
|
||||
methods_results["tesseract"] = {
|
||||
"name": "Tesseract OCR",
|
||||
"model": "tesseract-ocr (eng+deu)",
|
||||
"duration_seconds": round(duration, 1),
|
||||
"vocabulary_count": len(vocab_list_tess),
|
||||
"vocabulary": vocab_list_tess,
|
||||
"confidence": 0.7 if tess_vocab else 0,
|
||||
"success": len(vocab_list_tess) > 0,
|
||||
"error": tess_result.get("error"),
|
||||
"word_count": tess_result.get("word_count", 0),
|
||||
"columns_detected": len(tess_result.get("columns", [])),
|
||||
}
|
||||
all_vocab_sets["tesseract"] = {
|
||||
(v["english"].lower().strip(), v["german"].lower().strip())
|
||||
for v in vocab_list_tess if v["english"] and v["german"]
|
||||
}
|
||||
|
||||
# Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
|
||||
if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
|
||||
llm_vocab_with_bbox = match_positions_to_vocab(
|
||||
tess_words,
|
||||
methods_results["vision_llm"]["vocabulary"],
|
||||
tess_result.get("image_width", 1),
|
||||
tess_result.get("image_height", 1),
|
||||
)
|
||||
methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Tesseract failed: {e}")
|
||||
import traceback
|
||||
logger.debug(traceback.format_exc())
|
||||
methods_results["tesseract"] = {
|
||||
"name": "Tesseract OCR",
|
||||
"model": "tesseract-ocr",
|
||||
"duration_seconds": 0,
|
||||
"vocabulary_count": 0,
|
||||
"vocabulary": [],
|
||||
"confidence": 0,
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
}
|
||||
all_vocab_sets["tesseract"] = set()
|
||||
|
||||
# --- Method: CV Pipeline (Document Reconstruction) ---
|
||||
if CV_PIPELINE_AVAILABLE:
|
||||
try:
|
||||
start = time.time()
|
||||
cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
|
||||
duration = time.time() - start
|
||||
|
||||
cv_vocab = cv_result.vocabulary if not cv_result.error else []
|
||||
vocab_list_cv = []
|
||||
for v in cv_vocab:
|
||||
vocab_list_cv.append({
|
||||
"english": v.get("english", ""),
|
||||
"german": v.get("german", ""),
|
||||
"example": v.get("example", ""),
|
||||
})
|
||||
|
||||
methods_results["cv_pipeline"] = {
|
||||
"name": "CV Pipeline (Document Reconstruction)",
|
||||
"model": "opencv + tesseract (multi-pass)",
|
||||
"duration_seconds": round(duration, 1),
|
||||
"vocabulary_count": len(vocab_list_cv),
|
||||
"vocabulary": vocab_list_cv,
|
||||
"confidence": 0.8 if cv_vocab else 0,
|
||||
"success": len(vocab_list_cv) > 0,
|
||||
"error": cv_result.error,
|
||||
"word_count": cv_result.word_count,
|
||||
"columns_detected": cv_result.columns_detected,
|
||||
"stages": cv_result.stages,
|
||||
}
|
||||
all_vocab_sets["cv_pipeline"] = {
|
||||
(v["english"].lower().strip(), v["german"].lower().strip())
|
||||
for v in vocab_list_cv if v["english"] and v["german"]
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"CV Pipeline failed: {e}")
|
||||
import traceback
|
||||
logger.debug(traceback.format_exc())
|
||||
methods_results["cv_pipeline"] = {
|
||||
"name": "CV Pipeline (Document Reconstruction)",
|
||||
"model": "opencv + tesseract (multi-pass)",
|
||||
"duration_seconds": 0,
|
||||
"vocabulary_count": 0,
|
||||
"vocabulary": [],
|
||||
"confidence": 0,
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
}
|
||||
all_vocab_sets["cv_pipeline"] = set()
|
||||
|
||||
# --- Build comparison ---
|
||||
all_unique = set()
|
||||
for vs in all_vocab_sets.values():
|
||||
all_unique |= vs
|
||||
|
||||
found_by_all = []
|
||||
found_by_some = []
|
||||
for english, german in sorted(all_unique):
|
||||
found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
|
||||
entry = {"english": english, "german": german, "methods": found_in}
|
||||
if len(found_in) == len(all_vocab_sets):
|
||||
found_by_all.append(entry)
|
||||
else:
|
||||
found_by_some.append(entry)
|
||||
|
||||
total_methods = max(len(all_vocab_sets), 1)
|
||||
agreement_rate = len(found_by_all) / max(len(all_unique), 1) if all_unique else 0
|
||||
|
||||
# Find best method
|
||||
best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"
|
||||
|
||||
return {
|
||||
"session_id": session_id,
|
||||
"page_number": page_number,
|
||||
"methods": methods_results,
|
||||
"comparison": {
|
||||
"found_by_all_methods": found_by_all,
|
||||
"found_by_some_methods": found_by_some,
|
||||
"total_unique_vocabulary": len(all_unique),
|
||||
"agreement_rate": agreement_rate,
|
||||
},
|
||||
"recommendation": {
|
||||
"best_method": best_method,
|
||||
"reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
|
||||
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
|
||||
"""
|
||||
Analyze the grid/table structure of a vocabulary page.
|
||||
|
||||
Hybrid approach:
|
||||
1. If Tesseract bounding boxes are available (from compare-ocr), use them for
|
||||
real spatial positions via GridDetectionService.
|
||||
2. Otherwise fall back to Vision LLM for grid structure detection.
|
||||
|
||||
page_number is 0-indexed.
|
||||
Returns GridData structure expected by the frontend GridOverlay component.
|
||||
"""
|
||||
import httpx
|
||||
|
||||
logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")
|
||||
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
pdf_data = session.get("pdf_data")
|
||||
|
||||
if not pdf_data:
|
||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||
|
||||
page_count = session.get("pdf_page_count", 1)
|
||||
if page_number < 0 or page_number >= page_count:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid page number.")
|
||||
|
||||
# Convert page to image
|
||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||
|
||||
# --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
|
||||
tess_page_data = session.get(f"tesseract_page_{page_number}")
|
||||
|
||||
if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
|
||||
try:
|
||||
# Run Tesseract if not already cached
|
||||
if not tess_page_data:
|
||||
logger.info("Running Tesseract for grid analysis (not cached)")
|
||||
from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
|
||||
tess_page_data = await _run_tess(image_data, lang="eng+deu")
|
||||
session[f"tesseract_page_{page_number}"] = tess_page_data
|
||||
session["tesseract_words"] = tess_page_data.get("words", [])
|
||||
session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
|
||||
session["tesseract_image_height"] = tess_page_data.get("image_height", 0)
|
||||
|
||||
tess_words = tess_page_data.get("words", [])
|
||||
img_w = tess_page_data.get("image_width", 0)
|
||||
img_h = tess_page_data.get("image_height", 0)
|
||||
|
||||
if tess_words and img_w > 0 and img_h > 0:
|
||||
service = GridDetectionService()
|
||||
regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
|
||||
|
||||
if regions:
|
||||
grid_result = service.detect_grid(regions)
|
||||
grid_dict = grid_result.to_dict()
|
||||
|
||||
# Merge LLM text if available (better quality than Tesseract text)
|
||||
# The LLM vocab was stored during compare-ocr
|
||||
grid_dict["source"] = "tesseract+grid_service"
|
||||
grid_dict["word_count"] = len(tess_words)
|
||||
|
||||
logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
|
||||
f"{grid_result.stats.get('recognized', 0)} recognized")
|
||||
|
||||
return {"success": True, "grid": grid_dict}
|
||||
|
||||
logger.info("Tesseract data insufficient, falling back to LLM")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
|
||||
import traceback
|
||||
logger.debug(traceback.format_exc())
|
||||
|
||||
# --- Strategy 2: Fall back to Vision LLM ---
|
||||
image_base64 = base64.b64encode(image_data).decode("utf-8")
|
||||
|
||||
grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.
|
||||
|
||||
Your task: Identify the TABLE STRUCTURE and extract each cell's content.
|
||||
|
||||
Return a JSON object with this EXACT structure:
|
||||
{
|
||||
"rows": <number of rows>,
|
||||
"columns": <number of columns>,
|
||||
"column_types": ["english", "german", "example"],
|
||||
"entries": [
|
||||
{
|
||||
"row": 0,
|
||||
"col": 0,
|
||||
"text": "the word or phrase in this cell",
|
||||
"column_type": "english",
|
||||
"confidence": 0.95
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- row and col are 0-indexed
|
||||
- column_type is one of: "english", "german", "example", "unknown"
|
||||
- Detect whether each column contains English words, German translations, or example sentences
|
||||
- Include ALL non-empty cells
|
||||
- confidence is 0.0-1.0 based on how clear the text is
|
||||
- If a cell is empty, don't include it
|
||||
- Return ONLY the JSON, no other text"""
|
||||
|
||||
try:
|
||||
import asyncio
|
||||
|
||||
raw_text = ""
|
||||
max_retries = 3
|
||||
for attempt in range(max_retries):
|
||||
async with httpx.AsyncClient(timeout=300.0) as client:
|
||||
response = await client.post(
|
||||
f"{OLLAMA_URL}/api/chat",
|
||||
json={
|
||||
"model": VISION_MODEL,
|
||||
"messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
|
||||
"stream": False,
|
||||
"options": {"temperature": 0.1, "num_predict": 8192},
|
||||
},
|
||||
timeout=300.0,
|
||||
)
|
||||
|
||||
if response.status_code == 500 and attempt < max_retries - 1:
|
||||
wait_time = 10 * (attempt + 1)
|
||||
logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
|
||||
await asyncio.sleep(wait_time)
|
||||
continue
|
||||
elif response.status_code != 200:
|
||||
error_detail = response.text[:200] if response.text else "Unknown error"
|
||||
return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}
|
||||
|
||||
raw_text = response.json().get("message", {}).get("content", "")
|
||||
break
|
||||
|
||||
# Parse JSON from response
|
||||
import re
|
||||
json_match = re.search(r'\{[\s\S]*\}', raw_text)
|
||||
if not json_match:
|
||||
return {"success": False, "error": "Could not parse grid structure from LLM response"}
|
||||
|
||||
grid_raw = json.loads(json_match.group())
|
||||
|
||||
num_rows = grid_raw.get("rows", 0)
|
||||
num_cols = grid_raw.get("columns", 0)
|
||||
column_types = grid_raw.get("column_types", [])
|
||||
entries = grid_raw.get("entries", [])
|
||||
|
||||
if num_rows == 0 or num_cols == 0:
|
||||
return {"success": False, "error": "No grid structure detected"}
|
||||
|
||||
# Ensure column_types has the right length
|
||||
while len(column_types) < num_cols:
|
||||
column_types.append("unknown")
|
||||
|
||||
# Build cell grid with percentage-based coordinates
|
||||
row_height = 100.0 / num_rows
|
||||
col_width = 100.0 / num_cols
|
||||
|
||||
# Track which cells have content
|
||||
cell_map = {}
|
||||
for entry in entries:
|
||||
r = entry.get("row", 0)
|
||||
c = entry.get("col", 0)
|
||||
cell_map[(r, c)] = entry
|
||||
|
||||
cells = []
|
||||
recognized_count = 0
|
||||
empty_count = 0
|
||||
problematic_count = 0
|
||||
|
||||
for r in range(num_rows):
|
||||
row_cells = []
|
||||
for c in range(num_cols):
|
||||
x = c * col_width
|
||||
y = r * row_height
|
||||
|
||||
if (r, c) in cell_map:
|
||||
entry = cell_map[(r, c)]
|
||||
text = entry.get("text", "").strip()
|
||||
conf = entry.get("confidence", 0.8)
|
||||
col_type = entry.get("column_type", column_types[c] if c < len(column_types) else "unknown")
|
||||
|
||||
if text:
|
||||
status = "recognized" if conf >= 0.5 else "problematic"
|
||||
if status == "recognized":
|
||||
recognized_count += 1
|
||||
else:
|
||||
problematic_count += 1
|
||||
else:
|
||||
status = "empty"
|
||||
empty_count += 1
|
||||
else:
|
||||
text = ""
|
||||
conf = 0.0
|
||||
col_type = column_types[c] if c < len(column_types) else "unknown"
|
||||
status = "empty"
|
||||
empty_count += 1
|
||||
|
||||
row_cells.append({
|
||||
"row": r,
|
||||
"col": c,
|
||||
"x": round(x, 2),
|
||||
"y": round(y, 2),
|
||||
"width": round(col_width, 2),
|
||||
"height": round(row_height, 2),
|
||||
"text": text,
|
||||
"confidence": conf,
|
||||
"status": status,
|
||||
"column_type": col_type,
|
||||
})
|
||||
cells.append(row_cells)
|
||||
|
||||
total = num_rows * num_cols
|
||||
coverage = (recognized_count + problematic_count) / max(total, 1)
|
||||
|
||||
# Column and row boundaries as percentages
|
||||
col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)]
|
||||
row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)]
|
||||
|
||||
grid_data = {
|
||||
"rows": num_rows,
|
||||
"columns": num_cols,
|
||||
"cells": cells,
|
||||
"column_types": column_types,
|
||||
"column_boundaries": col_boundaries,
|
||||
"row_boundaries": row_boundaries,
|
||||
"deskew_angle": 0.0,
|
||||
"source": "vision_llm",
|
||||
"stats": {
|
||||
"recognized": recognized_count,
|
||||
"problematic": problematic_count,
|
||||
"empty": empty_count,
|
||||
"manual": 0,
|
||||
"total": total,
|
||||
"coverage": round(coverage, 3),
|
||||
},
|
||||
}
|
||||
|
||||
return {"success": True, "grid": grid_data}
|
||||
|
||||
except httpx.TimeoutException:
|
||||
logger.error("Grid analysis timed out")
|
||||
return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
|
||||
except Exception as e:
|
||||
logger.error(f"Grid analysis failed: {e}")
|
||||
import traceback
|
||||
logger.debug(traceback.format_exc())
|
||||
return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}
|
||||
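For reference, a minimal sketch of the coordinate math used in the handler above, converting a (row, col) index into the percentage-based cell geometry the frontend overlays on the page image (the grid size and cell index are illustrative only):

    # Illustrative only: a 5-row x 3-column grid, as reported by the Vision LLM.
    num_rows, num_cols = 5, 3
    row_height = 100.0 / num_rows   # 20.0 % of page height per row
    col_width = 100.0 / num_cols    # 33.33 % of page width per column

    # Cell at row 2, column 1 (0-indexed):
    r, c = 2, 1
    cell = {
        "x": round(c * col_width, 2),      # 33.33
        "y": round(r * row_height, 2),     # 40.0
        "width": round(col_width, 2),      # 33.33
        "height": round(row_height, 2),    # 20.0
    }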
325
klausur-service/backend/vocab/worksheet/extraction.py
Normal file
@@ -0,0 +1,325 @@
|
||||
"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.
|
||||
|
||||
Contains:
|
||||
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
|
||||
- extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
|
||||
- _get_demo_vocabulary(): Demo data for testing
|
||||
- parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from typing import List
|
||||
|
||||
import httpx
|
||||
|
||||
from .models import VocabularyEntry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Ollama Configuration
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Vision LLM Vocabulary Extraction
|
||||
# =============================================================================
|
||||
|
||||
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.
|
||||
|
||||
AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:
|
||||
|
||||
{
|
||||
"vocabulary": [
|
||||
{
|
||||
"english": "to improve",
|
||||
"german": "verbessern",
|
||||
"example": "I want to improve my English."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
REGELN:
|
||||
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
|
||||
2. Behalte die exakte Schreibweise bei
|
||||
3. Bei fehlenden Beispielsaetzen: "example": null
|
||||
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
|
||||
5. Gib NUR valides JSON zurueck, keine Erklaerungen
|
||||
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"
|
||||
|
||||
Beispiel-Output:
|
||||
{
|
||||
"vocabulary": [
|
||||
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
|
||||
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
|
||||
]
|
||||
}"""
|
||||
|
||||
|
||||
async def extract_vocabulary_from_image(
|
||||
image_data: bytes,
|
||||
filename: str,
|
||||
page_number: int = 0,
|
||||
use_hybrid: bool = False # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
|
||||
) -> tuple[List[VocabularyEntry], float, str]:
|
||||
"""
|
||||
Extract vocabulary from an image using hybrid OCR+LLM or Vision LLM (default).
|
||||
|
||||
Args:
|
||||
image_data: Image bytes
|
||||
filename: Original filename for logging
|
||||
page_number: 0-indexed page number for error messages
|
||||
use_hybrid: If True, use PaddleOCR + LLM (faster, more accurate for printed text)
|
||||
If False, use Vision LLM (slower, better for complex layouts)
|
||||
|
||||
Returns:
|
||||
Tuple of (vocabulary_entries, confidence, error_message)
|
||||
error_message is empty string on success
|
||||
"""
|
||||
|
||||
# ==========================================================================
|
||||
# HYBRID APPROACH (opt-in via use_hybrid=True): PaddleOCR + LLM Gateway
|
||||
# ==========================================================================
|
||||
if use_hybrid:
|
||||
try:
|
||||
from hybrid_vocab_extractor import extract_vocabulary_hybrid
|
||||
logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")
|
||||
|
||||
vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)
|
||||
|
||||
if error:
|
||||
logger.warning(f"Hybrid extraction had issues: {error}")
|
||||
# Fall through to Vision LLM fallback
|
||||
elif vocab_dicts:
|
||||
# Convert dicts to VocabularyEntry objects
|
||||
vocabulary = [
|
||||
VocabularyEntry(
|
||||
id=str(uuid.uuid4()),
|
||||
english=v.get("english", ""),
|
||||
german=v.get("german", ""),
|
||||
example_sentence=v.get("example"),
|
||||
source_page=page_number + 1
|
||||
)
|
||||
for v in vocab_dicts
|
||||
if v.get("english") and v.get("german")
|
||||
]
|
||||
logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
|
||||
return vocabulary, confidence, ""
|
||||
|
||||
except ImportError as e:
|
||||
logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
|
||||
except Exception as e:
|
||||
logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
|
||||
import traceback
|
||||
logger.debug(traceback.format_exc())
|
||||
|
||||
# ==========================================================================
|
||||
# FALLBACK: Vision LLM (Ollama, model taken from OLLAMA_VISION_MODEL)
|
||||
# ==========================================================================
|
||||
logger.info(f"Using VISION LLM extraction for {filename}")
|
||||
|
||||
try:
|
||||
# First check if Ollama is available
|
||||
async with httpx.AsyncClient(timeout=10.0) as check_client:
|
||||
try:
|
||||
health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
|
||||
if health_response.status_code != 200:
|
||||
logger.error(f"Ollama not available at {OLLAMA_URL}")
|
||||
return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
|
||||
except Exception as e:
|
||||
logger.error(f"Ollama health check failed: {e}")
|
||||
return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"
|
||||
|
||||
image_base64 = base64.b64encode(image_data).decode("utf-8")
|
||||
|
||||
payload = {
|
||||
"model": VISION_MODEL,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": VOCAB_EXTRACTION_PROMPT,
|
||||
"images": [image_base64]
|
||||
}
|
||||
],
|
||||
"stream": False,
|
||||
"options": {
|
||||
"temperature": 0.1,
|
||||
"num_predict": 4096,
|
||||
}
|
||||
}
|
||||
|
||||
logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")
|
||||
|
||||
# Increased timeout for Vision models (they can be slow)
|
||||
async with httpx.AsyncClient(timeout=600.0) as client:
|
||||
response = await client.post(
|
||||
f"{OLLAMA_URL}/api/chat",
|
||||
json=payload,
|
||||
timeout=300.0 # 5 minutes per page
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
data = response.json()
|
||||
extracted_text = data.get("message", {}).get("content", "")
|
||||
|
||||
logger.info(f"Ollama response received: {len(extracted_text)} chars")
|
||||
|
||||
# Parse JSON from response
|
||||
vocabulary = parse_vocabulary_json(extracted_text)
|
||||
|
||||
# Set source_page for each entry
|
||||
for v in vocabulary:
|
||||
v.source_page = page_number + 1
|
||||
|
||||
# Estimate confidence
|
||||
confidence = 0.85 if len(vocabulary) > 0 else 0.1
|
||||
|
||||
logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")
|
||||
|
||||
return vocabulary, confidence, ""
|
||||
|
||||
except httpx.TimeoutException:
|
||||
logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
|
||||
return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
|
||||
except Exception as e:
|
||||
logger.error(f"Vocabulary extraction failed for {filename}: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"
|
||||
|
||||
|
||||
def _get_demo_vocabulary() -> List[VocabularyEntry]:
|
||||
"""Return demo vocabulary for testing when Vision LLM is not available."""
|
||||
demo_entries = [
|
||||
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals."},
|
||||
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "That was a great achievement."},
|
||||
{"english": "improve", "german": "verbessern", "example": "I want to improve my English."},
|
||||
{"english": "improvement", "german": "Verbesserung", "example": "There has been a lot of improvement."},
|
||||
{"english": "success", "german": "Erfolg", "example": "The project was a success."},
|
||||
{"english": "successful", "german": "erfolgreich", "example": "She is a successful businesswoman."},
|
||||
{"english": "fail", "german": "scheitern, durchfallen", "example": "Don't be afraid to fail."},
|
||||
{"english": "failure", "german": "Misserfolg, Versagen", "example": "Failure is part of learning."},
|
||||
]
|
||||
return [
|
||||
VocabularyEntry(
|
||||
id=str(uuid.uuid4()),
|
||||
english=e["english"],
|
||||
german=e["german"],
|
||||
example_sentence=e.get("example"),
|
||||
)
|
||||
for e in demo_entries
|
||||
]
|
||||
|
||||
|
||||
def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
|
||||
"""Parse vocabulary JSON from LLM response with robust error handling."""
|
||||
|
||||
def clean_json_string(s: str) -> str:
|
||||
"""Clean a JSON string by removing control characters and fixing common issues."""
|
||||
# Remove control characters except newlines and tabs
|
||||
s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
|
||||
# Replace unescaped newlines within strings with space
|
||||
# This is a simplistic approach - replace actual newlines with escaped ones
|
||||
s = s.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
|
||||
return s
|
||||
|
||||
def try_parse_json(json_str: str) -> dict:
|
||||
"""Try multiple strategies to parse JSON."""
|
||||
# Strategy 1: Direct parse
|
||||
try:
|
||||
return json.loads(json_str)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Strategy 2: Clean and parse
|
||||
try:
|
||||
cleaned = clean_json_string(json_str)
|
||||
return json.loads(cleaned)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Strategy 3: Try to fix common issues
|
||||
try:
|
||||
# Remove trailing commas before } or ]
|
||||
fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
|
||||
# Fix unquoted keys
|
||||
fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
|
||||
return json.loads(fixed)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
try:
|
||||
# Find JSON in response (may have extra text)
|
||||
start = text.find('{')
|
||||
end = text.rfind('}') + 1
|
||||
|
||||
if start == -1 or end == 0:
|
||||
logger.warning("No JSON found in response")
|
||||
return []
|
||||
|
||||
json_str = text[start:end]
|
||||
data = try_parse_json(json_str)
|
||||
|
||||
if data is None:
|
||||
# Strategy 4: Extract vocabulary entries using regex as fallback
|
||||
logger.warning("JSON parsing failed, trying regex extraction")
|
||||
vocabulary = []
|
||||
# Match patterns like {"english": "...", "german": "...", ...}
|
||||
pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
|
||||
matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
|
||||
|
||||
for match in matches:
|
||||
english = match[0].strip() if match[0] else ""
|
||||
german = match[1].strip() if match[1] else ""
|
||||
example = match[2].strip() if len(match) > 2 and match[2] else None
|
||||
|
||||
if english and german:
|
||||
vocab_entry = VocabularyEntry(
|
||||
id=str(uuid.uuid4()),
|
||||
english=english,
|
||||
german=german,
|
||||
example_sentence=example,
|
||||
)
|
||||
vocabulary.append(vocab_entry)
|
||||
|
||||
if vocabulary:
|
||||
logger.info(f"Regex extraction found {len(vocabulary)} entries")
|
||||
return vocabulary
|
||||
|
||||
# Normal JSON parsing succeeded
|
||||
vocabulary = []
|
||||
for i, entry in enumerate(data.get("vocabulary", [])):
|
||||
english = entry.get("english", "").strip()
|
||||
german = entry.get("german", "").strip()
|
||||
|
||||
# Skip entries that look like hallucinations (very long or containing unusual patterns)
|
||||
if len(english) > 100 or len(german) > 200:
|
||||
logger.warning(f"Skipping suspicious entry: {english[:50]}...")
|
||||
continue
|
||||
|
||||
if not english or not german:
|
||||
continue
|
||||
|
||||
vocab_entry = VocabularyEntry(
|
||||
id=str(uuid.uuid4()),
|
||||
english=english,
|
||||
german=german,
|
||||
example_sentence=entry.get("example"),
|
||||
word_type=entry.get("word_type"),
|
||||
)
|
||||
vocabulary.append(vocab_entry)
|
||||
|
||||
return vocabulary
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse vocabulary JSON: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
return []
|
||||
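A minimal usage sketch for this module (the image path and event-loop setup are illustrative; extract_vocabulary_from_image needs a reachable Ollama instance as configured via OLLAMA_URL):

    import asyncio

    async def _demo():
        with open("page_1.png", "rb") as f:          # hypothetical input image
            image_data = f.read()
        entries, confidence, error = await extract_vocabulary_from_image(
            image_data, "page_1.png", page_number=0
        )
        if error:
            print(f"extraction failed: {error}")
        else:
            for e in entries:
                print(e.english, "->", e.german, f"(conf={confidence:.2f})")

    asyncio.run(_demo())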
258
klausur-service/backend/vocab/worksheet/generation.py
Normal file
@@ -0,0 +1,258 @@
|
||||
"""
|
||||
Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.
|
||||
|
||||
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
|
||||
|
||||
Functions:
|
||||
- generate_worksheet_html(): Build HTML for various worksheet types
|
||||
- generate_worksheet_pdf(): Convert HTML to PDF via WeasyPrint
|
||||
- get_pdf_page_count(): Count pages in a PDF (PyMuPDF)
|
||||
- convert_pdf_page_to_image(): Render single PDF page to PNG
|
||||
- convert_pdf_to_images(): Render multiple PDF pages to PNG
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import HTTPException
|
||||
|
||||
from .models import VocabularyEntry, WorksheetType
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Optional dependency: WeasyPrint
|
||||
try:
|
||||
from weasyprint import HTML as _WeasyHTML
|
||||
WEASYPRINT_AVAILABLE = True
|
||||
except (ImportError, OSError):
|
||||
WEASYPRINT_AVAILABLE = False
|
||||
logger.warning("WeasyPrint not available")
|
||||
|
||||
# Optional dependency: PyMuPDF
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
FITZ_AVAILABLE = True
|
||||
except ImportError:
|
||||
FITZ_AVAILABLE = False
|
||||
logger.warning("PyMuPDF (fitz) not available")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Worksheet HTML Generation
|
||||
# =============================================================================
|
||||
|
||||
def generate_worksheet_html(
|
||||
vocabulary: List[VocabularyEntry],
|
||||
worksheet_type: WorksheetType,
|
||||
title: str,
|
||||
show_solutions: bool = False,
|
||||
repetitions: int = 3,
|
||||
line_height: str = "normal"
|
||||
) -> str:
|
||||
"""Generate HTML for a worksheet."""
|
||||
|
||||
# Line height CSS
|
||||
line_heights = {
|
||||
"normal": "2.5em",
|
||||
"large": "3.5em",
|
||||
"extra-large": "4.5em"
|
||||
}
|
||||
lh = line_heights.get(line_height, "2.5em")
|
||||
|
||||
html = f"""<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
@page {{ size: A4; margin: 2cm; }}
|
||||
body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
|
||||
h1 {{ font-size: 24px; margin-bottom: 10px; }}
|
||||
.meta {{ color: #666; margin-bottom: 20px; }}
|
||||
.name-line {{ margin-bottom: 30px; }}
|
||||
.vocab-table {{ width: 100%; border-collapse: collapse; }}
|
||||
.vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
|
||||
.vocab-word {{ width: 40%; font-weight: 500; }}
|
||||
.vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
|
||||
.vocab-answer {{ width: 60%; color: #2563eb; }}
|
||||
.gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
|
||||
.hint {{ color: #666; font-style: italic; font-size: 12px; }}
|
||||
.section {{ margin-top: 30px; }}
|
||||
.section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>{title}</h1>
|
||||
<div class="name-line">Name: _________________________ Datum: _____________</div>
|
||||
"""
|
||||
|
||||
if worksheet_type == WorksheetType.EN_TO_DE:
|
||||
html += '<div class="section"><div class="section-title">Uebersetze ins Deutsche:</div>'
|
||||
html += '<table class="vocab-table">'
|
||||
for entry in vocabulary:
|
||||
if show_solutions:
|
||||
html += f'<tr><td class="vocab-word">{entry.english}</td><td class="vocab-answer">{entry.german}</td></tr>'
|
||||
else:
|
||||
html += f'<tr><td class="vocab-word">{entry.english}</td><td class="vocab-blank"></td></tr>'
|
||||
html += '</table></div>'
|
||||
|
||||
elif worksheet_type == WorksheetType.DE_TO_EN:
|
||||
html += '<div class="section"><div class="section-title">Uebersetze ins Englische:</div>'
|
||||
html += '<table class="vocab-table">'
|
||||
for entry in vocabulary:
|
||||
if show_solutions:
|
||||
html += f'<tr><td class="vocab-word">{entry.german}</td><td class="vocab-answer">{entry.english}</td></tr>'
|
||||
else:
|
||||
html += f'<tr><td class="vocab-word">{entry.german}</td><td class="vocab-blank"></td></tr>'
|
||||
html += '</table></div>'
|
||||
|
||||
elif worksheet_type == WorksheetType.COPY_PRACTICE:
|
||||
html += '<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>'
|
||||
html += '<table class="vocab-table">'
|
||||
for entry in vocabulary:
|
||||
html += f'<tr><td class="vocab-word">{entry.english}</td>'
|
||||
html += '<td class="vocab-blank">'
|
||||
if show_solutions:
|
||||
html += f' {entry.english} ' * repetitions
|
||||
html += '</td></tr>'
|
||||
html += '</table></div>'
|
||||
|
||||
elif worksheet_type == WorksheetType.GAP_FILL:
|
||||
entries_with_examples = [e for e in vocabulary if e.example_sentence]
|
||||
if entries_with_examples:
|
||||
html += '<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>'
|
||||
for i, entry in enumerate(entries_with_examples, 1):
|
||||
# Create gap sentence by removing the English word
|
||||
gap_sentence = entry.example_sentence
|
||||
for word in entry.english.split():
|
||||
if word.lower() in gap_sentence.lower():
|
||||
gap_sentence = gap_sentence.replace(word, '<span class="gap"></span>')
|
||||
gap_sentence = gap_sentence.replace(word.capitalize(), '<span class="gap"></span>')
|
||||
gap_sentence = gap_sentence.replace(word.lower(), '<span class="gap"></span>')
|
||||
break
|
||||
|
||||
html += f'<p>{i}. {gap_sentence}</p>'
|
||||
if show_solutions:
|
||||
html += f'<p class="hint">Loesung: {entry.english}</p>'
|
||||
else:
|
||||
html += f'<p class="hint">({entry.german})</p>'
|
||||
html += '</div>'
|
||||
|
||||
html += '</body></html>'
|
||||
return html
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Worksheet PDF Generation
|
||||
# =============================================================================
|
||||
|
||||
async def generate_worksheet_pdf(html: str) -> bytes:
|
||||
"""Generate PDF from HTML using WeasyPrint."""
|
||||
try:
|
||||
from weasyprint import HTML
|
||||
pdf_bytes = HTML(string=html).write_pdf()
|
||||
return pdf_bytes
|
||||
except ImportError:
|
||||
logger.warning("WeasyPrint not available, returning HTML")
|
||||
return html.encode('utf-8')
|
||||
except Exception as e:
|
||||
logger.error(f"PDF generation failed: {e}")
|
||||
raise
|
||||
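A minimal sketch of how the two generation functions combine (the vocabulary list is illustrative; WorksheetType and VocabularyEntry come from .models above, and the call must run in an async context):

    entries = [
        VocabularyEntry(id="1", english="to improve", german="verbessern",
                        example_sentence="I want to improve my English."),
    ]
    html = generate_worksheet_html(
        entries, WorksheetType.EN_TO_DE, title="Unit 3 Vocabulary",
        show_solutions=False, line_height="large",
    )
    pdf_bytes = await generate_worksheet_pdf(html)   # falls back to HTML bytes without WeasyPrint
    with open("worksheet.pdf", "wb") as f:
        f.write(pdf_bytes)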
|
||||
|
||||
# =============================================================================
|
||||
# PDF Utilities (PyMuPDF)
|
||||
# =============================================================================
|
||||
|
||||
def get_pdf_page_count(pdf_data: bytes) -> int:
|
||||
"""Get the number of pages in a PDF."""
|
||||
try:
|
||||
import fitz
|
||||
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
||||
count = pdf_document.page_count
|
||||
pdf_document.close()
|
||||
return count
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get PDF page count: {e}")
|
||||
return 0
|
||||
|
||||
|
||||
async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
|
||||
"""Convert a specific page of PDF to PNG image using PyMuPDF.
|
||||
|
||||
Args:
|
||||
pdf_data: PDF file as bytes
|
||||
page_number: 0-indexed page number
|
||||
thumbnail: If True, return a smaller thumbnail image
|
||||
"""
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
|
||||
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
||||
|
||||
if pdf_document.page_count == 0:
|
||||
raise ValueError("PDF has no pages")
|
||||
|
||||
if page_number >= pdf_document.page_count:
|
||||
raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_document.page_count} pages)")
|
||||
|
||||
page = pdf_document[page_number]
|
||||
|
||||
# Render page to image
|
||||
# For thumbnails: lower resolution, for OCR: higher resolution
|
||||
zoom = 0.5 if thumbnail else 2.0
|
||||
mat = fitz.Matrix(zoom, zoom)
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
|
||||
png_data = pix.tobytes("png")
|
||||
pdf_document.close()
|
||||
|
||||
logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
|
||||
return png_data
|
||||
|
||||
except ImportError:
|
||||
logger.error("PyMuPDF (fitz) not installed")
|
||||
raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed")
|
||||
except Exception as e:
|
||||
logger.error(f"PDF conversion failed: {e}")
|
||||
raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
|
||||
|
||||
|
||||
async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
|
||||
"""Convert multiple pages of PDF to PNG images.
|
||||
|
||||
Args:
|
||||
pdf_data: PDF file as bytes
|
||||
pages: List of 0-indexed page numbers to convert. If None, convert all pages.
|
||||
"""
|
||||
try:
|
||||
import fitz
|
||||
|
||||
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
||||
|
||||
if pdf_document.page_count == 0:
|
||||
raise ValueError("PDF has no pages")
|
||||
|
||||
# If no pages specified, convert all
|
||||
if pages is None:
|
||||
pages = list(range(pdf_document.page_count))
|
||||
|
||||
images = []
|
||||
zoom = 2.0
|
||||
mat = fitz.Matrix(zoom, zoom)
|
||||
|
||||
for page_num in pages:
|
||||
if page_num < pdf_document.page_count:
|
||||
page = pdf_document[page_num]
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
images.append(pix.tobytes("png"))
|
||||
|
||||
pdf_document.close()
|
||||
logger.info(f"Converted {len(images)} PDF pages to images")
|
||||
return images
|
||||
|
||||
except ImportError:
|
||||
logger.error("PyMuPDF (fitz) not installed")
|
||||
raise HTTPException(status_code=500, detail="PDF conversion not available")
|
||||
except Exception as e:
|
||||
logger.error(f"PDF conversion failed: {e}")
|
||||
raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
|
||||
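A short usage sketch for the PDF helpers above (the file name is illustrative; PyMuPDF must be installed):

    with open("textbook_scan.pdf", "rb") as f:       # hypothetical source PDF
        pdf_data = f.read()

    pages = get_pdf_page_count(pdf_data)             # returns 0 on failure
    thumb_png = await convert_pdf_page_to_image(pdf_data, page_number=0, thumbnail=True)
    ocr_pngs = await convert_pdf_to_images(pdf_data, pages=[0, 1])  # rendered at zoom 2.0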
86
klausur-service/backend/vocab/worksheet/models.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Pydantic models and enums for the Vocab Worksheet API."""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Enums
|
||||
# =============================================================================
|
||||
|
||||
class WorksheetType(str, Enum):
|
||||
EN_TO_DE = "en_to_de" # English -> German translation
|
||||
DE_TO_EN = "de_to_en" # German -> English translation
|
||||
COPY_PRACTICE = "copy" # Write word multiple times
|
||||
GAP_FILL = "gap_fill" # Fill in the blanks
|
||||
COMBINED = "combined" # All types combined
|
||||
|
||||
|
||||
class SessionStatus(str, Enum):
|
||||
PENDING = "pending" # Session created, no upload yet
|
||||
PROCESSING = "processing" # OCR in progress
|
||||
EXTRACTED = "extracted" # Vocabulary extracted, ready to edit
|
||||
COMPLETED = "completed" # Worksheet generated
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Pydantic Models
|
||||
# =============================================================================
|
||||
|
||||
class VocabularyEntry(BaseModel):
|
||||
id: str
|
||||
english: str
|
||||
german: str
|
||||
example_sentence: Optional[str] = None
|
||||
example_sentence_gap: Optional[str] = None # With ___ for gap-fill
|
||||
word_type: Optional[str] = None # noun, verb, adjective, etc.
|
||||
source_page: Optional[int] = None # Page number where entry was found (1-indexed)
|
||||
|
||||
|
||||
class SessionCreate(BaseModel):
|
||||
name: str
|
||||
description: Optional[str] = None
|
||||
source_language: str = "en" # Source language (default English)
|
||||
target_language: str = "de" # Target language (default German)
|
||||
|
||||
|
||||
class SessionResponse(BaseModel):
|
||||
id: str
|
||||
name: str
|
||||
description: Optional[str]
|
||||
source_language: str
|
||||
target_language: str
|
||||
status: str
|
||||
vocabulary_count: int
|
||||
image_path: Optional[str]
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class VocabularyResponse(BaseModel):
|
||||
session_id: str
|
||||
vocabulary: List[VocabularyEntry]
|
||||
extraction_confidence: Optional[float]
|
||||
|
||||
|
||||
class VocabularyUpdate(BaseModel):
|
||||
vocabulary: List[VocabularyEntry]
|
||||
|
||||
|
||||
class WorksheetGenerateRequest(BaseModel):
|
||||
worksheet_types: List[WorksheetType]
|
||||
title: Optional[str] = None
|
||||
include_solutions: bool = True
|
||||
repetitions: int = 3 # For copy practice
|
||||
line_height: str = "normal" # normal, large, extra-large
|
||||
|
||||
|
||||
class WorksheetResponse(BaseModel):
|
||||
id: str
|
||||
session_id: str
|
||||
worksheet_types: List[str]
|
||||
pdf_path: str
|
||||
solution_path: Optional[str]
|
||||
generated_at: datetime
|
||||
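A minimal sketch showing how these models fit together on the request side (all values are illustrative):

    entry = VocabularyEntry(
        id="e1", english="achievement", german="Leistung",
        example_sentence="That was a great achievement.", word_type="n", source_page=1,
    )
    request = WorksheetGenerateRequest(
        worksheet_types=[WorksheetType.EN_TO_DE, WorksheetType.GAP_FILL],
        title="Unit 3", include_solutions=True, line_height="large",
    )
    print(request.worksheet_types[0].value)   # "en_to_de"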
481
klausur-service/backend/vocab/worksheet/ocr.py
Normal file
@@ -0,0 +1,481 @@
|
||||
"""
|
||||
Vocab Worksheet OCR Pipeline — full Kombi OCR pipeline for a single page.
|
||||
|
||||
Extracted from vocab_worksheet_api.py to keep file sizes manageable.
|
||||
|
||||
Pipeline steps:
|
||||
orientation → deskew → dewarp → crop → scan-quality → enhance →
|
||||
dual-engine OCR (RapidOCR + Tesseract) → merge → grid-build →
|
||||
vocab extraction → row merging
|
||||
"""
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional heavy dependencies (not available in every environment)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
try:
|
||||
import cv2
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
np = None # type: ignore[assignment]
|
||||
logger.warning("cv2 / numpy not available — OCR pipeline disabled")
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
Image = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import pytesseract
|
||||
except ImportError:
|
||||
pytesseract = None # type: ignore[assignment]
|
||||
|
||||
# CV pipeline helpers
|
||||
try:
|
||||
from cv_vocab_pipeline import (
|
||||
deskew_two_pass,
|
||||
dewarp_image,
|
||||
detect_and_fix_orientation,
|
||||
_cells_to_vocab_entries,
|
||||
_fix_phonetic_brackets,
|
||||
)
|
||||
except ImportError:
|
||||
deskew_two_pass = None # type: ignore[assignment]
|
||||
dewarp_image = None # type: ignore[assignment]
|
||||
detect_and_fix_orientation = None # type: ignore[assignment]
|
||||
_cells_to_vocab_entries = None # type: ignore[assignment]
|
||||
_fix_phonetic_brackets = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
from cv_cell_grid import (
|
||||
_merge_wrapped_rows,
|
||||
_merge_phonetic_continuation_rows,
|
||||
_merge_continuation_rows,
|
||||
)
|
||||
except ImportError:
|
||||
_merge_wrapped_rows = None # type: ignore[assignment]
|
||||
_merge_phonetic_continuation_rows = None # type: ignore[assignment]
|
||||
_merge_continuation_rows = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
from cv_ocr_engines import ocr_region_rapid
|
||||
except ImportError:
|
||||
ocr_region_rapid = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
from cv_vocab_types import PageRegion
|
||||
except ImportError:
|
||||
PageRegion = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
from ocr_pipeline_ocr_merge import (
|
||||
_split_paddle_multi_words,
|
||||
_merge_paddle_tesseract,
|
||||
_deduplicate_words,
|
||||
)
|
||||
except ImportError:
|
||||
_split_paddle_multi_words = None # type: ignore[assignment]
|
||||
_merge_paddle_tesseract = None # type: ignore[assignment]
|
||||
_deduplicate_words = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
from cv_words_first import build_grid_from_words
|
||||
except ImportError:
|
||||
build_grid_from_words = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
from ocr_pipeline_session_store import (
|
||||
create_session_db as create_pipeline_session_db,
|
||||
update_session_db as update_pipeline_session_db,
|
||||
)
|
||||
except ImportError:
|
||||
create_pipeline_session_db = None # type: ignore[assignment]
|
||||
update_pipeline_session_db = None # type: ignore[assignment]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main pipeline function
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _run_ocr_pipeline_for_page(
|
||||
img_bgr: "np.ndarray",
|
||||
page_number: int,
|
||||
vocab_session_id: str,
|
||||
*,
|
||||
ipa_mode: str = "none",
|
||||
syllable_mode: str = "none",
|
||||
enable_enhance: bool = True,
|
||||
max_columns: Optional[int] = 3,
|
||||
override_min_conf: Optional[int] = None,
|
||||
) -> tuple:
|
||||
"""Run the full Kombi OCR pipeline on a single page and return vocab entries.
|
||||
|
||||
Uses the same pipeline as the admin OCR Kombi pipeline:
|
||||
orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
|
||||
(with pipe-autocorrect, word-gap merge, dictionary detection, etc.)
|
||||
|
||||
Args:
|
||||
img_bgr: BGR numpy array.
|
||||
page_number: 0-indexed page number.
|
||||
vocab_session_id: Vocab session ID for logging.
|
||||
ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
|
||||
syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
enable_enhance: Apply CLAHE/denoise enhancement when the scan is degraded.
max_columns: Maximum column count for grid building (None = unlimited).
override_min_conf: Fixed minimum OCR confidence (None = derive from scan quality).
|
||||
|
||||
Returns (entries, rotation_deg, scan_quality_report): entries is a list of dicts,
rotation_deg is the orientation correction applied (0, 90, 180, 270), and
scan_quality_report is the scan-quality assessment (or None if unavailable).
|
||||
"""
|
||||
import time as _time
|
||||
|
||||
t_total = _time.time()
|
||||
img_h, img_w = img_bgr.shape[:2]
|
||||
logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")
|
||||
|
||||
# 1. Orientation detection (fix upside-down scans)
|
||||
t0 = _time.time()
|
||||
img_bgr, rotation = detect_and_fix_orientation(img_bgr)
|
||||
if rotation:
|
||||
img_h, img_w = img_bgr.shape[:2]
|
||||
logger.info(f" orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
|
||||
else:
|
||||
logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")
|
||||
|
||||
# 2. Create pipeline session in DB (visible in admin Kombi UI)
|
||||
pipeline_session_id = str(uuid.uuid4())
|
||||
try:
|
||||
_, png_buf = cv2.imencode(".png", img_bgr)
|
||||
original_png = png_buf.tobytes()
|
||||
await create_pipeline_session_db(
|
||||
pipeline_session_id,
|
||||
name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
|
||||
filename=f"page_{page_number + 1}.png",
|
||||
original_png=original_png,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not create pipeline session in DB: {e}")
|
||||
|
||||
# 3. Three-pass deskew
|
||||
t0 = _time.time()
|
||||
deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
|
||||
logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")
|
||||
|
||||
# 4. Dewarp
|
||||
t0 = _time.time()
|
||||
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
|
||||
logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
|
||||
|
||||
# 5. Content crop (removes scanner borders, gutter shadows)
|
||||
t0 = _time.time()
|
||||
try:
|
||||
from page_crop import detect_and_crop_page
|
||||
cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
|
||||
if crop_result.get("crop_applied"):
|
||||
dewarped_bgr = cropped_bgr
|
||||
logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
|
||||
else:
|
||||
logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
|
||||
except Exception as e:
|
||||
logger.warning(f" crop: failed ({e}), continuing with uncropped image")
|
||||
|
||||
# 5b. Scan quality assessment
|
||||
scan_quality_report = None
|
||||
try:
|
||||
from scan_quality import score_scan_quality
|
||||
scan_quality_report = score_scan_quality(dewarped_bgr)
|
||||
except Exception as e:
|
||||
logger.warning(f" scan quality: failed ({e})")
|
||||
|
||||
if override_min_conf:
|
||||
min_ocr_conf = override_min_conf
|
||||
else:
|
||||
min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
|
||||
|
||||
# 5c. Image enhancement for degraded scans
|
||||
is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
|
||||
if is_degraded and enable_enhance:
|
||||
try:
|
||||
from ocr_image_enhance import enhance_for_ocr
|
||||
dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
|
||||
logger.info(" enhancement: applied (degraded scan)")
|
||||
except Exception as e:
|
||||
logger.warning(f" enhancement: failed ({e})")
|
||||
|
||||
# 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
|
||||
t0 = _time.time()
|
||||
img_h, img_w = dewarped_bgr.shape[:2]
|
||||
|
||||
# RapidOCR (local ONNX)
|
||||
try:
|
||||
from cv_ocr_engines import ocr_region_rapid
|
||||
from cv_vocab_types import PageRegion
|
||||
full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
|
||||
rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
|
||||
except Exception as e:
|
||||
logger.warning(f" RapidOCR failed: {e}")
|
||||
rapid_words = []
|
||||
|
||||
# Tesseract
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
|
||||
data = pytesseract.image_to_data(
|
||||
pil_img, lang="eng+deu", config="--psm 6 --oem 3",
|
||||
output_type=pytesseract.Output.DICT,
|
||||
)
|
||||
tess_words = []
|
||||
for i in range(len(data["text"])):
|
||||
text = str(data["text"][i]).strip()
|
||||
conf_raw = str(data["conf"][i])
|
||||
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
|
||||
if not text or conf < min_ocr_conf:
|
||||
continue
|
||||
tess_words.append({
|
||||
"text": text,
|
||||
"left": data["left"][i], "top": data["top"][i],
|
||||
"width": data["width"][i], "height": data["height"][i],
|
||||
"conf": conf,
|
||||
})
|
||||
|
||||
# Merge dual-engine results
|
||||
from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
|
||||
from cv_words_first import build_grid_from_words
|
||||
|
||||
rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
|
||||
if rapid_split or tess_words:
|
||||
merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
|
||||
merged_words = _deduplicate_words(merged_words)
|
||||
else:
|
||||
merged_words = tess_words # fallback to Tesseract only
|
||||
|
||||
# Build initial grid from merged words
|
||||
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=max_columns)
|
||||
for cell in cells:
|
||||
cell["ocr_engine"] = "rapid_kombi"
|
||||
|
||||
n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
|
||||
n_cols = len(columns_meta)
|
||||
logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
|
||||
f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
|
||||
|
||||
# 7. Save word_result to pipeline session (needed by _build_grid_core)
|
||||
word_result = {
|
||||
"cells": cells,
|
||||
"grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
|
||||
"columns_used": columns_meta,
|
||||
"layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
|
||||
"image_width": img_w,
|
||||
"image_height": img_h,
|
||||
"duration_seconds": 0,
|
||||
"ocr_engine": "rapid_kombi",
|
||||
"raw_tesseract_words": tess_words,
|
||||
"summary": {
|
||||
"total_cells": len(cells),
|
||||
"non_empty_cells": sum(1 for c in cells if c.get("text")),
|
||||
},
|
||||
}
|
||||
|
||||
# Save images + word_result to pipeline session for admin visibility
|
||||
try:
|
||||
_, dsk_buf = cv2.imencode(".png", deskewed_bgr)
|
||||
_, dwp_buf = cv2.imencode(".png", dewarped_bgr)
|
||||
await update_pipeline_session_db(
|
||||
pipeline_session_id,
|
||||
deskewed_png=dsk_buf.tobytes(),
|
||||
dewarped_png=dwp_buf.tobytes(),
|
||||
cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
|
||||
word_result=word_result,
|
||||
deskew_result={"angle_applied": round(angle_applied, 3)},
|
||||
dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
|
||||
current_step=8,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not update pipeline session: {e}")
|
||||
|
||||
# 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
|
||||
t0 = _time.time()
|
||||
try:
|
||||
from grid_editor_api import _build_grid_core
|
||||
session_data = {
|
||||
"word_result": word_result,
|
||||
}
|
||||
grid_result = await _build_grid_core(
|
||||
pipeline_session_id, session_data,
|
||||
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
|
||||
)
|
||||
logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
|
||||
f"({_time.time() - t0:.1f}s)")
|
||||
|
||||
# Save grid result to pipeline session
|
||||
try:
|
||||
await update_pipeline_session_db(
|
||||
pipeline_session_id,
|
||||
grid_editor_result=grid_result,
|
||||
current_step=11,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
|
||||
grid_result = None
|
||||
|
||||
# 9. Extract vocab entries
|
||||
# Prefer grid-build result (better column detection, more cells) over
|
||||
# the initial build_grid_from_words() which often under-clusters.
|
||||
page_vocabulary = []
|
||||
extraction_source = "none"
|
||||
|
||||
# A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
|
||||
if grid_result and grid_result.get("zones"):
|
||||
for zone in grid_result["zones"]:
|
||||
zone_cols = zone.get("columns", [])
|
||||
zone_cells = zone.get("cells", [])
|
||||
if not zone_cols or not zone_cells:
|
||||
continue
|
||||
|
||||
# Sort columns by x position to determine roles
|
||||
sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))
|
||||
col_idx_to_pos = {}
|
||||
for pos, col in enumerate(sorted_cols):
|
||||
ci = col.get("col_index", col.get("index", -1))
|
||||
col_idx_to_pos[ci] = pos
|
||||
|
||||
# Skip zones with only 1 column (likely headers/boxes)
|
||||
if len(sorted_cols) < 2:
|
||||
continue
|
||||
|
||||
# Group cells by row
|
||||
rows_map: dict = {}
|
||||
for cell in zone_cells:
|
||||
ri = cell.get("row_index", 0)
|
||||
if ri not in rows_map:
|
||||
rows_map[ri] = {}
|
||||
ci = cell.get("col_index", 0)
|
||||
rows_map[ri][ci] = (cell.get("text") or "").strip()
|
||||
|
||||
n_cols = len(sorted_cols)
|
||||
for ri in sorted(rows_map.keys()):
|
||||
row = rows_map[ri]
|
||||
# Collect texts in column-position order
|
||||
texts = []
|
||||
for col in sorted_cols:
|
||||
ci = col.get("col_index", col.get("index", -1))
|
||||
texts.append(row.get(ci, ""))
|
||||
|
||||
if not any(texts):
|
||||
continue
|
||||
|
||||
# Map by position, skipping narrow first column (page refs/markers)
|
||||
# Heuristic: if first column is very narrow (<15% of zone width),
|
||||
# it's likely a marker/ref column — skip it for vocab
|
||||
first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
|
||||
zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
|
||||
skip_first = first_col_width / zone_width < 0.15 and n_cols >= 3
|
||||
|
||||
data_texts = texts[1:] if skip_first else texts
|
||||
|
||||
entry = {
|
||||
"id": str(uuid.uuid4()),
|
||||
"english": data_texts[0] if len(data_texts) > 0 else "",
|
||||
"german": data_texts[1] if len(data_texts) > 1 else "",
|
||||
"example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
|
||||
"source_page": page_number + 1,
|
||||
}
|
||||
if entry["english"] or entry["german"]:
|
||||
page_vocabulary.append(entry)
|
||||
|
||||
if page_vocabulary:
|
||||
extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"
|
||||
|
||||
# B) Fallback: original cells with column classification
|
||||
if not page_vocabulary:
|
||||
col_types = {c.get("type") for c in columns_meta}
|
||||
is_vocab = bool(col_types & {"column_en", "column_de"})
|
||||
|
||||
if is_vocab:
|
||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||
entries = _fix_phonetic_brackets(entries, pronunciation="british")
|
||||
for entry in entries:
|
||||
if not entry.get("english") and not entry.get("german"):
|
||||
continue
|
||||
page_vocabulary.append({
|
||||
"id": str(uuid.uuid4()),
|
||||
"english": entry.get("english", ""),
|
||||
"german": entry.get("german", ""),
|
||||
"example_sentence": entry.get("example", ""),
|
||||
"source_page": page_number + 1,
|
||||
})
|
||||
extraction_source = f"classified ({len(columns_meta)} cols)"
|
||||
else:
|
||||
# Last resort: all cells by position
|
||||
rows_map2: dict = {}
|
||||
for cell in cells:
|
||||
ri = cell.get("row_index", 0)
|
||||
if ri not in rows_map2:
|
||||
rows_map2[ri] = {}
|
||||
ci = cell.get("col_index", 0)
|
||||
rows_map2[ri][ci] = (cell.get("text") or "").strip()
|
||||
all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
|
||||
for ri in sorted(rows_map2.keys()):
|
||||
row = rows_map2[ri]
|
||||
texts = [row.get(ci, "") for ci in all_ci]
|
||||
if not any(texts):
|
||||
continue
|
||||
page_vocabulary.append({
|
||||
"id": str(uuid.uuid4()),
|
||||
"english": texts[0] if len(texts) > 0 else "",
|
||||
"german": texts[1] if len(texts) > 1 else "",
|
||||
"example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
|
||||
"source_page": page_number + 1,
|
||||
})
|
||||
extraction_source = f"generic ({len(all_ci)} cols)"
|
||||
|
||||
# --- Post-processing: merge cell-wrap continuation rows ---
|
||||
if len(page_vocabulary) >= 2:
|
||||
try:
|
||||
# Convert to internal format (example_sentence → example)
|
||||
internal = []
|
||||
for v in page_vocabulary:
|
||||
internal.append({
|
||||
'row_index': len(internal),
|
||||
'english': v.get('english', ''),
|
||||
'german': v.get('german', ''),
|
||||
'example': v.get('example_sentence', ''),
|
||||
})
|
||||
|
||||
n_before = len(internal)
|
||||
internal = _merge_wrapped_rows(internal)
|
||||
internal = _merge_phonetic_continuation_rows(internal)
|
||||
internal = _merge_continuation_rows(internal)
|
||||
|
||||
if len(internal) < n_before:
|
||||
# Rebuild page_vocabulary from merged entries
|
||||
merged_vocab = []
|
||||
for entry in internal:
|
||||
if not entry.get('english') and not entry.get('german'):
|
||||
continue
|
||||
merged_vocab.append({
|
||||
'id': str(uuid.uuid4()),
|
||||
'english': entry.get('english', ''),
|
||||
'german': entry.get('german', ''),
|
||||
'example_sentence': entry.get('example', ''),
|
||||
'source_page': page_number + 1,
|
||||
})
|
||||
logger.info(f" row merging: {n_before} → {len(merged_vocab)} entries")
|
||||
page_vocabulary = merged_vocab
|
||||
except Exception as e:
|
||||
logger.warning(f" row merging failed (non-critical): {e}")
|
||||
|
||||
logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")
|
||||
|
||||
total_duration = _time.time() - t_total
|
||||
logger.info(f"Kombi Pipeline page {page_number + 1}: "
|
||||
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
|
||||
|
||||
return page_vocabulary, rotation, scan_quality_report
|
||||
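For orientation, a sketch of the return value's shape (the concrete values are illustrative; upload_api.py below shows the real call site):

    # entries, rotation_deg, scan_quality_report = await _run_ocr_pipeline_for_page(...)
    #
    # entries is a list of plain dicts, one per detected vocabulary row:
    example_entry = {
        "id": "uuid4-string",
        "english": "to improve",
        "german": "verbessern",
        "example_sentence": "I want to improve my English.",
        "source_page": 1,            # 1-indexed page number
    }
    # rotation_deg is 0, 90, 180 or 270; scan_quality_report may be None.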
490
klausur-service/backend/vocab/worksheet/upload_api.py
Normal file
@@ -0,0 +1,490 @@
|
||||
"""
|
||||
Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
|
||||
|
||||
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
|
||||
|
||||
Routes (no prefix — included into the main /api/v1/vocab router):
|
||||
POST /sessions/{session_id}/upload-pdf-info
|
||||
GET /sessions/{session_id}/pdf-thumbnail/{page_number}
|
||||
GET /sessions/{session_id}/pdf-page-image/{page_number}
|
||||
POST /sessions/{session_id}/process-single-page/{page_number}
|
||||
POST /sessions/{session_id}/process-pages
|
||||
"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from typing import List
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from .models import SessionStatus
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Local storage path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional heavy dependencies
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
|
||||
OCR_PIPELINE_AVAILABLE = True
|
||||
except ImportError:
|
||||
np = None # type: ignore[assignment]
|
||||
OCR_PIPELINE_AVAILABLE = False
|
||||
logger.warning("OCR pipeline imports not available in upload module")
|
||||
|
||||
# Sub-module imports (already split out)
|
||||
from .generation import (
|
||||
convert_pdf_page_to_image,
|
||||
convert_pdf_to_images,
|
||||
get_pdf_page_count,
|
||||
)
|
||||
from .extraction import extract_vocabulary_from_image
|
||||
|
||||
try:
|
||||
from .ocr import _run_ocr_pipeline_for_page
|
||||
except ImportError:
|
||||
_run_ocr_pipeline_for_page = None # type: ignore[assignment]
|
||||
logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# In-memory session store (shared with main module)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get_sessions():
|
||||
from .api import _sessions
|
||||
return _sessions
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Router (no prefix — will be included into the main vocab router)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
upload_router = APIRouter()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# POST /sessions/{session_id}/upload-pdf-info
|
||||
# =============================================================================
|
||||
|
||||
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
|
||||
async def upload_pdf_get_info(
|
||||
session_id: str,
|
||||
file: UploadFile = File(...),
|
||||
):
|
||||
"""
|
||||
Upload a PDF and get page count and thumbnails for preview.
|
||||
Use this before processing to let user select pages.
|
||||
"""
|
||||
logger.info(f"PDF info request for session {session_id}")
|
||||
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
|
||||
# Validate file type
|
||||
extension = file.filename.split('.')[-1].lower() if file.filename else ''
|
||||
content_type = file.content_type or ''
|
||||
|
||||
if extension != 'pdf' and content_type != 'application/pdf':
|
||||
raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")
|
||||
|
||||
content = await file.read()
|
||||
|
||||
# Save PDF temporarily
|
||||
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
|
||||
os.makedirs(session_dir, exist_ok=True)
|
||||
pdf_path = os.path.join(session_dir, "source.pdf")
|
||||
|
||||
with open(pdf_path, 'wb') as f:
|
||||
f.write(content)
|
||||
|
||||
# Get page count
|
||||
page_count = get_pdf_page_count(content)
|
||||
|
||||
# Store PDF data in session for later processing
|
||||
session["pdf_data"] = content
|
||||
session["pdf_path"] = pdf_path
|
||||
session["pdf_page_count"] = page_count
|
||||
session["status"] = "pdf_uploaded"
|
||||
|
||||
# Detect orientation for each page so thumbnails are shown correctly
|
||||
page_rotations: dict = {}
|
||||
if OCR_PIPELINE_AVAILABLE:
|
||||
for pg in range(page_count):
|
||||
try:
|
||||
img_bgr = render_pdf_high_res(content, pg, zoom=2.0)
|
||||
_, rotation = detect_and_fix_orientation(img_bgr)
|
||||
if rotation:
|
||||
page_rotations[pg] = rotation
|
||||
logger.info(f"Page {pg + 1}: orientation {rotation}°")
|
||||
except Exception as e:
|
||||
logger.warning(f"Orientation detection failed for page {pg + 1}: {e}")
|
||||
session["page_rotations"] = page_rotations
|
||||
|
||||
return {
|
||||
"session_id": session_id,
|
||||
"page_count": page_count,
|
||||
"filename": file.filename,
|
||||
"page_rotations": page_rotations,
|
||||
}
|
||||
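A hedged client-side sketch of the upload flow (the base URL and mount point are assumptions based on the /api/v1/vocab router mentioned in the module docstring; adjust to the actual deployment):

    import httpx

    BASE = "http://localhost:8000/api/v1/vocab"      # assumed mount point

    async def upload_and_preview(session_id: str, pdf_path: str):
        async with httpx.AsyncClient() as client:
            with open(pdf_path, "rb") as f:
                r = await client.post(
                    f"{BASE}/sessions/{session_id}/upload-pdf-info",
                    files={"file": ("source.pdf", f, "application/pdf")},
                )
            info = r.json()                          # page_count, page_rotations, filename
            thumb = await client.get(f"{BASE}/sessions/{session_id}/pdf-thumbnail/0")
            return info, thumb.content               # PNG bytes of the first page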
|
||||
|
||||
# =============================================================================
|
||||
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
|
||||
# =============================================================================
|
||||
|
||||
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
|
||||
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
|
||||
"""Get a thumbnail image of a specific PDF page.
|
||||
|
||||
Uses fitz for rendering so that page_rotations (from OCR orientation
|
||||
detection) are applied consistently.
|
||||
|
||||
Args:
|
||||
hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).
|
||||
"""
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
pdf_data = session.get("pdf_data")
|
||||
|
||||
if not pdf_data:
|
||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||
|
||||
try:
|
||||
import fitz
|
||||
zoom = 2.0 if hires else 0.5
|
||||
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
||||
page = pdf_document[page_number]
|
||||
# Apply orientation correction detected during OCR processing
|
||||
rot = session.get("page_rotations", {}).get(page_number, 0)
|
||||
if rot:
|
||||
page.set_rotation(rot)
|
||||
mat = fitz.Matrix(zoom, zoom)
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
png_data = pix.tobytes("png")
|
||||
pdf_document.close()
|
||||
except Exception as e:
|
||||
logger.error(f"PDF thumbnail failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
|
||||
|
||||
return StreamingResponse(
|
||||
io.BytesIO(png_data),
|
||||
media_type="image/png",
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# GET /sessions/{session_id}/pdf-page-image/{page_number}
|
||||
# =============================================================================
|
||||
|
||||
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
|
||||
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
|
||||
"""PDF page as PNG at arbitrary resolution (for editor view).
|
||||
|
||||
Args:
|
||||
zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).
|
||||
"""
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
pdf_data = session.get("pdf_data")
|
||||
|
||||
if not pdf_data:
|
||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||
|
||||
page_count = session.get("pdf_page_count", 1)
|
||||
if page_number < 0 or page_number >= page_count:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
||||
|
||||
try:
|
||||
import fitz
|
||||
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
||||
page = pdf_document[page_number]
|
||||
# Apply orientation correction detected during OCR processing
|
||||
rot = session.get("page_rotations", {}).get(page_number, 0)
|
||||
if rot:
|
||||
page.set_rotation(rot)
|
||||
mat = fitz.Matrix(zoom, zoom)
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
png_data = pix.tobytes("png")
|
||||
pdf_document.close()
|
||||
logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
|
||||
except Exception as e:
|
||||
logger.error(f"PDF page image failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
|
||||
|
||||
return StreamingResponse(
|
||||
io.BytesIO(png_data),
|
||||
media_type="image/png",
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# POST /sessions/{session_id}/process-single-page/{page_number}
|
||||
# =============================================================================
|
||||
|
||||
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
|
||||
async def process_single_page(
|
||||
session_id: str,
|
||||
page_number: int,
|
||||
ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
|
||||
syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
|
||||
enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
|
||||
max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
|
||||
min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
|
||||
):
|
||||
"""
|
||||
Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.
|
||||
|
||||
Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
|
||||
dual-engine OCR -> grid-build with autocorrect/merge) for best quality.
|
||||
|
||||
Query params:
|
||||
ipa_mode: "none" (default), "auto", "all", "en", "de"
|
||||
syllable_mode: "none" (default), "auto", "all", "en", "de"
|
||||
enhance: true (default) -- apply CLAHE/denoise for degraded scans
|
||||
max_cols: 3 (default) -- max column count (0=unlimited)
|
||||
min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)
|
||||
|
||||
The frontend should call this sequentially for each page.
|
||||
Returns the vocabulary for just this one page.
|
||||
"""
|
||||
logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")
|
||||
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
|
||||
)
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
pdf_data = session.get("pdf_data")
|
||||
|
||||
if not pdf_data:
|
||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||
|
||||
page_count = session.get("pdf_page_count", 1)
|
||||
|
||||
if page_number < 0 or page_number >= page_count:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
||||
|
||||
# Derive pipeline-level variable names for the quality report
|
||||
enable_enhance = enhance
|
||||
max_columns = max_cols if max_cols > 0 else None
|
||||
override_min_conf = min_conf if min_conf > 0 else None
|
||||
|
||||
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
|
||||
rotation_deg = 0
|
||||
quality_report = None
|
||||
min_ocr_conf = 40 # default; overridden by pipeline when quality report is available
|
||||
if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
|
||||
try:
|
||||
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
|
||||
page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
|
||||
img_bgr, page_number, session_id,
|
||||
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
|
||||
enable_enhance=enable_enhance,
|
||||
max_columns=max_columns,
|
||||
override_min_conf=override_min_conf,
|
||||
)
|
||||
# Update min_ocr_conf from quality report if available
|
||||
if quality_report and hasattr(quality_report, 'recommended_min_conf'):
|
||||
min_ocr_conf = quality_report.recommended_min_conf
|
||||
except Exception as e:
|
||||
logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
|
||||
return {
|
||||
"session_id": session_id,
|
||||
"page_number": page_number + 1,
|
||||
"success": False,
|
||||
"error": f"OCR pipeline error: {e}",
|
||||
"vocabulary": [],
|
||||
"vocabulary_count": 0,
|
||||
}
|
||||
else:
|
||||
# Fallback to LLM vision extraction
|
||||
logger.warning("OCR pipeline not available, falling back to LLM vision")
|
||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||
vocabulary, confidence, error = await extract_vocabulary_from_image(
|
||||
image_data,
|
||||
f"page_{page_number + 1}.png",
|
||||
page_number=page_number
|
||||
)
|
||||
if error:
|
||||
logger.warning(f"Page {page_number + 1} failed: {error}")
|
||||
return {
|
||||
"session_id": session_id,
|
||||
"page_number": page_number + 1,
|
||||
"success": False,
|
||||
"error": error,
|
||||
"vocabulary": [],
|
||||
"vocabulary_count": 0,
|
||||
}
|
||||
page_vocabulary = []
|
||||
for entry in vocabulary:
|
||||
entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
|
||||
entry_dict['source_page'] = page_number + 1
|
||||
if 'id' not in entry_dict or not entry_dict['id']:
|
||||
entry_dict['id'] = str(uuid.uuid4())
|
||||
page_vocabulary.append(entry_dict)
|
||||
|
||||
logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")
|
||||
|
||||
# Store rotation for this page (used by image/thumbnail endpoints)
|
||||
session.setdefault("page_rotations", {})[page_number] = rotation_deg
|
||||
|
||||
# Add to session's vocabulary (append, don't replace)
|
||||
existing_vocab = session.get("vocabulary", [])
|
||||
# Remove any existing entries from this page (in case of re-processing)
|
||||
existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
|
||||
existing_vocab.extend(page_vocabulary)
|
||||
session["vocabulary"] = existing_vocab
|
||||
session["vocabulary_count"] = len(existing_vocab)
|
||||
session["status"] = SessionStatus.EXTRACTED.value
|
||||
|
||||
result = {
|
||||
"session_id": session_id,
|
||||
"page_number": page_number + 1,
|
||||
"success": True,
|
||||
"vocabulary": page_vocabulary,
|
||||
"vocabulary_count": len(page_vocabulary),
|
||||
"total_vocabulary_count": len(existing_vocab),
|
||||
"extraction_confidence": 0.9,
|
||||
"rotation": rotation_deg,
|
||||
}
|
||||
|
||||
# Add scan quality report + active steps info
|
||||
if quality_report:
|
||||
sq = quality_report.to_dict()
|
||||
sq["active_steps"] = {
|
||||
"step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
|
||||
"step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
|
||||
"step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
|
||||
}
|
||||
result["scan_quality"] = sq
|
||||
|
||||
return result
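

# Illustrative driver sketch (not part of this module): processing every page of
# an uploaded PDF sequentially via /process-single-page, as the docstring above
# recommends. The base URL and the use of "httpx" are assumptions.
def _example_process_all_pages(base_url: str, session_id: str, page_count: int) -> list:
    """Call /process-single-page for each page in order and collect the results."""
    import httpx  # local import so this sketch adds no hard dependency

    results = []
    for page_number in range(page_count):
        resp = httpx.post(
            f"{base_url}/sessions/{session_id}/process-single-page/{page_number}",
            params={"ipa_mode": "none", "syllable_mode": "none", "enhance": True},
            timeout=300.0,  # OCR of a single page can take a while
        )
        resp.raise_for_status()
        results.append(resp.json())
    return results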


# =============================================================================
# POST /sessions/{session_id}/process-pages (DEPRECATED)
# =============================================================================

@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: List[int] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        pages: List of 0-indexed page numbers to process
        process_all: If True, process all pages
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif pages is None or len(pages) == 0:
        pages = [0]  # Default to first page

    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    for i, image_data in enumerate(images):
        page_num = pages[i]
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")

        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )

        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence

            # Add page info to each entry and convert to dict
            for entry in vocabulary:
                entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)

            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Update session
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value

    # Save first page as preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    if error_messages:
        result["errors"] = error_messages

    return result