breakpilot-lehrer/klausur-service/backend/vocab_worksheet_api.py

"""
Vocabulary Worksheet API - Extract vocabulary from textbook pages and generate worksheets.

PRIVACY (DATENSCHUTZ):
- All processing happens locally (Mac Mini running Ollama)
- No data is sent to external servers
- GDPR (DSGVO) compliant for school environments

Workflow:
1. POST /sessions - Create a vocabulary extraction session
2. POST /sessions/{id}/upload - Upload textbook page image
3. GET /sessions/{id}/vocabulary - Get extracted vocabulary
4. PUT /sessions/{id}/vocabulary - Edit vocabulary (corrections)
5. POST /sessions/{id}/generate - Generate worksheet PDF
6. GET /worksheets/{id}/pdf - Download generated PDF
"""

from fastapi import APIRouter, Body, HTTPException, UploadFile, File, Form, Query
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional, List, Dict, Any
from datetime import datetime
from enum import Enum
import uuid
import os
import io
import json
import base64
import logging

logger = logging.getLogger(__name__)

# Ollama configuration - the API is called directly over HTTP, without external
# client modules. Both values can be overridden through environment variables.
# OLLAMA_URL is the base URL that "/api/tags" and "/api/chat" are appended to.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Name of the vision model sent in the "model" field of the chat request.
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")

# Try to import MinIO storage
try:
    from minio_storage import upload_to_minio, get_from_minio
    MINIO_AVAILABLE = True
except ImportError:
    MINIO_AVAILABLE = False
    logger.warning("MinIO storage not available, using local storage")

# Try to import Tesseract extractor
try:
    from tesseract_vocab_extractor import (
        extract_bounding_boxes, run_tesseract_pipeline,
        match_positions_to_vocab, TESSERACT_AVAILABLE,
    )
except ImportError:
    TESSERACT_AVAILABLE = False
    logger.warning("Tesseract extractor not available")

# Try to import CV Pipeline
try:
    from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
except ImportError:
    CV_PIPELINE_AVAILABLE = False
    logger.warning("CV vocab pipeline not available")

# Try to import OCR Pipeline functions (for process-single-page)
try:
    import cv2
    import numpy as np
    from cv_vocab_pipeline import (
        deskew_image, deskew_image_by_word_alignment, deskew_image_iterative,
        deskew_two_pass,
        dewarp_image, create_ocr_image,
        detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
        detect_row_geometry, build_cell_grid_v2,
        _cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
        _split_broad_columns,
        expand_narrow_columns, positional_column_regions, llm_review_entries,
        detect_and_fix_orientation,
        _fix_phonetic_brackets,
        render_pdf_high_res,
        PageRegion, RowGeometry,
    )
    from ocr_pipeline_session_store import (
        create_session_db as create_pipeline_session_db,
        update_session_db as update_pipeline_session_db,
    )
    OCR_PIPELINE_AVAILABLE = True
except ImportError as _ocr_pipe_err:
    OCR_PIPELINE_AVAILABLE = False
    logger.warning(f"OCR Pipeline functions not available: {_ocr_pipe_err}")

# Try to import Grid Detection Service
try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
except ImportError:
    GRID_SERVICE_AVAILABLE = False
    logger.warning("Grid Detection Service not available")

# Database integration (used by main.py lifespan)
try:
    from vocab_session_store import (
        DATABASE_URL, get_pool, init_vocab_tables,
        list_sessions_db, get_session_db,
    )
except ImportError:
    DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
    get_pool = None
    init_vocab_tables = None
    list_sessions_db = None
    get_session_db = None

# Database connection pool; stays None until set_db_pool() is called.
_db_pool = None


def set_db_pool(pool):
    """Register the database connection pool for this module.

    Called from the main.py lifespan hook. The pool is stored in the
    module-level ``_db_pool`` variable, replacing any previous value.
    """
    globals()["_db_pool"] = pool


async def _init_vocab_table():
    """Initialize the vocab tables in the database when a session store is available.

    ``init_vocab_tables`` is ``None`` when ``vocab_session_store`` could not be
    imported; in that case nothing is initialized and this is logged as such
    (previously the function claimed the table was ready even though no
    initialization had run).

    Initialization errors are logged as warnings and never raised, so startup
    continues without the database-backed cache.
    """
    if not init_vocab_tables:
        logger.info("vocab_session_store not available - skipping vocab table initialization")
        return

    try:
        await init_vocab_tables()
    except Exception as e:
        logger.warning(f"Failed to init vocab tables: {e}")
    else:
        logger.info("vocab_session_cache table ready")


async def _load_all_sessions():
    """Copy vocab sessions from the database into the in-memory ``_sessions`` cache.

    Up to 500 rows are requested from ``list_sessions_db``. Rows without an id
    and rows whose id is already cached are skipped. Any failure is logged as
    a warning and swallowed.
    """
    if not list_sessions_db:
        logger.info("Loaded 0 vocab sessions from database")
        return

    try:
        loaded = 0
        for row in await list_sessions_db(limit=500):
            # The id may be stored under either key.
            session_id = row.get("id") or row.get("session_id")
            if not session_id or session_id in _sessions:
                continue

            _sessions[session_id] = {
                "id": session_id,
                "name": row.get("name", ""),
                "description": row.get("description", ""),
                "status": row.get("status", "created"),
                "vocabulary_count": row.get("vocabulary_count", 0),
                "source_language": row.get("source_language", "en"),
                "target_language": row.get("target_language", "de"),
                # Stored as text here, unlike the datetime set by create_session().
                "created_at": str(row.get("created_at", "")),
            }
            loaded += 1

        logger.info(f"Loaded {loaded} vocab sessions from database")
    except Exception as e:
        logger.warning(f"Failed to load sessions from database: {e}")


# Router for all endpoints in this module; every route is served under /api/v1/vocab.
router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])

# Local storage path: root directory under which create_session() makes one
# sub-directory per session. Overridable via the VOCAB_STORAGE_PATH env variable.
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")


# =============================================================================
# Enums and Pydantic Models
# =============================================================================

class WorksheetType(str, Enum):
    # Kinds of worksheet that can be requested; the string values are the
    # identifiers used on the wire.

    # Translate English -> German
    EN_TO_DE = "en_to_de"
    # Translate German -> English
    DE_TO_EN = "de_to_en"
    # Write each word several times
    COPY_PRACTICE = "copy"
    # Fill in the blanks of an example sentence
    GAP_FILL = "gap_fill"
    # All worksheet types combined
    COMBINED = "combined"


class SessionStatus(str, Enum):
    # Lifecycle states of a vocabulary extraction session.

    # Session created, nothing uploaded yet
    PENDING = "pending"
    # OCR in progress
    PROCESSING = "processing"
    # Vocabulary extracted and ready to be edited
    EXTRACTED = "extracted"
    # Worksheet has been generated
    COMPLETED = "completed"


class VocabularyEntry(BaseModel):
    # One vocabulary item: an English term, its German translation and
    # optional extras. The extraction helpers in this module create entries
    # with a fresh uuid4 string as id.
    id: str
    english: str
    german: str
    example_sentence: Optional[str] = None
    example_sentence_gap: Optional[str] = None  # With ___ for gap-fill
    word_type: Optional[str] = None  # noun, verb, adjective, etc.
    source_page: Optional[int] = None  # Page number where entry was found (1-indexed)


class SessionCreate(BaseModel):
    # Request body of POST /sessions (see create_session).
    name: str
    description: Optional[str] = None
    source_language: str = "en"  # Source language (default English)
    target_language: str = "de"  # Target language (default German)


class SessionResponse(BaseModel):
    # Session summary returned by the create/list/get session endpoints.
    id: str
    name: str
    description: Optional[str]
    source_language: str
    target_language: str
    # Plain string; the endpoints in this module fill it from SessionStatus
    # values or from whatever status the stored session carries.
    status: str
    vocabulary_count: int
    image_path: Optional[str]
    created_at: datetime


class VocabularyResponse(BaseModel):
    # Vocabulary list of one session plus an optional extraction confidence.
    # NOTE(review): presumably the response of GET /sessions/{id}/vocabulary -
    # the endpoint is not visible in this part of the file; confirm.
    session_id: str
    vocabulary: List[VocabularyEntry]
    extraction_confidence: Optional[float]


class VocabularyUpdate(BaseModel):
    # Full replacement list of vocabulary entries.
    # NOTE(review): presumably the body of PUT /sessions/{id}/vocabulary - confirm.
    vocabulary: List[VocabularyEntry]


class WorksheetGenerateRequest(BaseModel):
    # Options for worksheet generation. repetitions and line_height carry the
    # same names and defaults as the matching parameters of
    # generate_worksheet_html(), which knows the line heights "normal",
    # "large" and "extra-large".
    worksheet_types: List[WorksheetType]
    title: Optional[str] = None
    include_solutions: bool = True
    repetitions: int = 3  # For copy practice
    line_height: str = "normal"  # normal, large, extra-large


class WorksheetResponse(BaseModel):
    # Metadata describing a generated worksheet.
    # NOTE(review): presumably returned by POST /sessions/{id}/generate - the
    # endpoint is not visible in this part of the file; confirm.
    id: str
    session_id: str
    worksheet_types: List[str]
    pdf_path: str
    solution_path: Optional[str]
    generated_at: datetime


# =============================================================================
# In-Memory Storage (simplified - should use PostgreSQL in production)
# =============================================================================

# Session storage: process-local dicts keyed by id. _sessions maps a session id
# to the session dict built in create_session() / _load_all_sessions().
_sessions: Dict[str, Dict[str, Any]] = {}
# NOTE(review): presumably keyed by worksheet id - not populated in the visible part of the file.
_worksheets: Dict[str, Dict[str, Any]] = {}


# =============================================================================
# Vision LLM Vocabulary Extraction
# =============================================================================

# German-language prompt sent as the user message to the vision model in
# extract_vocabulary_from_image(). It asks for a JSON object with a
# "vocabulary" list, which parse_vocabulary_json() then reads ("english",
# "german", "example", "word_type"). Note that rule 6 requests "word_type"
# although the first schema example does not show that key.
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.

AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:

{
  "vocabulary": [
    {
      "english": "to improve",
      "german": "verbessern",
      "example": "I want to improve my English."
    }
  ]
}

REGELN:
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
2. Behalte die exakte Schreibweise bei
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
5. Gib NUR valides JSON zurueck, keine Erklaerungen
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"

Beispiel-Output:
{
  "vocabulary": [
    {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
    {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
  ]
}"""


async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
    """
    Extract vocabulary from a page image via the Vision LLM (default) or the hybrid extractor.

    The hybrid path (PaddleOCR + LLM) only runs when ``use_hybrid`` is True. It
    returns directly when it yields entries without an error; in every other
    case (error reported, empty result, import failure, exception) execution
    continues with the Vision LLM path.

    The Vision LLM path first checks that Ollama answers on ``/api/tags`` and
    then posts the base64-encoded image together with
    ``VOCAB_EXTRACTION_PROMPT`` to ``/api/chat``. The answer text is parsed by
    ``parse_vocabulary_json``.

    This function does not raise on extraction problems; failures are reported
    through the returned error message.

    Args:
        image_data: Image bytes
        filename: Original filename, used for logging only
        page_number: 0-indexed page number; reported 1-indexed in error
                     messages and in ``source_page``
        use_hybrid: If True, try the hybrid extractor first and fall back to
                    the Vision LLM. If False (default), use the Vision LLM only.

    Returns:
        Tuple of (vocabulary_entries, confidence, error_message).
        error_message is an empty string on success. On failure the entry
        list is empty and the confidence is 0.0.
    """
    import httpx

    # ==========================================================================
    # HYBRID APPROACH (opt-in, disabled by default): PaddleOCR + LLM Gateway
    # ==========================================================================
    if use_hybrid:
        try:
            # Imported lazily so a missing extractor only affects this branch.
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")

            vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)

            if error:
                logger.warning(f"Hybrid extraction had issues: {error}")
                # Fall through to Vision LLM fallback
            elif vocab_dicts:
                # Convert dicts to VocabularyEntry objects; entries lacking
                # either language are dropped.
                vocabulary = [
                    VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=v.get("english", ""),
                        german=v.get("german", ""),
                        example_sentence=v.get("example"),
                        source_page=page_number + 1
                    )
                    for v in vocab_dicts
                    if v.get("english") and v.get("german")
                ]
                logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
                return vocabulary, confidence, ""
            # No error but also no entries: continue with the Vision LLM below.

        except ImportError as e:
            logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
        except Exception as e:
            logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())

    # ==========================================================================
    # VISION LLM (default path and hybrid fallback): Ollama model from VISION_MODEL
    # ==========================================================================
    logger.info(f"Using VISION LLM extraction for {filename}")

    try:
        # First check if Ollama is available (short 10 s timeout so an
        # unreachable server is reported quickly).
        async with httpx.AsyncClient(timeout=10.0) as check_client:
            try:
                health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
                if health_response.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
            except Exception as e:
                logger.error(f"Ollama health check failed: {e}")
                return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"

        image_base64 = base64.b64encode(image_data).decode("utf-8")

        # Non-streaming chat request: one user message carrying the prompt and
        # the image, low temperature, output capped at 4096 tokens.
        payload = {
            "model": VISION_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": VOCAB_EXTRACTION_PROMPT,
                    "images": [image_base64]
                }
            ],
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 4096,
            }
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # Increased timeout for Vision models (they can be slow).
        # NOTE(review): the client is created with 600 s but the request passes
        # timeout=300.0; the per-request value presumably takes precedence, so
        # the effective limit would be 5 minutes - confirm against httpx docs.
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/chat",
                json=payload,
                timeout=300.0  # 5 minutes per page
            )
            response.raise_for_status()

            data = response.json()
            extracted_text = data.get("message", {}).get("content", "")

        logger.info(f"Ollama response received: {len(extracted_text)} chars")

        # Parse JSON from response
        vocabulary = parse_vocabulary_json(extracted_text)

        # Set source_page for each entry (1-indexed)
        for v in vocabulary:
            v.source_page = page_number + 1

        # Estimate confidence: a fixed heuristic, not a value reported by the model
        confidence = 0.85 if len(vocabulary) > 0 else 0.1

        logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")

        return vocabulary, confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as e:
        logger.error(f"Vocabulary extraction failed for {filename}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        # Only the first 50 characters of the exception text reach the caller.
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"


def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Build a fixed set of sample entries for testing without the Vision LLM.

    Each call returns new ``VocabularyEntry`` objects with freshly generated ids.
    """
    # (english, german, example sentence)
    samples = (
        ("to achieve", "erreichen, erzielen", "She achieved her goals."),
        ("achievement", "Leistung, Errungenschaft", "That was a great achievement."),
        ("improve", "verbessern", "I want to improve my English."),
        ("improvement", "Verbesserung", "There has been a lot of improvement."),
        ("success", "Erfolg", "The project was a success."),
        ("successful", "erfolgreich", "She is a successful businesswoman."),
        ("fail", "scheitern, durchfallen", "Don't be afraid to fail."),
        ("failure", "Misserfolg, Versagen", "Failure is part of learning."),
    )

    demo_vocabulary = []
    for english, german, example in samples:
        demo_vocabulary.append(
            VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=example,
            )
        )
    return demo_vocabulary


def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary entries out of an LLM response, tolerating malformed JSON.

    The text between the first ``{`` and the last ``}`` is parsed with
    increasingly lenient strategies. If all of them fail, entries are pulled
    out of the raw text with a regular expression (useful for truncated
    output). Both paths share the same validation:

    * non-dict entries and entries without both ``english`` and ``german``
      are skipped (``null`` values no longer abort the whole parse);
    * entries with an English text over 100 or a German text over 200
      characters are treated as hallucinations and skipped.

    Args:
        text: Raw response text of the model; may contain prose around the JSON.

    Returns:
        The parsed entries (``source_page`` is left unset). An empty list when
        no JSON is found or parsing fails; this function never raises.
    """
    import re

    max_english_len = 100
    max_german_len = 200

    def as_text(value: Any) -> str:
        """Turn a JSON scalar into stripped text; null and containers become ''."""
        if value is None or isinstance(value, (dict, list, bool)):
            return ""
        return str(value).strip()

    def try_parse_json(json_str: str) -> Optional[Any]:
        """Try multiple strategies to parse JSON; return None if all fail."""
        # Strategy 1: Direct (strict) parse
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # Strategy 2: Drop stray control characters and let the decoder accept
        # raw newlines/tabs inside string values (strict=False). Escaping every
        # newline by hand would also hit the structural newlines between
        # tokens and make the document unparseable.
        cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', json_str)
        try:
            return json.loads(cleaned, strict=False)
        except json.JSONDecodeError:
            pass

        # Strategy 3: Try to fix common issues
        # Remove trailing commas before } or ]
        fixed = re.sub(r',(\s*[}\]])', r'\1', cleaned)
        # Fix unquoted keys
        fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
        try:
            return json.loads(fixed, strict=False)
        except json.JSONDecodeError:
            return None

    try:
        if not text:
            logger.warning("No JSON found in response")
            return []

        # Find JSON in response (may have extra text)
        start = text.find('{')
        end = text.rfind('}') + 1

        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        data = try_parse_json(text[start:end])
        used_regex = data is None

        if used_regex:
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
            raw_entries = [
                {
                    "english": match[0],
                    "german": match[1],
                    "example": match[2].strip() if match[2] else None,
                }
                for match in re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
            ]
        else:
            raw_entries = data.get("vocabulary") if isinstance(data, dict) else None
            if not isinstance(raw_entries, list):
                # Missing key, "vocabulary": null, or an unexpected shape.
                raw_entries = []

        vocabulary = []
        for entry in raw_entries:
            if not isinstance(entry, dict):
                continue

            english = as_text(entry.get("english"))
            german = as_text(entry.get("german"))

            # Skip entries that look like hallucinations (very long texts)
            if len(english) > max_english_len or len(german) > max_german_len:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue

            if not english or not german:
                continue

            example = entry.get("example")
            word_type = entry.get("word_type")
            vocabulary.append(
                VocabularyEntry(
                    id=str(uuid.uuid4()),
                    english=english,
                    german=german,
                    # Only pass real strings on; anything else would fail
                    # model validation and discard the whole page.
                    example_sentence=example if isinstance(example, str) else None,
                    word_type=word_type if isinstance(word_type, str) else None,
                )
            )

        if used_regex and vocabulary:
            logger.info(f"Regex extraction found {len(vocabulary)} entries")
        return vocabulary

    except Exception as e:
        logger.error(f"Failed to parse vocabulary JSON: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []


# =============================================================================
# Worksheet PDF Generation
# =============================================================================

def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
) -> str:
    """Generate the HTML document for one worksheet.

    All text taken from the vocabulary and the title is HTML-escaped, so
    characters such as ``<`` or ``&`` in OCR/LLM output cannot break or inject
    markup. Gaps for the gap-fill exercise are produced in a single
    case-insensitive pass, which keeps the inserted ``<span class="gap">``
    markup from being rewritten when the vocabulary word itself is e.g.
    "class" or "gap".

    Args:
        vocabulary: Entries to put on the worksheet.
        worksheet_type: Exercise to render. EN_TO_DE, DE_TO_EN, COPY_PRACTICE
            and GAP_FILL produce a section; any other value (e.g. COMBINED)
            yields only the page header.
        title: Heading of the worksheet.
        show_solutions: Render answers instead of blanks.
        repetitions: For copy practice with solutions, how often the word is repeated.
        line_height: "normal", "large" or "extra-large"; unknown values fall
            back to the "normal" spacing.

    Returns:
        A complete HTML document as a string.
    """
    import re
    from html import escape

    gap_markup = '<span class="gap"></span>'

    # Line height CSS
    line_heights = {
        "normal": "2.5em",
        "large": "3.5em",
        "extra-large": "4.5em"
    }
    lh = line_heights.get(line_height, "2.5em")

    def make_gap_sentence(sentence: str, english: str) -> str:
        """Blank out the first word of *english* that occurs in *sentence*."""
        for word in english.split():
            matcher = re.compile(re.escape(word), re.IGNORECASE)
            if matcher.search(sentence):
                # Escape the text around the matches, then join with the gap
                # markup so the markup itself is never escaped or re-matched.
                return gap_markup.join(escape(piece) for piece in matcher.split(sentence))
        return escape(sentence)

    def translation_rows(prompt_attr: str, answer_attr: str) -> List[str]:
        """Build the table rows for a translation exercise."""
        rows = []
        for entry in vocabulary:
            prompt = escape(getattr(entry, prompt_attr))
            if show_solutions:
                answer = escape(getattr(entry, answer_attr))
                rows.append(f'<tr><td class="vocab-word">{prompt}</td><td class="vocab-answer">{answer}</td></tr>')
            else:
                rows.append(f'<tr><td class="vocab-word">{prompt}</td><td class="vocab-blank"></td></tr>')
        return rows

    # str() keeps the previous rendering for non-string titles (e.g. None).
    parts = [f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <style>
        @page {{ size: A4; margin: 2cm; }}
        body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
        h1 {{ font-size: 24px; margin-bottom: 10px; }}
        .meta {{ color: #666; margin-bottom: 20px; }}
        .name-line {{ margin-bottom: 30px; }}
        .vocab-table {{ width: 100%; border-collapse: collapse; }}
        .vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
        .vocab-word {{ width: 40%; font-weight: 500; }}
        .vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
        .vocab-answer {{ width: 60%; color: #2563eb; }}
        .gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
        .hint {{ color: #666; font-style: italic; font-size: 12px; }}
        .section {{ margin-top: 30px; }}
        .section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
    </style>
</head>
<body>
    <h1>{escape(str(title))}</h1>
    <div class="name-line">Name: _________________________ Datum: _____________</div>
"""]

    if worksheet_type == WorksheetType.EN_TO_DE:
        parts.append('<div class="section"><div class="section-title">Uebersetze ins Deutsche:</div>')
        parts.append('<table class="vocab-table">')
        parts.extend(translation_rows("english", "german"))
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.DE_TO_EN:
        parts.append('<div class="section"><div class="section-title">Uebersetze ins Englische:</div>')
        parts.append('<table class="vocab-table">')
        parts.extend(translation_rows("german", "english"))
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        parts.append('<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            english = escape(entry.english)
            parts.append(f'<tr><td class="vocab-word">{english}</td>')
            parts.append('<td class="vocab-blank">')
            if show_solutions:
                parts.append(f' {english} ' * repetitions)
            parts.append('</td></tr>')
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.GAP_FILL:
        # Only entries that come with an example sentence can be used here.
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            parts.append('<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>')
            for i, entry in enumerate(entries_with_examples, 1):
                gap_sentence = make_gap_sentence(entry.example_sentence, entry.english)
                parts.append(f'<p>{i}. {gap_sentence}</p>')
                if show_solutions:
                    parts.append(f'<p class="hint">Loesung: {escape(entry.english)}</p>')
                else:
                    parts.append(f'<p class="hint">({escape(entry.german)})</p>')
            parts.append('</div>')

    parts.append('</body></html>')
    return "".join(parts)


async def generate_worksheet_pdf(html: str) -> bytes:
    """Render an HTML document to PDF bytes with WeasyPrint.

    When WeasyPrint cannot be imported, the UTF-8 encoded HTML is returned
    instead of a PDF. Any other failure is logged and re-raised.
    """
    try:
        from weasyprint import HTML as HtmlDocument
        rendered = HtmlDocument(string=html).write_pdf()
        return rendered
    except ImportError:
        logger.warning("WeasyPrint not available, returning HTML")
        fallback = html.encode('utf-8')
        return fallback
    except Exception as e:
        logger.error(f"PDF generation failed: {e}")
        raise


# =============================================================================
# API Endpoints
# =============================================================================

@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Create a new vocabulary extraction session.

    Registers the session in the in-memory store with status "pending" and an
    empty vocabulary, creates its directory below ``LOCAL_STORAGE_PATH`` and
    returns the session summary.
    """
    new_id = str(uuid.uuid4())
    created = datetime.utcnow()
    initial_status = SessionStatus.PENDING.value

    # Register the session first, then prepare its storage directory.
    _sessions[new_id] = {
        "id": new_id,
        "name": session.name,
        "description": session.description,
        "source_language": session.source_language,
        "target_language": session.target_language,
        "status": initial_status,
        "vocabulary": [],
        "vocabulary_count": 0,
        "image_path": None,
        "extraction_confidence": None,
        "created_at": created,
    }

    os.makedirs(os.path.join(LOCAL_STORAGE_PATH, new_id), exist_ok=True)

    return SessionResponse(
        id=new_id,
        name=session.name,
        description=session.description,
        source_language=session.source_language,
        target_language=session.target_language,
        status=initial_status,
        vocabulary_count=0,
        image_path=None,
        created_at=created,
    )


@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List vocabulary sessions, newest first.

    Args:
        limit: Maximum number of sessions to return (1-100, default 50).

    Returns:
        Up to ``limit`` session summaries ordered by creation time, descending.
    """
    # created_at is a datetime for sessions made by create_session() but a
    # string for sessions restored by _load_all_sessions(). Comparing the two
    # directly raises TypeError, so sort on the string form; both render as
    # "YYYY-MM-DD HH:MM:SS...", which orders chronologically.
    newest_first = sorted(
        _sessions.values(),
        key=lambda s: str(s["created_at"]),
        reverse=True
    )[:limit]

    return [
        SessionResponse(
            id=s["id"],
            name=s["name"],
            description=s.get("description"),
            source_language=s["source_language"],
            target_language=s["target_language"],
            status=s["status"],
            vocabulary_count=s.get("vocabulary_count", 0),
            image_path=s.get("image_path"),
            created_at=s["created_at"],
        )
        for s in newest_first
    ]


@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Return the summary of one session; responds with 404 for an unknown id."""
    stored = _sessions.get(session_id)
    if stored is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Optional keys fall back to None / 0 when the stored session lacks them.
    summary = dict(
        id=stored["id"],
        name=stored["name"],
        description=stored.get("description"),
        source_language=stored["source_language"],
        target_language=stored["target_language"],
        status=stored["status"],
        vocabulary_count=stored.get("vocabulary_count", 0),
        image_path=stored.get("image_path"),
        created_at=stored["created_at"],
    )
    return SessionResponse(**summary)


def get_pdf_page_count(pdf_data: bytes) -> int:
    """Return how many pages a PDF has, or 0 if it cannot be read.

    Every failure (PyMuPDF missing, invalid data, ...) is logged and mapped to 0.
    """
    try:
        import fitz
        document = fitz.open(stream=pdf_data, filetype="pdf")
        total_pages = document.page_count
        document.close()
    except Exception as e:
        logger.error(f"Failed to get PDF page count: {e}")
        return 0
    return total_pages


async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Convert a specific page of PDF to PNG image using PyMuPDF.

    The document is always closed, including when the page does not exist or
    rendering fails (previously it stayed open on those paths).

    Args:
        pdf_data: PDF file as bytes
        page_number: 0-indexed page number
        thumbnail: If True, return a smaller thumbnail image

    Returns:
        The rendered page as PNG bytes.

    Raises:
        HTTPException: 500 when PyMuPDF is not installed; 400 for any other
            failure (empty PDF, page out of range, unreadable data).
    """
    try:
        import fitz  # PyMuPDF

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            if pdf_document.page_count == 0:
                raise ValueError("PDF has no pages")

            if page_number >= pdf_document.page_count:
                raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_document.page_count} pages)")

            page = pdf_document[page_number]

            # Render page to image
            # For thumbnails: lower resolution, for OCR: higher resolution
            zoom = 0.5 if thumbnail else 2.0
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            png_data = pix.tobytes("png")
        finally:
            pdf_document.close()

        logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
        return png_data

    except ImportError as e:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed") from e
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e


async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
    """Convert multiple pages of PDF to PNG images.

    The document is always closed, including when rendering fails. Page
    numbers beyond the end of the document are skipped; this is now logged
    instead of happening silently.

    Args:
        pdf_data: PDF file as bytes
        pages: List of 0-indexed page numbers to convert. If None, convert all pages.

    Returns:
        PNG bytes of the converted pages, in the order requested.

    Raises:
        HTTPException: 500 when PyMuPDF is not installed; 400 for any other
            failure (empty PDF, unreadable data).
    """
    try:
        import fitz

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page_count = pdf_document.page_count
            if page_count == 0:
                raise ValueError("PDF has no pages")

            # If no pages specified, convert all
            if pages is None:
                pages = list(range(page_count))

            images = []
            zoom = 2.0
            mat = fitz.Matrix(zoom, zoom)

            for page_num in pages:
                if page_num >= page_count:
                    logger.warning(f"Skipping page {page_num}: PDF has only {page_count} pages")
                    continue
                pix = pdf_document[page_num].get_pixmap(matrix=mat)
                images.append(pix.tobytes("png"))
        finally:
            pdf_document.close()

        logger.info(f"Converted {len(images)} PDF pages to images")
        return images

    except ImportError as e:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available") from e
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") from e


@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Store an uploaded textbook page and extract its vocabulary.

    Accepts PNG, JPG, JPEG and PDF uploads. A PDF is rendered to PNG
    (page 0 only) before it is saved and analysed.
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    if session_id not in _sessions:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]

    # Either the file extension or the declared content type may identify the format
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    has_image_extension = extension in ('png', 'jpg', 'jpeg')
    is_image = has_image_extension or content_type in ('image/png', 'image/jpeg', 'image/jpg')
    is_pdf = extension == 'pdf' or content_type == 'application/pdf'

    if not (is_pdf or is_image):
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Extension of the file written to disk (PDFs are stored as their PNG rendering)
    if is_pdf:
        save_extension = 'png'
    elif has_image_extension:
        save_extension = extension
    else:
        save_extension = 'png' if content_type == 'image/png' else 'jpg'

    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")

    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Persist the (possibly converted) image under the session's directory
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")
    with open(image_path, 'wb') as image_file:
        image_file.write(content)

    session["status"] = SessionStatus.PROCESSING.value
    session["image_path"] = image_path

    # Extract vocabulary from the stored image
    vocabulary, confidence, error = await extract_vocabulary_from_image(
        content, file.filename or "image.png", page_number=0
    )
    vocabulary_count = len(vocabulary)
    extracted_status = SessionStatus.EXTRACTED.value

    session["vocabulary"] = [entry.dict() for entry in vocabulary]
    session["vocabulary_count"] = vocabulary_count
    session["extraction_confidence"] = confidence
    session["status"] = extracted_status

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": vocabulary_count,
        "extraction_confidence": confidence,
        "status": extracted_status,
    }
    # The error key is only present when extraction reported a problem
    if error:
        result["error"] = error
    return result


@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return the vocabulary entries currently stored for a session.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    stored_entries = session.get("vocabulary", [])

    return VocabularyResponse(
        session_id=session_id,
        # Stored entries are plain dicts; rebuild the model objects for the response
        vocabulary=[VocabularyEntry(**raw) for raw in stored_entries],
        extraction_confidence=session.get("extraction_confidence"),
    )


@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace a session's vocabulary with the submitted (manually corrected) list.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    corrected = update.vocabulary
    entry_count = len(corrected)

    session = _sessions[session_id]
    # Entries are stored as plain dicts, not as model instances
    session["vocabulary"] = [item.dict() for item in corrected]
    session["vocabulary_count"] = entry_count

    return {
        "session_id": session_id,
        "vocabulary_count": entry_count,
        "message": "Vocabulary updated successfully",
    }


@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from a session's vocabulary.

    One HTML section is rendered per requested worksheet type, the sections
    are concatenated with page breaks and converted into a single PDF. When
    ``request.include_solutions`` is set, a second PDF with solutions is
    produced the same way. Both files are written to the session directory
    and the worksheet is registered in ``_worksheets``.

    Raises:
        HTTPException: 404 if the session does not exist, 400 if it has no
            vocabulary, 500 if PDF generation fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session["name"]
    page_break = '<div style="page-break-after: always;"></div>'

    def _render_html(show_solutions: bool) -> str:
        """Render all requested worksheet types into one HTML string."""
        title_suffix = " (Loesung)" if show_solutions else ""
        return "".join(
            generate_worksheet_html(
                vocabulary=vocabulary,
                worksheet_type=wtype,
                title=f"{title} - {wtype.value}{title_suffix}",
                show_solutions=show_solutions,
                repetitions=request.repetitions,
                line_height=request.line_height,
            ) + page_break
            for wtype in request.worksheet_types
        )

    # Generate PDF
    try:
        pdf_bytes = await generate_worksheet_pdf(_render_html(show_solutions=False))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}")

    # Save PDF. The directory may not exist yet if the vocabulary was only
    # supplied via the update endpoint and nothing was ever uploaded.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    # Generate solution PDF if requested
    solution_path = None
    if request.include_solutions:
        try:
            solution_bytes = await generate_worksheet_pdf(_render_html(show_solutions=True))
        except Exception as e:
            # Report failures the same way as for the main worksheet PDF
            raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}")
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Store worksheet info
    worksheet_data = {
        "id": worksheet_id,
        "session_id": session_id,
        "worksheet_types": [wt.value for wt in request.worksheet_types],
        "pdf_path": pdf_path,
        "solution_path": solution_path,
        "generated_at": datetime.utcnow(),
    }
    _worksheets[worksheet_id] = worksheet_data

    # Update session status
    session["status"] = SessionStatus.COMPLETED.value

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet_data["worksheet_types"],
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet_data["generated_at"],
    )


@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Send a previously generated worksheet PDF as a file attachment.

    Raises:
        HTTPException: 404 if the worksheet is unknown or its PDF file is
            no longer on disk.
    """
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    pdf_path = _worksheets[worksheet_id]["pdf_path"]
    if not os.path.exists(pdf_path):
        raise HTTPException(status_code=404, detail="PDF file not found")

    # The whole file is read into memory and served from a buffer
    with open(pdf_path, 'rb') as pdf_file:
        payload = io.BytesIO(pdf_file.read())

    disposition = f"attachment; filename=worksheet_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )


@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Send the solution PDF belonging to a worksheet as a file attachment.

    Raises:
        HTTPException: 404 if the worksheet is unknown, has no solution
            path recorded, or the file is missing on disk.
    """
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    solution_path = _worksheets[worksheet_id].get("solution_path")
    solution_available = bool(solution_path) and os.path.exists(solution_path)
    if not solution_available:
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    # The whole file is read into memory and served from a buffer
    with open(solution_path, 'rb') as solution_file:
        payload = io.BytesIO(solution_file.read())

    disposition = f"attachment; filename=solution_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )


@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Return the source image stored for a session.

    Raises:
        HTTPException: 404 if the session is unknown, has no image path
            recorded, or the file is missing on disk.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    image_path = _sessions[session_id].get("image_path")
    image_available = bool(image_path) and os.path.exists(image_path)
    if not image_available:
        raise HTTPException(status_code=404, detail="Image not found")

    # Derive the response media type from the stored file's extension
    extension = image_path.rsplit('.', 1)[-1].lower()
    if extension == 'png':
        content_type = 'image/png'
    elif extension in ('jpg', 'jpeg'):
        content_type = 'image/jpeg'
    else:
        content_type = 'application/octet-stream'

    with open(image_path, 'rb') as image_file:
        payload = io.BytesIO(image_file.read())

    return StreamingResponse(payload, media_type=content_type)


@router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a PDF and get its page count for preview.
    Use this before processing to let user select pages.

    The PDF is written to the session directory as ``source.pdf`` and its
    bytes are also kept in the session (``pdf_data``) for the page
    rendering and processing endpoints.

    Raises:
        HTTPException: 404 if the session does not exist, 400 if the upload
            is not identified as a PDF by extension or content type.
    """
    logger.info(f"PDF info request for session {session_id}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]

    # Validate file type
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    if extension != 'pdf' and content_type != 'application/pdf':
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    content = await file.read()

    # Save PDF temporarily
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, "source.pdf")

    with open(pdf_path, 'wb') as f:
        f.write(content)

    # Get page count
    page_count = get_pdf_page_count(content)

    # Store PDF data in session for later processing
    session["pdf_data"] = content
    session["pdf_path"] = pdf_path
    session["pdf_page_count"] = page_count
    # Rotations recorded for a previously uploaded PDF do not apply to the
    # new document; drop them so the rendering endpoints do not rotate its
    # pages by stale values.
    session.pop("page_rotations", None)
    session["status"] = "pdf_uploaded"

    return {
        "session_id": session_id,
        "page_count": page_count,
        "filename": file.filename,
    }


@router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        page_number: 0-indexed page number.
        hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).

    Raises:
        HTTPException: 404 if the session does not exist, 400 if no PDF was
            uploaded or the page number is out of range, 500 if rendering
            fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    try:
        import fitz
        zoom = 2.0 if hires else 0.5
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            # Reject bad page numbers as a client error instead of letting
            # the renderer fail (or pick a different page) further down.
            page_count = pdf_document.page_count
            if page_number < 0 or page_number >= page_count:
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).",
                )
            page = pdf_document[page_number]
            # Apply orientation correction detected during OCR processing
            rot = session.get("page_rotations", {}).get(page_number, 0)
            if rot:
                page.set_rotation(rot)
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Release the document even when rendering fails
            pdf_document.close()
    except HTTPException:
        # Keep the 400 above from being rewrapped as a 500
        raise
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )


@router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        page_number: 0-indexed page number.
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).

    Raises:
        HTTPException: 404 if the session does not exist, 400 if no PDF was
            uploaded or the page number is out of range, 500 if rendering
            fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page = pdf_document[page_number]
            # Apply orientation correction detected during OCR processing
            rot = session.get("page_rotations", {}).get(page_number, 0)
            if rot:
                page.set_rotation(rot)
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Release the document even when rendering fails
            pdf_document.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )


@router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
):
    """
    Process a SINGLE page of an uploaded PDF using the OCR pipeline.

    Uses the multi-step CV pipeline (deskew → dewarp → columns → rows → words)
    instead of LLM vision for much better extraction quality. When the OCR
    pipeline is not available, falls back to LLM vision extraction.

    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page; the session's vocabulary
    is updated by replacing any entries previously stored for this page.

    Args:
        page_number: 0-indexed page number. The response and the stored
            ``source_page`` values use the 1-indexed page number.

    Raises:
        HTTPException: 404 if the session does not exist, 400 if no PDF was
            uploaded or the page number is out of range. Extraction failures
            are reported in the response body (``success: False``) instead.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
    rotation_deg = 0
    # The OCR pipeline returns no confidence value; 0.9 is the fixed figure
    # reported for it. The LLM fallback overrides this with its own value.
    extraction_confidence = 0.9
    if OCR_PIPELINE_AVAILABLE:
        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
            )
        except Exception as e:
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": f"OCR pipeline error: {e}",
                "vocabulary": [],
                "vocabulary_count": 0,
            }
    else:
        # Fallback to LLM vision extraction
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_number + 1}.png",
            page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": error,
                "vocabulary": [],
                "vocabulary_count": 0,
            }
        # Report the confidence the extractor actually returned
        extraction_confidence = confidence
        page_vocabulary = []
        for entry in vocabulary:
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_number + 1
            if 'id' not in entry_dict or not entry_dict['id']:
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)

    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")

    # Store rotation for this page (used by image/thumbnail endpoints)
    session.setdefault("page_rotations", {})[page_number] = rotation_deg

    # Add to session's vocabulary (append, don't replace)
    existing_vocab = session.get("vocabulary", [])
    # Remove any existing entries from this page (in case of re-processing)
    existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
    existing_vocab.extend(page_vocabulary)
    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value

    return {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        "extraction_confidence": extraction_confidence,
        "rotation": rotation_deg,
    }


async def _run_ocr_pipeline_for_page(
    img_bgr: np.ndarray,
    page_number: int,
    vocab_session_id: str,
) -> tuple:
    """Run the full OCR pipeline on a single page image and return vocab entries.

    Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).

    Args:
        img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline).
        page_number: 0-indexed page number.
        vocab_session_id: Vocab session ID for logging.

    Steps: orientation → deskew → dewarp → columns → rows → words → (LLM review)

    Returns:
        (entries, rotation_deg) where entries is a list of dicts and
        rotation_deg is the orientation correction applied (0, 90, 180, 270).
        A 2-tuple is returned in every case; a page whose layout is not
        recognised as a vocabulary table yields ``([], rotation_deg)``.

    Raises:
        ValueError: If column geometry detection fails and rows therefore
            cannot be detected.
    """
    import time as _time

    t_total = _time.time()

    img_h, img_w = img_bgr.shape[:2]
    logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}")

    # 1b. Orientation detection (fix upside-down scans)
    t0 = _time.time()
    img_bgr, rotation = detect_and_fix_orientation(img_bgr)
    if rotation:
        # Dimensions may have changed with the rotation; re-read them
        img_h, img_w = img_bgr.shape[:2]
        logger.info(f"  orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
    else:
        logger.info(f"  orientation: OK ({_time.time() - t0:.1f}s)")

    # 2. Create pipeline session in DB (for debugging in admin UI).
    # Best effort: a DB failure is logged and the pipeline continues.
    pipeline_session_id = str(uuid.uuid4())
    try:
        _, png_buf = cv2.imencode(".png", img_bgr)
        original_png = png_buf.tobytes()
        await create_pipeline_session_db(
            pipeline_session_id,
            name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
            filename=f"page_{page_number + 1}.png",
            original_png=original_png,
        )
    except Exception as e:
        logger.warning(f"Could not create pipeline session in DB: {e}")

    # 3. Deskew; the debug dict carries the per-pass angles for logging
    t0 = _time.time()
    deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
    angle_pass1 = deskew_debug.get("pass1_angle", 0.0)
    angle_pass2 = deskew_debug.get("pass2_angle", 0.0)
    angle_pass3 = deskew_debug.get("pass3_angle", 0.0)

    logger.info(f"  deskew: p1={angle_pass1:.2f} p2={angle_pass2:.2f} "
                f"p3={angle_pass3:.2f} total={angle_applied:.2f} "
                f"({_time.time() - t0:.1f}s)")

    # 4. Dewarp
    t0 = _time.time()
    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    logger.info(f"  dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")

    # 5. Column detection
    t0 = _time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    h, w = ocr_img.shape[:2]

    geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
    if geo_result is None:
        # No geometry available: fall back to the layout analysis and leave
        # the intermediates unset so step 6 knows to retry.
        layout_img = create_layout_image(dewarped_bgr)
        regions = analyze_layout(layout_img, ocr_img)
        word_dicts = None
        inv = None
        content_bounds = None
    else:
        geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
        content_w = right_x - left_x
        header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
        geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                          top_y=top_y, header_y=header_y, footer_y=footer_y)
        geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
        geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
        content_h = bottom_y - top_y
        regions = positional_column_regions(geometries, content_w, content_h, left_x)
        content_bounds = (left_x, right_x, top_y, bottom_y)

    logger.info(f"  columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")

    # 6. Row detection
    t0 = _time.time()
    if word_dicts is None or inv is None or content_bounds is None:
        # Re-run geometry detection to get intermediates
        geo_result2 = detect_column_geometry(ocr_img, dewarped_bgr)
        if geo_result2 is None:
            raise ValueError("Column geometry detection failed — cannot detect rows")
        _, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result2
        content_bounds = (left_x, right_x, top_y, bottom_y)

    left_x, right_x, top_y, bottom_y = content_bounds
    rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
    logger.info(f"  rows: {len(rows)} detected ({_time.time() - t0:.1f}s)")

    # 7. Word recognition (cell-first OCR v2)
    t0 = _time.time()
    col_regions = regions  # already PageRegion objects

    # Populate row.words for word_count filtering: a word belongs to the row
    # that contains its vertical midpoint (row.y is shifted by top_y first).
    for row in rows:
        row_y_rel = row.y - top_y
        row_bottom_rel = row_y_rel + row.height
        row.words = [
            wd for wd in word_dicts
            if row_y_rel <= wd['top'] + wd['height'] / 2 < row_bottom_rel
        ]
        row.word_count = len(row.words)

    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, rows, img_w, img_h,
        ocr_engine="auto", img_bgr=dewarped_bgr,
    )

    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    logger.info(f"  words: {len(cells)} cells, vocab={is_vocab} ({_time.time() - t0:.1f}s)")

    if not is_vocab:
        logger.warning(f"  Page {page_number + 1}: layout is not vocab table "
                       f"(types: {col_types}), returning empty")
        # Keep the (entries, rotation) shape so the caller's tuple unpacking
        # works and the detected rotation is still reported.
        return [], rotation

    # 8. Map cells → vocab entries
    entries = _cells_to_vocab_entries(cells, columns_meta)
    entries = _fix_phonetic_brackets(entries, pronunciation="british")

    # 9. Optional LLM review (best effort: any failure is logged and skipped)
    try:
        review_result = await llm_review_entries(entries)
        if review_result and review_result.get("changes"):
            # Apply corrections
            changes_map = {}
            for ch in review_result["changes"]:
                idx = ch.get("index")
                if idx is not None:
                    changes_map[idx] = ch
            for idx, ch in changes_map.items():
                if 0 <= idx < len(entries):
                    for field in ("english", "german", "example"):
                        # Only non-empty values that differ overwrite the OCR text
                        if ch.get(field) and ch[field] != entries[idx].get(field):
                            entries[idx][field] = ch[field]
            logger.info(f"  llm review: {len(review_result['changes'])} corrections applied")
    except Exception as e:
        logger.warning(f"  llm review skipped: {e}")

    # 10. Map to frontend format
    page_vocabulary = []
    for entry in entries:
        if not entry.get("english") and not entry.get("german"):
            continue  # skip empty rows
        page_vocabulary.append({
            "id": str(uuid.uuid4()),
            "english": entry.get("english", ""),
            "german": entry.get("german", ""),
            "example_sentence": entry.get("example", ""),
            "source_page": page_number + 1,
        })

    # 11. Update pipeline session in DB (for admin debugging); best effort
    try:
        success_dsk, dsk_buf = cv2.imencode(".png", deskewed_bgr)
        deskewed_png = dsk_buf.tobytes() if success_dsk else None
        success_dwp, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        dewarped_png = dwp_buf.tobytes() if success_dwp else None

        await update_pipeline_session_db(
            pipeline_session_id,
            deskewed_png=deskewed_png,
            dewarped_png=dewarped_png,
            deskew_result={"angle_applied": round(angle_applied, 3)},
            dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
            column_result={"columns": [{"type": r.type, "x": r.x, "y": r.y,
                                         "width": r.width, "height": r.height}
                                        for r in col_regions]},
            row_result={"total_rows": len(rows)},
            word_result={
                "entry_count": len(page_vocabulary),
                "layout": "vocab",
                "vocab_entries": entries,
            },
            current_step=6,
        )
    except Exception as e:
        logger.warning(f"Could not update pipeline session: {e}")

    total_duration = _time.time() - t_total
    logger.info(f"OCR Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")

    return page_vocabulary, rotation


@router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: List[int] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    The session's vocabulary is replaced by the entries extracted from the
    requested pages, and the first converted page is saved as the session's
    preview image.

    Args:
        pages: List of 0-indexed page numbers to process. Defaults to the
            first page when empty or omitted.
        process_all: If True, process all pages

    Raises:
        HTTPException: 404 if the session does not exist, 400 if no PDF was
            uploaded or a requested page number is out of range.
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif pages is None or len(pages) == 0:
        pages = [0]  # Default to first page

    # The converter silently skips out-of-range pages, which would shift the
    # images against ``pages`` and label vocabulary with the wrong page.
    # Reject such requests up front so both lists stay aligned.
    invalid_pages = [p for p in pages if p < 0 or p >= page_count]
    if invalid_pages:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid page number(s) {invalid_pages}. PDF has {page_count} pages (0-indexed).",
        )

    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    for page_num, image_data in zip(pages, images):
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")

        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )

        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence

            # Add page info to each entry and convert to dict
            for entry in vocabulary:
                entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)

            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    # Average over successful pages only; 0 when every page failed
    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Update session
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value

    # Save first page as preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        os.makedirs(session_dir, exist_ok=True)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    if error_messages:
        result["errors"] = error_messages

    return result


@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Remove a vocabulary session, its directory on disk and its worksheets.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Files first: everything stored for the session lives in one directory
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(session_dir):
        import shutil
        shutil.rmtree(session_dir)

    # Then the in-memory session record
    _sessions.pop(session_id)

    # Finally every worksheet that was generated from this session
    owned_worksheet_ids = [
        wid for wid, ws in _worksheets.items() if ws["session_id"] == session_id
    ]
    for wid in owned_worksheet_ids:
        del _worksheets[wid]

    return {"message": "Session deleted successfully", "session_id": session_id}


# =============================================================================
# OCR Export Endpoints (for cross-app OCR data sharing)
# =============================================================================

# Directory below the local storage root where the OCR export endpoints
# keep their JSON files.
OCR_EXPORT_DIR = os.path.join(LOCAL_STORAGE_PATH, "ocr-exports")


@router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
    """
    Persist OCR export data for cross-app sharing (admin-v2 -> studio-v2).

    Both apps proxy to klausur-service via /klausur-api/, so this endpoint
    serves as shared storage accessible from both ports.

    The payload is written to ``<session_id>_page<page_number>.json`` and
    ``latest.json`` is rewritten to point at this export.
    """

    def _write_json(path: str, payload: Dict[str, Any]) -> None:
        """Write payload to path as indented UTF-8 JSON."""
        with open(path, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    logger.info(f"Saving OCR export for session {session_id}, page {page_number}")

    os.makedirs(OCR_EXPORT_DIR, exist_ok=True)

    # The export itself
    _write_json(
        os.path.join(OCR_EXPORT_DIR, f"{session_id}_page{page_number}.json"),
        data,
    )

    # Pointer to the most recently saved export
    _write_json(
        os.path.join(OCR_EXPORT_DIR, "latest.json"),
        {
            "session_id": session_id,
            "page_number": page_number,
            "saved_at": datetime.utcnow().isoformat(),
        },
    )

    return {
        "success": True,
        "session_id": session_id,
        "page_number": page_number,
        "message": "OCR export saved successfully",
    }


@router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
    """Return the stored OCR export for one session page.

    Raises:
        HTTPException: 404 when no export file exists for this session/page.
    """
    export_path = os.path.join(OCR_EXPORT_DIR, f"{session_id}_page{page_number}.json")

    if os.path.exists(export_path):
        with open(export_path, 'r', encoding='utf-8') as export_file:
            return json.load(export_file)

    raise HTTPException(status_code=404, detail="OCR export not found")


# =============================================================================
# OCR Compare & Grid Analysis Endpoints
# =============================================================================


def _compare_ocr_pair_set(vocab_list):
    """Return the normalized (english, german) pairs of a vocabulary list.

    Entries missing either side are skipped. Both sides are lower-cased and
    stripped so the same pair found by different methods compares equal.
    """
    return {
        (v["english"].lower().strip(), v["german"].lower().strip())
        for v in vocab_list
        if v["english"] and v["german"]
    }


def _compare_ocr_plain_vocab(items):
    """Reduce extractor vocabulary dicts to english/german/example entries."""
    return [
        {
            "english": v.get("english", ""),
            "german": v.get("german", ""),
            "example": v.get("example", ""),
        }
        for v in items
    ]


def _compare_ocr_failure(name, model, error):
    """Build the result entry reported for an OCR method that raised *error*."""
    return {
        "name": name,
        "model": model,
        "duration_seconds": 0,
        "vocabulary_count": 0,
        "vocabulary": [],
        "confidence": 0,
        "success": False,
        "error": str(error),
    }


@router.post("/sessions/{session_id}/compare-ocr/{page_number}")
async def compare_ocr_methods(session_id: str, page_number: int):
    """
    Run multiple OCR methods on a page and compare results.

    This endpoint:
    1. Gets the page image from the session's uploaded PDF
    2. Runs Vision LLM extraction (primary method)
    3. Runs Tesseract extraction if TESSERACT_AVAILABLE
    4. Runs the CV pipeline if CV_PIPELINE_AVAILABLE
    5. Compares found vocabulary across methods
    6. Returns structured comparison results

    A method that raises is reported with ``success: False`` and an empty
    vocabulary; it does not abort the other methods. Tesseract output is
    also stored in the session (``tesseract_words``, image size and
    ``tesseract_page_<n>``) for later grid analysis.

    page_number is 0-indexed.

    Returns:
        Dict with ``session_id``, ``page_number``, per-method ``methods``
        results, a ``comparison`` section and a ``recommendation``.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF was
            uploaded or the page number is out of range.
    """
    import time
    import traceback

    logger.info(f"Compare OCR for session {session_id}, page {page_number}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    methods_results = {}
    all_vocab_sets = {}

    # --- Method: Vision LLM ---
    try:
        start = time.time()
        vocab, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
        )
        duration = time.time() - start

        vocab_list = []
        for v in vocab:
            # Entries may be pydantic models, plain objects or mappings
            entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v))
            vocab_list.append({
                "english": entry.get("english", ""),
                "german": entry.get("german", ""),
                "example": entry.get("example_sentence", ""),
            })

        methods_results["vision_llm"] = {
            "name": "Vision LLM",
            "model": VISION_MODEL,
            "duration_seconds": round(duration, 1),
            "vocabulary_count": len(vocab_list),
            "vocabulary": vocab_list,
            "confidence": confidence,
            "success": len(vocab_list) > 0 and not error,
            "error": error if error else None,
        }
        all_vocab_sets["vision_llm"] = _compare_ocr_pair_set(vocab_list)
    except Exception as e:
        logger.error(f"Vision LLM failed: {e}")
        methods_results["vision_llm"] = _compare_ocr_failure("Vision LLM", VISION_MODEL, e)
        all_vocab_sets["vision_llm"] = set()

    # --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
    if TESSERACT_AVAILABLE:
        try:
            start = time.time()
            tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
            duration = time.time() - start

            tess_vocab = tess_result.get("vocabulary", [])
            tess_words = tess_result.get("words", [])

            # Store Tesseract words in session for later use (grid analysis, position matching)
            session["tesseract_words"] = tess_words
            session["tesseract_image_width"] = tess_result.get("image_width", 0)
            session["tesseract_image_height"] = tess_result.get("image_height", 0)
            session[f"tesseract_page_{page_number}"] = tess_result

            vocab_list_tess = _compare_ocr_plain_vocab(tess_vocab)

            methods_results["tesseract"] = {
                "name": "Tesseract OCR",
                "model": "tesseract-ocr (eng+deu)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_tess),
                "vocabulary": vocab_list_tess,
                "confidence": 0.7 if tess_vocab else 0,
                "success": len(vocab_list_tess) > 0,
                "error": tess_result.get("error"),
                "word_count": tess_result.get("word_count", 0),
                "columns_detected": len(tess_result.get("columns", [])),
            }
            all_vocab_sets["tesseract"] = _compare_ocr_pair_set(vocab_list_tess)

            # Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
            if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
                llm_vocab_with_bbox = match_positions_to_vocab(
                    tess_words,
                    methods_results["vision_llm"]["vocabulary"],
                    tess_result.get("image_width", 1),
                    tess_result.get("image_height", 1),
                )
                methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox

        except Exception as e:
            logger.error(f"Tesseract failed: {e}")
            logger.debug(traceback.format_exc())
            methods_results["tesseract"] = _compare_ocr_failure("Tesseract OCR", "tesseract-ocr", e)
            all_vocab_sets["tesseract"] = set()

    # --- Method: CV Pipeline (Document Reconstruction) ---
    if CV_PIPELINE_AVAILABLE:
        try:
            start = time.time()
            cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
            duration = time.time() - start

            # A pipeline-reported error invalidates whatever vocabulary it returned
            cv_vocab = cv_result.vocabulary if not cv_result.error else []
            vocab_list_cv = _compare_ocr_plain_vocab(cv_vocab)

            methods_results["cv_pipeline"] = {
                "name": "CV Pipeline (Document Reconstruction)",
                "model": "opencv + tesseract (multi-pass)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_cv),
                "vocabulary": vocab_list_cv,
                "confidence": 0.8 if cv_vocab else 0,
                "success": len(vocab_list_cv) > 0,
                "error": cv_result.error,
                "word_count": cv_result.word_count,
                "columns_detected": cv_result.columns_detected,
                "stages": cv_result.stages,
            }
            all_vocab_sets["cv_pipeline"] = _compare_ocr_pair_set(vocab_list_cv)

        except Exception as e:
            logger.error(f"CV Pipeline failed: {e}")
            logger.debug(traceback.format_exc())
            methods_results["cv_pipeline"] = _compare_ocr_failure(
                "CV Pipeline (Document Reconstruction)",
                "opencv + tesseract (multi-pass)",
                e,
            )
            all_vocab_sets["cv_pipeline"] = set()

    # --- Build comparison ---
    all_unique = set()
    for vs in all_vocab_sets.values():
        all_unique |= vs

    found_by_all = []
    found_by_some = []
    for english, german in sorted(all_unique):
        found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
        entry = {"english": english, "german": german, "methods": found_in}
        # Failed methods contribute an empty set, so they count as "not found"
        if len(found_in) == len(all_vocab_sets):
            found_by_all.append(entry)
        else:
            found_by_some.append(entry)

    agreement_rate = len(found_by_all) / len(all_unique) if all_unique else 0

    # Find best method: the one with the most distinct vocabulary pairs
    best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"

    return {
        "session_id": session_id,
        "page_number": page_number,
        "methods": methods_results,
        "comparison": {
            "found_by_all_methods": found_by_all,
            "found_by_some_methods": found_by_some,
            "total_unique_vocabulary": len(all_unique),
            "agreement_rate": agreement_rate,
        },
        "recommendation": {
            "best_method": best_method,
            "reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
        },
    }


def _llm_grid_to_grid_data(grid_raw):
    """Convert the Vision LLM's parsed JSON answer into the GridData dict.

    The LLM output is untrusted: counts may be strings, ``text`` or
    ``confidence`` may be null, and lists may be missing. Malformed pieces
    are degraded to defaults instead of failing the whole analysis.

    Returns:
        The grid dict, or None when the answer does not describe a grid with
        at least one row and one column.
    """
    def _as_count(value):
        # bool is an int subclass; true/false is not a meaningful count
        if isinstance(value, bool):
            return 0
        try:
            return max(int(value), 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    num_rows = _as_count(grid_raw.get("rows", 0))
    num_cols = _as_count(grid_raw.get("columns", 0))
    if num_rows == 0 or num_cols == 0:
        return None

    # Ensure column_types is a list with at least one type per column
    column_types = grid_raw.get("column_types")
    column_types = list(column_types) if isinstance(column_types, list) else []
    column_types.extend(["unknown"] * (num_cols - len(column_types)))

    # Build cell grid with percentage-based coordinates
    row_height = 100.0 / num_rows
    col_width = 100.0 / num_cols

    # Track which cells have content
    cell_map = {}
    entries = grid_raw.get("entries")
    for entry in entries if isinstance(entries, list) else []:
        if not isinstance(entry, dict):
            continue
        try:
            cell_map[(entry.get("row", 0), entry.get("col", 0))] = entry
        except TypeError:
            # Unhashable row/col value (e.g. a list) cannot address a cell
            continue

    cells = []
    recognized_count = 0
    empty_count = 0
    problematic_count = 0

    for r in range(num_rows):
        row_cells = []
        for c in range(num_cols):
            entry = cell_map.get((r, c))

            if entry is None:
                text = ""
                conf = 0.0
                col_type = column_types[c]
            else:
                raw_cell_text = entry.get("text", "")
                text = "" if raw_cell_text is None else str(raw_cell_text).strip()
                conf = entry.get("confidence", 0.8)
                if isinstance(conf, bool) or not isinstance(conf, (int, float)):
                    # Null/non-numeric confidence is treated like a missing one
                    conf = 0.8
                col_type = entry.get("column_type", column_types[c])

            if not text:
                status = "empty"
                empty_count += 1
            elif conf >= 0.5:
                status = "recognized"
                recognized_count += 1
            else:
                status = "problematic"
                problematic_count += 1

            row_cells.append({
                "row": r,
                "col": c,
                "x": round(c * col_width, 2),
                "y": round(r * row_height, 2),
                "width": round(col_width, 2),
                "height": round(row_height, 2),
                "text": text,
                "confidence": conf,
                "status": status,
                "column_type": col_type,
            })
        cells.append(row_cells)

    total = num_rows * num_cols
    coverage = (recognized_count + problematic_count) / max(total, 1)

    return {
        "rows": num_rows,
        "columns": num_cols,
        "cells": cells,
        "column_types": column_types,
        # Column and row boundaries as percentages
        "column_boundaries": [round(c * col_width, 2) for c in range(num_cols + 1)],
        "row_boundaries": [round(r * row_height, 2) for r in range(num_rows + 1)],
        "deskew_angle": 0.0,
        "source": "vision_llm",
        "stats": {
            "recognized": recognized_count,
            "problematic": problematic_count,
            "empty": empty_count,
            "manual": 0,
            "total": total,
            "coverage": round(coverage, 3),
        },
    }


@router.post("/sessions/{session_id}/analyze-grid/{page_number}")
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
    """
    Analyze the grid/table structure of a vocabulary page.

    Hybrid approach:
    1. If Tesseract bounding boxes are available (from compare-ocr, or
       computed here and cached in the session), use them for real spatial
       positions via GridDetectionService.
    2. Otherwise fall back to Vision LLM for grid structure detection.

    page_number is 0-indexed.
    Returns GridData structure expected by the frontend GridOverlay component.

    Returns:
        ``{"success": True, "grid": ...}`` on success, otherwise
        ``{"success": False, "error": ...}`` for LLM-side failures.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF was
            uploaded or the page number is out of range.
    """
    import asyncio
    import re
    import traceback

    import httpx

    logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
    tess_page_data = session.get(f"tesseract_page_{page_number}")

    if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
        try:
            # Run Tesseract if not already cached
            if not tess_page_data:
                logger.info("Running Tesseract for grid analysis (not cached)")
                from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
                tess_page_data = await _run_tess(image_data, lang="eng+deu")
                session[f"tesseract_page_{page_number}"] = tess_page_data
                session["tesseract_words"] = tess_page_data.get("words", [])
                session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
                session["tesseract_image_height"] = tess_page_data.get("image_height", 0)

            tess_words = tess_page_data.get("words", [])
            img_w = tess_page_data.get("image_width", 0)
            img_h = tess_page_data.get("image_height", 0)

            if tess_words and img_w > 0 and img_h > 0:
                service = GridDetectionService()
                regions = service.convert_tesseract_regions(tess_words, img_w, img_h)

                if regions:
                    grid_result = service.detect_grid(regions)
                    grid_dict = grid_result.to_dict()

                    # Only provenance metadata is attached here; the cell
                    # text stays as produced by the grid service.
                    grid_dict["source"] = "tesseract+grid_service"
                    grid_dict["word_count"] = len(tess_words)

                    logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
                                f"{grid_result.stats.get('recognized', 0)} recognized")

                    return {"success": True, "grid": grid_dict}

            logger.info("Tesseract data insufficient, falling back to LLM")

        except Exception as e:
            # Best effort: any Tesseract/grid failure falls through to the LLM
            logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
            logger.debug(traceback.format_exc())

    # --- Strategy 2: Fall back to Vision LLM ---
    image_base64 = base64.b64encode(image_data).decode("utf-8")

    grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.

Your task: Identify the TABLE STRUCTURE and extract each cell's content.

Return a JSON object with this EXACT structure:
{
  "rows": <number of rows>,
  "columns": <number of columns>,
  "column_types": ["english", "german", "example"],
  "entries": [
    {
      "row": 0,
      "col": 0,
      "text": "the word or phrase in this cell",
      "column_type": "english",
      "confidence": 0.95
    }
  ]
}

Rules:
- row and col are 0-indexed
- column_type is one of: "english", "german", "example", "unknown"
- Detect whether each column contains English words, German translations, or example sentences
- Include ALL non-empty cells
- confidence is 0.0-1.0 based on how clear the text is
- If a cell is empty, don't include it
- Return ONLY the JSON, no other text"""

    try:
        raw_text = ""
        max_retries = 3
        for attempt in range(max_retries):
            async with httpx.AsyncClient(timeout=300.0) as client:
                response = await client.post(
                    f"{OLLAMA_URL}/api/chat",
                    json={
                        "model": VISION_MODEL,
                        "messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
                        "stream": False,
                        "options": {"temperature": 0.1, "num_predict": 8192},
                    },
                )

            # Retry only on HTTP 500, with a linearly growing pause; the
            # last attempt falls through to the generic error branch.
            if response.status_code == 500 and attempt < max_retries - 1:
                wait_time = 10 * (attempt + 1)
                logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
                await asyncio.sleep(wait_time)
                continue
            elif response.status_code != 200:
                error_detail = response.text[:200] if response.text else "Unknown error"
                return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}

            raw_text = response.json().get("message", {}).get("content", "") or ""
            break

        # Parse JSON from response: take everything from the first "{" to
        # the last "}" so surrounding prose or code fences are ignored.
        json_match = re.search(r'\{[\s\S]*\}', raw_text)
        if not json_match:
            return {"success": False, "error": "Could not parse grid structure from LLM response"}

        grid_data = _llm_grid_to_grid_data(json.loads(json_match.group()))
        if grid_data is None:
            return {"success": False, "error": "No grid structure detected"}

        return {"success": True, "grid": grid_data}

    except httpx.TimeoutException:
        logger.error("Grid analysis timed out")
        return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
    except Exception as e:
        logger.error(f"Grid analysis failed: {e}")
        logger.debug(traceback.format_exc())
        return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}


@router.get("/ocr-export/latest")
async def load_latest_ocr_export():
    """Return the OCR export that the ``latest.json`` pointer refers to.

    Raises:
        HTTPException: 404 when no pointer file exists, or when the export
            file named by the pointer is missing.
    """
    pointer_path = os.path.join(OCR_EXPORT_DIR, "latest.json")
    if not os.path.exists(pointer_path):
        raise HTTPException(status_code=404, detail="No OCR exports found")

    with open(pointer_path, 'r', encoding='utf-8') as pointer_file:
        pointer = json.load(pointer_file)

    # Rebuild the export file name from the session/page pair in the pointer
    session_id, page_number = pointer.get("session_id"), pointer.get("page_number")
    export_path = os.path.join(OCR_EXPORT_DIR, f"{session_id}_page{page_number}.json")
    if not os.path.exists(export_path):
        raise HTTPException(status_code=404, detail="Latest OCR export file not found")

    with open(export_path, 'r', encoding='utf-8') as export_file:
        return json.load(export_file)


# =============================================================================
# Ground Truth Labeling
# =============================================================================

# Directory below LOCAL_STORAGE_PATH where ground truth labels are stored,
# one "<session_id>_page<page_number>.json" file per labeled page.
GROUND_TRUTH_DIR = os.path.join(LOCAL_STORAGE_PATH, "ground-truth")


async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Run Tesseract and GridDetectionService to get vocabulary rows with boxes.

    Returns a dict with an 'entries' list plus 'image_width'/'image_height'.
    Every entry carries row_index, english, german, example, confidence,
    bbox, bbox_en, bbox_de and bbox_ex. All bbox coordinates are in
    percent (0-100).

    Raises:
        HTTPException: 500 when Tesseract or GridDetectionService is not available.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Stage 1: word-level bounding boxes from Tesseract
    ocr = await extract_bounding_boxes(image_bytes, lang=lang)
    words = ocr.get("words", [])
    img_w = ocr.get("image_width", 0)
    img_h = ocr.get("image_height", 0)

    def _payload(found):
        # Every exit path reports the image size next to the entries
        return {"entries": found, "image_width": img_w, "image_height": img_h}

    if not words or img_w == 0 or img_h == 0:
        return _payload([])

    # Stage 2: turn the words into OCR regions for the grid service
    service = GridDetectionService()
    regions = service.convert_tesseract_regions(words, img_w, img_h)
    if not regions:
        return _payload([])

    # Stage 3: detect the grid
    grid_result = service.detect_grid(regions)
    if not grid_result.cells:
        return _payload([])

    # Stage 4: collapse each grid row into one vocabulary entry
    from services.grid_detection_service import ColumnType

    entries = []
    for row_idx, row_cells in enumerate(grid_result.cells):
        texts = {"en": "", "de": "", "ex": ""}
        boxes = {"en": None, "de": None, "ex": None}
        confidences = []

        for cell in row_cells:
            box = {
                "x": round(cell.x, 2),
                "y": round(cell.y, 2),
                "w": round(cell.width, 2),
                "h": round(cell.height, 2),
            }

            if cell.column_type == ColumnType.ENGLISH:
                slot = "en"
            elif cell.column_type == ColumnType.GERMAN:
                slot = "de"
            elif cell.column_type == ColumnType.EXAMPLE:
                slot = "ex"
            else:
                slot = None

            # A later cell of the same column type overwrites an earlier one
            if slot is not None:
                texts[slot] = cell.text.strip()
                boxes[slot] = box

            # Only cells with text contribute to the row confidence
            if cell.text.strip():
                confidences.append(cell.confidence)

        # Rows without any english/german/example text are dropped
        if not any(texts.values()):
            continue

        # Whole-row box = bounding rectangle of the per-column boxes
        present = [b for b in boxes.values() if b is not None]
        if present:
            left = min(b["x"] for b in present)
            top = min(b["y"] for b in present)
            right = max(b["x"] + b["w"] for b in present)
            bottom = max(b["y"] + b["h"] for b in present)
            row_bbox = {
                "x": round(left, 2),
                "y": round(top, 2),
                "w": round(right - left, 2),
                "h": round(bottom - top, 2),
            }
        else:
            row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3}

        # Mean cell confidence scaled by 100, one decimal place
        if confidences:
            avg_conf = round(sum(confidences, 0.0) / len(confidences) * 100, 1)
        else:
            avg_conf = 0

        entries.append({
            "row_index": row_idx,
            "english": texts["en"],
            "german": texts["de"],
            "example": texts["ex"],
            "confidence": avg_conf,
            "bbox": row_bbox,
            "bbox_en": boxes["en"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_de": boxes["de"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_ex": boxes["ex"] or {"x": 0, "y": 0, "w": 0, "h": 0},
        })

    return _payload(entries)


@router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Produce vocabulary entries with bounding boxes for ground truth labeling.

    The page is rendered, deskewed when possible, and passed to
    extract_entries_with_boxes. The image used for OCR and the resulting
    entries are cached in the session under the page number.
    page_number is 0-indexed.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF was
            uploaded or the page number is out of range.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not (0 <= page_number < page_count):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Render the page at full resolution
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Best-effort deskew: any failure keeps the undeskewed image
    deskew_angle = 0.0
    try:
        from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
        if CV2_AVAILABLE:
            image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
            logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
    except Exception as e:
        logger.warning(f"Deskew failed for page {page_number}: {e}")

    page_key = str(page_number)

    # Keep the image that OCR runs on so it can be served later
    session.setdefault("deskewed_images", {})[page_key] = image_data

    # OCR runs on the (possibly deskewed) image
    result = await extract_entries_with_boxes(image_data)

    # Remember the extracted entries for this page
    session.setdefault("gt_entries", {})[page_key] = result["entries"]

    return {
        "success": True,
        "entries": result["entries"],
        "entry_count": len(result["entries"]),
        "image_width": result["image_width"],
        "image_height": result["image_height"],
        "deskew_angle": round(deskew_angle, 2),
        "deskewed": abs(deskew_angle) > 0.05,
    }


@router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Return the deskewed page image as PNG.

    Falls back to rendering the original hires image if no deskewed version
    is cached in the session for this page.

    Raises:
        HTTPException: 404 for an unknown session; on the fallback path 400
            when no PDF was uploaded or the page number is out of range.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    deskewed = session.get("deskewed_images", {}).get(str(page_number))

    if deskewed:
        return StreamingResponse(io.BytesIO(deskewed), media_type="image/png")

    # Fallback: render original hires image
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Validate the page like the other page endpoints do, so an out-of-range
    # request is answered with 400 instead of failing inside the renderer.
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    return StreamingResponse(io.BytesIO(image_data), media_type="image/png")


@router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Save ground truth labels for a page.

    Expects body with 'entries' list - each entry has english, german, example,
    status ('confirmed' | 'edited' | 'skipped'), and bbox fields.

    The entries are stored in the session and written to
    ``<session_id>_page<page_number>.json`` inside GROUND_TRUTH_DIR.

    Returns:
        Dict with the number of saved entries, the per-status counts and the
        path of the written file.

    Raises:
        HTTPException: 404 for an unknown session, 400 when 'entries' is
            missing, empty, not a list, or contains non-object items.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    entries = data.get("entries", [])
    if not entries:
        raise HTTPException(status_code=400, detail="No entries provided")
    # Reject malformed payloads before anything is stored; otherwise they
    # would be persisted and then crash the status counting below.
    if not isinstance(entries, list) or not all(isinstance(e, dict) for e in entries):
        raise HTTPException(status_code=400, detail="'entries' must be a list of objects")

    # Save in session
    session = _sessions[session_id]
    if "ground_truth" not in session:
        session["ground_truth"] = {}
    session["ground_truth"][str(page_number)] = entries

    # Also save to disk
    os.makedirs(GROUND_TRUTH_DIR, exist_ok=True)
    gt_path = os.path.join(GROUND_TRUTH_DIR, f"{session_id}_page{page_number}.json")
    gt_data = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now().isoformat(),
        "entry_count": len(entries),
        "entries": entries,
    }
    with open(gt_path, 'w', encoding='utf-8') as f:
        json.dump(gt_data, f, ensure_ascii=False, indent=2)

    logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")

    # Tally the known statuses in one pass; other values are not counted
    status_counts = {"confirmed": 0, "edited": 0, "skipped": 0}
    for e in entries:
        status = e.get("status")
        if isinstance(status, str) and status in status_counts:
            status_counts[status] += 1

    return {
        "success": True,
        "saved_count": len(entries),
        "confirmed": status_counts["confirmed"],
        "edited": status_counts["edited"],
        "skipped": status_counts["skipped"],
        "file_path": gt_path,
    }


@router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Return the ground truth saved for a page, from the session or from disk.

    The response's 'source' field is "cache" when the entries came from the
    session and "disk" when they were read from the JSON file.

    Raises:
        HTTPException: 404 for an unknown session or when nothing was saved
            for this page.
    """
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # The in-memory copy wins when it holds entries
    in_memory = _sessions[session_id].get("ground_truth", {}).get(str(page_number))
    if in_memory:
        return {"success": True, "entries": in_memory, "source": "cache"}

    # Otherwise look for the JSON file written by the save endpoint
    gt_path = os.path.join(GROUND_TRUTH_DIR, f"{session_id}_page{page_number}.json")
    if os.path.exists(gt_path):
        with open(gt_path, 'r', encoding='utf-8') as gt_file:
            stored = json.load(gt_file)
        return {"success": True, "entries": stored.get("entries", []), "source": "disk"}

    raise HTTPException(status_code=404, detail="No ground truth found for this page")