breakpilot-pwa/klausur-service/backend/vocab_worksheet_api.py

"""
Vocabulary Worksheet API - Extract vocabulary from textbook pages and generate worksheets.

DATENSCHUTZ/PRIVACY:
- Alle Verarbeitung erfolgt lokal (Mac Mini mit Ollama)
- Keine Daten werden an externe Server gesendet
- DSGVO-konform fuer Schulumgebungen

Workflow:
1. POST /sessions - Create a vocabulary extraction session
2. POST /sessions/{id}/upload - Upload textbook page image
3. GET /sessions/{id}/vocabulary - Get extracted vocabulary
4. PUT /sessions/{id}/vocabulary - Edit vocabulary (corrections)
5. POST /sessions/{id}/generate - Generate worksheet PDF
6. GET /worksheets/{id}/pdf - Download generated PDF
"""

from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Query
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional, List, Dict, Any
from datetime import datetime
from enum import Enum
import uuid
import os
import io
import json
import base64
import logging

# PostgreSQL persistence (replaces in-memory storage)
from vocab_session_store import (
    init_vocab_tables,
    create_session_db,
    get_session_db,
    list_sessions_db,
    update_session_db,
    delete_session_db,
    add_vocabulary_db,
    get_vocabulary_db,
    update_vocabulary_db,
    clear_page_vocabulary_db,
    create_worksheet_db,
    get_worksheet_db,
    delete_worksheets_for_session_db,
    cache_pdf_data,
    get_cached_pdf_data,
    clear_cached_pdf_data,
)

logger = logging.getLogger(__name__)

# Ollama Configuration - Direct call without external modules
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")

# Try to import MinIO storage
try:
    from minio_storage import upload_to_minio, get_from_minio
    MINIO_AVAILABLE = True
except ImportError:
    MINIO_AVAILABLE = False
    logger.warning("MinIO storage not available, using local storage")

router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])

# Local storage path
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")


# =============================================================================
# Enums and Pydantic Models
# =============================================================================

class WorksheetType(str, Enum):
    EN_TO_DE = "en_to_de"          # English -> German translation
    DE_TO_EN = "de_to_en"          # German -> English translation
    COPY_PRACTICE = "copy"         # Write word multiple times
    GAP_FILL = "gap_fill"          # Fill in the blanks
    COMBINED = "combined"          # All types combined


class SessionStatus(str, Enum):
    PENDING = "pending"            # Session created, no upload yet
    PROCESSING = "processing"      # OCR in progress
    EXTRACTED = "extracted"        # Vocabulary extracted, ready to edit
    COMPLETED = "completed"        # Worksheet generated


class VocabularyEntry(BaseModel):
    id: str
    english: str
    german: str
    example_sentence: Optional[str] = None
    example_sentence_gap: Optional[str] = None  # With ___ for gap-fill
    word_type: Optional[str] = None  # noun, verb, adjective, etc.
    source_page: Optional[int] = None  # Page number where entry was found (1-indexed)
    # Grid position fields for layout-preserving OCR
    source_x: Optional[float] = None       # X position as percentage (0-100)
    source_y: Optional[float] = None       # Y position as percentage (0-100)
    source_width: Optional[float] = None   # Width as percentage (0-100)
    source_height: Optional[float] = None  # Height as percentage (0-100)
    source_column: Optional[int] = None    # 0-indexed column in detected grid
    source_row: Optional[int] = None       # 0-indexed row in detected grid
    confidence: Optional[float] = None     # OCR confidence score (0-1)
    recognition_status: Optional[str] = None  # recognized | manual | unrecognized


class OcrPrompts(BaseModel):
    filterHeaders: bool = True
    filterFooters: bool = True
    filterPageNumbers: bool = True
    customFilter: str = ""
    headerPatterns: List[str] = []
    footerPatterns: List[str] = []


class SessionCreate(BaseModel):
    name: str
    description: Optional[str] = None
    source_language: str = "en"  # Source language (default English)
    target_language: str = "de"  # Target language (default German)
    ocr_prompts: Optional[OcrPrompts] = None  # OCR filtering settings from frontend


class SessionResponse(BaseModel):
    id: str
    name: str
    description: Optional[str]
    source_language: str
    target_language: str
    status: str
    vocabulary_count: int
    image_path: Optional[str]
    created_at: datetime


class VocabularyResponse(BaseModel):
    session_id: str
    vocabulary: List[VocabularyEntry]
    extraction_confidence: Optional[float]


class VocabularyUpdate(BaseModel):
    vocabulary: List[VocabularyEntry]


class WorksheetGenerateRequest(BaseModel):
    worksheet_types: List[WorksheetType]
    title: Optional[str] = None
    include_solutions: bool = True
    repetitions: int = 3  # For copy practice
    line_height: str = "normal"  # normal, large, extra-large


class WorksheetResponse(BaseModel):
    id: str
    session_id: str
    worksheet_types: List[str]
    pdf_path: str
    solution_path: Optional[str]
    generated_at: datetime


# =============================================================================
# PostgreSQL Storage (persistent across container restarts)
# =============================================================================

# Note: In-memory storage removed. All data now persisted in PostgreSQL.
# See vocab_session_store.py for implementation.

# Startup event to initialize tables
@router.on_event("startup")
async def startup():
    """Initialize vocab session tables on startup."""
    logger.info("Initializing vocab session PostgreSQL tables...")
    success = await init_vocab_tables()
    if success:
        logger.info("Vocab session tables ready")
    else:
        logger.warning("Failed to initialize vocab tables - storage may not work")


# =============================================================================
# Vision LLM Vocabulary Extraction
# =============================================================================

VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.

AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:

{
  "vocabulary": [
    {
      "english": "to improve",
      "german": "verbessern",
      "example": "I want to improve my English."
    }
  ]
}

REGELN:
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
2. Behalte die exakte Schreibweise bei
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
5. Gib NUR valides JSON zurueck, keine Erklaerungen
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"

Beispiel-Output:
{
  "vocabulary": [
    {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
    {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
  ]
}"""


async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    ocr_method: str = "tesseract"  # Options: "tesseract" (D), "vision_llm" (B), "paddleocr" (C)
) -> tuple[List[VocabularyEntry], float, str]:
    """
    Extract vocabulary from an image using different OCR methods.

    OCR Methods (documented in SBOM):
    - Loesung A: User's 32B LLM (external)
    - Loesung B: Vision LLM (Ollama llama3.2-vision)
    - Loesung C: PaddleOCR + LLM (DEAKTIVIERT - funktioniert nicht unter Rosetta 2)
    - Loesung D: Tesseract OCR + LLM (ARM64-nativ, Apache 2.0) <- DEFAULT

    Args:
        image_data: Image bytes
        filename: Original filename for logging
        page_number: 0-indexed page number for error messages
        ocr_method: OCR method to use ("tesseract", "vision_llm", "paddleocr")

    Returns:
        Tuple of (vocabulary_entries, confidence, error_message)
        error_message is empty string on success
    """
    import httpx

    # ==========================================================================
    # LOESUNG D: Tesseract OCR + LLM Gateway (DEFAULT - ARM64-nativ)
    # ==========================================================================
    if ocr_method == "tesseract":
        try:
            from tesseract_vocab_extractor import extract_vocabulary_tesseract, is_tesseract_available

            if not is_tesseract_available():
                logger.warning("Tesseract not available, falling back to Vision LLM")
                ocr_method = "vision_llm"
            else:
                logger.info(f"Using TESSERACT OCR for {filename} (Loesung D)")

                vocab_dicts, confidence, error = await extract_vocabulary_tesseract(image_data, filename)

                if error:
                    logger.warning(f"Tesseract extraction had issues: {error}")
                elif vocab_dicts:
                    vocabulary = [
                        VocabularyEntry(
                            id=str(uuid.uuid4()),
                            english=v.get("source_word", "") if v.get("source_lang") == "en" else v.get("target_word", ""),
                            german=v.get("source_word", "") if v.get("source_lang") == "de" else v.get("target_word", ""),
                            example_sentence=v.get("context"),
                            source_page=page_number + 1
                        )
                        for v in vocab_dicts
                    ]
                    logger.info(f"Tesseract extraction: {len(vocabulary)} entries from {filename}")
                    return vocabulary, confidence, ""

        except ImportError as e:
            logger.warning(f"Tesseract extractor not available: {e}. Falling back to Vision LLM.")
            ocr_method = "vision_llm"
        except Exception as e:
            logger.warning(f"Tesseract extraction failed: {e}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())
            ocr_method = "vision_llm"

    # ==========================================================================
    # LOESUNG C: PaddleOCR + LLM Gateway (DEAKTIVIERT - Rosetta 2 Probleme)
    # ==========================================================================
    if ocr_method == "paddleocr":
        try:
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using PADDLEOCR for {filename} (Loesung C - experimentell)")

            vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)

            if error:
                logger.warning(f"PaddleOCR extraction had issues: {error}")
            elif vocab_dicts:
                vocabulary = [
                    VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=v.get("english", ""),
                        german=v.get("german", ""),
                        example_sentence=v.get("example"),
                        source_page=page_number + 1
                    )
                    for v in vocab_dicts
                    if v.get("english") and v.get("german")
                ]
                logger.info(f"PaddleOCR extraction: {len(vocabulary)} entries from {filename}")
                return vocabulary, confidence, ""

        except ImportError as e:
            logger.warning(f"PaddleOCR not available: {e}. Falling back to Vision LLM.")
        except Exception as e:
            logger.warning(f"PaddleOCR failed: {e}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())

    # ==========================================================================
    # FALLBACK: Vision LLM (Ollama llama3.2-vision)
    # ==========================================================================
    logger.info(f"Using VISION LLM extraction for {filename}")

    try:
        # First check if Ollama is available
        async with httpx.AsyncClient(timeout=10.0) as check_client:
            try:
                health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
                if health_response.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
            except Exception as e:
                logger.error(f"Ollama health check failed: {e}")
                return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"

        image_base64 = base64.b64encode(image_data).decode("utf-8")

        payload = {
            "model": VISION_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": VOCAB_EXTRACTION_PROMPT,
                    "images": [image_base64]
                }
            ],
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 4096,
            }
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # Increased timeout for Vision models (they can be slow)
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/chat",
                json=payload,
                timeout=300.0  # 5 minutes per page
            )
            response.raise_for_status()

            data = response.json()
            extracted_text = data.get("message", {}).get("content", "")

        logger.info(f"Ollama response received: {len(extracted_text)} chars")

        # Parse JSON from response
        vocabulary = parse_vocabulary_json(extracted_text)

        # Set source_page for each entry
        for v in vocabulary:
            v.source_page = page_number + 1

        # Estimate confidence
        confidence = 0.85 if len(vocabulary) > 0 else 0.1

        logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")

        return vocabulary, confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as e:
        logger.error(f"Vocabulary extraction failed for {filename}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"


def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Return demo vocabulary for testing when Vision LLM is not available."""
    demo_entries = [
        {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals."},
        {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "That was a great achievement."},
        {"english": "improve", "german": "verbessern", "example": "I want to improve my English."},
        {"english": "improvement", "german": "Verbesserung", "example": "There has been a lot of improvement."},
        {"english": "success", "german": "Erfolg", "example": "The project was a success."},
        {"english": "successful", "german": "erfolgreich", "example": "She is a successful businesswoman."},
        {"english": "fail", "german": "scheitern, durchfallen", "example": "Don't be afraid to fail."},
        {"english": "failure", "german": "Misserfolg, Versagen", "example": "Failure is part of learning."},
    ]
    return [
        VocabularyEntry(
            id=str(uuid.uuid4()),
            english=e["english"],
            german=e["german"],
            example_sentence=e.get("example"),
        )
        for e in demo_entries
    ]


def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary JSON from LLM response with robust error handling."""
    import re

    def clean_json_string(s: str) -> str:
        """Clean a JSON string by removing control characters and fixing common issues."""
        # Remove control characters except newlines and tabs
        s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
        # Replace unescaped newlines within strings with space
        # This is a simplistic approach - replace actual newlines with escaped ones
        s = s.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
        return s

    def try_parse_json(json_str: str) -> dict:
        """Try multiple strategies to parse JSON."""
        # Strategy 1: Direct parse
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # Strategy 2: Clean and parse
        try:
            cleaned = clean_json_string(json_str)
            return json.loads(cleaned)
        except json.JSONDecodeError:
            pass

        # Strategy 3: Try to fix common issues
        try:
            # Remove trailing commas before } or ]
            fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
            # Fix unquoted keys
            fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
            return json.loads(fixed)
        except json.JSONDecodeError:
            pass

        return None

    try:
        # Find JSON in response (may have extra text)
        start = text.find('{')
        end = text.rfind('}') + 1

        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        json_str = text[start:end]
        data = try_parse_json(json_str)

        if data is None:
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            vocabulary = []
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
            matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)

            for match in matches:
                english = match[0].strip() if match[0] else ""
                german = match[1].strip() if match[1] else ""
                example = match[2].strip() if len(match) > 2 and match[2] else None

                if english and german:
                    vocab_entry = VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=english,
                        german=german,
                        example_sentence=example,
                    )
                    vocabulary.append(vocab_entry)

            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        # Normal JSON parsing succeeded
        vocabulary = []
        for i, entry in enumerate(data.get("vocabulary", [])):
            english = entry.get("english", "").strip()
            german = entry.get("german", "").strip()

            # Skip entries that look like hallucinations (very long or containing unusual patterns)
            if len(english) > 100 or len(german) > 200:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue

            if not english or not german:
                continue

            vocab_entry = VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=entry.get("example"),
                word_type=entry.get("word_type"),
            )
            vocabulary.append(vocab_entry)

        return vocabulary

    except Exception as e:
        logger.error(f"Failed to parse vocabulary JSON: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []


# =============================================================================
# Worksheet PDF Generation
# =============================================================================

def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
) -> str:
    """Generate HTML for a worksheet."""

    # Line height CSS
    line_heights = {
        "normal": "2.5em",
        "large": "3.5em",
        "extra-large": "4.5em"
    }
    lh = line_heights.get(line_height, "2.5em")

    html = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <style>
        @page {{ size: A4; margin: 2cm; }}
        body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
        h1 {{ font-size: 24px; margin-bottom: 10px; }}
        .meta {{ color: #666; margin-bottom: 20px; }}
        .name-line {{ margin-bottom: 30px; }}
        .vocab-table {{ width: 100%; border-collapse: collapse; }}
        .vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
        .vocab-word {{ width: 40%; font-weight: 500; }}
        .vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
        .vocab-answer {{ width: 60%; color: #2563eb; }}
        .gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
        .hint {{ color: #666; font-style: italic; font-size: 12px; }}
        .section {{ margin-top: 30px; }}
        .section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
    </style>
</head>
<body>
    <h1>{title}</h1>
    <div class="name-line">Name: _________________________ Datum: _____________</div>
"""

    if worksheet_type == WorksheetType.EN_TO_DE:
        html += '<div class="section"><div class="section-title">Uebersetze ins Deutsche:</div>'
        html += '<table class="vocab-table">'
        for entry in vocabulary:
            if show_solutions:
                html += f'<tr><td class="vocab-word">{entry.english}</td><td class="vocab-answer">{entry.german}</td></tr>'
            else:
                html += f'<tr><td class="vocab-word">{entry.english}</td><td class="vocab-blank"></td></tr>'
        html += '</table></div>'

    elif worksheet_type == WorksheetType.DE_TO_EN:
        html += '<div class="section"><div class="section-title">Uebersetze ins Englische:</div>'
        html += '<table class="vocab-table">'
        for entry in vocabulary:
            if show_solutions:
                html += f'<tr><td class="vocab-word">{entry.german}</td><td class="vocab-answer">{entry.english}</td></tr>'
            else:
                html += f'<tr><td class="vocab-word">{entry.german}</td><td class="vocab-blank"></td></tr>'
        html += '</table></div>'

    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        html += '<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>'
        html += '<table class="vocab-table">'
        for entry in vocabulary:
            html += f'<tr><td class="vocab-word">{entry.english}</td>'
            html += '<td class="vocab-blank">'
            if show_solutions:
                html += f' {entry.english} ' * repetitions
            html += '</td></tr>'
        html += '</table></div>'

    elif worksheet_type == WorksheetType.GAP_FILL:
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            html += '<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>'
            for i, entry in enumerate(entries_with_examples, 1):
                # Create gap sentence by removing the English word
                gap_sentence = entry.example_sentence
                for word in entry.english.split():
                    if word.lower() in gap_sentence.lower():
                        gap_sentence = gap_sentence.replace(word, '<span class="gap"></span>')
                        gap_sentence = gap_sentence.replace(word.capitalize(), '<span class="gap"></span>')
                        gap_sentence = gap_sentence.replace(word.lower(), '<span class="gap"></span>')
                        break

                html += f'<p>{i}. {gap_sentence}</p>'
                if show_solutions:
                    html += f'<p class="hint">Loesung: {entry.english}</p>'
                else:
                    html += f'<p class="hint">({entry.german})</p>'
            html += '</div>'

    html += '</body></html>'
    return html


async def generate_worksheet_pdf(html: str) -> bytes:
    """Generate PDF from HTML using WeasyPrint."""
    try:
        from weasyprint import HTML
        pdf_bytes = HTML(string=html).write_pdf()
        return pdf_bytes
    except ImportError:
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')
    except Exception as e:
        logger.error(f"PDF generation failed: {e}")
        raise


# =============================================================================
# API Endpoints
# =============================================================================

@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Create a new vocabulary extraction session."""
    session_id = str(uuid.uuid4())

    # Store in PostgreSQL
    db_session = await create_session_db(
        session_id=session_id,
        name=session.name,
        description=session.description,
        source_language=session.source_language,
        target_language=session.target_language,
        ocr_prompts=session.ocr_prompts.model_dump() if session.ocr_prompts else None,
    )

    if db_session is None:
        raise HTTPException(status_code=500, detail="Failed to create session in database")

    # Create storage directory for files
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)

    return SessionResponse(
        id=session_id,
        name=session.name,
        description=session.description,
        source_language=session.source_language,
        target_language=session.target_language,
        status=SessionStatus.PENDING.value,
        vocabulary_count=0,
        image_path=None,
        created_at=db_session.created_at or datetime.utcnow(),
    )


@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List all vocabulary sessions."""
    sessions = await list_sessions_db(limit=limit)

    return [
        SessionResponse(
            id=s.id,
            name=s.name,
            description=s.description,
            source_language=s.source_language,
            target_language=s.target_language,
            status=s.status,
            vocabulary_count=s.vocabulary_count,
            image_path=s.image_path,
            created_at=s.created_at or datetime.utcnow(),
        )
        for s in sessions
    ]


@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Get a specific session."""
    s = await get_session_db(session_id)
    if s is None:
        raise HTTPException(status_code=404, detail="Session not found")

    return SessionResponse(
        id=s.id,
        name=s.name,
        description=s.description,
        source_language=s.source_language,
        target_language=s.target_language,
        status=s.status,
        vocabulary_count=s.vocabulary_count,
        image_path=s.image_path,
        created_at=s.created_at or datetime.utcnow(),
    )


def get_pdf_page_count(pdf_data: bytes) -> int:
    """Get the number of pages in a PDF."""
    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        count = pdf_document.page_count
        pdf_document.close()
        return count
    except Exception as e:
        logger.error(f"Failed to get PDF page count: {e}")
        return 0


async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Convert a specific page of PDF to PNG image using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes
        page_number: 0-indexed page number
        thumbnail: If True, return a smaller thumbnail image
    """
    import gc
    pix = None
    pdf_document = None

    try:
        import fitz  # PyMuPDF

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")

        if pdf_document.page_count == 0:
            raise ValueError("PDF has no pages")

        if page_number >= pdf_document.page_count:
            raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_document.page_count} pages)")

        page = pdf_document[page_number]

        # Render page to image
        # For thumbnails: lower resolution, for OCR: higher resolution
        zoom = 0.5 if thumbnail else 2.0
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        png_data = pix.tobytes("png")

        logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
        return png_data

    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
    finally:
        # Explicit cleanup to prevent OOM
        if pix is not None:
            del pix
        if pdf_document is not None:
            pdf_document.close()
            del pdf_document
        gc.collect()


async def convert_pdf_to_images(pdf_data: bytes, pages: List[int] = None) -> List[bytes]:
    """Convert multiple pages of PDF to PNG images.

    Args:
        pdf_data: PDF file as bytes
        pages: List of 0-indexed page numbers to convert. If None, convert all pages.
    """
    import gc
    pdf_document = None

    try:
        import fitz

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")

        if pdf_document.page_count == 0:
            raise ValueError("PDF has no pages")

        # If no pages specified, convert all
        if pages is None:
            pages = list(range(pdf_document.page_count))

        images = []
        zoom = 2.0
        mat = fitz.Matrix(zoom, zoom)

        for page_num in pages:
            if page_num < pdf_document.page_count:
                page = pdf_document[page_num]
                pix = page.get_pixmap(matrix=mat)
                images.append(pix.tobytes("png"))
                # Cleanup pixmap immediately to prevent memory buildup
                del pix
                gc.collect()

        logger.info(f"Converted {len(images)} PDF pages to images")
        return images

    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
    finally:
        if pdf_document is not None:
            pdf_document.close()
            del pdf_document
        gc.collect()


@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a textbook page image or PDF and extract vocabulary.

    Supported formats: PNG, JPG, JPEG, PDF
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    session = await get_session_db(session_id)
    if session is None:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")

    # Validate file type - check both extension and content type
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    # Accept images and PDFs
    valid_image_extensions = ['png', 'jpg', 'jpeg']
    valid_image_content_types = ['image/png', 'image/jpeg', 'image/jpg']
    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
    is_image = extension in valid_image_extensions or content_type in valid_image_content_types

    if not is_pdf and not is_image:
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Determine final extension for saving
    if is_pdf:
        save_extension = 'png'  # PDFs will be converted to PNG
    elif extension in valid_image_extensions:
        save_extension = extension
    elif content_type == 'image/png':
        save_extension = 'png'
    else:
        save_extension = 'jpg'

    # Read file content
    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")

    # Convert PDF to image if needed (first page only for single upload)
    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0, thumbnail=False)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Save image
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")

    with open(image_path, 'wb') as f:
        f.write(content)

    # Update session status in DB
    await update_session_db(session_id, status=SessionStatus.PROCESSING.value, image_path=image_path)

    # Extract vocabulary using Vision LLM
    vocabulary, confidence, error = await extract_vocabulary_from_image(content, file.filename or "image.png", page_number=0)

    # Store vocabulary in DB
    vocab_dicts = [v.dict() for v in vocabulary]
    await add_vocabulary_db(session_id, vocab_dicts)

    # Update session with extraction results
    await update_session_db(
        session_id,
        status=SessionStatus.EXTRACTED.value,
        extraction_confidence=confidence,
        vocabulary_count=len(vocabulary),
    )

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": len(vocabulary),
        "extraction_confidence": confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    if error:
        result["error"] = error

    return result


@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Get extracted vocabulary for a session."""
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    vocab_dicts = await get_vocabulary_db(session_id)
    vocabulary = [VocabularyEntry(**v) for v in vocab_dicts]

    return VocabularyResponse(
        session_id=session_id,
        vocabulary=vocabulary,
        extraction_confidence=session.extraction_confidence,
    )


@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Update vocabulary entries (for manual corrections)."""
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Replace all vocabulary entries
    vocab_dicts = [v.dict() for v in update.vocabulary]
    success = await update_vocabulary_db(session_id, vocab_dicts)

    if not success:
        raise HTTPException(status_code=500, detail="Failed to update vocabulary")

    return {
        "session_id": session_id,
        "vocabulary_count": len(update.vocabulary),
        "message": "Vocabulary updated successfully",
    }


@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from extracted vocabulary."""
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    vocab_dicts = await get_vocabulary_db(session_id)
    vocabulary = [VocabularyEntry(**v) for v in vocab_dicts]

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session.name

    # Generate HTML for each worksheet type
    combined_html = ""
    for wtype in request.worksheet_types:
        html = generate_worksheet_html(
            vocabulary=vocabulary,
            worksheet_type=wtype,
            title=f"{title} - {wtype.value}",
            show_solutions=False,
            repetitions=request.repetitions,
            line_height=request.line_height,
        )
        combined_html += html + '<div style="page-break-after: always;"></div>'

    # Generate PDF
    try:
        pdf_bytes = await generate_worksheet_pdf(combined_html)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}")

    # Save PDF
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    # Generate solution PDF if requested
    solution_path = None
    if request.include_solutions:
        solution_html = ""
        for wtype in request.worksheet_types:
            html = generate_worksheet_html(
                vocabulary=vocabulary,
                worksheet_type=wtype,
                title=f"{title} - {wtype.value} (Loesung)",
                show_solutions=True,
                repetitions=request.repetitions,
                line_height=request.line_height,
            )
            solution_html += html + '<div style="page-break-after: always;"></div>'

        solution_bytes = await generate_worksheet_pdf(solution_html)
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Store worksheet info in DB
    worksheet = await create_worksheet_db(
        worksheet_id=worksheet_id,
        session_id=session_id,
        worksheet_types=[wt.value for wt in request.worksheet_types],
        pdf_path=pdf_path,
        solution_path=solution_path,
    )

    if worksheet is None:
        raise HTTPException(status_code=500, detail="Failed to save worksheet to database")

    # Update session status
    await update_session_db(session_id, status=SessionStatus.COMPLETED.value)

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet.worksheet_types,
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet.generated_at or datetime.utcnow(),
    )


@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Download the generated worksheet PDF."""
    worksheet = await get_worksheet_db(worksheet_id)
    if worksheet is None:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    pdf_path = worksheet.pdf_path

    if not pdf_path or not os.path.exists(pdf_path):
        raise HTTPException(status_code=404, detail="PDF file not found")

    with open(pdf_path, 'rb') as f:
        pdf_bytes = f.read()

    return StreamingResponse(
        io.BytesIO(pdf_bytes),
        media_type="application/pdf",
        headers={"Content-Disposition": f"attachment; filename=worksheet_{worksheet_id}.pdf"}
    )


@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Download the solution PDF."""
    worksheet = await get_worksheet_db(worksheet_id)
    if worksheet is None:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    solution_path = worksheet.solution_path

    if not solution_path or not os.path.exists(solution_path):
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    with open(solution_path, 'rb') as f:
        pdf_bytes = f.read()

    return StreamingResponse(
        io.BytesIO(pdf_bytes),
        media_type="application/pdf",
        headers={"Content-Disposition": f"attachment; filename=solution_{worksheet_id}.pdf"}
    )


@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Get the uploaded source image for a session."""
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    image_path = session.image_path

    if not image_path or not os.path.exists(image_path):
        raise HTTPException(status_code=404, detail="Image not found")

    # Determine content type
    extension = image_path.split('.')[-1].lower()
    content_type = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
    }.get(extension, 'application/octet-stream')

    with open(image_path, 'rb') as f:
        image_bytes = f.read()

    return StreamingResponse(
        io.BytesIO(image_bytes),
        media_type=content_type,
    )


@router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a PDF and get page count and thumbnails for preview.
    Use this before processing to let user select pages.
    """
    logger.info(f"PDF info request for session {session_id}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Validate file type
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    if extension != 'pdf' and content_type != 'application/pdf':
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    content = await file.read()

    # Save PDF temporarily
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, "source.pdf")

    with open(pdf_path, 'wb') as f:
        f.write(content)

    # Get page count
    page_count = get_pdf_page_count(content)

    # Cache PDF data for later processing (in-memory for multi-page workflow)
    cache_pdf_data(session_id, content)

    # Update session in DB
    await update_session_db(
        session_id,
        pdf_path=pdf_path,
        pdf_page_count=page_count,
        status="pdf_uploaded",
    )

    return {
        "session_id": session_id,
        "page_count": page_count,
        "filename": file.filename,
    }


@router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = False):
    """Get a thumbnail image of a specific PDF page.

    Args:
        session_id: Session ID
        page_number: 0-indexed page number
        hires: If True, return high-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5)
    """
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Try cached PDF data first
    pdf_data = get_cached_pdf_data(session_id)

    # If not cached, try to load from file
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()
        cache_pdf_data(session_id, pdf_data)

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Use thumbnail=False for high-res (zoom=2.0), thumbnail=True for low-res (zoom=0.5)
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=not hires)

    return StreamingResponse(
        io.BytesIO(image_data),
        media_type="image/png",
    )


@router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
):
    """
    Process a SINGLE page of an uploaded PDF - completely isolated.

    This endpoint processes one page at a time to avoid LLM context issues.
    The frontend should call this sequentially for each page.

    Returns the vocabulary for just this one page.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Try cached PDF data first
    pdf_data = get_cached_pdf_data(session_id)

    # If not cached, try to load from file
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()
        cache_pdf_data(session_id, pdf_data)

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.pdf_page_count or 1

    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert just this ONE page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Extract vocabulary from this single page
    vocabulary, confidence, error = await extract_vocabulary_from_image(
        image_data,
        f"page_{page_number + 1}.png",
        page_number=page_number
    )

    if error:
        logger.warning(f"Page {page_number + 1} failed: {error}")
        return {
            "session_id": session_id,
            "page_number": page_number + 1,
            "success": False,
            "error": error,
            "vocabulary": [],
            "vocabulary_count": 0,
        }

    # Convert vocabulary entries to dicts with page info
    page_vocabulary = []
    for entry in vocabulary:
        entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
        entry_dict['source_page'] = page_number + 1
        page_vocabulary.append(entry_dict)

    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")

    # Clear existing entries for this page (in case of re-processing)
    await clear_page_vocabulary_db(session_id, page_number + 1)

    # Add new vocabulary entries to DB
    await add_vocabulary_db(session_id, page_vocabulary)

    # Update session status
    await update_session_db(session_id, status=SessionStatus.EXTRACTED.value)

    # Get total count
    all_vocab = await get_vocabulary_db(session_id)

    return {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(all_vocab),
        "extraction_confidence": confidence,
    }


@router.post("/sessions/{session_id}/compare-ocr/{page_number}")
async def compare_ocr_methods(
    session_id: str,
    page_number: int,
):
    """
    Compare different OCR methods on a single page.

    Runs available OCR solutions and compares:
    - Extraction time
    - Vocabulary found
    - Confidence scores

    Solutions tested:
    - Loesung B: Vision LLM (qwen2.5vl:32b via Ollama)
    - Loesung D: Tesseract OCR + LLM structuring
    - Loesung E: Claude Vision API (Anthropic)

    Returns comparison data for frontend visualization.
    """
    import time
    import httpx

    logger.info(f"OCR Comparison for session {session_id}, page {page_number}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Try cached PDF data first
    pdf_data = get_cached_pdf_data(session_id)

    # If not cached, try to load from file
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()
        cache_pdf_data(session_id, pdf_data)

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.pdf_page_count or 1

    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image once (shared by all methods)
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    results = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "methods": {}
    }

    # ==========================================================================
    # LOESUNG B: Vision LLM (qwen2.5vl:32b)
    # ==========================================================================
    try:
        start_time = time.time()
        vocab_b, confidence_b, error_b = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number, ocr_method="vision_llm"
        )
        duration_b = time.time() - start_time

        results["methods"]["vision_llm"] = {
            "name": "Loesung B: Vision LLM",
            "model": VISION_MODEL,
            "duration_seconds": round(duration_b, 2),
            "vocabulary_count": len(vocab_b),
            "vocabulary": [
                {"english": v.english, "german": v.german, "example": v.example_sentence}
                for v in vocab_b
            ],
            "confidence": confidence_b,
            "error": error_b if error_b else None,
            "success": len(vocab_b) > 0
        }
        logger.info(f"Vision LLM: {len(vocab_b)} entries in {duration_b:.2f}s")
    except Exception as e:
        results["methods"]["vision_llm"] = {
            "name": "Loesung B: Vision LLM",
            "error": str(e),
            "success": False
        }
        logger.error(f"Vision LLM comparison failed: {e}")

    # ==========================================================================
    # LOESUNG D: Tesseract OCR + LLM
    # ==========================================================================
    try:
        start_time = time.time()
        vocab_d, confidence_d, error_d = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number, ocr_method="tesseract"
        )
        duration_d = time.time() - start_time

        results["methods"]["tesseract"] = {
            "name": "Loesung D: Tesseract OCR",
            "model": "tesseract + qwen2.5:14b",
            "duration_seconds": round(duration_d, 2),
            "vocabulary_count": len(vocab_d),
            "vocabulary": [
                {"english": v.english, "german": v.german, "example": v.example_sentence}
                for v in vocab_d
            ],
            "confidence": confidence_d,
            "error": error_d if error_d else None,
            "success": len(vocab_d) > 0
        }
        logger.info(f"Tesseract: {len(vocab_d)} entries in {duration_d:.2f}s")
    except Exception as e:
        results["methods"]["tesseract"] = {
            "name": "Loesung D: Tesseract OCR",
            "error": str(e),
            "success": False
        }
        logger.error(f"Tesseract comparison failed: {e}")

    # ==========================================================================
    # LOESUNG E: Claude Vision API (Anthropic)
    # ==========================================================================
    try:
        from claude_vocab_extractor import extract_vocabulary_claude, is_claude_available

        if is_claude_available():
            start_time = time.time()
            vocab_e_raw, confidence_e, error_e = await extract_vocabulary_claude(
                image_data, f"page_{page_number + 1}.png"
            )
            duration_e = time.time() - start_time

            # Convert to consistent format
            vocab_e = []
            for v in vocab_e_raw:
                source_word = v.get("source_word", "")
                target_word = v.get("target_word", "")
                source_lang = v.get("source_lang", "en")
                # Determine which is English and which is German
                if source_lang == "en":
                    english = source_word
                    german = target_word
                else:
                    english = target_word
                    german = source_word

                vocab_e.append({
                    "english": english,
                    "german": german,
                    "example": v.get("context", "")
                })

            results["methods"]["claude_vision"] = {
                "name": "Loesung E: Claude Vision",
                "model": "claude-sonnet-4-20250514",
                "duration_seconds": round(duration_e, 2),
                "vocabulary_count": len(vocab_e),
                "vocabulary": vocab_e,
                "confidence": confidence_e,
                "error": error_e if error_e else None,
                "success": len(vocab_e) > 0
            }
            logger.info(f"Claude Vision: {len(vocab_e)} entries in {duration_e:.2f}s")
        else:
            results["methods"]["claude_vision"] = {
                "name": "Loesung E: Claude Vision",
                "error": "Anthropic API Key nicht konfiguriert",
                "success": False
            }
    except Exception as e:
        results["methods"]["claude_vision"] = {
            "name": "Loesung E: Claude Vision",
            "error": str(e),
            "success": False
        }
        logger.error(f"Claude Vision comparison failed: {e}")

    # ==========================================================================
    # Comparison Analysis
    # ==========================================================================
    all_vocab = {}
    for method_key, method_data in results["methods"].items():
        if method_data.get("success"):
            for v in method_data.get("vocabulary", []):
                key = f"{v['english']}|{v['german']}"
                if key not in all_vocab:
                    all_vocab[key] = {"english": v["english"], "german": v["german"], "found_by": []}
                all_vocab[key]["found_by"].append(method_key)

    # Categorize vocabulary
    found_by_all = []
    found_by_some = []

    num_methods = len([m for m in results["methods"].values() if m.get("success")])

    for key, data in all_vocab.items():
        entry = {"english": data["english"], "german": data["german"], "methods": data["found_by"]}
        if len(data["found_by"]) == num_methods:
            found_by_all.append(entry)
        else:
            found_by_some.append(entry)

    results["comparison"] = {
        "found_by_all_methods": found_by_all,
        "found_by_some_methods": found_by_some,
        "total_unique_vocabulary": len(all_vocab),
        "agreement_rate": len(found_by_all) / len(all_vocab) if all_vocab else 0
    }

    # Determine best method
    best_method = None
    best_count = 0
    for method_key, method_data in results["methods"].items():
        if method_data.get("success") and method_data.get("vocabulary_count", 0) > best_count:
            best_count = method_data["vocabulary_count"]
            best_method = method_key

    results["recommendation"] = {
        "best_method": best_method,
        "reason": f"Meiste Vokabeln erkannt ({best_count})"
    }

    return results


# =============================================================================
# Grid Detection and Analysis
# =============================================================================

@router.post("/sessions/{session_id}/analyze-grid/{page_number}")
async def analyze_grid(session_id: str, page_number: int):
    """
    Analyze a page and detect grid structure for layout-preserving OCR.

    This endpoint:
    1. Applies deskewing to straighten the image
    2. Runs OCR with bounding box extraction
    3. Detects row and column structure
    4. Identifies recognized, empty, and problematic cells

    Returns grid structure with cell positions and recognition status.
    """
    import numpy as np
    from PIL import Image
    import io

    logger.info(f"Grid analysis for session {session_id}, page {page_number}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Get PDF data
    pdf_data = get_cached_pdf_data(session_id)
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()
        cache_pdf_data(session_id, pdf_data)

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.pdf_page_count or 1
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid page number. PDF has {page_count} pages (0-indexed)."
        )

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Load image as numpy array
    img = Image.open(io.BytesIO(image_data))
    img_array = np.array(img)
    img_height, img_width = img_array.shape[:2]

    # Step 1: Deskewing
    deskew_angle = 0.0
    try:
        from services.image_preprocessing import deskew_image
        img_array, deskew_angle = deskew_image(img_array)
        logger.info(f"Applied deskew correction: {deskew_angle:.2f}°")
    except ImportError:
        logger.warning("Image preprocessing not available, skipping deskew")
    except Exception as e:
        logger.warning(f"Deskewing failed: {e}")

    # Step 2: Run OCR with position data
    ocr_regions = []
    try:
        import pytesseract
        from pytesseract import Output
        from services.grid_detection_service import convert_tesseract_regions

        # Convert back to PIL Image if we modified it
        if deskew_angle != 0:
            img = Image.fromarray(img_array)

        ocr_data = pytesseract.image_to_data(
            img,
            lang='eng+deu',
            output_type=Output.DICT
        )
        ocr_regions = convert_tesseract_regions(ocr_data, img_width, img_height)
        logger.info(f"OCR found {len(ocr_regions)} text regions")

    except ImportError:
        logger.warning("Tesseract not available, trying PaddleOCR")
        try:
            from hybrid_vocab_extractor import call_paddleocr_service
            from services.grid_detection_service import convert_paddleocr_regions

            # Convert to bytes for PaddleOCR
            buffer = io.BytesIO()
            Image.fromarray(img_array).save(buffer, format='PNG')
            paddle_regions, _ = await call_paddleocr_service(buffer.getvalue())

            ocr_regions = convert_paddleocr_regions(
                [{"text": r.text, "confidence": r.confidence,
                  "bbox": [[r.x1, r.y1], [r.x2, r.y1], [r.x2, r.y2], [r.x1, r.y2]]}
                 for r in paddle_regions],
                img_width, img_height
            )
        except Exception as e:
            logger.error(f"PaddleOCR also failed: {e}")

    if not ocr_regions:
        return {
            "session_id": session_id,
            "page_number": page_number + 1,
            "success": False,
            "error": "No text regions detected",
            "grid": None,
            "deskew_angle": deskew_angle,
        }

    # Step 3: Detect grid structure
    try:
        from services.grid_detection_service import GridDetectionService

        grid_service = GridDetectionService()
        result = grid_service.detect_grid(ocr_regions, img_array, deskew_angle)

        # Store grid data in session
        await update_session_db(
            session_id,
            grid_data=result.to_dict(),
            deskew_angle=deskew_angle
        )

        return {
            "session_id": session_id,
            "page_number": page_number + 1,
            "success": True,
            "grid": result.to_dict(),
            "deskew_angle": deskew_angle,
            "image_dimensions": {
                "width": img_width,
                "height": img_height
            }
        }

    except ImportError as e:
        logger.error(f"Grid detection service not available: {e}")
        raise HTTPException(status_code=500, detail="Grid detection service not available")
    except Exception as e:
        logger.error(f"Grid detection failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"Grid detection failed: {str(e)}")


@router.get("/sessions/{session_id}/grid")
async def get_grid(session_id: str):
    """
    Get the stored grid structure for a session.
    """
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    if not session.grid_data:
        raise HTTPException(status_code=404, detail="No grid data found. Run analyze-grid first.")

    return {
        "session_id": session_id,
        "grid": session.grid_data,
        "deskew_angle": session.deskew_angle
    }


@router.get("/sessions/{session_id}/cell-crop/{page_number}/{row}/{col}")
async def get_cell_crop(session_id: str, page_number: int, row: int, col: int):
    """
    Get a cropped image of a specific grid cell.

    Useful for showing the original image content when manually correcting cells.
    """
    from PIL import Image
    import io

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    if not session.grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run analyze-grid first.")

    # Get cell from grid
    cells = session.grid_data.get("cells", [])
    if row >= len(cells) or col >= len(cells[row] if row < len(cells) else []):
        raise HTTPException(status_code=404, detail="Cell not found")

    cell = cells[row][col]

    # Get PDF image
    pdf_data = get_cached_pdf_data(session_id)
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF data available")

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    img = Image.open(io.BytesIO(image_data))
    img_width, img_height = img.size

    # Crop cell region
    x1 = int(img_width * cell["x"] / 100)
    y1 = int(img_height * cell["y"] / 100)
    x2 = int(img_width * (cell["x"] + cell["width"]) / 100)
    y2 = int(img_height * (cell["y"] + cell["height"]) / 100)

    # Add small padding
    padding = 5
    x1 = max(0, x1 - padding)
    y1 = max(0, y1 - padding)
    x2 = min(img_width, x2 + padding)
    y2 = min(img_height, y2 + padding)

    cropped = img.crop((x1, y1, x2, y2))

    # Convert to PNG
    buffer = io.BytesIO()
    cropped.save(buffer, format='PNG')
    buffer.seek(0)

    return StreamingResponse(buffer, media_type="image/png")


@router.put("/sessions/{session_id}/cell/{row}/{col}")
async def update_cell(session_id: str, row: int, col: int, text: str = Form(...)):
    """
    Manually update the text content of a grid cell.

    Sets recognition_status to 'manual' for the updated cell.
    """
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    if not session.grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run analyze-grid first.")

    # Update cell in grid
    grid_data = session.grid_data
    cells = grid_data.get("cells", [])

    if row >= len(cells) or col >= len(cells[row] if row < len(cells) else []):
        raise HTTPException(status_code=404, detail="Cell not found")

    cells[row][col]["text"] = text
    cells[row][col]["status"] = "manual"
    cells[row][col]["confidence"] = 1.0

    # Update statistics
    recognized = sum(1 for r in cells for c in r if c.get("status") == "recognized")
    manual = sum(1 for r in cells for c in r if c.get("status") == "manual")
    problematic = sum(1 for r in cells for c in r if c.get("status") == "problematic")
    total = len(cells) * len(cells[0]) if cells and cells[0] else 0

    grid_data["stats"] = {
        "recognized": recognized,
        "manual": manual,
        "problematic": problematic,
        "empty": total - recognized - manual - problematic,
        "total": total,
        "coverage": (recognized + manual) / total if total > 0 else 0
    }

    await update_session_db(session_id, grid_data=grid_data)

    return {
        "success": True,
        "cell": cells[row][col],
        "stats": grid_data["stats"]
    }


@router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: List[int] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        pages: List of 0-indexed page numbers to process
        process_all: If True, process all pages
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Try cached PDF data first
    pdf_data = get_cached_pdf_data(session_id)

    # If not cached, try to load from file
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()
        cache_pdf_data(session_id, pdf_data)

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.pdf_page_count or 1

    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif pages is None or len(pages) == 0:
        pages = [0]  # Default to first page

    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    for i, image_data in enumerate(images):
        page_num = pages[i]
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")

        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )

        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence

            # Add page info to each entry and convert to dict
            for entry in vocabulary:
                entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)

            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Store vocabulary in DB (replace existing)
    await update_vocabulary_db(session_id, all_vocabulary)

    # Save first page as preview image
    image_path = None
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        os.makedirs(session_dir, exist_ok=True)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])

    # Update session in DB
    await update_session_db(
        session_id,
        status=SessionStatus.EXTRACTED.value,
        extraction_confidence=avg_confidence,
        processed_pages=pages,
        successful_pages=successful_pages,
        failed_pages=failed_pages,
        image_path=image_path,
    )

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    if error_messages:
        result["errors"] = error_messages

    return result


@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Delete a vocabulary session and all associated files."""
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Delete session directory
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(session_dir):
        import shutil
        shutil.rmtree(session_dir)

    # Clear cached PDF data
    clear_cached_pdf_data(session_id)

    # Delete from database (CASCADE deletes vocab_entries and vocab_worksheets)
    success = await delete_session_db(session_id)

    if not success:
        raise HTTPException(status_code=500, detail="Failed to delete session from database")

    return {"message": "Session deleted successfully", "session_id": session_id}


# =============================================================================
# NRU Format Worksheet Generation
# =============================================================================

class NRUWorksheetRequest(BaseModel):
    """Request model for NRU format worksheet generation."""
    title: Optional[str] = "Vokabeltest"
    include_solutions: bool = True
    specific_pages: Optional[List[int]] = None  # 1-indexed page numbers, None = all


@router.post("/sessions/{session_id}/generate-nru")
async def generate_nru_worksheet(session_id: str, request: NRUWorksheetRequest):
    """
    Generate worksheet PDF in NRU format.

    NRU Format:
    - Per scanned page, generates 2 worksheet pages:
      1. Vocabulary table (3 columns: English, German blank, Correction blank)
      2. Sentence practice (German sentence, 2 empty lines for English translation)

    Automatically separates vocabulary entries into:
    - Single words/phrases -> Vocabulary table
    - Full sentences (end with . ! ? or are long) -> Sentence practice

    Args:
        session_id: Session with extracted vocabulary
        request: Generation options (title, include_solutions, specific_pages)

    Returns:
        Worksheet and solution PDF download info
    """
    logger.info(f"Generating NRU worksheet for session {session_id}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    vocab_dicts = await get_vocabulary_db(session_id)
    if not vocab_dicts:
        raise HTTPException(status_code=400, detail="No vocabulary found in session")

    # Generate PDFs using NRU format
    try:
        from nru_worksheet_generator import generate_nru_pdf, separate_vocab_and_sentences

        # Get statistics
        vocab_list, sentence_list = separate_vocab_and_sentences(vocab_dicts)

        worksheet_pdf, solution_pdf = await generate_nru_pdf(
            entries=vocab_dicts,
            title=request.title or session.name,
            include_solutions=request.include_solutions
        )

        # Save PDFs
        worksheet_id = str(uuid.uuid4())
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        os.makedirs(session_dir, exist_ok=True)

        pdf_path = os.path.join(session_dir, f"nru_worksheet_{worksheet_id}.pdf")
        with open(pdf_path, 'wb') as f:
            f.write(worksheet_pdf)

        solution_path = None
        if solution_pdf:
            solution_path = os.path.join(session_dir, f"nru_solution_{worksheet_id}.pdf")
            with open(solution_path, 'wb') as f:
                f.write(solution_pdf)

        # Store worksheet info
        await create_worksheet_db(
            worksheet_id=worksheet_id,
            session_id=session_id,
            worksheet_types=["nru_format"],
            pdf_path=pdf_path,
            solution_path=solution_path,
        )

        # Get unique pages
        pages = sorted(set(v.get("source_page", 1) for v in vocab_dicts))

        return {
            "worksheet_id": worksheet_id,
            "session_id": session_id,
            "format": "nru",
            "pdf_path": pdf_path,
            "solution_path": solution_path,
            "statistics": {
                "total_entries": len(vocab_dicts),
                "vocabulary_count": len(vocab_list),
                "sentence_count": len(sentence_list),
                "source_pages": pages,
                "worksheet_pages": len(pages) * 2,  # 2 pages per source page
            },
            "download_url": f"/api/v1/vocab/worksheets/{worksheet_id}/pdf",
            "solution_url": f"/api/v1/vocab/worksheets/{worksheet_id}/solution" if solution_path else None,
        }

    except ImportError as e:
        logger.error(f"NRU generator not available: {e}")
        raise HTTPException(status_code=500, detail="NRU worksheet generator not available")
    except Exception as e:
        logger.error(f"NRU worksheet generation failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"Worksheet generation failed: {str(e)}")