# breakpilot-lehrer/klausur-service/backend/vocab_worksheet_compare_api.py

"""
Vocabulary Worksheet Compare & Grid Analysis API.

Split from vocab_worksheet_analysis_api.py — contains the two largest
route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
"""

from fastapi import APIRouter, HTTPException, Query
from typing import Dict, Any
import base64
import json
import logging
import os

from vocab_worksheet_extraction import extract_vocabulary_from_image

OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")

def _get_sessions():
    """Return the ``_sessions`` object exported by ``vocab_worksheet_api``.

    The import is performed at call time rather than at module import time.
    NOTE(review): presumably this avoids a circular import with
    ``vocab_worksheet_api`` — confirm before moving it to module level.
    """
    from vocab_worksheet_api import _sessions as session_store

    return session_store
from vocab_worksheet_generation import convert_pdf_page_to_image

# Try to import Tesseract extractor
try:
    from tesseract_vocab_extractor import (
        run_tesseract_pipeline,
        match_positions_to_vocab, TESSERACT_AVAILABLE,
    )
except ImportError:
    TESSERACT_AVAILABLE = False

# Try to import CV Pipeline
try:
    from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
except ImportError:
    CV_PIPELINE_AVAILABLE = False

# Try to import Grid Detection Service
try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
except ImportError:
    GRID_SERVICE_AVAILABLE = False

logger = logging.getLogger(__name__)

compare_router = APIRouter()


# =============================================================================
# OCR Compare & Grid Analysis Endpoints
# =============================================================================


def _entry_to_dict(entry):
    """Return a plain-dict view of a vocabulary entry.

    Supports objects exposing a ``dict()`` method, plain objects carrying a
    ``__dict__``, and anything ``dict()`` itself can consume (e.g. a mapping).
    """
    if hasattr(entry, "dict"):
        return entry.dict()
    if hasattr(entry, "__dict__"):
        return entry.__dict__.copy()
    return dict(entry)


def _vocab_rows(entries, example_key="example"):
    """Project dict-like entries onto the english/german/example response shape.

    ``example_key`` names the source key holding the example sentence, since
    the extraction methods do not agree on it.
    """
    return [
        {
            "english": item.get("english", ""),
            "german": item.get("german", ""),
            "example": item.get(example_key, ""),
        }
        for item in entries
    ]


def _vocab_pair_set(rows):
    """Return the set of normalised (english, german) pairs found in *rows*.

    Rows missing either side are skipped. Both sides are lower-cased and
    stripped so the same pair reported by different methods compares equal.
    """
    return {
        (row["english"].lower().strip(), row["german"].lower().strip())
        for row in rows
        if row["english"] and row["german"]
    }


def _failed_method_result(name, model, error):
    """Build the result record reported for a method that raised *error*."""
    return {
        "name": name,
        "model": model,
        "duration_seconds": 0,
        "vocabulary_count": 0,
        "vocabulary": [],
        "confidence": 0,
        "success": False,
        "error": str(error),
    }


def _build_comparison(all_vocab_sets):
    """Build the ``comparison`` section of the response.

    A pair counts as "found by all" only when every method that ran,
    including methods that failed and therefore contributed an empty set,
    reported it.
    """
    all_unique = set()
    for pairs in all_vocab_sets.values():
        all_unique |= pairs

    found_by_all = []
    found_by_some = []
    for english, german in sorted(all_unique):
        found_in = [
            method for method, pairs in all_vocab_sets.items()
            if (english, german) in pairs
        ]
        entry = {"english": english, "german": german, "methods": found_in}
        if len(found_in) == len(all_vocab_sets):
            found_by_all.append(entry)
        else:
            found_by_some.append(entry)

    return {
        "found_by_all_methods": found_by_all,
        "found_by_some_methods": found_by_some,
        "total_unique_vocabulary": len(all_unique),
        "agreement_rate": len(found_by_all) / len(all_unique) if all_unique else 0,
    }


@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
async def compare_ocr_methods(session_id: str, page_number: int):
    """
    Run multiple OCR methods on a page and compare results.

    This endpoint:
    1. Gets the page image from the session's uploaded PDF
    2. Runs Vision LLM extraction (primary method)
    3. Runs Tesseract extraction when the extractor module is importable
    4. Runs the CV pipeline when its module is importable
    5. Compares found vocabulary across methods
    6. Returns structured comparison results

    page_number is 0-indexed.

    Side effects: the Tesseract result for the page is stored in the session
    under ``tesseract_page_<n>``; ``tesseract_words`` and
    ``tesseract_image_width``/``tesseract_image_height`` are overwritten with
    the values of the most recently processed page.

    Raises:
        HTTPException: 404 when the session is unknown; 400 when no PDF has
            been uploaded or the page number is out of range.

    A failure inside an individual method never fails the request; it is
    reported in that method's ``error`` field instead.
    """
    import time
    import traceback

    logger.info(f"Compare OCR for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    methods_results = {}
    all_vocab_sets = {}

    # --- Method: Vision LLM ---
    try:
        start = time.perf_counter()
        vocab, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
        )
        duration = time.perf_counter() - start

        llm_rows = _vocab_rows(
            [_entry_to_dict(v) for v in vocab], example_key="example_sentence"
        )

        methods_results["vision_llm"] = {
            "name": "Vision LLM",
            "model": VISION_MODEL,
            "duration_seconds": round(duration, 1),
            "vocabulary_count": len(llm_rows),
            "vocabulary": llm_rows,
            "confidence": confidence,
            "success": len(llm_rows) > 0 and not error,
            "error": error if error else None,
        }
        all_vocab_sets["vision_llm"] = _vocab_pair_set(llm_rows)
    except Exception as e:
        logger.error("Vision LLM failed: %s", e)
        methods_results["vision_llm"] = _failed_method_result("Vision LLM", VISION_MODEL, e)
        all_vocab_sets["vision_llm"] = set()

    # --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
    if TESSERACT_AVAILABLE:
        try:
            start = time.perf_counter()
            tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
            duration = time.perf_counter() - start

            tess_vocab = tess_result.get("vocabulary", [])
            tess_words = tess_result.get("words", [])

            # Store Tesseract words in session for later use (grid analysis, position matching)
            session["tesseract_words"] = tess_words
            session["tesseract_image_width"] = tess_result.get("image_width", 0)
            session["tesseract_image_height"] = tess_result.get("image_height", 0)
            session[f"tesseract_page_{page_number}"] = tess_result

            tess_rows = _vocab_rows(tess_vocab)

            methods_results["tesseract"] = {
                "name": "Tesseract OCR",
                "model": "tesseract-ocr (eng+deu)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(tess_rows),
                "vocabulary": tess_rows,
                "confidence": 0.7 if tess_vocab else 0,
                "success": len(tess_rows) > 0,
                "error": tess_result.get("error"),
                "word_count": tess_result.get("word_count", 0),
                "columns_detected": len(tess_result.get("columns", [])),
            }
            all_vocab_sets["tesseract"] = _vocab_pair_set(tess_rows)
        except Exception as e:
            logger.error("Tesseract failed: %s", e)
            logger.debug(traceback.format_exc())
            methods_results["tesseract"] = _failed_method_result("Tesseract OCR", "tesseract-ocr", e)
            all_vocab_sets["tesseract"] = set()
        else:
            # Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results.
            # This is an optional enrichment, so it is guarded separately: a
            # matching failure must not discard the Tesseract result above.
            if methods_results["vision_llm"]["success"]:
                try:
                    methods_results["vision_llm"]["vocabulary"] = match_positions_to_vocab(
                        tess_words,
                        methods_results["vision_llm"]["vocabulary"],
                        tess_result.get("image_width", 1),
                        tess_result.get("image_height", 1),
                    )
                except Exception as e:
                    logger.warning("Bounding-box matching failed, keeping LLM vocabulary without positions: %s", e)
                    logger.debug(traceback.format_exc())

    # --- Method: CV Pipeline (Document Reconstruction) ---
    if CV_PIPELINE_AVAILABLE:
        cv_name = "CV Pipeline (Document Reconstruction)"
        cv_model = "opencv + tesseract (multi-pass)"
        try:
            start = time.perf_counter()
            cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
            duration = time.perf_counter() - start

            # A pipeline-reported error means its vocabulary is not trusted.
            cv_vocab = cv_result.vocabulary if not cv_result.error else []
            cv_rows = _vocab_rows(cv_vocab)

            methods_results["cv_pipeline"] = {
                "name": cv_name,
                "model": cv_model,
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(cv_rows),
                "vocabulary": cv_rows,
                "confidence": 0.8 if cv_vocab else 0,
                "success": len(cv_rows) > 0,
                "error": cv_result.error,
                "word_count": cv_result.word_count,
                "columns_detected": cv_result.columns_detected,
                "stages": cv_result.stages,
            }
            all_vocab_sets["cv_pipeline"] = _vocab_pair_set(cv_rows)
        except Exception as e:
            logger.error("CV Pipeline failed: %s", e)
            logger.debug(traceback.format_exc())
            methods_results["cv_pipeline"] = _failed_method_result(cv_name, cv_model, e)
            all_vocab_sets["cv_pipeline"] = set()

    # --- Build comparison ---
    comparison = _build_comparison(all_vocab_sets)

    # Best method = the one with the most distinct complete (english, german) pairs
    best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"

    return {
        "session_id": session_id,
        "page_number": page_number,
        "methods": methods_results,
        "comparison": comparison,
        "recommendation": {
            "best_method": best_method,
            "reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
        },
    }


# Prompt sent to the Vision LLM when no Tesseract-based grid is available.
_GRID_PROMPT = """Analyze this textbook page image. It contains a vocabulary table/grid.

Your task: Identify the TABLE STRUCTURE and extract each cell's content.

Return a JSON object with this EXACT structure:
{
  "rows": <number of rows>,
  "columns": <number of columns>,
  "column_types": ["english", "german", "example"],
  "entries": [
    {
      "row": 0,
      "col": 0,
      "text": "the word or phrase in this cell",
      "column_type": "english",
      "confidence": 0.95
    }
  ]
}

Rules:
- row and col are 0-indexed
- column_type is one of: "english", "german", "example", "unknown"
- Detect whether each column contains English words, German translations, or example sentences
- Include ALL non-empty cells
- confidence is 0.0-1.0 based on how clear the text is
- If a cell is empty, don't include it
- Return ONLY the JSON, no other text"""


def _grid_dimension(value):
    """Coerce an LLM-reported row/column count to a non-negative int.

    Returns 0 for anything that cannot be interpreted as a positive count
    (None, non-numeric strings, negative numbers, NaN/Infinity).
    """
    try:
        number = int(value)
    except (TypeError, ValueError, OverflowError):
        return 0
    return max(number, 0)


def _cell_confidence(value, default=0.8):
    """Return *value* as a number, or *default* when it is not numeric."""
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return value
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _build_llm_grid(grid_raw):
    """Convert the Vision LLM's parsed JSON answer into the grid dict.

    Cells are laid out on a uniform grid with percentage-based coordinates
    (0-100), since the LLM reports only row/column indices and no pixel
    positions. Malformed parts of the answer (non-dict entries, null text,
    non-numeric confidence) are tolerated rather than failing the whole grid.

    Returns:
        The grid dict, or ``None`` when the answer contains no usable
        row/column counts.
    """
    if not isinstance(grid_raw, dict):
        return None

    num_rows = _grid_dimension(grid_raw.get("rows", 0))
    num_cols = _grid_dimension(grid_raw.get("columns", 0))
    if num_rows == 0 or num_cols == 0:
        return None

    # Ensure column_types has at least one entry per column
    raw_types = grid_raw.get("column_types")
    column_types = list(raw_types) if isinstance(raw_types, list) else []
    column_types.extend(["unknown"] * (num_cols - len(column_types)))

    # Index the reported entries by (row, col); later duplicates win
    raw_entries = grid_raw.get("entries")
    cell_map = {}
    for entry in raw_entries if isinstance(raw_entries, list) else []:
        if not isinstance(entry, dict):
            continue
        try:
            cell_map[(entry.get("row", 0), entry.get("col", 0))] = entry
        except TypeError:
            # Unhashable row/col value (e.g. a list) - ignore this entry
            continue

    row_height = 100.0 / num_rows
    col_width = 100.0 / num_cols
    counts = {"recognized": 0, "problematic": 0, "empty": 0}

    cells = []
    for r in range(num_rows):
        row_cells = []
        for c in range(num_cols):
            entry = cell_map.get((r, c))
            if entry is None:
                text = ""
                conf = 0.0
                col_type = column_types[c]
                status = "empty"
            else:
                raw_text = entry.get("text", "")
                text = "" if raw_text is None else str(raw_text).strip()
                conf = _cell_confidence(entry.get("confidence", 0.8))
                col_type = entry.get("column_type", column_types[c])
                if not text:
                    status = "empty"
                elif conf >= 0.5:
                    status = "recognized"
                else:
                    status = "problematic"
            counts[status] += 1

            row_cells.append({
                "row": r,
                "col": c,
                "x": round(c * col_width, 2),
                "y": round(r * row_height, 2),
                "width": round(col_width, 2),
                "height": round(row_height, 2),
                "text": text,
                "confidence": conf,
                "status": status,
                "column_type": col_type,
            })
        cells.append(row_cells)

    total = num_rows * num_cols
    coverage = (counts["recognized"] + counts["problematic"]) / total

    return {
        "rows": num_rows,
        "columns": num_cols,
        "cells": cells,
        "column_types": column_types,
        # Column and row boundaries as percentages
        "column_boundaries": [round(c * col_width, 2) for c in range(num_cols + 1)],
        "row_boundaries": [round(r * row_height, 2) for r in range(num_rows + 1)],
        "deskew_angle": 0.0,
        "source": "vision_llm",
        "stats": {
            "recognized": counts["recognized"],
            "problematic": counts["problematic"],
            "empty": counts["empty"],
            "manual": 0,
            "total": total,
            "coverage": round(coverage, 3),
        },
    }


@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
    """
    Analyze the grid/table structure of a vocabulary page.

    Hybrid approach:
    1. If ``use_tesseract`` is set and both the Tesseract extractor and the
       GridDetectionService are importable, use Tesseract bounding boxes
       (cached in the session by compare-ocr, or computed and cached here)
       for real spatial positions via GridDetectionService.
    2. Otherwise, or when strategy 1 yields no grid or raises, fall back to
       the Vision LLM for grid structure detection.

    page_number is 0-indexed.
    Returns GridData structure expected by the frontend GridOverlay component,
    wrapped as ``{"success": True, "grid": ...}``; analysis failures are
    returned as ``{"success": False, "error": ...}`` rather than raised.

    Raises:
        HTTPException: 404 when the session is unknown; 400 when no PDF has
            been uploaded or the page number is out of range.
    """
    import asyncio
    import re
    import traceback

    import httpx

    logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail="Invalid page number.")

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
    if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
        try:
            tess_page_data = session.get(f"tesseract_page_{page_number}")

            # Run Tesseract if not already cached
            if not tess_page_data:
                logger.info("Running Tesseract for grid analysis (not cached)")
                from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
                tess_page_data = await _run_tess(image_data, lang="eng+deu")
                session[f"tesseract_page_{page_number}"] = tess_page_data
                session["tesseract_words"] = tess_page_data.get("words", [])
                session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
                session["tesseract_image_height"] = tess_page_data.get("image_height", 0)

            tess_words = tess_page_data.get("words", [])
            img_w = tess_page_data.get("image_width", 0)
            img_h = tess_page_data.get("image_height", 0)

            if tess_words and img_w > 0 and img_h > 0:
                service = GridDetectionService()
                regions = service.convert_tesseract_regions(tess_words, img_w, img_h)

                if regions:
                    grid_result = service.detect_grid(regions)
                    grid_dict = grid_result.to_dict()

                    # NOTE: only source metadata is added here; no Vision LLM
                    # text is merged into the Tesseract-based grid.
                    grid_dict["source"] = "tesseract+grid_service"
                    grid_dict["word_count"] = len(tess_words)

                    logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
                                f"{grid_result.stats.get('recognized', 0)} recognized")

                    return {"success": True, "grid": grid_dict}

            logger.info("Tesseract data insufficient, falling back to LLM")

        except Exception as e:
            logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
            logger.debug(traceback.format_exc())

    # --- Strategy 2: Fall back to Vision LLM ---
    image_base64 = base64.b64encode(image_data).decode("utf-8")
    payload = {
        "model": VISION_MODEL,
        "messages": [{"role": "user", "content": _GRID_PROMPT, "images": [image_base64]}],
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 8192},
    }

    try:
        max_retries = 3
        # One client for all attempts; its timeout applies to every request.
        async with httpx.AsyncClient(timeout=300.0) as client:
            for attempt in range(max_retries):
                response = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)

                # Only HTTP 500 is retried, with a linearly growing pause.
                if response.status_code != 500 or attempt == max_retries - 1:
                    break
                wait_time = 10 * (attempt + 1)
                logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
                await asyncio.sleep(wait_time)

        if response.status_code != 200:
            error_detail = response.text[:200] if response.text else "Unknown error"
            return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}

        # Tolerate a missing/null "message" or "content" in the reply
        raw_text = (response.json().get("message") or {}).get("content") or ""

        # Parse JSON from response: take the outermost {...} span so that any
        # prose or code fences around the JSON are ignored.
        json_match = re.search(r'\{[\s\S]*\}', raw_text)
        if not json_match:
            return {"success": False, "error": "Could not parse grid structure from LLM response"}

        try:
            grid_raw = json.loads(json_match.group())
        except json.JSONDecodeError as e:
            logger.warning("LLM grid response is not valid JSON: %s", e)
            return {"success": False, "error": "Could not parse grid structure from LLM response"}

        grid_data = _build_llm_grid(grid_raw)
        if grid_data is None:
            return {"success": False, "error": "No grid structure detected"}

        return {"success": True, "grid": grid_data}

    except httpx.TimeoutException:
        logger.error("Grid analysis timed out")
        return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
    except Exception as e:
        logger.error(f"Grid analysis failed: {e}")
        logger.debug(traceback.format_exc())
        return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}