# breakpilot-lehrer/klausur-service/backend/vocab_worksheet_analysis_api.py

"""
Vocabulary Worksheet Analysis API - OCR export, ground truth labeling,
extract-with-boxes, deskewed images, and learning unit generation.

The two large handlers (compare_ocr_methods, analyze_grid) live in
vocab_worksheet_compare_api.py and are included via compare_router.
"""

from fastapi import APIRouter, Body, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional, Dict, Any
from datetime import datetime
import os
import io
import json
import logging

def _get_sessions():
    """Return the ``_sessions`` object exported by ``vocab_worksheet_api``."""
    # The import runs at call time instead of at module load.
    # NOTE(review): presumably this avoids a circular import with
    # vocab_worksheet_api - confirm before hoisting it to the top of the file.
    from vocab_worksheet_api import _sessions
    return _sessions

def _get_local_storage_path():
    """Return ``LOCAL_STORAGE_PATH`` as exported by ``vocab_worksheet_api``."""
    # The import runs at call time instead of at module load.
    # NOTE(review): presumably this avoids a circular import with
    # vocab_worksheet_api - confirm before hoisting it to the top of the file.
    from vocab_worksheet_api import LOCAL_STORAGE_PATH
    return LOCAL_STORAGE_PATH
from vocab_worksheet_generation import convert_pdf_page_to_image

# Try to import Tesseract extractor.
# If the import fails, extract_bounding_boxes stays undefined and
# TESSERACT_AVAILABLE is False; extract_entries_with_boxes checks the flag
# before calling it.
try:
    from tesseract_vocab_extractor import (
        extract_bounding_boxes, TESSERACT_AVAILABLE,
    )
except ImportError:
    TESSERACT_AVAILABLE = False

# Try to import Grid Detection Service.
# GRID_SERVICE_AVAILABLE records whether GridDetectionService could be
# imported; extract_entries_with_boxes checks it before instantiating the class.
try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
except ImportError:
    GRID_SERVICE_AVAILABLE = False

# Module-level logger used by the handlers below.
logger = logging.getLogger(__name__)

# Router that every endpoint in this module registers on.
analysis_router = APIRouter()

def _ocr_export_dir():
    """Return the path of the "ocr-exports" folder under the local storage root."""
    storage_root = _get_local_storage_path()
    return os.path.join(storage_root, "ocr-exports")

def _ground_truth_dir():
    """Return the path of the "ground-truth" folder under the local storage root."""
    storage_root = _get_local_storage_path()
    return os.path.join(storage_root, "ground-truth")


# =============================================================================
# OCR Export Endpoints (for cross-app OCR data sharing)
# =============================================================================


@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
    """
    Persist an OCR export for cross-app sharing (admin-v2 -> studio-v2).

    The request body is written as JSON to
    ``<session_id>_page<page_number>.json`` in the OCR export directory, and
    ``latest.json`` in the same directory is rewritten to point at it.

    Both apps proxy to klausur-service via /klausur-api/, so this endpoint
    serves as shared storage accessible from both ports.
    """
    logger.info(f"Saving OCR export for session {session_id}, page {page_number}")

    export_dir = _ocr_export_dir()
    os.makedirs(export_dir, exist_ok=True)

    # Write the payload itself
    payload_file = os.path.join(export_dir, f"{session_id}_page{page_number}.json")
    with open(payload_file, 'w', encoding='utf-8') as out:
        json.dump(data, out, ensure_ascii=False, indent=2)

    # Refresh the pointer that load_latest_ocr_export follows
    pointer_file = os.path.join(export_dir, "latest.json")
    with open(pointer_file, 'w', encoding='utf-8') as out:
        pointer = {
            "session_id": session_id,
            "page_number": page_number,
            "saved_at": datetime.utcnow().isoformat(),
        }
        json.dump(pointer, out, ensure_ascii=False, indent=2)

    response = {
        "success": True,
        "session_id": session_id,
        "page_number": page_number,
        "message": "OCR export saved successfully",
    }
    return response


@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
    """Return the stored OCR export for one session page.

    Raises HTTPException 404 when no export file exists for that page.
    """
    filename = f"{session_id}_page{page_number}.json"
    export_file = os.path.join(_ocr_export_dir(), filename)

    if os.path.exists(export_file):
        with open(export_file, 'r', encoding='utf-8') as src:
            return json.load(src)

    raise HTTPException(status_code=404, detail="OCR export not found")


@analysis_router.get("/ocr-export/latest")
async def load_latest_ocr_export():
    """Return the OCR export that ``latest.json`` currently points at.

    Raises HTTPException 404 when either the pointer file or the export file
    it references is missing.
    """
    export_dir = _ocr_export_dir()

    # First hop: the pointer file names the session/page that was saved last.
    pointer_file = os.path.join(export_dir, "latest.json")
    if not os.path.exists(pointer_file):
        raise HTTPException(status_code=404, detail="No OCR exports found")
    with open(pointer_file, 'r', encoding='utf-8') as src:
        latest = json.load(src)

    sid = latest.get("session_id")
    page = latest.get("page_number")

    # Second hop: the export file itself.
    target_file = os.path.join(export_dir, f"{sid}_page{page}.json")
    if not os.path.exists(target_file):
        raise HTTPException(status_code=404, detail="Latest OCR export file not found")
    with open(target_file, 'r', encoding='utf-8') as src:
        return json.load(src)


# =============================================================================
# Extract with Boxes & Deskewed Image
# =============================================================================


async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Extract vocabulary entries with bounding boxes using Tesseract + GridDetectionService.

    Returns a dict with an 'entries' list plus 'image_width'/'image_height'.
    Each entry carries row_index, english, german, example, confidence, bbox,
    bbox_en, bbox_de and bbox_ex. All bbox coordinates are in percent (0-100).

    Raises HTTPException 500 when Tesseract or GridDetectionService is not
    available.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Step 1: word-level bounding boxes from Tesseract
    ocr = await extract_bounding_boxes(image_bytes, lang=lang)
    words = ocr.get("words", [])
    img_w = ocr.get("image_width", 0)
    img_h = ocr.get("image_height", 0)

    def _result(found):
        # Every exit path reports the same image dimensions.
        return {"entries": found, "image_width": img_w, "image_height": img_h}

    def _zero_box():
        # Placeholder for a column that has no cell in the row.
        return {"x": 0, "y": 0, "w": 0, "h": 0}

    if not words or img_w == 0 or img_h == 0:
        return _result([])

    # Step 2: convert to percentage-based OCR regions
    service = GridDetectionService()
    regions = service.convert_tesseract_regions(words, img_w, img_h)
    if not regions:
        return _result([])

    # Step 3: detect the grid
    grid = service.detect_grid(regions)
    if not grid.cells:
        return _result([])

    # Step 4: collapse each grid row into one entry, keyed by column type
    from services.grid_detection_service import ColumnType

    entries = []
    for row_idx, row_cells in enumerate(grid.cells):
        text = {"en": "", "de": "", "ex": ""}
        box = {"en": None, "de": None, "ex": None}
        confidences = []

        for cell in row_cells:
            cell_box = {
                "x": round(cell.x, 2),
                "y": round(cell.y, 2),
                "w": round(cell.width, 2),
                "h": round(cell.height, 2),
            }
            stripped = cell.text.strip()

            if cell.column_type == ColumnType.ENGLISH:
                slot = "en"
            elif cell.column_type == ColumnType.GERMAN:
                slot = "de"
            elif cell.column_type == ColumnType.EXAMPLE:
                slot = "ex"
            else:
                slot = None

            if slot is not None:
                text[slot] = stripped
                box[slot] = cell_box

            # Only cells that actually contain text count towards confidence.
            if stripped:
                confidences.append(cell.confidence)

        # Rows without any text in the three columns are dropped.
        if not any(text.values()):
            continue

        # Whole-row box = union of the column boxes that exist.
        present = [b for b in box.values() if b is not None]
        if present:
            left = min(b["x"] for b in present)
            top = min(b["y"] for b in present)
            right = max(b["x"] + b["w"] for b in present)
            bottom = max(b["y"] + b["h"] for b in present)
            row_box = {
                "x": round(left, 2),
                "y": round(top, 2),
                "w": round(right - left, 2),
                "h": round(bottom - top, 2),
            }
        else:
            row_box = {"x": 0, "y": 0, "w": 100, "h": 3}

        # Mean cell confidence scaled by 100; 0 when no cell had text.
        avg_conf = round(
            (sum(confidences, 0.0) / len(confidences) * 100) if len(confidences) > 0 else 0,
            1,
        )

        entries.append({
            "row_index": row_idx,
            "english": text["en"],
            "german": text["de"],
            "example": text["ex"],
            "confidence": avg_conf,
            "bbox": row_box,
            "bbox_en": box["en"] or _zero_box(),
            "bbox_de": box["de"] or _zero_box(),
            "bbox_ex": box["ex"] or _zero_box(),
        })

    return _result(entries)


@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Extract vocabulary entries with bounding boxes for ground truth labeling.

    Renders the requested PDF page, attempts to deskew it, runs
    extract_entries_with_boxes on the result, and caches both the image and
    the entries in the session. page_number is 0-indexed.

    Raises HTTPException 404 for an unknown session and 400 when the session
    has no PDF or the page number is out of range.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not (0 <= page_number < page_count):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Render the page at full resolution
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Best-effort deskew: any failure is logged and the unrotated image is used.
    deskew_angle = 0.0
    try:
        from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
        if CV2_AVAILABLE:
            image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
            logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
    except Exception as exc:
        logger.warning(f"Deskew failed for page {page_number}: {exc}")

    page_key = str(page_number)

    # Keep the (possibly deskewed) image so get_deskewed_image can serve it later
    session.setdefault("deskewed_images", {})[page_key] = image_data

    # OCR + grid detection on that same image
    result = await extract_entries_with_boxes(image_data)
    entries = result["entries"]

    # Keep the extracted entries alongside the image
    session.setdefault("gt_entries", {})[page_key] = entries

    return {
        "success": True,
        "entries": entries,
        "entry_count": len(entries),
        "image_width": result["image_width"],
        "image_height": result["image_height"],
        "deskew_angle": round(deskew_angle, 2),
        "deskewed": abs(deskew_angle) > 0.05,
    }


@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Return the deskewed page image as PNG.

    Falls back to the original hires image if no deskewed version is cached.
    page_number is 0-indexed.

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: no PDF is stored for the session, or the fallback
            render was requested for a page number outside the PDF.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    deskewed = session.get("deskewed_images", {}).get(str(page_number))

    if deskewed:
        return StreamingResponse(io.BytesIO(deskewed), media_type="image/png")

    # Fallback: render original hires image
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Reject out-of-range pages with a 400 instead of handing them to the
    # renderer. The upper bound is only enforced when the session records a
    # page count, so sessions without that key keep working as before.
    page_count = session.get("pdf_page_count")
    if page_count is not None and (page_number < 0 or page_number >= page_count):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
    if page_number < 0:
        raise HTTPException(status_code=400, detail="Invalid page number. Pages are 0-indexed.")

    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    return StreamingResponse(io.BytesIO(image_data), media_type="image/png")


# =============================================================================
# Ground Truth Labeling
# =============================================================================


@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Save ground truth labels for a page.

    Expects body with 'entries' list - each entry has english, german, example,
    status ('confirmed' | 'edited' | 'skipped'), and bbox fields.

    The entries are stored in the session under "ground_truth" and written as
    JSON to ``<session_id>_page<page_number>.json`` in the ground-truth
    directory. The response reports how many entries carry each status.

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: 'entries' is missing, empty, not a list, or
            contains items that are not JSON objects.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    entries = data.get("entries", [])
    if not entries:
        raise HTTPException(status_code=400, detail="No entries provided")

    # Validate the shape before anything is persisted: a malformed payload
    # would otherwise be written to the session and to disk and only then
    # fail in the status tally below with an unhandled AttributeError.
    if not isinstance(entries, list) or not all(isinstance(e, dict) for e in entries):
        raise HTTPException(status_code=400, detail="'entries' must be a list of objects")

    # Save in session
    session = sessions[session_id]
    if "ground_truth" not in session:
        session["ground_truth"] = {}
    session["ground_truth"][str(page_number)] = entries

    # Also save to disk
    gt_dir = _ground_truth_dir()
    os.makedirs(gt_dir, exist_ok=True)
    gt_path = os.path.join(gt_dir, f"{session_id}_page{page_number}.json")
    gt_data = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now().isoformat(),
        "entry_count": len(entries),
        "entries": entries,
    }
    with open(gt_path, 'w', encoding='utf-8') as f:
        json.dump(gt_data, f, ensure_ascii=False, indent=2)

    logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")

    # Per-status tally for the response; entries with any other status are
    # saved but not counted in these three buckets.
    confirmed = sum(1 for e in entries if e.get("status") == "confirmed")
    edited = sum(1 for e in entries if e.get("status") == "edited")
    skipped = sum(1 for e in entries if e.get("status") == "skipped")

    return {
        "success": True,
        "saved_count": len(entries),
        "confirmed": confirmed,
        "edited": edited,
        "skipped": skipped,
        "file_path": gt_path,
    }


@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Return saved ground truth entries for a page.

    The session's in-memory copy is preferred; otherwise the JSON file in the
    ground-truth directory is read. The "source" field of the response says
    which one was used ("cache" or "disk").

    Raises HTTPException 404 for an unknown session or when neither source
    has data for the page.
    """
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # In-memory copy first
    in_memory = sessions[session_id].get("ground_truth", {}).get(str(page_number))
    if in_memory:
        return {"success": True, "entries": in_memory, "source": "cache"}

    # Then the file written by save_ground_truth
    disk_file = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
    if not os.path.exists(disk_file):
        raise HTTPException(status_code=404, detail="No ground truth found for this page")

    with open(disk_file, 'r', encoding='utf-8') as src:
        stored = json.load(src)

    return {"success": True, "entries": stored.get("entries", []), "source": "disk"}


# ─── Learning Module Generation ─────────────────────────────────────────────


# Request body for the generate-learning-unit endpoint. Both fields are
# optional, and the endpoint builds a default instance when no body is sent.
class GenerateLearningUnitRequest(BaseModel):
    # Forwarded unchanged as the `grade` argument of create_learning_unit.
    grade: Optional[str] = None
    # When True, generate_learning_modules is called after the unit is created.
    generate_modules: bool = True


@analysis_router.post("/sessions/{session_id}/generate-learning-unit")
async def generate_learning_unit_endpoint(session_id: str, request: Optional[GenerateLearningUnitRequest] = None):
    """
    Create a Learning Unit from the vocabulary in this session.

    1. Takes vocabulary from the session
    2. Creates a Learning Unit in backend-lehrer
    3. Optionally triggers MC/Cloze/QA generation

    Returns the created unit info and generation status. A failure in step 3
    does not fail the request: it is logged and reported under the
    "generation" key of the result with status "error".

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: the session has no vocabulary, or the bridge
            raised ValueError.
        HTTPException 501: vocab_learn_bridge could not be imported.
        HTTPException 502: the bridge raised RuntimeError.
    """
    # The body is optional; fall back to the model defaults.
    if request is None:
        request = GenerateLearningUnitRequest()

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = sessions[session_id]
    vocabulary = session.get("vocabulary", [])

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary in this session")

    try:
        from vocab_learn_bridge import create_learning_unit, generate_learning_modules

        # Step 1: Create Learning Unit
        result = await create_learning_unit(
            session_name=session["name"],
            vocabulary=vocabulary,
            grade=request.grade,
        )

        # Step 2: Generate modules if requested
        if request.generate_modules:
            try:
                gen_result = await generate_learning_modules(
                    unit_id=result["unit_id"],
                    analysis_path=result["analysis_path"],
                )
                result["generation"] = gen_result
            except Exception as e:
                # Deliberately best-effort: the unit already exists, so the
                # failure is reported in the response instead of raised.
                logger.warning(f"Module generation failed (unit created): {e}")
                result["generation"] = {"status": "error", "reason": str(e)}

        return result

    # Chain the original exception so the root cause stays in the traceback.
    except ImportError as e:
        raise HTTPException(status_code=501, detail="vocab_learn_bridge module not available") from e
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        raise HTTPException(status_code=502, detail=str(e)) from e


# =============================================================================
# Include compare_ocr_methods & analyze_grid from companion module
# =============================================================================

# Imported at the bottom of the module rather than with the other imports
# (hence the E402 suppression).
# NOTE(review): presumably vocab_worksheet_compare_api relies on names defined
# earlier in this module - confirm before moving this import to the top.
from vocab_worksheet_compare_api import compare_router  # noqa: E402

# Register the companion module's handlers on this module's router.
analysis_router.include_router(compare_router)