Python (6 files in klausur-service): - rbac.py (1,132 → 4), admin_api.py (1,012 → 4) - routes/eh.py (1,111 → 4), ocr_pipeline_geometry.py (1,105 → 5) Python (2 files in backend-lehrer): - unit_api.py (1,226 → 6), game_api.py (1,129 → 5) Website (6 page files): - 4x klausur-korrektur pages (1,249-1,328 LOC each) → shared components in website/components/klausur-korrektur/ (17 shared files) - companion (1,057 → 10), magic-help (1,017 → 8) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
300 lines
10 KiB
Python
300 lines
10 KiB
Python
"""
|
|
OCR Pipeline Structure Detection and Exclude Regions
|
|
|
|
Detect document structure (boxes, zones, color regions, graphics)
|
|
and manage user-drawn exclude regions.
|
|
Extracted from ocr_pipeline_geometry.py for file-size compliance.
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from typing import Any, Dict, List
|
|
|
|
import cv2
|
|
import numpy as np
|
|
from fastapi import APIRouter, HTTPException
|
|
from pydantic import BaseModel
|
|
|
|
from cv_box_detect import detect_boxes
|
|
from cv_color_detect import _COLOR_RANGES, _COLOR_HEX
|
|
from cv_graphic_detect import detect_graphic_elements
|
|
from ocr_pipeline_session_store import (
|
|
get_session_db,
|
|
update_session_db,
|
|
)
|
|
from ocr_pipeline_common import (
|
|
_cache,
|
|
_load_session_to_cache,
|
|
_get_cached,
|
|
_filter_border_ghost_words,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Structure Detection Endpoint
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _collect_words(word_result) -> List[Dict]:
    """Flatten OCR word boxes out of the cell results.

    Falls back to the raw word lists (paddle-split first, then tesseract,
    then plain paddle) when the cells carry no word boxes yet. Returns an
    empty list when no word result is available at all.
    """
    words: List[Dict] = []
    if word_result and word_result.get("cells"):
        for cell in word_result["cells"]:
            words.extend(cell.get("word_boxes") or [])
    # Fallback: use raw OCR words if cell word_boxes are empty
    if not words and word_result:
        for key in ("raw_paddle_words_split", "raw_tesseract_words", "raw_paddle_words"):
            raw = word_result.get(key, [])
            if raw:
                words = raw
                logger.info("detect-structure: using %d words from %s (no cell word_boxes)", len(words), key)
                break
    return words


def _content_bounds(words: List[Dict], w: int, h: int):
    """Return (x, y, width, height) of the content area.

    Uses the union of all word boxes when words exist; otherwise falls back
    to the full image minus a 3% margin on every side.
    """
    if words:
        x = max(0, min(int(wb["left"]) for wb in words))
        y = max(0, min(int(wb["top"]) for wb in words))
        right = min(w, max(int(wb["left"] + wb["width"]) for wb in words))
        bottom = min(h, max(int(wb["top"] + wb["height"]) for wb in words))
        return x, y, right - x, bottom - y
    margin = int(min(w, h) * 0.03)
    return margin, margin, w - 2 * margin, h - 2 * margin


def _sample_box_colors(hsv: np.ndarray, boxes, w: int, h: int) -> List[Dict[str, str]]:
    """Sample the median background color of each detected box.

    Only the central quarter of each box is sampled so borders and text at
    the edges do not skew the median.
    """
    # Hoisted out of the per-box loop (was re-executed on every iteration).
    from cv_color_detect import _hue_to_color_name

    colors: List[Dict[str, str]] = []
    for box in boxes:
        # Center region of the box, clamped to valid image coordinates.
        cy1 = max(0, min(box.y + box.height // 4, h - 1))
        cy2 = max(0, min(box.y + 3 * box.height // 4, h - 1))
        cx1 = max(0, min(box.x + box.width // 4, w - 1))
        cx2 = max(0, min(box.x + 3 * box.width // 4, w - 1))
        if cy2 > cy1 and cx2 > cx1:
            roi_hsv = hsv[cy1:cy2, cx1:cx2]
            med_h = float(np.median(roi_hsv[:, :, 0]))
            med_s = float(np.median(roi_hsv[:, :, 1]))
            med_v = float(np.median(roi_hsv[:, :, 2]))
            if med_s > 15:  # saturated enough to call it a real color
                bg_name = _hue_to_color_name(med_h)
                bg_hex = _COLOR_HEX.get(bg_name, "#6b7280")
            else:
                bg_name = "gray" if med_v < 220 else "white"
                bg_hex = "#6b7280" if bg_name == "gray" else "#ffffff"
        else:
            # Degenerate box (clamped away to nothing) -- color unknown.
            bg_name = "unknown"
            bg_hex = "#6b7280"
        colors.append({"color_name": bg_name, "color_hex": bg_hex})
    return colors


def _color_pixel_summary(hsv: np.ndarray, h: int, w: int) -> Dict[str, int]:
    """Quick page-wide scan: count pixels matching each known color range."""
    summary: Dict[str, int] = {}
    for color_name, ranges in _COLOR_RANGES.items():
        mask = np.zeros((h, w), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(hsv, lower, upper))
        pixel_count = int(np.sum(mask > 0))
        if pixel_count > 50:  # minimum threshold to ignore noise
            summary[color_name] = pixel_count
    return summary


@router.post("/sessions/{session_id}/detect-structure")
async def detect_structure(session_id: str):
    """Detect document structure: boxes, zones, graphics, and color regions.

    Runs box detection (line + shading), zone splitting, per-box background
    color sampling, a page-wide color scan, and graphic-element detection on
    the cropped (or dewarped) image. Border-ghost words are filtered out of
    the cached OCR word result, user-drawn exclude regions from a previous
    run are preserved, and the aggregated result is persisted to the session.

    Raises:
        HTTPException: 400 when neither crop nor dewarp has completed yet.
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image; fall back to the dewarped one.
    img_bgr = (
        cached.get("cropped_bgr")
        if cached.get("cropped_bgr") is not None
        else cached.get("dewarped_bgr")
    )
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")

    t0 = time.time()
    h, w = img_bgr.shape[:2]

    # --- Content bounds from word result (if available) or full image ---
    word_result = cached.get("word_result")
    words = _collect_words(word_result)
    content_x, content_y, content_w_px, content_h_px = _content_bounds(words, w, h)

    # --- Box detection ---
    boxes = detect_boxes(
        img_bgr,
        content_x=content_x,
        content_w=content_w_px,
        content_y=content_y,
        content_h=content_h_px,
    )

    # --- Zone splitting ---
    from cv_box_detect import split_page_into_zones as _split_zones
    zones = _split_zones(content_x, content_y, content_w_px, content_h_px, boxes)

    # --- Color analysis: per-box background shading + page-wide summary ---
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    box_colors = _sample_box_colors(hsv, boxes, w, h)
    color_summary = _color_pixel_summary(hsv, h, w)

    # --- Graphic element detection ---
    box_dicts = [{"x": b.x, "y": b.y, "w": b.width, "h": b.height} for b in boxes]
    graphics = detect_graphic_elements(
        img_bgr, words,
        detected_boxes=box_dicts,
    )

    # --- Filter border-ghost words from OCR result ---
    ghost_count = 0
    if boxes and word_result:
        ghost_count = _filter_border_ghost_words(word_result, boxes)
        if ghost_count:
            logger.info("detect-structure: removed %d border-ghost words", ghost_count)
            await update_session_db(session_id, word_result=word_result)
            cached["word_result"] = word_result

    duration = time.time() - t0

    # Preserve user-drawn exclude regions from previous run
    prev_sr = cached.get("structure_result") or {}
    prev_exclude = prev_sr.get("exclude_regions", [])

    result_dict = {
        "image_width": w,
        "image_height": h,
        "content_bounds": {
            "x": content_x, "y": content_y,
            "w": content_w_px, "h": content_h_px,
        },
        "boxes": [
            {
                "x": b.x, "y": b.y, "w": b.width, "h": b.height,
                "confidence": b.confidence,
                "border_thickness": b.border_thickness,
                "bg_color_name": box_colors[i]["color_name"],
                "bg_color_hex": box_colors[i]["color_hex"],
            }
            for i, b in enumerate(boxes)
        ],
        "zones": [
            {
                "index": z.index,
                "zone_type": z.zone_type,
                "y": z.y, "h": z.height,
                "x": z.x, "w": z.width,
            }
            for z in zones
        ],
        "graphics": [
            {
                "x": g.x, "y": g.y, "w": g.width, "h": g.height,
                "area": g.area,
                "shape": g.shape,
                "color_name": g.color_name,
                "color_hex": g.color_hex,
                "confidence": round(g.confidence, 2),
            }
            for g in graphics
        ],
        "exclude_regions": prev_exclude,
        "color_pixel_counts": color_summary,
        "has_words": len(words) > 0,
        "word_count": len(words),
        "border_ghosts_removed": ghost_count,
        "duration_seconds": round(duration, 2),
    }

    # Persist to session
    await update_session_db(session_id, structure_result=result_dict)
    cached["structure_result"] = result_dict

    logger.info("detect-structure session %s: %d boxes, %d zones, %d graphics, %.2fs",
                session_id, len(boxes), len(zones), len(graphics), duration)

    return {"session_id": session_id, **result_dict}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Exclude Regions -- user-drawn rectangles to exclude from OCR results
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class _ExcludeRegionIn(BaseModel):
    """Request schema for a single user-drawn exclude rectangle.

    Coordinates are in pixels of the processed (cropped/dewarped) image,
    matching the coordinate space used by ``structure_result``.
    """

    # Top-left corner of the rectangle (pixels).
    x: int
    y: int
    # Width and height of the rectangle (pixels).
    w: int
    h: int
    # Optional user-facing label; empty string when the region is unnamed.
    label: str = ""
|
|
|
|
|
|
class _ExcludeRegionsBatchIn(BaseModel):
    """Request schema for replacing all exclude regions of a session at once."""

    # Full replacement set; an empty list clears all exclude regions.
    regions: list[_ExcludeRegionIn]
|
|
|
|
|
|
@router.put("/sessions/{session_id}/exclude-regions")
async def set_exclude_regions(session_id: str, body: _ExcludeRegionsBatchIn):
    """Replace the full set of exclude regions for a session.

    The regions are stored under ``structure_result.exclude_regions``; the
    grid result is invalidated so it is rebuilt honoring the new regions.

    Raises:
        HTTPException: 404 when the session does not exist.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    structure = session.get("structure_result") or {}
    new_regions = [region.model_dump() for region in body.regions]
    structure["exclude_regions"] = new_regions

    # Drop the stale grid so the next build picks up the new exclude regions.
    await update_session_db(session_id, structure_result=structure, grid_editor_result=None)

    # Keep the in-memory cache consistent with the database.
    if session_id in _cache:
        entry = _cache[session_id]
        entry["structure_result"] = structure
        entry.pop("grid_editor_result", None)

    return {
        "session_id": session_id,
        "exclude_regions": new_regions,
        "count": len(new_regions),
    }
|
|
|
|
|
|
@router.delete("/sessions/{session_id}/exclude-regions/{region_index}")
async def delete_exclude_region(session_id: str, region_index: int):
    """Delete a single exclude region, addressed by its list index.

    Raises:
        HTTPException: 404 when the session does not exist or the index is
            outside the current region list.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    structure = session.get("structure_result") or {}
    regions = structure.get("exclude_regions", [])

    if not 0 <= region_index < len(regions):
        raise HTTPException(status_code=404, detail="Region index out of range")

    removed = regions.pop(region_index)
    structure["exclude_regions"] = regions

    # The grid must be regenerated now that the region set changed.
    await update_session_db(session_id, structure_result=structure, grid_editor_result=None)

    # Mirror the change into the in-memory cache if the session is loaded.
    if session_id in _cache:
        entry = _cache[session_id]
        entry["structure_result"] = structure
        entry.pop("grid_editor_result", None)

    return {
        "session_id": session_id,
        "removed": removed,
        "remaining": len(regions),
    }
|