Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m47s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
Users can now draw rectangles on the document image in the Structure Detection step to mark areas (e.g. header graphics, alphabet strips) that should be excluded from OCR results during grid building. - Backend: PUT/DELETE endpoints for exclude regions stored in structure_result - Backend: _build_grid_core() filters all words inside user-defined exclude regions - Frontend: Interactive rectangle drawing with visual overlay and delete buttons - Preserve exclude regions when re-running structure detection Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1102 lines
40 KiB
Python
1102 lines
40 KiB
Python
"""
|
|
OCR Pipeline Geometry API - Deskew, Dewarp, Structure Detection, Column Detection.
|
|
|
|
Extracted from ocr_pipeline_api.py to keep modules focused.
|
|
Each endpoint group handles a geometric correction or detection step:
|
|
- Deskew (Step 2): Correct scan rotation
|
|
- Dewarp (Step 3): Correct vertical shear / book warp
|
|
- Structure Detection: Boxes, zones, color regions, graphics
|
|
- Column Detection (Step 5): Find invisible columns
|
|
|
|
Lizenz: Apache 2.0
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import time
|
|
from dataclasses import asdict
|
|
from datetime import datetime
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import cv2
|
|
import numpy as np
|
|
from fastapi import APIRouter, HTTPException, Query
|
|
from pydantic import BaseModel
|
|
|
|
from cv_vocab_pipeline import (
|
|
_apply_shear,
|
|
_detect_header_footer_gaps,
|
|
_detect_sub_columns,
|
|
classify_column_types,
|
|
create_layout_image,
|
|
create_ocr_image,
|
|
analyze_layout,
|
|
deskew_image,
|
|
deskew_image_by_word_alignment,
|
|
deskew_two_pass,
|
|
detect_column_geometry_zoned,
|
|
dewarp_image,
|
|
dewarp_image_manual,
|
|
expand_narrow_columns,
|
|
)
|
|
from cv_box_detect import detect_boxes
|
|
from cv_color_detect import _COLOR_RANGES, _COLOR_HEX
|
|
from cv_graphic_detect import detect_graphic_elements
|
|
from ocr_pipeline_session_store import (
|
|
get_session_db,
|
|
update_session_db,
|
|
)
|
|
from ocr_pipeline_common import (
|
|
_cache,
|
|
_load_session_to_cache,
|
|
_get_cached,
|
|
_get_base_image_png,
|
|
_append_pipeline_log,
|
|
_filter_border_ghost_words,
|
|
ManualDeskewRequest,
|
|
DeskewGroundTruthRequest,
|
|
ManualDewarpRequest,
|
|
CombinedAdjustRequest,
|
|
DewarpGroundTruthRequest,
|
|
ManualColumnsRequest,
|
|
ColumnGroundTruthRequest,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Deskew Endpoints (Step 2)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@router.post("/sessions/{session_id}/deskew")
async def auto_deskew(session_id: str):
    """Run automatic multi-pass deskew on the session's oriented image.

    ``deskew_two_pass`` (iterative projection plus word-alignment residual,
    with an optional third text-line pass) is the authoritative correction;
    the single-method Hough and word-alignment detectors are run additionally
    for reporting only.

    Side effects: caches the deskewed image and result, persists the PNGs and
    result dict to the session DB (advancing ``current_step`` to 3), and
    appends a pipeline log entry.

    Raises:
        HTTPException(400): if neither an oriented nor an original image is
            available for the session.
    """
    # Ensure session is in cache
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Deskew runs right after orientation — use oriented image, fall back to original
    img_bgr = next((v for k in ("oriented_bgr", "original_bgr")
                    if (v := cached.get(k)) is not None), None)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="No image available for deskewing")

    t0 = time.time()

    # Two-pass deskew: iterative (±5°) + word-alignment residual check
    deskewed_bgr, angle_applied, two_pass_debug = deskew_two_pass(img_bgr.copy())

    # Also run individual methods for reporting (non-authoritative)
    try:
        _, angle_hough = deskew_image(img_bgr.copy())
    except Exception:
        angle_hough = 0.0  # best-effort: never fail the request over a report value

    success_enc, png_orig = cv2.imencode(".png", img_bgr)
    orig_bytes = png_orig.tobytes() if success_enc else b""
    try:
        _, angle_wa = deskew_image_by_word_alignment(orig_bytes)
    except Exception:
        angle_wa = 0.0  # best-effort: reporting only

    # Per-pass angles from deskew_two_pass (0.0 when a pass did not fire)
    angle_iterative = two_pass_debug.get("pass1_angle", 0.0)
    angle_residual = two_pass_debug.get("pass2_angle", 0.0)
    angle_textline = two_pass_debug.get("pass3_angle", 0.0)

    duration = time.time() - t0

    # Label which passes contributed; below 0.01° a pass counts as a no-op
    method_used = "three_pass" if abs(angle_textline) >= 0.01 else (
        "two_pass" if abs(angle_residual) >= 0.01 else "iterative"
    )

    # Encode as PNG
    success, deskewed_png_buf = cv2.imencode(".png", deskewed_bgr)
    deskewed_png = deskewed_png_buf.tobytes() if success else b""

    # Create binarized version (for downstream OCR); failure is non-fatal
    binarized_png = None
    try:
        binarized = create_ocr_image(deskewed_bgr)
        success_bin, bin_buf = cv2.imencode(".png", binarized)
        binarized_png = bin_buf.tobytes() if success_bin else None
    except Exception as e:
        logger.warning(f"Binarization failed: {e}")

    # Heuristic confidence: larger corrections are less trustworthy,
    # linearly decreasing and floored at 0.5 (reached at 2.5°+)
    confidence = max(0.5, 1.0 - abs(angle_applied) / 5.0)

    deskew_result = {
        "angle_hough": round(angle_hough, 3),
        "angle_word_alignment": round(angle_wa, 3),
        "angle_iterative": round(angle_iterative, 3),
        "angle_residual": round(angle_residual, 3),
        "angle_textline": round(angle_textline, 3),
        "angle_applied": round(angle_applied, 3),
        "method_used": method_used,
        "confidence": round(confidence, 2),
        "duration_seconds": round(duration, 2),
        "two_pass_debug": two_pass_debug,
    }

    # Update cache
    cached["deskewed_bgr"] = deskewed_bgr
    cached["binarized_png"] = binarized_png
    cached["deskew_result"] = deskew_result

    # Persist to DB
    db_update = {
        "deskewed_png": deskewed_png,
        "deskew_result": deskew_result,
        "current_step": 3,
    }
    if binarized_png:
        db_update["binarized_png"] = binarized_png
    await update_session_db(session_id, **db_update)

    logger.info(f"OCR Pipeline: deskew session {session_id}: "
                f"hough={angle_hough:.2f} wa={angle_wa:.2f} "
                f"iter={angle_iterative:.2f} residual={angle_residual:.2f} "
                f"textline={angle_textline:.2f} "
                f"-> {method_used} total={angle_applied:.2f}")

    await _append_pipeline_log(session_id, "deskew", {
        "angle_applied": round(angle_applied, 3),
        "angle_iterative": round(angle_iterative, 3),
        "angle_residual": round(angle_residual, 3),
        "angle_textline": round(angle_textline, 3),
        "confidence": round(confidence, 2),
        "method": method_used,
    }, duration_ms=int(duration * 1000))

    return {
        "session_id": session_id,
        **deskew_result,
        "deskewed_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/deskewed",
        "binarized_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/binarized",
    }
|
|
|
|
|
|
@router.post("/sessions/{session_id}/deskew/manual")
async def manual_deskew(session_id: str, req: ManualDeskewRequest):
    """Rotate the oriented image by a user-supplied angle (clamped to ±5°)."""
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the oriented image; fall back to the original upload.
    source_bgr = None
    for cache_key in ("oriented_bgr", "original_bgr"):
        candidate = cached.get(cache_key)
        if candidate is not None:
            source_bgr = candidate
            break
    if source_bgr is None:
        raise HTTPException(status_code=400, detail="No image available for deskewing")

    clamped_angle = min(5.0, max(-5.0, req.angle))

    height, width = source_bgr.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), clamped_angle, 1.0)
    rotated_bgr = cv2.warpAffine(
        source_bgr,
        rotation,
        (width, height),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )

    encode_ok, encoded = cv2.imencode(".png", rotated_bgr)
    deskewed_png = encoded.tobytes() if encode_ok else b""

    # Best-effort binarization for downstream OCR; never fail the request.
    binarized_png = None
    try:
        bin_ok, bin_encoded = cv2.imencode(".png", create_ocr_image(rotated_bgr))
        if bin_ok:
            binarized_png = bin_encoded.tobytes()
    except Exception:
        pass

    # Merge the manual outcome into any previous deskew result.
    deskew_result = dict(cached.get("deskew_result") or {})
    deskew_result["angle_applied"] = round(clamped_angle, 3)
    deskew_result["method_used"] = "manual"

    # Refresh the in-memory cache.
    cached["deskewed_bgr"] = rotated_bgr
    cached["binarized_png"] = binarized_png
    cached["deskew_result"] = deskew_result

    # Persist to the session DB (binarized image only when it exists).
    db_update = {"deskewed_png": deskewed_png, "deskew_result": deskew_result}
    if binarized_png:
        db_update["binarized_png"] = binarized_png
    await update_session_db(session_id, **db_update)

    logger.info(f"OCR Pipeline: manual deskew session {session_id}: {clamped_angle:.2f}")

    return {
        "session_id": session_id,
        "angle_applied": round(clamped_angle, 3),
        "method_used": "manual",
        "deskewed_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/deskewed",
    }
|
|
|
|
|
|
@router.post("/sessions/{session_id}/ground-truth/deskew")
async def save_deskew_ground_truth(session_id: str, req: DeskewGroundTruthRequest):
    """Store the user's correctness verdict for the deskew step."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Snapshot the feedback together with the result it refers to.
    # NOTE(review): datetime.utcnow() is deprecated in 3.12+, kept here to
    # preserve the exact stored timestamp format (no UTC offset suffix).
    entry = {
        "is_correct": req.is_correct,
        "corrected_angle": req.corrected_angle,
        "notes": req.notes,
        "saved_at": datetime.utcnow().isoformat(),
        "deskew_result": session.get("deskew_result"),
    }
    ground_truth = session.get("ground_truth") or {}
    ground_truth["deskew"] = entry

    await update_session_db(session_id, ground_truth=ground_truth)

    # Keep the in-memory cache in sync when the session is loaded.
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth

    logger.info(f"OCR Pipeline: ground truth deskew session {session_id}: "
                f"correct={req.is_correct}, corrected_angle={req.corrected_angle}")

    return {"session_id": session_id, "ground_truth": entry}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dewarp Endpoints
|
|
# ---------------------------------------------------------------------------
|
|
|
|
async def _detect_shear_with_vlm(image_bytes: bytes) -> Dict[str, Any]:
    """Estimate the vertical shear of a scanned page via a vision LLM.

    Sends the page image to the configured Ollama vision model and asks
    whether the column/table borders are tilted and, if so, by how many
    degrees. The reply is expected to contain a small JSON object.

    Returns a dict with ``method``, ``shear_degrees`` (clamped to ±3°) and
    ``confidence`` (clamped to 0..1; 0.0 when Ollama is unreachable or the
    reply cannot be parsed).
    """
    import base64
    import json
    import re

    import httpx

    ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
    model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")

    prompt = (
        "This is a scanned vocabulary worksheet. Look at the vertical borders of the table columns. "
        "Are they perfectly vertical, or do they tilt slightly? "
        "If they tilt, estimate the tilt angle in degrees (positive = top tilts right, negative = top tilts left). "
        "Reply with ONLY a JSON object like: {\"shear_degrees\": 1.2, \"confidence\": 0.8} "
        "Use confidence 0.0-1.0 based on how clearly you can see the tilt. "
        "If the columns look straight, return {\"shear_degrees\": 0.0, \"confidence\": 0.9}"
    )

    request_body = {
        "model": model,
        "prompt": prompt,
        "images": [base64.b64encode(image_bytes).decode("utf-8")],
        "stream": False,
    }

    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(f"{ollama_base}/api/generate", json=request_body)
            resp.raise_for_status()
            reply = resp.json().get("response", "")

        # The model may wrap the JSON in prose; grab the first {...} span.
        found = re.search(r'\{[^}]+\}', reply)
        if found:
            parsed = json.loads(found.group(0))
            shear_value = float(parsed.get("shear_degrees", 0.0))
            conf_value = float(parsed.get("confidence", 0.0))
            # Clamp both values to their sane ranges.
            shear_value = max(-3.0, min(3.0, shear_value))
            conf_value = max(0.0, min(1.0, conf_value))
            return {
                "method": "vlm_qwen2.5vl",
                "shear_degrees": round(shear_value, 3),
                "confidence": round(conf_value, 2),
            }
    except Exception as e:
        logger.warning(f"VLM dewarp failed: {e}")

    # Unreachable service, no JSON in reply, or parse failure.
    return {"method": "vlm_qwen2.5vl", "shear_degrees": 0.0, "confidence": 0.0}
|
|
|
|
|
|
@router.post("/sessions/{session_id}/dewarp")
async def auto_dewarp(
    session_id: str,
    method: str = Query("ensemble", description="Detection method: ensemble | vlm | cv"),
):
    """Detect and correct vertical shear on the deskewed image.

    Methods:
    - **ensemble** (default): 3-method CV ensemble (vertical edges + projection + Hough)
    - **cv**: CV ensemble only (same as ensemble)
    - **vlm**: Ask qwen2.5vl:32b to estimate the shear angle visually

    Side effects: caches the dewarped image/result and persists them (plus
    the auto-detected shear and ``current_step`` 4) to the session DB, and
    appends a pipeline log entry.

    Raises:
        HTTPException(400): on an unknown ``method`` or if deskew has not run.
    """
    if method not in ("ensemble", "cv", "vlm"):
        raise HTTPException(status_code=400, detail="method must be one of: ensemble, cv, vlm")

    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    deskewed_bgr = cached.get("deskewed_bgr")
    if deskewed_bgr is None:
        raise HTTPException(status_code=400, detail="Deskew must be completed before dewarp")

    t0 = time.time()

    if method == "vlm":
        # Encode deskewed image to PNG for VLM
        success, png_buf = cv2.imencode(".png", deskewed_bgr)
        img_bytes = png_buf.tobytes() if success else b""
        vlm_det = await _detect_shear_with_vlm(img_bytes)
        shear_deg = vlm_det["shear_degrees"]
        # Apply only when the VLM reports a real tilt with some confidence;
        # the sign is flipped because we shear in the opposite direction
        if abs(shear_deg) >= 0.05 and vlm_det["confidence"] >= 0.3:
            dewarped_bgr = _apply_shear(deskewed_bgr, -shear_deg)
        else:
            dewarped_bgr = deskewed_bgr
        dewarp_info = {
            "method": vlm_det["method"],
            "shear_degrees": shear_deg,
            "confidence": vlm_det["confidence"],
            "detections": [vlm_det],
        }
    else:
        # "ensemble" and "cv" both run the CV ensemble
        dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)

    duration = time.time() - t0

    # Encode as PNG
    success, png_buf = cv2.imencode(".png", dewarped_bgr)
    dewarped_png = png_buf.tobytes() if success else b""

    dewarp_result = {
        "method_used": dewarp_info["method"],
        "shear_degrees": dewarp_info["shear_degrees"],
        "confidence": dewarp_info["confidence"],
        "duration_seconds": round(duration, 2),
        "detections": dewarp_info.get("detections", []),
    }

    # Update cache
    cached["dewarped_bgr"] = dewarped_bgr
    cached["dewarp_result"] = dewarp_result

    # Persist to DB (auto_shear_degrees kept separately so a later manual
    # adjustment does not lose the auto-detected value)
    await update_session_db(
        session_id,
        dewarped_png=dewarped_png,
        dewarp_result=dewarp_result,
        auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
        current_step=4,
    )

    logger.info(f"OCR Pipeline: dewarp session {session_id}: "
                f"method={dewarp_info['method']} shear={dewarp_info['shear_degrees']:.3f} "
                f"conf={dewarp_info['confidence']:.2f} ({duration:.2f}s)")

    await _append_pipeline_log(session_id, "dewarp", {
        "shear_degrees": dewarp_info["shear_degrees"],
        "confidence": dewarp_info["confidence"],
        "method": dewarp_info["method"],
        "ensemble_methods": [d.get("method", "") for d in dewarp_info.get("detections", [])],
    }, duration_ms=int(duration * 1000))

    return {
        "session_id": session_id,
        **dewarp_result,
        "dewarped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/dewarped",
    }
|
|
|
|
|
|
@router.post("/sessions/{session_id}/dewarp/manual")
async def manual_dewarp(session_id: str, req: ManualDewarpRequest):
    """Shear-correct the deskewed image by a user-supplied angle (clamped to ±2°)."""
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    deskewed_bgr = cached.get("deskewed_bgr")
    if deskewed_bgr is None:
        raise HTTPException(status_code=400, detail="Deskew must be completed before dewarp")

    clamped_shear = min(2.0, max(-2.0, req.shear_degrees))

    # Skip the warp entirely for a (near-)zero angle.
    dewarped_bgr = (
        deskewed_bgr
        if abs(clamped_shear) < 0.001
        else dewarp_image_manual(deskewed_bgr, clamped_shear)
    )

    encode_ok, encoded = cv2.imencode(".png", dewarped_bgr)
    dewarped_png = encoded.tobytes() if encode_ok else b""

    # Merge the manual outcome into any previous dewarp result.
    dewarp_result = dict(cached.get("dewarp_result") or {})
    dewarp_result["method_used"] = "manual"
    dewarp_result["shear_degrees"] = round(clamped_shear, 3)

    # Refresh the in-memory cache.
    cached["dewarped_bgr"] = dewarped_bgr
    cached["dewarp_result"] = dewarp_result

    # Persist to the session DB.
    await update_session_db(
        session_id,
        dewarped_png=dewarped_png,
        dewarp_result=dewarp_result,
    )

    logger.info(f"OCR Pipeline: manual dewarp session {session_id}: shear={clamped_shear:.3f}")

    return {
        "session_id": session_id,
        "shear_degrees": round(clamped_shear, 3),
        "method_used": "manual",
        "dewarped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/dewarped",
    }
|
|
|
|
|
|
@router.post("/sessions/{session_id}/adjust-combined")
async def adjust_combined(session_id: str, req: CombinedAdjustRequest):
    """Apply rotation + shear combined to the original image.

    Used by the fine-tuning sliders to preview arbitrary rotation/shear
    combinations without re-running the full deskew/dewarp pipeline.

    Rotation is clamped to ±15°, shear to ±5°. The single adjusted image is
    stored as BOTH the deskewed and the dewarped intermediate (cache and DB)
    so later steps see a consistent state.

    Raises:
        HTTPException(400): if the original image is not available.
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    img_bgr = cached.get("original_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Original image not available")

    rotation = max(-15.0, min(15.0, req.rotation_degrees))
    shear_deg = max(-5.0, min(5.0, req.shear_degrees))

    h, w = img_bgr.shape[:2]
    result_bgr = img_bgr

    # Step 1: Apply rotation (skipped for near-zero angles)
    if abs(rotation) >= 0.001:
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, rotation, 1.0)
        result_bgr = cv2.warpAffine(result_bgr, M, (w, h),
                                    flags=cv2.INTER_LINEAR,
                                    borderMode=cv2.BORDER_REPLICATE)

    # Step 2: Apply shear
    if abs(shear_deg) >= 0.001:
        result_bgr = dewarp_image_manual(result_bgr, shear_deg)

    # Encode
    success, png_buf = cv2.imencode(".png", result_bgr)
    dewarped_png = png_buf.tobytes() if success else b""

    # Binarize (best-effort; failure leaves binarized_png as None)
    binarized_png = None
    try:
        binarized = create_ocr_image(result_bgr)
        success_bin, bin_buf = cv2.imencode(".png", binarized)
        binarized_png = bin_buf.tobytes() if success_bin else None
    except Exception:
        pass

    # Build combined result dicts
    deskew_result = {
        **(cached.get("deskew_result") or {}),
        "angle_applied": round(rotation, 3),
        "method_used": "manual_combined",
    }
    dewarp_result = {
        **(cached.get("dewarp_result") or {}),
        "method_used": "manual_combined",
        "shear_degrees": round(shear_deg, 3),
    }

    # Update cache — the adjusted image serves as both intermediates
    cached["deskewed_bgr"] = result_bgr
    cached["dewarped_bgr"] = result_bgr
    cached["deskew_result"] = deskew_result
    cached["dewarp_result"] = dewarp_result

    # Persist to DB
    db_update = {
        "dewarped_png": dewarped_png,
        "deskew_result": deskew_result,
        "dewarp_result": dewarp_result,
    }
    if binarized_png:
        db_update["binarized_png"] = binarized_png
    db_update["deskewed_png"] = dewarped_png
    await update_session_db(session_id, **db_update)

    logger.info(f"OCR Pipeline: combined adjust session {session_id}: "
                f"rotation={rotation:.3f} shear={shear_deg:.3f}")

    return {
        "session_id": session_id,
        "rotation_degrees": round(rotation, 3),
        "shear_degrees": round(shear_deg, 3),
        "method_used": "manual_combined",
        "dewarped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/dewarped",
    }
|
|
|
|
|
|
@router.post("/sessions/{session_id}/ground-truth/dewarp")
async def save_dewarp_ground_truth(session_id: str, req: DewarpGroundTruthRequest):
    """Store the user's correctness verdict for the dewarp step."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Snapshot the feedback together with the result it refers to.
    entry = {
        "is_correct": req.is_correct,
        "corrected_shear": req.corrected_shear,
        "notes": req.notes,
        "saved_at": datetime.utcnow().isoformat(),
        "dewarp_result": session.get("dewarp_result"),
    }
    ground_truth = session.get("ground_truth") or {}
    ground_truth["dewarp"] = entry

    await update_session_db(session_id, ground_truth=ground_truth)

    # Keep the in-memory cache in sync when the session is loaded.
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth

    logger.info(f"OCR Pipeline: ground truth dewarp session {session_id}: "
                f"correct={req.is_correct}, corrected_shear={req.corrected_shear}")

    return {"session_id": session_id, "ground_truth": entry}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Structure Detection Endpoint
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@router.post("/sessions/{session_id}/detect-structure")
async def detect_structure(session_id: str):
    """Detect document structure: boxes, zones, and color regions.

    Runs box detection (line + shading) and color analysis on the cropped
    image. Returns structured JSON with all detected elements for the
    structure visualization step.

    Side effects: may rewrite ``word_result`` (border-ghost removal) and
    persists ``structure_result`` to the session DB and cache. User-drawn
    exclude regions from a previous run are carried over into the new result.

    Raises:
        HTTPException(400): if neither a cropped nor a dewarped image exists.
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image; fall back to the dewarped one
    img_bgr = (
        cached.get("cropped_bgr")
        if cached.get("cropped_bgr") is not None
        else cached.get("dewarped_bgr")
    )
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")

    t0 = time.time()
    h, w = img_bgr.shape[:2]

    # --- Content bounds from word result (if available) or full image ---
    word_result = cached.get("word_result")
    words: List[Dict] = []
    if word_result and word_result.get("cells"):
        for cell in word_result["cells"]:
            for wb in (cell.get("word_boxes") or []):
                words.append(wb)
    # Fallback: use raw OCR words if cell word_boxes are empty
    if not words and word_result:
        for key in ("raw_paddle_words_split", "raw_tesseract_words", "raw_paddle_words"):
            raw = word_result.get(key, [])
            if raw:
                words = raw
                logger.info("detect-structure: using %d words from %s (no cell word_boxes)", len(words), key)
                break
    # If no words yet, use image dimensions with small margin
    if words:
        # Tight bounding box around all word boxes, clamped to the image
        content_x = max(0, min(int(wb["left"]) for wb in words))
        content_y = max(0, min(int(wb["top"]) for wb in words))
        content_r = min(w, max(int(wb["left"] + wb["width"]) for wb in words))
        content_b = min(h, max(int(wb["top"] + wb["height"]) for wb in words))
        content_w_px = content_r - content_x
        content_h_px = content_b - content_y
    else:
        # No OCR words: assume content covers the page minus a 3% margin
        margin = int(min(w, h) * 0.03)
        content_x, content_y = margin, margin
        content_w_px = w - 2 * margin
        content_h_px = h - 2 * margin

    # --- Box detection ---
    boxes = detect_boxes(
        img_bgr,
        content_x=content_x,
        content_w=content_w_px,
        content_y=content_y,
        content_h=content_h_px,
    )

    # --- Zone splitting ---
    from cv_box_detect import split_page_into_zones as _split_zones
    zones = _split_zones(content_x, content_y, content_w_px, content_h_px, boxes)

    # --- Color region sampling ---
    # Sample background shading in each detected box
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    box_colors = []
    for box in boxes:
        # Sample the center region of each box (middle 50% in both axes),
        # avoiding the border pixels themselves
        cy1 = box.y + box.height // 4
        cy2 = box.y + 3 * box.height // 4
        cx1 = box.x + box.width // 4
        cx2 = box.x + 3 * box.width // 4
        cy1 = max(0, min(cy1, h - 1))
        cy2 = max(0, min(cy2, h - 1))
        cx1 = max(0, min(cx1, w - 1))
        cx2 = max(0, min(cx2, w - 1))
        if cy2 > cy1 and cx2 > cx1:
            roi_hsv = hsv[cy1:cy2, cx1:cx2]
            med_h = float(np.median(roi_hsv[:, :, 0]))
            med_s = float(np.median(roi_hsv[:, :, 1]))
            med_v = float(np.median(roi_hsv[:, :, 2]))
            if med_s > 15:
                # Saturated enough to be a real color: map hue to a name
                from cv_color_detect import _hue_to_color_name
                bg_name = _hue_to_color_name(med_h)
                bg_hex = _COLOR_HEX.get(bg_name, "#6b7280")
            else:
                # Unsaturated: distinguish gray from white by brightness
                bg_name = "gray" if med_v < 220 else "white"
                bg_hex = "#6b7280" if bg_name == "gray" else "#ffffff"
        else:
            # Degenerate sampling window (box clipped to nothing)
            bg_name = "unknown"
            bg_hex = "#6b7280"
        box_colors.append({"color_name": bg_name, "color_hex": bg_hex})

    # --- Color text detection overview ---
    # Quick scan for colored text regions across the page
    color_summary: Dict[str, int] = {}
    for color_name, ranges in _COLOR_RANGES.items():
        mask = np.zeros((h, w), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(hsv, lower, upper))
        pixel_count = int(np.sum(mask > 0))
        if pixel_count > 50:  # minimum threshold
            color_summary[color_name] = pixel_count

    # --- Graphic element detection ---
    box_dicts = [
        {"x": b.x, "y": b.y, "w": b.width, "h": b.height}
        for b in boxes
    ]
    graphics = detect_graphic_elements(
        img_bgr, words,
        detected_boxes=box_dicts,
    )

    # --- Filter border-ghost words from OCR result ---
    # _filter_border_ghost_words mutates word_result in place; persist only
    # when something was actually removed
    ghost_count = 0
    if boxes and word_result:
        ghost_count = _filter_border_ghost_words(word_result, boxes)
        if ghost_count:
            logger.info("detect-structure: removed %d border-ghost words", ghost_count)
            await update_session_db(session_id, word_result=word_result)
            cached["word_result"] = word_result

    duration = time.time() - t0

    # Preserve user-drawn exclude regions from previous run
    prev_sr = cached.get("structure_result") or {}
    prev_exclude = prev_sr.get("exclude_regions", [])

    result_dict = {
        "image_width": w,
        "image_height": h,
        "content_bounds": {
            "x": content_x, "y": content_y,
            "w": content_w_px, "h": content_h_px,
        },
        "boxes": [
            {
                "x": b.x, "y": b.y, "w": b.width, "h": b.height,
                "confidence": b.confidence,
                "border_thickness": b.border_thickness,
                "bg_color_name": box_colors[i]["color_name"],
                "bg_color_hex": box_colors[i]["color_hex"],
            }
            for i, b in enumerate(boxes)
        ],
        "zones": [
            {
                "index": z.index,
                "zone_type": z.zone_type,
                "y": z.y, "h": z.height,
                "x": z.x, "w": z.width,
            }
            for z in zones
        ],
        "graphics": [
            {
                "x": g.x, "y": g.y, "w": g.width, "h": g.height,
                "area": g.area,
                "shape": g.shape,
                "color_name": g.color_name,
                "color_hex": g.color_hex,
                "confidence": round(g.confidence, 2),
            }
            for g in graphics
        ],
        "exclude_regions": prev_exclude,
        "color_pixel_counts": color_summary,
        "has_words": len(words) > 0,
        "word_count": len(words),
        "border_ghosts_removed": ghost_count,
        "duration_seconds": round(duration, 2),
    }

    # Persist to session
    await update_session_db(session_id, structure_result=result_dict)
    cached["structure_result"] = result_dict

    logger.info("detect-structure session %s: %d boxes, %d zones, %d graphics, %.2fs",
                session_id, len(boxes), len(zones), len(graphics), duration)

    return {"session_id": session_id, **result_dict}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Exclude Regions — user-drawn rectangles to exclude from OCR results
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class _ExcludeRegionIn(BaseModel):
    """Request body item: one user-drawn exclude rectangle.

    Coordinates are pixel values on the structure-detection image, with
    ``x``/``y`` as the top-left corner and ``w``/``h`` as the extent.
    Words inside the rectangle are excluded during grid building.
    """

    x: int
    y: int
    w: int
    h: int
    label: str = ""  # optional free-text tag for the region; defaults to empty
|
|
|
|
|
|
class _ExcludeRegionsBatchIn(BaseModel):
    """Request body for PUT /exclude-regions: the full replacement list."""

    regions: list[_ExcludeRegionIn]
|
|
|
|
|
|
@router.put("/sessions/{session_id}/exclude-regions")
async def set_exclude_regions(session_id: str, body: _ExcludeRegionsBatchIn):
    """Replace all exclude regions for a session.

    Regions are stored inside ``structure_result.exclude_regions``.

    Args:
        session_id: Session to update.
        body: Full replacement list of regions (an empty list clears them).

    Raises:
        HTTPException(400): if any region has non-positive width or height.
        HTTPException(404): if the session does not exist.
    """
    # Reject degenerate rectangles up front: a region with non-positive
    # width/height would be persisted but could never exclude any words,
    # silently confusing the user.
    for idx, region in enumerate(body.regions):
        if region.w <= 0 or region.h <= 0:
            raise HTTPException(
                status_code=400,
                detail=f"Region {idx} has non-positive dimensions (w={region.w}, h={region.h})",
            )

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    sr = session.get("structure_result") or {}
    sr["exclude_regions"] = [r.model_dump() for r in body.regions]

    await update_session_db(session_id, structure_result=sr)

    # Keep the in-memory cache consistent with the DB
    if session_id in _cache:
        _cache[session_id]["structure_result"] = sr

    return {
        "session_id": session_id,
        "exclude_regions": sr["exclude_regions"],
        "count": len(sr["exclude_regions"]),
    }
|
|
|
|
|
|
@router.delete("/sessions/{session_id}/exclude-regions/{region_index}")
async def delete_exclude_region(session_id: str, region_index: int):
    """Remove a single exclude region by index."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    sr = session.get("structure_result") or {}
    regions = sr.get("exclude_regions", [])

    # Out-of-range indices (including negatives) are a 404, not a crash.
    if not (0 <= region_index < len(regions)):
        raise HTTPException(status_code=404, detail="Region index out of range")

    removed = regions.pop(region_index)
    sr["exclude_regions"] = regions

    await update_session_db(session_id, structure_result=sr)

    # Mirror the change into the in-memory cache when present.
    if session_id in _cache:
        _cache[session_id]["structure_result"] = sr

    return {
        "session_id": session_id,
        "removed": removed,
        "remaining": len(regions),
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Column Detection Endpoints (Step 3)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@router.post("/sessions/{session_id}/columns")
async def detect_columns(session_id: str):
    """Run column detection on the cropped (or dewarped) image.

    Two code paths:
      * Sub-sessions (box crops, identified via ``parent_session_id``): column
        detection is skipped and a single pseudo-column spanning the whole
        image is stored. Tesseract + binarization run here so row detection
        can reuse the cached intermediates.
      * Regular sessions: zone-aware geometry detection with a
        projection-based fallback, followed by sub-column splitting,
        narrow-column expansion and content-based classification.

    Side effects: persists ``column_result`` to the DB, invalidates downstream
    ``row_result``/``word_result``, advances ``current_step`` to 6 and updates
    the in-memory cache.

    Raises:
        HTTPException(400): if neither a cropped nor a dewarped image exists.
    """
    # Lazy-load the session into the in-memory cache on first access.
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image; fall back to the dewarped one.
    img_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection")

    # -----------------------------------------------------------------------
    # Sub-sessions (box crops): skip column detection entirely.
    # Instead, create a single pseudo-column spanning the full image width.
    # Also run Tesseract + binarization here so that the row detection step
    # can reuse the cached intermediates (_word_dicts, _inv, _content_bounds)
    # instead of falling back to detect_column_geometry() which may fail
    # on small box images with < 5 words.
    # -----------------------------------------------------------------------
    session = await get_session_db(session_id)
    if session and session.get("parent_session_id"):
        h, w = img_bgr.shape[:2]

        # Binarize + invert for row detection (horizontal projection profile)
        ocr_img = create_ocr_image(img_bgr)
        inv = cv2.bitwise_not(ocr_img)

        # Run Tesseract to get word bounding boxes.
        # Word positions are relative to the full image (no ROI crop needed
        # because the sub-session image IS the cropped box already).
        # detect_row_geometry expects word positions relative to content ROI,
        # so with content_bounds = (0, w, 0, h) the coordinates are correct.
        try:
            from PIL import Image as PILImage
            pil_img = PILImage.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
            import pytesseract
            data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
            word_dicts = []
            for i in range(len(data['text'])):
                # Tesseract reports conf as str or int; non-numeric means "no word" -> -1.
                conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
                text = str(data['text'][i]).strip()
                # Keep only confident, non-empty words.
                if conf < 30 or not text:
                    continue
                word_dicts.append({
                    'text': text, 'conf': conf,
                    'left': int(data['left'][i]),
                    'top': int(data['top'][i]),
                    'width': int(data['width'][i]),
                    'height': int(data['height'][i]),
                })
            # Log all words including low-confidence ones for debugging
            all_count = sum(1 for i in range(len(data['text']))
                            if str(data['text'][i]).strip())
            # Words with 0 <= conf < 30: dropped above, but useful to inspect.
            low_conf = [(str(data['text'][i]).strip(), int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1)
                        for i in range(len(data['text']))
                        if str(data['text'][i]).strip()
                        and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) < 30
                        and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) >= 0]
            if low_conf:
                logger.info(f"OCR Pipeline: sub-session {session_id}: {len(low_conf)} words below conf 30: {low_conf[:20]}")
            logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)}/{all_count} words (conf>=30)")
        except Exception as e:
            # Best-effort: a Tesseract failure degrades to an empty word list
            # rather than failing the whole endpoint.
            logger.warning(f"OCR Pipeline: sub-session {session_id}: Tesseract failed: {e}")
            word_dicts = []

        # Cache intermediates for row detection (detect_rows reuses these)
        cached["_word_dicts"] = word_dicts
        cached["_inv"] = inv
        cached["_content_bounds"] = (0, w, 0, h)

        # Single pseudo-column covering the entire sub-session image.
        column_result = {
            "columns": [{
                "type": "column_text",
                "x": 0, "y": 0,
                "width": w, "height": h,
            }],
            "zones": None,
            "boxes_detected": 0,
            "duration_seconds": 0,
            "method": "sub_session_pseudo_column",
        }
        # Persist and invalidate downstream results (rows, words).
        await update_session_db(
            session_id,
            column_result=column_result,
            row_result=None,
            word_result=None,
            current_step=6,
        )
        cached["column_result"] = column_result
        cached.pop("row_result", None)
        cached.pop("word_result", None)
        logger.info(f"OCR Pipeline: sub-session {session_id}: pseudo-column {w}x{h}px")
        return {"session_id": session_id, **column_result}

    t0 = time.time()

    # Binarized image for layout analysis
    ocr_img = create_ocr_image(img_bgr)
    h, w = ocr_img.shape[:2]

    # Phase A: Zone-aware geometry detection
    zoned_result = detect_column_geometry_zoned(ocr_img, img_bgr)

    if zoned_result is None:
        # Fallback to projection-based layout
        layout_img = create_layout_image(img_bgr)
        regions = analyze_layout(layout_img, ocr_img)
        zones_data = None
        boxes_detected = 0
    else:
        geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv, zones_data, boxes = zoned_result
        content_w = right_x - left_x
        boxes_detected = len(boxes)

        # Cache intermediates for row detection (avoids second Tesseract run)
        cached["_word_dicts"] = word_dicts
        cached["_inv"] = inv
        cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
        cached["_zones_data"] = zones_data
        cached["_boxes_detected"] = boxes_detected

        # Detect header/footer early so sub-column clustering ignores them
        header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)

        # Split sub-columns (e.g. page references) before classification
        geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                         top_y=top_y, header_y=header_y, footer_y=footer_y)

        # Expand narrow columns (sub-columns are often very narrow)
        geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)

        # Phase B: Content-based classification
        regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
                                        left_x=left_x, right_x=right_x, inv=inv)

    duration = time.time() - t0

    columns = [asdict(r) for r in regions]

    # Determine classification methods used
    methods = list(set(
        c.get("classification_method", "") for c in columns
        if c.get("classification_method")
    ))

    column_result = {
        "columns": columns,
        "classification_methods": methods,
        "duration_seconds": round(duration, 2),
        "boxes_detected": boxes_detected,
    }

    # Add zone data when boxes are present
    if zones_data and boxes_detected > 0:
        column_result["zones"] = zones_data

    # Persist to DB — also invalidate downstream results (rows, words)
    await update_session_db(
        session_id,
        column_result=column_result,
        row_result=None,
        word_result=None,
        current_step=6,
    )

    # Update cache
    cached["column_result"] = column_result
    cached.pop("row_result", None)
    cached.pop("word_result", None)

    col_count = len([c for c in columns if c["type"].startswith("column")])
    logger.info(f"OCR Pipeline: columns session {session_id}: "
                f"{col_count} columns detected, {boxes_detected} box(es) ({duration:.2f}s)")

    img_w = img_bgr.shape[1]
    # Structured pipeline log entry: column widths as percent of image width.
    await _append_pipeline_log(session_id, "columns", {
        "total_columns": len(columns),
        "column_widths_pct": [round(c["width"] / img_w * 100, 1) for c in columns],
        "column_types": [c["type"] for c in columns],
        "boxes_detected": boxes_detected,
    }, duration_ms=int(duration * 1000))

    return {
        "session_id": session_id,
        **column_result,
    }
|
|
|
|
|
|
@router.post("/sessions/{session_id}/columns/manual")
async def set_manual_columns(session_id: str, req: ManualColumnsRequest):
    """Replace the auto-detected columns with user-supplied definitions.

    Stores the manual columns as the session's ``column_result`` (marked with
    ``method: "manual"``) and invalidates the downstream row/word results in
    both the database and the in-memory cache.
    """
    column_result = {
        "columns": req.columns,
        "duration_seconds": 0,
        "method": "manual",
    }

    # Persist the override and drop stale downstream results.
    await update_session_db(session_id, column_result=column_result,
                            row_result=None, word_result=None)

    # Mirror the invalidation in the in-memory cache, if this session is loaded.
    entry = _cache.get(session_id)
    if entry is not None:
        entry["column_result"] = column_result
        for stale_key in ("row_result", "word_result"):
            entry.pop(stale_key, None)

    logger.info(f"OCR Pipeline: manual columns session {session_id}: "
                f"{len(req.columns)} columns set")

    return {"session_id": session_id, **column_result}
|
|
|
|
|
|
@router.post("/sessions/{session_id}/ground-truth/columns")
async def save_column_ground_truth(session_id: str, req: ColumnGroundTruthRequest):
    """Save ground truth feedback for the column detection step.

    Records the reviewer's verdict (correct / corrected columns / notes)
    together with a snapshot of the automatic ``column_result`` under
    ``ground_truth["columns"]``, then persists it and mirrors it into the
    in-memory cache.

    Raises:
        HTTPException(404): if the session does not exist.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    ground_truth = session.get("ground_truth") or {}
    # Fix: datetime.utcnow() is deprecated (Python 3.12+) and yields a naive
    # timestamp; use an explicit timezone-aware UTC timestamp instead
    # (the ISO string gains a "+00:00" suffix, which remains parseable).
    from datetime import timezone
    gt = {
        "is_correct": req.is_correct,
        "corrected_columns": req.corrected_columns,
        "notes": req.notes,
        "saved_at": datetime.now(timezone.utc).isoformat(),
        # Snapshot the automatic result so GT and auto can be diffed later.
        "column_result": session.get("column_result"),
    }
    ground_truth["columns"] = gt

    await update_session_db(session_id, ground_truth=ground_truth)

    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth

    return {"session_id": session_id, "ground_truth": gt}
|
|
|
|
|
|
@router.get("/sessions/{session_id}/ground-truth/columns")
async def get_column_ground_truth(session_id: str):
    """Return the saved column ground truth next to the automatic result.

    The response pairs ``columns_gt`` (reviewer feedback) with
    ``columns_auto`` (the stored automatic ``column_result``) so callers can
    diff them.

    Raises:
        HTTPException(404): if the session does not exist or no column
            ground truth has been saved yet.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    columns_gt = (session.get("ground_truth") or {}).get("columns")
    if not columns_gt:
        raise HTTPException(status_code=404, detail="No column ground truth saved")

    return {
        "session_id": session_id,
        "columns_gt": columns_gt,
        "columns_auto": session.get("column_result"),
    }
|