"""
OCR Pipeline Structure Detection and Exclude Regions

Detect document structure (boxes, zones, color regions, graphics)
and manage user-drawn exclude regions.
Extracted from ocr_pipeline_geometry.py for file-size compliance.
"""

import logging
import time
from typing import Any, Dict, List, Optional, Tuple

import cv2
import numpy as np
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from cv_box_detect import detect_boxes
from cv_color_detect import _COLOR_RANGES, _COLOR_HEX
from cv_graphic_detect import detect_graphic_elements
from ocr_pipeline_session_store import (
    get_session_db,
    update_session_db,
)
from ocr_pipeline_common import (
    _cache,
    _load_session_to_cache,
    _get_cached,
    _filter_border_ghost_words,
)

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])

# Keys on the word result holding raw OCR word lists; tried in this order
# when no cell carries any word boxes.
_RAW_WORD_KEYS = ("raw_paddle_words_split", "raw_tesseract_words", "raw_paddle_words")

# Hex value reported for gray, unknown, and unmapped box backgrounds.
_FALLBACK_HEX = "#6b7280"


# ---------------------------------------------------------------------------
# Structure Detection Helpers
# ---------------------------------------------------------------------------

def _collect_words(word_result: Optional[Dict[str, Any]]) -> List[Dict]:
    """Return the word boxes to use for content bounds and graphic detection.

    Word boxes of all cells are concatenated. If that yields nothing, the
    first non-empty raw OCR word list (see ``_RAW_WORD_KEYS``) is returned
    as-is. An empty list is returned when no words are available.
    """
    words: List[Dict] = []
    if word_result and word_result.get("cells"):
        for cell in word_result["cells"]:
            words.extend(cell.get("word_boxes") or [])

    # Fallback: use raw OCR words if cell word_boxes are empty
    if not words and word_result:
        for key in _RAW_WORD_KEYS:
            raw = word_result.get(key, [])
            if raw:
                logger.info("detect-structure: using %d words from %s (no cell word_boxes)", len(raw), key)
                return raw
    return words


def _content_bounds(words: List[Dict], img_w: int, img_h: int) -> Tuple[int, int, int, int]:
    """Return ``(x, y, width, height)`` of the content area in pixels.

    With words, this is the bounding rectangle of all word boxes, clamped to
    the image. Without words, it is the full image inset by a margin of 3%
    of the shorter image side.
    """
    if not words:
        margin = int(min(img_w, img_h) * 0.03)
        return margin, margin, img_w - 2 * margin, img_h - 2 * margin

    left = max(0, min(int(wb["left"]) for wb in words))
    top = max(0, min(int(wb["top"]) for wb in words))
    right = min(img_w, max(int(wb["left"] + wb["width"]) for wb in words))
    bottom = min(img_h, max(int(wb["top"] + wb["height"]) for wb in words))
    return left, top, right - left, bottom - top


def _sample_box_colors(hsv: np.ndarray, boxes) -> List[Dict[str, str]]:
    """Classify the background color of each box from its central region.

    For every box, the median H, S and V of the middle half (quarter to
    three-quarter extent on both axes, clamped to the image) are computed.
    Saturation above 15 is mapped to a color name via its hue; otherwise the
    box is "gray" (median V below 220) or "white". A box whose clamped sample
    region is empty is reported as "unknown".

    Returns one ``{"color_name", "color_hex"}`` dict per box, in box order.
    """
    # Imported once per call rather than once per box.
    from cv_color_detect import _hue_to_color_name

    img_h, img_w = hsv.shape[:2]
    colors: List[Dict[str, str]] = []
    for box in boxes:
        # Sample the center region of each box
        y1 = max(0, min(box.y + box.height // 4, img_h - 1))
        y2 = max(0, min(box.y + 3 * box.height // 4, img_h - 1))
        x1 = max(0, min(box.x + box.width // 4, img_w - 1))
        x2 = max(0, min(box.x + 3 * box.width // 4, img_w - 1))

        if y2 > y1 and x2 > x1:
            roi_hsv = hsv[y1:y2, x1:x2]
            med_h = float(np.median(roi_hsv[:, :, 0]))
            med_s = float(np.median(roi_hsv[:, :, 1]))
            med_v = float(np.median(roi_hsv[:, :, 2]))
            if med_s > 15:
                bg_name = _hue_to_color_name(med_h)
                bg_hex = _COLOR_HEX.get(bg_name, _FALLBACK_HEX)
            else:
                bg_name = "gray" if med_v < 220 else "white"
                bg_hex = _FALLBACK_HEX if bg_name == "gray" else "#ffffff"
        else:
            bg_name = "unknown"
            bg_hex = _FALLBACK_HEX
        colors.append({"color_name": bg_name, "color_hex": bg_hex})
    return colors


def _count_color_pixels(hsv: np.ndarray, min_pixels: int = 50) -> Dict[str, int]:
    """Count pixels matching each entry of ``_COLOR_RANGES`` across the page.

    For each color, the masks of all its HSV ranges are OR-ed together and
    the non-zero pixels counted. Only colors with strictly more than
    ``min_pixels`` matching pixels are included in the result.
    """
    img_h, img_w = hsv.shape[:2]
    summary: Dict[str, int] = {}
    for color_name, ranges in _COLOR_RANGES.items():
        mask = np.zeros((img_h, img_w), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(hsv, lower, upper))
        pixel_count = int(np.count_nonzero(mask))
        if pixel_count > min_pixels:  # minimum threshold
            summary[color_name] = pixel_count
    return summary


# ---------------------------------------------------------------------------
# Structure Detection Endpoint
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/detect-structure")
async def detect_structure(session_id: str):
    """Detect document structure: boxes, zones, and color regions.

    Runs box detection (line + shading) and color analysis on the
    cropped image. Returns structured JSON with all detected elements
    for the structure visualization step.

    The cropped image is used when cached, otherwise the dewarped image.
    Exclude regions from a previous structure result are carried over. The
    result is persisted to the session and mirrored into the cache.

    Raises:
        HTTPException: 400 if neither a cropped nor a dewarped image is cached.
    """
    from cv_box_detect import split_page_into_zones as _split_zones

    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    img_bgr = cached.get("cropped_bgr")
    if img_bgr is None:
        img_bgr = cached.get("dewarped_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")

    # Monotonic clock: the reported duration is unaffected by wall-clock changes.
    t0 = time.perf_counter()
    h, w = img_bgr.shape[:2]

    # --- Content bounds from word result (if available) or full image ---
    word_result = cached.get("word_result")
    words = _collect_words(word_result)
    content_x, content_y, content_w_px, content_h_px = _content_bounds(words, w, h)

    # --- Box detection ---
    boxes = detect_boxes(
        img_bgr,
        content_x=content_x,
        content_w=content_w_px,
        content_y=content_y,
        content_h=content_h_px,
    )

    # --- Zone splitting ---
    zones = _split_zones(content_x, content_y, content_w_px, content_h_px, boxes)

    # --- Color region sampling and page-wide color overview ---
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    box_colors = _sample_box_colors(hsv, boxes)
    color_summary = _count_color_pixels(hsv)

    # --- Graphic element detection ---
    box_dicts = [
        {"x": b.x, "y": b.y, "w": b.width, "h": b.height}
        for b in boxes
    ]
    graphics = detect_graphic_elements(
        img_bgr, words,
        detected_boxes=box_dicts,
    )

    # --- Filter border-ghost words from OCR result ---
    ghost_count = 0
    if boxes and word_result:
        ghost_count = _filter_border_ghost_words(word_result, boxes)
        if ghost_count:
            logger.info("detect-structure: removed %d border-ghost words", ghost_count)
            await update_session_db(session_id, word_result=word_result)
            cached["word_result"] = word_result

    duration = time.perf_counter() - t0

    # Preserve user-drawn exclude regions from previous run
    prev_sr = cached.get("structure_result") or {}
    prev_exclude = prev_sr.get("exclude_regions") or []

    result_dict = {
        "image_width": w,
        "image_height": h,
        "content_bounds": {
            "x": content_x, "y": content_y,
            "w": content_w_px, "h": content_h_px,
        },
        "boxes": [
            {
                "x": b.x, "y": b.y, "w": b.width, "h": b.height,
                "confidence": b.confidence,
                "border_thickness": b.border_thickness,
                "bg_color_name": color["color_name"],
                "bg_color_hex": color["color_hex"],
            }
            for b, color in zip(boxes, box_colors)
        ],
        "zones": [
            {
                "index": z.index,
                "zone_type": z.zone_type,
                "y": z.y, "h": z.height,
                "x": z.x, "w": z.width,
            }
            for z in zones
        ],
        "graphics": [
            {
                "x": g.x, "y": g.y, "w": g.width, "h": g.height,
                "area": g.area,
                "shape": g.shape,
                "color_name": g.color_name,
                "color_hex": g.color_hex,
                "confidence": round(g.confidence, 2),
            }
            for g in graphics
        ],
        "exclude_regions": prev_exclude,
        "color_pixel_counts": color_summary,
        "has_words": len(words) > 0,
        "word_count": len(words),
        "border_ghosts_removed": ghost_count,
        "duration_seconds": round(duration, 2),
    }

    # Persist to session
    await update_session_db(session_id, structure_result=result_dict)
    cached["structure_result"] = result_dict

    logger.info("detect-structure session %s: %d boxes, %d zones, %d graphics, %.2fs",
                session_id, len(boxes), len(zones), len(graphics), duration)

    return {"session_id": session_id, **result_dict}


# ---------------------------------------------------------------------------
# Exclude Regions -- user-drawn rectangles to exclude from OCR results
# ---------------------------------------------------------------------------

class _ExcludeRegionIn(BaseModel):
    # One user-drawn rectangle plus an optional label.
    x: int
    y: int
    w: int
    h: int
    label: str = ""


class _ExcludeRegionsBatchIn(BaseModel):
    # Complete replacement list of exclude regions for a session.
    regions: list[_ExcludeRegionIn]


async def _load_structure_result(session_id: str) -> Dict[str, Any]:
    """Fetch the session's ``structure_result`` from the DB (``{}`` if unset).

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")
    return session.get("structure_result") or {}


async def _store_structure_result(session_id: str, structure_result: Dict[str, Any]) -> None:
    """Persist ``structure_result`` and drop the grid editor result.

    The grid editor result is cleared in the DB and in the cache so the grid
    rebuilds with the new exclude regions. The cache is only touched when
    the session is already cached.
    """
    await update_session_db(session_id, structure_result=structure_result, grid_editor_result=None)

    entry = _cache.get(session_id) if session_id in _cache else None
    if entry is not None:
        entry["structure_result"] = structure_result
        entry.pop("grid_editor_result", None)


@router.put("/sessions/{session_id}/exclude-regions")
async def set_exclude_regions(session_id: str, body: _ExcludeRegionsBatchIn):
    """Replace all exclude regions for a session.

    Regions are stored inside ``structure_result.exclude_regions``.
    Returns the stored regions and their count.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    sr = await _load_structure_result(session_id)
    sr["exclude_regions"] = [r.model_dump() for r in body.regions]

    # Invalidate grid so it rebuilds with new exclude regions
    await _store_structure_result(session_id, sr)

    return {
        "session_id": session_id,
        "exclude_regions": sr["exclude_regions"],
        "count": len(sr["exclude_regions"]),
    }


@router.delete("/sessions/{session_id}/exclude-regions/{region_index}")
async def delete_exclude_region(session_id: str, region_index: int):
    """Remove a single exclude region by index.

    Returns the removed region and the number of regions remaining.

    Raises:
        HTTPException: 404 if the session does not exist or ``region_index``
            is outside the stored list.
    """
    sr = await _load_structure_result(session_id)
    # ``or []`` also covers a stored ``None`` value, not just a missing key.
    regions = sr.get("exclude_regions") or []

    if not 0 <= region_index < len(regions):
        raise HTTPException(status_code=404, detail="Region index out of range")

    removed = regions.pop(region_index)
    sr["exclude_regions"] = regions

    # Invalidate grid so it rebuilds with new exclude regions
    await _store_structure_result(session_id, sr)

    return {
        "session_id": session_id,
        "removed": removed,
        "remaining": len(regions),
    }