""" Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone detection and zone-aware grid building. Extracted from grid_build_core.py for maintainability. """ import logging from typing import Any, Dict, List, Optional import cv2 import numpy as np from cv_box_detect import detect_boxes, split_page_into_zones from cv_graphic_detect import detect_graphic_elements from cv_color_detect import recover_colored_text from cv_vocab_types import PageZone from ocr_pipeline_session_store import get_session_image from grid_editor_helpers import ( _filter_border_strip_words, _filter_border_ghosts, _words_in_zone, _PIPE_RE_VSPLIT, _detect_vertical_dividers, _split_zone_at_vertical_dividers, _merge_content_zones_across_boxes, _build_zone_grid, ) logger = logging.getLogger(__name__) async def _build_zones( session_id: str, session: dict, all_words: List[Dict[str, Any]], graphic_rects: List[Dict[str, int]], content_x: int, content_y: int, content_w: int, content_h: int, img_w: int, img_h: int, ) -> Dict[str, Any]: """Load image, detect graphics/boxes, build zone-aware grids. Returns a dict with keys: zones_data, boxes_detected, recovered_count, border_prefiltered, img_bgr, all_words (modified in-place but returned for clarity). """ zones_data: List[Dict[str, Any]] = [] boxes_detected = 0 recovered_count = 0 border_prefiltered = False img_bgr = None # 3. Load image for box detection img_png = await get_session_image(session_id, "cropped") if not img_png: img_png = await get_session_image(session_id, "dewarped") if not img_png: img_png = await get_session_image(session_id, "original") if img_png: # Decode image for color detection + box detection arr = np.frombuffer(img_png, dtype=np.uint8) img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR) if img_bgr is not None: # --- 3a. Detect graphic/image regions via CV and hard-filter --- sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3] fresh_graphics = detect_graphic_elements(img_bgr, sig_words) if fresh_graphics: fresh_rects = [ {"x": g.x, "y": g.y, "w": g.width, "h": g.height} for g in fresh_graphics ] graphic_rects.extend(fresh_rects) logger.info( "build-grid session %s: detected %d graphic region(s) via CV", session_id, len(fresh_graphics), ) # Hard-filter words inside newly detected graphic regions before = len(all_words) all_words[:] = [ w for w in all_words if not any( gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"] and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"] for gr in fresh_rects ) ] removed = before - len(all_words) if removed: logger.info( "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)", session_id, removed, len(fresh_rects), ) # --- Recover colored text that OCR missed (before grid building) --- recovered = recover_colored_text(img_bgr, all_words) if recovered and graphic_rects: # Filter recovered chars inside graphic regions recovered = [ r for r in recovered if not any( gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"] and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"] for gr in graphic_rects ) ] if recovered: recovered_count = len(recovered) all_words.extend(recovered) logger.info( "build-grid session %s: +%d recovered colored words", session_id, recovered_count, ) # Detect bordered boxes boxes = detect_boxes( img_bgr, content_x=content_x, content_w=content_w, content_y=content_y, content_h=content_h, ) boxes_detected = len(boxes) if boxes: # Filter border ghost words before grid building all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes) if ghost_count: all_words[:] = all_words_new logger.info( "build-grid session %s: removed %d border ghost words", session_id, ghost_count, ) # Split page into zones page_zones = split_page_into_zones( content_x, content_y, content_w, content_h, boxes ) # Merge content zones separated by box zones page_zones = _merge_content_zones_across_boxes( page_zones, content_x, content_w ) # 3b. Detect vertical dividers and split content zones page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers( page_zones, all_words ) # --- First pass: build grids per zone independently --- zone_grids = _build_grids_per_zone( page_zones, all_words, img_w, img_h ) border_prefiltered = border_prefiltered or any( zg.get("_border_prefiltered") for zg in zone_grids ) # --- Second pass: merge column boundaries from all content zones --- _merge_content_zone_columns( zone_grids, all_words, content_w, img_w, img_h, session_id ) # --- Build zones_data from zone_grids --- for zg in zone_grids: pz = zg["pz"] grid = zg["grid"] grid.pop("_raw_columns", None) zone_entry: Dict[str, Any] = { "zone_index": pz.index, "zone_type": pz.zone_type, "bbox_px": { "x": pz.x, "y": pz.y, "w": pz.width, "h": pz.height, }, "bbox_pct": { "x": round(pz.x / img_w * 100, 2) if img_w else 0, "y": round(pz.y / img_h * 100, 2) if img_h else 0, "w": round(pz.width / img_w * 100, 2) if img_w else 0, "h": round(pz.height / img_h * 100, 2) if img_h else 0, }, "border": None, "word_count": len(zg["words"]), **grid, } if pz.box: zone_entry["border"] = { "thickness": pz.box.border_thickness, "confidence": pz.box.confidence, } if pz.image_overlays: zone_entry["image_overlays"] = pz.image_overlays if pz.layout_hint: zone_entry["layout_hint"] = pz.layout_hint if pz.vsplit_group is not None: zone_entry["vsplit_group"] = pz.vsplit_group zones_data.append(zone_entry) # 4. Fallback: no boxes detected -> single zone with all words if not zones_data: before = len(all_words) filtered_words = [ w for w in all_words if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2) ] removed = before - len(filtered_words) if removed: logger.info( "build-grid session %s: filtered %d recovered artifacts (fallback zone)", session_id, removed, ) filtered_words, bs_removed = _filter_border_strip_words(filtered_words) if bs_removed: border_prefiltered = True logger.info( "build-grid session %s: pre-filtered %d border-strip words", session_id, bs_removed, ) grid = _build_zone_grid( filtered_words, content_x, content_y, content_w, content_h, 0, img_w, img_h, ) grid.pop("_raw_columns", None) zones_data.append({ "zone_index": 0, "zone_type": "content", "bbox_px": { "x": content_x, "y": content_y, "w": content_w, "h": content_h, }, "bbox_pct": { "x": round(content_x / img_w * 100, 2) if img_w else 0, "y": round(content_y / img_h * 100, 2) if img_h else 0, "w": round(content_w / img_w * 100, 2) if img_w else 0, "h": round(content_h / img_h * 100, 2) if img_h else 0, }, "border": None, "word_count": len(all_words), **grid, }) return { "zones_data": zones_data, "boxes_detected": boxes_detected, "recovered_count": recovered_count, "border_prefiltered": border_prefiltered, "img_bgr": img_bgr, } def _detect_and_split_vertical_dividers( page_zones: List[PageZone], all_words: List[Dict[str, Any]], ) -> tuple: """Detect vertical dividers and split content zones. Returns (expanded_zones, border_prefiltered_from_vsplit). """ vsplit_group_counter = 0 expanded_zones: List = [] for pz in page_zones: if pz.zone_type != "content": expanded_zones.append(pz) continue zone_words = _words_in_zone( all_words, pz.y, pz.height, pz.x, pz.width ) divider_xs = _detect_vertical_dividers( zone_words, pz.x, pz.width, pz.y, pz.height ) if divider_xs: sub_zones = _split_zone_at_vertical_dividers( pz, divider_xs, vsplit_group_counter ) expanded_zones.extend(sub_zones) vsplit_group_counter += 1 # Remove pipe words so they don't appear in sub-zones pipe_ids = set( id(w) for w in zone_words if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip()) ) all_words[:] = [w for w in all_words if id(w) not in pipe_ids] logger.info( "build-grid: vertical split zone %d at x=%s -> %d sub-zones", pz.index, [int(x) for x in divider_xs], len(sub_zones), ) else: expanded_zones.append(pz) # Re-index zones for i, pz in enumerate(expanded_zones): pz.index = i return expanded_zones, False def _build_grids_per_zone( page_zones: List[PageZone], all_words: List[Dict[str, Any]], img_w: int, img_h: int, ) -> List[Dict[str, Any]]: """Build grids for each zone independently (first pass).""" zone_grids: List[Dict] = [] for pz in page_zones: zone_words = _words_in_zone( all_words, pz.y, pz.height, pz.x, pz.width ) if pz.zone_type == "content": logger.info( "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words", pz.index, pz.zone_type, pz.x, pz.x + pz.width, pz.y, pz.y + pz.height, len(zone_words), len(all_words), ) # Filter recovered single-char artifacts in ALL zones before = len(zone_words) zone_words = [ w for w in zone_words if not ( w.get("recovered") and len(w.get("text", "").strip()) <= 2 ) ] removed = before - len(zone_words) if removed: logger.info( "build-grid: filtered %d recovered artifacts from %s zone %d", removed, pz.zone_type, pz.index, ) # Filter words inside image overlay regions (merged box zones) if pz.image_overlays: before_ov = len(zone_words) zone_words = [ w for w in zone_words if not any( ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"] and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"] for ov in pz.image_overlays ) ] ov_removed = before_ov - len(zone_words) if ov_removed: logger.info( "build-grid: filtered %d words inside image overlays from zone %d", ov_removed, pz.index, ) zone_words, bs_removed = _filter_border_strip_words(zone_words) bp = False if bs_removed: bp = True logger.info( "build-grid: pre-filtered %d border-strip words from zone %d", bs_removed, pz.index, ) grid = _build_zone_grid( zone_words, pz.x, pz.y, pz.width, pz.height, pz.index, img_w, img_h, skip_first_row_header=bool(pz.image_overlays), ) zone_grids.append({ "pz": pz, "words": zone_words, "grid": grid, "_border_prefiltered": bp, }) return zone_grids def _merge_content_zone_columns( zone_grids: List[Dict[str, Any]], all_words: List[Dict[str, Any]], content_w: int, img_w: int, img_h: int, session_id: str, ) -> None: """Second pass: merge column boundaries from all content zones. Modifies zone_grids in place. """ content_zones = [ zg for zg in zone_grids if zg["pz"].zone_type == "content" and zg["pz"].vsplit_group is None ] if len(content_zones) <= 1: return # Collect column split points (x_min of non-first columns) all_split_xs: List[float] = [] for zg in content_zones: raw_cols = zg["grid"].get("_raw_columns", []) for col in raw_cols[1:]: all_split_xs.append(col["x_min"]) if not all_split_xs: return all_split_xs.sort() merge_distance = max(25, int(content_w * 0.03)) merged_xs = [all_split_xs[0]] for x in all_split_xs[1:]: if x - merged_xs[-1] < merge_distance: merged_xs[-1] = (merged_xs[-1] + x) / 2 else: merged_xs.append(x) total_cols = len(merged_xs) + 1 max_zone_cols = max( len(zg["grid"].get("_raw_columns", [])) for zg in content_zones ) if total_cols < max_zone_cols: return cx_min = min(w["left"] for w in all_words) cx_max = max(w["left"] + w["width"] for w in all_words) merged_columns: List[Dict[str, Any]] = [] prev_x = cx_min for i, sx in enumerate(merged_xs): merged_columns.append({ "index": i, "type": f"column_{i + 1}", "x_min": prev_x, "x_max": sx, }) prev_x = sx merged_columns.append({ "index": len(merged_xs), "type": f"column_{len(merged_xs) + 1}", "x_min": prev_x, "x_max": cx_max, }) # Re-build ALL content zones with merged columns for zg in zone_grids: pz = zg["pz"] if pz.zone_type == "content": grid = _build_zone_grid( zg["words"], pz.x, pz.y, pz.width, pz.height, pz.index, img_w, img_h, global_columns=merged_columns, skip_first_row_header=bool(pz.image_overlays), ) zg["grid"] = grid logger.info( "build-grid session %s: union of %d content " "zones -> %d merged columns (max single zone: %d)", session_id, len(content_zones), total_cols, max_zone_cols, )