""" Box layout classifier — detects internal layout type of embedded boxes. Classifies each box as: flowing | columnar | bullet_list | header_only and provides layout-appropriate grid building. Used by the Box-Grid-Review step to rebuild box zones with correct structure. """ import logging import re import statistics from typing import Any, Dict, List, Optional, Tuple logger = logging.getLogger(__name__) # Bullet / list-item patterns at the start of a line _BULLET_RE = re.compile( r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s' # dash, bullet chars r'|^\d{1,2}[.)]\s' # numbered: "1) " or "1. " r'|^[a-z][.)]\s' # lettered: "a) " or "a. " ) def classify_box_layout( words: List[Dict], box_w: int, box_h: int, ) -> str: """Classify the internal layout of a detected box. Args: words: OCR word dicts within the box (with top, left, width, height, text) box_w: Box width in pixels box_h: Box height in pixels Returns: 'header_only' | 'bullet_list' | 'columnar' | 'flowing' """ if not words: return "header_only" # Group words into lines by y-proximity lines = _group_into_lines(words) # Header only: very few words or single line total_words = sum(len(line) for line in lines) if total_words <= 5 or len(lines) <= 1: return "header_only" # Bullet list: check if majority of lines start with bullet patterns bullet_count = 0 for line in lines: first_text = line[0].get("text", "") if line else "" if _BULLET_RE.match(first_text): bullet_count += 1 # Also check if first word IS a bullet char elif first_text.strip() in ("-", "–", "—", "•", "·", "▪", "▸"): bullet_count += 1 if bullet_count >= len(lines) * 0.4 and bullet_count >= 2: return "bullet_list" # Columnar: check for multiple distinct x-clusters if len(lines) >= 3 and _has_column_structure(words, box_w): return "columnar" # Default: flowing text return "flowing" def _group_into_lines(words: List[Dict]) -> List[List[Dict]]: """Group words into lines by y-proximity.""" if not words: return [] sorted_words = sorted(words, key=lambda w: (w["top"], w["left"])) heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0] median_h = statistics.median(heights) if heights else 20 y_tolerance = max(median_h * 0.5, 5) lines: List[List[Dict]] = [] current_line: List[Dict] = [sorted_words[0]] current_y = sorted_words[0]["top"] for w in sorted_words[1:]: if abs(w["top"] - current_y) <= y_tolerance: current_line.append(w) else: lines.append(sorted(current_line, key=lambda ww: ww["left"])) current_line = [w] current_y = w["top"] if current_line: lines.append(sorted(current_line, key=lambda ww: ww["left"])) return lines def _has_column_structure(words: List[Dict], box_w: int) -> bool: """Check if words have multiple distinct left-edge clusters (columns).""" if box_w <= 0: return False lines = _group_into_lines(words) if len(lines) < 3: return False # Collect left-edges of non-first words in each line # (first word of each line often aligns regardless of columns) left_edges = [] for line in lines: for w in line[1:]: # skip first word left_edges.append(w["left"]) if len(left_edges) < 4: return False # Check if left edges cluster into 2+ distinct groups left_edges.sort() gaps = [left_edges[i + 1] - left_edges[i] for i in range(len(left_edges) - 1)] if not gaps: return False median_gap = statistics.median(gaps) # A column gap is typically > 15% of box width column_gap_threshold = box_w * 0.15 large_gaps = [g for g in gaps if g > column_gap_threshold] return len(large_gaps) >= 1 def build_box_zone_grid( zone_words: List[Dict], box_x: int, box_y: int, box_w: int, box_h: int, zone_index: int, img_w: int, img_h: int, layout_type: Optional[str] = None, ) -> Dict[str, Any]: """Build a grid for a box zone with layout-aware processing. If layout_type is None, auto-detects it. For 'flowing' and 'bullet_list', forces single-column layout. For 'columnar', uses the standard multi-column detection. For 'header_only', creates a single cell. Returns the same format as _build_zone_grid (columns, rows, cells, header_rows). """ from grid_editor_helpers import _build_zone_grid, _cluster_rows if not zone_words: return { "columns": [], "rows": [], "cells": [], "header_rows": [], "box_layout_type": layout_type or "header_only", "box_grid_reviewed": False, } # Auto-detect layout if not specified if not layout_type: layout_type = classify_box_layout(zone_words, box_w, box_h) logger.info( "Box zone %d: layout_type=%s, %d words, %dx%d", zone_index, layout_type, len(zone_words), box_w, box_h, ) if layout_type == "header_only": # Single cell with all text concatenated all_text = " ".join( w.get("text", "") for w in sorted(zone_words, key=lambda ww: (ww["top"], ww["left"])) ).strip() return { "columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}], "rows": [{"index": 0, "row_index": 0, "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2}], "cells": [{ "cell_id": f"Z{zone_index}_R0C0", "row_index": 0, "col_index": 0, "col_type": "column_1", "text": all_text, "word_boxes": zone_words, }], "header_rows": [0], "box_layout_type": layout_type, "box_grid_reviewed": False, } if layout_type in ("flowing", "bullet_list"): # Force single column — each line becomes one row with one cell lines = _group_into_lines(zone_words) column = {"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"} rows = [] cells = [] for row_idx, line_words in enumerate(lines): if not line_words: continue y_min = min(w["top"] for w in line_words) y_max = max(w["top"] + w["height"] for w in line_words) y_center = (y_min + y_max) / 2 row = { "index": row_idx, "row_index": row_idx, "y_min": y_min, "y_max": y_max, "y_center": y_center, } rows.append(row) line_text = " ".join(w.get("text", "") for w in line_words).strip() cell = { "cell_id": f"Z{zone_index}_R{row_idx}C0", "row_index": row_idx, "col_index": 0, "col_type": "column_1", "text": line_text, "word_boxes": line_words, } cells.append(cell) # Detect header: first row if it's notably different (bold, larger, or short) header_rows = [] if len(lines) >= 2: first_line = lines[0] first_text = " ".join(w.get("text", "") for w in first_line).strip() # Header heuristic: short text, or all-caps, or ends with ':' if (len(first_text) < 40 or first_text.isupper() or first_text.rstrip().endswith(':')): header_rows = [0] return { "columns": [column], "rows": rows, "cells": cells, "header_rows": header_rows, "box_layout_type": layout_type, "box_grid_reviewed": False, } # Columnar: use standard grid builder with independent column detection result = _build_zone_grid( zone_words, box_x, box_y, box_w, box_h, zone_index, img_w, img_h, global_columns=None, # detect columns independently ) result["box_layout_type"] = layout_type result["box_grid_reviewed"] = False return result