""" Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe divider removal, connector normalization, border strip detection, and alphabet sidebar removal. Extracted from grid_build_core.py for maintainability. """ import logging import re from typing import Any, Dict, List from cv_ocr_engines import _words_to_reading_order_text logger = logging.getLogger(__name__) _PIPE_RE = re.compile(r"^\|+$") def _cleanup_zones( zones_data: List[Dict[str, Any]], border_prefiltered: bool, session_id: str, ) -> bool: """Clean up zone data: remove junk rows, artifacts, pipes, border strips. Args: zones_data: List of zone dicts (modified in place). border_prefiltered: Whether border words were already pre-filtered. session_id: For logging. Returns: Updated border_prefiltered flag. """ _remove_junk_rows(zones_data) _remove_artifact_cells(zones_data) _remove_oversized_word_boxes(zones_data) _remove_pipe_dividers(zones_data) _normalize_connector_columns(zones_data) border_prefiltered = _remove_border_strips(zones_data, border_prefiltered) _remove_alphabet_sidebars(zones_data) return border_prefiltered def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None: """Remove rows where ALL cells contain only short, low-confidence text. Also removes 'oversized stub' rows and 'scattered debris' rows. """ _JUNK_CONF_THRESHOLD = 50 _JUNK_MAX_TEXT_LEN = 3 for z in zones_data: cells = z.get("cells", []) rows = z.get("rows", []) if not cells or not rows: continue # Compute median word height across the zone for oversized detection all_wb_heights = [ wb["height"] for cell in cells for wb in cell.get("word_boxes") or [] if wb.get("height", 0) > 0 ] median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28 junk_row_indices = set() for row in rows: ri = row["index"] row_cells = [c for c in cells if c.get("row_index") == ri] if not row_cells: continue row_wbs = [ wb for cell in row_cells for wb in cell.get("word_boxes") or [] ] # Rule 1: ALL word_boxes are low-conf AND short text all_junk = True for wb in row_wbs: text = (wb.get("text") or "").strip() conf = wb.get("conf", 0) if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN: all_junk = False break if all_junk and row_wbs: junk_row_indices.add(ri) continue # Rule 2: oversized stub -- <=3 words, short total text, # and word height > 1.8x median if len(row_wbs) <= 3: total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs) max_h = max((wb.get("height", 0) for wb in row_wbs), default=0) has_page_ref = any( re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip()) for wb in row_wbs ) if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref: junk_row_indices.add(ri) continue # Rule 3: scattered debris -- rows with only tiny fragments longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs) if longest <= 2: junk_row_indices.add(ri) continue if junk_row_indices: z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices] z["rows"] = [r for r in rows if r["index"] not in junk_row_indices] logger.info( "build-grid: removed %d junk rows from zone %d: %s", len(junk_row_indices), z["zone_index"], sorted(junk_row_indices), ) def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None: """Remove individual cells with a single very-short, low-conf word.""" _ARTIFACT_MAX_LEN = 2 _ARTIFACT_CONF_THRESHOLD = 65 for z in zones_data: cells = z.get("cells", []) if not cells: continue artifact_ids = set() for cell in cells: wbs = cell.get("word_boxes") or [] if len(wbs) != 1: continue wb = wbs[0] text = (wb.get("text") or "").strip() conf = wb.get("conf", 100) if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD: artifact_ids.add(cell.get("cell_id")) if artifact_ids: z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids] logger.info( "build-grid: removed %d artifact cells from zone %d: %s", len(artifact_ids), z.get("zone_index", 0), [c.get("text") for c in cells if c.get("cell_id") in artifact_ids], ) def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None: """Remove word_boxes whose height is 3x+ the median (graphic artifacts).""" for z in zones_data: cells = z.get("cells", []) if not cells: continue all_wh = [ wb["height"] for cell in cells for wb in cell.get("word_boxes") or [] if wb.get("height", 0) > 0 ] if not all_wh: continue med_h = sorted(all_wh)[len(all_wh) // 2] oversized_threshold = med_h * 3 removed_oversized = 0 for cell in cells: wbs = cell.get("word_boxes") or [] filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold] if len(filtered) < len(wbs): removed_oversized += len(wbs) - len(filtered) cell["word_boxes"] = filtered cell["text"] = _words_to_reading_order_text(filtered) if removed_oversized: z["cells"] = [c for c in cells if c.get("word_boxes")] logger.info( "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d", removed_oversized, oversized_threshold, z.get("zone_index", 0), ) def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None: """Remove pipe-character word_boxes (column divider artifacts).""" for z in zones_data: if z.get("vsplit_group") is not None: continue # pipes already removed before split removed_pipes = 0 for cell in z.get("cells", []): wbs = cell.get("word_boxes") or [] filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())] if len(filtered) < len(wbs): removed_pipes += len(wbs) - len(filtered) cell["word_boxes"] = filtered cell["text"] = _words_to_reading_order_text(filtered) if removed_pipes: z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())] logger.info( "build-grid: removed %d pipe-divider word_boxes from zone %d", removed_pipes, z.get("zone_index", 0), ) # Strip pipe chars ONLY from cell edges (OCR artifacts). # Preserve pipes embedded in words as syllable separators. for z in zones_data: for cell in z.get("cells", []): text = cell.get("text", "") if "|" in text: cleaned = text.strip("|").strip() if cleaned != text.strip(): cell["text"] = cleaned def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None: """Normalize narrow connector columns where OCR appends noise chars. In synonym dictionaries a narrow column repeats the same word (e.g. "oder") in every row. OCR sometimes appends noise chars. """ for z in zones_data: cols = z.get("columns", []) cells = z.get("cells", []) if not cols or not cells: continue for col in cols: ci = col.get("index") col_cells = [c for c in cells if c.get("col_index") == ci] if len(col_cells) < 3: continue text_counts: Dict[str, int] = {} for c in col_cells: t = (c.get("text") or "").strip() if t: text_counts[t] = text_counts.get(t, 0) + 1 if not text_counts: continue dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type] dominant_count = text_counts[dominant_text] if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6: continue fixed = 0 for c in col_cells: t = (c.get("text") or "").strip() if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2: c["text"] = dominant_text wbs = c.get("word_boxes") or [] if len(wbs) == 1: wbs[0]["text"] = dominant_text fixed += 1 if fixed: logger.info( "build-grid: normalized %d outlier cells in connector column %d " "(dominant='%s') zone %d", fixed, ci, dominant_text, z.get("zone_index", 0), ) def _remove_border_strips( zones_data: List[Dict[str, Any]], border_prefiltered: bool, ) -> bool: """Detect and remove page-border decoration strips. Returns updated border_prefiltered flag. """ border_strip_removed = 0 if border_prefiltered: logger.info("Step 4e: skipped (border pre-filter already applied)") return border_prefiltered for z in zones_data: cells = z.get("cells", []) if not cells: continue all_wbs_with_cell: list = [] for cell in cells: for wb in cell.get("word_boxes") or []: all_wbs_with_cell.append((wb.get("left", 0), wb, cell)) if len(all_wbs_with_cell) < 10: continue all_wbs_with_cell.sort(key=lambda t: t[0]) total = len(all_wbs_with_cell) # -- Left-edge scan -- left_strip_count = 0 left_gap = 0 running_right = 0 for gi in range(total - 1): running_right = max( running_right, all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0), ) gap = all_wbs_with_cell[gi + 1][0] - running_right if gap > 30: left_strip_count = gi + 1 left_gap = gap break # -- Right-edge scan -- right_strip_count = 0 right_gap = 0 running_left = all_wbs_with_cell[-1][0] for gi in range(total - 1, 0, -1): running_left = min(running_left, all_wbs_with_cell[gi][0]) prev_right = ( all_wbs_with_cell[gi - 1][0] + all_wbs_with_cell[gi - 1][1].get("width", 0) ) gap = running_left - prev_right if gap > 30: right_strip_count = total - gi right_gap = gap break strip_wbs: set = set() strip_side = "" strip_gap = 0 strip_count = 0 if left_strip_count > 0 and left_strip_count / total < 0.20: strip_side = "left" strip_count = left_strip_count strip_gap = left_gap strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]} elif right_strip_count > 0 and right_strip_count / total < 0.20: strip_side = "right" strip_count = right_strip_count strip_gap = right_gap strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]} if not strip_wbs: continue for cell in cells: wbs = cell.get("word_boxes") or [] filtered = [wb for wb in wbs if id(wb) not in strip_wbs] if len(filtered) < len(wbs): border_strip_removed += len(wbs) - len(filtered) cell["word_boxes"] = filtered cell["text"] = _words_to_reading_order_text(filtered) z["cells"] = [c for c in cells if (c.get("word_boxes") or c.get("text", "").strip())] logger.info( "Step 4e: removed %d border-strip word_boxes (%s) from zone %d " "(gap=%dpx, strip=%d/%d wbs)", border_strip_removed, strip_side, z.get("zone_index", 0), strip_gap, strip_count, total, ) return border_prefiltered def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None: """Remove decorative edge columns (alphabet sidebar safety net). Dictionary pages have A-Z letter sidebars that OCR reads as single- character word_boxes. """ for z in zones_data: columns = z.get("columns", []) cells = z.get("cells", []) if len(columns) < 3 or not cells: continue col_cells: Dict[str, List[Dict]] = {} for cell in cells: ct = cell.get("col_type", "") if ct.startswith("column_"): col_cells.setdefault(ct, []).append(cell) col_types_ordered = sorted(col_cells.keys()) if len(col_types_ordered) < 3: continue for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]: edge_cells_list = col_cells.get(edge_ct, []) if len(edge_cells_list) < 3: continue texts = [(c.get("text") or "").strip() for c in edge_cells_list] avg_len = sum(len(t) for t in texts) / len(texts) single_char = sum(1 for t in texts if len(t) <= 1) single_ratio = single_char / len(texts) if avg_len > 1.5: continue if single_ratio < 0.7: continue removed_count = len(edge_cells_list) edge_ids = {id(c) for c in edge_cells_list} z["cells"] = [c for c in cells if id(c) not in edge_ids] z["columns"] = [col for col in columns if col.get("col_type") != edge_ct] logger.info( "Step 4f: removed decorative edge column '%s' from zone %d " "(%d cells, avg_len=%.1f, single_char=%.0f%%)", edge_ct, z.get("zone_index", 0), removed_count, avg_len, single_ratio * 100, ) break # only remove one edge per zone