"""
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
garbled cell cleanup, word-box reordering, and max_columns enforcement.

Extracted from grid_build_core.py for maintainability.
"""

import logging
import re
from typing import Any, Dict, List, Tuple

from cv_ocr_engines import (
    _words_to_reading_order_text,
    _group_words_into_lines,
    _lookup_ipa,
)

logger = logging.getLogger(__name__)

# Alphabetic word (ASCII + Latin-1/Latin Extended letters, hyphen) with
# optional trailing punctuation.
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
# Any character that makes a token count as "alphanumeric" for rule (a2).
_ALNUM_RE = re.compile(r'[a-zA-Z0-9äöüÄÖÜß]')
# Characters stripped from a word before the IPA dictionary lookup.
_IPA_STRIP_RE = re.compile(r'[²³¹\d/]')
# Stand-alone symbols that are dropped by rule (a2).
_REMOVE_SYMBOLS = frozenset({'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'})
_TRAILING_PUNCT = ".,;:!?"

_COMMON_SHORT_WORDS = frozenset({
    "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
    "ob", "so", "um", "zu", "wo", "je", "oh", "or",
    "die", "der", "das", "dem", "den", "des", "ein", "und",
    "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
    "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
    "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
    "on", "or", "so", "to", "up", "us", "we",
    "the", "and", "but", "for", "not",
})
# Text made up solely of non-word characters, digits and whitespace.
_PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
# Short symbol-only cells that are kept even though they match _PURE_JUNK_RE.
_KEEP_JUNK = ('=', '(=', '=)', ';', ':', '-', '–', '—')
# Page/section style references such as "p.12" or "S3".
_PAGE_REF_RE = re.compile(r'^[pPsS]\.?\d+$')


def _in_ipa_dict(text: str) -> bool:
    """Return True if *text* is purely alphabetic and `_lookup_ipa` finds it."""
    if not text.isalpha():
        return False
    return bool(_lookup_ipa(_IPA_STRIP_RE.sub('', text.lower()), "british"))


def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Remove blue bullet/artifact word_boxes (Step 5i).

    Handles tiny coloured symbols, overlapping word_boxes, duplicate text,
    and syllable-split word merging.

    Works in place on every cell that has at least two word_boxes:

    * (a)  non-black boxes with area < 200 and conf < 85 are dropped;
    * (a2) one- or two-character tokens listed in ``_REMOVE_SYMBOLS`` are
      dropped;
    * (b)  horizontally neighbouring boxes that overlap are either merged
      (alphabetic fragments) or the weaker one is dropped;
    * (c)  adjacent blue boxes with identical text are de-duplicated.

    After the edits ``cell["word_boxes"]`` is replaced and, unless the cell
    is flagged ``_ipa_corrected``, ``cell["text"]`` is rebuilt from the
    remaining boxes.  Cells left with neither word_boxes nor text are removed
    from their zone.

    Args:
        zones_data: List of zone dicts; each may carry a ``"cells"`` list.
    """
    bullet_removed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue
            to_remove: set = set()

            # Rule (a): tiny coloured symbols
            for i, wb in enumerate(wbs):
                cn = wb.get("color_name", "black")
                if (cn != "black"
                        and wb.get("width", 0) * wb.get("height", 0) < 200
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)

            # Rule (a2): isolated non-alphanumeric symbols
            for i, wb in enumerate(wbs):
                t = (wb.get("text") or "").strip()
                if (t and len(t) <= 2 and not _ALNUM_RE.search(t)
                        and t in _REMOVE_SYMBOLS):
                    to_remove.add(i)

            # Rule (b) + (c): overlap and duplicate detection on boxes that
            # are neighbours after sorting by their left edge.
            to_merge: List[Tuple[int, int]] = []
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
                i2, w2 = indexed[p + 1]
                x1s = w1.get("left", 0)
                x1e = x1s + w1.get("width", 0)
                x2s = w2.get("left", 0)
                x2e = x2s + w2.get("width", 0)
                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                min_w = min(w1.get("width", 1), w2.get("width", 1))
                gap = x2s - x1e
                # Overlap relative to the narrower of the two boxes.
                overlap_pct = overlap / min_w if min_w > 0 else 0

                if overlap_pct > 0.20:
                    t1 = (w1.get("text") or "").strip()
                    t2 = (w2.get("text") or "").strip()

                    if _ALPHA_WORD_RE.match(t1) and _ALPHA_WORD_RE.match(t2):
                        # Syllable-split words
                        if overlap_pct <= 0.75:
                            to_merge.append((i1, i2))
                            continue
                        # High overlap with short prefix
                        s1 = t1.rstrip(_TRAILING_PUNCT)
                        s2 = t2.rstrip(_TRAILING_PUNCT)
                        if s1.lower() != s2.lower() and min(len(s1), len(s2)) <= 4:
                            to_merge.append((i1, i2))
                            continue

                    if overlap_pct <= 0.40:
                        continue

                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)

                    # Very high overlap: prefer IPA-dictionary word
                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
                        in_dict_1 = _in_ipa_dict(t1)
                        in_dict_2 = _in_ipa_dict(t2)
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
                        if in_dict_2 and not in_dict_1:
                            to_remove.add(i1)
                            continue

                    # Otherwise drop the lower-confidence box; on a tie drop
                    # the taller one (or the second if heights do not differ).
                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1:
                        to_remove.add(i2)
                    elif w1.get("height", 0) > w2.get("height", 0):
                        to_remove.add(i1)
                    else:
                        to_remove.add(i2)

                elif (gap < 6
                        and w1.get("color_name") == "blue"
                        and w2.get("color_name") == "blue"
                        and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)

            # Execute merges first (syllable-split words)
            if to_merge:
                # Maps an absorbed box index to the index it was merged into,
                # so chains (a,b), (b,c) all end up in box a.
                merge_parent: Dict[int, int] = {}
                for mi1, mi2 in to_merge:
                    actual_mi1 = mi1
                    while actual_mi1 in merge_parent:
                        actual_mi1 = merge_parent[actual_mi1]
                    if actual_mi1 in to_remove or mi2 in to_remove:
                        continue
                    if mi2 in merge_parent:
                        continue
                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    mt1 = (mw1.get("text") or "").rstrip(_TRAILING_PUNCT)
                    mt2 = (mw2.get("text") or "").strip()
                    # Geometry is read with the same defaults the detection
                    # step above uses, so a box lacking a key cannot pass
                    # detection and then raise KeyError here.
                    mx = min(mw1.get("left", 0), mw2.get("left", 0))
                    my = min(mw1.get("top", 0), mw2.get("top", 0))
                    mr = max(mw1.get("left", 0) + mw1.get("width", 0),
                             mw2.get("left", 0) + mw2.get("width", 0))
                    mb = max(mw1.get("top", 0) + mw1.get("height", 0),
                             mw2.get("top", 0) + mw2.get("height", 0))
                    mw1["text"] = mt1 + mt2
                    mw1["left"] = mx
                    mw1["top"] = my
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                    to_remove.add(mi2)
                    merge_parent[mi2] = actual_mi1
                    # An absorbed box is not an artifact removal; cancel it
                    # out of the count added below.
                    bullet_removed -= 1

            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)

    if bullet_removed:
        for z in zones_data:
            # `text` may be present but None, hence the `or ""`.
            z["cells"] = [
                c for c in z.get("cells", [])
                if (c.get("word_boxes") or (c.get("text") or "").strip())
            ]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)


def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre).

    A cell is dropped when its text, stripped of whitespace and trailing
    punctuation/quotes, is

    * empty;
    * only non-word characters, digits and whitespace (except the symbols in
      ``_KEEP_JUNK``);
    * at most 2 characters, not purely alphabetic and not a common short word;
    * at most 3 characters, all upper-case and not a common short word;
    * at most 5 characters mixing digits and ASCII letters, unless it looks
      like a page reference (``p.12``, ``S3``).

    If anything was removed, every zone's ``"rows"`` list is reduced to the
    rows still referenced by a cell's ``row_index``.

    Args:
        zones_data: List of zone dicts, modified in place.
    """
    artifact_cells_removed = 0
    for z in zones_data:
        original_cells = z.get("cells", [])
        kept = []
        for cell in original_cells:
            text = (cell.get("text") or "").strip()
            core = text.rstrip(".,;:!?'\"")
            is_artifact = False
            if not core:
                is_artifact = True
            elif _PURE_JUNK_RE.match(core):
                # NOTE(review): ';' and ':' in _KEEP_JUNK can never match
                # here because the rstrip above already removed them —
                # confirm whether such cells were meant to be kept.
                if core.strip() not in _KEEP_JUNK:
                    is_artifact = True
            elif (len(core) <= 2
                    and core.lower() not in _COMMON_SHORT_WORDS
                    and not core.isalpha()):
                is_artifact = True
            elif (len(core) <= 3
                    and core.isupper()
                    and core.lower() not in _COMMON_SHORT_WORDS):
                is_artifact = True
            elif (len(core) <= 5
                    and re.search(r'\d', core)
                    and re.search(r'[A-Za-z]', core)
                    and not _PAGE_REF_RE.match(core)):
                is_artifact = True

            if not is_artifact:
                kept.append(cell)

        z["cells"] = kept
        artifact_cells_removed += len(original_cells) - len(kept)

    if artifact_cells_removed:
        for z in zones_data:
            cell_ris = {c.get("row_index") for c in z.get("cells", [])}
            z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
        logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)


def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
    """Normalise word_box order to reading order (Step 5j).

    For every cell with two or more word_boxes the boxes are grouped into
    lines via ``_group_words_into_lines`` (``y_tolerance_px=15``) and
    flattened line by line.  ``cell["word_boxes"]`` is replaced only when
    the resulting sequence of box objects differs from the current one.

    Args:
        zones_data: List of zone dicts, modified in place.
    """
    wb_reordered = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue
            lines = _group_words_into_lines(wbs, y_tolerance_px=15)
            sorted_wbs = [w for line in lines for w in line]
            # Compare by identity: the same dict objects in a new order.
            if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
                cell["word_boxes"] = sorted_wbs
                wb_reordered += 1
    if wb_reordered:
        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)


def _col_left(col: Dict[str, Any]) -> Any:
    """Return the left edge of a column (pixels, falling back to percent, then 0)."""
    return col.get("x_min_px", col.get("x_min_pct", 0))


def _col_width_px(col: Dict[str, Any]) -> Any:
    """Return the pixel width of a column (missing bounds count as 0)."""
    return col.get("x_max_px", 0) - col.get("x_min_px", 0)


def _enforce_max_columns(
    zones_data: List[Dict[str, Any]],
    max_columns: int,
) -> None:
    """Enforce max_columns by merging narrowest columns (Step 5k).

    Only zones with ``zone_type == "content"`` are touched.  While a zone has
    more than ``max_columns`` columns, the narrowest column (by the width
    ranking computed once up front) is merged into its right neighbour, or
    into its left neighbour when it is the right-most column.  The target
    column's x-bounds are widened, cells of the removed column move to the
    target column, and when the target already has a cell in the same row the
    two cells are combined in left-to-right order.  Finally the remaining
    columns are re-indexed 0..N-1 from left to right and cell ``col_index``
    values are updated accordingly.

    Args:
        zones_data: List of zone dicts, modified in place.
        max_columns: Maximum number of columns allowed per content zone.
    """
    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cols = z.get("columns", [])
        cells = z.get("cells", [])
        if len(cols) <= max_columns:
            continue

        logger.info(
            "max_columns=%d: zone %s has %d columns -> merging",
            max_columns, z.get("zone_index"), len(cols),
        )

        cols_by_width = sorted(cols, key=_col_width_px)

        while len(cols) > max_columns:
            narrowest = cols_by_width.pop(0)
            ni = narrowest["index"]

            sorted_by_x = sorted(cols, key=_col_left)
            pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
            # Prefer the right neighbour; fall back to the left one.
            narrowest_is_left = pos + 1 < len(sorted_by_x)
            if narrowest_is_left:
                merge_target = sorted_by_x[pos + 1]
            elif pos > 0:
                merge_target = sorted_by_x[pos - 1]
            else:
                break  # single column left: nothing to merge into

            ti = merge_target["index"]

            merge_target["x_min_px"] = min(
                merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
                narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
            )
            merge_target["x_max_px"] = max(
                merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
                narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
            )
            if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
                merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
                merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])

            for cell in cells:
                if cell.get("col_index") != ni:
                    continue
                cell["col_index"] = ti
                # Skip cells already flagged `_merged`: they are deleted
                # below, so appending to them would lose the text.
                existing = next(
                    (c for c in cells
                     if c is not cell
                     and not c.get("_merged")
                     and c.get("col_index") == ti
                     and c.get("row_index") == cell.get("row_index")),
                    None,
                )
                if existing is None:
                    continue
                # Keep reading order: content of the left-hand column first.
                if narrowest_is_left:
                    first, second = cell, existing
                else:
                    first, second = existing, cell
                existing["text"] = (
                    (first.get("text") or "") + " " + (second.get("text") or "")
                ).strip()
                existing["word_boxes"] = (
                    (first.get("word_boxes") or []) + (second.get("word_boxes") or [])
                )
                cell["_merged"] = True

            z["cells"] = [c for c in cells if not c.get("_merged")]
            cells = z["cells"]
            cols.remove(narrowest)
            cols_by_width = [c for c in cols_by_width if c["index"] != ni]

        # Re-index columns 0..N-1.  Build the whole old->new mapping before
        # touching any cell: remapping column by column could renumber a cell
        # twice when the old indices are not already in left-to-right order.
        ordered_cols = sorted(cols, key=_col_left)
        remap = {col["index"]: new_idx for new_idx, col in enumerate(ordered_cols)}
        for new_idx, col in enumerate(ordered_cols):
            col["index"] = new_idx
        for cell in cells:
            old_idx = cell.get("col_index")
            if old_idx in remap:
                cell["col_index"] = remap[old_idx]

        logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))