""" Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations, dictionary detection, syllable dividers, spell checking, empty column removal, and result assembly. Extracted from grid_build_core.py for maintainability. """ import logging import re from typing import Any, Dict, List, Optional from grid_build_cell_ops import ( _remove_bullets_and_artifacts, _remove_garbled_cells, _normalize_word_order, _enforce_max_columns, ) logger = logging.getLogger(__name__) def _finalize_grid( zones_data: List[Dict[str, Any]], all_words: List[Dict[str, Any]], img_bgr: Any, img_w: int, img_h: int, session_id: str, max_columns: Optional[int], ipa_mode: str, syllable_mode: str, en_col_type: Optional[str], ipa_target_cols: set, all_content_cols: set, skip_ipa: bool, document_category: Optional[str], margin_strip_detected: bool, page_number_info: Optional[Dict], boxes_detected: int, recovered_count: int, duration: float, ) -> dict: """Run final processing steps and assemble result dict. Handles: bullet removal, artifact cells, word ordering, max_columns, dictionary detection, syllable dividers, spell check, empty columns, internal flag cleanup, and result assembly. """ total_cols = sum(len(z.get("columns", [])) for z in zones_data) # 5i. Remove blue bullet/artifact word_boxes _remove_bullets_and_artifacts(zones_data) # 5j-pre. Remove cells whose text is entirely garbled / artifact noise _remove_garbled_cells(zones_data) # 5j. Normalise word_box order to reading order _normalize_word_order(zones_data) # 5k. Enforce max_columns by merging narrowest columns if max_columns and max_columns > 0: _enforce_max_columns(zones_data, max_columns) # --- Dictionary detection on assembled grid --- dict_detection = _detect_dictionary( zones_data, img_w, img_h, document_category, margin_strip_detected ) # --- Word-gap merge --- try: from cv_syllable_detect import merge_word_gaps_in_zones merge_word_gaps_in_zones(zones_data, session_id) except Exception as e: logger.warning("Word-gap merge failed: %s", e) # --- Pipe auto-correction --- try: from cv_syllable_detect import autocorrect_pipe_artifacts autocorrect_pipe_artifacts(zones_data, session_id) except Exception as e: logger.warning("Pipe autocorrect failed: %s", e) # --- Syllable divider insertion --- syllable_insertions = _insert_syllable_dividers( zones_data, img_bgr, session_id, syllable_mode, dict_detection, en_col_type, all_content_cols, total_cols, ) # --- Split merged words --- _split_merged_words(zones_data, session_id) # --- Ensure space before IPA/phonetic brackets --- _fix_ipa_spacing(zones_data) # --- SmartSpellChecker --- _run_spell_checker(zones_data, session_id, en_col_type, total_cols) # --- Debug log cell counts per column --- for z in zones_data: if z.get("zone_type") == "content": from collections import Counter as _Counter _cc = _Counter(c.get("col_index") for c in z.get("cells", [])) _cols = z.get("columns", []) logger.info( "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s", z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())), ) # --- Remove empty columns --- _remove_empty_columns(zones_data) # Clean up internal flags before returning for z in zones_data: for cell in z.get("cells", []): cell.pop("_ipa_corrected", None) # 6. Build result return _assemble_result( zones_data, all_words, img_w, img_h, session_id, ipa_mode, syllable_mode, ipa_target_cols, skip_ipa, dict_detection, page_number_info, boxes_detected, recovered_count, duration, syllable_insertions, ) def _detect_dictionary( zones_data: List[Dict[str, Any]], img_w: int, img_h: int, document_category: Optional[str], margin_strip_detected: bool, ) -> Dict[str, Any]: """Run dictionary detection on the assembled grid.""" from cv_layout import _score_dictionary_signals dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0} try: from cv_vocab_types import ColumnGeometry for z in zones_data: zone_cells = z.get("cells", []) zone_cols = z.get("columns", []) if len(zone_cols) < 2 or len(zone_cells) < 10: continue pseudo_geoms = [] for col in zone_cols: ci = col["index"] col_cells = [c for c in zone_cells if c.get("col_index") == ci] col_words = [] for cell in col_cells: for wb in cell.get("word_boxes") or []: col_words.append({ "text": wb.get("text", ""), "conf": wb.get("conf", 0), "top": wb.get("top", 0), "left": wb.get("left", 0), "height": wb.get("height", 0), "width": wb.get("width", 0), }) if not cell.get("word_boxes") and cell.get("text"): col_words.append({ "text": cell["text"], "conf": cell.get("confidence", 50), "top": cell.get("bbox_px", {}).get("y", 0), "left": cell.get("bbox_px", {}).get("x", 0), "height": cell.get("bbox_px", {}).get("h", 20), "width": cell.get("bbox_px", {}).get("w", 50), }) col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0) pseudo_geoms.append(ColumnGeometry( index=ci, x=col.get("x_min_px", 0), y=0, width=max(col_w, 1), height=img_h, word_count=len(col_words), words=col_words, width_ratio=col_w / max(img_w, 1), )) if len(pseudo_geoms) >= 2: dd = _score_dictionary_signals( pseudo_geoms, document_category=document_category, margin_strip_detected=margin_strip_detected, ) if dd["confidence"] > dict_detection["confidence"]: dict_detection = dd except Exception as e: logger.warning("Dictionary detection failed: %s", e) return dict_detection def _insert_syllable_dividers( zones_data: List[Dict[str, Any]], img_bgr: Any, session_id: str, syllable_mode: str, dict_detection: Dict[str, Any], en_col_type: Optional[str], all_content_cols: set, total_cols: int, ) -> int: """Insert syllable dividers for dictionary pages. Returns insertion count.""" syllable_insertions = 0 if syllable_mode == "none" or img_bgr is None: if syllable_mode == "none": for z in zones_data: for cell in z.get("cells", []): t = cell.get("text", "") if "|" in t: cell["text"] = t.replace("|", "") return syllable_insertions _syllable_eligible = False if syllable_mode in ("all", "de", "en"): _syllable_eligible = True elif (dict_detection.get("is_dictionary") and dict_detection.get("article_col_index") is not None): _syllable_eligible = True _syllable_col_filter: Optional[set] = None if syllable_mode == "en": _syllable_col_filter = {en_col_type} if en_col_type else set() elif syllable_mode == "de": if en_col_type and total_cols >= 3: _syllable_col_filter = all_content_cols - {en_col_type} if _syllable_eligible: try: from cv_syllable_detect import insert_syllable_dividers force_syllables = (syllable_mode in ("all", "de", "en")) syllable_insertions = insert_syllable_dividers( zones_data, img_bgr, session_id, force=force_syllables, col_filter=_syllable_col_filter, ) except Exception as e: logger.warning("Syllable insertion failed: %s", e) return syllable_insertions def _split_merged_words( zones_data: List[Dict[str, Any]], session_id: str, ) -> None: """Split merged words using dictionary lookup.""" try: from cv_review import _try_split_merged_word, _SPELL_AVAILABLE if not _SPELL_AVAILABLE: return split_count = 0 for z in zones_data: for cell in z.get("cells", []): text = cell.get("text", "") if not text: continue parts = [] changed = False for token in text.split(): clean = token bracket_pos = clean.find('[') suffix_ipa = "" if bracket_pos > 0: suffix_ipa = clean[bracket_pos:] clean = clean[:bracket_pos] suffix_punct = "" stripped = clean.rstrip(".,!?;:'\")") if stripped != clean: suffix_punct = clean[len(stripped):] clean = stripped suffix = suffix_punct + suffix_ipa contraction = "" if "'" in clean and clean.index("'") >= 2: apos_pos = clean.index("'") contraction = clean[apos_pos:] clean = clean[:apos_pos] suffix = contraction + suffix if len(clean) >= 4 and clean.isalpha(): split = _try_split_merged_word(clean) if split: parts.append(split + suffix) changed = True continue parts.append(token) if changed: cell["text"] = " ".join(parts) split_count += 1 if split_count: logger.info("build-grid session %s: split %d merged words", session_id, split_count) except ImportError: pass def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None: """Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'.""" _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])') for z in zones_data: for cell in z.get("cells", []): text = cell.get("text", "") if text and "[" in text: fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text) if fixed != text: cell["text"] = fixed def _run_spell_checker( zones_data: List[Dict[str, Any]], session_id: str, en_col_type: Optional[str], total_cols: int, ) -> None: """Run SmartSpellChecker on all cells.""" try: from smart_spell import SmartSpellChecker _ssc = SmartSpellChecker() spell_fix_count = 0 for z in zones_data: for cell in z.get("cells", []): text = cell.get("text", "") if not text or not text.strip(): continue ct = cell.get("col_type", "") if not ct.startswith("column_"): continue if total_cols >= 3 and en_col_type: lang = "en" if ct == en_col_type else "de" elif total_cols <= 2: lang = "auto" else: lang = "auto" result = _ssc.correct_text(text, lang=lang) if result.changed: cell["text"] = result.corrected spell_fix_count += 1 if spell_fix_count: logger.info( "build-grid session %s: SmartSpellChecker fixed %d cells", session_id, spell_fix_count, ) except ImportError: logger.debug("SmartSpellChecker not available in build-grid") except Exception as e: logger.warning("SmartSpellChecker error in build-grid: %s", e) def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None: """Remove columns that have no cells assigned.""" for z in zones_data: cells = z.get("cells", []) used_col_indices = {c.get("col_index") for c in cells} old_cols = z.get("columns", []) new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices] if len(new_cols) < len(old_cols): old_to_new = {} for new_i, col in enumerate(new_cols): old_i = col.get("col_index", col.get("index", new_i)) old_to_new[old_i] = new_i col["col_index"] = new_i col["index"] = new_i col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text" for cell in cells: old_ci = cell.get("col_index", 0) cell["col_index"] = old_to_new.get(old_ci, old_ci) cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text" z["columns"] = new_cols def _assemble_result( zones_data: List[Dict[str, Any]], all_words: List[Dict[str, Any]], img_w: int, img_h: int, session_id: str, ipa_mode: str, syllable_mode: str, ipa_target_cols: set, skip_ipa: bool, dict_detection: Dict[str, Any], page_number_info: Optional[Dict], boxes_detected: int, recovered_count: int, duration: float, syllable_insertions: int, ) -> dict: """Build the final result dict (Phase 6).""" total_cells = sum(len(z.get("cells", [])) for z in zones_data) total_columns = sum(len(z.get("columns", [])) for z in zones_data) total_rows = sum(len(z.get("rows", [])) for z in zones_data) # Collect color statistics color_stats: Dict[str, int] = {} for z in zones_data: for cell in z.get("cells", []): for wb in cell.get("word_boxes", []): cn = wb.get("color_name", "black") color_stats[cn] = color_stats.get(cn, 0) + 1 # Compute layout metrics all_content_row_heights: List[float] = [] for z in zones_data: for row in z.get("rows", []): if not row.get("is_header", False): h = row.get("y_max_px", 0) - row.get("y_min_px", 0) if h > 0: all_content_row_heights.append(h) avg_row_height = ( sum(all_content_row_heights) / len(all_content_row_heights) if all_content_row_heights else 30.0 ) font_size_suggestion = max(10, int(avg_row_height * 0.6)) return { "session_id": session_id, "image_width": img_w, "image_height": img_h, "zones": zones_data, "boxes_detected": boxes_detected, "summary": { "total_zones": len(zones_data), "total_columns": total_columns, "total_rows": total_rows, "total_cells": total_cells, "total_words": len(all_words), "recovered_colored": recovered_count, "color_stats": color_stats, }, "formatting": { "bold_columns": [], "header_rows": [], }, "layout_metrics": { "page_width_px": img_w, "page_height_px": img_h, "avg_row_height_px": round(avg_row_height, 1), "font_size_suggestion_px": font_size_suggestion, }, "dictionary_detection": { "is_dictionary": dict_detection.get("is_dictionary", False), "confidence": dict_detection.get("confidence", 0.0), "signals": dict_detection.get("signals", {}), "article_col_index": dict_detection.get("article_col_index"), "headword_col_index": dict_detection.get("headword_col_index"), }, "processing_modes": { "ipa_mode": ipa_mode, "syllable_mode": syllable_mode, "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False, "syllables_applied": syllable_insertions > 0, }, "page_number": page_number_info, "duration_seconds": round(duration, 2), }