""" Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection, parenthesis fix, IPA phonetic correction, page ref extraction, and slash-IPA conversion. Extracted from grid_build_core.py for maintainability. """ import logging import re from typing import Any, Dict, List, Optional, Set, Tuple from cv_color_detect import detect_word_colors from cv_ocr_engines import ( fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, ) from grid_editor_helpers import ( _detect_heading_rows_by_color, _detect_heading_rows_by_single_cell, ) logger = logging.getLogger(__name__) def _process_text( zones_data: List[Dict[str, Any]], img_bgr: Any, img_w: int, img_h: int, ipa_mode: str, page_number_info: Optional[Dict], ) -> Dict[str, Any]: """Run color annotation, heading detection, IPA correction, and page refs. Args: zones_data: List of zone dicts (modified in place). img_bgr: BGR image array (or None). img_w: Image width. img_h: Image height. ipa_mode: IPA processing mode. page_number_info: Existing page number metadata (may be None). Returns: Dict with keys: en_col_type, ipa_target_cols, all_content_cols, skip_ipa, page_number_info. """ # 5. Color annotation on final word_boxes in cells if img_bgr is not None: all_wb: List[Dict] = [] for z in zones_data: for cell in z.get("cells", []): all_wb.extend(cell.get("word_boxes", [])) detect_word_colors(img_bgr, all_wb) # 5a. Heading detection by color + height heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h) if heading_count: logger.info("Detected %d heading rows by color+height", heading_count) # 5b. Fix unmatched parentheses in cell text for z in zones_data: for cell in z.get("cells", []): text = cell.get("text", "") if ")" in text and "(" not in text: cell["text"] = "(" + text # 5c. IPA phonetic correction all_cells = [cell for z in zones_data for cell in z.get("cells", [])] total_cols = sum(len(z.get("columns", [])) for z in zones_data) en_col_type = None ipa_target_cols: set = set() all_content_cols: set = set() skip_ipa = (ipa_mode == "none") # When ipa_mode=none, strip ALL square brackets from ALL content columns if skip_ipa: _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]') for cell in all_cells: ct = cell.get("col_type", "") if not ct.startswith("column_"): continue text = cell.get("text", "") if "[" in text: stripped = _SQUARE_BRACKET_RE_NONE.sub("", text) if stripped != text: cell["text"] = stripped.strip() cell["_ipa_corrected"] = True if not skip_ipa and total_cols >= 3: en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction( all_cells, total_cols, ipa_mode, zones_data ) elif not skip_ipa: # Collect all_content_cols even when <3 cols (needed by finalize) for cell in all_cells: ct = cell.get("col_type", "") if ct.startswith("column_") and (cell.get("text") or "").strip(): all_content_cols.add(ct) # 5e. Heading detection by single-cell rows single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h) if single_heading_count: logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count) # 5f. Strip IPA from headings for z in zones_data: for cell in z.get("cells", []): if cell.get("col_type") != "heading": continue text = cell.get("text", "") stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip() if stripped and stripped != text: cell["text"] = stripped # 5g. Extract page_ref cells and footer rows _extract_page_refs_and_footers(zones_data, page_number_info) # 5h. Convert slash-delimited IPA to bracket notation _convert_slash_ipa(zones_data, skip_ipa, en_col_type) return { "en_col_type": en_col_type, "ipa_target_cols": ipa_target_cols, "all_content_cols": all_content_cols, "skip_ipa": skip_ipa, "page_number_info": page_number_info, } def _run_ipa_correction( all_cells: List[Dict], total_cols: int, ipa_mode: str, zones_data: List[Dict[str, Any]], ) -> Tuple[Optional[str], set, set]: """Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols).""" en_col_type = None all_content_cols: set = set() # Detect English headword column via IPA signals col_ipa_count: Dict[str, int] = {} for cell in all_cells: ct = cell.get("col_type", "") if not ct.startswith("column_"): continue txt = cell.get("text", "") or "" if txt.strip(): all_content_cols.add(ct) if '[' in txt or _text_has_garbled_ipa(txt): col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1 if col_ipa_count: en_col_type = max(col_ipa_count, key=col_ipa_count.get) elif ipa_mode == "all": col_cell_count: Dict[str, int] = {} for cell in all_cells: ct = cell.get("col_type", "") if ct.startswith("column_") and (cell.get("text") or "").strip(): col_cell_count[ct] = col_cell_count.get(ct, 0) + 1 if col_cell_count: en_col_type = max(col_cell_count, key=col_cell_count.get) # Decide which columns to process based on ipa_mode en_ipa_target_cols: set = set() de_ipa_target_cols: set = set() if ipa_mode in ("auto", "en"): if en_col_type: en_ipa_target_cols.add(en_col_type) elif ipa_mode == "de": de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols elif ipa_mode == "all": if en_col_type: en_ipa_target_cols.add(en_col_type) de_ipa_target_cols = all_content_cols - en_ipa_target_cols # --- Strip IPA from columns NOT in the target set --- _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]') strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols if strip_en_ipa or ipa_mode == "none": strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols for cell in all_cells: ct = cell.get("col_type", "") if ct not in strip_cols: continue text = cell.get("text", "") if "[" in text: stripped = _SQUARE_BRACKET_RE.sub("", text) if stripped != text: cell["text"] = stripped.strip() cell["_ipa_corrected"] = True # --- English IPA (Britfone + eng_to_ipa) --- if en_ipa_target_cols: for cell in all_cells: ct = cell.get("col_type") if ct in en_ipa_target_cols: cell["_orig_col_type"] = ct cell["col_type"] = "column_en" _pre_ipa = {id(c): c.get("text", "") for c in all_cells} fix_cell_phonetics(all_cells, pronunciation="british") for cell in all_cells: orig = cell.pop("_orig_col_type", None) if orig: cell["col_type"] = orig if cell.get("text", "") != _pre_ipa.get(id(cell), ""): cell["_ipa_corrected"] = True # --- German IPA (wiki-pronunciation-dict + epitran) --- if de_ipa_target_cols: from cv_ipa_german import insert_german_ipa insert_german_ipa(all_cells, de_ipa_target_cols) ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols # Mark cells whose text was changed by IPA correction for cell in all_cells: if cell.get("text", "") != _pre_ipa.get(id(cell), ""): cell["_ipa_corrected"] = True # 5d. Fix IPA continuation cells skip_ipa = (ipa_mode == "none") _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") ipa_cont_fixed = 0 for z in ([] if skip_ipa else zones_data): rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"]) z_cells = z.get("cells", []) for idx, row in enumerate(rows_sorted): if idx == 0: continue ri = row["index"] row_cells = [c for c in z_cells if c.get("row_index") == ri] for cell in row_cells: ct = cell.get("col_type", "") if not ct.startswith("column_"): continue cell_text = (cell.get("text") or "").strip() if not cell_text: wb_texts = [w.get("text", "") for w in cell.get("word_boxes", [])] cell_text = " ".join(wb_texts).strip() if not cell_text: continue is_bracketed = ( cell_text.startswith('[') and cell_text.endswith(']') ) if is_bracketed: if not _text_has_garbled_ipa(cell_text): continue if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text): continue else: content_cells_in_row = [ c for c in row_cells if c.get("col_type", "").startswith("column_") and c.get("col_type") != "column_1" ] if len(content_cells_in_row) != 1: continue if not _text_has_garbled_ipa(cell_text): continue if any(c in _REAL_IPA_CHARS for c in cell_text): continue _words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text) if len(_words_in_text) >= 3: continue # Find headword in previous row, same column prev_ri = rows_sorted[idx - 1]["index"] prev_same_col = [ c for c in z_cells if c.get("row_index") == prev_ri and c.get("col_type") == ct ] if not prev_same_col: continue prev_text = prev_same_col[0].get("text", "") fixed = fix_ipa_continuation_cell( cell_text, prev_text, pronunciation="british", ) if fixed != cell_text: cell["text"] = fixed ipa_cont_fixed += 1 logger.info( "IPA continuation R%d %s: '%s' -> '%s'", ri, ct, cell_text, fixed, ) if ipa_cont_fixed: logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed) return en_col_type, ipa_target_cols, all_content_cols def _extract_page_refs_and_footers( zones_data: List[Dict[str, Any]], page_number_info: Optional[Dict], ) -> None: """Extract page_ref cells and footer rows from content zones. Modifies zones_data in place. Updates page_number_info if a page number footer is found. """ _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$') _NUMBER_WORDS = { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand", "and", "einhundert", "zweihundert", "dreihundert", "vierhundert", "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig", } for z in zones_data: if z.get("zone_type") != "content": continue cells = z.get("cells", []) rows = z.get("rows", []) if not rows: continue # Extract column_1 cells that look like page references page_refs = [] page_ref_cell_ids = set() for cell in cells: if cell.get("col_type") != "column_1": continue text = (cell.get("text") or "").strip() if not text: continue if not _PAGE_REF_RE.match(text): continue page_refs.append({ "row_index": cell.get("row_index"), "text": text, "bbox_pct": cell.get("bbox_pct", {}), }) page_ref_cell_ids.add(cell.get("cell_id")) # Detect footer: last non-header row if it has only 1 cell footer_rows = [] non_header_rows = [r for r in rows if not r.get("is_header")] if non_header_rows: last_row = non_header_rows[-1] last_ri = last_row["index"] last_cells = [c for c in z["cells"] if c.get("row_index") == last_ri] if len(last_cells) == 1: text = (last_cells[0].get("text") or "").strip() has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text) has_commas = ',' in text text_words = set(text.lower().split()) is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS) is_page_number = len(text) <= 20 or is_written_number if (text and not has_real_ipa and not has_commas and is_page_number and last_cells[0].get("col_type") != "heading"): footer_rows.append({ "row_index": last_ri, "text": text, "bbox_pct": last_cells[0].get("bbox_pct", {}), }) # Classify footer rows page_number_footers = [] other_footers = [] for fr in footer_rows: ft = fr["text"].strip() digits = "".join(c for c in ft if c.isdigit()) if digits and re.match(r'^[\d\s.]+$', ft): page_number_footers.append(fr) elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS): page_number_footers.append(fr) else: other_footers.append(fr) # Remove page-number footer rows from grid entirely if page_number_footers: pn_ris = {fr["row_index"] for fr in page_number_footers} z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris] z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris] pn_text = page_number_footers[0]["text"].strip() pn_digits = "".join(c for c in pn_text if c.isdigit()) if not page_number_info: page_number_info = { "text": pn_text, "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95), } if pn_digits: page_number_info["number"] = int(pn_digits) # Mark remaining footer rows if other_footers: footer_ris = {fr["row_index"] for fr in other_footers} for r in z["rows"]: if r["index"] in footer_ris: r["is_footer"] = True for c in z["cells"]: if c.get("row_index") in footer_ris: c["col_type"] = "footer" if page_refs or footer_rows: logger.info( "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d", len(page_refs), len(footer_rows), len(page_number_footers), z.get("zone_index", 0), ) if page_refs: z["page_refs"] = page_refs if other_footers: z["footer"] = other_footers def _convert_slash_ipa( zones_data: List[Dict[str, Any]], skip_ipa: bool, en_col_type: Optional[str], ) -> None: """Convert slash-delimited IPA to bracket notation. Dictionary-style pages print IPA between slashes: "tiger /'taiga/" """ _SLASH_IPA_RE = re.compile( r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1) r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars ) _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/') _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]') slash_ipa_fixed = 0 for z in ([] if skip_ipa else zones_data): for cell in z.get("cells", []): if en_col_type and cell.get("col_type") != en_col_type: continue text = cell.get("text", "") if "/" not in text: continue def _replace_slash_ipa(m: re.Match) -> str: nonlocal slash_ipa_fixed headword = m.group(1) ocr_ipa = m.group(2) inner_raw = ocr_ipa.strip("/").strip() if _SLASH_IPA_REJECT_RE.search(inner_raw): return m.group(0) clean_hw = re.sub(r'[²³¹\d]', '', headword).strip() ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None if ipa: slash_ipa_fixed += 1 return f"{headword} [{ipa}]" inner = inner_raw.lstrip("'").strip() if inner: slash_ipa_fixed += 1 return f"{headword} [{inner}]" return m.group(0) new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text) _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)') def _replace_trailing_slash(m: re.Match) -> str: nonlocal slash_ipa_fixed inner = m.group(1).strip("/").strip().lstrip("'").strip() if _SLASH_IPA_REJECT_RE.search(inner): return m.group(0) if inner: slash_ipa_fixed += 1 return f" [{inner}]" return m.group(0) new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text) if new_text == text: m = _STANDALONE_SLASH_IPA_RE.match(text) if m: inner = m.group(1).strip() if not _SLASH_IPA_REJECT_RE.search(inner): inner = inner.lstrip("'").strip() if inner: new_text = "[" + inner + "]" + text[m.end():] slash_ipa_fixed += 1 if new_text != text: cell["text"] = new_text if slash_ipa_fixed: logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)