"""
Cell-level IPA phonetic fixes for overlay mode.

In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
(entry['english']). But the overlay reads cell['text'] directly, so
phonetic fixes must be applied to cells too.

Split from cv_ocr_engines.py — contains fix_cell_phonetics() and helpers.
"""

import logging
import re
from typing import Any, Dict, List

from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
    _insert_missing_ipa,
    _replace_phonetics_in_text,
    _text_has_garbled_ipa,
)
from cv_ocr_ipa_repair import (
    _has_non_dict_trailing,
    _insert_headword_ipa,
    _strip_post_bracket_garbled,
)

logger = logging.getLogger(__name__)

# Column types whose cell text is run through the IPA fixes.
_IPA_COL_TYPES = frozenset({'column_en', 'column_text'})

# Matches every character that is not an ASCII or German letter; used to
# decide whether a word box holds a real word or a numeric/symbol prefix.
_NON_LETTER_RE = re.compile(r'[^a-zA-ZäöüÄÖÜß]')

# Default minimum horizontal gap (in pixels) between the headword box and
# the following box that counts as evidence of a missed IPA bracket.
_DEFAULT_IPA_GAP_PX = 80


def fix_cell_phonetics(
    cells: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Apply IPA phonetic fixes to cell texts for overlay mode.

    In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
    (entry['english']). But the overlay reads cell['text'] directly, so
    phonetic fixes must be applied to cells too.

    Processing depends on the cell's ``col_type``:

    - ``column_en``: full processing — replace garbled IPA, strip orphan
      brackets, insert missing IPA when the text looks like it carries
      garbled phonetics, and strip garbled fragments trailing a bracket.
    - ``column_text``: light processing — replace garbled IPA without
      orphan-bracket stripping (brackets may be German content like
      "(probieren)"). Headword IPA is inserted only when the cell's
      ``word_boxes`` show a gap where a bracket was presumably missed; in
      that case a synthetic word box is added so tokens and boxes stay
      aligned.

    Cells of any other column type, and cells with blank text, are left
    untouched.

    Args:
        cells: Cell dicts; each is read via ``col_type``, ``text`` and
            (for ``column_text``) ``word_boxes``.
        pronunciation: Pronunciation variant forwarded to the IPA helpers.

    Returns:
        The same ``cells`` list. Cells are modified in place: ``text`` is
        overwritten when it changed, and ``word_boxes`` may gain one entry.
    """
    if not IPA_AVAILABLE:
        return cells

    replaced = 0
    for cell in cells:
        col_type = cell.get('col_type', '')
        if col_type not in _IPA_COL_TYPES:
            continue
        text = cell.get('text', '') or ''
        if not text.strip():
            continue

        if col_type == 'column_en':
            new_text = _fix_english_cell_text(text, pronunciation)
        else:
            new_text = _fix_text_column_cell(cell, text, pronunciation)

        if new_text != text:
            # Lazy %-formatting: the message is only built when DEBUG is on.
            logger.debug("fix_cell_phonetics: '%s' → '%s'", text, new_text)
            cell['text'] = new_text
            replaced += 1

    if replaced:
        logger.info(
            "fix_cell_phonetics: %d IPA fixes in %d cells", replaced, len(cells)
        )
    return cells


def _fix_english_cell_text(text: str, pronunciation: str) -> str:
    """Return ``text`` after the full ``column_en`` IPA processing."""
    new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
    # Insert IPA only when replacement changed nothing AND the text either
    # has garbled phonetics or trailing non-dictionary words that suggest
    # garbled IPA written in plain ASCII.
    if new_text == text and (
        _text_has_garbled_ipa(text) or _has_non_dict_trailing(text, pronunciation)
    ):
        new_text = _insert_missing_ipa(text, pronunciation)
    # Strip trailing garbled fragments after proper [IPA] brackets
    # (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
    if ']' in new_text:
        new_text = _strip_post_bracket_garbled(new_text, pronunciation)
    return new_text


def _fix_text_column_cell(cell: Dict[str, Any], text: str, pronunciation: str) -> str:
    """Return ``text`` after the light ``column_text`` IPA processing.

    May insert a synthetic entry into ``cell['word_boxes']`` when headword
    IPA is added.
    """
    new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
    if new_text != text:
        return new_text

    # Insert headword IPA ONLY if there's a gap in word_boxes suggesting
    # Tesseract missed an IPA bracket on the page. Without gap evidence,
    # the original page had no IPA.
    if not _has_ipa_gap(text, cell.get('word_boxes', [])):
        return text

    inserted = _insert_headword_ipa(text, pronunciation)
    if inserted != text:
        _sync_word_boxes_after_ipa_insert(cell, text, inserted)
    return inserted


def _has_ipa_gap(
    text: str,
    word_boxes: List[Dict],
    *,
    min_gap_px: int = _DEFAULT_IPA_GAP_PX,
) -> bool:
    """Check if word_boxes show a gap where IPA brackets should be.

    On a typical vocab page, the layout is:
        headword  [ipa]  German translation

    If Tesseract missed the IPA bracket, the gap between the headword
    and the next word (German translation) is unusually large (>80px)
    because the IPA occupied physical space on the page.

    If no IPA was on the page (e.g. "be good at sth."), the words are
    close together (<30px).

    Args:
        text: Cell text; only checked for containing at least one token.
        word_boxes: Word box dicts read via ``text``, ``left`` and ``width``.
        min_gap_px: Gap (exclusive) above which a missed bracket is assumed.

    Returns:
        True when the horizontal distance between the headword box and the
        box following it exceeds ``min_gap_px``. False when there are fewer
        than two boxes, the text is blank, the headword is the last box, or
        the boxes lack usable geometry.
    """
    if not word_boxes or len(word_boxes) < 2:
        return False
    if not text.split():
        return False

    # Find the headword index: skip numeric prefixes like "».55", "0.56".
    # The headword is the first box with at least two letters; if no box
    # qualifies, the first box is used.
    hw_box_idx = next(
        (
            idx
            for idx, box in enumerate(word_boxes)
            if len(_NON_LETTER_RE.sub('', box.get('text') or '')) >= 2
        ),
        0,
    )
    if hw_box_idx >= len(word_boxes) - 1:
        return False

    # Check gap between headword and the next word_box.
    headword_box = word_boxes[hw_box_idx]
    following_box = word_boxes[hw_box_idx + 1]
    try:
        gap = following_box['left'] - (headword_box['left'] + headword_box['width'])
    except (KeyError, TypeError):
        # Missing or non-numeric geometry gives no gap evidence; do not
        # abort the whole phonetics pass over one incomplete box.
        return False
    return gap > min_gap_px


def _sync_word_boxes_after_ipa_insert(
    cell: Dict[str, Any],
    old_text: str,
    new_text: str,
) -> None:
    """Insert a synthetic word_box for an IPA token added by IPA insertion.

    E.g. "challenge ..." → "challenge [tʃælɪndʒ] ..."
    Adds a new word_box right after the headword's box so the
    1:1 token-to-box mapping in the frontend overlay stays consistent.

    Does nothing when the cell has no word boxes, when ``new_text`` does not
    have exactly one token more than ``old_text``, or when the inserted
    token has no preceding box to anchor to.

    Args:
        cell: Cell dict whose ``word_boxes`` list is modified in place.
        old_text: Cell text before IPA insertion.
        new_text: Cell text after IPA insertion.
    """
    word_boxes = cell.get('word_boxes')
    if not word_boxes:
        return

    old_tokens = old_text.split()
    new_tokens = new_text.split()
    if len(new_tokens) != len(old_tokens) + 1:
        return  # unexpected change, skip

    # The inserted token sits at the first position where the two token
    # lists disagree; if every old token matches, it was appended at the end.
    insert_idx = next(
        (
            idx
            for idx, (new_tok, old_tok) in enumerate(zip(new_tokens, old_tokens))
            if new_tok != old_tok
        ),
        len(old_tokens),
    )

    # The headword is at insert_idx - 1 in old_tokens (and word_boxes).
    ref_idx = insert_idx - 1
    if ref_idx < 0 or ref_idx >= len(word_boxes):
        return

    ref_box = word_boxes[ref_idx]
    # The synthetic box starts 2px right of the headword box and reuses its
    # vertical position, size and confidence.
    ipa_box = {
        'text': new_tokens[insert_idx],
        'left': ref_box['left'] + ref_box['width'] + 2,
        'top': ref_box['top'],
        'width': ref_box['width'],
        'height': ref_box['height'],
        'conf': ref_box.get('conf', 90),
    }
    word_boxes.insert(insert_idx, ipa_box)