"""Advanced IPA repair for OCR-extracted vocabulary.

Functions that detect and fix garbled IPA fragments trailing after
headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py to
stay within the 500 LOC budget.

Contains:
- _has_non_dict_trailing: detect non-dictionary trailing words
- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
"""

import logging
import re
from typing import Any, Dict, List, Optional

from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
    _lookup_ipa,
    _GRAMMAR_BRACKET_WORDS,
)

logger = logging.getLogger(__name__)

# Stand-alone tokens that separate a headword from whatever follows it.
_DELIMITER_TOKENS = frozenset({'–', '—', '-', '/', '|', ',', ';'})

# Characters whose presence marks a token as (garbled) phonetic transcription.
_IPA_MARKER_CHARS = frozenset(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')

# Token prefixes treated as "a bracket already follows the headword".
_BRACKET_OPENERS = ('[', '{', '(')

# Articles are skipped when generating continuation IPA.
_ARTICLES = frozenset({'the', 'a', 'an'})

# A complete ``[...]`` group.
_BRACKET_GROUP_RE = re.compile(r'\[[^\]]*\]')


def _has_ipa_marker(token: str) -> bool:
    """Return True if *token* contains any character from ``_IPA_MARKER_CHARS``."""
    return any(ch in _IPA_MARKER_CHARS for ch in token)


def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
    """Check if text has a headword followed by non-dictionary trailing words.

    Used as an additional trigger for ``_insert_missing_ipa`` when
    ``_text_has_garbled_ipa`` returns False because the garbled IPA
    happens to look like plain ASCII (e.g. "skea" for /skɛə/).

    Args:
        text: Cell text to inspect.
        pronunciation: Variant passed through to ``_lookup_ipa``.

    Returns:
        True only if the text has 2-6 whitespace-separated tokens, a
        dictionary headword is found that is not the last token, and none
        of the tokens after it is a delimiter, numbering, a capitalised
        word or a dictionary word. False otherwise, including when IPA
        lookup is unavailable.
    """
    if not IPA_AVAILABLE:
        return False

    words = text.strip().split()
    if not 2 <= len(words) <= 6:
        return False

    # Find the first dictionary word (the headword).
    hw_idx = -1
    for i, word in enumerate(words):
        clean = re.sub(r'[^a-zA-Z\'-]', '', word)
        if len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        if _lookup_ipa(clean, pronunciation):
            hw_idx = i
            break

    # No headword, or nothing trails after it.
    if hw_idx < 0 or hw_idx >= len(words) - 1:
        return False

    # Check ALL remaining words — if none are dictionary/delimiter/German,
    # they are likely garbled IPA.
    for trailing in words[hw_idx + 1:]:
        if trailing in _DELIMITER_TOKENS:
            return False
        # Pure digits or numbering (e.g. "1", "2.", "3)") — not garbled IPA
        if re.match(r'^[\d.)\-]+$', trailing):
            return False
        letters = re.sub(r'[^a-zA-Z]', '', trailing)
        # Capitalised word — treated as German, not garbled IPA
        if letters and letters[0].isupper():
            return False
        if len(letters) >= 2 and _lookup_ipa(letters, pronunciation):
            return False

    return True


def _strip_post_bracket_garbled(
    text: str,
    pronunciation: str = 'british',
) -> str:
    """Strip garbled IPA fragments that trail after proper [IPA] brackets.

    E.g. ``sea [sˈiː] si:``                → ``sea [sˈiː]``
         ``seat [sˈiːt] si:t``             → ``seat [sˈiːt]``
         ``seat [sˈiːt] belt si:t belt``   → ``seat [sˈiːt] belt``

    For multi-word headwords like "seat belt", a real English word ("belt")
    may be followed by garbled IPA duplicates. We detect this by checking
    whether the sequence after a real word contains IPA markers (`:`, `ə`,
    etc.) — if so, everything from the first garbled token onward is
    stripped, except a later delimiter or capitalised word and what
    follows it.

    Args:
        text: Text that may contain a ``]`` followed by trailing tokens.
        pronunciation: Variant passed through to ``_lookup_ipa``.

    Returns:
        The text up to and including the last ``]`` plus any kept trailing
        tokens joined by single spaces; the input unchanged when there is
        no ``]`` or nothing but whitespace follows the last one.
    """
    if ']' not in text:
        return text

    last_bracket = text.rfind(']')
    if last_bracket >= len(text) - 1:
        return text

    before = text[:last_bracket + 1].rstrip()
    after = text[last_bracket + 1:].strip()
    if not after:
        return text

    after_words = after.split()
    kept: List[str] = []
    for idx, word in enumerate(after_words):
        # Delimiter — keep it and everything after it.
        if word in _DELIMITER_TOKENS:
            kept.extend(after_words[idx:])
            break

        # Contains IPA markers (length mark, IPA chars) — garbled, skip
        # this token only, so a later real word can still be kept.
        if _has_ipa_marker(word):
            continue

        letters = re.sub(r'[^a-zA-Z]', '', word)

        # Uppercase — likely German, keep it and everything after it.
        if letters and letters[0].isupper():
            kept.extend(after_words[idx:])
            break

        # Known English word — keep it, but check if followed by garbled IPA
        # (multi-word headword case like "seat [siːt] belt si:t belt").
        if len(letters) >= 2 and _lookup_ipa(letters, pronunciation):
            remaining = after_words[idx + 1:]
            if not any(_has_ipa_marker(rw) for rw in remaining):
                # Nothing garbled follows — keep the whole remainder.
                kept.extend(after_words[idx:])
                break

            # Keep this real word; the rest is garbled duplication, except
            # from the first delimiter / capitalised word onward.
            kept.append(word)
            for ridx, rw in enumerate(remaining):
                if rw in _DELIMITER_TOKENS:
                    kept.extend(remaining[ridx:])
                    break
                r_letters = re.sub(r'[^a-zA-Z]', '', rw)
                if r_letters and r_letters[0].isupper():
                    kept.extend(remaining[ridx:])
                    break
            break

        # Unknown short word — likely garbled, skip

    if kept:
        return before + ' ' + ' '.join(kept)
    return before


def fix_ipa_continuation_cell(
    garbled_text: str,
    headword_text: str,
    pronunciation: str = 'british',
) -> str:
    """Replace garbled IPA in a continuation row with proper IPA.

    Continuation rows appear below the headword and contain only the
    printed phonetic transcription, which OCR garbles into fragments
    like ``ska:f – ska:vz`` (should be ``[skˈɑːf] – [skˈɑːvz]``).

    Args:
        garbled_text: The OCR-garbled IPA text from the continuation row.
        headword_text: The headword text from the previous row
                       (e.g. ``scarf – scarves``).
        pronunciation: ``'british'`` or ``'american'``.

    Returns:
        Corrected IPA text, or the original if no fix could be applied.
        When the headword row ends with an inline ``[...]`` group, that
        group is returned as-is.
    """
    if not IPA_AVAILABLE or not garbled_text or not headword_text:
        return garbled_text

    # If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
    # only generate continuation IPA for words NOT already covered.
    covered_words: set = set()
    bracket_groups = _BRACKET_GROUP_RE.findall(headword_text)
    if bracket_groups:
        # Words before the first bracket already have their IPA shown
        first_bracket = headword_text.index('[')
        for word in headword_text[:first_bracket].split():
            clean = re.sub(r'[^a-zA-Z\'-]', '', word).lower()
            if len(clean) >= 2:
                covered_words.add(clean)

        last_bracket_end = headword_text.rfind(']')
        tail = headword_text[last_bracket_end + 1:].strip()
        if not tail or not re.search(r'[a-zA-Z]{2,}', tail):
            # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
            # — return the inline IPA directly (continuation duplicates it)
            last_bracket_start = headword_text.rfind('[')
            inline_ipa = headword_text[last_bracket_start:last_bracket_end + 1]
            # A stray '[' after the last ']' (e.g. "foo [a] [") makes the
            # slice empty; fall back to the last complete [...] group
            # instead of blanking the cell.
            return inline_ipa or bracket_groups[-1]
        # Only the tail words need continuation IPA
        headword_text = tail

    # Strip existing IPA brackets and parenthetical grammar annotations
    # like "(no pl)", "(sth)", "(sb)" from headword text
    clean_hw = _BRACKET_GROUP_RE.sub('', headword_text)
    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
    if not clean_hw:
        return garbled_text

    # Split headword by delimiters (– — -)
    # "scarf – scarves" → ["scarf", "scarves"]
    # "see - saw - seen" → ["see", "saw", "seen"]
    parts = [
        piece.strip()
        for piece in re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
        if piece.strip()
    ]
    if not parts:
        return garbled_text

    # Look up IPA for each headword part.
    # Skip articles (the, a, an) — they never get IPA in vocab books.
    # Other function words like "down", "up" are kept because they are
    # integral parts of phrasal verbs (e.g. "close down").
    # Skip words that already have inline IPA in the headword row.
    ipa_parts: List[str] = []
    for part in parts:
        # A part may be multi-word like "secondary school"
        word_ipas: List[str] = []
        for word in part.split():
            clean_w = re.sub(r'[^a-zA-Z\'-]', '', word)
            if len(clean_w) < 2:
                continue
            lowered = clean_w.lower()
            if lowered in covered_words:
                continue  # Already has IPA inline in the headword
            if lowered in _ARTICLES:
                continue  # Articles never get IPA in vocab books
            ipa = _lookup_ipa(clean_w, pronunciation)
            if ipa:
                word_ipas.append(ipa)
        if word_ipas:
            ipa_parts.append('[' + ' '.join(word_ipas) + ']')

    if not ipa_parts:
        return garbled_text

    # Join with delimiter
    result = ' – '.join(ipa_parts)
    logger.debug(
        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
        garbled_text, result, headword_text,
    )
    return result


def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA for the first English headword in a long mixed-language line.

    Unlike _insert_missing_ipa (for short column_en cells), this handles
    column_text lines of any length. It only inserts IPA for the FIRST
    word if that word:
    - has no bracket following it already
    - has an IPA entry in the dictionary
    - is not a number/symbol prefix like "».55"

    Args:
        text: The line to process.
        pronunciation: Variant passed through to ``_lookup_ipa``.

    Returns:
        The text with [ipa] inserted after the first word (tokens re-joined
        with single spaces), or the input unchanged.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    words = text.strip().split()
    if not words:
        return text

    # Check if text already starts with a bracket (IPA already present)
    if len(words) > 1 and words[1].startswith(_BRACKET_OPENERS):
        return text

    # Try the first few words (skip numeric prefixes like "».55", "0.56")
    for i, word in enumerate(words[:3]):
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', word)
        if len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue

        # The headword may sit behind a skipped prefix ("0.56 sea [siː] …"),
        # so the bracket check must look at the token after *this* word,
        # not only at words[1]; otherwise a second [ipa] would be inserted.
        if i + 1 < len(words) and words[i + 1].startswith(_BRACKET_OPENERS):
            return text

        ipa = _lookup_ipa(clean, pronunciation)
        if ipa:
            words[i] = f"{word} [{ipa}]"
            return ' '.join(words)
        # Stop at first real word even if no IPA found
        break

    return text