""" IPA lookup and phonetic bracket handling for OCR-extracted vocabulary. Tesseract and other OCR engines frequently garble IPA phonetic transcriptions in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)). This module provides functions to: - Look up correct IPA pronunciations (British/American) for English words. - Detect and replace garbled phonetic brackets with dictionary IPA. - Insert missing IPA for headwords where OCR destroyed the brackets entirely. - Strip orphan brackets and post-bracket garbled fragments. - Handle IPA continuation cells (phonetics on a separate row from headword). All IPA data comes from open-source dictionaries: - Britfone (MIT) for British English - eng_to_ipa / CMU (MIT) for American English Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re from typing import Any, Dict, List, Optional from cv_vocab_types import ( IPA_AVAILABLE, _britfone_dict, _ipa_convert_american, ) logger = logging.getLogger(__name__) # --- D. Phonetic Bracket IPA Replacement --- # Pattern: word followed by any bracket type containing phonetic content. # Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc. # Match any opener ([, {, () with any closer (], }, )) — even mixed pairs. # This intentionally matches mixed brackets (e.g. {content]) because # Tesseract frequently misrecognizes bracket characters. _PHONETIC_BRACKET_RE = re.compile( r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]' ) # Unicode IPA characters — used to distinguish correct IPA (from dictionary # lookup) from garbled OCR content when stripping orphan brackets. _IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ') # Minimum word confidence for full-page Tesseract results (0-100). # Words below this threshold are OCR noise (scanner shadows, borders). 
_MIN_WORD_CONF = 30


def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up IPA for a word using the selected pronunciation dictionary.

    Args:
        word: English word to look up.
        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).

    Returns:
        IPA string or None if not found.
    """
    word_lower = word.lower().strip()
    if not word_lower:
        return None
    if pronunciation == 'british' and _britfone_dict:
        ipa = _britfone_dict.get(word_lower)
        if ipa:
            return ipa
        # Fallback to American if not in Britfone.
        # A '*' in the eng_to_ipa result marks an out-of-dictionary word,
        # so such results are rejected rather than returned.
        if _ipa_convert_american:
            result = _ipa_convert_american(word_lower)
            if result and '*' not in result:
                return result
        return None
    if pronunciation == 'american' and _ipa_convert_american:
        result = _ipa_convert_american(word_lower)
        if result and '*' not in result:
            return result
        # Fallback to Britfone if not in CMU
        if _britfone_dict:
            ipa = _britfone_dict.get(word_lower)
            if ipa:
                return ipa
        return None
    # Try any available source (reached when the requested dictionary
    # is unavailable or `pronunciation` holds an unknown value).
    if _britfone_dict:
        ipa = _britfone_dict.get(word_lower)
        if ipa:
            return ipa
    if _ipa_convert_american:
        result = _ipa_convert_american(word_lower)
        if result and '*' not in result:
            return result
    return None


def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Replace OCR'd phonetic transcriptions with dictionary IPA.

    Detects patterns like "dance [du:ns]" and replaces with correct IPA:
    - British: "dance [dˈɑːns]" (Britfone, MIT)
    - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    Only replaces if the word before brackets is found in the dictionary.
    Entries are modified in place (the 'english' field); the same list is
    returned for convenience.
    """
    if not IPA_AVAILABLE:
        return entries
    # IPA phonetics only appear in the ENGLISH field of vocab tables.
    # German and example fields contain meaningful parenthetical content:
    #   german:  "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
    #   example: "(sich beschweren)", "(brauchen)", "(jammern)"
    # These must NEVER be processed as phonetic transcriptions.
    replaced_count = 0
    for entry in entries:
        text = entry.get('english', '') or ''
        # Cheap pre-filter: skip entries with no opening bracket at all.
        if not any(ch in text for ch in '[{('):
            continue
        new_text = _replace_phonetics_in_text(text, pronunciation)
        if new_text != text:
            logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'")
            replaced_count += 1
            entry['english'] = new_text
    if replaced_count:
        logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
    return entries


# Grammar particles that appear in brackets after English words:
#   cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({
    # English prepositions/particles commonly in vocab tables
    'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
    # English grammar abbreviations used in vocab tables
    'sth', 'sb', 'adj', 'adv',
    # Number/plural/grammar annotations
    'pl', 'sg', 'sing', 'no', 'also', 'auch',
    # Regional English markers
    'ae', 'be', 'ame', 'bre',
})


def _is_grammar_bracket_content(content: str) -> bool:
    """Return True if bracket content is grammar info in the ENGLISH field.

    Grammar info:  cross (with), complain (about/of), agree (on/with)
    NOT grammar:   [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]

    Since we only process the English field, we only need to recognize
    English grammar particles. Everything else is (garbled) IPA.
    """
    if not content:
        return False
    # Split on / and spaces for patterns like (about/of), (no pl)
    tokens = re.split(r'[/\s]+', content.strip().lower())
    tokens = [t for t in tokens if t]
    if not tokens:
        return False
    # ALL tokens must be known grammar words
    return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)


def _replace_phonetics_in_text(
    text: str,
    pronunciation: str = 'british',
    strip_orphans: bool = True,
) -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Legitimate parenthetical content like (zer)brechen or (veranstaltung)
    is preserved.

    Args:
        text: Cell text possibly containing bracketed phonetics.
        pronunciation: 'british' or 'american' (see ``_lookup_ipa``).
        strip_orphans: If True, strip orphan brackets that look like garbled
            IPA. Set to False for column_text where brackets may be German
            content.

    Returns:
        Text with phonetic brackets normalized to "word [IPA]" form.
    """
    if not IPA_AVAILABLE:
        return text

    def replacer(match: re.Match) -> str:
        # Replacement callback for _PHONETIC_BRACKET_RE (word + bracket pair).
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)
        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match
        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)
        if ipa:
            # Word has IPA → bracket content is phonetic (garbled or correct).
            # Exception: grammar particles like cross (with) — keep those.
            if _is_grammar_bracket_content(bracket_content):
                return full_match
            logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
            return f"{word} [{ipa}]"
        # No IPA for this word — keep as-is
        return full_match

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    if strip_orphans:
        # Second pass: strip remaining orphan brackets that are garbled IPA.
        # These have no word before them (the main regex requires \b word \s* bracket).
        # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
        # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
        def _strip_orphan_bracket(m: re.Match) -> str:
            content = m.group(1).strip()
            # Keep grammar info: (sich beschweren), (about/of)
            if _is_grammar_bracket_content(content):
                return m.group(0)
            # Keep correct IPA (contains Unicode IPA characters)
            if any(ch in _IPA_CHARS for ch in content):
                return m.group(0)
            # Keep real-word parentheticals like (probieren), (Profit), (Geld).
            # Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
            # — they never contain a real word ≥4 letters with proper casing.
            content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
            if len(content_alpha) >= 4:
                return m.group(0)
            logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
            return ''

        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
        # NOTE(review): stripping removed brackets can leave stray edge
        # whitespace, hence the strip here; confirm it was not intended to
        # run unconditionally (only observable when strip_orphans=False).
        text = text.strip()
    return text


def _text_has_garbled_ipa(text: str) -> bool:
    """Check if text contains garbled IPA-like fragments from OCR.

    Returns True if there is evidence of OCR-mangled phonetic transcription,
    e.g. stress marks, length marks, or IPA special chars.

    This is used to decide whether ``_insert_missing_ipa`` should run:
    it must only insert IPA to *replace* garbled phonetics that are already
    in the text — never to ADD phonetics where none existed on the page.
    """
    # Bracketed text that doesn't contain valid IPA symbols is garbled OCR
    # of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
    stripped = text.strip()
    if stripped.startswith('[') and stripped.endswith(']'):
        inner = stripped[1:-1]
        # Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
        if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
            # Not a valid dictionary-style bracket like "(no pl)" — those
            # use parentheses, not square brackets. Square brackets with
            # no IPA chars are garbled phonetics.
            return True
    for w in text.strip().split():
        # Skip delimiters and very short tokens
        if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
            continue
        # Starts with stress mark (OCR read IPA stress ' as apostrophe)
        if w.startswith("'") and len(w) > 1 and not w[1:].istitle():
            return True
        if w.startswith("\u02c8") or w.startswith("\u02cc"):  # ˈ ˌ
            return True
        # Contains IPA length mark ':' in a short non-word fragment
        if ':' in w and len(w) < 12:
            # But not things like "3:00" (time) or common words.
            # After keeping only letters and ':', a token whose non-colon
            # remainder is empty (pure digits/punct like "3:00") is skipped;
            # a letters-plus-colon token like "du:ns" counts as garbled IPA.
            stripped = re.sub(r'[^a-zA-Z:]', '', w)
            if ':' in stripped and not stripped.replace(':', '').isalpha():
                continue
            return True
        # Contains IPA special characters
        if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
            return True
        # Embedded apostrophe suggesting merged garbled IPA with stress mark.
        # E.g. "Scotland'skotland" — OCR reads ˈ as '.
        # Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
        # chars to avoid contractions (don't, won't, o'clock).
        if "'" in w and not w.startswith("'"):
            apos_idx = w.index("'")
            after = w[apos_idx + 1:]
            if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
                return True
    return False


def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Try to decompose a compound word and concatenate IPA for each part.

    E.g. "schoolbag" → "school"+"bag" → IPA for both concatenated.
    Only returns IPA if ALL parts are found in the dictionary.
    Tries splits at every position (min 3 chars per part) and picks the
    split where the first part is longest.
    """
    if not IPA_AVAILABLE:
        return None
    lower = word.lower().strip()
    if len(lower) < 6:
        return None  # too short for a compound
    best_ipa = None
    best_first_len = 0
    for split_pos in range(3, len(lower) - 2):  # min 3 chars each part
        first = lower[:split_pos]
        second = lower[split_pos:]
        ipa_first = _lookup_ipa(first, pronunciation)
        ipa_second = _lookup_ipa(second, pronunciation)
        if ipa_first and ipa_second:
            # Later (larger) split positions overwrite earlier ones, so the
            # longest valid first part wins.
            if split_pos > best_first_len:
                best_first_len = split_pos
                best_ipa = ipa_first + ipa_second
    return best_ipa


def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA pronunciation for English words that have no brackets at all.

    OCR sometimes garbles the phonetic transcription into plain-text fragments
    (e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the text
    for the headword, inserts correct [IPA], and strips the garbled fragments.

    Only inserts for words that:
    - are standalone (not already followed by a bracket)
    - have an IPA entry in the dictionary
    - appear to be English headwords (at the start of text or after common
      separators like ",", ";", "•")

    This is intentionally conservative: it only inserts at the END of each
    whitespace-separated token group to avoid breaking phrases.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text
    # Skip if already has brackets (IPA replacement handles those)
    if any(ch in text for ch in '[{('):
        return text
    # Only process short text fragments (typical vocab cells).
    # Long sentences / paragraphs should not get IPA insertions.
    words = text.strip().split()
    if len(words) > 6:
        return text
    # Try to insert IPA for the first alphanumeric word
    # Typical patterns: "challenge", "profit", "film", "badge"
    for i, w in enumerate(words):
        # Clean punctuation for lookup
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        # Skip German/grammar words
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
        # Fallback 0b: compound word decomposition
        # E.g. "schoolbag" → "school"+"bag" → concatenated IPA
        if not ipa:
            ipa = _decompose_compound(clean, pronunciation)
        # Fallback 1: IPA-marker split for merged tokens where OCR
        # joined headword with its IPA (e.g. "schoolbagsku:lbæg").
        # Find the first IPA marker character (:, æ, ɪ, etc.), walk
        # backwards ≤3 chars for the onset consonant cluster, and
        # split into headword + OCR IPA.
        _IPA_SPLIT_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
        if not ipa:
            first_marker = next(
                (p for p, ch in enumerate(w) if ch in _IPA_SPLIT_CHARS),
                -1,
            )
            if first_marker >= 3:
                split = first_marker
                while (split > 0 and split > first_marker - 3
                       and w[split - 1].isalpha() and w[split - 1].islower()):
                    split -= 1
                if split >= 2:
                    headword = w[:split]
                    ocr_ipa = w[split:]
                    hw_ipa = _lookup_ipa(headword, pronunciation)
                    if not hw_ipa:
                        # Try compound decomposition for the headword part
                        hw_ipa = _decompose_compound(headword, pronunciation)
                    if hw_ipa:
                        words[i] = f"{headword} [{hw_ipa}]"
                    else:
                        # Word not in dictionary — use OCR IPA
                        words[i] = f"{headword} [{ocr_ipa}]"
                    # Everything after this merged token is dropped: the tail
                    # was part of the garbled transcription.
                    words = words[:i + 1]
                    ipa = True  # signal that we handled it
                    break
        # Fallback 2: prefix matching for merged tokens WITHOUT IPA
        # markers (e.g. "Scotland'skotland"). Find longest dictionary
        # prefix using only alpha chars to avoid punctuation matches.
        if not ipa:
            alpha = re.sub(r'[^a-zA-Z]', '', clean)
            if len(alpha) > 5:  # need at least 6 chars for meaningful split
                for end in range(len(alpha), 3, -1):  # min prefix 4 chars
                    prefix = alpha[:end]
                    test_ipa = _lookup_ipa(prefix, pronunciation)
                    if test_ipa:
                        ipa = test_ipa
                        w = prefix
                        words[i] = prefix
                        break
        if ipa:
            words[i] = f"{w} [{ipa}]"
            # Strip garbled OCR phonetics after the IPA bracket.
            # On scanned vocab pages, printed IPA is read as garbled
            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
            # After inserting correct IPA, remove remaining words that
            # aren't real English words, delimiters, or German text.
            kept = words[:i + 1]
            for j in range(i + 1, len(words)):
                wj = words[j]
                # Delimiter — keep this and everything after
                if wj in ('–', '—', '-', '/', '|', ',', ';'):
                    kept.extend(words[j:])
                    break
                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
                if re.match(r'^[\d.)\-]+$', wj):
                    kept.extend(words[j:])
                    break
                # Starts with uppercase — likely German or proper noun
                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                if clean_j and clean_j[0].isupper():
                    kept.extend(words[j:])
                    break
                # Known English word (≥2 chars) — keep it and rest
                if clean_j and len(clean_j) >= 2:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
                # Merged token: dictionary word + garbled IPA stuck together.
                # E.g. "fictionsalans'fIkfn" starts with "fiction".
                # Extract the dictionary prefix (≥4 chars) and add it with
                # IPA, but only if enough chars remain after the prefix (≥3)
                # to look like garbled IPA, not just a plural 's'.
                if clean_j and len(clean_j) >= 7:
                    for pend in range(min(len(clean_j) - 3, 15), 3, -1):
                        prefix_j = clean_j[:pend]
                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
                        if prefix_ipa:
                            kept.append(f"{prefix_j} [{prefix_ipa}]")
                            break
                    break  # rest of this token is garbled
                # Otherwise — likely garbled phonetics, skip
            words = kept
            break
    return ' '.join(words)