- Added ocr_region import to cell_grid/build.py and legacy.py - Fixed circular import in engines.py via lazy import - Auto-fixed 22 unused imports via ruff --fix Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
288 lines
11 KiB
Python
288 lines
11 KiB
Python
"""
|
||
Advanced IPA repair for OCR-extracted vocabulary.
|
||
|
||
Functions that detect and fix garbled IPA fragments trailing after
|
||
headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py
|
||
to stay within the 500 LOC budget.
|
||
|
||
Contains:
|
||
- _has_non_dict_trailing: detect non-dictionary trailing words
|
||
- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
|
||
- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
|
||
- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
|
||
"""
|
||
|
||
import logging
|
||
import re
|
||
from typing import List
|
||
|
||
from ..types import IPA_AVAILABLE
|
||
from .ipa_lookup import (
|
||
_lookup_ipa,
|
||
_GRAMMAR_BRACKET_WORDS,
|
||
)
|
||
|
||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
    """Report whether a headword is trailed only by non-dictionary tokens.

    Serves as an extra trigger for ``_insert_missing_ipa`` in cases where
    ``_text_has_garbled_ipa`` answers False because the garbled IPA happens
    to read like plain ASCII (e.g. "skea" for /skɛə/).

    Returns True only when *text* has 2-6 whitespace-separated tokens, a
    token with a dictionary IPA entry is found before the last token, and
    none of the tokens after it is a delimiter, a number/numbering marker,
    a word whose first ASCII letter is uppercase, or a dictionary word.
    Always returns False when IPA support is unavailable.
    """
    if not IPA_AVAILABLE:
        return False

    tokens = text.strip().split()
    if not 2 <= len(tokens) <= 6:
        return False

    # Locate the headword: the first token that has a dictionary IPA entry
    # (very short tokens and grammar-bracket words are not candidates).
    headword_pos = None
    for pos, token in enumerate(tokens):
        letters = re.sub(r'[^a-zA-Z\'-]', '', token)
        if len(letters) < 2:
            continue
        if letters.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        if _lookup_ipa(letters, pronunciation):
            headword_pos = pos
            break

    # No headword at all, or nothing follows it.
    if headword_pos is None or headword_pos >= len(tokens) - 1:
        return False

    # Every trailing token must look like garbage; a single "legitimate"
    # token (delimiter, number, capitalised word, dictionary word) vetoes.
    separators = ('–', '—', '-', '/', '|', ',', ';')
    for trailing in tokens[headword_pos + 1:]:
        if trailing in separators:
            return False
        # Pure digits or numbering such as "1", "2.", "3)".
        if re.match(r'^[\d.)\-]+$', trailing):
            return False
        alpha = re.sub(r'[^a-zA-Z]', '', trailing)
        if not alpha:
            continue
        if alpha[0].isupper():
            return False
        if len(alpha) >= 2 and _lookup_ipa(alpha, pronunciation):
            return False
    return True
|
||
|
||
|
||
def _strip_post_bracket_garbled(
|
||
text: str, pronunciation: str = 'british',
|
||
) -> str:
|
||
"""Strip garbled IPA fragments that trail after proper [IPA] brackets.
|
||
|
||
E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
|
||
``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
|
||
``seat [sˈiːt] belt si:t belt`` → ``seat [sˈiːt] belt``
|
||
|
||
For multi-word headwords like "seat belt", a real English word ("belt")
|
||
may be followed by garbled IPA duplicates. We detect this by checking
|
||
whether the sequence after a real word contains IPA markers (`:`, `ə`,
|
||
etc.) — if so, everything from the first garbled token onward is stripped.
|
||
"""
|
||
if ']' not in text:
|
||
return text
|
||
last_bracket = text.rfind(']')
|
||
if last_bracket >= len(text) - 1:
|
||
return text
|
||
before = text[:last_bracket + 1].rstrip()
|
||
after = text[last_bracket + 1:].strip()
|
||
if not after:
|
||
return text
|
||
|
||
_IPA_MARKER_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
|
||
after_words = after.split()
|
||
kept: List[str] = []
|
||
for idx, w in enumerate(after_words):
|
||
# Delimiter — keep rest
|
||
if w in ('–', '—', '-', '/', '|', ',', ';'):
|
||
kept.extend(after_words[idx:])
|
||
break
|
||
# Contains IPA markers (length mark, IPA chars) — garbled, skip
|
||
if any(c in w for c in _IPA_MARKER_CHARS):
|
||
# Everything from here is garbled IPA — stop scanning
|
||
# but look ahead: if any remaining words are real English
|
||
# words WITHOUT IPA markers, they might be a different headword
|
||
# following. Only skip the contiguous garbled run.
|
||
continue
|
||
clean = re.sub(r'[^a-zA-Z]', '', w)
|
||
# Uppercase — likely German, keep rest
|
||
if clean and clean[0].isupper():
|
||
kept.extend(after_words[idx:])
|
||
break
|
||
# Known English word — keep it, but check if followed by garbled IPA
|
||
# (multi-word headword case like "seat [siːt] belt si:t belt")
|
||
if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation):
|
||
# Peek ahead: if next word has IPA markers, the rest is garbled
|
||
remaining = after_words[idx + 1:]
|
||
has_garbled_after = any(
|
||
any(c in rw for c in _IPA_MARKER_CHARS)
|
||
for rw in remaining
|
||
)
|
||
if has_garbled_after:
|
||
# Keep this real word but stop — rest is garbled duplication
|
||
kept.append(w)
|
||
# Still scan for delimiters/German in the remaining words
|
||
for ridx, rw in enumerate(remaining):
|
||
if rw in ('–', '—', '-', '/', '|', ',', ';'):
|
||
kept.extend(remaining[ridx:])
|
||
break
|
||
rclean = re.sub(r'[^a-zA-Z]', '', rw)
|
||
if rclean and rclean[0].isupper():
|
||
kept.extend(remaining[ridx:])
|
||
break
|
||
break
|
||
else:
|
||
kept.extend(after_words[idx:])
|
||
break
|
||
# Unknown short word — likely garbled, skip
|
||
if kept:
|
||
return before + ' ' + ' '.join(kept)
|
||
return before
|
||
|
||
|
||
def fix_ipa_continuation_cell(
    garbled_text: str,
    headword_text: str,
    pronunciation: str = 'british',
) -> str:
    """Replace garbled IPA in a continuation row with proper IPA.

    Continuation rows appear below the headword and contain only the
    printed phonetic transcription, which OCR garbles into fragments
    like ``ska:f – ska:vz`` (should be ``[skˈɑːf] – [skˈɑːvz]``).

    Args:
        garbled_text: The OCR-garbled IPA text from the continuation row.
        headword_text: The headword text from the previous row
            (e.g. ``scarf – scarves``).
        pronunciation: ``'british'`` or ``'american'``.

    Returns:
        Corrected IPA text, or ``garbled_text`` unchanged if no fix could
        be applied (IPA support unavailable, empty input, or no dictionary
        hit).  When the headword row already ends with an inline ``[...]``
        bracket, that bracket text is returned as-is.
    """
    if not IPA_AVAILABLE or not garbled_text or not headword_text:
        return garbled_text

    # Text whose words will be looked up; narrowed to the part after the
    # last bracket when the headword row already carries inline IPA.
    lookup_text = headword_text
    covered_words: set = set()

    # Headword rows like "beat [bˈiːt] , beat, beaten" already show IPA for
    # the words before the first bracket; only the rest needs IPA here.
    if re.search(r'\[[^\]]*\]', headword_text):
        first_bracket = headword_text.index('[')
        for w in headword_text[:first_bracket].split():
            clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
            if len(clean) >= 2:
                covered_words.add(clean)

        last_bracket_end = headword_text.rfind(']')
        tail = headword_text[last_bracket_end + 1:].strip()

        if not re.search(r'[a-zA-Z]{2,}', tail):
            # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]") —
            # the continuation row merely duplicates the inline IPA.
            # Search for '[' only *before* the last ']': a stray '[' left
            # after it by OCR would otherwise make the slice empty and the
            # cell would be replaced by ''.
            last_bracket_start = headword_text.rfind('[', 0, last_bracket_end)
            return headword_text[last_bracket_start:last_bracket_end + 1]

        # Only the tail words need continuation IPA.
        lookup_text = tail

    # Strip IPA brackets and parenthetical grammar annotations such as
    # "(no pl)", "(sth)", "(sb)".
    clean_hw = re.sub(r'\[[^\]]*\]', '', lookup_text)
    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
    if not clean_hw:
        return garbled_text

    # Split on dash delimiters (– — or a space-padded hyphen):
    #   "scarf – scarves"  → ["scarf", "scarves"]
    #   "see - saw - seen" → ["see", "saw", "seen"]
    parts = [
        piece.strip()
        for piece in re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
        if piece.strip()
    ]
    if not parts:
        return garbled_text

    # Articles never get IPA in vocab books.  Other function words such as
    # "down"/"up" are kept because they belong to phrasal verbs
    # (e.g. "close down").
    articles = {'the', 'a', 'an'}
    ipa_parts: List[str] = []
    for part in parts:
        # A part may be multi-word, like "secondary school".
        word_ipas: List[str] = []
        for w in part.split():
            clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
            if len(clean_w) < 2:
                continue
            lowered = clean_w.lower()
            # Skip words already shown with inline IPA, and articles.
            if lowered in covered_words or lowered in articles:
                continue
            ipa = _lookup_ipa(clean_w, pronunciation)
            if ipa:
                word_ipas.append(ipa)
        if word_ipas:
            ipa_parts.append('[' + ' '.join(word_ipas) + ']')

    if not ipa_parts:
        return garbled_text

    result = ' – '.join(ipa_parts)
    logger.debug(
        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
        garbled_text, result, lookup_text,
    )
    return result
|
||
|
||
|
||
def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA for the first English headword in a long mixed-language line.

    Unlike _insert_missing_ipa (for short column_en cells), this handles
    column_text lines of any length.  Only the first three tokens are
    considered; tokens that are not word-like (e.g. a number/symbol prefix
    such as "».55") and grammar-bracket words are skipped.  IPA is inserted
    for the first remaining word if that word:
      - has no bracket-like token (``[``, ``{``, ``(``) following it already
      - has an IPA entry in the dictionary

    The search stops at the first real word even when it has no IPA entry.

    Returns the text with ``[ipa]`` inserted after the headword (tokens are
    re-joined with single spaces), or *text* unchanged.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    words = text.strip().split()
    bracket_openers = ('[', '{', '(')

    # Second token is already a bracket: IPA (or an annotation) is present.
    if len(words) > 1 and words[1].startswith(bracket_openers):
        return text

    # Try the first few words (skip numeric prefixes like "».55", "0.56").
    for i, w in enumerate(words[:3]):
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue

        # The headword may sit behind a skipped prefix ("».55 sea [siː] …");
        # the check above only looked at the second token, so re-check the
        # token right after the chosen headword to avoid a duplicate bracket.
        if i + 1 < len(words) and words[i + 1].startswith(bracket_openers):
            return text

        ipa = _lookup_ipa(clean, pronunciation)
        if ipa:
            words[i] = f"{w} [{ipa}]"
            return ' '.join(words)
        # Stop at first real word even if no IPA found.
        break

    return text
|