Use OCR-recognized IPA when word not in dictionary
For merged tokens like "schoolbagsku:lbæg", split at the IPA-marker boundary instead of prefix-matching to a shorter dictionary word. If the split-off headword is in the dictionary, its dictionary IPA is used; otherwise the OCR-recognized IPA is kept. Result: "schoolbag [sku:lbæg]" instead of "school [skˈuːl]". Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1060,10 +1060,38 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
||||
# Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
|
||||
if not ipa and '-' in clean:
|
||||
ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
|
||||
# Fallback: prefix matching for merged tokens where OCR joined
|
||||
# headword with garbled IPA (e.g. "schoolbagsku:lbæg",
|
||||
# "Scotland'skotland"). Find longest dictionary prefix.
|
||||
# Use only alpha chars to avoid false matches on punctuation.
|
||||
# Fallback 1: IPA-marker split for merged tokens where OCR
|
||||
# joined headword with its IPA (e.g. "schoolbagsku:lbæg").
|
||||
# Find the first IPA marker character (:, æ, ɪ, etc.), walk
|
||||
# backwards ≤3 chars for the onset consonant cluster, and
|
||||
# split into headword + OCR IPA.
|
||||
_IPA_SPLIT_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
|
||||
if not ipa:
|
||||
first_marker = next(
|
||||
(p for p, ch in enumerate(w) if ch in _IPA_SPLIT_CHARS), -1,
|
||||
)
|
||||
if first_marker >= 3:
|
||||
split = first_marker
|
||||
while (split > 0
|
||||
and split > first_marker - 4
|
||||
and w[split - 1].isalpha()
|
||||
and w[split - 1].islower()):
|
||||
split -= 1
|
||||
if split >= 2:
|
||||
headword = w[:split]
|
||||
ocr_ipa = w[split:]
|
||||
hw_ipa = _lookup_ipa(headword, pronunciation)
|
||||
if hw_ipa:
|
||||
words[i] = f"{headword} [{hw_ipa}]"
|
||||
else:
|
||||
# Word not in dictionary — use OCR IPA
|
||||
words[i] = f"{headword} [{ocr_ipa}]"
|
||||
words = words[:i + 1]
|
||||
ipa = True # signal that we handled it
|
||||
break
|
||||
# Fallback 2: prefix matching for merged tokens WITHOUT IPA
|
||||
# markers (e.g. "Scotland'skotland"). Find longest dictionary
|
||||
# prefix using only alpha chars to avoid punctuation matches.
|
||||
if not ipa:
|
||||
alpha = re.sub(r'[^a-zA-Z]', '', clean)
|
||||
if len(alpha) > 5: # need at least 6 chars for meaningful split
|
||||
|
||||
Reference in New Issue
Block a user