Strip garbled OCR phonetics after IPA insertion
`_insert_missing_ipa` now removes garbled phonetic text (e.g. "skea", "sku:l", "'sizaz") that follows the inserted IPA bracket. Stripping stops at the first delimiter (e.g. –, -), uppercase word (likely German), or known English word; that token and everything after it are kept.

Fixes: "scare [skˈɛə] skea" → "scare [skˈɛə]"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1028,8 +1028,30 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
|||||||
ipa = _lookup_ipa(clean, pronunciation)
|
ipa = _lookup_ipa(clean, pronunciation)
|
||||||
if ipa:
|
if ipa:
|
||||||
words[i] = f"{w} [{ipa}]"
|
words[i] = f"{w} [{ipa}]"
|
||||||
# Only insert for the FIRST word that has IPA
|
# Strip garbled OCR phonetics after the IPA bracket.
|
||||||
# (headword in English column)
|
# On scanned vocab pages, printed IPA is read as garbled
|
||||||
|
# text (e.g. "scare skea" where "skea" is garbled /skɛə/).
|
||||||
|
# After inserting correct IPA, remove remaining words that
|
||||||
|
# aren't real English words, delimiters, or German text.
|
||||||
|
kept = words[:i + 1]
|
||||||
|
for j in range(i + 1, len(words)):
|
||||||
|
wj = words[j]
|
||||||
|
# Delimiter — keep this and everything after
|
||||||
|
if wj in ('–', '—', '-', '/', '|', ',', ';'):
|
||||||
|
kept.extend(words[j:])
|
||||||
|
break
|
||||||
|
# Starts with uppercase — likely German or proper noun
|
||||||
|
clean_j = re.sub(r'[^a-zA-Z]', '', wj)
|
||||||
|
if clean_j and clean_j[0].isupper():
|
||||||
|
kept.extend(words[j:])
|
||||||
|
break
|
||||||
|
# Known English word (≥2 chars) — keep it and rest
|
||||||
|
if clean_j and len(clean_j) >= 2:
|
||||||
|
if _lookup_ipa(clean_j, pronunciation):
|
||||||
|
kept.extend(words[j:])
|
||||||
|
break
|
||||||
|
# Otherwise — likely garbled phonetics, skip
|
||||||
|
words = kept
|
||||||
break
|
break
|
||||||
|
|
||||||
return ' '.join(words)
|
return ' '.join(words)
|
||||||
|
|||||||
Reference in New Issue
Block a user