Strip garbled OCR phonetics after IPA insertion

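_insert_missing_ipa now removes garbled phonetic text (e.g. "skea", "sku:l", "'sizaz") that follows the inserted IPA bracket. Stripping stops at the first delimiter (–, —, -, /, |, comma, semicolon), the first word starting with an uppercase letter (likely German or a proper noun), or the first known English word of at least two letters; that token and everything after it are kept. Fixes: "scare [skˈɛə] skea" → "scare [skˈɛə]". Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>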
2026-03-18 11:15:14 +01:00
parent f139d0903e
commit b98ea33a3a
1 changed file with 24 additions and 2 deletions
@@ -1028,8 +1028,30 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
        ipa = _lookup_ipa(clean, pronunciation)
        if ipa:
            words[i] = f"{w} [{ipa}]"
-            # Only insert for the FIRST word that has IPA
+            # Strip garbled OCR phonetics after the IPA bracket.
-            # (headword in English column)
+            # On scanned vocab pages, printed IPA is read as garbled
            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
            # After inserting correct IPA, remove remaining words that
            # aren't real English words, delimiters, or German text.
            kept = words[:i + 1]
            for j in range(i + 1, len(words)):
                wj = words[j]
                # Delimiter — keep this and everything after
                if wj in ('–', '—', '-', '/', '|', ',', ';'):
                    kept.extend(words[j:])
                    break
                # Starts with uppercase — likely German or proper noun
                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                if clean_j and clean_j[0].isupper():
                    kept.extend(words[j:])
                    break
                # Known English word (≥2 chars) — keep it and rest
                if clean_j and len(clean_j) >= 2:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
                # Otherwise — likely garbled phonetics, skip
            words = kept
            break
    return ' '.join(words)