Improve garbled IPA cleanup: trailing strip, prefix match, broader guard
1. Strip trailing garbled IPA after proper [IPA] brackets (e.g. "sea [sˈiː] si:" → "sea [sˈiː]"). 2. Add prefix matching for merged tokens where OCR joined the headword with garbled IPA (e.g. "schoolbagsku:lbæg" → "schoolbag [skˈuːlbæɡ]"). 3. Broaden the guard to also trigger on trailing non-dictionary words (e.g. "scare skea" → "scare [skˈɛə]"). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1022,11 +1022,6 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
||||
(e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the text
|
||||
for the headword, inserts correct [IPA], and strips the garbled fragments.
|
||||
|
||||
IMPORTANT: This function must only be called when ``_text_has_garbled_ipa``
|
||||
confirms that the text actually contains garbled phonetics. If the text
|
||||
is clean (e.g. just "scissors"), IPA must NOT be inserted — the original
|
||||
page had no phonetics on that line.
|
||||
|
||||
Only inserts for words that:
|
||||
- are standalone (not already followed by a bracket)
|
||||
- have an IPA entry in the dictionary
|
||||
@@ -1065,6 +1060,19 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
||||
# Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
|
||||
if not ipa and '-' in clean:
|
||||
ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
|
||||
# Fallback: prefix matching for merged tokens where OCR joined
|
||||
# headword with garbled IPA (e.g. "schoolbagsku:lbæg",
|
||||
# "Scotland'skotland"). Find longest dictionary prefix.
|
||||
if not ipa and len(clean) > 4:
|
||||
for end in range(len(clean) - 1, 2, -1):
|
||||
prefix = clean[:end]
|
||||
test_ipa = _lookup_ipa(prefix, pronunciation)
|
||||
if test_ipa:
|
||||
ipa = test_ipa
|
||||
# Replace token with just the headword prefix
|
||||
w = prefix
|
||||
words[i] = prefix
|
||||
break
|
||||
if ipa:
|
||||
words[i] = f"{w} [{ipa}]"
|
||||
# Strip garbled OCR phonetics after the IPA bracket.
|
||||
@@ -1096,6 +1104,87 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
||||
return ' '.join(words)
|
||||
|
||||
|
||||
def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
    """Report whether a headword is followed only by non-dictionary tokens.

    Serves as a second trigger for ``_insert_missing_ipa`` in cases where
    ``_text_has_garbled_ipa`` says False because the garbled phonetics
    look like plain ASCII (e.g. "skea" for /skɛə/).

    Args:
        text: Cell text to inspect; it is split on whitespace.
        pronunciation: Variant passed through to ``_lookup_ipa``.

    Returns:
        True when the text has 2 to 6 tokens, a headword is found before
        the last token, and no token after the headword is a delimiter,
        starts with an uppercase letter, or resolves via ``_lookup_ipa``.
        False otherwise, including when ``IPA_AVAILABLE`` is falsy.
    """
    if not IPA_AVAILABLE:
        return False

    tokens = text.strip().split()
    if not 2 <= len(tokens) <= 6:
        return False

    # The headword is the first token (letters, apostrophes and hyphens
    # only) that is at least two characters long, is not a grammar-bracket
    # word, and has a dictionary entry.
    headword_pos = None
    for pos, token in enumerate(tokens):
        letters = re.sub(r'[^a-zA-Z\'-]', '', token)
        if len(letters) < 2 or letters.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        if _lookup_ipa(letters, pronunciation):
            headword_pos = pos
            break

    # No headword, or the headword is the last token: nothing trails it.
    if headword_pos is None or headword_pos >= len(tokens) - 1:
        return False

    # A single "legitimate" trailing token is enough to reject the text.
    for token in tokens[headword_pos + 1:]:
        if token in ('–', '—', '-', '/', '|', ',', ';'):
            return False
        letters = re.sub(r'[^a-zA-Z]', '', token)
        if not letters:
            # Tokens without ASCII letters never veto the result.
            continue
        if letters[0].isupper():
            return False
        if len(letters) >= 2 and _lookup_ipa(letters, pronunciation):
            return False

    return True
|
||||
|
||||
|
||||
def _strip_post_bracket_garbled(
|
||||
text: str, pronunciation: str = 'british',
|
||||
) -> str:
|
||||
"""Strip garbled IPA fragments that trail after proper [IPA] brackets.
|
||||
|
||||
E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
|
||||
``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
|
||||
"""
|
||||
if ']' not in text:
|
||||
return text
|
||||
last_bracket = text.rfind(']')
|
||||
if last_bracket >= len(text) - 1:
|
||||
return text
|
||||
before = text[:last_bracket + 1].rstrip()
|
||||
after = text[last_bracket + 1:].strip()
|
||||
if not after:
|
||||
return text
|
||||
after_words = after.split()
|
||||
kept: List[str] = []
|
||||
for idx, w in enumerate(after_words):
|
||||
# Delimiter — keep rest
|
||||
if w in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Contains IPA markers (length mark, IPA chars) — garbled, skip
|
||||
if ':' in w or any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋˈˌ'):
|
||||
continue
|
||||
clean = re.sub(r'[^a-zA-Z]', '', w)
|
||||
# Uppercase — likely German, keep rest
|
||||
if clean and clean[0].isupper():
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Known English word — keep rest
|
||||
if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation):
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Unknown short word — likely garbled, skip
|
||||
if kept:
|
||||
return before + ' ' + ' '.join(kept)
|
||||
return before
|
||||
|
||||
|
||||
def fix_ipa_continuation_cell(
|
||||
garbled_text: str,
|
||||
headword_text: str,
|
||||
@@ -1242,11 +1331,15 @@ def fix_cell_phonetics(
|
||||
if col_type == 'column_en':
|
||||
# Full processing: replace garbled IPA, strip orphan brackets.
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
|
||||
if new_text == text and _text_has_garbled_ipa(text):
|
||||
# Only insert IPA when there IS garbled phonetics in the
|
||||
# text — never add IPA to clean text that had none on the
|
||||
# original page.
|
||||
new_text = _insert_missing_ipa(text, pronunciation)
|
||||
if new_text == text:
|
||||
# Insert IPA when garbled phonetics exist OR when trailing
|
||||
# non-dictionary words suggest garbled IPA in plain ASCII.
|
||||
if _text_has_garbled_ipa(text) or _has_non_dict_trailing(text, pronunciation):
|
||||
new_text = _insert_missing_ipa(text, pronunciation)
|
||||
# Strip trailing garbled fragments after proper [IPA] brackets
|
||||
# (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
|
||||
if ']' in new_text:
|
||||
new_text = _strip_post_bracket_garbled(new_text, pronunciation)
|
||||
else:
|
||||
# column_text: replace garbled IPA, no orphan stripping
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
|
||||
|
||||
Reference in New Issue
Block a user