Improve garbled IPA cleanup: trailing strip, prefix match, broader guard

1. Strip trailing garbled IPA after proper [IPA] brackets
   (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
2. Add prefix matching for merged tokens where OCR joined headword
   with garbled IPA (e.g. "schoolbagsku:lbæg" → "schoolbag [skˈuːlbæɡ]")
3. Broaden guard to also trigger on trailing non-dictionary words
   (e.g. "scare skea" → "scare [skˈɛə]")

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-19 10:36:25 +01:00
parent fc0ab84e40
commit 19cbbf310a

View File

@@ -1022,11 +1022,6 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
(e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the text
for the headword, inserts correct [IPA], and strips the garbled fragments.
IMPORTANT: This function must only be called when ``_text_has_garbled_ipa``
confirms that the text actually contains garbled phonetics. If the text
is clean (e.g. just "scissors"), IPA must NOT be inserted — the original
page had no phonetics on that line.
Only inserts for words that:
- are standalone (not already followed by a bracket)
- have an IPA entry in the dictionary
@@ -1065,6 +1060,19 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
# Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
if not ipa and '-' in clean:
ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
# Fallback: prefix matching for merged tokens where OCR joined
# headword with garbled IPA (e.g. "schoolbagsku:lbæg",
# "Scotland'skotland"). Find longest dictionary prefix.
if not ipa and len(clean) > 4:
for end in range(len(clean) - 1, 2, -1):
prefix = clean[:end]
test_ipa = _lookup_ipa(prefix, pronunciation)
if test_ipa:
ipa = test_ipa
# Replace token with just the headword prefix
w = prefix
words[i] = prefix
break
if ipa:
words[i] = f"{w} [{ipa}]"
# Strip garbled OCR phonetics after the IPA bracket.
@@ -1096,6 +1104,87 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
return ' '.join(words)
def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
    """Report whether a headword is followed only by unrecognised tokens.

    Serves as a second trigger for ``_insert_missing_ipa`` for lines where
    the garbled phonetics are plain ASCII (e.g. "scare skea") and are
    therefore not flagged by ``_text_has_garbled_ipa``.

    Args:
        text: Cell text to inspect.
        pronunciation: Variant forwarded to ``_lookup_ipa``.

    Returns:
        True when the text has 2-6 whitespace-separated tokens, one of them
        is a dictionary headword (grammar-bracket words are ignored), at
        least one token follows it, and none of the following tokens is a
        delimiter, a capitalised word, or a dictionary word. False in every
        other case, including when IPA support is unavailable.
    """
    if not IPA_AVAILABLE:
        return False

    tokens = text.strip().split()
    if not 2 <= len(tokens) <= 6:
        return False

    # Locate the headword: the first token with a dictionary entry.
    headword_pos = None
    for pos, token in enumerate(tokens):
        letters = re.sub(r'[^a-zA-Z\'-]', '', token)
        if len(letters) < 2:
            continue
        if letters.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        if _lookup_ipa(letters, pronunciation):
            headword_pos = pos
            break

    # No headword, or nothing after it: there is nothing to judge.
    if headword_pos is None or headword_pos >= len(tokens) - 1:
        return False

    # Any trailing token that looks legitimate vetoes the "garbled" verdict.
    for token in tokens[headword_pos + 1:]:
        # NOTE(review): the two '' entries can never equal a token produced
        # by split(); possibly dash characters lost in transit — confirm
        # against the repository.
        if token in ('', '', '-', '/', '|', ',', ';'):
            return False
        letters = re.sub(r'[^a-zA-Z]', '', token)
        if not letters:
            continue
        # Capitalised token: treated as real text (likely German), not IPA.
        if letters[0].isupper():
            return False
        if len(letters) >= 2 and _lookup_ipa(letters, pronunciation):
            return False
    return True
def _strip_post_bracket_garbled(
text: str, pronunciation: str = 'british',
) -> str:
"""Strip garbled IPA fragments that trail after proper [IPA] brackets.
E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
"""
if ']' not in text:
return text
last_bracket = text.rfind(']')
if last_bracket >= len(text) - 1:
return text
before = text[:last_bracket + 1].rstrip()
after = text[last_bracket + 1:].strip()
if not after:
return text
after_words = after.split()
kept: List[str] = []
for idx, w in enumerate(after_words):
# Delimiter — keep rest
if w in ('', '', '-', '/', '|', ',', ';'):
kept.extend(after_words[idx:])
break
# Contains IPA markers (length mark, IPA chars) — garbled, skip
if ':' in w or any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋˈˌ'):
continue
clean = re.sub(r'[^a-zA-Z]', '', w)
# Uppercase — likely German, keep rest
if clean and clean[0].isupper():
kept.extend(after_words[idx:])
break
# Known English word — keep rest
if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation):
kept.extend(after_words[idx:])
break
# Unknown short word — likely garbled, skip
if kept:
return before + ' ' + ' '.join(kept)
return before
def fix_ipa_continuation_cell(
garbled_text: str,
headword_text: str,
@@ -1242,11 +1331,15 @@ def fix_cell_phonetics(
if col_type == 'column_en':
# Full processing: replace garbled IPA, strip orphan brackets.
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
if new_text == text and _text_has_garbled_ipa(text):
# Only insert IPA when there IS garbled phonetics in the
# text — never add IPA to clean text that had none on the
# original page.
new_text = _insert_missing_ipa(text, pronunciation)
if new_text == text:
# Insert IPA when garbled phonetics exist OR when trailing
# non-dictionary words suggest garbled IPA in plain ASCII.
if _text_has_garbled_ipa(text) or _has_non_dict_trailing(text, pronunciation):
new_text = _insert_missing_ipa(text, pronunciation)
# Strip trailing garbled fragments after proper [IPA] brackets
# (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
if ']' in new_text:
new_text = _strip_post_bracket_garbled(new_text, pronunciation)
else:
# column_text: replace garbled IPA, no orphan stripping
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)