Improve garbled IPA cleanup: trailing strip, prefix match, broader guard

1. Strip trailing garbled IPA after proper [IPA] brackets
   (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
2. Add prefix matching for merged tokens where OCR joined headword
   with garbled IPA (e.g. "schoolbagsku:lbæg" → "schoolbag [skˈuːlbæɡ]")
3. Broaden guard to also trigger on trailing non-dictionary words
   (e.g. "scare skea" → "scare [skˈɛə]")

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 10:36:25 +01:00
parent fc0ab84e40
commit 19cbbf310a
1 changed files with 103 additions and 10 deletions
@@ -1022,11 +1022,6 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    (e.g. "scare skea" where "skea" is garbled /skɛə/).  This scans the text
    for the headword, inserts correct [IPA], and strips the garbled fragments.

-    IMPORTANT: This function must only be called when ``_text_has_garbled_ipa``
-    confirms that the text actually contains garbled phonetics.  If the text
-    is clean (e.g. just "scissors"), IPA must NOT be inserted — the original
-    page had no phonetics on that line.
-
    Only inserts for words that:
    - are standalone (not already followed by a bracket)
    - have an IPA entry in the dictionary
@@ -1065,6 +1060,19 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
+        # Fallback: prefix matching for merged tokens where OCR joined
+        # headword with garbled IPA (e.g. "schoolbagsku:lbæg",
+        # "Scotland'skotland").  Find longest dictionary prefix.
+        if not ipa and len(clean) > 4:
+            for end in range(len(clean) - 1, 2, -1):
+                prefix = clean[:end]
+                test_ipa = _lookup_ipa(prefix, pronunciation)
+                if test_ipa:
+                    ipa = test_ipa
+                    # Replace token with just the headword prefix
+                    w = prefix
+                    words[i] = prefix
+                    break
        if ipa:
            words[i] = f"{w} [{ipa}]"
            # Strip garbled OCR phonetics after the IPA bracket.
@@ -1096,6 +1104,87 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    return ' '.join(words)


+def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
+    """Report whether a headword is trailed only by non-dictionary tokens.
+
+    Second trigger for ``_insert_missing_ipa`` when ``_text_has_garbled_ipa``
+    returns False because the garbled IPA is plain ASCII (e.g. "skea" for
+    /skɛə/).  True only for 2-6 token texts where no token after the first
+    dictionary word is a delimiter, capitalised, or a dictionary word.
+    """
+    if not IPA_AVAILABLE:
+        return False
+    tokens = text.strip().split()
+    if not 2 <= len(tokens) <= 6:
+        return False
+    # Locate the headword: first token (grammar words excluded) with IPA.
+    headword_pos = None
+    for pos, token in enumerate(tokens):
+        letters = re.sub(r'[^a-zA-Z\'-]', '', token)
+        if len(letters) < 2 or letters.lower() in _GRAMMAR_BRACKET_WORDS:
+            continue
+        if _lookup_ipa(letters, pronunciation):
+            headword_pos = pos
+            break
+    if headword_pos is None or headword_pos >= len(tokens) - 1:
+        return False
+    # Any delimiter, capitalised (likely German) or dictionary token after
+    # the headword means the tail is real content, not garbled IPA.
+    for trailing in tokens[headword_pos + 1:]:
+        if trailing in ('–', '—', '-', '/', '|', ',', ';'):
+            return False
+        letters = re.sub(r'[^a-zA-Z]', '', trailing)
+        if not letters:
+            continue
+        if letters[0].isupper():
+            return False
+        if len(letters) >= 2 and _lookup_ipa(letters, pronunciation):
+            return False
+    return True
+
+
+def _strip_post_bracket_garbled(
+    text: str, pronunciation: str = 'british',
+) -> str:
+    """Strip garbled IPA fragments that trail after proper [IPA] brackets.
+
+    E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
+         ``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
+
+    Tokens after the last ``]`` are dropped until a delimiter, a
+    capitalised word or a dictionary word is reached; that token and
+    everything after it is kept.  If no token is dropped, *text* is
+    returned unchanged so its spacing (e.g. ``],``) is not rewritten.
+    """
+    last_bracket = text.rfind(']')
+    if last_bracket < 0:
+        return text
+    before = text[:last_bracket + 1].rstrip()
+    after_words = text[last_bracket + 1:].split()
+    keep_from = len(after_words)
+    for idx, w in enumerate(after_words):
+        # IPA markers (length mark, IPA chars) — garbled, drop.
+        if ':' in w or any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋˈˌ'):
+            continue
+        clean = re.sub(r'[^a-zA-Z]', '', w)
+        # Delimiter, capitalised (likely German) or known English word:
+        # real content starts here, keep the rest.
+        if (
+            w in ('–', '—', '-', '/', '|', ',', ';')
+            or (clean and clean[0].isupper())
+            or (len(clean) >= 2 and _lookup_ipa(clean, pronunciation))
+        ):
+            keep_from = idx
+            break
+        # Anything else is treated as a garbled fragment and dropped.
+    if keep_from == 0:
+        return text  # nothing dropped (or empty tail): leave text as-is
+    kept = after_words[keep_from:]
+    if kept:
+        return before + ' ' + ' '.join(kept)
+    return before
+
+
 def fix_ipa_continuation_cell(
    garbled_text: str,
    headword_text: str,
@@ -1242,11 +1331,15 @@ def fix_cell_phonetics(
        if col_type == 'column_en':
            # Full processing: replace garbled IPA, strip orphan brackets.
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
-            if new_text == text and _text_has_garbled_ipa(text):
-                # Only insert IPA when there IS garbled phonetics in the
-                # text — never add IPA to clean text that had none on the
-                # original page.
-                new_text = _insert_missing_ipa(text, pronunciation)
+            if new_text == text:
+                # Insert IPA when garbled phonetics exist OR when trailing
+                # non-dictionary words suggest garbled IPA in plain ASCII.
+                if _text_has_garbled_ipa(text) or _has_non_dict_trailing(text, pronunciation):
+                    new_text = _insert_missing_ipa(text, pronunciation)
+            # Strip trailing garbled fragments after proper [IPA] brackets
+            # (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
+            if ']' in new_text:
+                new_text = _strip_post_bracket_garbled(new_text, pronunciation)
        else:
            # column_text: replace garbled IPA, no orphan stripping
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)