Fix prefix matching: use alpha-only chars, min 4-char prefix

Prevents false positives where punctuation (such as apostrophes) in merged
tokens caused wrong dictionary matches. For example, "'se" taken from
"'sekandarr" was matched as a word, which broke the IPA continuation-row fix.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-19 10:40:37 +01:00
parent 19cbbf310a
commit 21d37b5da1

View File

@@ -1063,16 +1063,18 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
# Fallback: prefix matching for merged tokens where OCR joined # Fallback: prefix matching for merged tokens where OCR joined
# headword with garbled IPA (e.g. "schoolbagsku:lbæg", # headword with garbled IPA (e.g. "schoolbagsku:lbæg",
# "Scotland'skotland"). Find longest dictionary prefix. # "Scotland'skotland"). Find longest dictionary prefix.
if not ipa and len(clean) > 4: # Use only alpha chars to avoid false matches on punctuation.
for end in range(len(clean) - 1, 2, -1): if not ipa:
prefix = clean[:end] alpha = re.sub(r'[^a-zA-Z]', '', clean)
test_ipa = _lookup_ipa(prefix, pronunciation) if len(alpha) > 5: # need at least 6 chars for meaningful split
if test_ipa: for end in range(len(alpha), 3, -1): # min prefix 4 chars
ipa = test_ipa prefix = alpha[:end]
# Replace token with just the headword prefix test_ipa = _lookup_ipa(prefix, pronunciation)
w = prefix if test_ipa:
words[i] = prefix ipa = test_ipa
break w = prefix
words[i] = prefix
break
if ipa: if ipa:
words[i] = f"{w} [{ipa}]" words[i] = f"{w} [{ipa}]"
# Strip garbled OCR phonetics after the IPA bracket. # Strip garbled OCR phonetics after the IPA bracket.