Fix prefix matching: use alpha-only chars, min 4-char prefix
Prevents false positives where punctuation (such as apostrophes) in merged tokens caused wrong dictionary matches. For example, "'se" from "'sekandarr" was matched as a word, which broke the IPA continuation row fix.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1063,16 +1063,18 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
|||||||
# Fallback: prefix matching for merged tokens where OCR joined
|
# Fallback: prefix matching for merged tokens where OCR joined
|
||||||
# headword with garbled IPA (e.g. "schoolbagsku:lbæg",
|
# headword with garbled IPA (e.g. "schoolbagsku:lbæg",
|
||||||
# "Scotland'skotland"). Find longest dictionary prefix.
|
# "Scotland'skotland"). Find longest dictionary prefix.
|
||||||
if not ipa and len(clean) > 4:
|
# Use only alpha chars to avoid false matches on punctuation.
|
||||||
for end in range(len(clean) - 1, 2, -1):
|
if not ipa:
|
||||||
prefix = clean[:end]
|
alpha = re.sub(r'[^a-zA-Z]', '', clean)
|
||||||
test_ipa = _lookup_ipa(prefix, pronunciation)
|
if len(alpha) > 5: # need at least 6 chars for meaningful split
|
||||||
if test_ipa:
|
for end in range(len(alpha), 3, -1): # min prefix 4 chars
|
||||||
ipa = test_ipa
|
prefix = alpha[:end]
|
||||||
# Replace token with just the headword prefix
|
test_ipa = _lookup_ipa(prefix, pronunciation)
|
||||||
w = prefix
|
if test_ipa:
|
||||||
words[i] = prefix
|
ipa = test_ipa
|
||||||
break
|
w = prefix
|
||||||
|
words[i] = prefix
|
||||||
|
break
|
||||||
if ipa:
|
if ipa:
|
||||||
words[i] = f"{w} [{ipa}]"
|
words[i] = f"{w} [{ipa}]"
|
||||||
# Strip garbled OCR phonetics after the IPA bracket.
|
# Strip garbled OCR phonetics after the IPA bracket.
|
||||||
|
|||||||
Reference in New Issue
Block a user