From 21d37b5da1e987f439dcfae7ef126a6da8baa885 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 19 Mar 2026 10:40:37 +0100 Subject: [PATCH] Fix prefix matching: use alpha-only chars, min 4-char prefix Prevents false positives where punctuation (apostrophes) in merged tokens caused wrong dictionary matches (e.g. "'se" from "'sekandarr" matching as a word, breaking IPA continuation row fix). Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 2647922..894b0fc 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1063,16 +1063,18 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: # Fallback: prefix matching for merged tokens where OCR joined # headword with garbled IPA (e.g. "schoolbagsku:lbæg", # "Scotland'skotland"). Find longest dictionary prefix. - if not ipa and len(clean) > 4: - for end in range(len(clean) - 1, 2, -1): - prefix = clean[:end] - test_ipa = _lookup_ipa(prefix, pronunciation) - if test_ipa: - ipa = test_ipa - # Replace token with just the headword prefix - w = prefix - words[i] = prefix - break + # Use only alpha chars to avoid false matches on punctuation. + if not ipa: + alpha = re.sub(r'[^a-zA-Z]', '', clean) + if len(alpha) > 5: # need at least 6 chars for meaningful split + for end in range(len(alpha), 3, -1): # min prefix 4 chars + prefix = alpha[:end] + test_ipa = _lookup_ipa(prefix, pronunciation) + if test_ipa: + ipa = test_ipa + w = prefix + words[i] = prefix + break if ipa: words[i] = f"{w} [{ipa}]" # Strip garbled OCR phonetics after the IPA bracket.