diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index e0e6992..4a4dd4f 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -888,6 +888,10 @@ _GRAMMAR_BRACKET_WORDS = frozenset({ 'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through', # English grammar abbreviations used in vocab tables 'sth', 'sb', 'adj', 'adv', + # Number/plural/grammar annotations + 'pl', 'sg', 'sing', 'no', 'also', 'auch', + # Regional English markers + 'ae', 'be', 'ame', 'bre', }) @@ -1293,10 +1297,11 @@ def fix_ipa_continuation_cell( return garbled_text # Look up IPA for each headword part. - # Do NOT skip grammar words here — they are integral parts of the - # headword (e.g. "close down", "the United Kingdom"). Grammar - # annotations like "(sth)", "(no pl)" are already stripped above. + # Skip articles (the, a, an) — they never get IPA in vocab books. + # Other function words like "down", "up" are kept because they are + # integral parts of phrasal verbs (e.g. "close down"). # Skip words that already have inline IPA in the headword row. + _ARTICLES = {'the', 'a', 'an'} ipa_parts: List[str] = [] for part in parts: # A part may be multi-word like "secondary school" @@ -1308,6 +1313,8 @@ def fix_ipa_continuation_cell( continue if covered_words and clean_w.lower() in covered_words: continue # Already has IPA inline in the headword + if clean_w.lower() in _ARTICLES: + continue # Articles never get IPA in vocab books ipa = _lookup_ipa(clean_w, pronunciation) if ipa: word_ipas.append(ipa) diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py index 8280a22..708bf57 100644 --- a/klausur-service/backend/tests/test_grid_editor_api.py +++ b/klausur-service/backend/tests/test_grid_editor_api.py @@ -485,13 +485,14 @@ class TestGarbledIpaDetection: assert _text_has_garbled_ipa("equipment") is False def test_fix_continuation_united_kingdom(self): - """IPA continuation for 'the United Kingdom' → proper IPA.""" + """IPA continuation for 'the United Kingdom' → IPA without 'the'.""" fixed = fix_ipa_continuation_cell( "[n, nn]", "the United Kingdom", pronunciation="british", ) # Should contain proper IPA, not the garbled text assert fixed != "[n, nn]" assert "kˈɪŋdəm" in fixed # Kingdom IPA + assert "ðə" not in fixed # "the" must NOT get IPA def test_fix_continuation_equipment(self): """IPA continuation for 'equipment' → proper IPA."""