Preserve grammar annotations (pl), (no pl) and skip articles in IPA

Two fixes:

1. Add pl, sg, no, also, ae, be, etc. to _GRAMMAR_BRACKET_WORDS so that annotations like (pl) and (no pl) are not replaced with IPA.
2. Skip articles (the, a, an) in fix_ipa_continuation_cell — they never get IPA in vocabulary books.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 11:42:44 +01:00
parent 7dc00e737a
commit ef5aed6a98
2 changed files with 12 additions and 4 deletions
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -888,6 +888,10 @@ _GRAMMAR_BRACKET_WORDS = frozenset({
    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
    # English grammar abbreviations used in vocab tables
    'sth', 'sb', 'adj', 'adv',
+    # Number/plural/grammar annotations
+    'pl', 'sg', 'sing', 'no', 'also', 'auch',
+    # Regional English markers
+    'ae', 'be', 'ame', 'bre',
 })


@@ -1293,10 +1297,11 @@ def fix_ipa_continuation_cell(
        return garbled_text

    # Look up IPA for each headword part.
-    # Do NOT skip grammar words here — they are integral parts of the
-    # headword (e.g. "close down", "the United Kingdom").  Grammar
-    # annotations like "(sth)", "(no pl)" are already stripped above.
+    # Skip articles (the, a, an) — they never get IPA in vocab books.
+    # Other function words like "down", "up" are kept because they are
+    # integral parts of phrasal verbs (e.g. "close down").
    # Skip words that already have inline IPA in the headword row.
+    _ARTICLES = {'the', 'a', 'an'}
    ipa_parts: List[str] = []
    for part in parts:
        # A part may be multi-word like "secondary school"
@@ -1308,6 +1313,8 @@ def fix_ipa_continuation_cell(
                continue
            if covered_words and clean_w.lower() in covered_words:
                continue  # Already has IPA inline in the headword
+            if clean_w.lower() in _ARTICLES:
+                continue  # Articles never get IPA in vocab books
            ipa = _lookup_ipa(clean_w, pronunciation)
            if ipa:
                word_ipas.append(ipa)