Fix IPA continuation: skip words with inline IPA, recover emptied cells

Three changes: 1. fix_ipa_continuation_cell: when the headword has inline IPA like "beat [bˈiːt] , beat, beaten", only generate IPA for words not yet covered (beaten), not for words whose IPA is already shown (beat). When the bracket is at the end, as in "the Highlands [ˈhaɪləndz]", return the inline IPA directly. 2. Step 5d: recover garbled IPA from word_boxes when Step 5c emptied the cell text (e.g. "[n, nn]" → ""). 3. Added 2 tests for inline IPA behavior (35 total). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 09:31:54 +01:00
parent 0f9c0d2ad0
commit a579c31ddb
3 changed files with 53 additions and 1 deletion
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1250,6 +1250,32 @@ def fix_ipa_continuation_cell(
    if not IPA_AVAILABLE or not garbled_text or not headword_text:
        return garbled_text

+    # If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
+    # only generate continuation IPA for words NOT already covered.
+    covered_words: set = set()
+    has_inline_ipa = bool(re.search(r'\[[^\]]*\]', headword_text))
+    if has_inline_ipa:
+        # Words before the first bracket already have their IPA shown
+        first_bracket = headword_text.index('[')
+        pre_bracket = headword_text[:first_bracket].strip()
+        for w in pre_bracket.split():
+            clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
+            if clean and len(clean) >= 2:
+                covered_words.add(clean)
+
+        last_bracket_end = headword_text.rfind(']')
+        tail = headword_text[last_bracket_end + 1:].strip()
+
+        if not tail or not re.search(r'[a-zA-Z]{2,}', tail):
+            # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
+            # — return the inline IPA directly (continuation duplicates it)
+            last_bracket_start = headword_text.rfind('[')
+            inline_ipa = headword_text[last_bracket_start:last_bracket_end + 1]
+            return inline_ipa
+
+        # Only the tail words need continuation IPA
+        headword_text = tail
+
    # Strip existing IPA brackets and parenthetical grammar annotations
    # like "(no pl)", "(sth)", "(sb)" from headword text
    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
@@ -1270,6 +1296,7 @@ def fix_ipa_continuation_cell(
    # Do NOT skip grammar words here — they are integral parts of the
    # headword (e.g. "close down", "the United Kingdom").  Grammar
    # annotations like "(sth)", "(no pl)" are already stripped above.
+    # Skip words that already have inline IPA in the headword row.
    ipa_parts: List[str] = []
    for part in parts:
        # A part may be multi-word like "secondary school"
@@ -1279,6 +1306,8 @@ def fix_ipa_continuation_cell(
            clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
            if not clean_w or len(clean_w) < 2:
                continue
+            if covered_words and clean_w.lower() in covered_words:
+                continue  # Already has IPA inline in the headword
            ipa = _lookup_ipa(clean_w, pronunciation)
            if ipa:
                word_ipas.append(ipa)