diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index a244a89..e0e6992 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1250,6 +1250,32 @@ def fix_ipa_continuation_cell( if not IPA_AVAILABLE or not garbled_text or not headword_text: return garbled_text + # If headword already has inline IPA like "beat [bˈiːt] , beat, beaten", + # only generate continuation IPA for words NOT already covered. + covered_words: set = set() + has_inline_ipa = bool(re.search(r'\[[^\]]*\]', headword_text)) + if has_inline_ipa: + # Words before the first bracket already have their IPA shown + first_bracket = headword_text.index('[') + pre_bracket = headword_text[:first_bracket].strip() + for w in pre_bracket.split(): + clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower() + if clean and len(clean) >= 2: + covered_words.add(clean) + + last_bracket_end = headword_text.rfind(']') + tail = headword_text[last_bracket_end + 1:].strip() + + if not tail or not re.search(r'[a-zA-Z]{2,}', tail): + # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]") + # — return the inline IPA directly (continuation duplicates it) + last_bracket_start = headword_text.rfind('[') + inline_ipa = headword_text[last_bracket_start:last_bracket_end + 1] + return inline_ipa + + # Only the tail words need continuation IPA + headword_text = tail + # Strip existing IPA brackets and parenthetical grammar annotations # like "(no pl)", "(sth)", "(sb)" from headword text clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text) @@ -1270,6 +1296,7 @@ def fix_ipa_continuation_cell( # Do NOT skip grammar words here — they are integral parts of the # headword (e.g. "close down", "the United Kingdom"). Grammar # annotations like "(sth)", "(no pl)" are already stripped above. + # Skip words that already have inline IPA in the headword row. ipa_parts: List[str] = [] for part in parts: # A part may be multi-word like "secondary school" @@ -1279,6 +1306,8 @@ def fix_ipa_continuation_cell( clean_w = re.sub(r'[^a-zA-Z\'-]', '', w) if not clean_w or len(clean_w) < 2: continue + if covered_words and clean_w.lower() in covered_words: + continue # Already has IPA inline in the headword ipa = _lookup_ipa(clean_w, pronunciation) if ipa: word_ipas.append(ipa) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index a04836c..eb3406a 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1798,7 +1798,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: continue cell_text = (cell.get("text") or "").strip() if not cell_text: - continue + # Step 5c may have emptied garbled IPA cells like + # "[n, nn]" — recover text from word_boxes. + wb_texts = [w.get("text", "") + for w in cell.get("word_boxes", [])] + cell_text = " ".join(wb_texts).strip() + if not cell_text: + continue is_bracketed = ( cell_text.startswith('[') and cell_text.endswith(']') diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py index 5e5f99b..8280a22 100644 --- a/klausur-service/backend/tests/test_grid_editor_api.py +++ b/klausur-service/backend/tests/test_grid_editor_api.py @@ -510,6 +510,23 @@ class TestGarbledIpaDetection: assert "klˈəʊs" in fixed # close IPA assert "dˈaʊn" in fixed # down IPA — must NOT be skipped + def test_continuation_skips_words_with_inline_ipa(self): + """'beat [bˈiːt] , beat, beaten' → continuation only for 'beaten'.""" + fixed = fix_ipa_continuation_cell( + "[bi:tan]", "beat [bˈiːt] , beat, beaten", pronunciation="british", + ) + # Should only have IPA for "beaten", NOT for "beat" (already inline) + assert "bˈiːtən" in fixed + assert fixed.count("bˈiːt") == 0 or fixed == "[bˈiːtən]" + + def test_continuation_bracket_at_end_returns_inline(self): + """'the Highlands [ˈhaɪləndz]' → return inline IPA, not IPA for 'the'.""" + fixed = fix_ipa_continuation_cell( + "'hailandz", "the Highlands [ˈhaɪləndz]", pronunciation="british", + ) + assert fixed == "[ˈhaɪləndz]" + assert "ðə" not in fixed # "the" must NOT get IPA + def test_headword_with_brackets_not_continuation(self): """'employee [im'ploi:]' has a headword outside brackets → not garbled.