From 92a7b85c2d179daf82e8bdfb7cfa6d67c0336640 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 20 Mar 2026 00:43:51 +0100 Subject: [PATCH] Fix IPA continuation: only process fully-bracketed cells, keep phrasal verb particles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes: 1. Step 5d now only treats cells as continuation when text is entirely inside brackets (e.g. "[n, nn]"). Cells with headwords outside brackets (e.g. "employee [im'ploi:]") are no longer overwritten. 2. fix_ipa_continuation_cell no longer skips grammar words like "down" — they are part of the headword in phrasal verbs like "close sth. down". Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 8 +++---- klausur-service/backend/grid_editor_api.py | 8 ++++++- .../backend/tests/test_grid_editor_api.py | 21 +++++++++++++++++++ 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index c662be1..a244a89 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1266,7 +1266,10 @@ def fix_ipa_continuation_cell( if not parts: return garbled_text - # Look up IPA for each headword part + # Look up IPA for each headword part. + # Do NOT skip grammar words here — they are integral parts of the + # headword (e.g. "close down", "the United Kingdom"). Grammar + # annotations like "(sth)", "(no pl)" are already stripped above. 
ipa_parts: List[str] = [] for part in parts: # A part may be multi-word like "secondary school" @@ -1276,9 +1279,6 @@ def fix_ipa_continuation_cell( clean_w = re.sub(r'[^a-zA-Z\'-]', '', w) if not clean_w or len(clean_w) < 2: continue - # Skip grammar words like "to" at the start - if clean_w.lower() in _GRAMMAR_BRACKET_WORDS: - continue ipa = _lookup_ipa(clean_w, pronunciation) if ipa: word_ipas.append(ipa) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 67fc02a..75076af 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1634,7 +1634,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: ct = cell.get("col_type", "") if not ct.startswith("column_"): continue - cell_text = cell.get("text", "") + cell_text = (cell.get("text") or "").strip() + # Only treat as continuation when text is entirely + # inside brackets — e.g. "[n, nn]", "[klaoz 'daun]". + # Text like "employee [im'ploi:]" has a headword + # OUTSIDE brackets and must NOT be overwritten. + if not (cell_text.startswith('[') and cell_text.endswith(']')): + continue if not _text_has_garbled_ipa(cell_text): continue # Already has proper IPA brackets → already fixed diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py index 814f51f..74f0822 100644 --- a/klausur-service/backend/tests/test_grid_editor_api.py +++ b/klausur-service/backend/tests/test_grid_editor_api.py @@ -499,3 +499,24 @@ class TestGarbledIpaDetection: ) assert fixed != "[1uedtX,1]" assert "ɪkwˈɪpmənt" in fixed # equipment IPA + + def test_fix_continuation_close_down(self): + """IPA continuation for 'close sth. down' → IPA for both words.""" + fixed = fix_ipa_continuation_cell( + "[klaoz 'daun]", "close sth. 
down", pronunciation="british", + ) + assert fixed != "[klaoz 'daun]" + assert "klˈəʊs" in fixed # close IPA + assert "dˈaʊn" in fixed # down IPA — must NOT be skipped + + def test_headword_with_brackets_not_continuation(self): + """'employee [im'ploi:]' has a headword outside brackets → not a continuation cell. + + _text_has_garbled_ipa returns True (has ':'), but Step 5d should + skip this cell because text doesn't start with '['. + """ + # The garbled check still triggers (has IPA-like ':') + assert _text_has_garbled_ipa("employee [im'ploi:]") is True + # But text does NOT start with '[' — Step 5d bracket guard blocks it + text = "employee [im'ploi:]" + assert not (text.strip().startswith('[') and text.strip().endswith(']'))