From 4290f70885e6dac91c84c0ab00dee0919c301e2a Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 20 Mar 2026 08:30:44 +0100 Subject: [PATCH] Fix unbracketed IPA continuations: detect garbled IPA in single-cell rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 5d now also processes IPA continuations without brackets (e.g. "ska:f – ska:vz", "'sekandarr sku:l") when the row has only 1 content cell and the text is pure-ASCII garbled IPA (no real IPA Unicode symbols). Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 45 +++++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 0c20a5a..b05c344 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1782,6 +1782,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # headword in the previous row's same column. # Note: We check ALL columns, not just en_col_type, because # the EN headword column may not be the longest-average column. + _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") ipa_cont_fixed = 0 for z in zones_data: rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"]) @@ -1796,17 +1797,41 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if not ct.startswith("column_"): continue cell_text = (cell.get("text") or "").strip() - # Only treat as continuation when text is entirely - # inside brackets — e.g. "[n, nn]", "[klaoz 'daun]". - # Text like "employee [im'ploi:]" has a headword - # OUTSIDE brackets and must NOT be overwritten. - if not (cell_text.startswith('[') and cell_text.endswith(']')): - continue - if not _text_has_garbled_ipa(cell_text): - continue - # Already has proper IPA brackets → already fixed - if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text): + if not cell_text: continue + + is_bracketed = ( + cell_text.startswith('[') and cell_text.endswith(']') + ) + + if is_bracketed: + # Bracketed continuation: "[n, nn]", "[klaoz 'daun]" + # Text like "employee [im'ploi:]" is NOT fully + # bracketed and won't match here. + if not _text_has_garbled_ipa(cell_text): + continue + # Already has proper IPA brackets → skip + if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text): + continue + else: + # Unbracketed continuation: "ska:f – ska:vz", + # "'sekandarr sku:l". Only treat as IPA + # continuation if this is the ONLY content cell + # in the row (single-cell row) and the text is + # garbled IPA without real IPA Unicode symbols. + content_cells_in_row = [ + c for c in row_cells + if c.get("col_type", "").startswith("column_") + and c.get("col_type") != "column_1" + ] + if len(content_cells_in_row) != 1: + continue + if not _text_has_garbled_ipa(cell_text): + continue + # Has real IPA symbols → already fixed or valid + if any(c in _REAL_IPA_CHARS for c in cell_text): + continue + # Find headword in previous row, same column prev_ri = rows_sorted[idx - 1]["index"] prev_same_col = [