From 5f89913a9a45127e9dd91c168f6641b00de0252e Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 19 Mar 2026 23:34:41 +0100 Subject: [PATCH] Fix IPA continuation to check all columns, not just en_col_type The en_col_type heuristic (longest avg text) picks the example column, missing IPA continuation cells in the actual headword column. Now Step 5d checks all column_* cells for garbled IPA patterns independently. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 91 +++++++++------------- 1 file changed, 38 insertions(+), 53 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 15e43ec..67fc02a 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1614,69 +1614,54 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if orig: cell["col_type"] = orig - # 5d. Fix IPA continuation rows — rows where the printed + # 5d. Fix IPA continuation cells — cells where the printed # phonetic transcription wraps to a line below the headword. - # These contain garbled IPA in the EN column. Replace garbled - # text with proper IPA looked up from the headword in the - # previous row. + # These contain garbled IPA (e.g. "[n, nn]", "[1uedtX,1]"). + # Replace garbled text with proper IPA looked up from the + # headword in the previous row's same column. + # Note: We check ALL columns, not just en_col_type, because + # the EN headword column may not be the longest-average column. ipa_cont_fixed = 0 for z in zones_data: rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"]) z_cells = z.get("cells", []) for idx, row in enumerate(rows_sorted): - ri = row["index"] - row_cells = [c for c in z_cells if c.get("row_index") == ri] - en_cells = [ - c for c in row_cells - if c.get("col_type") == en_col_type - ] - if not en_cells: - continue - en_text = en_cells[0].get("text", "") - if not _text_has_garbled_ipa(en_text): - continue - # Already has proper IPA brackets → already fixed - if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text): - continue - # When the EN cell does NOT look obviously garbled - # (e.g. bracketed non-IPA), require that other columns - # are empty — otherwise it's a normal content row. - en_stripped = en_text.strip() - is_bracket_garbled = ( - en_stripped.startswith('[') and en_stripped.endswith(']') - ) - if not is_bracket_garbled: - other_cells = [ - c for c in row_cells - if c.get("col_type") != en_col_type - and len((c.get("text") or "").strip()) >= 3 - ] - if other_cells: - continue - # Find headword in previous row if idx == 0: continue - prev_ri = rows_sorted[idx - 1]["index"] - prev_en = [ - c for c in z_cells - if c.get("row_index") == prev_ri - and c.get("col_type") == en_col_type - ] - if not prev_en: - continue - prev_text = prev_en[0].get("text", "") - fixed = fix_ipa_continuation_cell( - en_text, prev_text, pronunciation="british", - ) - if fixed != en_text: - en_cells[0]["text"] = fixed - ipa_cont_fixed += 1 - logger.info( - "IPA continuation R%d: '%s' → '%s'", - ri, en_text, fixed, + ri = row["index"] + row_cells = [c for c in z_cells if c.get("row_index") == ri] + for cell in row_cells: + ct = cell.get("col_type", "") + if not ct.startswith("column_"): + continue + cell_text = cell.get("text", "") + if not _text_has_garbled_ipa(cell_text): + continue + # Already has proper IPA brackets → already fixed + if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text): + continue + # Find headword in previous row, same column + prev_ri = rows_sorted[idx - 1]["index"] + prev_same_col = [ + c for c in z_cells + if c.get("row_index") == prev_ri + and c.get("col_type") == ct + ] + if not prev_same_col: + continue + prev_text = prev_same_col[0].get("text", "") + fixed = fix_ipa_continuation_cell( + cell_text, prev_text, pronunciation="british", ) + if fixed != cell_text: + cell["text"] = fixed + ipa_cont_fixed += 1 + logger.info( + "IPA continuation R%d %s: '%s' → '%s'", + ri, ct, cell_text, fixed, + ) if ipa_cont_fixed: - logger.info("Fixed %d IPA continuation rows", ipa_cont_fixed) + logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed) duration = time.time() - t0