From 58c9565ba5d0e811df967a5e8f2388416cd73b39 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 20 Mar 2026 06:50:47 +0100 Subject: [PATCH] Fix en_col_type detection: use bracket IPA count instead of longest avg text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous heuristic picked the column with the longest average text as the English headword column. In layouts with long example sentences, this picked the wrong column (examples instead of headwords). Now counts cells with bracket patterns per column — the column with the most brackets is the headword column where IPA needs fixing. Fixes garbled OCR-IPA like "change [tfeind3]" → "change [tʃˈeɪndʒ]". Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 31 +++++++++++++++------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 75076af..cd804be 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1588,21 +1588,32 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: all_cells = [cell for z in zones_data for cell in z.get("cells", [])] total_cols = sum(len(z.get("columns", [])) for z in zones_data) if total_cols >= 3: - # Find which col_type has the longest average text → English headwords + # Find the column that contains IPA brackets → English headwords. + # Count cells with bracket patterns per col_type. The column with + # the most brackets is the headword column (IPA sits after or below + # headwords). Falls back to longest-average if no brackets found. + col_bracket_count: Dict[str, int] = {} col_avg_len: Dict[str, List[int]] = {} for cell in all_cells: ct = cell.get("col_type", "") - txt = cell.get("text", "") + txt = cell.get("text", "") or "" col_avg_len.setdefault(ct, []).append(len(txt)) + if ct.startswith("column_") and '[' in txt: + col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1 + # Pick column with most bracket IPA patterns en_col_type = None - best_avg = 0 - for ct, lengths in col_avg_len.items(): - if not ct.startswith("column_"): - continue - avg = sum(lengths) / len(lengths) if lengths else 0 - if avg > best_avg: - best_avg = avg - en_col_type = ct + if col_bracket_count: + en_col_type = max(col_bracket_count, key=col_bracket_count.get) + else: + # Fallback: longest average text + best_avg = 0 + for ct, lengths in col_avg_len.items(): + if not ct.startswith("column_"): + continue + avg = sum(lengths) / len(lengths) if lengths else 0 + if avg > best_avg: + best_avg = avg + en_col_type = ct if en_col_type: for cell in all_cells: if cell.get("col_type") == en_col_type: