From 821e5481c2208a5d0655da769c6d672edc23753b Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 11:50:03 +0100 Subject: [PATCH] =?UTF-8?q?Only=20apply=20IPA=20correction=20on=20vocabula?= =?UTF-8?q?ry=20tables=20(=E2=89=A53=20columns)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single-column German text pages were getting IPA inserted for words that happen to exist in the English dictionary ("die" → [dˈaɪ], "Das" → [dɑs]). Now IPA correction only runs when the grid has ≥3 columns, which is the minimum for a vocabulary table layout (English | article | German). Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 55 +++++++++++----------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 792ec4d..8d8a937 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1165,35 +1165,36 @@ async def build_grid(session_id: str): # 5c. IPA phonetic correction — replace garbled OCR phonetics with # correct IPA from the dictionary (same as in the OCR pipeline). - # The grid uses generic col_types (column_1, column_2, ...) but - # fix_cell_phonetics expects column_en / column_text. Identify - # the English headword column (longest average text) and mark it. + # Only applies to vocabulary tables (≥3 columns: EN | article | DE). + # Single/two-column layouts are continuous text, not vocab tables. all_cells = [cell for z in zones_data for cell in z.get("cells", [])] - # Find which col_type has the longest average text → English headwords - col_avg_len: Dict[str, List[int]] = {} - for cell in all_cells: - ct = cell.get("col_type", "") - txt = cell.get("text", "") - col_avg_len.setdefault(ct, []).append(len(txt)) - en_col_type = None - best_avg = 0 - for ct, lengths in col_avg_len.items(): - if not ct.startswith("column_"): - continue - avg = sum(lengths) / len(lengths) if lengths else 0 - if avg > best_avg: - best_avg = avg - en_col_type = ct - if en_col_type: + total_cols = sum(len(z.get("columns", [])) for z in zones_data) + if total_cols >= 3: + # Find which col_type has the longest average text → English headwords + col_avg_len: Dict[str, List[int]] = {} for cell in all_cells: - if cell.get("col_type") == en_col_type: - cell["_orig_col_type"] = en_col_type - cell["col_type"] = "column_en" - fix_cell_phonetics(all_cells, pronunciation="british") - for cell in all_cells: - orig = cell.pop("_orig_col_type", None) - if orig: - cell["col_type"] = orig + ct = cell.get("col_type", "") + txt = cell.get("text", "") + col_avg_len.setdefault(ct, []).append(len(txt)) + en_col_type = None + best_avg = 0 + for ct, lengths in col_avg_len.items(): + if not ct.startswith("column_"): + continue + avg = sum(lengths) / len(lengths) if lengths else 0 + if avg > best_avg: + best_avg = avg + en_col_type = ct + if en_col_type: + for cell in all_cells: + if cell.get("col_type") == en_col_type: + cell["_orig_col_type"] = en_col_type + cell["col_type"] = "column_en" + fix_cell_phonetics(all_cells, pronunciation="british") + for cell in all_cells: + orig = cell.pop("_orig_col_type", None) + if orig: + cell["col_type"] = orig duration = time.time() - t0