From 821e5481c2208a5d0655da769c6d672edc23753b Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 18 Mar 2026 11:50:03 +0100
Subject: [PATCH] =?UTF-8?q?Only=20apply=20IPA=20correction=20on=20vocabula?=
 =?UTF-8?q?ry=20tables=20(=E2=89=A53=20columns)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Single-column German text pages were getting IPA inserted for words
that happen to exist in the English dictionary ("die" → [dˈaɪ],
"Das" → [dɑs]). Now IPA correction only runs when the grid's zones
have ≥3 columns in total, which is the minimum for a vocabulary table
layout (English | article | German).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 55 +++++++++++-----------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 792ec4d..8d8a937 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1165,35 +1165,36 @@ async def build_grid(session_id: str):
 
     # 5c. IPA phonetic correction — replace garbled OCR phonetics with
     # correct IPA from the dictionary (same as in the OCR pipeline).
-    # The grid uses generic col_types (column_1, column_2, ...) but
-    # fix_cell_phonetics expects column_en / column_text.  Identify
-    # the English headword column (longest average text) and mark it.
+    # Only runs when the zones have ≥3 columns in total (the minimum for
+    # a vocab table: EN | article | DE); fewer columns means running text.
     all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
-    # Find which col_type has the longest average text → English headwords
-    col_avg_len: Dict[str, List[int]] = {}
-    for cell in all_cells:
-        ct = cell.get("col_type", "")
-        txt = cell.get("text", "")
-        col_avg_len.setdefault(ct, []).append(len(txt))
-    en_col_type = None
-    best_avg = 0
-    for ct, lengths in col_avg_len.items():
-        if not ct.startswith("column_"):
-            continue
-        avg = sum(lengths) / len(lengths) if lengths else 0
-        if avg > best_avg:
-            best_avg = avg
-            en_col_type = ct
-    if en_col_type:
+    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
+    if total_cols >= 3:
+        # Find which col_type has the longest average text → English headwords
+        col_avg_len: Dict[str, List[int]] = {}
         for cell in all_cells:
-            if cell.get("col_type") == en_col_type:
-                cell["_orig_col_type"] = en_col_type
-                cell["col_type"] = "column_en"
-    fix_cell_phonetics(all_cells, pronunciation="british")
-    for cell in all_cells:
-        orig = cell.pop("_orig_col_type", None)
-        if orig:
-            cell["col_type"] = orig
+            ct = cell.get("col_type", "")
+            txt = cell.get("text", "")
+            col_avg_len.setdefault(ct, []).append(len(txt))
+        en_col_type = None
+        best_avg = 0
+        for ct, lengths in col_avg_len.items():
+            if not ct.startswith("column_"):
+                continue
+            avg = sum(lengths) / len(lengths) if lengths else 0
+            if avg > best_avg:
+                best_avg = avg
+                en_col_type = ct
+        if en_col_type:
+            for cell in all_cells:
+                if cell.get("col_type") == en_col_type:
+                    cell["_orig_col_type"] = en_col_type
+                    cell["col_type"] = "column_en"
+        fix_cell_phonetics(all_cells, pronunciation="british")
+        for cell in all_cells:
+            orig = cell.pop("_orig_col_type", None)
+            if orig:
+                cell["col_type"] = orig
 
     duration = time.time() - t0