From 58c9565ba5d0e811df967a5e8f2388416cd73b39 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Fri, 20 Mar 2026 06:50:47 +0100
Subject: [PATCH] Fix en_col_type detection: use bracket IPA count instead of
 longest avg text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous heuristic picked the column with the longest average text as
the English headword column. In layouts with long example sentences, this
picked the wrong column (examples instead of headwords). The detection now
counts, per column, the cells whose text contains an opening bracket "[";
the column with the most such cells is taken as the headword column where
IPA needs fixing. If no column contains a bracket, the previous
longest-average heuristic is used as a fallback.

Fixes garbled OCR-IPA like "change [tfeind3]" → "change [tʃˈeɪndʒ]".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 31 +++++++++++++++-------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 75076af..cd804be 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1588,21 +1588,32 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
     all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
     total_cols = sum(len(z.get("columns", [])) for z in zones_data)
     if total_cols >= 3:
-        # Find which col_type has the longest average text → English headwords
+        # Find the column that contains IPA brackets → English headwords.
+        # Count cells containing '[' per col_type.  The column with the
+        # most such cells is the headword column (IPA sits after or below
+        # headwords).  Falls back to longest average text if none has '['.
+        col_bracket_count: Dict[str, int] = {}
         col_avg_len: Dict[str, List[int]] = {}
         for cell in all_cells:
             ct = cell.get("col_type", "")
-            txt = cell.get("text", "")
+            txt = cell.get("text", "") or ""
             col_avg_len.setdefault(ct, []).append(len(txt))
+            if ct.startswith("column_") and '[' in txt:
+                col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
+        # Pick column with most bracket IPA patterns
         en_col_type = None
-        best_avg = 0
-        for ct, lengths in col_avg_len.items():
-            if not ct.startswith("column_"):
-                continue
-            avg = sum(lengths) / len(lengths) if lengths else 0
-            if avg > best_avg:
-                best_avg = avg
-                en_col_type = ct
+        if col_bracket_count:
+            en_col_type = max(col_bracket_count, key=col_bracket_count.get)
+        else:
+            # Fallback: longest average text
+            best_avg = 0
+            for ct, lengths in col_avg_len.items():
+                if not ct.startswith("column_"):
+                    continue
+                avg = sum(lengths) / len(lengths) if lengths else 0
+                if avg > best_avg:
+                    best_avg = avg
+                    en_col_type = ct
         if en_col_type:
             for cell in all_cells:
                 if cell.get("col_type") == en_col_type: