Fix en_col_type detection: use bracket IPA count instead of longest avg text

The previous heuristic picked the column with the longest average text as the English headword column. In layouts with long example sentences, this selected the wrong column (examples instead of headwords). The detection now counts, for each `column_*` col_type, the cells whose text contains an opening bracket `[`; the column with the most such cells is treated as the headword column, where the IPA needs fixing. If no column contains any brackets, it falls back to the previous longest-average-text heuristic. This fixes garbled OCR IPA such as "change [tfeind3]" → "change [tʃˈeɪndʒ]".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 06:50:47 +01:00
parent 92a7b85c2d
commit 58c9565ba5
1 changed files with 21 additions and 10 deletions
@@ -1588,21 +1588,32 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    if total_cols >= 3:
-        # Find which col_type has the longest average text → English headwords
+        # Find the column that contains IPA brackets → English headwords.
+        # Count cells with bracket patterns per col_type.  The column with
+        # the most brackets is the headword column (IPA sits after or below
+        # headwords).  Falls back to longest-average if no brackets found.
+        col_bracket_count: Dict[str, int] = {}
        col_avg_len: Dict[str, List[int]] = {}
        for cell in all_cells:
            ct = cell.get("col_type", "")
-            txt = cell.get("text", "")
+            txt = cell.get("text", "") or ""
            col_avg_len.setdefault(ct, []).append(len(txt))
+            if ct.startswith("column_") and '[' in txt:
+                col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
+        # Pick column with most bracket IPA patterns
        en_col_type = None
-        best_avg = 0
-        for ct, lengths in col_avg_len.items():
-            if not ct.startswith("column_"):
-                continue
-            avg = sum(lengths) / len(lengths) if lengths else 0
-            if avg > best_avg:
-                best_avg = avg
-                en_col_type = ct
+        if col_bracket_count:
+            en_col_type = max(col_bracket_count, key=col_bracket_count.get)
+        else:
+            # Fallback: longest average text
+            best_avg = 0
+            for ct, lengths in col_avg_len.items():
+                if not ct.startswith("column_"):
+                    continue
+                avg = sum(lengths) / len(lengths) if lengths else 0
+                if avg > best_avg:
+                    best_avg = avg
+                    en_col_type = ct
        if en_col_type:
            for cell in all_cells:
                if cell.get("col_type") == en_col_type: