From fe754398c031d30e2111216df6699814f9b48a8c Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Tue, 24 Mar 2026 14:10:43 +0100
Subject: [PATCH] fix: Step 4f sidebar detection uses avg text length instead
 of fill ratio
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

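Column_1 data showed avg_len=1.0 with 13 single-char cells (alphabet
letters from the sidebar). The old fill_ratio check missed it: the
column's fill ratio was 76%, above the 35% cutoff, so it was treated as
a well-filled column and kept.
New criteria: an edge column is removed when avg_len ≤ 1.5 AND ≥ 70% of
its cells are single characters.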

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 50 ++++++++++------------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index fb92dd8..895c172 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -2130,53 +2130,49 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                 strip_gap, strip_count, total,
             )
 
-    # 4f. Remove thin decorative edge columns (alphabet sidebar safety net).
-    # If the leftmost or rightmost column has very few filled cells AND
-    # most of its text is short (≤2 chars), it's likely an alphabet sidebar
-    # that slipped through word-level pre-filters.
+    # 4f. Remove decorative edge columns (alphabet sidebar safety net).
+    # Dictionary pages have A-Z letter sidebars that OCR reads as single-
+    # character word_boxes.  These form narrow columns with very short text.
+    # Detection: edge column where almost ALL cells are single characters.
     for z in zones_data:
         columns = z.get("columns", [])
         cells = z.get("cells", [])
         if len(columns) < 3 or not cells:
             continue
-        # Group cells by col_type
+        # Group column_* cells by col_type (skips spanning_header etc.)
         col_cells: Dict[str, List[Dict]] = {}
         for cell in cells:
             ct = cell.get("col_type", "")
-            col_cells.setdefault(ct, []).append(cell)
-        # Find edge column types (first and last)
+            if ct.startswith("column_"):
+                col_cells.setdefault(ct, []).append(cell)
         col_types_ordered = sorted(col_cells.keys())
-        if not col_types_ordered:
-            continue
-        # Median cell count across columns (excluding heading rows)
-        col_counts = [len(v) for v in col_cells.values()]
-        median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0
-        if median_count < 3:
+        if len(col_types_ordered) < 3:
             continue
         for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
             edge_cells_list = col_cells.get(edge_ct, [])
-            if not edge_cells_list:
+            if len(edge_cells_list) < 3:
                 continue
-            fill_ratio = len(edge_cells_list) / median_count
-            if fill_ratio > 0.35:
-                continue  # well-filled column → not decorative
-            short_count = sum(
-                1 for c in edge_cells_list
-                if len((c.get("text") or "").strip()) <= 2
-            )
-            short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0
-            if short_ratio < 0.6:
-                continue  # too much real content → not decorative
+            # Key criterion: average text length and single-char ratio.
+            # Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells
+            # are single characters.
+            texts = [(c.get("text") or "").strip() for c in edge_cells_list]
+            avg_len = sum(len(t) for t in texts) / len(texts)
+            single_char = sum(1 for t in texts if len(t) <= 1)
+            single_ratio = single_char / len(texts)
+            if avg_len > 1.5:
+                continue  # real content has longer text
+            if single_ratio < 0.7:
+                continue  # not dominated by single chars
             # Remove this edge column
             removed_count = len(edge_cells_list)
             edge_ids = {id(c) for c in edge_cells_list}
             z["cells"] = [c for c in cells if id(c) not in edge_ids]
             z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
             logger.info(
-                "Step 4f: removed thin decorative edge column '%s' from zone %d "
-                "(%d cells, fill=%.0f%%, short=%.0f%%)",
+                "Step 4f: removed decorative edge column '%s' from zone %d "
+                "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
                 edge_ct, z.get("zone_index", 0), removed_count,
-                fill_ratio * 100, short_ratio * 100,
+                avg_len, single_ratio * 100,
             )
             break  # only remove one edge per zone