fix: Step 4f sidebar detection uses avg text length instead of fill ratio

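Column_1 data showed avg_len=1.0 with 13 single-character cells (alphabet letters from the sidebar). The old fill_ratio check missed it: the column's fill ratio of 76% exceeded the 35% cutoff, so it was treated as a well-filled column and kept. New criteria: an edge column is removed when avg_len ≤ 1.5 AND ≥ 70% of its cells are single characters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>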
2026-03-24 14:10:43 +01:00
parent be86a7d14d
commit fe754398c0
1 changed file with 23 additions and 27 deletions
@@ -2130,53 +2130,49 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                strip_gap, strip_count, total,
            )

-    # 4f. Remove thin decorative edge columns (alphabet sidebar safety net).
-    # If the leftmost or rightmost column has very few filled cells AND
-    # most of its text is short (≤2 chars), it's likely an alphabet sidebar
-    # that slipped through word-level pre-filters.
+    # 4f. Remove decorative edge columns (alphabet sidebar safety net).
+    # Dictionary pages have A-Z letter sidebars that OCR reads as single-
+    # character word_boxes.  These form narrow columns with very short text.
+    # Detection: edge column where almost ALL cells are single characters.
    for z in zones_data:
        columns = z.get("columns", [])
        cells = z.get("cells", [])
        if len(columns) < 3 or not cells:
            continue
-        # Group cells by col_type
+        # Group cells by col_type (skip spanning_header)
        col_cells: Dict[str, List[Dict]] = {}
        for cell in cells:
            ct = cell.get("col_type", "")
+            if ct.startswith("column_"):
                col_cells.setdefault(ct, []).append(cell)
-        # Find edge column types (first and last)
        col_types_ordered = sorted(col_cells.keys())
-        if not col_types_ordered:
-            continue
-        # Median cell count across columns (excluding heading rows)
-        col_counts = [len(v) for v in col_cells.values()]
-        median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0
-        if median_count < 3:
+        if len(col_types_ordered) < 3:
            continue
        for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
            edge_cells_list = col_cells.get(edge_ct, [])
-            if not edge_cells_list:
+            if len(edge_cells_list) < 3:
                continue
-            fill_ratio = len(edge_cells_list) / median_count
-            if fill_ratio > 0.35:
-                continue  # well-filled column → not decorative
-            short_count = sum(
-                1 for c in edge_cells_list
-                if len((c.get("text") or "").strip()) <= 2
-            )
-            short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0
-            if short_ratio < 0.6:
-                continue  # too much real content → not decorative
+            # Key criterion: average text length and single-char ratio.
+            # Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells
+            # are single characters.
+            texts = [(c.get("text") or "").strip() for c in edge_cells_list]
+            avg_len = sum(len(t) for t in texts) / len(texts)
+            single_char = sum(1 for t in texts if len(t) <= 1)
+            single_ratio = single_char / len(texts)
+            if avg_len > 1.5:
+                continue  # real content has longer text
+            if single_ratio < 0.7:
+                continue  # not dominated by single chars
            # Remove this edge column
            removed_count = len(edge_cells_list)
            edge_ids = {id(c) for c in edge_cells_list}
            z["cells"] = [c for c in cells if id(c) not in edge_ids]
            z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
            logger.info(
-                "Step 4f: removed thin decorative edge column '%s' from zone %d "
-                "(%d cells, fill=%.0f%%, short=%.0f%%)",
+                "Step 4f: removed decorative edge column '%s' from zone %d "
+                "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
                edge_ct, z.get("zone_index", 0), removed_count,
-                fill_ratio * 100, short_ratio * 100,
+                avg_len, single_ratio * 100,
            )
            break  # only remove one edge per zone