fix: Step 4f sidebar detection uses avg text length instead of fill ratio

Column_1 data showed avg_len=1.0 with 13 single-character cells (alphabet
letters from the sidebar). The old fill_ratio check missed it: the column's
fill ratio was 76%, above the 35% cutoff, so it was treated as well-filled.
New criteria: avg_len ≤ 1.5 AND ≥ 70% single-char cells → column is removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-24 14:10:43 +01:00
parent be86a7d14d
commit fe754398c0

View File

@@ -2130,53 +2130,49 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
strip_gap, strip_count, total, strip_gap, strip_count, total,
) )
# 4f. Remove thin decorative edge columns (alphabet sidebar safety net). # 4f. Remove decorative edge columns (alphabet sidebar safety net).
# If the leftmost or rightmost column has very few filled cells AND # Dictionary pages have A-Z letter sidebars that OCR reads as single-
# most of its text is short (≤2 chars), it's likely an alphabet sidebar # character word_boxes. These form narrow columns with very short text.
# that slipped through word-level pre-filters. # Detection: edge column where almost ALL cells are single characters.
for z in zones_data: for z in zones_data:
columns = z.get("columns", []) columns = z.get("columns", [])
cells = z.get("cells", []) cells = z.get("cells", [])
if len(columns) < 3 or not cells: if len(columns) < 3 or not cells:
continue continue
# Group cells by col_type # Group cells by col_type (skip spanning_header)
col_cells: Dict[str, List[Dict]] = {} col_cells: Dict[str, List[Dict]] = {}
for cell in cells: for cell in cells:
ct = cell.get("col_type", "") ct = cell.get("col_type", "")
if ct.startswith("column_"):
col_cells.setdefault(ct, []).append(cell) col_cells.setdefault(ct, []).append(cell)
# Find edge column types (first and last)
col_types_ordered = sorted(col_cells.keys()) col_types_ordered = sorted(col_cells.keys())
if not col_types_ordered: if len(col_types_ordered) < 3:
continue
# Median cell count across columns (excluding heading rows)
col_counts = [len(v) for v in col_cells.values()]
median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0
if median_count < 3:
continue continue
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]: for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
edge_cells_list = col_cells.get(edge_ct, []) edge_cells_list = col_cells.get(edge_ct, [])
if not edge_cells_list: if len(edge_cells_list) < 3:
continue continue
fill_ratio = len(edge_cells_list) / median_count # Key criterion: average text length and single-char ratio.
if fill_ratio > 0.35: # Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells
continue # well-filled column → not decorative # are single characters.
short_count = sum( texts = [(c.get("text") or "").strip() for c in edge_cells_list]
1 for c in edge_cells_list avg_len = sum(len(t) for t in texts) / len(texts)
if len((c.get("text") or "").strip()) <= 2 single_char = sum(1 for t in texts if len(t) <= 1)
) single_ratio = single_char / len(texts)
short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0 if avg_len > 1.5:
if short_ratio < 0.6: continue # real content has longer text
continue # too much real content → not decorative if single_ratio < 0.7:
continue # not dominated by single chars
# Remove this edge column # Remove this edge column
removed_count = len(edge_cells_list) removed_count = len(edge_cells_list)
edge_ids = {id(c) for c in edge_cells_list} edge_ids = {id(c) for c in edge_cells_list}
z["cells"] = [c for c in cells if id(c) not in edge_ids] z["cells"] = [c for c in cells if id(c) not in edge_ids]
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct] z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
logger.info( logger.info(
"Step 4f: removed thin decorative edge column '%s' from zone %d " "Step 4f: removed decorative edge column '%s' from zone %d "
"(%d cells, fill=%.0f%%, short=%.0f%%)", "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
edge_ct, z.get("zone_index", 0), removed_count, edge_ct, z.get("zone_index", 0), removed_count,
fill_ratio * 100, short_ratio * 100, avg_len, single_ratio * 100,
) )
break # only remove one edge per zone break # only remove one edge per zone