fix: Step 4f sidebar detection uses avg text length instead of fill ratio

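Column_1 data showed avg_len=1.0 with 13 single-char cells (alphabet
letters from the sidebar). The old fill_ratio check missed it: the
column's fill ratio was 76%, above the 35% cutoff, so it was treated as
a well-filled column and kept.
New criteria: an edge column is removed when avg_len ≤ 1.5 AND ≥ 70% of
its cells are single chars.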

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-24 14:10:43 +01:00
parent be86a7d14d
commit fe754398c0

View File

@@ -2130,53 +2130,49 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
strip_gap, strip_count, total,
)
# 4f. Remove thin decorative edge columns (alphabet sidebar safety net).
# If the leftmost or rightmost column has very few filled cells AND
# most of its text is short (≤2 chars), it's likely an alphabet sidebar
# that slipped through word-level pre-filters.
# 4f. Remove decorative edge columns (alphabet sidebar safety net).
# Dictionary pages have A-Z letter sidebars that OCR reads as single-
# character word_boxes. These form narrow columns with very short text.
# Detection: edge column where almost ALL cells are single characters.
for z in zones_data:
columns = z.get("columns", [])
cells = z.get("cells", [])
if len(columns) < 3 or not cells:
continue
# Group cells by col_type
# Group cells by col_type (skip spanning_header)
col_cells: Dict[str, List[Dict]] = {}
for cell in cells:
ct = cell.get("col_type", "")
if ct.startswith("column_"):
col_cells.setdefault(ct, []).append(cell)
# Find edge column types (first and last)
col_types_ordered = sorted(col_cells.keys())
if not col_types_ordered:
continue
# Median cell count across columns (excluding heading rows)
col_counts = [len(v) for v in col_cells.values()]
median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0
if median_count < 3:
if len(col_types_ordered) < 3:
continue
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
edge_cells_list = col_cells.get(edge_ct, [])
if not edge_cells_list:
if len(edge_cells_list) < 3:
continue
fill_ratio = len(edge_cells_list) / median_count
if fill_ratio > 0.35:
continue # well-filled column → not decorative
short_count = sum(
1 for c in edge_cells_list
if len((c.get("text") or "").strip()) <= 2
)
short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0
if short_ratio < 0.6:
continue # too much real content → not decorative
# Key criterion: average text length and single-char ratio.
# Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells
# are single characters.
texts = [(c.get("text") or "").strip() for c in edge_cells_list]
avg_len = sum(len(t) for t in texts) / len(texts)
single_char = sum(1 for t in texts if len(t) <= 1)
single_ratio = single_char / len(texts)
if avg_len > 1.5:
continue # real content has longer text
if single_ratio < 0.7:
continue # not dominated by single chars
# Remove this edge column
removed_count = len(edge_cells_list)
edge_ids = {id(c) for c in edge_cells_list}
z["cells"] = [c for c in cells if id(c) not in edge_ids]
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
logger.info(
"Step 4f: removed thin decorative edge column '%s' from zone %d "
"(%d cells, fill=%.0f%%, short=%.0f%%)",
"Step 4f: removed decorative edge column '%s' from zone %d "
"(%d cells, avg_len=%.1f, single_char=%.0f%%)",
edge_ct, z.get("zone_index", 0), removed_count,
fill_ratio * 100, short_ratio * 100,
avg_len, single_ratio * 100,
)
break # only remove one edge per zone