fix: Step 4f sidebar detection uses avg text length instead of fill ratio
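Column_1 data showed avg_len=1.0 with 13 single-char cells (alphabet letters from the sidebar). The old fill_ratio check missed it: the column's fill ratio (76%) was above the 35% cutoff, so it was treated as a well-filled, non-decorative column. New criteria: an edge column is removed when avg_len ≤ 1.5 AND ≥ 70% of its cells are single characters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>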
This commit is contained in:
@@ -2130,53 +2130,49 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
strip_gap, strip_count, total,
|
strip_gap, strip_count, total,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 4f. Remove thin decorative edge columns (alphabet sidebar safety net).
|
# 4f. Remove decorative edge columns (alphabet sidebar safety net).
|
||||||
# If the leftmost or rightmost column has very few filled cells AND
|
# Dictionary pages have A-Z letter sidebars that OCR reads as single-
|
||||||
# most of its text is short (≤2 chars), it's likely an alphabet sidebar
|
# character word_boxes. These form narrow columns with very short text.
|
||||||
# that slipped through word-level pre-filters.
|
# Detection: edge column where almost ALL cells are single characters.
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
columns = z.get("columns", [])
|
columns = z.get("columns", [])
|
||||||
cells = z.get("cells", [])
|
cells = z.get("cells", [])
|
||||||
if len(columns) < 3 or not cells:
|
if len(columns) < 3 or not cells:
|
||||||
continue
|
continue
|
||||||
# Group cells by col_type
|
# Group cells by col_type (skip spanning_header)
|
||||||
col_cells: Dict[str, List[Dict]] = {}
|
col_cells: Dict[str, List[Dict]] = {}
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
ct = cell.get("col_type", "")
|
ct = cell.get("col_type", "")
|
||||||
col_cells.setdefault(ct, []).append(cell)
|
if ct.startswith("column_"):
|
||||||
# Find edge column types (first and last)
|
col_cells.setdefault(ct, []).append(cell)
|
||||||
col_types_ordered = sorted(col_cells.keys())
|
col_types_ordered = sorted(col_cells.keys())
|
||||||
if not col_types_ordered:
|
if len(col_types_ordered) < 3:
|
||||||
continue
|
|
||||||
# Median cell count across columns (excluding heading rows)
|
|
||||||
col_counts = [len(v) for v in col_cells.values()]
|
|
||||||
median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0
|
|
||||||
if median_count < 3:
|
|
||||||
continue
|
continue
|
||||||
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
|
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
|
||||||
edge_cells_list = col_cells.get(edge_ct, [])
|
edge_cells_list = col_cells.get(edge_ct, [])
|
||||||
if not edge_cells_list:
|
if len(edge_cells_list) < 3:
|
||||||
continue
|
continue
|
||||||
fill_ratio = len(edge_cells_list) / median_count
|
# Key criterion: average text length and single-char ratio.
|
||||||
if fill_ratio > 0.35:
|
# Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells
|
||||||
continue # well-filled column → not decorative
|
# are single characters.
|
||||||
short_count = sum(
|
texts = [(c.get("text") or "").strip() for c in edge_cells_list]
|
||||||
1 for c in edge_cells_list
|
avg_len = sum(len(t) for t in texts) / len(texts)
|
||||||
if len((c.get("text") or "").strip()) <= 2
|
single_char = sum(1 for t in texts if len(t) <= 1)
|
||||||
)
|
single_ratio = single_char / len(texts)
|
||||||
short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0
|
if avg_len > 1.5:
|
||||||
if short_ratio < 0.6:
|
continue # real content has longer text
|
||||||
continue # too much real content → not decorative
|
if single_ratio < 0.7:
|
||||||
|
continue # not dominated by single chars
|
||||||
# Remove this edge column
|
# Remove this edge column
|
||||||
removed_count = len(edge_cells_list)
|
removed_count = len(edge_cells_list)
|
||||||
edge_ids = {id(c) for c in edge_cells_list}
|
edge_ids = {id(c) for c in edge_cells_list}
|
||||||
z["cells"] = [c for c in cells if id(c) not in edge_ids]
|
z["cells"] = [c for c in cells if id(c) not in edge_ids]
|
||||||
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
|
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
|
||||||
logger.info(
|
logger.info(
|
||||||
"Step 4f: removed thin decorative edge column '%s' from zone %d "
|
"Step 4f: removed decorative edge column '%s' from zone %d "
|
||||||
"(%d cells, fill=%.0f%%, short=%.0f%%)",
|
"(%d cells, avg_len=%.1f, single_char=%.0f%%)",
|
||||||
edge_ct, z.get("zone_index", 0), removed_count,
|
edge_ct, z.get("zone_index", 0), removed_count,
|
||||||
fill_ratio * 100, short_ratio * 100,
|
avg_len, single_ratio * 100,
|
||||||
)
|
)
|
||||||
break # only remove one edge per zone
|
break # only remove one edge per zone
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user