fix: use header/footer row boundaries for _heal_row_gaps in cell-first OCR
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 20s
Prevents the first content row from expanding into the header area (which caused "ulary" from "VOCABULARY" to appear in the DE column) and the last content row from expanding into the footer area (which caused page numbers to appear as content).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4885,12 +4885,22 @@ def build_cell_grid_v2(
|
||||
logger.warning("build_cell_grid_v2: no usable columns found")
|
||||
return [], []
|
||||
|
||||
# Heal row gaps
|
||||
_heal_row_gaps(
|
||||
content_rows,
|
||||
top_bound=min(c.y for c in relevant_cols),
|
||||
bottom_bound=max(c.y + c.height for c in relevant_cols),
|
||||
)
|
||||
# Heal row gaps — use header/footer boundaries (NOT column bounds!)
|
||||
# In Cell-First OCR, the crop IS the OCR input, so extending into
|
||||
# header/footer means OCR'ing header/footer text ("VOCABULARY", page numbers).
|
||||
content_rows.sort(key=lambda r: r.y)
|
||||
header_rows = [r for r in row_geometries if r.row_type == 'header']
|
||||
footer_rows = [r for r in row_geometries if r.row_type == 'footer']
|
||||
if header_rows:
|
||||
top_bound = max(r.y + r.height for r in header_rows)
|
||||
else:
|
||||
top_bound = content_rows[0].y
|
||||
if footer_rows:
|
||||
bottom_bound = min(r.y for r in footer_rows)
|
||||
else:
|
||||
bottom_bound = content_rows[-1].y + content_rows[-1].height
|
||||
|
||||
_heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)
|
||||
|
||||
relevant_cols.sort(key=lambda c: c.x)
|
||||
|
||||
@@ -5006,11 +5016,20 @@ def build_cell_grid_v2_streaming(
|
||||
if not content_rows:
|
||||
return
|
||||
|
||||
_heal_row_gaps(
|
||||
content_rows,
|
||||
top_bound=min(c.y for c in relevant_cols),
|
||||
bottom_bound=max(c.y + c.height for c in relevant_cols),
|
||||
)
|
||||
# Use header/footer boundaries for _heal_row_gaps (same as build_cell_grid_v2)
|
||||
content_rows.sort(key=lambda r: r.y)
|
||||
header_rows = [r for r in row_geometries if r.row_type == 'header']
|
||||
footer_rows = [r for r in row_geometries if r.row_type == 'footer']
|
||||
if header_rows:
|
||||
top_bound = max(r.y + r.height for r in header_rows)
|
||||
else:
|
||||
top_bound = content_rows[0].y
|
||||
if footer_rows:
|
||||
bottom_bound = min(r.y for r in footer_rows)
|
||||
else:
|
||||
bottom_bound = content_rows[-1].y + content_rows[-1].height
|
||||
|
||||
_heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)
|
||||
|
||||
relevant_cols.sort(key=lambda c: c.x)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user