fix(rows): filter artifact rows and heal gaps for full OCR height

Two new functions:
- _is_artifact_row(): marks a row as an artifact if it has no words or if
  every detected token is at most one character long after stripping
  (scanner shadows produce dots/dashes, not words).
  A real vocabulary row is expected to contain at least one 2+ char word.
- _heal_row_gaps(): after removing empty/artifact rows, expands each
  remaining content row to the midpoint of adjacent gaps, so OCR crops
  are not artificially narrow. First row extends to content top_bound;
  last row to content bottom_bound.

Applied in both build_cell_grid() and build_cell_grid_streaming() after
the word_count>0 filter and before OCR.

Addresses cases like:
- Row 21: scan shadow → single-char artifacts → filtered before OCR
- Row 23: completely empty (word_count=0) → already filtered
- Row 22: real content → now expanded upward/downward to fill the space
  that rows 21 and 23 occupied, giving OCR the correct full height

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 15:38:58 +01:00
parent 6623a5d10e
commit cb61fab77b

View File

@@ -4303,6 +4303,66 @@ def _ocr_single_cell(
}
def _is_artifact_row(row: RowGeometry) -> bool:
    """Return True if this row contains only scan artifacts, not real text.

    Artifact rows (scanner shadows, noise) typically produce only
    single-character detections, whereas a real content row is expected to
    have at least one token with 2+ characters.

    Args:
        row: Row whose ``word_count`` and ``words`` are inspected. Each entry
            of ``words`` must support ``.get('text')``.

    Returns:
        True when ``word_count`` is 0, or when every word's stripped text is
        at most one character long; False as soon as one longer token exists.
    """
    if row.word_count == 0:
        return True

    for word in row.words:
        # A missing or None 'text' is treated as an empty token instead of
        # raising AttributeError on None.strip().
        text = (word.get('text') or '').strip()
        if len(text) > 1:
            return False
    return True
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height to fill vertical gaps caused by removed adjacent rows.

    After filtering out empty or artifact rows, remaining content rows may have
    gaps between them where the removed rows used to be. Each row is mutated so
    that it reaches the midpoint of the gap to its neighbour on either side,
    letting OCR crops cover the full available content area.

    Args:
        rows: Rows to adjust. The list is sorted in place by ``y`` and every
            row's ``y`` and ``height`` are overwritten.
        top_bound: New ``y`` of the first (top-most) row.
        bottom_bound: New bottom edge of the last (bottom-most) row.

    Returns:
        None. All changes are made in place; an empty list is a no-op.

    Notes:
        - The first row's top is set to ``top_bound`` and the last row's
          bottom to ``bottom_bound`` unconditionally, even if that is inside
          the row's original extent.
        - Gaps of 1 or less (including overlapping rows) are left as they are.
        - The resulting height is never smaller than 5.
    """
    if not rows:
        return

    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # Snapshot (top, bottom) of every row first so that each midpoint is
    # computed from the original geometry, not from already-healed neighbours.
    orig = [(r.y, r.y + r.height) for r in rows]

    for i, row in enumerate(rows):
        my_top, my_bot = orig[i]

        # New top: midpoint between previous row's bottom and this row's top.
        if i == 0:
            new_top = top_bound
        else:
            prev_bot = orig[i - 1][1]
            gap = my_top - prev_bot
            new_top = prev_bot + gap // 2 if gap > 1 else my_top

        # New bottom: midpoint between this row's bottom and next row's top.
        # Uses the same `gap // 2` offset as the neighbour's top, so the two
        # healed rows meet at the same y coordinate.
        if i == n - 1:
            new_bottom = bottom_bound
        else:
            next_top = orig[i + 1][0]
            gap = next_top - my_bot
            new_bottom = my_bot + gap // 2 if gap > 1 else my_bot

        row.y = new_top
        row.height = max(5, new_bottom - new_top)

    # Lazy %-style arguments: the message is only built when DEBUG is enabled.
    logger.debug(
        "_heal_row_gaps: %s rows → y range [%s..%s] (bounds: top=%s, bottom=%s)",
        n,
        rows[0].y,
        rows[-1].y + rows[-1].height,
        top_bound,
        bottom_bound,
    )
def build_cell_grid(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
@@ -4374,6 +4434,25 @@ def build_cell_grid(
logger.warning("build_cell_grid: no usable columns found")
return [], []
# Filter artifact rows: rows whose detected words are all single characters
# are caused by scanner shadows or noise, not real text.
before_art = len(content_rows)
content_rows = [r for r in content_rows if not _is_artifact_row(r)]
artifact_skipped = before_art - len(content_rows)
if artifact_skipped > 0:
logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
if not content_rows:
logger.warning("build_cell_grid: no content rows after artifact filtering")
return [], []
# Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
# to fill the space so OCR crops are not artificially narrow.
_heal_row_gaps(
content_rows,
top_bound=min(c.y for c in relevant_cols),
bottom_bound=max(c.y + c.height for c in relevant_cols),
)
# Sort columns left-to-right
relevant_cols.sort(key=lambda c: c.x)
@@ -4555,6 +4634,20 @@ def build_cell_grid_streaming(
if not relevant_cols:
return
# Filter artifact rows + heal gaps (same logic as build_cell_grid)
before_art = len(content_rows)
content_rows = [r for r in content_rows if not _is_artifact_row(r)]
artifact_skipped = before_art - len(content_rows)
if artifact_skipped > 0:
logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
if not content_rows:
return
_heal_row_gaps(
content_rows,
top_bound=min(c.y for c in relevant_cols),
bottom_bound=max(c.y + c.height for c in relevant_cols),
)
relevant_cols.sort(key=lambda c: c.x)
columns_meta = [