fix(rows): filter artifact rows and heal gaps for full OCR height
Two new functions:

- _is_artifact_row(): marks a row as an artifact if all detected tokens are single characters (scanner shadows produce dots/dashes, not words). A real vocabulary row always contains at least one word of 2+ characters.
- _heal_row_gaps(): after removing empty/artifact rows, expands each remaining content row to the midpoint of the adjacent gaps, so OCR crops are not artificially narrow. The first row extends to the content top_bound; the last row to the content bottom_bound.

Applied in both build_cell_grid() and build_cell_grid_streaming(), after the word_count>0 filter and before OCR.

Addresses cases like:

- Row 21: scan shadow → single-char artifacts → filtered before OCR
- Row 23: completely empty (word_count=0) → already filtered
- Row 22: real content → now expanded upward/downward to fill the space that rows 21 and 23 occupied, giving OCR the correct full height

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4303,6 +4303,66 @@ def _ocr_single_cell(
|
||||
}
|
||||
|
||||
|
||||
def _is_artifact_row(row: RowGeometry) -> bool:
    """Return True if this row contains only scan artifacts, not real text.

    Artifact rows (scanner shadows, noise) typically produce only
    single-character detections, whereas a real content row has at least one
    token with 2+ characters.

    Args:
        row: Row whose ``word_count`` and ``words`` are inspected. Each entry
            of ``words`` is read as a mapping with an optional ``'text'`` key.

    Returns:
        True when ``word_count`` is 0, or when every word's stripped text is
        at most one character long. False as soon as one longer token exists.
    """
    if row.word_count == 0:
        return True
    # The `or` fallbacks treat a missing word list or a None text as empty
    # instead of raising on `.strip()` / iteration.
    return all(
        len((word.get('text') or '').strip()) <= 1
        for word in (row.words or ())
    )
|
||||
|
||||
|
||||
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height to fill vertical gaps caused by removed adjacent rows.

    After filtering out empty or artifact rows, remaining content rows may have
    gaps between them where the removed rows used to be. This function mutates
    each row to extend upward/downward to the midpoint of such gaps so that
    OCR crops cover the full available content area.

    The first row extends up to top_bound and the last row down to
    bottom_bound. A row that already reaches past a bound keeps its own edge,
    so this function never shrinks a row at the outer edges.

    Args:
        rows: Rows to adjust. The list is sorted in place by ``y`` and each
            row's ``y`` and ``height`` are overwritten.
        top_bound: Upper y limit the first row is extended to.
        bottom_bound: Lower y limit the last row is extended to.

    Returns:
        None. ``rows`` is modified in place; an empty list is left untouched.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # Snapshot (top, bottom) before mutation so every midpoint is derived from
    # the original geometry, not from an already-expanded neighbour.
    orig = [(r.y, r.y + r.height) for r in rows]
    min_height = 5  # floor applied to every resulting row height

    for i, row in enumerate(rows):
        my_top, my_bot = orig[i]

        # New top: midpoint between previous row's bottom and this row's top.
        if i == 0:
            # Only ever move the top edge upward.
            new_top = min(top_bound, my_top)
        else:
            prev_bot = orig[i - 1][1]
            gap = my_top - prev_bot
            # Gaps of <= 1 px (touching or overlapping rows) are left alone.
            new_top = prev_bot + gap // 2 if gap > 1 else my_top

        # New bottom: midpoint between this row's bottom and next row's top.
        if i == n - 1:
            # Only ever move the bottom edge downward.
            new_bottom = max(bottom_bound, my_bot)
        else:
            next_top = orig[i + 1][0]
            gap = next_top - my_bot
            # Same floor division as above, so neighbours meet at one y.
            new_bottom = my_bot + gap // 2 if gap > 1 else my_bot

        row.y = new_top
        row.height = max(min_height, new_bottom - new_top)

    logger.debug(
        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
|
||||
|
||||
|
||||
def build_cell_grid(
|
||||
ocr_img: np.ndarray,
|
||||
column_regions: List[PageRegion],
|
||||
@@ -4374,6 +4434,25 @@ def build_cell_grid(
|
||||
logger.warning("build_cell_grid: no usable columns found")
|
||||
return [], []
|
||||
|
||||
# Filter artifact rows: rows whose detected words are all single characters
|
||||
# are caused by scanner shadows or noise, not real text.
|
||||
before_art = len(content_rows)
|
||||
content_rows = [r for r in content_rows if not _is_artifact_row(r)]
|
||||
artifact_skipped = before_art - len(content_rows)
|
||||
if artifact_skipped > 0:
|
||||
logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
|
||||
if not content_rows:
|
||||
logger.warning("build_cell_grid: no content rows after artifact filtering")
|
||||
return [], []
|
||||
|
||||
# Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
|
||||
# to fill the space so OCR crops are not artificially narrow.
|
||||
_heal_row_gaps(
|
||||
content_rows,
|
||||
top_bound=min(c.y for c in relevant_cols),
|
||||
bottom_bound=max(c.y + c.height for c in relevant_cols),
|
||||
)
|
||||
|
||||
# Sort columns left-to-right
|
||||
relevant_cols.sort(key=lambda c: c.x)
|
||||
|
||||
@@ -4555,6 +4634,20 @@ def build_cell_grid_streaming(
|
||||
if not relevant_cols:
|
||||
return
|
||||
|
||||
# Filter artifact rows + heal gaps (same logic as build_cell_grid)
|
||||
before_art = len(content_rows)
|
||||
content_rows = [r for r in content_rows if not _is_artifact_row(r)]
|
||||
artifact_skipped = before_art - len(content_rows)
|
||||
if artifact_skipped > 0:
|
||||
logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
|
||||
if not content_rows:
|
||||
return
|
||||
_heal_row_gaps(
|
||||
content_rows,
|
||||
top_bound=min(c.y for c in relevant_cols),
|
||||
bottom_bound=max(c.y + c.height for c in relevant_cols),
|
||||
)
|
||||
|
||||
relevant_cols.sort(key=lambda c: c.x)
|
||||
|
||||
columns_meta = [
|
||||
|
||||
Reference in New Issue
Block a user