fix(rows): filter artifact rows and heal gaps for full OCR height
Two new functions:
- _is_artifact_row(): marks a row as an artifact if every detected token is at most a single character (scanner shadows produce dots/dashes, not words). A real vocabulary row always contains at least one word of 2+ characters.
- _heal_row_gaps(): after empty/artifact rows are removed, expands each remaining content row to the midpoint of the adjacent gaps, so that OCR crops are not artificially narrow. The first row extends to the content top_bound; the last row to the content bottom_bound.

Applied in both build_cell_grid() and build_cell_grid_streaming(), after the word_count>0 filter and before OCR.

Addresses cases like:
- Row 21: scan shadow → single-char artifacts → filtered before OCR
- Row 23: completely empty (word_count=0) → already filtered
- Row 22: real content → now expanded upward/downward to fill the space that rows 21 and 23 occupied, giving OCR the correct full height

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4303,6 +4303,66 @@ def _ocr_single_cell(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _is_artifact_row(row: RowGeometry) -> bool:
    """Return True if this row contains only scan artifacts, not real text.

    Artifact rows (scanner shadows, noise) typically produce only
    single-character detections, so a row is treated as real content as soon
    as one of its tokens has 2+ characters after surrounding whitespace is
    stripped.

    Args:
        row: Row to inspect. Its ``word_count`` and ``words`` attributes are
            read; each entry of ``words`` must expose ``.get('text')``.

    Returns:
        True when ``word_count`` is 0 or every token is at most one character
        long (including empty/whitespace-only tokens); False otherwise.
    """
    if row.word_count == 0:
        return True
    # `.get('text', '')` only falls back when the key is missing; `or ''` also
    # covers a key that is present but holds None, which would otherwise raise
    # AttributeError on `.strip()`.
    return not any(
        len((w.get('text') or '').strip()) >= 2 for w in row.words
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Stretch rows in place so they cover the vertical gaps between them.

    Once empty or artifact rows have been dropped, the surviving rows can be
    separated by unused space. Each such gap (larger than 1) is split at its
    midpoint: the row above is extended down to it and the row below is
    extended up to it. Gaps of 1 or less, including overlaps, are left alone.

    The list is sorted by ``y`` in place, and every row's ``y`` and ``height``
    are overwritten. The first row's top is set to ``top_bound`` and the last
    row's bottom to ``bottom_bound``; the resulting height is never below 5.

    Args:
        rows: Rows to adjust; mutated and re-ordered in place.
        top_bound: New top edge for the first row.
        bottom_bound: New bottom edge for the last row.
    """
    if not rows:
        return

    rows.sort(key=lambda r: r.y)
    n = len(rows)

    # Freeze the original extents so every seam is derived from un-mutated
    # geometry.
    tops = [r.y for r in rows]
    bottoms = [r.y + r.height for r in rows]

    healed_tops = [top_bound]
    healed_bottoms = []
    # Walk consecutive (upper, lower) pairs and decide where they meet.
    for upper_bottom, lower_top in zip(bottoms, tops[1:]):
        gap = lower_top - upper_bottom
        if gap > 1:
            seam = upper_bottom + gap // 2
            healed_bottoms.append(seam)
            healed_tops.append(seam)
        else:
            # Touching or overlapping rows keep their own edges.
            healed_bottoms.append(upper_bottom)
            healed_tops.append(lower_top)
    healed_bottoms.append(bottom_bound)

    for row, new_top, new_bottom in zip(rows, healed_tops, healed_bottoms):
        row.y = new_top
        row.height = max(5, new_bottom - new_top)

    logger.debug(
        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
|
||||||
|
|
||||||
|
|
||||||
def build_cell_grid(
|
def build_cell_grid(
|
||||||
ocr_img: np.ndarray,
|
ocr_img: np.ndarray,
|
||||||
column_regions: List[PageRegion],
|
column_regions: List[PageRegion],
|
||||||
@@ -4374,6 +4434,25 @@ def build_cell_grid(
|
|||||||
logger.warning("build_cell_grid: no usable columns found")
|
logger.warning("build_cell_grid: no usable columns found")
|
||||||
return [], []
|
return [], []
|
||||||
|
|
||||||
|
# Filter artifact rows: rows whose detected words are all single characters
|
||||||
|
# are caused by scanner shadows or noise, not real text.
|
||||||
|
before_art = len(content_rows)
|
||||||
|
content_rows = [r for r in content_rows if not _is_artifact_row(r)]
|
||||||
|
artifact_skipped = before_art - len(content_rows)
|
||||||
|
if artifact_skipped > 0:
|
||||||
|
logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
|
||||||
|
if not content_rows:
|
||||||
|
logger.warning("build_cell_grid: no content rows after artifact filtering")
|
||||||
|
return [], []
|
||||||
|
|
||||||
|
# Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
|
||||||
|
# to fill the space so OCR crops are not artificially narrow.
|
||||||
|
_heal_row_gaps(
|
||||||
|
content_rows,
|
||||||
|
top_bound=min(c.y for c in relevant_cols),
|
||||||
|
bottom_bound=max(c.y + c.height for c in relevant_cols),
|
||||||
|
)
|
||||||
|
|
||||||
# Sort columns left-to-right
|
# Sort columns left-to-right
|
||||||
relevant_cols.sort(key=lambda c: c.x)
|
relevant_cols.sort(key=lambda c: c.x)
|
||||||
|
|
||||||
@@ -4555,6 +4634,20 @@ def build_cell_grid_streaming(
|
|||||||
if not relevant_cols:
|
if not relevant_cols:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Filter artifact rows + heal gaps (same logic as build_cell_grid)
|
||||||
|
before_art = len(content_rows)
|
||||||
|
content_rows = [r for r in content_rows if not _is_artifact_row(r)]
|
||||||
|
artifact_skipped = before_art - len(content_rows)
|
||||||
|
if artifact_skipped > 0:
|
||||||
|
logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
|
||||||
|
if not content_rows:
|
||||||
|
return
|
||||||
|
_heal_row_gaps(
|
||||||
|
content_rows,
|
||||||
|
top_bound=min(c.y for c in relevant_cols),
|
||||||
|
bottom_bound=max(c.y + c.height for c in relevant_cols),
|
||||||
|
)
|
||||||
|
|
||||||
relevant_cols.sort(key=lambda c: c.x)
|
relevant_cols.sort(key=lambda c: c.x)
|
||||||
|
|
||||||
columns_meta = [
|
columns_meta = [
|
||||||
|
|||||||
Reference in New Issue
Block a user