fix(rows): filter artifact rows and heal gaps for full OCR height

Two new functions:

- _is_artifact_row(): marks rows as artifacts if all detected tokens are single characters (scanner shadows produce dots/dashes, not words). A real vocabulary row always contains at least one 2+ char word.
- _heal_row_gaps(): after removing empty/artifact rows, expands each remaining content row to the midpoint of adjacent gaps, so OCR crops are not artificially narrow. First row extends to content top_bound; last row to content bottom_bound.

Applied in both build_cell_grid() and build_cell_grid_streaming() after the word_count>0 filter and before OCR.

Addresses cases like:

- Row 21: scan shadow → single-char artifacts → filtered before OCR
- Row 23: completely empty (word_count=0) → already filtered
- Row 22: real content → now expanded upward/downward to fill the space that rows 21 and 23 occupied, giving OCR the correct full height

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 15:38:58 +01:00
parent 6623a5d10e
commit cb61fab77b
1 changed files with 93 additions and 0 deletions
@@ -4303,6 +4303,66 @@ def _ocr_single_cell(
    }


+def _is_artifact_row(row: RowGeometry) -> bool:
+    """Report whether a row looks like scan noise instead of real text.
+
+    True when ``word_count`` is 0 or when no word's stripped ``text`` is longer
+    than one character (heuristic: shadows/noise yield single-char detections).
+    """
+    if row.word_count == 0:
+        return True
+    lengths = [len(w.get('text', '').strip()) for w in row.words]
+    return not any(n > 1 for n in lengths)
+
+
+def _heal_row_gaps(
+    rows: List[RowGeometry],
+    top_bound: int,
+    bottom_bound: int,
+) -> None:
+    """Grow each row in place so neighbouring rows meet halfway across gaps.
+
+    ``rows`` is sorted by ``y`` in place. Every gap wider than 1 between two
+    consecutive rows is split at its midpoint (floor division) and shared by
+    both rows; gaps of 1 or less, including overlaps, leave that edge as is.
+    The first row's top is set to ``top_bound`` and the last row's bottom to
+    ``bottom_bound``. Each resulting height is at least 5. Nothing happens for
+    an empty list; the function returns None and logs the final range.
+    """
+    if not rows:
+        return
+    rows.sort(key=lambda r: r.y)
+    n = len(rows)
+    spans = [(r.y, r.y + r.height) for r in rows]  # extents frozen before any row is changed
+
+    for idx, (row, (top, bottom)) in enumerate(zip(rows, spans)):
+        # Top edge: first row snaps to the bound, others claim the lower half of the gap above
+        if idx == 0:
+            top = top_bound
+        else:
+            above_bottom = spans[idx - 1][1]
+            space = top - above_bottom
+            if space > 1:
+                top = above_bottom + space // 2
+
+        # Bottom edge: last row snaps to the bound, others claim the upper half of the gap below
+        if idx == n - 1:
+            bottom = bottom_bound
+        else:
+            space = spans[idx + 1][0] - bottom
+            if space > 1:
+                bottom += space // 2
+
+        # Write back the new extent; the height is never allowed below 5
+        row.y = top
+        row.height = max(5, bottom - top)
+
+    logger.debug(
+        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
+        f"(bounds: top={top_bound}, bottom={bottom_bound})"
+    )
+
+
 def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
@@ -4374,6 +4434,25 @@ def build_cell_grid(
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

+    # Filter artifact rows: rows whose detected words are all single characters
+    # are caused by scanner shadows or noise, not real text.
+    before_art = len(content_rows)
+    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
+    artifact_skipped = before_art - len(content_rows)
+    if artifact_skipped > 0:
+        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
+    if not content_rows:
+        logger.warning("build_cell_grid: no content rows after artifact filtering")
+        return [], []
+
+    # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
+    # to fill the space so OCR crops are not artificially narrow.
+    _heal_row_gaps(
+        content_rows,
+        top_bound=min(c.y for c in relevant_cols),
+        bottom_bound=max(c.y + c.height for c in relevant_cols),
+    )
+
    # Sort columns left-to-right
    relevant_cols.sort(key=lambda c: c.x)

@@ -4555,6 +4634,20 @@ def build_cell_grid_streaming(
    if not relevant_cols:
        return

+    # Filter artifact rows + heal gaps (same logic as build_cell_grid)
+    before_art = len(content_rows)
+    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
+    artifact_skipped = before_art - len(content_rows)
+    if artifact_skipped > 0:
+        logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
+    if not content_rows:
+        return
+    _heal_row_gaps(
+        content_rows,
+        top_bound=min(c.y for c in relevant_cols),
+        bottom_bound=max(c.y + c.height for c in relevant_cols),
+    )
+
    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [