fix(rows): filter artifact rows and heal gaps for full OCR height

Two new functions:
- _is_artifact_row(): marks a row as an artifact if every detected token is at most a single character (scanner shadows produce dots/dashes, not words). A real vocabulary row always contains at least one word of 2+ characters.
- _heal_row_gaps(): after empty/artifact rows have been removed, expands each remaining content row to the midpoint of the adjacent gaps, so OCR crops are not artificially narrow. The first row extends to the content top_bound; the last row extends to the content bottom_bound.

Applied in both build_cell_grid() and build_cell_grid_streaming(), after the word_count>0 filter and before OCR.

Addresses cases like:
- Row 21: scan shadow → single-char artifacts → filtered before OCR
- Row 23: completely empty (word_count=0) → already filtered
- Row 22: real content → now expanded upward/downward to fill the space that rows 21 and 23 occupied, giving OCR the correct full height

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 15:38:58 +01:00
parent 6623a5d10e
commit cb61fab77b
1 changed files with 93 additions and 0 deletions
@@ -4303,6 +4303,66 @@ def _ocr_single_cell(
    }
 def _is_artifact_row(row: RowGeometry) -> bool:
    """Decide whether a row holds only scan noise instead of real text.

    Heuristic: scanner shadows and similar noise tend to be detected as
    isolated single characters, whereas a genuine content row is expected to
    contain at least one token of two or more characters.

    Args:
        row: Row to inspect. Its ``word_count`` and ``words`` attributes are
            read; each entry of ``words`` is accessed through ``.get('text')``.

    Returns:
        True when ``word_count`` is zero, or when no word's whitespace-stripped
        text is longer than one character; False otherwise.
    """
    if row.word_count == 0:
        return True

    # Measure every token up front (surrounding whitespace does not count).
    token_lengths = [len(word.get('text', '').strip()) for word in row.words]

    # A single token of 2+ characters is enough to treat the row as content.
    return not any(length > 1 for length in token_lengths)
 def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
 ) -> None:
    """Stretch rows in place so they cover the gaps left by removed neighbours.

    The list is sorted by ``y`` (in place). Every pair of vertically adjacent
    rows then shares a seam: when the gap between them is larger than one unit,
    both rows are extended to the gap's midpoint; otherwise each row keeps its
    own original edge. The topmost row is set to start at ``top_bound`` and
    the bottommost row to end at ``bottom_bound``.

    Args:
        rows: Rows to adjust; their ``y`` and ``height`` attributes are
            overwritten and the list order is changed by the sort.
        top_bound: New top edge assigned to the first row.
        bottom_bound: New bottom edge assigned to the last row.

    Returns:
        None. All results are delivered by mutating ``rows``.
    """
    if not rows:
        return

    rows.sort(key=lambda r: r.y)
    n = len(rows)

    # Freeze the original (top, bottom) extents so that later assignments
    # cannot influence the seam computation.
    spans = [(r.y, r.y + r.height) for r in rows]

    # One seam per adjacent pair: (new bottom of upper row, new top of lower row).
    seams = []
    for (_, upper_bottom), (lower_top, _) in zip(spans, spans[1:]):
        space = lower_top - upper_bottom
        if space > 1:
            middle = upper_bottom + space // 2
            seams.append((middle, middle))
        else:
            # Touching or overlapping rows: leave both edges where they were.
            seams.append((upper_bottom, lower_top))

    new_tops = [top_bound] + [lower_edge for _, lower_edge in seams]
    new_bottoms = [upper_edge for upper_edge, _ in seams] + [bottom_bound]

    for row, new_top, new_bottom in zip(rows, new_tops, new_bottoms):
        row.y = new_top
        # Never let a row collapse below a 5-unit minimum height.
        row.height = max(5, new_bottom - new_top)

    logger.debug(
        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
 def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
@@ -4374,6 +4434,25 @@ def build_cell_grid(
        logger.warning("build_cell_grid: no usable columns found")
        return [], []
    # Filter artifact rows: rows whose detected words are all single characters
    # are caused by scanner shadows or noise, not real text.
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []
    # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
    # to fill the space so OCR crops are not artificially narrow.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )
    # Sort columns left-to-right
    relevant_cols.sort(key=lambda c: c.x)
@@ -4555,6 +4634,20 @@ def build_cell_grid_streaming(
    if not relevant_cols:
        return
    # Filter artifact rows + heal gaps (same logic as build_cell_grid)
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        return
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )
    relevant_cols.sort(key=lambda c: c.x)
    columns_meta = [