fix(ocr-pipeline): filter phantom rows with word_count=0 from cell grid

Rows that fall in inter-line whitespace gaps have no Tesseract words assigned,
but they were still processed by build_cell_grid and build_cell_grid_streaming,
producing garbage OCR output. Filter out these phantom rows (word_count == 0)
in both functions, using the word_count field set during Step 4.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-01 18:40:13 +01:00
parent 7f27783008
commit 89b5f49918

View File

@@ -3145,6 +3145,17 @@ def build_cell_grid(
logger.warning("build_cell_grid: no content rows found")
return [], []
# Filter phantom rows: rows with no Tesseract words assigned are
# inter-line whitespace gaps that would produce garbage OCR.
before = len(content_rows)
content_rows = [r for r in content_rows if r.word_count > 0]
skipped = before - len(content_rows)
if skipped > 0:
logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
if not content_rows:
logger.warning("build_cell_grid: no content rows with words found")
return [], []
# Use columns only — skip ignore, header, footer, page_ref
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
@@ -3222,6 +3233,16 @@ def build_cell_grid_streaming(
if not content_rows:
return
# Filter phantom rows: rows with no Tesseract words assigned are
# inter-line whitespace gaps that would produce garbage OCR.
before = len(content_rows)
content_rows = [r for r in content_rows if r.word_count > 0]
skipped = before - len(content_rows)
if skipped > 0:
logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
if not content_rows:
return
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
if not relevant_cols: