fix(ocr-pipeline): filter phantom rows with word_count=0 from cell grid

Rows that fall in inter-line whitespace gaps have no Tesseract words assigned, but they were still processed by build_cell_grid and build_cell_grid_streaming, producing garbage OCR output. Filter out these phantom rows in both functions using the word_count field set during Step 4.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 18:40:13 +01:00
parent 7f27783008
commit 89b5f49918
1 changed files with 21 additions and 0 deletions
@@ -3145,6 +3145,17 @@ def build_cell_grid(
        logger.warning("build_cell_grid: no content rows found")
        return [], []

+    # Filter phantom rows: rows with no Tesseract words assigned are
+    # inter-line whitespace gaps that would produce garbage OCR.
+    before = len(content_rows)
+    content_rows = [r for r in content_rows if r.word_count > 0]
+    skipped = before - len(content_rows)
+    if skipped > 0:
+        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
+    if not content_rows:
+        logger.warning("build_cell_grid: no content rows with words found")
+        return [], []
+
    # Use columns only — skip ignore, header, footer, page_ref
    _skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
@@ -3222,6 +3233,16 @@ def build_cell_grid_streaming(
    if not content_rows:
        return

+    # Filter phantom rows: rows with no Tesseract words assigned are
+    # inter-line whitespace gaps that would produce garbage OCR.
+    before = len(content_rows)
+    content_rows = [r for r in content_rows if r.word_count > 0]
+    skipped = before - len(content_rows)
+    if skipped > 0:
+        logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
+    if not content_rows:
+        return
+
    _skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols: