fix(ocr-pipeline): filter phantom rows with word_count=0 from cell grid
Rows in inter-line whitespace gaps have no Tesseract words assigned, but they were still processed by build_cell_grid and build_cell_grid_streaming, producing garbage OCR output. Filter out these phantom rows in both functions using the word_count field set during Step 4.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3145,6 +3145,17 @@ def build_cell_grid(
|
||||
logger.warning("build_cell_grid: no content rows found")
|
||||
return [], []
|
||||
|
||||
# Filter phantom rows: rows with no Tesseract words assigned are
|
||||
# inter-line whitespace gaps that would produce garbage OCR.
|
||||
before = len(content_rows)
|
||||
content_rows = [r for r in content_rows if r.word_count > 0]
|
||||
skipped = before - len(content_rows)
|
||||
if skipped > 0:
|
||||
logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
|
||||
if not content_rows:
|
||||
logger.warning("build_cell_grid: no content rows with words found")
|
||||
return [], []
|
||||
|
||||
# Use columns only — skip ignore, header, footer, page_ref
|
||||
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
|
||||
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
||||
@@ -3222,6 +3233,16 @@ def build_cell_grid_streaming(
|
||||
if not content_rows:
|
||||
return
|
||||
|
||||
# Filter phantom rows: rows with no Tesseract words assigned are
|
||||
# inter-line whitespace gaps that would produce garbage OCR.
|
||||
before = len(content_rows)
|
||||
content_rows = [r for r in content_rows if r.word_count > 0]
|
||||
skipped = before - len(content_rows)
|
||||
if skipped > 0:
|
||||
logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
|
||||
if not content_rows:
|
||||
return
|
||||
|
||||
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
|
||||
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
||||
if not relevant_cols:
|
||||
|
||||
Reference in New Issue
Block a user