fix(ocr-pipeline): filter phantom rows with word_count=0 from cell grid
Rows that fall in inter-line whitespace gaps have no Tesseract words assigned, but they were still processed by build_cell_grid and build_cell_grid_streaming, producing garbage OCR output. Filter out these phantom rows (word_count == 0) in both functions, using the word_count field set during Step 4.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3145,6 +3145,17 @@ def build_cell_grid(
|
|||||||
logger.warning("build_cell_grid: no content rows found")
|
logger.warning("build_cell_grid: no content rows found")
|
||||||
return [], []
|
return [], []
|
||||||
|
|
||||||
|
# Filter phantom rows: rows with no Tesseract words assigned are
|
||||||
|
# inter-line whitespace gaps that would produce garbage OCR.
|
||||||
|
before = len(content_rows)
|
||||||
|
content_rows = [r for r in content_rows if r.word_count > 0]
|
||||||
|
skipped = before - len(content_rows)
|
||||||
|
if skipped > 0:
|
||||||
|
logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
|
||||||
|
if not content_rows:
|
||||||
|
logger.warning("build_cell_grid: no content rows with words found")
|
||||||
|
return [], []
|
||||||
|
|
||||||
# Use columns only — skip ignore, header, footer, page_ref
|
# Use columns only — skip ignore, header, footer, page_ref
|
||||||
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
|
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
|
||||||
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
||||||
@@ -3222,6 +3233,16 @@ def build_cell_grid_streaming(
|
|||||||
if not content_rows:
|
if not content_rows:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Filter phantom rows: rows with no Tesseract words assigned are
|
||||||
|
# inter-line whitespace gaps that would produce garbage OCR.
|
||||||
|
before = len(content_rows)
|
||||||
|
content_rows = [r for r in content_rows if r.word_count > 0]
|
||||||
|
skipped = before - len(content_rows)
|
||||||
|
if skipped > 0:
|
||||||
|
logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
|
||||||
|
if not content_rows:
|
||||||
|
return
|
||||||
|
|
||||||
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
|
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
|
||||||
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
||||||
if not relevant_cols:
|
if not relevant_cols:
|
||||||
|
|||||||
Reference in New Issue
Block a user