diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 2931906..e3b88a7 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3145,6 +3145,17 @@ def build_cell_grid(
         logger.warning("build_cell_grid: no content rows found")
         return [], []
 
+    # Filter phantom rows: rows with no Tesseract words assigned are
+    # inter-line whitespace gaps that would produce garbage OCR.
+    before = len(content_rows)
+    content_rows = [r for r in content_rows if r.word_count > 0]
+    skipped = before - len(content_rows)
+    if skipped > 0:
+        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
+    if not content_rows:
+        logger.warning("build_cell_grid: no content rows with words found")
+        return [], []
+
     # Use columns only — skip ignore, header, footer, page_ref
     _skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
     relevant_cols = [c for c in column_regions if c.type not in _skip_types]
@@ -3222,6 +3233,16 @@ def build_cell_grid_streaming(
     if not content_rows:
         return
 
+    # Filter phantom rows: rows with no Tesseract words assigned are
+    # inter-line whitespace gaps that would produce garbage OCR.
+    before = len(content_rows)
+    content_rows = [r for r in content_rows if r.word_count > 0]
+    skipped = before - len(content_rows)
+    if skipped > 0:
+        logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
+    if not content_rows:
+        return
+
     _skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
     relevant_cols = [c for c in column_regions if c.type not in _skip_types]
     if not relevant_cols: