From 3bcb7aa6384d1f89a9ec8c47e7abb0cc621a1d53 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 1 Mar 2026 13:01:27 +0100 Subject: [PATCH] fix(ocr-pipeline): remove overzealous grid row count validation The validation that rejected the word-center grid whenever it produced more rows than gap-based detection was causing an unnecessary fallback to the gap-based rows (large boxes). The word-center grid regularization works correctly now that the center-based grouping and cluster-merging fixes are in place, so the check is no longer needed. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 5479446..b2cc866 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1829,13 +1829,6 @@ def _regularize_row_grid( # Remove empty grid rows (no words assigned) grid_rows = [gr for gr in grid_rows if gr.word_count > 0] - # The grid must not produce MORE rows than gap-based detection. - # More rows means the clustering split actual lines — that's worse. - if len(grid_rows) > len(content_rows): - logger.info(f"RowGrid: grid produced {len(grid_rows)} rows > " - f"{len(content_rows)} gap-based → keeping gap-based rows") - return rows - # --- Step H: Merge header/footer + re-index --- result = list(non_content) + grid_rows result.sort(key=lambda r: r.y)