fix(ocr-pipeline): remove overzealous grid row count validation
The validation that rejected the word-center grid when it produced more rows than gap-based detection was causing a fallback to gap-based rows (large boxes). The word-center grid regularization works correctly after the center-based grouping and cluster-merging fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1829,13 +1829,6 @@ def _regularize_row_grid(
|
|||||||
# Remove empty grid rows (no words assigned)
|
# Remove empty grid rows (no words assigned)
|
||||||
grid_rows = [gr for gr in grid_rows if gr.word_count > 0]
|
grid_rows = [gr for gr in grid_rows if gr.word_count > 0]
|
||||||
|
|
||||||
# The grid must not produce MORE rows than gap-based detection.
|
|
||||||
# More rows means the clustering split actual lines — that's worse.
|
|
||||||
if len(grid_rows) > len(content_rows):
|
|
||||||
logger.info(f"RowGrid: grid produced {len(grid_rows)} rows > "
|
|
||||||
f"{len(content_rows)} gap-based → keeping gap-based rows")
|
|
||||||
return rows
|
|
||||||
|
|
||||||
# --- Step H: Merge header/footer + re-index ---
|
# --- Step H: Merge header/footer + re-index ---
|
||||||
result = list(non_content) + grid_rows
|
result = list(non_content) + grid_rows
|
||||||
result.sort(key=lambda r: r.y)
|
result.sort(key=lambda r: r.y)
|
||||||
|
|||||||
Reference in New Issue
Block a user