From d0e7966925bf383881ba353aae7a210ac19be6a9 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 4 Mar 2026 15:44:13 +0100 Subject: [PATCH] fix: use header/footer row boundaries for _heal_row_gaps in cell-first OCR Prevents first content row from expanding into header area (causing "ulary" from "VOCABULARY" to appear in DE column) and last content row from expanding into footer area (causing page numbers to appear as content). Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 41 ++++++++++++++------ 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 1c0012e..837fcc6 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4885,12 +4885,22 @@ def build_cell_grid_v2( logger.warning("build_cell_grid_v2: no usable columns found") return [], [] - # Heal row gaps - _heal_row_gaps( - content_rows, - top_bound=min(c.y for c in relevant_cols), - bottom_bound=max(c.y + c.height for c in relevant_cols), - ) + # Heal row gaps — use header/footer boundaries (NOT column bounds!) + # In Cell-First OCR, the crop IS the OCR input, so extending into + # header/footer means OCR'ing header/footer text ("VOCABULARY", page nums). + content_rows.sort(key=lambda r: r.y) + header_rows = [r for r in row_geometries if r.row_type == 'header'] + footer_rows = [r for r in row_geometries if r.row_type == 'footer'] + if header_rows: + top_bound = max(r.y + r.height for r in header_rows) + else: + top_bound = content_rows[0].y + if footer_rows: + bottom_bound = min(r.y for r in footer_rows) + else: + bottom_bound = content_rows[-1].y + content_rows[-1].height + + _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound) relevant_cols.sort(key=lambda c: c.x) @@ -5006,11 +5016,20 @@ def build_cell_grid_v2_streaming( if not content_rows: return - _heal_row_gaps( - content_rows, - top_bound=min(c.y for c in relevant_cols), - bottom_bound=max(c.y + c.height for c in relevant_cols), - ) + # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2) + content_rows.sort(key=lambda r: r.y) + header_rows = [r for r in row_geometries if r.row_type == 'header'] + footer_rows = [r for r in row_geometries if r.row_type == 'footer'] + if header_rows: + top_bound = max(r.y + r.height for r in header_rows) + else: + top_bound = content_rows[0].y + if footer_rows: + bottom_bound = min(r.y for r in footer_rows) + else: + bottom_bound = content_rows[-1].y + content_rows[-1].height + + _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound) relevant_cols.sort(key=lambda c: c.x)