diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 1c0012e..837fcc6 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4885,12 +4885,22 @@ def build_cell_grid_v2( logger.warning("build_cell_grid_v2: no usable columns found") return [], [] - # Heal row gaps - _heal_row_gaps( - content_rows, - top_bound=min(c.y for c in relevant_cols), - bottom_bound=max(c.y + c.height for c in relevant_cols), - ) + # Heal row gaps — use header/footer boundaries (NOT column bounds!) + # In Cell-First OCR, the crop IS the OCR input, so extending into + # header/footer means OCR'ing header/footer text ("VOCABULARY", page nums). + content_rows.sort(key=lambda r: r.y) + header_rows = [r for r in row_geometries if r.row_type == 'header'] + footer_rows = [r for r in row_geometries if r.row_type == 'footer'] + if header_rows: + top_bound = max(r.y + r.height for r in header_rows) + else: + top_bound = content_rows[0].y + if footer_rows: + bottom_bound = min(r.y for r in footer_rows) + else: + bottom_bound = content_rows[-1].y + content_rows[-1].height + + _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound) relevant_cols.sort(key=lambda c: c.x) @@ -5006,11 +5016,20 @@ def build_cell_grid_v2_streaming( if not content_rows: return - _heal_row_gaps( - content_rows, - top_bound=min(c.y for c in relevant_cols), - bottom_bound=max(c.y + c.height for c in relevant_cols), - ) + # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2) + content_rows.sort(key=lambda r: r.y) + header_rows = [r for r in row_geometries if r.row_type == 'header'] + footer_rows = [r for r in row_geometries if r.row_type == 'footer'] + if header_rows: + top_bound = max(r.y + r.height for r in header_rows) + else: + top_bound = content_rows[0].y + if footer_rows: + bottom_bound = min(r.y for r in footer_rows) + else: + bottom_bound = content_rows[-1].y + content_rows[-1].height + + _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound) relevant_cols.sort(key=lambda c: c.x)