From 606bef059114b37f8473b266f38c2afc4398662c Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 3 Mar 2026 11:00:29 +0100 Subject: [PATCH] fix(ocr-pipeline): overlap-based word assignment and empty row filtering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Word-to-column assignment now uses overlap-based matching instead of center-point matching. This fixes narrow page_ref columns losing their last digit (e.g. "p.59" → "p.5") when the digit's center falls slightly past the midpoint boundary into the next column. 2. Post-OCR empty row filter: rows where ALL cells have empty text are removed after OCR. This catches inter-row gaps that had stray Tesseract artifacts giving word_count > 0 but no actual content. Co-Authored-By: Claude Sonnet 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 68 ++++++++++++++------ klausur-service/backend/ocr_pipeline_api.py | 12 ++++ 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index f0092ec..5452f31 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3519,28 +3519,46 @@ def _assign_row_words_to_columns( col_ranges_rel.append((assign_left, assign_right)) for w in row.words: - w_center_x = w['left'] + w['width'] / 2 + w_left = w['left'] + w_right = w_left + w['width'] + w_center_x = w_left + w['width'] / 2 - # Find which column range contains this word - assigned = False - for ci, (al, ar) in enumerate(col_ranges_rel): - if al <= w_center_x < ar: - result[ci].append(w) - assigned = True - break + # Primary: overlap-based matching — assign to column with most overlap. + # This is more robust than center-based for narrow columns (page_ref) + # where the last character's center may fall into the next column. + best_col = -1 + best_overlap = 0 + for ci, col in enumerate(columns): + col_left_rel = col.x - left_x + col_right_rel = col_left_rel + col.width + overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel)) + if overlap > best_overlap: + best_overlap = overlap + best_col = ci - if not assigned: - # Fallback: nearest column center - best_col = 0 - col_left_0 = columns[0].x - left_x - best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2)) - for ci in range(1, n): - col_left = columns[ci].x - left_x - dist = abs(w_center_x - (col_left + columns[ci].width / 2)) - if dist < best_dist: - best_dist = dist - best_col = ci + if best_col >= 0 and best_overlap > 0: result[best_col].append(w) + else: + # Fallback: center-based range matching + assigned = False + for ci, (al, ar) in enumerate(col_ranges_rel): + if al <= w_center_x < ar: + result[ci].append(w) + assigned = True + break + + if not assigned: + # Last resort: nearest column center + best_col = 0 + col_left_0 = columns[0].x - left_x + best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2)) + for ci in range(1, n): + col_left = columns[ci].x - left_x + dist = abs(w_center_x - (col_left + columns[ci].width / 2)) + if dist < best_dist: + best_dist = dist + best_col = ci + result[best_col].append(w) return result @@ -4115,6 +4133,18 @@ def build_cell_grid( f"empty cells in column {col_idx}" ) + # Post-OCR: remove rows where ALL cells are empty (inter-row gaps + # that had stray Tesseract artifacts giving word_count > 0). + rows_with_text: set = set() + for cell in cells: + if cell['text'].strip(): + rows_with_text.add(cell['row_index']) + before_filter = len(cells) + cells = [c for c in cells if c['row_index'] in rows_with_text] + empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1) + if empty_rows_removed > 0: + logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR") + logger.info(f"build_cell_grid: {len(cells)} cells from " f"{len(content_rows)} rows × {len(relevant_cols)} columns, " f"engine={engine_name}") diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 2d92727..a989c4e 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1291,6 +1291,18 @@ async def _word_stream_generator( if columns_meta is None: columns_meta = [] + # Post-OCR: remove rows where ALL cells are empty (inter-row gaps + # that had stray Tesseract artifacts giving word_count > 0). + rows_with_text: set = set() + for c in all_cells: + if c.get("text", "").strip(): + rows_with_text.add(c["row_index"]) + before_filter = len(all_cells) + all_cells = [c for c in all_cells if c["row_index"] in rows_with_text] + empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1) + if empty_rows_removed > 0: + logger.info(f"SSE: removed {empty_rows_removed} all-empty rows after OCR") + used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine word_result = {