From 606bef059114b37f8473b266f38c2afc4398662c Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Tue, 3 Mar 2026 11:00:29 +0100
Subject: [PATCH] fix(ocr-pipeline): overlap-based word assignment and empty
 row filtering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

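1. Word-to-column assignment now uses overlap-based matching instead of
   center-point matching: each word is assigned to the column it
   overlaps most. This fixes narrow page_ref columns losing their last
   digit (e.g. "p.59" → "p.5") when the digit's center falls slightly
   past the midpoint boundary into the next column. Center-based range
   matching and nearest-column-center remain as fallbacks for words
   that overlap no column.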

2. Post-OCR empty row filter: rows where ALL cells have empty or
   whitespace-only text are removed after OCR, both in build_cell_grid
   and in the SSE word stream generator. This catches inter-row gaps
   where stray Tesseract artifacts produced word_count > 0 but no
   actual content.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 68 ++++++++++++++------
 klausur-service/backend/ocr_pipeline_api.py  | 12 ++++
 2 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index f0092ec..5452f31 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3519,28 +3519,46 @@ def _assign_row_words_to_columns(
         col_ranges_rel.append((assign_left, assign_right))
 
     for w in row.words:
-        w_center_x = w['left'] + w['width'] / 2
+        w_left = w['left']
+        w_right = w_left + w['width']
+        w_center_x = w_left + w['width'] / 2
 
-        # Find which column range contains this word
-        assigned = False
-        for ci, (al, ar) in enumerate(col_ranges_rel):
-            if al <= w_center_x < ar:
-                result[ci].append(w)
-                assigned = True
-                break
+        # Primary: overlap-based matching — assign to column with most overlap.
+        # This is more robust than center-based for narrow columns (page_ref)
+        # where the last character's center may fall into the next column.
+        best_col = -1
+        best_overlap = 0
+        for ci, col in enumerate(columns):
+            col_left_rel = col.x - left_x
+            col_right_rel = col_left_rel + col.width
+            overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
+            if overlap > best_overlap:
+                best_overlap = overlap
+                best_col = ci
 
-        if not assigned:
-            # Fallback: nearest column center
-            best_col = 0
-            col_left_0 = columns[0].x - left_x
-            best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
-            for ci in range(1, n):
-                col_left = columns[ci].x - left_x
-                dist = abs(w_center_x - (col_left + columns[ci].width / 2))
-                if dist < best_dist:
-                    best_dist = dist
-                    best_col = ci
+        if best_col >= 0 and best_overlap > 0:
             result[best_col].append(w)
+        else:
+            # Fallback: center-based range matching
+            assigned = False
+            for ci, (al, ar) in enumerate(col_ranges_rel):
+                if al <= w_center_x < ar:
+                    result[ci].append(w)
+                    assigned = True
+                    break
+
+            if not assigned:
+                # Last resort: nearest column center
+                best_col = 0
+                col_left_0 = columns[0].x - left_x
+                best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
+                for ci in range(1, n):
+                    col_left = columns[ci].x - left_x
+                    dist = abs(w_center_x - (col_left + columns[ci].width / 2))
+                    if dist < best_dist:
+                        best_dist = dist
+                        best_col = ci
+                result[best_col].append(w)
 
     return result
 
@@ -4115,6 +4133,18 @@ def build_cell_grid(
                 f"empty cells in column {col_idx}"
             )
 
+    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
+    # that had stray Tesseract artifacts giving word_count > 0).
+    rows_with_text: set = set()
+    for cell in cells:
+        if cell['text'].strip():
+            rows_with_text.add(cell['row_index'])
+    before_filter = len(cells)
+    cells = [c for c in cells if c['row_index'] in rows_with_text]
+    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
+    if empty_rows_removed > 0:
+        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")
+
     logger.info(f"build_cell_grid: {len(cells)} cells from "
                 f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                 f"engine={engine_name}")
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index 2d92727..a989c4e 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -1291,6 +1291,18 @@ async def _word_stream_generator(
     if columns_meta is None:
         columns_meta = []
 
+    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
+    # that had stray Tesseract artifacts giving word_count > 0).
+    rows_with_text: set = set()
+    for c in all_cells:
+        if c.get("text", "").strip():
+            rows_with_text.add(c["row_index"])
+    before_filter = len(all_cells)
+    all_cells = [c for c in all_cells if c["row_index"] in rows_with_text]
+    empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1)
+    if empty_rows_removed > 0:
+        logger.info(f"SSE: removed {empty_rows_removed} all-empty rows after OCR")
+
     used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine
 
     word_result = {