From 40a77a82f63ed37602d34e718f2dcc197e323f1f Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Mon, 2 Mar 2026 12:53:56 +0100
Subject: [PATCH] fix(ocr-pipeline): use midpoint boundaries for column word
 assignment

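Replace the containment-with-padding approach with midpoint-based column
ranges. For adjacent columns, the assignment boundary is the midpoint of
the gap between the left column's right edge and the right column's left
edge (Voronoi-style). This prevents the padding overlap that caused words
near column borders (e.g. "We" at the start of example sentences) to be
assigned to the preceding column. The first column's range starts at 0,
and the last column's range extends to the row width plus a margin of
100 so that all rightmost text is captured. Words whose center falls
outside every range fall back to the nearest column center.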

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 69 ++++++++++++--------
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 4dee8c5..a53d3af 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3093,43 +3093,56 @@ def _assign_row_words_to_columns(
 
     left_x = row.x  # content ROI left (absolute)
 
-    # Pre-compute column bounds and centers in relative coordinates
-    col_bounds_rel = []  # (left, right, center) per column
-    for col in columns:
+    # Build non-overlapping column assignment ranges using midpoints.
+    # For adjacent columns, the boundary is the midpoint between them.
+    # This prevents words near column borders from being assigned to
+    # the wrong column (e.g. "We" at the start of an example sentence
+    # being stolen by the preceding DE column).
+    n = len(columns)
+    col_ranges_rel = []  # (assign_left, assign_right) per column
+    for ci, col in enumerate(columns):
         col_left_rel = col.x - left_x
         col_right_rel = col_left_rel + col.width
-        col_center_rel = col_left_rel + col.width / 2
-        col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel))
 
-    # Padding: allow words slightly outside column bounds (e.g. due to
-    # imprecise column detection).  Use 15% of average column width.
-    avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100
-    pad = avg_col_w * 0.15
+        # Left boundary: midpoint to previous column, or 0
+        if ci == 0:
+            assign_left = 0
+        else:
+            prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
+            assign_left = (prev_right + col_left_rel) / 2
+
+        # Right boundary: midpoint to next column, or row width + 100 for the last column
+        if ci == n - 1:
+            assign_right = row.width + 100  # generous for last column
+        else:
+            next_left = columns[ci + 1].x - left_x
+            assign_right = (col_right_rel + next_left) / 2
+
+        col_ranges_rel.append((assign_left, assign_right))
 
     for w in row.words:
         w_center_x = w['left'] + w['width'] / 2
 
-        # Pass 1: containment check (word center within column bounds + pad)
-        contained_col = -1
-        for ci, (cl, cr, _) in enumerate(col_bounds_rel):
-            if (cl - pad) <= w_center_x <= (cr + pad):
-                contained_col = ci
+        # Find which column range contains this word
+        assigned = False
+        for ci, (al, ar) in enumerate(col_ranges_rel):
+            if al <= w_center_x < ar:
+                result[ci].append(w)
+                assigned = True
                 break
 
-        if contained_col >= 0:
-            result[contained_col].append(w)
-            continue
-
-        # Pass 2: nearest center fallback
-        best_col = 0
-        best_dist = abs(w_center_x - col_bounds_rel[0][2])
-        for ci in range(1, len(columns)):
-            dist = abs(w_center_x - col_bounds_rel[ci][2])
-            if dist < best_dist:
-                best_dist = dist
-                best_col = ci
-
-        result[best_col].append(w)
+        if not assigned:
+            # Fallback: nearest column center
+            best_col = 0
+            col_left_0 = columns[0].x - left_x
+            best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
+            for ci in range(1, n):
+                col_left = columns[ci].x - left_x
+                dist = abs(w_center_x - (col_left + columns[ci].width / 2))
+                if dist < best_dist:
+                    best_dist = dist
+                    best_col = ci
+            result[best_col].append(w)
 
     return result