fix(ocr-pipeline): use midpoint boundaries for column word assignment

Replace the containment-with-padding approach with midpoint-based column
ranges. For adjacent columns, the assignment boundary is the midpoint of
the gap between them (Voronoi-style). This prevents the padding overlap
that caused words near column borders (e.g. "We" at the start of example
sentences) to be assigned to the preceding column. The first column's
range starts at 0, and the last column's range extends to the row width
plus 100 so that it captures all rightmost text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 12:53:56 +01:00
parent 87931c35e4
commit 40a77a82f6

View File

@@ -3093,43 +3093,56 @@ def _assign_row_words_to_columns(
left_x = row.x # content ROI left (absolute)
# Pre-compute column bounds and centers in relative coordinates
col_bounds_rel = [] # (left, right, center) per column
for col in columns:
# Build non-overlapping column assignment ranges using midpoints.
# For adjacent columns, the boundary is the midpoint between them.
# This prevents words near column borders from being assigned to
# the wrong column (e.g. "We" at the start of an example sentence
# being stolen by the preceding DE column).
n = len(columns)
col_ranges_rel = [] # (assign_left, assign_right) per column
for ci, col in enumerate(columns):
col_left_rel = col.x - left_x
col_right_rel = col_left_rel + col.width
col_center_rel = col_left_rel + col.width / 2
col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel))
# Padding: allow words slightly outside column bounds (e.g. due to
# imprecise column detection). Use 15% of average column width.
avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100
pad = avg_col_w * 0.15
# Left boundary: midpoint to previous column, or 0
if ci == 0:
assign_left = 0
else:
prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
assign_left = (prev_right + col_left_rel) / 2
# Right boundary: midpoint to next column, or row width + 100 for the last column
if ci == n - 1:
assign_right = row.width + 100 # generous for last column
else:
next_left = columns[ci + 1].x - left_x
assign_right = (col_right_rel + next_left) / 2
col_ranges_rel.append((assign_left, assign_right))
for w in row.words:
w_center_x = w['left'] + w['width'] / 2
# Pass 1: containment check (word center within column bounds + pad)
contained_col = -1
for ci, (cl, cr, _) in enumerate(col_bounds_rel):
if (cl - pad) <= w_center_x <= (cr + pad):
contained_col = ci
# Find which column range contains this word
assigned = False
for ci, (al, ar) in enumerate(col_ranges_rel):
if al <= w_center_x < ar:
result[ci].append(w)
assigned = True
break
if contained_col >= 0:
result[contained_col].append(w)
continue
# Pass 2: nearest center fallback
best_col = 0
best_dist = abs(w_center_x - col_bounds_rel[0][2])
for ci in range(1, len(columns)):
dist = abs(w_center_x - col_bounds_rel[ci][2])
if dist < best_dist:
best_dist = dist
best_col = ci
result[best_col].append(w)
if not assigned:
# Fallback: nearest column center
best_col = 0
col_left_0 = columns[0].x - left_x
best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
for ci in range(1, n):
col_left = columns[ci].x - left_x
dist = abs(w_center_x - (col_left + columns[ci].width / 2))
if dist < best_dist:
best_dist = dist
best_col = ci
result[best_col].append(w)
return result