fix(ocr-pipeline): use midpoint boundaries for column word assignment
Replace containment-with-padding approach with midpoint-based column ranges. For adjacent columns, the assignment boundary is the midpoint between them (Voronoi-style). This prevents padding overlap where words near column borders (e.g. "We" at the start of example sentences) were assigned to the preceding column. The last column extends generously to capture all rightmost text. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3093,43 +3093,56 @@ def _assign_row_words_to_columns(
|
|||||||
|
|
||||||
left_x = row.x # content ROI left (absolute)
|
left_x = row.x # content ROI left (absolute)
|
||||||
|
|
||||||
# Pre-compute column bounds and centers in relative coordinates
|
# Build non-overlapping column assignment ranges using midpoints.
|
||||||
col_bounds_rel = [] # (left, right, center) per column
|
# For adjacent columns, the boundary is the midpoint between them.
|
||||||
for col in columns:
|
# This prevents words near column borders from being assigned to
|
||||||
|
# the wrong column (e.g. "We" at the start of an example sentence
|
||||||
|
# being stolen by the preceding DE column).
|
||||||
|
n = len(columns)
|
||||||
|
col_ranges_rel = [] # (assign_left, assign_right) per column
|
||||||
|
for ci, col in enumerate(columns):
|
||||||
col_left_rel = col.x - left_x
|
col_left_rel = col.x - left_x
|
||||||
col_right_rel = col_left_rel + col.width
|
col_right_rel = col_left_rel + col.width
|
||||||
col_center_rel = col_left_rel + col.width / 2
|
|
||||||
col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel))
|
|
||||||
|
|
||||||
# Padding: allow words slightly outside column bounds (e.g. due to
|
# Left boundary: midpoint to previous column, or 0
|
||||||
# imprecise column detection). Use 15% of average column width.
|
if ci == 0:
|
||||||
avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100
|
assign_left = 0
|
||||||
pad = avg_col_w * 0.15
|
else:
|
||||||
|
prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
|
||||||
|
assign_left = (prev_right + col_left_rel) / 2
|
||||||
|
|
||||||
|
# Right boundary: midpoint to next column, or row width plus a fixed margin for the last column
|
||||||
|
if ci == n - 1:
|
||||||
|
assign_right = row.width + 100 # generous for last column
|
||||||
|
else:
|
||||||
|
next_left = columns[ci + 1].x - left_x
|
||||||
|
assign_right = (col_right_rel + next_left) / 2
|
||||||
|
|
||||||
|
col_ranges_rel.append((assign_left, assign_right))
|
||||||
|
|
||||||
for w in row.words:
|
for w in row.words:
|
||||||
w_center_x = w['left'] + w['width'] / 2
|
w_center_x = w['left'] + w['width'] / 2
|
||||||
|
|
||||||
# Pass 1: containment check (word center within column bounds + pad)
|
# Find which column range contains this word
|
||||||
contained_col = -1
|
assigned = False
|
||||||
for ci, (cl, cr, _) in enumerate(col_bounds_rel):
|
for ci, (al, ar) in enumerate(col_ranges_rel):
|
||||||
if (cl - pad) <= w_center_x <= (cr + pad):
|
if al <= w_center_x < ar:
|
||||||
contained_col = ci
|
result[ci].append(w)
|
||||||
|
assigned = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if contained_col >= 0:
|
if not assigned:
|
||||||
result[contained_col].append(w)
|
# Fallback: nearest column center
|
||||||
continue
|
best_col = 0
|
||||||
|
col_left_0 = columns[0].x - left_x
|
||||||
# Pass 2: nearest center fallback
|
best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
|
||||||
best_col = 0
|
for ci in range(1, n):
|
||||||
best_dist = abs(w_center_x - col_bounds_rel[0][2])
|
col_left = columns[ci].x - left_x
|
||||||
for ci in range(1, len(columns)):
|
dist = abs(w_center_x - (col_left + columns[ci].width / 2))
|
||||||
dist = abs(w_center_x - col_bounds_rel[ci][2])
|
if dist < best_dist:
|
||||||
if dist < best_dist:
|
best_dist = dist
|
||||||
best_dist = dist
|
best_col = ci
|
||||||
best_col = ci
|
result[best_col].append(w)
|
||||||
|
|
||||||
result[best_col].append(w)
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user