diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 4dee8c5..a53d3af 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3093,43 +3093,56 @@ def _assign_row_words_to_columns( left_x = row.x # content ROI left (absolute) - # Pre-compute column bounds and centers in relative coordinates - col_bounds_rel = [] # (left, right, center) per column - for col in columns: + # Build non-overlapping column assignment ranges using midpoints. + # For adjacent columns, the boundary is the midpoint between them. + # This prevents words near column borders from being assigned to + # the wrong column (e.g. "We" at the start of an example sentence + # being stolen by the preceding DE column). + n = len(columns) + col_ranges_rel = [] # (assign_left, assign_right) per column + for ci, col in enumerate(columns): col_left_rel = col.x - left_x col_right_rel = col_left_rel + col.width - col_center_rel = col_left_rel + col.width / 2 - col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel)) - # Padding: allow words slightly outside column bounds (e.g. due to - # imprecise column detection). Use 15% of average column width. - avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100 - pad = avg_col_w * 0.15 + # Left boundary: midpoint to previous column, or 0 + if ci == 0: + assign_left = 0 + else: + prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width + assign_left = (prev_right + col_left_rel) / 2 + + # Right boundary: midpoint to next column, or infinity (row width) + if ci == n - 1: + assign_right = row.width + 100 # generous for last column + else: + next_left = columns[ci + 1].x - left_x + assign_right = (col_right_rel + next_left) / 2 + + col_ranges_rel.append((assign_left, assign_right)) for w in row.words: w_center_x = w['left'] + w['width'] / 2 - # Pass 1: containment check (word center within column bounds + pad) - contained_col = -1 - for ci, (cl, cr, _) in enumerate(col_bounds_rel): - if (cl - pad) <= w_center_x <= (cr + pad): - contained_col = ci + # Find which column range contains this word + assigned = False + for ci, (al, ar) in enumerate(col_ranges_rel): + if al <= w_center_x < ar: + result[ci].append(w) + assigned = True break - if contained_col >= 0: - result[contained_col].append(w) - continue - - # Pass 2: nearest center fallback - best_col = 0 - best_dist = abs(w_center_x - col_bounds_rel[0][2]) - for ci in range(1, len(columns)): - dist = abs(w_center_x - col_bounds_rel[ci][2]) - if dist < best_dist: - best_dist = dist - best_col = ci - - result[best_col].append(w) + if not assigned: + # Fallback: nearest column center + best_col = 0 + col_left_0 = columns[0].x - left_x + best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2)) + for ci in range(1, n): + col_left = columns[ci].x - left_x + dist = abs(w_center_x - (col_left + columns[ci].width / 2)) + if dist < best_dist: + best_dist = dist + best_col = ci + result[best_col].append(w) return result