From aae8a96aa2ca0d6560819eae5591271493cff0e5 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 17 Mar 2026 10:41:30 +0100 Subject: [PATCH] fix: sort word_boxes in reading order (Y-grouped, then X-sorted) Words on the same visual line can have slightly different top values (1-6px). Sorting by (top, left) produced wrong word order in the frontend display. Now uses _group_words_into_lines to group by Y proximity first, then sort by X within each line. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_words_first.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/klausur-service/backend/cv_words_first.py b/klausur-service/backend/cv_words_first.py index b62b547..723f62c 100644 --- a/klausur-service/backend/cv_words_first.py +++ b/klausur-service/backend/cv_words_first.py @@ -191,8 +191,16 @@ def _build_cells( # but the overlay slide mechanism expects one box per word. Split multi-word # boxes into individual word positions proportional to character length. # Also split at "[" boundaries (IPA patterns like "badge[bxd3]"). + # + # Sort in reading order: group by Y (same visual line), then sort by X. + # Simple (top, left) sort fails when words on the same line have slightly + # different top values (1-6px), causing wrong word order. + y_tol_wb = max(10, int(bh * 0.4)) + reading_lines = _group_words_into_lines(cell_words, y_tolerance_px=y_tol_wb) + ordered_cell_words = [w for line in reading_lines for w in line] + word_boxes = [] - for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])): + for w in ordered_cell_words: raw_text = w.get('text', '').strip() # Split by whitespace, at "[" boundaries (IPA), and after leading "!" # e.g. "badge[bxd3]" → ["badge", "[bxd3]"]