From aae8a96aa2ca0d6560819eae5591271493cff0e5 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Tue, 17 Mar 2026 10:41:30 +0100
Subject: [PATCH] fix: sort word_boxes in reading order (Y-grouped, then
 X-sorted)

Words on the same visual line can have slightly different top values
(1-6px), so sorting by (top, left) produced the wrong word order in
the frontend display. word_boxes are now built via
_group_words_into_lines, which groups words by Y proximity first
(y_tolerance_px = max(10, int(bh * 0.4))), then sorts by X within
each line.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_words_first.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/klausur-service/backend/cv_words_first.py b/klausur-service/backend/cv_words_first.py
index b62b547..723f62c 100644
--- a/klausur-service/backend/cv_words_first.py
+++ b/klausur-service/backend/cv_words_first.py
@@ -191,8 +191,16 @@ def _build_cells(
         # but the overlay slide mechanism expects one box per word. Split multi-word
         # boxes into individual word positions proportional to character length.
         # Also split at "[" boundaries (IPA patterns like "badge[bxd3]").
+        #
+        # Sort in reading order: group by Y (same visual line), then sort by X.
+        # Simple (top, left) sort fails when words on the same line have slightly
+        # different top values (1-6px), causing wrong word order.
+        y_tol_wb = max(10, int(bh * 0.4))
+        reading_lines = _group_words_into_lines(cell_words, y_tolerance_px=y_tol_wb)
+        ordered_cell_words = [w for line in reading_lines for w in line]
+
         word_boxes = []
-        for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
+        for w in ordered_cell_words:
             raw_text = w.get('text', '').strip()
             # Split by whitespace, at "[" boundaries (IPA), and after leading "!"
             # e.g. "badge[bxd3]" → ["badge", "[bxd3]"]