diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 384d56c..c7f25f3 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -23,7 +23,7 @@ from fastapi import APIRouter, HTTPException, Request from cv_box_detect import detect_boxes, split_page_into_zones from cv_vocab_types import PageZone from cv_color_detect import detect_word_colors, recover_colored_text -from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text +from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines from cv_words_first import _cluster_rows, _build_cells from ocr_pipeline_session_store import ( get_session_db, @@ -2317,6 +2317,24 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if (c.get("word_boxes") or c.get("text", "").strip())] logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed) + # 5j. Normalise word_box order to reading order (group by Y, sort by X). + # The frontend renders colored cells from word_boxes array order + # (GridTable.tsx), so they MUST be in left-to-right reading order. + wb_reordered = 0 + for z in zones_data: + for cell in z.get("cells", []): + wbs = cell.get("word_boxes") or [] + if len(wbs) < 2: + continue + lines = _group_words_into_lines(wbs, y_tolerance_px=15) + sorted_wbs = [w for line in lines for w in line] + # Check if order actually changed + if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]: + cell["word_boxes"] = sorted_wbs + wb_reordered += 1 + if wb_reordered: + logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered) + duration = time.time() - t0 # 6. 
# ---------------------------------------------------------------------------
# Word_box reading order normalisation (Step 5j)
# ---------------------------------------------------------------------------

class TestWordBoxReadingOrder:
    """Check that word_boxes end up in reading order for frontend rendering.

    Every test hands a list of word-box dicts to ``_group_words_into_lines``
    and concatenates the returned lines back into a single flat list, then
    inspects the order of that list.
    """

    @staticmethod
    def _box(text, left, top, width, height):
        """Build one word-box dict with the keys used throughout these tests."""
        return {"text": text, "left": left, "top": top, "width": width, "height": height}

    @staticmethod
    def _flatten_to_reading_order(word_boxes, tolerance):
        """Group *word_boxes* into lines and concatenate the lines in order."""
        # Imported lazily so the module under test is only loaded when a
        # test in this class actually runs.
        from cv_ocr_engines import _group_words_into_lines

        flattened = []
        for line in _group_words_into_lines(word_boxes, y_tolerance_px=tolerance):
            flattened.extend(line)
        return flattened

    def test_single_line_sorted_by_left(self):
        """Boxes sharing one Y position come back ordered by their left edge."""
        boxes = [
            self._box("up", 376, 264, 22, 19),
            self._box("tie", 284, 264, 23, 14),
            self._box("sb/sth", 309, 264, 57, 20),
        ]
        ordered = self._flatten_to_reading_order(boxes, 15)
        texts = [box["text"] for box in ordered]
        assert texts == ["tie", "sb/sth", "up"]

    def test_two_lines_preserves_line_order(self):
        """With two Y lines, the upper line is emitted before the lower one."""
        boxes = [
            self._box("b)", 100, 290, 20, 15),
            self._box("cat", 50, 264, 30, 15),
            self._box("dog", 100, 264, 30, 15),
            self._box("a)", 50, 290, 20, 15),
        ]
        ordered = self._flatten_to_reading_order(boxes, 10)
        texts = [box["text"] for box in ordered]
        assert texts == ["cat", "dog", "a)", "b)"]

    def test_already_sorted_unchanged(self):
        """Input that is already in reading order is returned in that order."""
        boxes = [
            self._box("tie", 284, 264, 23, 14),
            self._box("sb/sth", 309, 264, 57, 20),
            self._box("up", 376, 264, 22, 19),
        ]
        ordered = self._flatten_to_reading_order(boxes, 15)
        texts = [box["text"] for box in ordered]
        assert texts == ["tie", "sb/sth", "up"]
        # The very same dict objects must come back, position for position.
        assert len(ordered) == len(boxes)
        assert all(out is src for out, src in zip(ordered, boxes))