fix: normalize word_box order to reading order for frontend display (Step 5j)

The frontend renders colored cells from the order of the word_boxes array, not from cell.text. After post-processing steps (e.g. 5i bullet removal), word_boxes could remain in their original insertion order instead of left-to-right reading order. Step 5j now explicitly sorts each cell's word_boxes using _group_words_into_lines before the result is built.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 19:21:37 +01:00
parent bacbfd88f1
commit f31a7175a2
2 changed files with 66 additions and 1 deletions
@@ -23,7 +23,7 @@ from fastapi import APIRouter, HTTPException, Request
 from cv_box_detect import detect_boxes, split_page_into_zones
 from cv_vocab_types import PageZone
 from cv_color_detect import detect_word_colors, recover_colored_text
-from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text
+from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines
 from cv_words_first import _cluster_rows, _build_cells
 from ocr_pipeline_session_store import (
    get_session_db,
@@ -2317,6 +2317,24 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
    # 5j. Normalise word_box order to reading order (group by Y, sort by X).
    # The frontend renders colored cells from word_boxes array order
    # (GridTable.tsx), so they MUST be in left-to-right reading order.
    wb_reordered = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue
            lines = _group_words_into_lines(wbs, y_tolerance_px=15)
            sorted_wbs = [w for line in lines for w in line]
            # Check if order actually changed
            if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
                cell["word_boxes"] = sorted_wbs
                wb_reordered += 1
    if wb_reordered:
        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
    duration = time.time() - t0
    # 6. Build result
@@ -1046,3 +1046,50 @@ class TestBlueBulletFilter:
        # so it doesn't matter which one is removed — the text stays correct.
        # The key thing is ONE of the duplicates is removed.
        assert True  # Removing either duplicate is correct
 # ---------------------------------------------------------------------------
 # Word_box reading order normalisation (Step 5j)
 # ---------------------------------------------------------------------------
 class TestWordBoxReadingOrder:
    """Check the reading-order flattening that Step 5j applies to word_boxes.

    Every test hands a list of word-box dicts to ``_group_words_into_lines``
    and flattens the returned lines back into a single list, which is how
    Step 5j rebuilds ``cell["word_boxes"]`` before the result is built.
    """

    @staticmethod
    def _box(text, left, top, width, height):
        """Build one word-box dict with the keys used by these fixtures."""
        return {
            "text": text,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
        }

    @staticmethod
    def _reading_order(word_boxes, tolerance):
        """Group *word_boxes* into lines and flatten them into one list.

        The import is kept local, as in each individual test, so that the
        module is only required when a test actually runs.
        """
        from cv_ocr_engines import _group_words_into_lines

        grouped = _group_words_into_lines(word_boxes, y_tolerance_px=tolerance)
        return [box for row in grouped for box in row]

    def test_single_line_sorted_by_left(self):
        """Boxes sharing one ``top`` value come back ordered by ``left``."""
        boxes = [
            self._box("up", 376, 264, 22, 19),
            self._box("tie", 284, 264, 23, 14),
            self._box("sb/sth", 309, 264, 57, 20),
        ]
        ordered = self._reading_order(boxes, 15)
        texts = [box["text"] for box in ordered]
        assert texts == ["tie", "sb/sth", "up"]

    def test_two_lines_preserves_line_order(self):
        """Two lines of boxes: the upper line is emitted before the lower."""
        boxes = [
            self._box("b)", 100, 290, 20, 15),
            self._box("cat", 50, 264, 30, 15),
            self._box("dog", 100, 264, 30, 15),
            self._box("a)", 50, 290, 20, 15),
        ]
        ordered = self._reading_order(boxes, 10)
        texts = [box["text"] for box in ordered]
        assert texts == ["cat", "dog", "a)", "b)"]

    def test_already_sorted_unchanged(self):
        """Input already in reading order is returned in that same order."""
        boxes = [
            self._box("tie", 284, 264, 23, 14),
            self._box("sb/sth", 309, 264, 57, 20),
            self._box("up", 376, 264, 22, 19),
        ]
        ordered = self._reading_order(boxes, 15)
        texts = [box["text"] for box in ordered]
        assert texts == ["tie", "sb/sth", "up"]
        # Identity check: the very same dict objects, in the same positions.
        # Step 5j uses this id()-based comparison to detect a changed order.
        assert [id(box) for box in ordered] == [id(box) for box in boxes]