fix: normalize word_box order to reading order for frontend display (Step 5j)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s

The frontend renders colored cells from the word_boxes array order,
not from cell.text. After post-processing steps (5i bullet removal, etc.),
word_boxes could remain in their original insertion order instead of
left-to-right reading order. Step 5j now explicitly sorts word_boxes
using _group_words_into_lines before the result is built.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 19:21:37 +01:00
parent bacbfd88f1
commit f31a7175a2
2 changed files with 66 additions and 1 deletions

View File

@@ -1046,3 +1046,50 @@ class TestBlueBulletFilter:
# so it doesn't matter which one is removed — the text stays correct.
# The key thing is ONE of the duplicates is removed.
assert True # Removing either duplicate is correct
# ---------------------------------------------------------------------------
# Word_box reading order normalisation (Step 5j)
# ---------------------------------------------------------------------------
class TestWordBoxReadingOrder:
    """Check that word_boxes end up in reading order (Step 5j).

    Each test feeds a list of word-box dicts to ``_group_words_into_lines``
    and flattens the returned lines into one list, then checks the order of
    the ``text`` values in that flattened list.
    """

    @staticmethod
    def _box(text, left, top, width, height):
        """Build one word-box dict with the keys used throughout these tests."""
        return {
            "text": text,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
        }

    @staticmethod
    def _reading_order(word_boxes, tolerance):
        """Group *word_boxes* into lines and return them as one flat list.

        *tolerance* is forwarded as ``y_tolerance_px``.
        """
        # Imported inside the call (as the tests always did) so that an import
        # problem fails the individual test instead of breaking collection.
        from cv_ocr_engines import _group_words_into_lines

        flattened = []
        for line in _group_words_into_lines(word_boxes, y_tolerance_px=tolerance):
            flattened.extend(line)
        return flattened

    def test_single_line_sorted_by_left(self):
        """Boxes sharing one ``top`` value come back ordered by ``left``."""
        shuffled = [
            self._box("up", 376, 264, 22, 19),
            self._box("tie", 284, 264, 23, 14),
            self._box("sb/sth", 309, 264, 57, 20),
        ]

        ordered = self._reading_order(shuffled, 15)

        texts = [box["text"] for box in ordered]
        assert texts == ["tie", "sb/sth", "up"]

    def test_two_lines_preserves_line_order(self):
        """With two distinct ``top`` values, the upper line comes first."""
        shuffled = [
            self._box("b)", 100, 290, 20, 15),
            self._box("cat", 50, 264, 30, 15),
            self._box("dog", 100, 264, 30, 15),
            self._box("a)", 50, 290, 20, 15),
        ]

        ordered = self._reading_order(shuffled, 10)

        texts = [box["text"] for box in ordered]
        assert texts == ["cat", "dog", "a)", "b)"]

    def test_already_sorted_unchanged(self):
        """Input that is already in reading order keeps its order and objects."""
        presorted = [
            self._box("tie", 284, 264, 23, 14),
            self._box("sb/sth", 309, 264, 57, 20),
            self._box("up", 376, 264, 22, 19),
        ]

        ordered = self._reading_order(presorted, 15)

        texts = [box["text"] for box in ordered]
        assert texts == ["tie", "sb/sth", "up"]
        # The very same dict objects must come back, position by position.
        assert len(ordered) == len(presorted)
        assert all(got is given for got, given in zip(ordered, presorted))