fix: normalize word_box order to reading order for frontend display (Step 5j)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
The frontend renders colored cells from the order of the word_boxes array, not from cell.text. After post-processing steps (5i bullet removal, etc.), word_boxes could remain in their original insertion order instead of left-to-right reading order. Step 5j now explicitly sorts word_boxes into reading order (group by Y, sort by X) using _group_words_into_lines before the result is built. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -23,7 +23,7 @@ from fastapi import APIRouter, HTTPException, Request
|
|||||||
from cv_box_detect import detect_boxes, split_page_into_zones
|
from cv_box_detect import detect_boxes, split_page_into_zones
|
||||||
from cv_vocab_types import PageZone
|
from cv_vocab_types import PageZone
|
||||||
from cv_color_detect import detect_word_colors, recover_colored_text
|
from cv_color_detect import detect_word_colors, recover_colored_text
|
||||||
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text
|
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines
|
||||||
from cv_words_first import _cluster_rows, _build_cells
|
from cv_words_first import _cluster_rows, _build_cells
|
||||||
from ocr_pipeline_session_store import (
|
from ocr_pipeline_session_store import (
|
||||||
get_session_db,
|
get_session_db,
|
||||||
@@ -2317,6 +2317,24 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if (c.get("word_boxes") or c.get("text", "").strip())]
|
if (c.get("word_boxes") or c.get("text", "").strip())]
|
||||||
logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
|
logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
|
||||||
|
|
||||||
|
# 5j. Normalise word_box order to reading order (group by Y, sort by X).
|
||||||
|
# The frontend renders colored cells from word_boxes array order
|
||||||
|
# (GridTable.tsx), so they MUST be in left-to-right reading order.
|
||||||
|
wb_reordered = 0
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
wbs = cell.get("word_boxes") or []
|
||||||
|
if len(wbs) < 2:
|
||||||
|
continue
|
||||||
|
lines = _group_words_into_lines(wbs, y_tolerance_px=15)
|
||||||
|
sorted_wbs = [w for line in lines for w in line]
|
||||||
|
# Check if order actually changed
|
||||||
|
if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
|
||||||
|
cell["word_boxes"] = sorted_wbs
|
||||||
|
wb_reordered += 1
|
||||||
|
if wb_reordered:
|
||||||
|
logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
|
||||||
|
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
|
|
||||||
# 6. Build result
|
# 6. Build result
|
||||||
|
|||||||
@@ -1046,3 +1046,50 @@ class TestBlueBulletFilter:
|
|||||||
# so it doesn't matter which one is removed — the text stays correct.
|
# so it doesn't matter which one is removed — the text stays correct.
|
||||||
# The key thing is ONE of the duplicates is removed.
|
# The key thing is ONE of the duplicates is removed.
|
||||||
assert True # Removing either duplicate is correct
|
assert True # Removing either duplicate is correct
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Word_box reading order normalisation (Step 5j)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestWordBoxReadingOrder:
    """Check the reading-order normalisation applied to word_boxes (Step 5j).

    Each test feeds a small list of word-box dicts to
    ``_group_words_into_lines`` and flattens the returned line groups,
    mirroring what Step 5j does before the result is handed to the frontend.
    """

    @staticmethod
    def _reading_order(word_boxes, tolerance):
        """Group *word_boxes* into lines and return them as one flat list."""
        # Imported lazily, as the individual tests originally did.
        from cv_ocr_engines import _group_words_into_lines

        grouped = _group_words_into_lines(word_boxes, y_tolerance_px=tolerance)
        return [box for row in grouped for box in row]

    def test_single_line_sorted_by_left(self):
        """Boxes sharing one Y line come back ordered by their left edge."""
        boxes = [
            {"text": "up", "left": 376, "top": 264, "width": 22, "height": 19},
            {"text": "tie", "left": 284, "top": 264, "width": 23, "height": 14},
            {"text": "sb/sth", "left": 309, "top": 264, "width": 57, "height": 20},
        ]
        ordered = self._reading_order(boxes, 15)
        texts = [box["text"] for box in ordered]
        assert texts == ["tie", "sb/sth", "up"]

    def test_two_lines_preserves_line_order(self):
        """With two Y lines, the upper line is emitted before the lower one."""
        boxes = [
            {"text": "b)", "left": 100, "top": 290, "width": 20, "height": 15},
            {"text": "cat", "left": 50, "top": 264, "width": 30, "height": 15},
            {"text": "dog", "left": 100, "top": 264, "width": 30, "height": 15},
            {"text": "a)", "left": 50, "top": 290, "width": 20, "height": 15},
        ]
        ordered = self._reading_order(boxes, 10)
        texts = [box["text"] for box in ordered]
        assert texts == ["cat", "dog", "a)", "b)"]

    def test_already_sorted_unchanged(self):
        """Input that is already in reading order is returned unchanged."""
        boxes = [
            {"text": "tie", "left": 284, "top": 264, "width": 23, "height": 14},
            {"text": "sb/sth", "left": 309, "top": 264, "width": 57, "height": 20},
            {"text": "up", "left": 376, "top": 264, "width": 22, "height": 19},
        ]
        ordered = self._reading_order(boxes, 15)
        texts = [box["text"] for box in ordered]
        assert texts == ["tie", "sb/sth", "up"]
        # The very same dict objects must come back, position for position.
        assert len(ordered) == len(boxes)
        assert all(got is given for got, given in zip(ordered, boxes))
|
||||||
|
|||||||
Reference in New Issue
Block a user