Fix word ordering in cell text rebuild (Steps 4c, 4d, 5i)

Cell text was rebuilt with a naive (top, left) sort after word_boxes were
removed in Steps 4c/4d/5i. This produced the wrong word order when words on
the same visual line had slightly different top values (1-6px), because a
word sitting even one pixel higher sorted ahead of the words to its left.

The rebuild now uses _words_to_reading_order_text(), which groups words into
visual lines by y-tolerance before sorting by x within each line, matching
the initial cell text construction in _build_cells.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 18:45:33 +01:00
parent 2c63beff04
commit bacbfd88f1

View File

@@ -23,7 +23,7 @@ from fastapi import APIRouter, HTTPException, Request
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_vocab_types import PageZone
from cv_color_detect import detect_word_colors, recover_colored_text
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text
from cv_words_first import _cluster_rows, _build_cells
from ocr_pipeline_session_store import (
get_session_db,
@@ -1850,11 +1850,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if len(filtered) < len(wbs):
removed_oversized += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = " ".join(
wb.get("text", "").strip()
for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
if wb.get("text", "").strip()
)
cell["text"] = _words_to_reading_order_text(filtered)
if removed_oversized:
# Remove cells that became empty after oversized removal
z["cells"] = [c for c in cells if c.get("word_boxes")]
@@ -1879,11 +1875,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if len(filtered) < len(wbs):
removed_pipes += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = " ".join(
wb.get("text", "").strip()
for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
if wb.get("text", "").strip()
)
cell["text"] = _words_to_reading_order_text(filtered)
# Remove cells that became empty after pipe removal
if removed_pipes:
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
@@ -2316,11 +2308,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
bullet_removed += len(to_remove)
filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
cell["word_boxes"] = filtered
cell["text"] = " ".join(
wb.get("text", "").strip()
for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
if wb.get("text", "").strip()
)
cell["text"] = _words_to_reading_order_text(filtered)
# Remove cells that became empty after bullet removal
if bullet_removed: