Fix word ordering in cell text rebuild (Steps 4c, 4d, 5i)

After word_boxes were removed in Steps 4c, 4d, and 5i, the cell text was rebuilt with a naive (top, left) sort. This produced the wrong word order when words on the same visual line had slightly different top values (1-6px). The rebuild now uses _words_to_reading_order_text(), which groups words into visual lines by y-tolerance and then sorts by x within each line, matching the initial cell text construction in _build_cells.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 18:45:33 +01:00
parent 2c63beff04
commit bacbfd88f1
1 changed file with 4 additions and 16 deletions
@@ -23,7 +23,7 @@ from fastapi import APIRouter, HTTPException, Request
 from cv_box_detect import detect_boxes, split_page_into_zones
 from cv_vocab_types import PageZone
 from cv_color_detect import detect_word_colors, recover_colored_text
-from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa
+from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text
 from cv_words_first import _cluster_rows, _build_cells
 from ocr_pipeline_session_store import (
    get_session_db,
@@ -1850,11 +1850,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
            if len(filtered) < len(wbs):
                removed_oversized += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
-                cell["text"] = " ".join(
-                    wb.get("text", "").strip()
-                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
-                    if wb.get("text", "").strip()
-                )
+                cell["text"] = _words_to_reading_order_text(filtered)
        if removed_oversized:
            # Remove cells that became empty after oversized removal
            z["cells"] = [c for c in cells if c.get("word_boxes")]
@@ -1879,11 +1875,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
            if len(filtered) < len(wbs):
                removed_pipes += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
-                cell["text"] = " ".join(
-                    wb.get("text", "").strip()
-                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
-                    if wb.get("text", "").strip()
-                )
+                cell["text"] = _words_to_reading_order_text(filtered)
        # Remove cells that became empty after pipe removal
        if removed_pipes:
            z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
@@ -2316,11 +2308,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
-                cell["text"] = " ".join(
-                    wb.get("text", "").strip()
-                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
-                    if wb.get("text", "").strip()
-                )
+                cell["text"] = _words_to_reading_order_text(filtered)

    # Remove cells that became empty after bullet removal
    if bullet_removed: