Fix word ordering in cell text rebuild (Steps 4c, 4d, 5i)
Cell text was rebuilt with a naive (top, left) sort after word_boxes were removed in Steps 4c/4d/5i. This produced the wrong word order when words on the same visual line had slightly different top values (1-6 px): a word further right with a slightly smaller top sorted ahead of the words to its left. The rebuild now uses _words_to_reading_order_text(), which groups words into visual lines by y-tolerance before sorting by x within each line, matching the initial cell text construction in _build_cells. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -23,7 +23,7 @@ from fastapi import APIRouter, HTTPException, Request
|
||||
from cv_box_detect import detect_boxes, split_page_into_zones
|
||||
from cv_vocab_types import PageZone
|
||||
from cv_color_detect import detect_word_colors, recover_colored_text
|
||||
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa
|
||||
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text
|
||||
from cv_words_first import _cluster_rows, _build_cells
|
||||
from ocr_pipeline_session_store import (
|
||||
get_session_db,
|
||||
@@ -1850,11 +1850,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
if len(filtered) < len(wbs):
|
||||
removed_oversized += len(wbs) - len(filtered)
|
||||
cell["word_boxes"] = filtered
|
||||
cell["text"] = " ".join(
|
||||
wb.get("text", "").strip()
|
||||
for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
|
||||
if wb.get("text", "").strip()
|
||||
)
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
if removed_oversized:
|
||||
# Remove cells that became empty after oversized removal
|
||||
z["cells"] = [c for c in cells if c.get("word_boxes")]
|
||||
@@ -1879,11 +1875,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
if len(filtered) < len(wbs):
|
||||
removed_pipes += len(wbs) - len(filtered)
|
||||
cell["word_boxes"] = filtered
|
||||
cell["text"] = " ".join(
|
||||
wb.get("text", "").strip()
|
||||
for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
|
||||
if wb.get("text", "").strip()
|
||||
)
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
# Remove cells that became empty after pipe removal
|
||||
if removed_pipes:
|
||||
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
|
||||
@@ -2316,11 +2308,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
bullet_removed += len(to_remove)
|
||||
filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
|
||||
cell["word_boxes"] = filtered
|
||||
cell["text"] = " ".join(
|
||||
wb.get("text", "").strip()
|
||||
for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
|
||||
if wb.get("text", "").strip()
|
||||
)
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
|
||||
# Remove cells that became empty after bullet removal
|
||||
if bullet_removed:
|
||||
|
||||
Reference in New Issue
Block a user