From bacbfd88f134f055f69bdfc2c073b0bdff062c9e Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 20 Mar 2026 18:45:33 +0100 Subject: [PATCH] Fix word ordering in cell text rebuild (Steps 4c, 4d, 5i) Cell text was rebuilt using naive (top, left) sorting after removing word_boxes in Steps 4c/4d/5i. This produced wrong word order when words on the same visual line had slightly different top values (1-6px). Now uses _words_to_reading_order_text() which groups words into visual lines by y-tolerance before sorting by x within each line, matching the initial cell text construction in _build_cells. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 20c048a..384d56c 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -23,7 +23,7 @@ from fastapi import APIRouter, HTTPException, Request from cv_box_detect import detect_boxes, split_page_into_zones from cv_vocab_types import PageZone from cv_color_detect import detect_word_colors, recover_colored_text -from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa +from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text from cv_words_first import _cluster_rows, _build_cells from ocr_pipeline_session_store import ( get_session_db, @@ -1850,11 +1850,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if len(filtered) < len(wbs): removed_oversized += len(wbs) - len(filtered) cell["word_boxes"] = filtered - cell["text"] = " ".join( - wb.get("text", "").strip() - for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0))) - if wb.get("text", "").strip() - ) + cell["text"] = _words_to_reading_order_text(filtered) if removed_oversized: # Remove cells that became empty after oversized removal z["cells"] = [c for c in cells if c.get("word_boxes")] @@ -1879,11 +1875,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if len(filtered) < len(wbs): removed_pipes += len(wbs) - len(filtered) cell["word_boxes"] = filtered - cell["text"] = " ".join( - wb.get("text", "").strip() - for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0))) - if wb.get("text", "").strip() - ) + cell["text"] = _words_to_reading_order_text(filtered) # Remove cells that became empty after pipe removal if removed_pipes: z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())] @@ -2316,11 +2308,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: bullet_removed += len(to_remove) filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove] cell["word_boxes"] = filtered - cell["text"] = " ".join( - wb.get("text", "").strip() - for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0))) - if wb.get("text", "").strip() - ) + cell["text"] = _words_to_reading_order_text(filtered) # Remove cells that became empty after bullet removal if bullet_removed: