Fix IPA correction persistence and false-positive prefix matching

Step 5i was overwriting the IPA-corrected text from Step 5c when it rebuilt cell text from word_boxes. Added an internal _ipa_corrected flag so that those corrections are preserved. Also tightened merged-token prefix matching (minimum prefix of 4 characters, minimum suffix of 3 characters) to prevent false positives such as "sis" being extracted from "si:said".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-25 07:26:32 +01:00
parent 9ea217bdfc
commit c42924a94a
2 changed files with 20 additions and 4 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -890,11 +890,18 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                if cell.get("col_type") == en_col_type:
                    cell["_orig_col_type"] = en_col_type
                    cell["col_type"] = "column_en"
+        # Snapshot text before IPA fix to detect which cells were modified
+        _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
        fix_cell_phonetics(all_cells, pronunciation="british")
        for cell in all_cells:
            orig = cell.pop("_orig_col_type", None)
            if orig:
                cell["col_type"] = orig
+            # Mark cells whose text was changed by IPA correction so that
+            # later steps (5i) don't overwrite the corrected text when
+            # reconstructing from word_boxes.
+            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
+                cell["_ipa_corrected"] = True

        # 5d. Fix IPA continuation cells — cells where the printed
        # phonetic transcription wraps to a line below the headword.
@@ -1296,7 +1303,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
-                cell["text"] = _words_to_reading_order_text(filtered)
+                # Don't overwrite text that was corrected by Step 5c IPA fix
+                if not cell.get("_ipa_corrected"):
+                    cell["text"] = _words_to_reading_order_text(filtered)

    # Remove cells that became empty after bullet removal
    if bullet_removed:
@@ -1477,6 +1486,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
        except Exception as e:
            logger.warning("Syllable insertion failed: %s", e)

+    # Clean up internal flags before returning
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            cell.pop("_ipa_corrected", None)
+
    result = {
        "session_id": session_id,
        "image_width": img_w,