From c42924a94a140a3d0dca7a1f648c0c19f52c8dc2 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 25 Mar 2026 07:26:32 +0100 Subject: [PATCH] Fix IPA correction persistence and false-positive prefix matching Step 5i was overwriting IPA-corrected text from Step 5c when reconstructing cells from word_boxes. Added _ipa_corrected flag to preserve corrections. Also tightened merged-token prefix matching (min prefix 4 chars, min suffix 3 chars) to prevent false positives like "sis" being extracted from "si:said". Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 8 +++++--- klausur-service/backend/grid_editor_api.py | 16 +++++++++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index f0a7cb8..6a2ca8e 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1194,9 +1194,11 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: break # Merged token: dictionary word + garbled IPA stuck together. # E.g. "fictionsalans'fIkfn" starts with "fiction". - # Extract the dictionary prefix and add it with IPA. - if clean_j and len(clean_j) >= 5: - for pend in range(min(len(clean_j), 15), 2, -1): + # Extract the dictionary prefix (≥4 chars) and add it with + # IPA, but only if enough chars remain after the prefix (≥3) + # to look like garbled IPA, not just a plural 's'. + if clean_j and len(clean_j) >= 7: + for pend in range(min(len(clean_j) - 3, 15), 3, -1): prefix_j = clean_j[:pend] prefix_ipa = _lookup_ipa(prefix_j, pronunciation) if prefix_ipa: diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 91011bd..69fe315 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -890,11 +890,18 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if cell.get("col_type") == en_col_type: cell["_orig_col_type"] = en_col_type cell["col_type"] = "column_en" + # Snapshot text before IPA fix to detect which cells were modified + _pre_ipa = {id(c): c.get("text", "") for c in all_cells} fix_cell_phonetics(all_cells, pronunciation="british") for cell in all_cells: orig = cell.pop("_orig_col_type", None) if orig: cell["col_type"] = orig + # Mark cells whose text was changed by IPA correction so that + # later steps (5i) don't overwrite the corrected text when + # reconstructing from word_boxes. + if cell.get("text", "") != _pre_ipa.get(id(cell), ""): + cell["_ipa_corrected"] = True # 5d. Fix IPA continuation cells — cells where the printed # phonetic transcription wraps to a line below the headword. @@ -1296,7 +1303,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: bullet_removed += len(to_remove) filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove] cell["word_boxes"] = filtered - cell["text"] = _words_to_reading_order_text(filtered) + # Don't overwrite text that was corrected by Step 5c IPA fix + if not cell.get("_ipa_corrected"): + cell["text"] = _words_to_reading_order_text(filtered) # Remove cells that became empty after bullet removal if bullet_removed: @@ -1477,6 +1486,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: except Exception as e: logger.warning("Syllable insertion failed: %s", e) + # Clean up internal flags before returning + for z in zones_data: + for cell in z.get("cells", []): + cell.pop("_ipa_corrected", None) + result = { "session_id": session_id, "image_width": img_w,