Fix IPA correction persistence and false-positive prefix matching

Step 5i was overwriting the IPA-corrected text from Step 5c when reconstructing cells from word_boxes. Added an _ipa_corrected flag to preserve those corrections. Also tightened merged-token prefix matching (min prefix 4 chars, min suffix 3 chars) to prevent false positives like "sis" being extracted from "si:said".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Fix IPA correction for dictionary pages (WIP)
2026-03-25 07:26:32 +01:00 · 2026-03-24 23:54:14 +01:00
2 changed files with 42 additions and 2 deletions
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1030,6 +1030,15 @@ def _text_has_garbled_ipa(text: str) -> bool:
        # Contains IPA special characters
        if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
            return True
        # Embedded apostrophe suggesting merged garbled IPA with stress mark.
        # E.g. "Scotland'skotland" — OCR reads ˈ as '.
        # Guard: apostrophe must come after ≥3 chars and be followed by ≥3
        # chars, the first of which is lowercase, to avoid contractions
        # (don't, won't, o'clock).
        if "'" in w and not w.startswith("'"):
            apos_idx = w.index("'")
            after = w[apos_idx + 1:]
            if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
                return True
    return False
@@ -1183,6 +1192,19 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
                # Merged token: dictionary word + garbled IPA stuck together.
                # E.g. "fictionsalans'fIkfn" starts with "fiction".
                # Extract the dictionary prefix (≥4 chars) and add it with
                # IPA, but only if enough chars remain after the prefix (≥3)
                # to look like garbled IPA, not just a plural 's'.
                if clean_j and len(clean_j) >= 7:
                    for pend in range(min(len(clean_j) - 3, 15), 3, -1):
                        prefix_j = clean_j[:pend]
                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
                        if prefix_ipa:
                            kept.append(f"{prefix_j} [{prefix_ipa}]")
                            break
                    break  # rest of this token is garbled
                # Otherwise — likely garbled phonetics, skip
            words = kept
            break
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -858,6 +858,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
    # Single/two-column layouts are continuous text, not vocab tables.
    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    en_col_type = None
    if total_cols >= 3:
        # Find the column that contains IPA brackets → English headwords.
        # Count cells with bracket patterns per col_type.  The column with
@@ -872,7 +873,6 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
            if ct.startswith("column_") and '[' in txt:
                col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
        # Pick column with most bracket IPA patterns
        en_col_type = None
        if col_bracket_count:
            en_col_type = max(col_bracket_count, key=col_bracket_count.get)
        else:
@@ -890,11 +890,18 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                if cell.get("col_type") == en_col_type:
                    cell["_orig_col_type"] = en_col_type
                    cell["col_type"] = "column_en"
        # Snapshot text before IPA fix to detect which cells were modified
        _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
        fix_cell_phonetics(all_cells, pronunciation="british")
        for cell in all_cells:
            orig = cell.pop("_orig_col_type", None)
            if orig:
                cell["col_type"] = orig
            # Mark cells whose text was changed by IPA correction so that
            # later steps (5i) don't overwrite the corrected text when
            # reconstructing from word_boxes.
            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
                cell["_ipa_corrected"] = True
        # 5d. Fix IPA continuation cells — cells where the printed
        # phonetic transcription wraps to a line below the headword.
@@ -1105,6 +1112,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
    slash_ipa_fixed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            # Only process English headword column — avoid converting
            # German text like "der/die/das" to IPA.
            if en_col_type and cell.get("col_type") != en_col_type:
                continue
            text = cell.get("text", "")
            if "/" not in text:
                continue
@@ -1292,7 +1303,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
-                cell["text"] = _words_to_reading_order_text(filtered)
+                # Don't overwrite text that was corrected by Step 5c IPA fix
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)
    # Remove cells that became empty after bullet removal
    if bullet_removed:
@@ -1473,6 +1486,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
        except Exception as e:
            logger.warning("Syllable insertion failed: %s", e)
    # Clean up internal flags before returning
    for z in zones_data:
        for cell in z.get("cells", []):
            cell.pop("_ipa_corrected", None)
    result = {
        "session_id": session_id,
        "image_width": img_w,