From fc0ab84e400c70142614f2253b9228e0b1e98092 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Thu, 19 Mar 2026 10:28:14 +0100
Subject: [PATCH] Fix garbled IPA in continuation rows using headword lookup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IPA continuation rows (phonetic transcription that wraps below the
headword) now get proper IPA by looking up headwords from the row
above. E.g. "ska:f – ska:vz" → "[skˈɑːf] – [skˈɑːvz]".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py  | 68 ++++++++++++++++++++++
 klausur-service/backend/grid_editor_api.py | 59 +++++++++++++++++--
 2 files changed, 123 insertions(+), 4 deletions(-)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index 8348b2f..397097b 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1096,6 +1096,74 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
     return ' '.join(words)
 
 
+def fix_ipa_continuation_cell(
+    garbled_text: str,
+    headword_text: str,
+    pronunciation: str = 'british',
+) -> str:
+    """Rebuild a continuation row's IPA from the headwords above it.
+
+    A continuation row holds only the printed phonetic transcription
+    that wrapped below its headword; OCR turns it into fragments such
+    as ``ska:f – ska:vz`` (intended: ``[skˈɑːf] – [skˈɑːvz]``).  The
+    garbled text is never parsed; IPA is looked up per headword instead.
+
+    Args:
+        garbled_text: OCR output of the continuation row.
+        headword_text: Text of the row above (e.g. ``scarf – scarves``);
+            bracketed ``[...]`` segments in it are ignored.
+        pronunciation: ``'british'`` or ``'american'``.
+
+    Returns:
+        One ``[...]`` group per headword part that yielded IPA, joined
+        by `` – ``; ``garbled_text`` unchanged when nothing was found.
+    """
+    if not (IPA_AVAILABLE and garbled_text and headword_text):
+        return garbled_text
+
+    # Drop bracketed segments (e.g. existing IPA) from the headword text.
+    bare_headword = re.sub(r'\[[^\]]*\]', '', headword_text).strip()
+    if not bare_headword:
+        return garbled_text
+
+    # Split on en/em dashes, or on a hyphen with whitespace either side:
+    # "scarf – scarves" → ["scarf", "scarves"]
+    # "see - saw - seen" → ["see", "saw", "seen"]
+    segments = [
+        seg.strip()
+        for seg in re.split(r'\s*[–—]\s*|\s+-\s+', bare_headword)
+        if seg.strip()
+    ]
+
+    bracketed: List[str] = []
+    for segment in segments:
+        # A segment may hold several words, e.g. "secondary school".
+        found: List[str] = []
+        for token in segment.split():
+            letters = re.sub(r'[^a-zA-Z\'-]', '', token)
+            # Too short to be a headword (also covers the empty string).
+            if len(letters) < 2:
+                continue
+            # Grammar words such as "to" get no IPA, wherever they occur.
+            if letters.lower() in _GRAMMAR_BRACKET_WORDS:
+                continue
+            ipa = _lookup_ipa(letters, pronunciation)
+            if ipa:
+                found.append(ipa)
+        if found:
+            bracketed.append('[' + ' '.join(found) + ']')
+
+    if not bracketed:
+        return garbled_text
+
+    fixed = ' – '.join(bracketed)
+    logger.debug(
+        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
+        garbled_text, fixed, headword_text,
+    )
+    return fixed
+
+
 def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
     """Insert IPA for the first English headword in a long mixed-language line.
 
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 4aaed6f..c80daca 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -22,7 +22,7 @@ from fastapi import APIRouter, HTTPException, Request
 
 from cv_box_detect import detect_boxes, split_page_into_zones
 from cv_color_detect import detect_word_colors, recover_colored_text
-from cv_ocr_engines import fix_cell_phonetics
+from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa
 from cv_words_first import _cluster_rows, _build_cells
 from ocr_pipeline_session_store import (
     get_session_db,
@@ -1324,9 +1324,60 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
             if orig:
                 cell["col_type"] = orig
 
-        # 5d. IPA continuation rows are preserved — they contain the
-        # printed phonetic transcription that wraps to a line below the
-        # headword.  The user can manually delete them if not needed.
+        # 5d. Fix IPA continuation rows — rows where the printed
+        # phonetic transcription wraps to a line below the headword.
+        # These contain only garbled IPA in the EN column and nothing
+        # in other columns.  Replace garbled text with proper IPA
+        # looked up from the headword in the previous row.
+        ipa_cont_fixed = 0
+        for z in zones_data:
+            rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
+            z_cells = z.get("cells", [])
+            for idx, row in enumerate(rows_sorted):
+                ri = row["index"]
+                row_cells = [c for c in z_cells if c.get("row_index") == ri]
+                en_cells = [
+                    c for c in row_cells
+                    if c.get("col_type") == en_col_type
+                ]
+                # Other cells with ≥3 chars (ignore margin noise)
+                other_cells = [
+                    c for c in row_cells
+                    if c.get("col_type") != en_col_type
+                    and len((c.get("text") or "").strip()) >= 3
+                ]
+                if not en_cells or other_cells:
+                    continue
+                en_text = en_cells[0].get("text", "")
+                if not _text_has_garbled_ipa(en_text):
+                    continue
+                # Already has proper IPA brackets → already fixed
+                if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text):
+                    continue
+                # Find headword in previous row
+                if idx == 0:
+                    continue
+                prev_ri = rows_sorted[idx - 1]["index"]
+                prev_en = [
+                    c for c in z_cells
+                    if c.get("row_index") == prev_ri
+                    and c.get("col_type") == en_col_type
+                ]
+                if not prev_en:
+                    continue
+                prev_text = prev_en[0].get("text", "")
+                fixed = fix_ipa_continuation_cell(
+                    en_text, prev_text, pronunciation="british",
+                )
+                if fixed != en_text:
+                    en_cells[0]["text"] = fixed
+                    ipa_cont_fixed += 1
+                    logger.info(
+                        "IPA continuation R%d: '%s' → '%s'",
+                        ri, en_text, fixed,
+                    )
+        if ipa_cont_fixed:
+            logger.info("Fixed %d IPA continuation rows", ipa_cont_fixed)
 
     duration = time.time() - t0