From fc0ab84e400c70142614f2253b9228e0b1e98092 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 19 Mar 2026 10:28:14 +0100 Subject: [PATCH] Fix garbled IPA in continuation rows using headword lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IPA continuation rows (phonetic transcription that wraps below the headword) now get proper IPA by looking up headwords from the row above. E.g. "ska:f – ska:vz" → "[skˈɑːf] – [skˈɑːvz]". Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 68 ++++++++++++++++++++++ klausur-service/backend/grid_editor_api.py | 59 +++++++++++++++++-- 2 files changed, 123 insertions(+), 4 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 8348b2f..397097b 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1096,6 +1096,74 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: return ' '.join(words) +def fix_ipa_continuation_cell( + garbled_text: str, + headword_text: str, + pronunciation: str = 'british', +) -> str: + """Replace garbled IPA in a continuation row with proper IPA. + + Continuation rows appear below the headword and contain only the + printed phonetic transcription, which OCR garbles into fragments + like ``ska:f – ska:vz`` (should be ``[skˈɑːf] – [skˈɑːvz]``). + + Args: + garbled_text: The OCR-garbled IPA text from the continuation row. + headword_text: The headword text from the previous row + (e.g. ``scarf – scarves``). + pronunciation: ``'british'`` or ``'american'``. + + Returns: + Corrected IPA text, or the original if no fix could be applied. + """ + if not IPA_AVAILABLE or not garbled_text or not headword_text: + return garbled_text + + # Strip existing IPA brackets from headword text + clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text).strip() + if not clean_hw: + return garbled_text + + # Split headword by delimiters (– — -) + # "scarf – scarves" → ["scarf", "scarves"] + # "see - saw - seen" → ["see", "saw", "seen"] + parts = re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw) + parts = [p.strip() for p in parts if p.strip()] + + if not parts: + return garbled_text + + # Look up IPA for each headword part + ipa_parts: List[str] = [] + for part in parts: + # A part may be multi-word like "secondary school" + words = part.split() + word_ipas: List[str] = [] + for w in words: + clean_w = re.sub(r'[^a-zA-Z\'-]', '', w) + if not clean_w or len(clean_w) < 2: + continue + # Skip grammar words like "to" at the start + if clean_w.lower() in _GRAMMAR_BRACKET_WORDS: + continue + ipa = _lookup_ipa(clean_w, pronunciation) + if ipa: + word_ipas.append(ipa) + if word_ipas: + ipa_parts.append('[' + ' '.join(word_ipas) + ']') + + if not ipa_parts: + return garbled_text + + # Join with delimiter + result = ' – '.join(ipa_parts) + logger.debug( + "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')", + garbled_text, result, headword_text, + ) + return result + + def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str: """Insert IPA for the first English headword in a long mixed-language line. diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 4aaed6f..c80daca 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -22,7 +22,7 @@ from fastapi import APIRouter, HTTPException, Request from cv_box_detect import detect_boxes, split_page_into_zones from cv_color_detect import detect_word_colors, recover_colored_text -from cv_ocr_engines import fix_cell_phonetics +from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa from cv_words_first import _cluster_rows, _build_cells from ocr_pipeline_session_store import ( get_session_db, @@ -1324,9 +1324,60 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if orig: cell["col_type"] = orig - # 5d. IPA continuation rows are preserved — they contain the - # printed phonetic transcription that wraps to a line below the - # headword. The user can manually delete them if not needed. + # 5d. Fix IPA continuation rows — rows where the printed + # phonetic transcription wraps to a line below the headword. + # These contain only garbled IPA in the EN column and nothing + # in other columns. Replace garbled text with proper IPA + # looked up from the headword in the previous row. + ipa_cont_fixed = 0 + for z in zones_data: + rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"]) + z_cells = z.get("cells", []) + for idx, row in enumerate(rows_sorted): + ri = row["index"] + row_cells = [c for c in z_cells if c.get("row_index") == ri] + en_cells = [ + c for c in row_cells + if c.get("col_type") == en_col_type + ] + # Other cells with ≥3 chars (ignore margin noise) + other_cells = [ + c for c in row_cells + if c.get("col_type") != en_col_type + and len((c.get("text") or "").strip()) >= 3 + ] + if not en_cells or other_cells: + continue + en_text = en_cells[0].get("text", "") + if not _text_has_garbled_ipa(en_text): + continue + # Already has proper IPA brackets → already fixed + if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text): + continue + # Find headword in previous row + if idx == 0: + continue + prev_ri = rows_sorted[idx - 1]["index"] + prev_en = [ + c for c in z_cells + if c.get("row_index") == prev_ri + and c.get("col_type") == en_col_type + ] + if not prev_en: + continue + prev_text = prev_en[0].get("text", "") + fixed = fix_ipa_continuation_cell( + en_text, prev_text, pronunciation="british", + ) + if fixed != en_text: + en_cells[0]["text"] = fixed + ipa_cont_fixed += 1 + logger.info( + "IPA continuation R%d: '%s' → '%s'", + ri, en_text, fixed, + ) + if ipa_cont_fixed: + logger.info("Fixed %d IPA continuation rows", ipa_cont_fixed) duration = time.time() - t0