From 8ef4c089cfc2b8e1b5e52bfcfb63ad3efb4c0ab3 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 12:05:38 +0100 Subject: [PATCH] Remove IPA continuation rows and support hyphenated word lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - grid_editor_api: After IPA correction, detect rows containing only garbled phonetics in the English column (no German translation, no IPA brackets inserted). These are wrap-around lines where printed IPA extends to the line below the headword. Remove them since the headword row already has correct IPA. - cv_ocr_engines: _insert_missing_ipa now tries dehyphenated form as fallback (e.g. "second-hand" → "secondhand") for dictionary lookup, fixing IPA insertion for compound words. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 3 ++ klausur-service/backend/grid_editor_api.py | 44 ++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 51d4d54..67648c3 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1026,6 +1026,9 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: if clean.lower() in _GRAMMAR_BRACKET_WORDS: continue ipa = _lookup_ipa(clean, pronunciation) + # Fallback: try without hyphens (e.g. "second-hand" → "secondhand") + if not ipa and '-' in clean: + ipa = _lookup_ipa(clean.replace('-', ''), pronunciation) if ipa: words[i] = f"{w} [{ipa}]" # Strip garbled OCR phonetics after the IPA bracket. diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 8d8a937..94282e4 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1196,6 +1196,50 @@ async def build_grid(session_id: str): if orig: cell["col_type"] = orig + # 5d. Remove IPA continuation rows — rows where the printed + # phonetic transcription wraps to a line below the headword. + # These rows have text only in the English column (+ margin + # noise) and fix_cell_phonetics did NOT insert IPA brackets + # (because there's no real English word to look up). + ipa_cont_rows: set = set() + for z in zones_data: + for row in z.get("rows", []): + ri = row["index"] + row_cells = [ + c for c in z.get("cells", []) + if c.get("row_index") == ri + ] + en_cells = [ + c for c in row_cells + if c.get("col_type") == en_col_type + ] + # Other cells with ≥3 chars (ignore margin noise) + other_cells = [ + c for c in row_cells + if c.get("col_type") != en_col_type + and len((c.get("text") or "").strip()) >= 3 + ] + if en_cells and not other_cells: + en_text = en_cells[0].get("text", "") + # No IPA brackets → phonetics not recognized → + # this is a garbled IPA continuation row + if "[" not in en_text: + ipa_cont_rows.add(ri) + if ipa_cont_rows: + for z in zones_data: + z["rows"] = [ + r for r in z.get("rows", []) + if r["index"] not in ipa_cont_rows + ] + z["cells"] = [ + c for c in z.get("cells", []) + if c.get("row_index") not in ipa_cont_rows + ] + logger.info( + "removed %d IPA continuation rows: %s", + len(ipa_cont_rows), sorted(ipa_cont_rows), + ) + duration = time.time() - t0 # 6. Build result