diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 4daf05c..9f0da47 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -23,7 +23,7 @@ from fastapi import APIRouter, HTTPException, Request from cv_box_detect import detect_boxes, split_page_into_zones from cv_vocab_types import PageZone from cv_color_detect import detect_word_colors, recover_colored_text -from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa +from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa from cv_words_first import _cluster_rows, _build_cells from ocr_pipeline_session_store import ( get_session_db, @@ -2002,6 +2002,61 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if footer_rows: z["footer"] = footer_rows + # 5h. Convert slash-delimited IPA to bracket notation. + # Dictionary-style pages print IPA between slashes: "tiger /'taiga/" + # Detect the pattern /ocr_ipa/ and replace with [dict_ipa] + # using the IPA dictionary when available, falling back to the OCR text. + # The regex requires a word character (or ² ³) right before the opening + # slash to avoid false positives like "sb/sth". 
# Headword immediately followed (optional whitespace) by a /.../ group.
_SLASH_IPA_RE = re.compile(
    r'(\b[a-zA-Z]+[²³¹]?)\s*'  # headword (capture group 1)
    r"(/[^/]{2,}/)"  # /ipa/ (capture group 2), min 2 chars
)
# Standalone slash IPA at start of text (headword on previous line)
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')


def _parens_balanced(fragment: str) -> bool:
    """Return True when every ')' in *fragment* closes an earlier '('.

    A transcription is a self-contained unit, so a slash pair that encloses
    a stray parenthesis -- e.g. "/sth up) jdn/" taken from
    "(tie sb/sth up) jdn/etwas" -- is a grammar note, not IPA.  Balanced
    pairs such as an optional sound "(ə)" are still accepted.
    """
    depth = 0
    for ch in fragment:
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
            if depth < 0:
                return False
    return depth == 0


def _clean_ocr_ipa(raw: str) -> str:
    """Strip the enclosing slashes, padding and a leading ' (OCR stress mark)."""
    return raw.strip("/").strip().lstrip("'").strip()


def _slash_ipa_to_brackets(text: str) -> tuple:
    """Convert slash-delimited IPA in *text* to bracket notation.

    Returns ``(new_text, converted)`` where *converted* is the number of
    slash groups that were rewritten.  The dictionary IPA from
    ``_lookup_ipa`` is preferred; when the lookup yields nothing the OCR
    text between the slashes is kept and only the delimiters change.
    """
    converted = 0

    def _replace(m: re.Match) -> str:
        nonlocal converted
        headword = m.group(1)
        ocr_ipa = m.group(2)  # includes slashes
        if not _parens_balanced(ocr_ipa):
            # Slash pair straddles a bracketed phrase: leave it untouched.
            return m.group(0)
        # Strip superscript digits for lookup
        clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
        ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
        if ipa:
            converted += 1
            return f"{headword} [{ipa}]"
        # Fallback: keep OCR IPA but convert slashes to brackets
        inner = _clean_ocr_ipa(ocr_ipa)
        if inner:
            converted += 1
            return f"{headword} [{inner}]"
        return m.group(0)

    new_text = _SLASH_IPA_RE.sub(_replace, text)

    # Handle standalone /ipa/ at start (no headword in this cell)
    if new_text == text:
        m = _STANDALONE_SLASH_IPA_RE.match(text)
        if m and _parens_balanced(m.group(1)):
            inner = _clean_ocr_ipa(m.group(1))
            if inner:
                new_text = "[" + inner + "]" + text[m.end():]
                converted += 1

    return new_text, converted


slash_ipa_fixed = 0
for z in zones_data:
    for cell in z.get("cells", []):
        # A missing or None text is treated as empty instead of raising.
        text = cell.get("text") or ""
        if "/" not in text:
            continue
        new_text, n_fixed = _slash_ipa_to_brackets(text)
        slash_ipa_fixed += n_fixed
        if new_text != text:
            cell["text"] = new_text

if slash_ipa_fixed:
    logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)

duration = time.time() - t0

# 6.
# ---------------------------------------------------------------------------
# Step 5h: Slash-IPA to bracket conversion
# ---------------------------------------------------------------------------

class TestSlashIpaConversion:
    """Step 5h converts /ocr_ipa/ patterns to [dictionary_ipa] notation.

    Step 5h lives inside ``_build_grid_core`` and cannot be called on its
    own, so ``_run_step_5h`` re-implements the per-cell conversion here.
    Keep it in sync with grid_editor_api.py.
    """

    @staticmethod
    def _parens_balanced(fragment: str) -> bool:
        """Return True when every ')' in *fragment* closes an earlier '('."""
        depth = 0
        for ch in fragment:
            if ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
                if depth < 0:
                    return False
        return depth == 0

    def _run_step_5h(self, text: str) -> str:
        """Run the Step 5h regex logic on a single text string."""
        import re
        from cv_ocr_engines import _lookup_ipa

        _SLASH_IPA_RE = re.compile(
            r'(\b[a-zA-Z]+[²³¹]?)\s*'
            r"(/[^/]{2,}/)"
        )
        _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')

        def _replace(m):
            headword = m.group(1)
            ocr_ipa = m.group(2)
            if not self._parens_balanced(ocr_ipa):
                # Slash pair straddles a bracketed phrase (grammar note,
                # e.g. "sb/sth up) jdn/"): not IPA, leave untouched.
                return m.group(0)
            clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
            ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
            if ipa:
                return f"{headword} [{ipa}]"
            inner = ocr_ipa.strip("/").strip().lstrip("'").strip()
            if inner:
                return f"{headword} [{inner}]"
            return m.group(0)

        new_text = _SLASH_IPA_RE.sub(_replace, text)
        if new_text == text:
            m = _STANDALONE_SLASH_IPA_RE.match(text)
            if m and self._parens_balanced(m.group(1)):
                inner = m.group(1).strip().lstrip("'").strip()
                if inner:
                    new_text = "[" + inner + "]" + text[m.end():]
        return new_text

    def test_tiger_dict_lookup(self):
        """tiger /'taiga/ → tiger [tˈaɪgə] (from dictionary)."""
        result = self._run_step_5h("tiger /'taiga/ Nomen Tiger")
        assert "[tˈaɪgə]" in result
        assert "/'taiga/" not in result
        assert result.startswith("tiger")

    def test_tight_no_space(self):
        """tight²/tait/ → tight² [tˈaɪt] (no space before slash)."""
        result = self._run_step_5h("tight²/tait/ Adv fest")
        assert "[tˈaɪt]" in result
        assert "/tait/" not in result

    def test_unknown_word_falls_back_to_ocr(self):
        """tinned/und/ → tinned [und] (not in dictionary, keeps OCR IPA)."""
        result = self._run_step_5h("tinned/und/ Adj Dosen-")
        assert result == "tinned [und] Adj Dosen-"

    def test_sb_sth_not_matched(self):
        """Grammar notes such as sb/sth must pass through unchanged."""
        text = "(tie sb/sth up) jdn/etwas anbinden"
        # The headword regex does match "sb" + "/sth up) jdn/", but that
        # span contains an unmatched ")" and is therefore rejected as IPA.
        assert self._run_step_5h(text) == text

    def test_double_ipa(self):
        """times/taimz/ /tamz/ → only the group after the headword converts."""
        result = self._run_step_5h("times/taimz/ /tamz/ Präp")
        assert "[tˈaɪmz]" in result
        assert "/taimz/" not in result
        # The second group has no headword and is not at the start of the
        # text, so neither pattern applies; it currently stays as-is.
        assert result.endswith(" /tamz/ Präp")

    def test_standalone_slash_ipa_at_start(self):
        """/tam/ Nomen → [tam] Nomen (no headword in cell)."""
        result = self._run_step_5h("/tam/ Nomen 1 Zeit")
        assert result == "[tam] Nomen 1 Zeit"

    def test_no_slashes_unchanged(self):
        """Text without slashes passes through unchanged."""
        text = "hello world"
        assert self._run_step_5h(text) == text

    def test_tile_dict_lookup(self):
        """tile /tail/ → tile [tˈaɪl]."""
        result = self._run_step_5h("tile /tail/ Nomen Dachziegel")
        assert "[tˈaɪl]" in result