diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 9f0da47..8470b40 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -2014,6 +2014,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: ) # Standalone slash IPA at start of text (headword on previous line) _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/') + # IPA between slashes never contains spaces, parentheses, or commas. + # Reject matches that look like grammar: "sb/sth up a) jdn/" + _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]') slash_ipa_fixed = 0 for z in zones_data: for cell in z.get("cells", []): @@ -2025,6 +2028,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: nonlocal slash_ipa_fixed headword = m.group(1) ocr_ipa = m.group(2) # includes slashes + inner_raw = ocr_ipa.strip("/").strip() + # Reject if inner content has spaces/parens/commas (grammar) + if _SLASH_IPA_REJECT_RE.search(inner_raw): + return m.group(0) # Strip superscript digits for lookup clean_hw = re.sub(r'[²³¹\d]', '', headword).strip() ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None @@ -2032,9 +2039,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: slash_ipa_fixed += 1 return f"{headword} [{ipa}]" # Fallback: keep OCR IPA but convert slashes to brackets - inner = ocr_ipa.strip("/").strip() - # Strip leading ' (OCR stress marker) - inner = inner.lstrip("'").strip() + inner = inner_raw.lstrip("'").strip() if inner: slash_ipa_fixed += 1 return f"{headword} [{inner}]" @@ -2042,14 +2047,30 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text) + # Second pass: convert remaining /ipa/ after [ipa] from first pass. + # Pattern: [ipa] /ipa2/ → [ipa] [ipa2] (second pronunciation variant) + _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)') + def _replace_trailing_slash(m: re.Match) -> str: + nonlocal slash_ipa_fixed + inner = m.group(1).strip("/").strip().lstrip("'").strip() + if _SLASH_IPA_REJECT_RE.search(inner): + return m.group(0) + if inner: + slash_ipa_fixed += 1 + return f" [{inner}]" + return m.group(0) + new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text) + # Handle standalone /ipa/ at start (no headword in this cell) if new_text == text: m = _STANDALONE_SLASH_IPA_RE.match(text) if m: - inner = m.group(1).strip().lstrip("'").strip() - if inner: - new_text = "[" + inner + "]" + text[m.end():] - slash_ipa_fixed += 1 + inner = m.group(1).strip() + if not _SLASH_IPA_REJECT_RE.search(inner): + inner = inner.lstrip("'").strip() + if inner: + new_text = "[" + inner + "]" + text[m.end():] + slash_ipa_fixed += 1 if new_text != text: cell["text"] = new_text diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py index acc548a..69c32eb 100644 --- a/klausur-service/backend/tests/test_grid_editor_api.py +++ b/klausur-service/backend/tests/test_grid_editor_api.py @@ -823,26 +823,44 @@ class TestSlashIpaConversion: r"(/[^/]{2,}/)" ) _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/') + _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]') def _replace(m): headword = m.group(1) ocr_ipa = m.group(2) + inner_raw = ocr_ipa.strip("/").strip() + if _SLASH_IPA_REJECT_RE.search(inner_raw): + return m.group(0) clean_hw = re.sub(r'[²³¹\d]', '', headword).strip() ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None if ipa: return f"{headword} [{ipa}]" - inner = ocr_ipa.strip("/").strip().lstrip("'").strip() + inner = inner_raw.lstrip("'").strip() if inner: return f"{headword} [{inner}]" return m.group(0) new_text = _SLASH_IPA_RE.sub(_replace, text) + + # Second pass: trailing /ipa/ after [ipa] + _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)') + def _replace_trailing(m): + inner = m.group(1).strip("/").strip().lstrip("'").strip() + if _SLASH_IPA_REJECT_RE.search(inner): + return m.group(0) + if inner: + return f" [{inner}]" + return m.group(0) + new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing, new_text) + if new_text == text: m = _STANDALONE_SLASH_IPA_RE.match(text) if m: - inner = m.group(1).strip().lstrip("'").strip() - if inner: - new_text = "[" + inner + "]" + text[m.end():] + inner = m.group(1).strip() + if not _SLASH_IPA_REJECT_RE.search(inner): + inner = inner.lstrip("'").strip() + if inner: + new_text = "[" + inner + "]" + text[m.end():] return new_text def test_tiger_dict_lookup(self): @@ -865,21 +883,19 @@ class TestSlashIpaConversion: assert "/und/" not in result def test_sb_sth_not_matched(self): - """sb/sth should NOT be treated as IPA (too short or grammar).""" + """sb/sth should NOT be treated as IPA (contains space/parens).""" text = "(tie sb/sth up) jdn/etwas anbinden" result = self._run_step_5h(text) - # /sth up) jdn/ has length > 2 but the headword is "sb" which is - # not a real word — the regex would match, but "sb" won't be in dict - # and the inner text would contain grammar, not IPA. - # Key assertion: "jdn/etwas" is not corrupted - assert "etwas" in result + # The inner content "sth up) jdn" has spaces and parens → rejected + assert result == text # unchanged - def test_double_ipa(self): - """times/taimz/ /tamz/ → both converted.""" + def test_double_ipa_both_converted(self): + """times/taimz/ /tamz/ → times [tˈaɪmz] [tamz] (both converted).""" result = self._run_step_5h("times/taimz/ /tamz/ Präp") assert "[tˈaɪmz]" in result - # Second /tamz/ is standalone after first replacement + assert "[tamz]" in result assert "/taimz/" not in result + assert "/tamz/" not in result def test_standalone_slash_ipa_at_start(self): """/tam/ Nomen → [tam] Nomen (no headword in cell)."""