Fix Step 5h: reject grammar patterns in slash-IPA, convert trailing variants

- Reject /.../ matches containing spaces, parens, or commas (e.g. sb/sth up)
- Second pass converts trailing /ipa2/ after [ipa1] (double pronunciation)
- Validate standalone /ipa/ at start against same reject pattern

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 12:40:28 +01:00
parent 7fafd297e7
commit 04092a0a66
2 changed files with 57 additions and 20 deletions
@@ -2014,6 +2014,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
    )
    # Standalone slash IPA at start of text (headword on previous line)
    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
    # IPA between slashes never contains spaces, parentheses, or commas.
    # Reject matches that look like grammar: "sb/sth up a) jdn/"
    _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
    slash_ipa_fixed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
@@ -2025,6 +2028,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                nonlocal slash_ipa_fixed
                headword = m.group(1)
                ocr_ipa = m.group(2)  # includes slashes
                inner_raw = ocr_ipa.strip("/").strip()
                # Reject if inner content has spaces/parens/commas (grammar)
                if _SLASH_IPA_REJECT_RE.search(inner_raw):
                    return m.group(0)
                # Strip superscript digits for lookup
                clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
                ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
@@ -2032,9 +2039,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                    slash_ipa_fixed += 1
                    return f"{headword} [{ipa}]"
                # Fallback: keep OCR IPA but convert slashes to brackets
-                inner = ocr_ipa.strip("/").strip()
+                inner = inner_raw.lstrip("'").strip()
                # Strip leading ' (OCR stress marker)
                inner = inner.lstrip("'").strip()
                if inner:
                    slash_ipa_fixed += 1
                    return f"{headword} [{inner}]"
@@ -2042,14 +2047,30 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
            # Second pass: convert remaining /ipa/ after [ipa] from first pass.
            # Pattern: [ipa] /ipa2/ → [ipa] [ipa2]  (second pronunciation variant)
            _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
            def _replace_trailing_slash(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                inner = m.group(1).strip("/").strip().lstrip("'").strip()
                if _SLASH_IPA_REJECT_RE.search(inner):
                    return m.group(0)
                if inner:
                    slash_ipa_fixed += 1
                    return f" [{inner}]"
                return m.group(0)
            new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
            # Handle standalone /ipa/ at start (no headword in this cell)
            if new_text == text:
                m = _STANDALONE_SLASH_IPA_RE.match(text)
                if m:
-                    inner = m.group(1).strip().lstrip("'").strip()
+                    inner = m.group(1).strip()
-                    if inner:
+                    if not _SLASH_IPA_REJECT_RE.search(inner):
-                        new_text = "[" + inner + "]" + text[m.end():]
+                        inner = inner.lstrip("'").strip()
-                        slash_ipa_fixed += 1
+                        if inner:
                            new_text = "[" + inner + "]" + text[m.end():]
                            slash_ipa_fixed += 1
            if new_text != text:
                cell["text"] = new_text
@@ -823,26 +823,44 @@ class TestSlashIpaConversion:
            r"(/[^/]{2,}/)"
        )
        _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
        _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
        def _replace(m):
            headword = m.group(1)
            ocr_ipa = m.group(2)
            inner_raw = ocr_ipa.strip("/").strip()
            if _SLASH_IPA_REJECT_RE.search(inner_raw):
                return m.group(0)
            clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
            ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
            if ipa:
                return f"{headword} [{ipa}]"
-            inner = ocr_ipa.strip("/").strip().lstrip("'").strip()
+            inner = inner_raw.lstrip("'").strip()
            if inner:
                return f"{headword} [{inner}]"
            return m.group(0)
        new_text = _SLASH_IPA_RE.sub(_replace, text)
        # Second pass: trailing /ipa/ after [ipa]
        _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
        def _replace_trailing(m):
            """Bracket a trailing ``/.../`` token, or leave the match as-is.

            The match is kept unchanged when the cleaned content is empty or
            is flagged by ``_SLASH_IPA_REJECT_RE``.
            """
            candidate = m.group(1).strip("/").strip().lstrip("'").strip()
            accepted = bool(candidate) and not _SLASH_IPA_REJECT_RE.search(candidate)
            return f" [{candidate}]" if accepted else m.group(0)
        new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing, new_text)
        if new_text == text:
            m = _STANDALONE_SLASH_IPA_RE.match(text)
            if m:
-                inner = m.group(1).strip().lstrip("'").strip()
+                inner = m.group(1).strip()
-                if inner:
+                if not _SLASH_IPA_REJECT_RE.search(inner):
-                    new_text = "[" + inner + "]" + text[m.end():]
+                    inner = inner.lstrip("'").strip()
                    if inner:
                        new_text = "[" + inner + "]" + text[m.end():]
        return new_text
    def test_tiger_dict_lookup(self):
@@ -865,21 +883,19 @@ class TestSlashIpaConversion:
        assert "/und/" not in result
    def test_sb_sth_not_matched(self):
-        """sb/sth should NOT be treated as IPA (too short or grammar)."""
+        """sb/sth should NOT be treated as IPA (contains space/parens)."""
        text = "(tie sb/sth up) jdn/etwas anbinden"
        result = self._run_step_5h(text)
-        # /sth up) jdn/ has length > 2 but the headword is "sb" which is
+        # The inner content "sth up) jdn" has spaces and parens → rejected
-        # not a real word — the regex would match, but "sb" won't be in dict
+        assert result == text  # unchanged
        # and the inner text would contain grammar, not IPA.
        # Key assertion: "jdn/etwas" is not corrupted
        assert "etwas" in result
-    def test_double_ipa(self):
+    def test_double_ipa_both_converted(self):
-        """times/taimz/ /tamz/ → both converted."""
+        """times/taimz/ /tamz/ → times [tˈaɪmz] [tamz] (both converted)."""
        result = self._run_step_5h("times/taimz/ /tamz/ Präp")
        assert "[tˈaɪmz]" in result
-        # Second /tamz/ is standalone after first replacement
+        assert "[tamz]" in result
        assert "/taimz/" not in result
        assert "/tamz/" not in result
    def test_standalone_slash_ipa_at_start(self):
        """/tam/ Nomen → [tam] Nomen (no headword in cell)."""