Step 5h: convert slash-delimited IPA to bracket notation with dict lookup

Dictionary-style pages print IPA between slashes (e.g. tiger /'taiga/).
Step 5h detects these patterns, looks up the headword in the IPA dictionary
for proper Unicode IPA, and falls back to OCR text when not found.
Converts /ipa/ to [ipa] bracket notation matching the rest of the pipeline.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 12:36:08 +01:00
parent 7ac09b5941
commit 7fafd297e7
2 changed files with 148 additions and 1 deletions

View File

@@ -804,3 +804,95 @@ class TestDetectHeadingRowsBySingleCell:
heading_cells = [c for c in zone["cells"]
if c.get("col_type") == "heading"]
assert all(c["row_index"] != 7 for c in heading_cells)
# ---------------------------------------------------------------------------
# Step 5h: Slash-IPA to bracket conversion
# ---------------------------------------------------------------------------
class TestSlashIpaConversion:
"""Step 5h converts /ocr_ipa/ patterns to [dictionary_ipa] notation."""
def _run_step_5h(self, text: str) -> str:
"""Run the Step 5h regex logic on a single text string."""
import re
from cv_ocr_engines import _lookup_ipa
_SLASH_IPA_RE = re.compile(
r'(\b[a-zA-Z]+[²³¹]?)\s*'
r"(/[^/]{2,}/)"
)
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
def _replace(m):
headword = m.group(1)
ocr_ipa = m.group(2)
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
if ipa:
return f"{headword} [{ipa}]"
inner = ocr_ipa.strip("/").strip().lstrip("'").strip()
if inner:
return f"{headword} [{inner}]"
return m.group(0)
new_text = _SLASH_IPA_RE.sub(_replace, text)
if new_text == text:
m = _STANDALONE_SLASH_IPA_RE.match(text)
if m:
inner = m.group(1).strip().lstrip("'").strip()
if inner:
new_text = "[" + inner + "]" + text[m.end():]
return new_text
def test_tiger_dict_lookup(self):
"""tiger /'taiga/ → tiger [tˈaɪgə] (from dictionary)."""
result = self._run_step_5h("tiger /'taiga/ Nomen Tiger")
assert "[tˈaɪgə]" in result
assert "/'taiga/" not in result
assert result.startswith("tiger")
def test_tight_no_space(self):
"""tight²/tait/ → tight² [tˈaɪt] (no space before slash)."""
result = self._run_step_5h("tight²/tait/ Adv fest")
assert "[tˈaɪt]" in result
assert "/tait/" not in result
def test_unknown_word_falls_back_to_ocr(self):
"""tinned/und/ → tinned [und] (not in dictionary, keeps OCR IPA)."""
result = self._run_step_5h("tinned/und/ Adj Dosen-")
assert "[und]" in result
assert "/und/" not in result
def test_sb_sth_not_matched(self):
"""sb/sth should NOT be treated as IPA (too short or grammar)."""
text = "(tie sb/sth up) jdn/etwas anbinden"
result = self._run_step_5h(text)
# /sth up) jdn/ has length > 2 but the headword is "sb" which is
# not a real word — the regex would match, but "sb" won't be in dict
# and the inner text would contain grammar, not IPA.
# Key assertion: "jdn/etwas" is not corrupted
assert "etwas" in result
def test_double_ipa(self):
"""times/taimz/ /tamz/ → both converted."""
result = self._run_step_5h("times/taimz/ /tamz/ Präp")
assert "[tˈaɪmz]" in result
# Second /tamz/ is standalone after first replacement
assert "/taimz/" not in result
def test_standalone_slash_ipa_at_start(self):
"""/tam/ Nomen → [tam] Nomen (no headword in cell)."""
result = self._run_step_5h("/tam/ Nomen 1 Zeit")
assert result.startswith("[tam]")
assert "/tam/" not in result
def test_no_slashes_unchanged(self):
"""Text without slashes passes through unchanged."""
text = "hello world"
assert self._run_step_5h(text) == text
def test_tile_dict_lookup(self):
"""tile /tail/ → tile [tˈaɪl]."""
result = self._run_step_5h("tile /tail/ Nomen Dachziegel")
assert "[tˈaɪl]" in result