Step 5h: convert slash-delimited IPA to bracket notation with dict lookup
Dictionary-style pages print IPA between slashes (e.g. tiger /'taiga/). Step 5h detects these patterns, looks up the headword in the IPA dictionary for proper Unicode IPA, and falls back to OCR text when not found. Converts /ipa/ to [ipa] bracket notation matching the rest of the pipeline. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -23,7 +23,7 @@ from fastapi import APIRouter, HTTPException, Request
|
|||||||
from cv_box_detect import detect_boxes, split_page_into_zones
|
from cv_box_detect import detect_boxes, split_page_into_zones
|
||||||
from cv_vocab_types import PageZone
|
from cv_vocab_types import PageZone
|
||||||
from cv_color_detect import detect_word_colors, recover_colored_text
|
from cv_color_detect import detect_word_colors, recover_colored_text
|
||||||
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa
|
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa
|
||||||
from cv_words_first import _cluster_rows, _build_cells
|
from cv_words_first import _cluster_rows, _build_cells
|
||||||
from ocr_pipeline_session_store import (
|
from ocr_pipeline_session_store import (
|
||||||
get_session_db,
|
get_session_db,
|
||||||
@@ -2002,6 +2002,61 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if footer_rows:
|
if footer_rows:
|
||||||
z["footer"] = footer_rows
|
z["footer"] = footer_rows
|
||||||
|
|
||||||
|
# 5h. Convert slash-delimited IPA to bracket notation.
|
||||||
|
# Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
|
||||||
|
# Detect the pattern <headword> /ocr_ipa/ and replace with [dict_ipa]
|
||||||
|
# using the IPA dictionary when available, falling back to the OCR text.
|
||||||
|
# The regex requires a headword (letters, optionally followed by ¹ ² ³)
|
||||||
|
# before the opening slash. It still matches grammar shorthand that has two slashes, e.g. "sb/sth up) jdn/etwas".
|
||||||
|
_SLASH_IPA_RE = re.compile(
|
||||||
|
r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1)
|
||||||
|
r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars
|
||||||
|
)
|
||||||
|
# Standalone slash IPA at start of text (headword on previous line)
|
||||||
|
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
|
||||||
|
slash_ipa_fixed = 0
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
text = cell.get("text", "")
|
||||||
|
if "/" not in text:
|
||||||
|
continue
|
||||||
|
|
||||||
|
def _replace_slash_ipa(m: re.Match) -> str:
|
||||||
|
nonlocal slash_ipa_fixed
|
||||||
|
headword = m.group(1)
|
||||||
|
ocr_ipa = m.group(2) # includes slashes
|
||||||
|
# Strip superscript digits for lookup
|
||||||
|
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
|
||||||
|
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
|
||||||
|
if ipa:
|
||||||
|
slash_ipa_fixed += 1
|
||||||
|
return f"{headword} [{ipa}]"
|
||||||
|
# Fallback: keep OCR IPA but convert slashes to brackets
|
||||||
|
inner = ocr_ipa.strip("/").strip()
|
||||||
|
# Strip leading ' (OCR stress marker)
|
||||||
|
inner = inner.lstrip("'").strip()
|
||||||
|
if inner:
|
||||||
|
slash_ipa_fixed += 1
|
||||||
|
return f"{headword} [{inner}]"
|
||||||
|
return m.group(0)
|
||||||
|
|
||||||
|
new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
|
||||||
|
|
||||||
|
# Handle standalone /ipa/ at start (no headword in this cell)
|
||||||
|
if new_text == text:
|
||||||
|
m = _STANDALONE_SLASH_IPA_RE.match(text)
|
||||||
|
if m:
|
||||||
|
inner = m.group(1).strip().lstrip("'").strip()
|
||||||
|
if inner:
|
||||||
|
new_text = "[" + inner + "]" + text[m.end():]
|
||||||
|
slash_ipa_fixed += 1
|
||||||
|
|
||||||
|
if new_text != text:
|
||||||
|
cell["text"] = new_text
|
||||||
|
|
||||||
|
if slash_ipa_fixed:
|
||||||
|
logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
|
||||||
|
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
|
|
||||||
# 6. Build result
|
# 6. Build result
|
||||||
|
|||||||
@@ -804,3 +804,95 @@ class TestDetectHeadingRowsBySingleCell:
|
|||||||
heading_cells = [c for c in zone["cells"]
|
heading_cells = [c for c in zone["cells"]
|
||||||
if c.get("col_type") == "heading"]
|
if c.get("col_type") == "heading"]
|
||||||
assert all(c["row_index"] != 7 for c in heading_cells)
|
assert all(c["row_index"] != 7 for c in heading_cells)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Step 5h: Slash-IPA to bracket conversion
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestSlashIpaConversion:
    """Checks for the Step 5h rewrite of ``/ocr_ipa/`` spans into ``[ipa]``.

    The conversion is exercised through ``_run_step_5h``, which carries its
    own copy of the regex logic instead of calling into the pipeline.
    """

    def _run_step_5h(self, text: str) -> str:
        """Apply the slash-to-bracket conversion to one string and return it.

        First every ``<headword> /ipa/`` occurrence is rewritten, preferring
        the ``_lookup_ipa`` result for the headword and falling back to the
        text between the slashes.  Only when that pass changed nothing is a
        leading ``/ipa/`` (no headword in front) converted.
        """
        import re
        from cv_ocr_engines import _lookup_ipa

        headword_ipa_re = re.compile(
            r'(\b[a-zA-Z]+[²³¹]?)\s*'
            r"(/[^/]{2,}/)"
        )
        leading_ipa_re = re.compile(r'^/([^/]{2,})/')

        def to_brackets(match):
            word, slashed = match.group(1), match.group(2)
            # Superscript markers / digits are dropped for the lookup only.
            lookup_key = re.sub(r'[²³¹\d]', '', word).strip()
            body = _lookup_ipa(lookup_key, "british") if lookup_key else None
            if not body:
                # No dictionary entry: reuse the OCR text, without the
                # slashes and without a leading ' stress marker.
                body = slashed.strip("/").strip().lstrip("'").strip()
            return f"{word} [{body}]" if body else match.group(0)

        converted = headword_ipa_re.sub(to_brackets, text)
        if converted != text:
            return converted

        # Nothing replaced so far: try a slash span at the very start.
        lead = leading_ipa_re.match(text)
        if lead is None:
            return converted
        body = lead.group(1).strip().lstrip("'").strip()
        if not body:
            return converted
        return f"[{body}]{text[lead.end():]}"

    def test_tiger_dict_lookup(self):
        """``tiger /'taiga/`` gets the dictionary transcription in brackets."""
        out = self._run_step_5h("tiger /'taiga/ Nomen Tiger")
        assert out.startswith("tiger")
        assert "[tˈaɪgə]" in out
        assert "/'taiga/" not in out

    def test_tight_no_space(self):
        """A headword with ² and no space before the slash is still converted."""
        out = self._run_step_5h("tight²/tait/ Adv fest")
        assert "[tˈaɪt]" in out
        assert "/tait/" not in out

    def test_unknown_word_falls_back_to_ocr(self):
        """Expects the OCR text ``und`` to be kept, only re-delimited."""
        out = self._run_step_5h("tinned/und/ Adj Dosen-")
        assert "[und]" in out
        assert "/und/" not in out

    def test_sb_sth_not_matched(self):
        """Grammar shorthand input: the word ``etwas`` must survive."""
        source = "(tie sb/sth up) jdn/etwas anbinden"
        out = self._run_step_5h(source)
        # The pattern does match here: "sb" is taken as the headword and
        # "/sth up) jdn/" as the slash span.  This test only verifies that
        # the word "etwas" is still present in the output.
        assert "etwas" in out

    def test_double_ipa(self):
        """Two slash spans in a row; the first one is checked."""
        out = self._run_step_5h("times/taimz/ /tamz/ Präp")
        # Assertions cover the first transcription only; the second span
        # (/tamz/) has no headword directly in front of it.
        assert "[tˈaɪmz]" in out
        assert "/taimz/" not in out

    def test_standalone_slash_ipa_at_start(self):
        """A cell that starts with ``/tam/`` becomes ``[tam] ...``."""
        out = self._run_step_5h("/tam/ Nomen 1 Zeit")
        assert out.startswith("[tam]")
        assert "/tam/" not in out

    def test_no_slashes_unchanged(self):
        """Input without any slash is returned as is."""
        source = "hello world"
        assert self._run_step_5h(source) == source

    def test_tile_dict_lookup(self):
        """``tile /tail/`` gets the dictionary transcription in brackets."""
        out = self._run_step_5h("tile /tail/ Nomen Dachziegel")
        assert "[tˈaɪl]" in out
|
||||||
|
|||||||
Reference in New Issue
Block a user