Fix Step 5h: reject grammar patterns in slash-IPA, convert trailing variants
- Reject /.../ matches containing spaces, parentheses, or commas (e.g. "sb/sth up")
- Second pass converts a trailing /ipa2/ after [ipa1] (double pronunciation)
- Validate standalone /ipa/ at start of text against the same reject pattern

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2014,6 +2014,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
)
|
)
|
||||||
# Standalone slash IPA at start of text (headword on previous line)
|
# Standalone slash IPA at start of text (headword on previous line)
|
||||||
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
|
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
|
||||||
|
# IPA between slashes never contains spaces, parentheses, or commas.
|
||||||
|
# Reject matches that look like grammar: "sb/sth up a) jdn/"
|
||||||
|
_SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
|
||||||
slash_ipa_fixed = 0
|
slash_ipa_fixed = 0
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
for cell in z.get("cells", []):
|
for cell in z.get("cells", []):
|
||||||
@@ -2025,6 +2028,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
nonlocal slash_ipa_fixed
|
nonlocal slash_ipa_fixed
|
||||||
headword = m.group(1)
|
headword = m.group(1)
|
||||||
ocr_ipa = m.group(2) # includes slashes
|
ocr_ipa = m.group(2) # includes slashes
|
||||||
|
inner_raw = ocr_ipa.strip("/").strip()
|
||||||
|
# Reject if inner content has spaces/parens/commas (grammar)
|
||||||
|
if _SLASH_IPA_REJECT_RE.search(inner_raw):
|
||||||
|
return m.group(0)
|
||||||
# Strip superscript digits for lookup
|
# Strip superscript digits for lookup
|
||||||
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
|
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
|
||||||
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
|
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
|
||||||
@@ -2032,9 +2039,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
slash_ipa_fixed += 1
|
slash_ipa_fixed += 1
|
||||||
return f"{headword} [{ipa}]"
|
return f"{headword} [{ipa}]"
|
||||||
# Fallback: keep OCR IPA but convert slashes to brackets
|
# Fallback: keep OCR IPA but convert slashes to brackets
|
||||||
inner = ocr_ipa.strip("/").strip()
|
inner = inner_raw.lstrip("'").strip()
|
||||||
# Strip leading ' (OCR stress marker)
|
|
||||||
inner = inner.lstrip("'").strip()
|
|
||||||
if inner:
|
if inner:
|
||||||
slash_ipa_fixed += 1
|
slash_ipa_fixed += 1
|
||||||
return f"{headword} [{inner}]"
|
return f"{headword} [{inner}]"
|
||||||
@@ -2042,14 +2047,30 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
|
|
||||||
new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
|
new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
|
||||||
|
|
||||||
|
# Second pass: convert remaining /ipa/ after [ipa] from first pass.
|
||||||
|
# Pattern: [ipa] /ipa2/ → [ipa] [ipa2] (second pronunciation variant)
|
||||||
|
_AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
|
||||||
|
def _replace_trailing_slash(m: re.Match) -> str:
|
||||||
|
nonlocal slash_ipa_fixed
|
||||||
|
inner = m.group(1).strip("/").strip().lstrip("'").strip()
|
||||||
|
if _SLASH_IPA_REJECT_RE.search(inner):
|
||||||
|
return m.group(0)
|
||||||
|
if inner:
|
||||||
|
slash_ipa_fixed += 1
|
||||||
|
return f" [{inner}]"
|
||||||
|
return m.group(0)
|
||||||
|
new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
|
||||||
|
|
||||||
# Handle standalone /ipa/ at start (no headword in this cell)
|
# Handle standalone /ipa/ at start (no headword in this cell)
|
||||||
if new_text == text:
|
if new_text == text:
|
||||||
m = _STANDALONE_SLASH_IPA_RE.match(text)
|
m = _STANDALONE_SLASH_IPA_RE.match(text)
|
||||||
if m:
|
if m:
|
||||||
inner = m.group(1).strip().lstrip("'").strip()
|
inner = m.group(1).strip()
|
||||||
if inner:
|
if not _SLASH_IPA_REJECT_RE.search(inner):
|
||||||
new_text = "[" + inner + "]" + text[m.end():]
|
inner = inner.lstrip("'").strip()
|
||||||
slash_ipa_fixed += 1
|
if inner:
|
||||||
|
new_text = "[" + inner + "]" + text[m.end():]
|
||||||
|
slash_ipa_fixed += 1
|
||||||
|
|
||||||
if new_text != text:
|
if new_text != text:
|
||||||
cell["text"] = new_text
|
cell["text"] = new_text
|
||||||
|
|||||||
@@ -823,26 +823,44 @@ class TestSlashIpaConversion:
|
|||||||
r"(/[^/]{2,}/)"
|
r"(/[^/]{2,}/)"
|
||||||
)
|
)
|
||||||
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
|
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
|
||||||
|
_SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
|
||||||
|
|
||||||
def _replace(m):
|
def _replace(m):
|
||||||
headword = m.group(1)
|
headword = m.group(1)
|
||||||
ocr_ipa = m.group(2)
|
ocr_ipa = m.group(2)
|
||||||
|
inner_raw = ocr_ipa.strip("/").strip()
|
||||||
|
if _SLASH_IPA_REJECT_RE.search(inner_raw):
|
||||||
|
return m.group(0)
|
||||||
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
|
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
|
||||||
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
|
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
|
||||||
if ipa:
|
if ipa:
|
||||||
return f"{headword} [{ipa}]"
|
return f"{headword} [{ipa}]"
|
||||||
inner = ocr_ipa.strip("/").strip().lstrip("'").strip()
|
inner = inner_raw.lstrip("'").strip()
|
||||||
if inner:
|
if inner:
|
||||||
return f"{headword} [{inner}]"
|
return f"{headword} [{inner}]"
|
||||||
return m.group(0)
|
return m.group(0)
|
||||||
|
|
||||||
new_text = _SLASH_IPA_RE.sub(_replace, text)
|
new_text = _SLASH_IPA_RE.sub(_replace, text)
|
||||||
|
|
||||||
|
# Second pass: trailing /ipa/ after [ipa]
|
||||||
|
_AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
|
||||||
|
def _replace_trailing(m):
|
||||||
|
inner = m.group(1).strip("/").strip().lstrip("'").strip()
|
||||||
|
if _SLASH_IPA_REJECT_RE.search(inner):
|
||||||
|
return m.group(0)
|
||||||
|
if inner:
|
||||||
|
return f" [{inner}]"
|
||||||
|
return m.group(0)
|
||||||
|
new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing, new_text)
|
||||||
|
|
||||||
if new_text == text:
|
if new_text == text:
|
||||||
m = _STANDALONE_SLASH_IPA_RE.match(text)
|
m = _STANDALONE_SLASH_IPA_RE.match(text)
|
||||||
if m:
|
if m:
|
||||||
inner = m.group(1).strip().lstrip("'").strip()
|
inner = m.group(1).strip()
|
||||||
if inner:
|
if not _SLASH_IPA_REJECT_RE.search(inner):
|
||||||
new_text = "[" + inner + "]" + text[m.end():]
|
inner = inner.lstrip("'").strip()
|
||||||
|
if inner:
|
||||||
|
new_text = "[" + inner + "]" + text[m.end():]
|
||||||
return new_text
|
return new_text
|
||||||
|
|
||||||
def test_tiger_dict_lookup(self):
|
def test_tiger_dict_lookup(self):
|
||||||
@@ -865,21 +883,19 @@ class TestSlashIpaConversion:
|
|||||||
assert "/und/" not in result
|
assert "/und/" not in result
|
||||||
|
|
||||||
def test_sb_sth_not_matched(self):
|
def test_sb_sth_not_matched(self):
|
||||||
"""sb/sth should NOT be treated as IPA (too short or grammar)."""
|
"""sb/sth should NOT be treated as IPA (contains space/parens)."""
|
||||||
text = "(tie sb/sth up) jdn/etwas anbinden"
|
text = "(tie sb/sth up) jdn/etwas anbinden"
|
||||||
result = self._run_step_5h(text)
|
result = self._run_step_5h(text)
|
||||||
# /sth up) jdn/ has length > 2 but the headword is "sb" which is
|
# The inner content "sth up) jdn" has spaces and parens → rejected
|
||||||
# not a real word — the regex would match, but "sb" won't be in dict
|
assert result == text # unchanged
|
||||||
# and the inner text would contain grammar, not IPA.
|
|
||||||
# Key assertion: "jdn/etwas" is not corrupted
|
|
||||||
assert "etwas" in result
|
|
||||||
|
|
||||||
def test_double_ipa(self):
|
def test_double_ipa_both_converted(self):
|
||||||
"""times/taimz/ /tamz/ → both converted."""
|
"""times/taimz/ /tamz/ → times [tˈaɪmz] [tamz] (both converted)."""
|
||||||
result = self._run_step_5h("times/taimz/ /tamz/ Präp")
|
result = self._run_step_5h("times/taimz/ /tamz/ Präp")
|
||||||
assert "[tˈaɪmz]" in result
|
assert "[tˈaɪmz]" in result
|
||||||
# Second /tamz/ is standalone after first replacement
|
assert "[tamz]" in result
|
||||||
assert "/taimz/" not in result
|
assert "/taimz/" not in result
|
||||||
|
assert "/tamz/" not in result
|
||||||
|
|
||||||
def test_standalone_slash_ipa_at_start(self):
|
def test_standalone_slash_ipa_at_start(self):
|
||||||
"""/tam/ Nomen → [tam] Nomen (no headword in cell)."""
|
"""/tam/ Nomen → [tam] Nomen (no headword in cell)."""
|
||||||
|
|||||||
Reference in New Issue
Block a user