Fix Step 5h: reject grammar patterns in slash-IPA, convert trailing variants

- Reject /.../ matches containing spaces, parens, or commas (e.g. sb/sth up)
- Second pass converts trailing /ipa2/ after [ipa1] (double pronunciation)
- Validate standalone /ipa/ at start against same reject pattern

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 12:40:28 +01:00
parent 7fafd297e7
commit 04092a0a66
2 changed files with 57 additions and 20 deletions

View File

@@ -2014,6 +2014,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
)
# Standalone slash IPA at start of text (headword on previous line)
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
# IPA between slashes never contains spaces, parentheses, or commas.
# Reject matches that look like grammar: "sb/sth up a) jdn/"
_SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
slash_ipa_fixed = 0
for z in zones_data:
for cell in z.get("cells", []):
@@ -2025,6 +2028,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
nonlocal slash_ipa_fixed
headword = m.group(1)
ocr_ipa = m.group(2) # includes slashes
inner_raw = ocr_ipa.strip("/").strip()
# Reject if inner content has spaces/parens/commas (grammar)
if _SLASH_IPA_REJECT_RE.search(inner_raw):
return m.group(0)
# Strip digits (superscript ¹²³ as well as regular ones) for lookup
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
@@ -2032,9 +2039,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
slash_ipa_fixed += 1
return f"{headword} [{ipa}]"
# Fallback: keep OCR IPA but convert slashes to brackets
inner = ocr_ipa.strip("/").strip()
# Strip leading ' (OCR stress marker)
inner = inner.lstrip("'").strip()
inner = inner_raw.lstrip("'").strip()
if inner:
slash_ipa_fixed += 1
return f"{headword} [{inner}]"
@@ -2042,14 +2047,30 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
# Second pass: convert remaining /ipa/ after [ipa] from first pass.
# Pattern: [ipa] /ipa2/ → [ipa] [ipa2] (second pronunciation variant)
_AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
def _replace_trailing_slash(m: re.Match) -> str:
# Substitution callback for _AFTER_BRACKET_SLASH: group 1 is the
# slash-delimited text that follows a closing "]". Returns the replacement
# for the whole match (optional whitespace plus the /.../ span).
# The fix counter belongs to the enclosing function, hence nonlocal.
nonlocal slash_ipa_fixed
# Drop the surrounding slashes, outer whitespace and any leading ' characters.
inner = m.group(1).strip("/").strip().lstrip("'").strip()
# Content matching the reject pattern is left exactly as it was.
if _SLASH_IPA_REJECT_RE.search(inner):
return m.group(0)
# Non-empty content: count the fix and emit it in brackets. Whatever
# whitespace preceded the slash is replaced by a single space.
if inner:
slash_ipa_fixed += 1
return f" [{inner}]"
# Nothing left after stripping: keep the original text unchanged.
return m.group(0)
new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
# Handle standalone /ipa/ at start (no headword in this cell)
if new_text == text:
m = _STANDALONE_SLASH_IPA_RE.match(text)
if m:
inner = m.group(1).strip().lstrip("'").strip()
if inner:
new_text = "[" + inner + "]" + text[m.end():]
slash_ipa_fixed += 1
inner = m.group(1).strip()
if not _SLASH_IPA_REJECT_RE.search(inner):
inner = inner.lstrip("'").strip()
if inner:
new_text = "[" + inner + "]" + text[m.end():]
slash_ipa_fixed += 1
if new_text != text:
cell["text"] = new_text