Fix IPA continuation: only process fully-bracketed cells, keep phrasal verb particles
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Two fixes:
1. Step 5d now only treats a cell as a continuation when its text is entirely inside brackets (e.g. "[n, nn]"). Cells with a headword outside the brackets (e.g. "employee [im'ploi:]") are no longer overwritten.
2. fix_ipa_continuation_cell no longer skips grammar words like "down" — they are part of the headword in phrasal verbs like "close sth. down".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1266,7 +1266,10 @@ def fix_ipa_continuation_cell(
|
|||||||
if not parts:
|
if not parts:
|
||||||
return garbled_text
|
return garbled_text
|
||||||
|
|
||||||
# Look up IPA for each headword part
|
# Look up IPA for each headword part.
|
||||||
|
# Do NOT skip grammar words here — they are integral parts of the
|
||||||
|
# headword (e.g. "close down", "the United Kingdom"). Grammar
|
||||||
|
# annotations like "(sth)", "(no pl)" are already stripped above.
|
||||||
ipa_parts: List[str] = []
|
ipa_parts: List[str] = []
|
||||||
for part in parts:
|
for part in parts:
|
||||||
# A part may be multi-word like "secondary school"
|
# A part may be multi-word like "secondary school"
|
||||||
@@ -1276,9 +1279,6 @@ def fix_ipa_continuation_cell(
|
|||||||
clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
|
clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
|
||||||
if not clean_w or len(clean_w) < 2:
|
if not clean_w or len(clean_w) < 2:
|
||||||
continue
|
continue
|
||||||
# Skip grammar words like "to" at the start
|
|
||||||
if clean_w.lower() in _GRAMMAR_BRACKET_WORDS:
|
|
||||||
continue
|
|
||||||
ipa = _lookup_ipa(clean_w, pronunciation)
|
ipa = _lookup_ipa(clean_w, pronunciation)
|
||||||
if ipa:
|
if ipa:
|
||||||
word_ipas.append(ipa)
|
word_ipas.append(ipa)
|
||||||
|
|||||||
@@ -1634,7 +1634,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
ct = cell.get("col_type", "")
|
ct = cell.get("col_type", "")
|
||||||
if not ct.startswith("column_"):
|
if not ct.startswith("column_"):
|
||||||
continue
|
continue
|
||||||
cell_text = cell.get("text", "")
|
cell_text = (cell.get("text") or "").strip()
|
||||||
|
# Only treat as continuation when text is entirely
|
||||||
|
# inside brackets — e.g. "[n, nn]", "[klaoz 'daun]".
|
||||||
|
# Text like "employee [im'ploi:]" has a headword
|
||||||
|
# OUTSIDE brackets and must NOT be overwritten.
|
||||||
|
if not (cell_text.startswith('[') and cell_text.endswith(']')):
|
||||||
|
continue
|
||||||
if not _text_has_garbled_ipa(cell_text):
|
if not _text_has_garbled_ipa(cell_text):
|
||||||
continue
|
continue
|
||||||
# Already has proper IPA brackets → already fixed
|
# Already has proper IPA brackets → already fixed
|
||||||
|
|||||||
@@ -499,3 +499,24 @@ class TestGarbledIpaDetection:
|
|||||||
)
|
)
|
||||||
assert fixed != "[1uedtX,1]"
|
assert fixed != "[1uedtX,1]"
|
||||||
assert "ɪkwˈɪpmənt" in fixed # equipment IPA
|
assert "ɪkwˈɪpmənt" in fixed # equipment IPA
|
||||||
|
|
||||||
|
def test_fix_continuation_close_down(self):
    """Continuation cell for the phrasal verb 'close sth. down'.

    The garbled bracket text must be replaced, and the result must
    contain IPA for both 'close' and the particle 'down'.
    """
    garbled = "[klaoz 'daun]"
    result = fix_ipa_continuation_cell(
        garbled,
        "close sth. down",
        pronunciation="british",
    )
    # The garbled input must not come back unchanged.
    assert result != garbled
    # Both headword parts must contribute IPA; "down" must NOT be skipped.
    for expected_ipa in ("klˈəʊs", "dˈaʊn"):
        assert expected_ipa in result
|
||||||
|
|
||||||
|
def test_headword_with_brackets_not_continuation(self):
    """A headword followed by brackets is not a continuation cell.

    For "employee [im'ploi:]" the garbled-IPA detector still returns
    True, but the text is not fully enclosed in brackets, so the
    "starts with '[' and ends with ']'" guard rejects it.
    """
    text = "employee [im'ploi:]"
    # The garbled-IPA check alone still triggers on this text.
    assert _text_has_garbled_ipa(text) is True
    # The bracket guard is what rules the cell out: the headword sits
    # outside the brackets, so the stripped text does not start with '['.
    stripped = text.strip()
    fully_bracketed = stripped.startswith('[') and stripped.endswith(']')
    assert not fully_bracketed
|
||||||
|
|||||||
Reference in New Issue
Block a user