Fix unbracketed IPA continuations: detect garbled IPA in single-cell rows
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m42s
CI / test-python-agent-core (push) Successful in 13s
CI / test-nodejs-website (push) Successful in 14s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m42s
CI / test-python-agent-core (push) Successful in 13s
CI / test-nodejs-website (push) Successful in 14s
Step 5d now also processes IPA continuations without brackets (e.g. "ska:f – ska:vz", "'sekandarr sku:l") when the row has only one content cell and the text is pure-ASCII garbled IPA (no real IPA Unicode symbols).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1782,6 +1782,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
# headword in the previous row's same column.
|
||||
# Note: We check ALL columns, not just en_col_type, because
|
||||
# the EN headword column may not be the longest-average column.
|
||||
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
||||
ipa_cont_fixed = 0
|
||||
for z in zones_data:
|
||||
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
|
||||
@@ -1796,17 +1797,41 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
if not ct.startswith("column_"):
|
||||
continue
|
||||
cell_text = (cell.get("text") or "").strip()
|
||||
# Only treat as continuation when text is entirely
|
||||
# inside brackets — e.g. "[n, nn]", "[klaoz 'daun]".
|
||||
# Text like "employee [im'ploi:]" has a headword
|
||||
# OUTSIDE brackets and must NOT be overwritten.
|
||||
if not (cell_text.startswith('[') and cell_text.endswith(']')):
|
||||
continue
|
||||
if not _text_has_garbled_ipa(cell_text):
|
||||
continue
|
||||
# Already has proper IPA brackets → already fixed
|
||||
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
|
||||
if not cell_text:
|
||||
continue
|
||||
|
||||
is_bracketed = (
|
||||
cell_text.startswith('[') and cell_text.endswith(']')
|
||||
)
|
||||
|
||||
if is_bracketed:
|
||||
# Bracketed continuation: "[n, nn]", "[klaoz 'daun]"
|
||||
# Text like "employee [im'ploi:]" is NOT fully
|
||||
# bracketed and won't match here.
|
||||
if not _text_has_garbled_ipa(cell_text):
|
||||
continue
|
||||
# Already has proper IPA brackets → skip
|
||||
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
|
||||
continue
|
||||
else:
|
||||
# Unbracketed continuation: "ska:f – ska:vz",
|
||||
# "'sekandarr sku:l". Only treat as IPA
|
||||
# continuation if this is the ONLY content cell
|
||||
# in the row (column_1 is not counted) and the text is
|
||||
# garbled IPA without real IPA Unicode symbols.
|
||||
content_cells_in_row = [
|
||||
c for c in row_cells
|
||||
if c.get("col_type", "").startswith("column_")
|
||||
and c.get("col_type") != "column_1"
|
||||
]
|
||||
if len(content_cells_in_row) != 1:
|
||||
continue
|
||||
if not _text_has_garbled_ipa(cell_text):
|
||||
continue
|
||||
# Has real IPA symbols → already fixed or valid
|
||||
if any(c in _REAL_IPA_CHARS for c in cell_text):
|
||||
continue
|
||||
|
||||
# Find headword in previous row, same column
|
||||
prev_ri = rows_sorted[idx - 1]["index"]
|
||||
prev_same_col = [
|
||||
|
||||
Reference in New Issue
Block a user