Fix unbracketed IPA continuations: detect garbled IPA in single-cell rows
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m42s
CI / test-python-agent-core (push) Successful in 13s
CI / test-nodejs-website (push) Successful in 14s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m42s
CI / test-python-agent-core (push) Successful in 13s
CI / test-nodejs-website (push) Successful in 14s
Step 5d now also processes IPA continuations without brackets (e.g. "ska:f – ska:vz", "'sekandarr sku:l") when the row has exactly one content cell (a column_* cell other than column_1) and the text is garbled IPA that contains no real IPA Unicode symbols.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1782,6 +1782,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
# headword in the previous row's same column.
|
# headword in the previous row's same column.
|
||||||
# Note: We check ALL columns, not just en_col_type, because
|
# Note: We check ALL columns, not just en_col_type, because
|
||||||
# the EN headword column may not be the longest-average column.
|
# the EN headword column may not be the longest-average column.
|
||||||
|
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
||||||
ipa_cont_fixed = 0
|
ipa_cont_fixed = 0
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
|
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
|
||||||
@@ -1796,17 +1797,41 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if not ct.startswith("column_"):
|
if not ct.startswith("column_"):
|
||||||
continue
|
continue
|
||||||
cell_text = (cell.get("text") or "").strip()
|
cell_text = (cell.get("text") or "").strip()
|
||||||
# Only treat as continuation when text is entirely
|
if not cell_text:
|
||||||
# inside brackets — e.g. "[n, nn]", "[klaoz 'daun]".
|
|
||||||
# Text like "employee [im'ploi:]" has a headword
|
|
||||||
# OUTSIDE brackets and must NOT be overwritten.
|
|
||||||
if not (cell_text.startswith('[') and cell_text.endswith(']')):
|
|
||||||
continue
|
|
||||||
if not _text_has_garbled_ipa(cell_text):
|
|
||||||
continue
|
|
||||||
# Already has proper IPA brackets → already fixed
|
|
||||||
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
is_bracketed = (
|
||||||
|
cell_text.startswith('[') and cell_text.endswith(']')
|
||||||
|
)
|
||||||
|
|
||||||
|
if is_bracketed:
|
||||||
|
# Bracketed continuation: "[n, nn]", "[klaoz 'daun]"
|
||||||
|
# Text like "employee [im'ploi:]" is NOT fully
|
||||||
|
# bracketed and won't match here.
|
||||||
|
if not _text_has_garbled_ipa(cell_text):
|
||||||
|
continue
|
||||||
|
# Already has proper IPA brackets → skip
|
||||||
|
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Unbracketed continuation: "ska:f – ska:vz",
|
||||||
|
# "'sekandarr sku:l". Only treat as IPA
|
||||||
|
# continuation if this is the ONLY content cell
|
||||||
|
# in the row (single-cell row) and the text is
|
||||||
|
# garbled IPA without real IPA Unicode symbols.
|
||||||
|
content_cells_in_row = [
|
||||||
|
c for c in row_cells
|
||||||
|
if c.get("col_type", "").startswith("column_")
|
||||||
|
and c.get("col_type") != "column_1"
|
||||||
|
]
|
||||||
|
if len(content_cells_in_row) != 1:
|
||||||
|
continue
|
||||||
|
if not _text_has_garbled_ipa(cell_text):
|
||||||
|
continue
|
||||||
|
# Has real IPA symbols → already fixed or valid
|
||||||
|
if any(c in _REAL_IPA_CHARS for c in cell_text):
|
||||||
|
continue
|
||||||
|
|
||||||
# Find headword in previous row, same column
|
# Find headword in previous row, same column
|
||||||
prev_ri = rows_sorted[idx - 1]["index"]
|
prev_ri = rows_sorted[idx - 1]["index"]
|
||||||
prev_same_col = [
|
prev_same_col = [
|
||||||
|
|||||||
Reference in New Issue
Block a user