Fix unbracketed IPA continuations: detect garbled IPA in single-cell rows
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m42s
CI / test-python-agent-core (push) Successful in 13s
CI / test-nodejs-website (push) Successful in 14s

Step 5d now also processes IPA continuations without brackets (e.g.
"ska:f – ska:vz", "'sekandarr sku:l") when the row has exactly one content
cell and the text is garbled IPA containing no real IPA Unicode symbols
(none of ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 08:30:44 +01:00
parent 5c935eec23
commit 4290f70885

View File

@@ -1782,6 +1782,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# headword in the previous row's same column. # headword in the previous row's same column.
# Note: We check ALL columns, not just en_col_type, because # Note: We check ALL columns, not just en_col_type, because
# the EN headword column may not be the longest-average column. # the EN headword column may not be the longest-average column.
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
ipa_cont_fixed = 0 ipa_cont_fixed = 0
for z in zones_data: for z in zones_data:
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"]) rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
@@ -1796,17 +1797,41 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if not ct.startswith("column_"): if not ct.startswith("column_"):
continue continue
cell_text = (cell.get("text") or "").strip() cell_text = (cell.get("text") or "").strip()
# Only treat as continuation when text is entirely if not cell_text:
# inside brackets — e.g. "[n, nn]", "[klaoz 'daun]".
# Text like "employee [im'ploi:]" has a headword
# OUTSIDE brackets and must NOT be overwritten.
if not (cell_text.startswith('[') and cell_text.endswith(']')):
continue
if not _text_has_garbled_ipa(cell_text):
continue
# Already has proper IPA brackets → already fixed
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
continue continue
is_bracketed = (
cell_text.startswith('[') and cell_text.endswith(']')
)
if is_bracketed:
# Bracketed continuation: "[n, nn]", "[klaoz 'daun]"
# Text like "employee [im'ploi:]" is NOT fully
# bracketed and won't match here.
if not _text_has_garbled_ipa(cell_text):
continue
# Already has proper IPA brackets → skip
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
continue
else:
# Unbracketed continuation: "ska:f ska:vz",
# "'sekandarr sku:l". Only treat as IPA
# continuation if this is the ONLY content cell
# in the row (single-cell row) and the text is
# garbled IPA without real IPA Unicode symbols.
content_cells_in_row = [
c for c in row_cells
if c.get("col_type", "").startswith("column_")
and c.get("col_type") != "column_1"
]
if len(content_cells_in_row) != 1:
continue
if not _text_has_garbled_ipa(cell_text):
continue
# Has real IPA symbols → already fixed or valid
if any(c in _REAL_IPA_CHARS for c in cell_text):
continue
# Find headword in previous row, same column # Find headword in previous row, same column
prev_ri = rows_sorted[idx - 1]["index"] prev_ri = rows_sorted[idx - 1]["index"]
prev_same_col = [ prev_same_col = [