Fix IPA continuation to check all columns, not just en_col_type
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s

The en_col_type heuristic (longest average text) can pick the example
column instead of the headword column, so IPA continuation cells in the
actual headword column were missed. Step 5d now checks every column_*
cell for garbled IPA patterns independently of en_col_type.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-19 23:34:41 +01:00
parent 3c7fc43f43
commit 5f89913a9a

View File

@@ -1614,69 +1614,54 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if orig:
cell["col_type"] = orig
# 5d. Fix IPA continuation rows — rows where the printed
# 5d. Fix IPA continuation cells — cells where the printed
# phonetic transcription wraps to a line below the headword.
# These contain garbled IPA in the EN column. Replace garbled
# text with proper IPA looked up from the headword in the
# previous row.
# These contain garbled IPA (e.g. "[n, nn]", "[1uedtX,1]").
# Replace garbled text with proper IPA looked up from the
# headword in the previous row's same column.
# Note: We check ALL columns, not just en_col_type, because
# the EN headword column may not be the longest-average column.
ipa_cont_fixed = 0
for z in zones_data:
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
z_cells = z.get("cells", [])
for idx, row in enumerate(rows_sorted):
ri = row["index"]
row_cells = [c for c in z_cells if c.get("row_index") == ri]
en_cells = [
c for c in row_cells
if c.get("col_type") == en_col_type
]
if not en_cells:
continue
en_text = en_cells[0].get("text", "")
if not _text_has_garbled_ipa(en_text):
continue
# Already has proper IPA brackets → already fixed
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text):
continue
# When the EN cell does NOT look obviously garbled
# (e.g. bracketed non-IPA), require that other columns
# are empty — otherwise it's a normal content row.
en_stripped = en_text.strip()
is_bracket_garbled = (
en_stripped.startswith('[') and en_stripped.endswith(']')
)
if not is_bracket_garbled:
other_cells = [
c for c in row_cells
if c.get("col_type") != en_col_type
and len((c.get("text") or "").strip()) >= 3
]
if other_cells:
continue
# Find headword in previous row
if idx == 0:
continue
ri = row["index"]
row_cells = [c for c in z_cells if c.get("row_index") == ri]
for cell in row_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
cell_text = cell.get("text", "")
if not _text_has_garbled_ipa(cell_text):
continue
# Already has proper IPA brackets → already fixed
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
continue
# Find headword in previous row, same column
prev_ri = rows_sorted[idx - 1]["index"]
prev_en = [
prev_same_col = [
c for c in z_cells
if c.get("row_index") == prev_ri
and c.get("col_type") == en_col_type
and c.get("col_type") == ct
]
if not prev_en:
if not prev_same_col:
continue
prev_text = prev_en[0].get("text", "")
prev_text = prev_same_col[0].get("text", "")
fixed = fix_ipa_continuation_cell(
en_text, prev_text, pronunciation="british",
cell_text, prev_text, pronunciation="british",
)
if fixed != en_text:
en_cells[0]["text"] = fixed
if fixed != cell_text:
cell["text"] = fixed
ipa_cont_fixed += 1
logger.info(
"IPA continuation R%d: '%s''%s'",
ri, en_text, fixed,
"IPA continuation R%d %s: '%s''%s'",
ri, ct, cell_text, fixed,
)
if ipa_cont_fixed:
logger.info("Fixed %d IPA continuation rows", ipa_cont_fixed)
logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
duration = time.time() - t0