Fix IPA continuation to check all columns, not just en_col_type
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
The en_col_type heuristic (longest average text length) can pick the example column instead of the headword column, so IPA continuation cells in the actual headword column were missed. Step 5d now checks every column_* cell for garbled IPA patterns independently.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1614,69 +1614,54 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if orig:
|
if orig:
|
||||||
cell["col_type"] = orig
|
cell["col_type"] = orig
|
||||||
|
|
||||||
# 5d. Fix IPA continuation rows — rows where the printed
|
# 5d. Fix IPA continuation cells — cells where the printed
|
||||||
# phonetic transcription wraps to a line below the headword.
|
# phonetic transcription wraps to a line below the headword.
|
||||||
# These contain garbled IPA in the EN column. Replace garbled
|
# These contain garbled IPA (e.g. "[n, nn]", "[1uedtX,1]").
|
||||||
# text with proper IPA looked up from the headword in the
|
# Replace garbled text with proper IPA looked up from the
|
||||||
# previous row.
|
# headword in the previous row's same column.
|
||||||
|
# Note: We check ALL columns, not just en_col_type, because
|
||||||
|
# the EN headword column may not be the longest-average column.
|
||||||
ipa_cont_fixed = 0
|
ipa_cont_fixed = 0
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
|
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
|
||||||
z_cells = z.get("cells", [])
|
z_cells = z.get("cells", [])
|
||||||
for idx, row in enumerate(rows_sorted):
|
for idx, row in enumerate(rows_sorted):
|
||||||
ri = row["index"]
|
|
||||||
row_cells = [c for c in z_cells if c.get("row_index") == ri]
|
|
||||||
en_cells = [
|
|
||||||
c for c in row_cells
|
|
||||||
if c.get("col_type") == en_col_type
|
|
||||||
]
|
|
||||||
if not en_cells:
|
|
||||||
continue
|
|
||||||
en_text = en_cells[0].get("text", "")
|
|
||||||
if not _text_has_garbled_ipa(en_text):
|
|
||||||
continue
|
|
||||||
# Already has proper IPA brackets → already fixed
|
|
||||||
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text):
|
|
||||||
continue
|
|
||||||
# When the EN cell does NOT look obviously garbled
|
|
||||||
# (e.g. bracketed non-IPA), require that other columns
|
|
||||||
# are empty — otherwise it's a normal content row.
|
|
||||||
en_stripped = en_text.strip()
|
|
||||||
is_bracket_garbled = (
|
|
||||||
en_stripped.startswith('[') and en_stripped.endswith(']')
|
|
||||||
)
|
|
||||||
if not is_bracket_garbled:
|
|
||||||
other_cells = [
|
|
||||||
c for c in row_cells
|
|
||||||
if c.get("col_type") != en_col_type
|
|
||||||
and len((c.get("text") or "").strip()) >= 3
|
|
||||||
]
|
|
||||||
if other_cells:
|
|
||||||
continue
|
|
||||||
# Find headword in previous row
|
|
||||||
if idx == 0:
|
if idx == 0:
|
||||||
continue
|
continue
|
||||||
prev_ri = rows_sorted[idx - 1]["index"]
|
ri = row["index"]
|
||||||
prev_en = [
|
row_cells = [c for c in z_cells if c.get("row_index") == ri]
|
||||||
c for c in z_cells
|
for cell in row_cells:
|
||||||
if c.get("row_index") == prev_ri
|
ct = cell.get("col_type", "")
|
||||||
and c.get("col_type") == en_col_type
|
if not ct.startswith("column_"):
|
||||||
]
|
continue
|
||||||
if not prev_en:
|
cell_text = cell.get("text", "")
|
||||||
continue
|
if not _text_has_garbled_ipa(cell_text):
|
||||||
prev_text = prev_en[0].get("text", "")
|
continue
|
||||||
fixed = fix_ipa_continuation_cell(
|
# Already has proper IPA brackets → already fixed
|
||||||
en_text, prev_text, pronunciation="british",
|
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
|
||||||
)
|
continue
|
||||||
if fixed != en_text:
|
# Find headword in previous row, same column
|
||||||
en_cells[0]["text"] = fixed
|
prev_ri = rows_sorted[idx - 1]["index"]
|
||||||
ipa_cont_fixed += 1
|
prev_same_col = [
|
||||||
logger.info(
|
c for c in z_cells
|
||||||
"IPA continuation R%d: '%s' → '%s'",
|
if c.get("row_index") == prev_ri
|
||||||
ri, en_text, fixed,
|
and c.get("col_type") == ct
|
||||||
|
]
|
||||||
|
if not prev_same_col:
|
||||||
|
continue
|
||||||
|
prev_text = prev_same_col[0].get("text", "")
|
||||||
|
fixed = fix_ipa_continuation_cell(
|
||||||
|
cell_text, prev_text, pronunciation="british",
|
||||||
)
|
)
|
||||||
|
if fixed != cell_text:
|
||||||
|
cell["text"] = fixed
|
||||||
|
ipa_cont_fixed += 1
|
||||||
|
logger.info(
|
||||||
|
"IPA continuation R%d %s: '%s' → '%s'",
|
||||||
|
ri, ct, cell_text, fixed,
|
||||||
|
)
|
||||||
if ipa_cont_fixed:
|
if ipa_cont_fixed:
|
||||||
logger.info("Fixed %d IPA continuation rows", ipa_cont_fixed)
|
logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
|
||||||
|
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user