Fix garbled IPA detection for bracket-notation like [n, nn] and [1uedtX,1]
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled

- Detect bracketed text without real IPA symbols as garbled OCR phonetics
- Allow IPA continuation fix even when other columns have content (for rows
  where EN cell is clearly garbled bracketed IPA)
- Strip parenthetical grammar annotations like (no pl) from headword before
  IPA lookup in fix_ipa_continuation_cell

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-19 23:28:00 +01:00
parent 7750b2a05f
commit 6bfa9eed86
3 changed files with 77 additions and 12 deletions

View File

@@ -1616,9 +1616,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# 5d. Fix IPA continuation rows — rows where the printed
# phonetic transcription wraps to a line below the headword.
# These contain only garbled IPA in the EN column and nothing
# in other columns. Replace garbled text with proper IPA
# looked up from the headword in the previous row.
# These contain garbled IPA in the EN column. Replace garbled
# text with proper IPA looked up from the headword in the
# previous row.
ipa_cont_fixed = 0
for z in zones_data:
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
@@ -1630,13 +1630,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
c for c in row_cells
if c.get("col_type") == en_col_type
]
# Other cells with ≥3 chars (ignore margin noise)
other_cells = [
c for c in row_cells
if c.get("col_type") != en_col_type
and len((c.get("text") or "").strip()) >= 3
]
if not en_cells or other_cells:
if not en_cells:
continue
en_text = en_cells[0].get("text", "")
if not _text_has_garbled_ipa(en_text):
@@ -1644,6 +1638,21 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# Already has proper IPA brackets → already fixed
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text):
continue
# Unless the EN cell is obviously garbled (entirely wrapped in
# brackets but without real IPA symbols, e.g. "[n, nn]"), require
# that other columns are empty — otherwise it's a normal content row.
en_stripped = en_text.strip()
is_bracket_garbled = (
en_stripped.startswith('[') and en_stripped.endswith(']')
)
if not is_bracket_garbled:
other_cells = [
c for c in row_cells
if c.get("col_type") != en_col_type
and len((c.get("text") or "").strip()) >= 3
]
if other_cells:
continue
# Find headword in previous row
if idx == 0:
continue