Fix garbled IPA in continuation rows using headword lookup

IPA continuation rows (phonetic transcription that wraps below the
headword) now get proper IPA by looking up headwords from the row
above. E.g. "ska:f – ska:vz" → "[skˈɑːf] – [skˈɑːvz]".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-19 10:28:14 +01:00
parent 050d410ba0
commit fc0ab84e40
2 changed files with 123 additions and 4 deletions

View File

@@ -1096,6 +1096,74 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
return ' '.join(words)
# Bracketed spans (e.g. an IPA group already attached to the headword).
_IPA_CONT_BRACKET_RE = re.compile(r'\[[^\]]*\]')
# Headword separators: an en/em dash with optional surrounding whitespace,
# or a hyphen that has whitespace on both sides ("see - saw - seen").
_IPA_CONT_DELIM_RE = re.compile(r'\s*[–—]\s*|\s+-\s+')
# Any character that cannot belong to a word token (letters, ' and - are kept).
_IPA_CONT_NON_WORD_RE = re.compile(r'[^a-zA-Z\'-]')


def fix_ipa_continuation_cell(
    garbled_text: str,
    headword_text: str,
    pronunciation: str = 'british',
) -> str:
    """Replace garbled IPA in a continuation row with looked-up IPA.

    Continuation rows appear below the headword and contain only the
    printed phonetic transcription, which OCR garbles into fragments
    like ``ska:f ska:vz``. Instead of repairing the fragments, the IPA is
    rebuilt from the headword(s) of the row above.

    The headword text is stripped of any ``[...]`` spans, split into parts
    on dash delimiters, and every word of every part is looked up via
    ``_lookup_ipa``. Each part that yields at least one transcription
    becomes one ``[...]`` group; the groups are joined with a single space
    (e.g. ``scarf – scarves`` gives two groups such as
    ``[skˈɑːf] [skˈɑːvz]``). Words without a lookup hit are left out.

    Args:
        garbled_text: The OCR-garbled IPA text from the continuation row.
        headword_text: The headword text from the previous row
            (e.g. ``scarf – scarves`` or ``see - saw - seen``).
        pronunciation: ``'british'`` or ``'american'``; passed through to
            ``_lookup_ipa`` unchanged.

    Returns:
        The rebuilt IPA text, or ``garbled_text`` unchanged when either
        input is empty, IPA lookup is unavailable, or no headword word
        could be resolved.
    """
    # Cheap emptiness checks first; the outcome is the same in any order.
    if not garbled_text or not headword_text or not IPA_AVAILABLE:
        return garbled_text

    # Drop IPA brackets that may already be attached to the headword.
    clean_hw = _IPA_CONT_BRACKET_RE.sub('', headword_text).strip()
    if not clean_hw:
        return garbled_text

    # "scarf – scarves"  -> ["scarf", "scarves"]
    # "see - saw - seen" -> ["see", "saw", "seen"]
    # A headword without any dash delimiter stays a single part.
    parts = [p.strip() for p in _IPA_CONT_DELIM_RE.split(clean_hw) if p.strip()]
    if not parts:
        return garbled_text

    ipa_parts: List[str] = []
    for part in parts:
        # A part may be multi-word, like "secondary school".
        word_ipas: List[str] = []
        for raw_word in part.split():
            # Keep letters plus inner apostrophes/hyphens ("o'clock",
            # "well-known"), but trim them from the edges: quote marks and
            # dangling hyphens around a word would make the lookup miss.
            word = _IPA_CONT_NON_WORD_RE.sub('', raw_word).strip("'-")
            if len(word) < 2:
                continue
            # Skip grammar words like "to" that are not part of the entry.
            if word.lower() in _GRAMMAR_BRACKET_WORDS:
                continue
            ipa = _lookup_ipa(word, pronunciation)
            if ipa:
                word_ipas.append(ipa)
        if word_ipas:
            ipa_parts.append('[' + ' '.join(word_ipas) + ']')

    if not ipa_parts:
        return garbled_text

    # NOTE(review): groups are joined with a plain space even when the
    # headword used a dash delimiter — confirm this is the intended format.
    result = ' '.join(ipa_parts)
    logger.debug(
        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
        garbled_text, result, headword_text,
    )
    return result
def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
"""Insert IPA for the first English headword in a long mixed-language line.

View File

@@ -22,7 +22,7 @@ from fastapi import APIRouter, HTTPException, Request
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_color_detect import detect_word_colors, recover_colored_text
from cv_ocr_engines import fix_cell_phonetics
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa
from cv_words_first import _cluster_rows, _build_cells
from ocr_pipeline_session_store import (
get_session_db,
@@ -1324,9 +1324,60 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if orig:
cell["col_type"] = orig
# 5d. IPA continuation rows are preserved — they contain the
# printed phonetic transcription that wraps to a line below the
# headword. The user can manually delete them if not needed.
# 5d. Fix IPA continuation rows — rows where the printed
# phonetic transcription wraps to a line below the headword.
# These contain only garbled IPA in the EN column and nothing
# in other columns. Replace garbled text with proper IPA
# looked up from the headword in the previous row.
ipa_cont_fixed = 0
for z in zones_data:
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
z_cells = z.get("cells", [])
for idx, row in enumerate(rows_sorted):
ri = row["index"]
row_cells = [c for c in z_cells if c.get("row_index") == ri]
en_cells = [
c for c in row_cells
if c.get("col_type") == en_col_type
]
# Other cells with ≥3 chars (ignore margin noise)
other_cells = [
c for c in row_cells
if c.get("col_type") != en_col_type
and len((c.get("text") or "").strip()) >= 3
]
if not en_cells or other_cells:
continue
en_text = en_cells[0].get("text", "")
if not _text_has_garbled_ipa(en_text):
continue
# Already has proper IPA brackets → already fixed
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text):
continue
# Find headword in previous row
if idx == 0:
continue
prev_ri = rows_sorted[idx - 1]["index"]
prev_en = [
c for c in z_cells
if c.get("row_index") == prev_ri
and c.get("col_type") == en_col_type
]
if not prev_en:
continue
prev_text = prev_en[0].get("text", "")
fixed = fix_ipa_continuation_cell(
en_text, prev_text, pronunciation="british",
)
if fixed != en_text:
en_cells[0]["text"] = fixed
ipa_cont_fixed += 1
logger.info(
"IPA continuation R%d: '%s''%s'",
ri, en_text, fixed,
)
if ipa_cont_fixed:
logger.info("Fixed %d IPA continuation rows", ipa_cont_fixed)
duration = time.time() - t0