Fix garbled IPA in continuation rows using headword lookup
IPA continuation rows (phonetic transcription that wraps below the headword) now get proper IPA by looking up headwords from the row above. E.g. "ska:f – ska:vz" → "[skˈɑːf] – [skˈɑːvz]".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -22,7 +22,7 @@ from fastapi import APIRouter, HTTPException, Request
|
||||
|
||||
from cv_box_detect import detect_boxes, split_page_into_zones
|
||||
from cv_color_detect import detect_word_colors, recover_colored_text
|
||||
from cv_ocr_engines import fix_cell_phonetics
|
||||
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa
|
||||
from cv_words_first import _cluster_rows, _build_cells
|
||||
from ocr_pipeline_session_store import (
|
||||
get_session_db,
|
||||
@@ -1324,9 +1324,60 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
if orig:
|
||||
cell["col_type"] = orig
|
||||
|
||||
# 5d. IPA continuation rows are preserved — they contain the
|
||||
# printed phonetic transcription that wraps to a line below the
|
||||
# headword. The user can manually delete them if not needed.
|
||||
# 5d. Fix IPA continuation rows — rows where the printed
|
||||
# phonetic transcription wraps to a line below the headword.
|
||||
# These contain only garbled IPA in the EN column and nothing
|
||||
# in other columns. Replace garbled text with proper IPA
|
||||
# looked up from the headword in the previous row.
|
||||
ipa_cont_fixed = 0
|
||||
for z in zones_data:
|
||||
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
|
||||
z_cells = z.get("cells", [])
|
||||
for idx, row in enumerate(rows_sorted):
|
||||
ri = row["index"]
|
||||
row_cells = [c for c in z_cells if c.get("row_index") == ri]
|
||||
en_cells = [
|
||||
c for c in row_cells
|
||||
if c.get("col_type") == en_col_type
|
||||
]
|
||||
# Other cells with ≥3 chars (ignore margin noise)
|
||||
other_cells = [
|
||||
c for c in row_cells
|
||||
if c.get("col_type") != en_col_type
|
||||
and len((c.get("text") or "").strip()) >= 3
|
||||
]
|
||||
if not en_cells or other_cells:
|
||||
continue
|
||||
en_text = en_cells[0].get("text", "")
|
||||
if not _text_has_garbled_ipa(en_text):
|
||||
continue
|
||||
# Already has proper IPA brackets → already fixed
|
||||
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text):
|
||||
continue
|
||||
# Find headword in previous row
|
||||
if idx == 0:
|
||||
continue
|
||||
prev_ri = rows_sorted[idx - 1]["index"]
|
||||
prev_en = [
|
||||
c for c in z_cells
|
||||
if c.get("row_index") == prev_ri
|
||||
and c.get("col_type") == en_col_type
|
||||
]
|
||||
if not prev_en:
|
||||
continue
|
||||
prev_text = prev_en[0].get("text", "")
|
||||
fixed = fix_ipa_continuation_cell(
|
||||
en_text, prev_text, pronunciation="british",
|
||||
)
|
||||
if fixed != en_text:
|
||||
en_cells[0]["text"] = fixed
|
||||
ipa_cont_fixed += 1
|
||||
logger.info(
|
||||
"IPA continuation R%d: '%s' → '%s'",
|
||||
ri, en_text, fixed,
|
||||
)
|
||||
if ipa_cont_fixed:
|
||||
logger.info("Fixed %d IPA continuation rows", ipa_cont_fixed)
|
||||
|
||||
duration = time.time() - t0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user