Remove IPA continuation rows and support hyphenated word lookup
- grid_editor_api: After IPA correction, detect rows that contain only garbled phonetics in the English column (no German translation and no IPA brackets inserted). These are wrap-around lines where the printed IPA extends onto the line below the headword. They are removed because the headword row already has the correct IPA.
- cv_ocr_engines: _insert_missing_ipa now tries the dehyphenated form as a fallback (e.g. "second-hand" → "secondhand") for the dictionary lookup, fixing IPA insertion for compound words.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1026,6 +1026,9 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
|||||||
if clean.lower() in _GRAMMAR_BRACKET_WORDS:
|
if clean.lower() in _GRAMMAR_BRACKET_WORDS:
|
||||||
continue
|
continue
|
||||||
ipa = _lookup_ipa(clean, pronunciation)
|
ipa = _lookup_ipa(clean, pronunciation)
|
||||||
|
# Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
|
||||||
|
if not ipa and '-' in clean:
|
||||||
|
ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
|
||||||
if ipa:
|
if ipa:
|
||||||
words[i] = f"{w} [{ipa}]"
|
words[i] = f"{w} [{ipa}]"
|
||||||
# Strip garbled OCR phonetics after the IPA bracket.
|
# Strip garbled OCR phonetics after the IPA bracket.
|
||||||
|
|||||||
@@ -1196,6 +1196,50 @@ async def build_grid(session_id: str):
|
|||||||
if orig:
|
if orig:
|
||||||
cell["col_type"] = orig
|
cell["col_type"] = orig
|
||||||
|
|
||||||
|
# 5d. Remove IPA continuation rows — rows where the printed
|
||||||
|
# phonetic transcription wraps to a line below the headword.
|
||||||
|
# These rows have text only in the English column (+ margin
|
||||||
|
# noise) and fix_cell_phonetics did NOT insert IPA brackets
|
||||||
|
# (because there's no real English word to look up).
|
||||||
|
ipa_cont_rows: set = set()
|
||||||
|
for z in zones_data:
|
||||||
|
for row in z.get("rows", []):
|
||||||
|
ri = row["index"]
|
||||||
|
row_cells = [
|
||||||
|
c for c in z.get("cells", [])
|
||||||
|
if c.get("row_index") == ri
|
||||||
|
]
|
||||||
|
en_cells = [
|
||||||
|
c for c in row_cells
|
||||||
|
if c.get("col_type") == en_col_type
|
||||||
|
]
|
||||||
|
# Other cells with ≥3 chars (ignore margin noise)
|
||||||
|
other_cells = [
|
||||||
|
c for c in row_cells
|
||||||
|
if c.get("col_type") != en_col_type
|
||||||
|
and len((c.get("text") or "").strip()) >= 3
|
||||||
|
]
|
||||||
|
if en_cells and not other_cells:
|
||||||
|
en_text = en_cells[0].get("text", "")
|
||||||
|
# No IPA brackets → phonetics not recognized →
|
||||||
|
# this is a garbled IPA continuation row
|
||||||
|
if "[" not in en_text:
|
||||||
|
ipa_cont_rows.add(ri)
|
||||||
|
if ipa_cont_rows:
|
||||||
|
for z in zones_data:
|
||||||
|
z["rows"] = [
|
||||||
|
r for r in z.get("rows", [])
|
||||||
|
if r["index"] not in ipa_cont_rows
|
||||||
|
]
|
||||||
|
z["cells"] = [
|
||||||
|
c for c in z.get("cells", [])
|
||||||
|
if c.get("row_index") not in ipa_cont_rows
|
||||||
|
]
|
||||||
|
logger.info(
|
||||||
|
"removed %d IPA continuation rows: %s",
|
||||||
|
len(ipa_cont_rows), sorted(ipa_cont_rows),
|
||||||
|
)
|
||||||
|
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
|
|
||||||
# 6. Build result
|
# 6. Build result
|
||||||
|
|||||||
Reference in New Issue
Block a user