Remove IPA continuation rows and support hyphenated word lookup

- grid_editor_api: After IPA correction, detect rows containing only
  garbled phonetics in the English column (no German translation, no
  IPA brackets inserted). These are wrap-around lines where printed
  IPA extends to the line below the headword. Remove them since the
  headword row already has correct IPA.
- cv_ocr_engines: _insert_missing_ipa now tries the dehyphenated form
  as a fallback (e.g. "second-hand" → "secondhand") for dictionary
  lookup, fixing IPA insertion for compound words.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-18 12:05:38 +01:00
parent 821e5481c2
commit 8ef4c089cf
2 changed files with 47 additions and 0 deletions

View File

@@ -1196,6 +1196,50 @@ async def build_grid(session_id: str):
if orig:
cell["col_type"] = orig
# 5d. Remove IPA continuation rows — rows where the printed
#     phonetic transcription wraps to a line below the headword.
#     Such a row has text only in the English column (cells in other
#     columns with fewer than 3 stripped chars count as margin noise)
#     and fix_cell_phonetics did NOT insert IPA brackets (because
#     there's no real English word to look up).
#
#     Matches are tracked per zone and removed only from the zone in
#     which they were found.
#     NOTE(review): whether row indices are zone-local or global is
#     not visible here — confirm. Per-zone removal is correct in
#     both cases; a shared set would drop same-numbered rows in
#     unrelated zones if indices are zone-local.
ipa_cont_rows: set = set()  # union of all removed row indices (logging)
ipa_rows_per_zone: list = []  # one set per zone, aligned with zones_data
for z in zones_data:
    # Group this zone's cells by row once, instead of rescanning the
    # whole cell list for every row. Cell order within a row is kept.
    cells_by_row: dict = {}
    for c in z.get("cells", []):
        cells_by_row.setdefault(c.get("row_index"), []).append(c)

    zone_ipa_rows: set = set()
    for row in z.get("rows", []):
        ri = row["index"]
        row_cells = cells_by_row.get(ri, [])
        en_cells = [
            c for c in row_cells
            if c.get("col_type") == en_col_type
        ]
        if not en_cells:
            continue
        # Other cells with ≥3 chars (ignore margin noise)
        has_other_text = any(
            c.get("col_type") != en_col_type
            and len((c.get("text") or "").strip()) >= 3
            for c in row_cells
        )
        if has_other_text:
            continue
        # A cell may carry text=None; treat it like an empty string
        # rather than letting the membership test raise TypeError.
        en_text = en_cells[0].get("text") or ""
        # No IPA brackets → phonetics not recognized →
        # this is a garbled IPA continuation row
        if "[" not in en_text:
            zone_ipa_rows.add(ri)
    ipa_rows_per_zone.append(zone_ipa_rows)
    ipa_cont_rows |= zone_ipa_rows

if ipa_cont_rows:
    # Rebuild rows/cells for every zone (as before), but filter each
    # zone only against the rows flagged inside that same zone.
    for z, zone_ipa_rows in zip(zones_data, ipa_rows_per_zone):
        z["rows"] = [
            r for r in z.get("rows", [])
            if r["index"] not in zone_ipa_rows
        ]
        z["cells"] = [
            c for c in z.get("cells", [])
            if c.get("row_index") not in zone_ipa_rows
        ]
    logger.info(
        "removed %d IPA continuation rows: %s",
        sum(len(rows) for rows in ipa_rows_per_zone),
        sorted(ipa_cont_rows),
    )
duration = time.time() - t0
# 6. Build result