Remove IPA continuation rows and support hyphenated word lookup

- grid_editor_api: After IPA correction, detect rows that contain only garbled phonetics in the English column (no German translation, no IPA brackets inserted). These are wrap-around lines where the printed IPA extends onto the line below the headword. Remove them, since the headword row already has the correct IPA.
- cv_ocr_engines: _insert_missing_ipa now tries the dehyphenated form as a fallback for dictionary lookup (e.g. "second-hand" → "secondhand"), fixing IPA insertion for compound words.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 12:05:38 +01:00
parent 821e5481c2
commit 8ef4c089cf
2 changed files with 47 additions and 0 deletions
@@ -1026,6 +1026,9 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
        if ipa:
            words[i] = f"{w} [{ipa}]"
            # Strip garbled OCR phonetics after the IPA bracket.
@@ -1196,6 +1196,50 @@ async def build_grid(session_id: str):
            if orig:
                cell["col_type"] = orig
        # 5d. Remove IPA continuation rows — rows where the printed
        # phonetic transcription wraps to a line below the headword.
        # These rows have text only in the English column (+ margin
        # noise) and fix_cell_phonetics did NOT insert IPA brackets
        # (because there's no real English word to look up).
        ipa_cont_rows: set = set()
        for z in zones_data:
            for row in z.get("rows", []):
                ri = row["index"]
                row_cells = [
                    c for c in z.get("cells", [])
                    if c.get("row_index") == ri
                ]
                en_cells = [
                    c for c in row_cells
                    if c.get("col_type") == en_col_type
                ]
                # Other cells with ≥3 chars (ignore margin noise)
                other_cells = [
                    c for c in row_cells
                    if c.get("col_type") != en_col_type
                    and len((c.get("text") or "").strip()) >= 3
                ]
                if en_cells and not other_cells:
                    en_text = en_cells[0].get("text", "")
                    # No IPA brackets → phonetics not recognized →
                    # this is a garbled IPA continuation row
                    if "[" not in en_text:
                        ipa_cont_rows.add(ri)
        if ipa_cont_rows:
            for z in zones_data:
                z["rows"] = [
                    r for r in z.get("rows", [])
                    if r["index"] not in ipa_cont_rows
                ]
                z["cells"] = [
                    c for c in z.get("cells", [])
                    if c.get("row_index") not in ipa_cont_rows
                ]
            logger.info(
                "removed %d IPA continuation rows: %s",
                len(ipa_cont_rows), sorted(ipa_cont_rows),
            )
    duration = time.time() - t0
    # 6. Build result