breakpilot-lehrer/klausur-service/backend/ocr/engines/cell_phonetics.py

"""Cell-level IPA phonetic fixes for overlay mode.

In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
(entry['english']).  But the overlay reads cell['text'] directly, so
phonetic fixes must be applied to cells too.

Split from cv_ocr_engines.py — contains fix_cell_phonetics() and helpers.
"""

import logging
import re
from typing import Any, Dict, List

from cv_vocab_types import IPA_AVAILABLE

from cv_ocr_ipa_lookup import (
    _insert_missing_ipa,
    _replace_phonetics_in_text,
    _text_has_garbled_ipa,
)
from cv_ocr_ipa_repair import (
    _has_non_dict_trailing,
    _insert_headword_ipa,
    _strip_post_bracket_garbled,
)

logger = logging.getLogger(__name__)


def fix_cell_phonetics(
    cells: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Apply IPA phonetic fixes to cell texts for overlay mode.

    In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
    (entry['english']).  But the overlay reads cell['text'] directly, so
    phonetic fixes must be applied to cells too.

    Processing depends on the cell's ``col_type``:

    - ``column_en``: Full processing. Garbled IPA is replaced with orphan
      bracket stripping enabled; if that changed nothing and the text looks
      like it carries garbled IPA (or has non-dictionary trailing words),
      missing IPA is inserted. Finally, when the text contains ``]``,
      trailing garbled fragments after the bracket are stripped.
    - ``column_text``: Light processing. Garbled IPA is replaced WITHOUT
      orphan bracket stripping (brackets may be German content like
      "(probieren)"). Headword IPA is inserted only when the replacement
      changed nothing AND the cell's ``word_boxes`` show a gap after the
      headword; in that case ``word_boxes`` is extended with a synthetic
      box so token and box counts stay aligned.
    - Any other column type, and cells with empty/whitespace-only text,
      are left untouched.

    Args:
        cells: Cell dicts; ``col_type``, ``text`` and ``word_boxes`` are read.
        pronunciation: Pronunciation variant forwarded to the IPA helpers.

    Returns:
        The same ``cells`` list. Cells are modified in place (``text`` and,
        for ``column_text`` insertions, ``word_boxes``). When IPA support is
        unavailable the list is returned unchanged.
    """
    if not IPA_AVAILABLE:
        return cells

    ipa_col_types = {'column_en', 'column_text'}
    replaced = 0

    for cell in cells:
        col_type = cell.get('col_type', '')
        if col_type not in ipa_col_types:
            continue
        # `or ''` also covers cells whose text is explicitly None.
        text = cell.get('text', '') or ''
        if not text.strip():
            continue

        if col_type == 'column_en':
            # Full processing: replace garbled IPA, strip orphan brackets.
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
            if new_text == text:
                # Insert IPA when garbled phonetics exist OR when trailing
                # non-dictionary words suggest garbled IPA in plain ASCII.
                if _text_has_garbled_ipa(text) or _has_non_dict_trailing(text, pronunciation):
                    new_text = _insert_missing_ipa(text, pronunciation)
            # Strip trailing garbled fragments after proper [IPA] brackets
            # (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
            if ']' in new_text:
                new_text = _strip_post_bracket_garbled(new_text, pronunciation)
        else:
            # column_text: replace garbled IPA, no orphan stripping
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
            # Insert headword IPA ONLY if there's a gap in word_boxes
            # suggesting Tesseract missed an IPA bracket on the page.
            # Without gap evidence, the original page had no IPA.
            if new_text == text:
                wb = cell.get('word_boxes', [])
                if _has_ipa_gap(text, wb):
                    inserted = _insert_headword_ipa(text, pronunciation)
                    if inserted != text:
                        new_text = inserted
                        # Keep word_boxes in step with the extra token.
                        _sync_word_boxes_after_ipa_insert(cell, text, new_text)

        if new_text != text:
            # Lazy %-formatting: the message is only built if DEBUG is enabled.
            logger.debug("fix_cell_phonetics: '%s' → '%s'", text, new_text)
            cell['text'] = new_text
            replaced += 1

    if replaced:
        logger.info("fix_cell_phonetics: %d IPA fixes in %d cells", replaced, len(cells))
    return cells


def _has_ipa_gap(text: str, word_boxes: List[Dict]) -> bool:
    """Check if word_boxes show a gap where IPA brackets should be.

    On a typical vocab page, the layout is:
        headword [ipa]              German translation

    If Tesseract missed the IPA bracket, the gap between the headword
    and the next word (German translation) is unusually large (>80px)
    because the IPA occupied physical space on the page.

    If no IPA was on the page (e.g. "be good at sth."), the words are
    close together (<30px).
    """
    if not word_boxes or len(word_boxes) < 2:
        return False

    tokens = text.split()
    if not tokens:
        return False

    # Find the headword index: skip numeric prefixes like "».55", "0.56"
    hw_box_idx = 0
    for i, wb in enumerate(word_boxes):
        wt = wb.get('text', '')
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wt)
        if len(clean) >= 2:
            hw_box_idx = i
            break

    if hw_box_idx >= len(word_boxes) - 1:
        return False

    # Check gap between headword and the next word_box
    hw = word_boxes[hw_box_idx]
    next_wb = word_boxes[hw_box_idx + 1]
    gap = next_wb['left'] - (hw['left'] + hw['width'])

    return gap > 80


def _sync_word_boxes_after_ipa_insert(
    cell: Dict[str, Any],
    old_text: str,
    new_text: str,
) -> None:
    """Insert a synthetic word_box for an IPA token added by IPA insertion.

    E.g. "challenge ..." → "challenge [tʃælɪndʒ] ..."
    Adds a new word_box right after the headword's box so the 1:1
    token-to-box mapping in the frontend overlay stays consistent.
    """
    word_boxes = cell.get('word_boxes')
    if not word_boxes:
        return

    old_tokens = old_text.split()
    new_tokens = new_text.split()

    if len(new_tokens) != len(old_tokens) + 1:
        return  # unexpected change, skip

    # Find the inserted token by walking both lists in parallel.
    # One token in new_tokens won't match — that's the inserted IPA.
    insert_idx = -1
    j = 0  # index into old_tokens
    for i in range(len(new_tokens)):
        if j < len(old_tokens) and new_tokens[i] == old_tokens[j]:
            j += 1
        else:
            insert_idx = i
            break

    if insert_idx < 0 or insert_idx >= len(new_tokens):
        return

    ipa_token = new_tokens[insert_idx]

    # The headword is at insert_idx - 1 in old_tokens (and word_boxes)
    ref_idx = insert_idx - 1
    if ref_idx < 0 or ref_idx >= len(word_boxes):
        return

    ref_box = word_boxes[ref_idx]
    ipa_box = {
        'text': ipa_token,
        'left': ref_box['left'] + ref_box['width'] + 2,
        'top': ref_box['top'],
        'width': ref_box['width'],
        'height': ref_box['height'],
        'conf': ref_box.get('conf', 90),
    }
    word_boxes.insert(insert_idx, ipa_box)