breakpilot-lehrer/klausur-service/backend/cv_ocr_ipa_repair.py

"""
Advanced IPA repair for OCR-extracted vocabulary.

Functions that detect and fix garbled IPA fragments trailing after
headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py
to stay within the 500 LOC budget.

Contains:
- _has_non_dict_trailing: detect non-dictionary trailing words
- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
"""

import logging
import re
from typing import Any, Dict, List, Optional

from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
    _lookup_ipa,
    _GRAMMAR_BRACKET_WORDS,
)

logger = logging.getLogger(__name__)


def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
    """Report whether a headword is followed only by unrecognised tokens.

    Serves as an extra trigger for ``_insert_missing_ipa`` in cases where
    ``_text_has_garbled_ipa`` says False because the garbled IPA looks
    like plain ASCII (e.g. "skea" for /skɛə/).

    Args:
        text: Cell text; only texts of 2 to 6 whitespace-separated tokens
            are considered.
        pronunciation: Forwarded unchanged to ``_lookup_ipa``.

    Returns:
        True when a headword (first token with an ``_lookup_ipa`` entry
        that is not a grammar-bracket word) exists, is not the last token,
        and no token after it is a delimiter, a numbering token, a
        capitalised word or a word known to ``_lookup_ipa``.
        False otherwise, including when IPA support is unavailable.
    """
    if not IPA_AVAILABLE:
        return False

    tokens = text.strip().split()
    if not 2 <= len(tokens) <= 6:
        return False

    def _is_headword(token: str) -> bool:
        # Letters, apostrophes and hyphens form the lookup key.
        letters = re.sub(r'[^a-zA-Z\'-]', '', token)
        return (
            len(letters) >= 2
            and letters.lower() not in _GRAMMAR_BRACKET_WORDS
            and bool(_lookup_ipa(letters, pronunciation))
        )

    # Position of the first token the dictionary recognises (-1 if none).
    head_pos = next(
        (pos for pos, token in enumerate(tokens) if _is_headword(token)),
        -1,
    )
    # No headword at all, or nothing trailing behind it.
    if head_pos < 0 or head_pos >= len(tokens) - 1:
        return False

    separators = ('–', '—', '-', '/', '|', ',', ';')
    for trailing in tokens[head_pos + 1:]:
        # A delimiter means structured content follows, not garbled IPA.
        if trailing in separators:
            return False
        # Digits / numbering such as "1", "2.", "3)" are not garbled IPA.
        if re.match(r'^[\d.)\-]+$', trailing):
            return False
        letters = re.sub(r'[^a-zA-Z]', '', trailing)
        # Capitalised token: treated as a real (likely German) word.
        if letters[:1].isupper():
            return False
        # A second dictionary word means the trailing part is real text.
        if len(letters) >= 2 and _lookup_ipa(letters, pronunciation):
            return False

    # Every trailing token was unrecognised.
    return True


def _strip_post_bracket_garbled(
    text: str, pronunciation: str = 'british',
) -> str:
    """Remove garbled IPA fragments trailing after the last ``]``.

    E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
         ``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
         ``seat [sˈiːt] belt si:t belt`` → ``seat [sˈiːt] belt``

    The tokens after the last closing bracket are scanned left to right:

    * a standalone delimiter keeps itself and everything after it;
    * a token containing an IPA marker character (``:``, ``ə``, ...) is
      dropped;
    * a token whose first ASCII letter is uppercase keeps itself and
      everything after it;
    * a token known to ``_lookup_ipa`` (at least two letters) is kept.
      If any later token carries an IPA marker, the following tokens are
      dropped up to the next delimiter or capitalised token, which is kept
      together with the rest. Otherwise everything after the word is kept;
    * any other token is dropped.

    Args:
        text: Cell text that may contain ``[IPA]`` brackets.
        pronunciation: Forwarded unchanged to ``_lookup_ipa``.

    Returns:
        The text up to the last ``]`` plus the kept tokens joined by single
        spaces, or ``text`` unchanged when there is no ``]`` or nothing
        but whitespace follows the last one.
    """
    close_pos = text.rfind(']')
    # No bracket at all, or the bracket is the very last character.
    if close_pos < 0 or close_pos >= len(text) - 1:
        return text

    head = text[:close_pos + 1].rstrip()
    tail_tokens = text[close_pos + 1:].split()
    if not tail_tokens:
        return text

    delimiters = ('–', '—', '-', '/', '|', ',', ';')
    ipa_markers = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')

    survivors: List[str] = []
    for pos, token in enumerate(tail_tokens):
        # Delimiter: structured content follows, keep all of it.
        if token in delimiters:
            survivors += tail_tokens[pos:]
            break

        # Token carries an IPA marker: drop just this token.
        if not ipa_markers.isdisjoint(token):
            continue

        letters = re.sub(r'[^a-zA-Z]', '', token)

        # Capitalised token (likely German): keep it and the rest.
        if letters[:1].isupper():
            survivors += tail_tokens[pos:]
            break

        # Too short or not in the dictionary: drop it.
        if len(letters) < 2 or not _lookup_ipa(letters, pronunciation):
            continue

        # Dictionary word, e.g. the "belt" of "seat [siːt] belt si:t belt".
        following = tail_tokens[pos + 1:]
        if any(not ipa_markers.isdisjoint(later) for later in following):
            # Marker-bearing tokens follow: keep the word, then resume
            # keeping only from the next delimiter or capitalised token.
            survivors.append(token)
            for offset, later in enumerate(following):
                later_letters = re.sub(r'[^a-zA-Z]', '', later)
                if later in delimiters or later_letters[:1].isupper():
                    survivors += following[offset:]
                    break
        else:
            # Nothing suspicious behind the word: keep it and the rest.
            survivors += tail_tokens[pos:]
        break

    if not survivors:
        return head
    return f"{head} {' '.join(survivors)}"


def fix_ipa_continuation_cell(
    garbled_text: str,
    headword_text: str,
    pronunciation: str = 'british',
) -> str:
    """Replace garbled IPA in a continuation row with proper IPA.

    Continuation rows appear below the headword and contain only the
    printed phonetic transcription, which OCR garbles into fragments
    like ``ska:f – ska:vz`` (should be ``[skˈɑːf] – [skˈɑːvz]``).

    Args:
        garbled_text: The OCR-garbled IPA text from the continuation row.
        headword_text: The headword text from the previous row
            (e.g. ``scarf – scarves``).
        pronunciation: ``'british'`` or ``'american'``; forwarded to
            ``_lookup_ipa``.

    Returns:
        Corrected IPA text. One ``[...]`` group is built per headword part
        and the groups are joined with `` – ``. If the headword row ends
        with an inline ``[IPA]`` bracket, that bracket is returned as is.
        ``garbled_text`` is returned unchanged when IPA support is
        unavailable, an argument is empty, or no IPA could be looked up.
    """
    if not IPA_AVAILABLE or not garbled_text or not headword_text:
        return garbled_text

    # If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
    # only generate continuation IPA for words NOT already covered.
    covered_words: set = set()
    lookup_text = headword_text
    if re.search(r'\[[^\]]*\]', headword_text):
        # Words before the first bracket already have their IPA shown.
        first_bracket = headword_text.index('[')
        for w in headword_text[:first_bracket].split():
            clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
            if len(clean) >= 2:
                covered_words.add(clean)

        last_bracket_end = headword_text.rfind(']')
        tail = headword_text[last_bracket_end + 1:].strip()

        if not re.search(r'[a-zA-Z]{2,}', tail):
            # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
            # — return the inline IPA directly (continuation duplicates it).
            # The search is bounded to the text before the last ']' so a
            # stray '[' in the tail cannot produce an empty slice.
            last_bracket_start = headword_text.rfind('[', 0, last_bracket_end)
            return headword_text[last_bracket_start:last_bracket_end + 1]

        # Only the tail words need continuation IPA.
        lookup_text = tail

    # Strip existing IPA brackets and parenthetical grammar annotations
    # like "(no pl)", "(sth)", "(sb)" from the headword text.
    clean_hw = re.sub(r'\[[^\]]*\]', '', lookup_text)
    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
    if not clean_hw:
        return garbled_text

    # Split headword by delimiters (– — and space-surrounded -)
    # "scarf – scarves" → ["scarf", "scarves"]
    # "see - saw - seen" → ["see", "saw", "seen"]
    parts = [
        p.strip()
        for p in re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
        if p.strip()
    ]
    if not parts:
        return garbled_text

    # Look up IPA for each headword part.
    # Skip articles (the, a, an) — they never get IPA in vocab books.
    # Other function words like "down", "up" are kept because they are
    # integral parts of phrasal verbs (e.g. "close down").
    # Skip words that already have inline IPA in the headword row.
    articles = {'the', 'a', 'an'}
    ipa_parts: List[str] = []
    for part in parts:
        # A part may be multi-word like "secondary school".
        word_ipas: List[str] = []
        for w in part.split():
            clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
            if len(clean_w) < 2:
                continue
            lowered = clean_w.lower()
            if lowered in covered_words or lowered in articles:
                continue
            ipa = _lookup_ipa(clean_w, pronunciation)
            if ipa:
                word_ipas.append(ipa)
        if word_ipas:
            ipa_parts.append('[' + ' '.join(word_ipas) + ']')

    if not ipa_parts:
        return garbled_text

    # Join with delimiter
    result = ' – '.join(ipa_parts)
    logger.debug(
        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
        garbled_text, result, lookup_text,
    )
    return result


def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA for the first English headword in a long mixed-language line.

    Unlike _insert_missing_ipa (for short column_en cells), this handles
    column_text lines of any length.  It only inserts IPA for the FIRST word
    if that word:
    - has no bracket following it already
    - has an IPA entry in the dictionary
    - is not a number/symbol prefix like "».55"

    Only the first three tokens are considered as headword candidates.
    Tokens with fewer than two letters after cleaning and grammar-bracket
    words are skipped. The first remaining token decides the outcome.

    Args:
        text: The full line text.
        pronunciation: Forwarded unchanged to ``_lookup_ipa``.

    Returns:
        The whitespace-normalised text with ``[ipa]`` inserted after the
        headword, or ``text`` unchanged when nothing was inserted.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    words = text.strip().split()
    bracket_openers = ('[', '{', '(')

    # Second token opens a bracket: IPA / annotation is already present.
    if len(words) > 1 and words[1].startswith(bracket_openers):
        return text

    # Try the first few words (skip numeric prefixes like "».55", "0.56")
    for i, w in enumerate(words[:3]):
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        # The headword may sit behind a skipped prefix, so the bracket
        # check must look at the token right after THIS word; otherwise
        # "».55 sea [siː] ..." would receive a second, duplicate bracket.
        if i + 1 < len(words) and words[i + 1].startswith(bracket_openers):
            return text
        ipa = _lookup_ipa(clean, pronunciation)
        if ipa:
            words[i] = f"{w} [{ipa}]"
            return ' '.join(words)
        # Stop at first real word even if no IPA found
        break

    return text