breakpilot-lehrer/klausur-service/backend/ocr/engines/ipa_lookup.py

"""
IPA lookup and phonetic bracket handling for OCR-extracted vocabulary.

Tesseract and other OCR engines frequently garble IPA phonetic transcriptions
in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)).  This module
provides functions to:

- Look up correct IPA pronunciations (British/American) for English words.
- Detect and replace garbled phonetic brackets with dictionary IPA.
- Insert missing IPA for headwords where OCR destroyed the brackets entirely.
- Strip orphan brackets and post-bracket garbled fragments.
- Handle IPA continuation cells (phonetics on a separate row from headword).

All IPA data comes from open-source dictionaries:
- Britfone (MIT) for British English
- eng_to_ipa / CMU (MIT) for American English

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import logging
import re
from typing import Any, Dict, List, Optional

from cv_vocab_types import (
    IPA_AVAILABLE,
    _britfone_dict,
    _ipa_convert_american,
)

logger = logging.getLogger(__name__)


# --- D. Phonetic Bracket IPA Replacement ---

# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)

# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')

# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
_MIN_WORD_CONF = 30


def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up IPA for a word using the selected pronunciation dictionary.

    Args:
        word: English word to look up.
        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).

    Returns:
        IPA string or None if not found.
    """
    word_lower = word.lower().strip()
    if not word_lower:
        return None

    if pronunciation == 'british' and _britfone_dict:
        ipa = _britfone_dict.get(word_lower)
        if ipa:
            return ipa
        # Fallback to American if not in Britfone
        if _ipa_convert_american:
            result = _ipa_convert_american(word_lower)
            if result and '*' not in result:
                return result
        return None

    if pronunciation == 'american' and _ipa_convert_american:
        result = _ipa_convert_american(word_lower)
        if result and '*' not in result:
            return result
        # Fallback to Britfone if not in CMU
        if _britfone_dict:
            ipa = _britfone_dict.get(word_lower)
            if ipa:
                return ipa
        return None

    # Try any available source
    if _britfone_dict:
        ipa = _britfone_dict.get(word_lower)
        if ipa:
            return ipa
    if _ipa_convert_american:
        result = _ipa_convert_american(word_lower)
        if result and '*' not in result:
            return result

    return None


def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Swap OCR'd phonetic transcriptions in vocab entries for dictionary IPA.

    Example: "dance [du:ns]" becomes
    - British: "dance [dˈɑːns]"  (Britfone, MIT)
    - American: "dance [dæns]"    (eng_to_ipa/CMU, MIT)

    Only the 'english' value of each entry is touched, and only when it
    contains an opening bracket.  German and example fields hold meaningful
    parenthetical content ("Eis (gefrorenes Wasser)", "(sich beschweren)")
    and are therefore never processed.

    Args:
        entries: Vocab entries; modified in place.
        pronunciation: Passed through to the replacement helper.

    Returns:
        The same ``entries`` list (unchanged if IPA support is unavailable).
    """
    if not IPA_AVAILABLE:
        return entries

    replaced_count = 0
    for entry in entries:
        text = entry.get('english', '') or ''
        has_opener = '[' in text or '{' in text or '(' in text
        if has_opener:
            new_text = _replace_phonetics_in_text(text, pronunciation)
            if new_text != text:
                replaced_count += 1
                logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'")
            entry['english'] = new_text

    if replaced_count:
        logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
    return entries


# Grammar particles that appear in brackets after English words:
#   cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA.  Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({
    # English prepositions/particles commonly in vocab tables
    'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
    # English grammar abbreviations used in vocab tables
    'sth', 'sb', 'adj', 'adv',
    # Number/plural/grammar annotations
    'pl', 'sg', 'sing', 'no', 'also', 'auch',
    # Regional English markers
    'ae', 'be', 'ame', 'bre',
})


def _is_grammar_bracket_content(content: str) -> bool:
    """Return True if bracket content is grammar info in the ENGLISH field.

    Grammar info:  cross (with), complain (about/of), agree (on/with)
    NOT grammar:   [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]

    Since we only process the English field, we only need to recognize
    English grammar particles. Everything else is (garbled) IPA.
    """
    if not content:
        return False

    # Split on / and spaces for patterns like (about/of), (no pl)
    tokens = re.split(r'[/\s]+', content.strip().lower())
    tokens = [t for t in tokens if t]
    if not tokens:
        return False

    # ALL tokens must be known grammar words
    return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)


def _replace_phonetics_in_text(
    text: str,
    pronunciation: str = 'british',
    strip_orphans: bool = True,
) -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    Pass 1 matches a word followed by any bracket type and, if the word has
    a dictionary pronunciation, rewrites the match as ``word [ipa]``.
    Pass 2 (optional) removes leftover brackets that look like garbled IPA.
    Legitimate parenthetical content like (zer)brechen or (veranstaltung) is
    preserved.

    Args:
        text: Cell text to process.
        pronunciation: Passed to the IPA lookup ('british' or 'american').
        strip_orphans: If True, strip orphan brackets that look like garbled
            IPA.  Set to False for column_text where brackets may be German
            content.

    Returns:
        The processed text with surrounding whitespace stripped, or ``text``
        unchanged if IPA support is unavailable.
    """
    if not IPA_AVAILABLE:
        return text

    # Bracket groups written by pass 1.  Pass 2 tells correct IPA from garbage
    # only by heuristics (IPA symbols, letter count); dictionary IPA made up
    # of fewer than four plain ASCII letters would fail them and be stripped
    # right after insertion.  Remembering what was inserted prevents that.
    inserted_brackets = set()

    def replacer(match):
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)

        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match

        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)
        if not ipa:
            # No IPA for this word — keep as-is
            return full_match

        # Word has IPA → bracket content is phonetic (garbled or correct).
        # Exception: grammar particles like cross (with) — keep those.
        if _is_grammar_bracket_content(bracket_content):
            return full_match

        logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
        inserted_brackets.add(f"[{ipa}]")
        return f"{word} [{ipa}]"

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    if strip_orphans:
        # Second pass: strip remaining orphan brackets that are garbled IPA.
        # These were not rewritten by pass 1 (no word in front, or the word
        # has no dictionary entry).
        # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
        def _strip_orphan_bracket(m):
            # Never undo a replacement made by pass 1.
            if m.group(0) in inserted_brackets:
                return m.group(0)
            content = m.group(1).strip()
            # Keep grammar info: (about/of), (no pl)
            if _is_grammar_bracket_content(content):
                return m.group(0)
            # Keep correct IPA (contains Unicode IPA characters)
            if any(ch in _IPA_CHARS for ch in content):
                return m.group(0)
            # Keep real-word parentheticals like (probieren), (Profit), (Geld),
            # (sich beschweren).  Garbled IPA fragments are short nonsense
            # like (kros), (cy), (mais) with fewer than 4 letters.
            content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
            if len(content_alpha) >= 4:
                return m.group(0)
            logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
            return ''

        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)

    text = text.strip()

    return text


def _text_has_garbled_ipa(text: str) -> bool:
    """Check if text contains garbled IPA-like fragments from OCR.

    Returns True if there is evidence of OCR-mangled phonetic
    transcription, e.g. stress marks, length marks, or IPA special chars.
    This is used to decide whether ``_insert_missing_ipa`` should run:
    it must only insert IPA to *replace* garbled phonetics that are already
    in the text — never to ADD phonetics where none existed on the page.
    """
    # Bracketed text that doesn't contain valid IPA symbols is garbled OCR
    # of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
    stripped = text.strip()
    if stripped.startswith('[') and stripped.endswith(']'):
        inner = stripped[1:-1]
        # Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
        if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
            # Not a valid dictionary-style bracket like "(no pl)" — those
            # use parentheses, not square brackets.  Square brackets with
            # no IPA chars are garbled phonetics.
            return True

    for w in text.strip().split():
        # Skip delimiters and very short tokens
        if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
            continue
        # Starts with stress mark (OCR read IPA stress ' as apostrophe)
        if w.startswith("'") and len(w) > 1 and not w[1:].istitle():
            return True
        if w.startswith("\u02c8") or w.startswith("\u02cc"):  # ˈ ˌ
            return True
        # Contains IPA length mark ':' in a short non-word fragment
        if ':' in w and len(w) < 12:
            # But not things like "3:00" (time) or common words
            stripped = re.sub(r'[^a-zA-Z:]', '', w)
            if ':' in stripped and not stripped.replace(':', '').isalpha():
                continue
            return True
        # Contains IPA special characters
        if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
            return True
        # Embedded apostrophe suggesting merged garbled IPA with stress mark.
        # E.g. "Scotland'skotland" — OCR reads ˈ as '.
        # Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
        # chars to avoid contractions (don't, won't, o'clock).
        if "'" in w and not w.startswith("'"):
            apos_idx = w.index("'")
            after = w[apos_idx + 1:]
            if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
                return True
    return False


def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Try to decompose a compound word and concatenate IPA for each part.

    E.g. "schoolbag" → "school"+"bag" → IPA for both concatenated.
    Only returns IPA if BOTH parts are found in the dictionary.

    Every split with at least 3 characters per part is a candidate; the
    split whose first part is longest wins.

    Args:
        word: Candidate compound (case and surrounding whitespace ignored).
        pronunciation: Passed to the IPA lookup.

    Returns:
        Concatenated IPA of both parts, or None if IPA support is
        unavailable, the word has fewer than 6 characters, or no split has
        two known parts.
    """
    if not IPA_AVAILABLE:
        return None
    lower = word.lower().strip()
    if len(lower) < 6:
        return None  # too short for a compound

    # Scan from the longest first part downwards so the first hit is the
    # preferred split, and skip the second lookup when the first part is
    # unknown — both avoid needless dictionary lookups.
    for split_pos in range(len(lower) - 3, 2, -1):  # min 3 chars each part
        ipa_first = _lookup_ipa(lower[:split_pos], pronunciation)
        if not ipa_first:
            continue
        ipa_second = _lookup_ipa(lower[split_pos:], pronunciation)
        if ipa_second:
            return ipa_first + ipa_second

    return None


def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert dictionary IPA after the first resolvable word of a bracket-free cell.

    OCR sometimes garbles the phonetic transcription into plain-text fragments
    (e.g. "scare skea" where "skea" is garbled /skɛə/).  This scans the tokens
    for the first one that can be resolved to a pronunciation, appends
    ``[IPA]`` to it and drops the garbled fragments that follow.

    The text is returned unchanged when IPA support is unavailable, when it
    is empty or whitespace, when it already contains ``[``, ``{`` or ``(``
    (bracket replacement handles those), or when it has more than six
    whitespace-separated tokens.

    Each token is resolved by, in order:
      1. direct dictionary lookup,
      2. lookup with hyphens removed ("second-hand" → "secondhand"),
      3. compound decomposition ("schoolbag" → "school" + "bag"),
      4. splitting a merged token at its first IPA marker
         ("schoolbagsku:lbæg"); all later tokens are discarded in this case,
      5. the longest dictionary prefix of the token ("Scotland'skotland").

    Args:
        text: Cell text without brackets.
        pronunciation: Passed to the IPA lookup.

    Returns:
        The rewritten text (tokens joined by single spaces), or ``text``
        unchanged in the cases listed above.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    # Skip if already has brackets (IPA replacement handles those)
    if any(ch in text for ch in '[{('):
        return text

    # Only process short text fragments (typical vocab cells).
    # Long sentences / paragraphs should not get IPA insertions.
    words = text.strip().split()
    if len(words) > 6:
        return text

    # Characters marking the start of OCR'd phonetics inside a merged token.
    # Built once instead of on every loop iteration.
    ipa_split_chars = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')

    # Try to insert IPA for the first word that can be resolved.
    # Typical patterns: "challenge", "profit", "film", "badge"
    for i, w in enumerate(words):
        # Clean punctuation for lookup
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        # Skip German/grammar words
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
        # Fallback 0b: compound word decomposition
        # E.g. "schoolbag" → "school"+"bag" → concatenated IPA
        if not ipa:
            ipa = _decompose_compound(clean, pronunciation)
        # Fallback 1: IPA-marker split for merged tokens where OCR
        # joined headword with its IPA (e.g. "schoolbagsku:lbæg").
        # Find the first IPA marker character (:, æ, ɪ, etc.), walk
        # backwards ≤3 chars for the onset consonant cluster, and
        # split into headword + OCR IPA.
        if not ipa:
            first_marker = next(
                (p for p, ch in enumerate(w) if ch in ipa_split_chars), -1,
            )
            if first_marker >= 3:
                split = first_marker
                while (split > 0
                       and split > first_marker - 3
                       and w[split - 1].isalpha()
                       and w[split - 1].islower()):
                    split -= 1
                if split >= 2:
                    headword = w[:split]
                    ocr_ipa = w[split:]
                    hw_ipa = _lookup_ipa(headword, pronunciation)
                    if not hw_ipa:
                        # Try compound decomposition for the headword part
                        hw_ipa = _decompose_compound(headword, pronunciation)
                    if hw_ipa:
                        words[i] = f"{headword} [{hw_ipa}]"
                    else:
                        # Word not in dictionary — use OCR IPA
                        words[i] = f"{headword} [{ocr_ipa}]"
                    # Everything after the merged token is discarded.
                    words = words[:i + 1]
                    break
        # Fallback 2: prefix matching for merged tokens WITHOUT IPA
        # markers (e.g. "Scotland'skotland").  Find longest dictionary
        # prefix using only alpha chars to avoid punctuation matches.
        if not ipa:
            alpha = re.sub(r'[^a-zA-Z]', '', clean)
            if len(alpha) > 5:  # need at least 6 chars for meaningful split
                for end in range(len(alpha), 3, -1):  # min prefix 4 chars
                    prefix = alpha[:end]
                    test_ipa = _lookup_ipa(prefix, pronunciation)
                    if test_ipa:
                        ipa = test_ipa
                        w = prefix
                        words[i] = prefix
                        break
        if ipa:
            words[i] = f"{w} [{ipa}]"
            # Strip garbled OCR phonetics after the IPA bracket.
            # On scanned vocab pages, printed IPA is read as garbled
            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
            # After inserting correct IPA, remove remaining words that
            # aren't real English words, delimiters, or German text.
            kept = words[:i + 1]
            for j in range(i + 1, len(words)):
                wj = words[j]
                # Delimiter — keep this and everything after
                if wj in ('–', '—', '-', '/', '|', ',', ';'):
                    kept.extend(words[j:])
                    break
                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
                if re.match(r'^[\d.)\-]+$', wj):
                    kept.extend(words[j:])
                    break
                # Starts with uppercase — likely German or proper noun.
                # clean_j holds ASCII letters only, so a leading Ä/Ö/Ü is
                # checked separately; otherwise "Österreich" would be
                # reduced to "sterreich" and dropped as garbled phonetics.
                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                letters_j = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wj)
                starts_upper = bool(clean_j) and clean_j[0].isupper()
                starts_upper_umlaut = letters_j[:1] in ('Ä', 'Ö', 'Ü')
                if starts_upper or starts_upper_umlaut:
                    kept.extend(words[j:])
                    break
                # Known English word (≥2 chars) — keep it and rest
                if clean_j and len(clean_j) >= 2:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
                # Merged token: dictionary word + garbled IPA stuck together.
                # E.g. "fictionsalans'fIkfn" starts with "fiction".
                # Extract the dictionary prefix (≥4 chars) and add it with
                # IPA, but only if enough chars remain after the prefix (≥3)
                # to look like garbled IPA, not just a plural 's'.
                if clean_j and len(clean_j) >= 7:
                    for pend in range(min(len(clean_j) - 3, 15), 3, -1):
                        prefix_j = clean_j[:pend]
                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
                        if prefix_ipa:
                            kept.append(f"{prefix_j} [{prefix_ipa}]")
                            break
                    # Stop scanning: this token and all following tokens
                    # are dropped.
                    break
                # Otherwise — likely garbled phonetics, skip
            words = kept
            break

    return ' '.join(words)