# breakpilot-lehrer/klausur-service/backend/cv_ipa_german.py

"""German IPA insertion for grid editor cells.

Hybrid approach:
  1. Primary lookup: wiki-pronunciation-dict (636k entries, CC-BY-SA)
  2. Fallback: epitran rule-based G2P (MIT license)

German IPA data sourced from Wiktionary contributors (CC-BY-SA 4.0).
Attribution required — see grid editor UI.

License: code Apache-2.0, IPA data CC-BY-SA 4.0 (Wiktionary)
PRIVACY: all processing happens locally.
"""

import logging
import re
from typing import Dict, List, Optional, Set

logger = logging.getLogger(__name__)

# Characters taken as evidence that a text already carries a phonetic
# transcription: square brackets, the stress marks ˈ and ˌ, the length mark ː,
# and a selection of IPA letters. _insert_ipa_for_text() returns any text
# matching this pattern unchanged.
# NOTE(review): symbols common in German IPA such as ç, ø, œ, ʏ and ʁ are not
# in this set, so an unbracketed transcription using only those would not be
# detected — confirm whether that is intended.
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')


def _lookup_ipa_de(word: str) -> Optional[str]:
    """Look up the German IPA transcription of a single word.

    The word is lower-cased and stripped of surrounding whitespace, then
    resolved in two stages:

      1. the pronunciation dictionary ``_de_ipa_dict`` (636k entries);
      2. the rule-based ``_epitran_de`` transliterator, if one is loaded.

    Args:
        word: The word to transcribe.

    Returns:
        The IPA string, or None when no backend is available, the word is
        blank, or neither stage produced a transcription.
    """
    from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        return None

    normalized = word.lower().strip()
    if not normalized:
        return None

    # 1. Dictionary lookup. The truthiness guard keeps the epitran-only
    #    configuration working when the dictionary is empty or unset.
    if _de_ipa_dict:
        ipa = _de_ipa_dict.get(normalized)
        if ipa:
            return ipa

    # 2. epitran fallback (rule-based). It receives the same normalized form
    #    the dictionary was queried with, so the pass-through comparison
    #    below is like-for-like.
    if _epitran_de is not None:
        try:
            result = _epitran_de.transliterate(normalized)
        except Exception:
            # The fallback is best-effort: a failure must not abort the
            # caller, but it is recorded instead of vanishing silently.
            logger.debug(
                "epitran transliteration failed for %r",
                normalized,
                exc_info=True,
            )
            return None
        # An output identical to the input means nothing was transliterated.
        if result and result != normalized:
            return result

    return None


def _insert_ipa_for_text(text: str) -> str:
    """Insert German IPA after each recognized word in a text string.

    Words are the runs of characters between whitespace and the separators
    ``,``, ``;`` and ``:``, so comma-separated lists are handled:
      "bildschön, blendend" → "bildschön [bɪltʃøn], blendend [blɛndənt]"

    All whitespace and separators of the input (including line breaks and
    repeated spaces) are reproduced unchanged; only " [ipa]" suffixes are
    added after words.

    Args:
        text: The cell text to annotate.

    Returns:
        The annotated text, or ``text`` itself when it is empty, already
        matches ``_IPA_RE``, or no word could be transcribed.
    """
    if not text or _IPA_RE.search(text):
        return text

    annotated = []
    changed = False

    # The capturing group makes re.split() return the delimiter runs as well,
    # so joining the pieces again restores the original spacing exactly.
    for piece in re.split(r'([,;:\s]+)', text):
        # Keep only German letters for the lookup; delimiter and empty pieces
        # reduce to '' here and therefore fall through unchanged.
        letters = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', piece)

        # Words with fewer than three letters are never annotated.
        ipa = _lookup_ipa_de(letters) if len(letters) >= 3 else None
        if ipa:
            # The original piece (with any attached punctuation) is kept and
            # the transcription is appended after it.
            annotated.append(f"{piece} [{ipa}]")
            changed = True
        else:
            annotated.append(piece)

    return ''.join(annotated) if changed else text


def insert_german_ipa(
    cells: List[Dict],
    target_cols: Set[str],
) -> int:
    """Insert German IPA transcriptions into cells of target columns.

    Each cell whose ``"col_type"`` is in ``target_cols`` and whose ``"text"``
    is a non-blank string is passed through ``_insert_ipa_for_text``. When
    that changes the text, the cell's ``"text"`` is overwritten and
    ``"_ipa_corrected"`` is set to True.

    Args:
        cells: Flat list of all cells (modified in-place).
        target_cols: Set of col_type values to process.

    Returns:
        Number of cells modified; 0 when no German IPA backend is available.
    """
    from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        logger.warning("German IPA not available — skipping")
        return 0

    modified = 0
    for cell in cells:
        if cell.get("col_type", "") not in target_cols:
            continue

        # A "text" key that is present but None (or otherwise not a string)
        # is skipped instead of raising on .strip().
        text = cell.get("text")
        if not isinstance(text, str) or not text.strip():
            continue

        annotated = _insert_ipa_for_text(text)
        if annotated != text:
            cell["text"] = annotated
            cell["_ipa_corrected"] = True
            modified += 1

    if modified:
        logger.info("German IPA inserted in %d cells", modified)
    return modified