Add German IPA support (wiki-pronunciation-dict + epitran)

Hybrid approach mirroring English IPA:
- Primary: wiki-pronunciation-dict (636k entries, CC-BY-SA, Wiktionary)
- Fallback: epitran rule-based G2P (MIT license)

IPA modes now use language-appropriate dictionaries:
- auto/en: English IPA (Britfone + eng_to_ipa)
- de: German IPA (wiki-pronunciation-dict + epitran)
- all: EN column gets English IPA, other columns get German IPA
- none: disabled

Frontend shows CC-BY-SA attribution when German IPA is active.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-25 22:18:20 +01:00
parent a73ddce43d
commit f860eb66e6
7 changed files with 637123 additions and 34 deletions
@@ -22,6 +22,7 @@ interface GridToolbarProps {
 const IPA_LABELS: Record<IpaMode, string> = {
  auto: 'IPA: Auto',
  en: 'IPA: nur EN',
+  de: 'IPA: nur DE',
  all: 'IPA: Alle',
  none: 'IPA: Aus',
 }
@@ -93,16 +94,26 @@ export function GridToolbar({
      </button>

      {/* IPA mode */}
-      <select
-        value={ipaMode}
-        onChange={(e) => onIpaModeChange(e.target.value as IpaMode)}
-        className="px-2 py-1.5 text-xs rounded-md border border-gray-200 dark:border-gray-700 bg-white dark:bg-gray-800 text-gray-600 dark:text-gray-400"
-        title="Lautschrift (IPA): Auto = nur bei erkannten englischen Woertern, Alle = fuer alle Vokabeln, Aus = keine"
-      >
-        {(Object.keys(IPA_LABELS) as IpaMode[]).map((m) => (
-          <option key={m} value={m}>{IPA_LABELS[m]}</option>
-        ))}
-      </select>
+      <div className="flex items-center gap-1">
+        <select
+          value={ipaMode}
+          onChange={(e) => onIpaModeChange(e.target.value as IpaMode)}
+          className="px-2 py-1.5 text-xs rounded-md border border-gray-200 dark:border-gray-700 bg-white dark:bg-gray-800 text-gray-600 dark:text-gray-400"
+          title="Lautschrift (IPA): Auto = nur erkannte EN-Woerter, DE = deutsches IPA (Wiktionary), Alle = EN + DE, Aus = keine"
+        >
+          {(Object.keys(IPA_LABELS) as IpaMode[]).map((m) => (
+            <option key={m} value={m}>{IPA_LABELS[m]}</option>
+          ))}
+        </select>
+        {(ipaMode === 'de' || ipaMode === 'all') && (
+          <span
+            className="text-[9px] text-gray-400 dark:text-gray-500 cursor-help"
+            title="DE-Lautschrift: Wiktionary (CC-BY-SA 4.0) + epitran (MIT). EN-Lautschrift: Britfone (MIT) + eng_to_ipa (MIT)."
+          >
+            CC-BY-SA
+          </span>
+        )}
+      </div>

      {/* Syllable mode */}
      <select
@@ -14,7 +14,7 @@ export interface GridEditorState {
  selectedZone: number | null
 }

-export type IpaMode = 'auto' | 'all' | 'en' | 'none'
+export type IpaMode = 'auto' | 'all' | 'de' | 'en' | 'none'
 export type SyllableMode = 'auto' | 'all' | 'de' | 'en' | 'none'

 export function useGridEditor(sessionId: string | null) {
@@ -0,0 +1,135 @@
+"""German IPA insertion for grid editor cells.
+
+Hybrid approach:
+  1. Primary lookup: wiki-pronunciation-dict (636k entries, CC-BY-SA)
+  2. Fallback: epitran rule-based G2P (MIT license)
+
+German IPA data sourced from Wiktionary contributors (CC-BY-SA 4.0).
+Attribution required — see grid editor UI.
+
+License: code Apache-2.0, IPA data CC-BY-SA 4.0 (Wiktionary)
+PRIVACY: All processing happens locally.
+"""
+
+import logging
+import re
+from typing import Dict, List, Optional, Set
+
+logger = logging.getLogger(__name__)
+
+# Characters that mark a text as already carrying a phonetic transcription:
+# square brackets, the IPA stress/length marks, and a set of IPA letters.
+# _insert_ipa_for_text() returns any text matching this pattern unchanged.
+_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
+
+
+def _lookup_ipa_de(word: str) -> Optional[str]:
+    """Look up German IPA for a single word.
+
+    Returns IPA string or None if not found.
+    """
+    from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE
+
+    if not DE_IPA_AVAILABLE and _epitran_de is None:
+        return None
+
+    lower = word.lower().strip()
+    if not lower:
+        return None
+
+    # 1. Dictionary lookup (636k entries)
+    ipa = _de_ipa_dict.get(lower)
+    if ipa:
+        return ipa
+
+    # 2. epitran fallback (rule-based)
+    if _epitran_de is not None:
+        try:
+            result = _epitran_de.transliterate(word)
+            if result and result != word.lower():
+                return result
+        except Exception:
+            pass
+
+    return None
+
+
+def _insert_ipa_for_text(text: str) -> str:
+    """Insert German IPA after each recognized word in a text string.
+
+    Handles comma-separated lists:
+      "bildschön, blendend" → "bildschön [bɪltʃøn], blendend [blɛndənt]"
+
+    Skips cells already containing IPA brackets.
+    """
+    if not text or _IPA_RE.search(text):
+        return text
+
+    # Split on comma/semicolon sequences, keeping separators
+    tokens = re.split(r'([,;:]+\s*)', text)
+    result = []
+    changed = False
+
+    for tok in tokens:
+        # Keep separators as-is
+        if not tok or re.match(r'^[,;:\s]+$', tok):
+            result.append(tok)
+            continue
+
+        # Process words within this token
+        words = tok.split()
+        new_words = []
+        for w in words:
+            # Strip punctuation for lookup
+            clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', w)
+            if len(clean) < 3:
+                new_words.append(w)
+                continue
+
+            ipa = _lookup_ipa_de(clean)
+            if ipa:
+                new_words.append(f"{w} [{ipa}]")
+                changed = True
+            else:
+                new_words.append(w)
+
+        result.append(' '.join(new_words))
+
+    return ''.join(result) if changed else text
+
+
+def insert_german_ipa(
+    cells: List[Dict],
+    target_cols: Set[str],
+) -> int:
+    """Insert German IPA transcriptions into cells of target columns.
+
+    Args:
+        cells: Flat list of all cells (modified in-place).
+        target_cols: Set of col_type values to process.
+
+    Returns:
+        Number of cells modified.
+    """
+    from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de
+
+    if not DE_IPA_AVAILABLE and _epitran_de is None:
+        logger.warning("German IPA not available — skipping")
+        return 0
+
+    count = 0
+    for cell in cells:
+        ct = cell.get("col_type", "")
+        if ct not in target_cols:
+            continue
+        text = cell.get("text", "")
+        if not text.strip():
+            continue
+
+        new_text = _insert_ipa_for_text(text)
+        if new_text != text:
+            cell["text"] = new_text
+            cell["_ipa_corrected"] = True
+            count += 1
+
+    if count:
+        logger.info(f"German IPA inserted in {count} cells")
+    return count
@@ -65,6 +65,38 @@ if os.path.exists(_britfone_path):
 else:
    logger.info("Britfone not found — British IPA disabled")

+# --- German IPA Dictionary (CC-BY-SA, Wiktionary) ---
+
+DE_IPA_AVAILABLE = False
+_de_ipa_dict: Dict[str, str] = {}
+
+_de_ipa_path = os.path.join(os.path.dirname(__file__), 'data', 'de_ipa.tsv')
+if os.path.exists(_de_ipa_path):
+    try:
+        with open(_de_ipa_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                parts = line.rstrip('\n').split('\t', 1)
+                if len(parts) == 2:
+                    _de_ipa_dict[parts[0]] = parts[1]
+        DE_IPA_AVAILABLE = True
+        logger.info(f"German IPA loaded — {len(_de_ipa_dict)} entries (CC-BY-SA, Wiktionary)")
+    except Exception as e:
+        logger.warning(f"Failed to load German IPA: {e}")
+else:
+    logger.info("German IPA not found — German IPA disabled")
+
+# --- epitran German fallback (MIT license) ---
+
+_epitran_de = None
+try:
+    import epitran as _epitran_module
+    _epitran_de = _epitran_module.Epitran('deu-Latn')
+    logger.info("epitran loaded — German rule-based IPA fallback enabled")
+except ImportError:
+    logger.info("epitran not installed — German IPA fallback disabled")
+except Exception as e:
+    logger.warning(f"Failed to init epitran: {e}")
+
 # --- Language Detection Constants ---

 GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht',
@@ -900,42 +900,49 @@ async def _build_grid_core(
                en_col_type = max(col_cell_count, key=col_cell_count.get)

        # Decide which columns to process based on ipa_mode:
-        # auto/en: only the detected EN headword column
-        # de: all content columns EXCEPT the EN column
-        # all: all content columns
-        ipa_target_cols: set = set()
+        # auto/en: only the detected EN headword column (English IPA)
+        # de: all content columns EXCEPT the EN column (German IPA)
+        # all: EN column gets English IPA, other columns get German IPA
+        en_ipa_target_cols: set = set()
+        de_ipa_target_cols: set = set()
        if ipa_mode in ("auto", "en"):
            if en_col_type:
-                ipa_target_cols.add(en_col_type)
+                en_ipa_target_cols.add(en_col_type)
        elif ipa_mode == "de":
-            ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
+            de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
        elif ipa_mode == "all":
-            ipa_target_cols = all_content_cols
+            if en_col_type:
+                en_ipa_target_cols.add(en_col_type)
+            de_ipa_target_cols = all_content_cols - en_ipa_target_cols

-        if ipa_target_cols:
+        # --- English IPA (Britfone + eng_to_ipa) ---
+        if en_ipa_target_cols:
            for cell in all_cells:
                ct = cell.get("col_type")
-                if ct in ipa_target_cols:
+                if ct in en_ipa_target_cols:
                    cell["_orig_col_type"] = ct
-                    # Full IPA processing (incl. insertion) only for the
-                    # detected English column; other columns get light
-                    # processing (bracket replacement only) — our IPA
-                    # dictionary is English-only, so inserting IPA into
-                    # German text would corrupt it.
-                    if ct == en_col_type:
-                        cell["col_type"] = "column_en"
-                    else:
-                        cell["col_type"] = "column_text"
-        # Snapshot text before IPA fix to detect which cells were modified
+                    cell["col_type"] = "column_en"
        _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
        fix_cell_phonetics(all_cells, pronunciation="british")
        for cell in all_cells:
            orig = cell.pop("_orig_col_type", None)
            if orig:
                cell["col_type"] = orig
-            # Mark cells whose text was changed by IPA correction so that
-            # later steps (5i) don't overwrite the corrected text when
-            # reconstructing from word_boxes.
+            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
+                cell["_ipa_corrected"] = True
+
+        # --- German IPA (wiki-pronunciation-dict + epitran) ---
+        if de_ipa_target_cols:
+            from cv_ipa_german import insert_german_ipa
+            insert_german_ipa(all_cells, de_ipa_target_cols)
+
+        ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
+
+        # Mark cells whose text was changed by IPA correction so that
+        # later steps (5i) don't overwrite the corrected text when
+        # reconstructing from word_boxes.  (Already done inline above
+        # for English; insert_german_ipa sets _ipa_corrected too.)
+        for cell in all_cells:
            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
                cell["_ipa_corrected"] = True

@@ -1593,7 +1600,7 @@ async def _build_grid_core(
@router.post("/sessions/{session_id}/build-grid")
 async def build_grid(
    session_id: str,
-    ipa_mode: str = Query("auto", pattern="^(auto|all|en|none)$"),
+    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
 ):
    """Build a structured, zone-aware grid from existing Kombi word results.
@@ -35,6 +35,9 @@ onnxruntime
 # IPA pronunciation dictionary lookup (MIT license, bundled CMU dict ~134k words)
 eng-to-ipa

+# German IPA rule-based fallback for OOV words (MIT license)
+epitran
+
 # Spell-checker for rule-based OCR correction (MIT license)
 pyspellchecker>=0.8.1