Add German IPA support (wiki-pronunciation-dict + epitran)

Hybrid approach mirroring English IPA:
- Primary: wiki-pronunciation-dict (636k entries, CC-BY-SA, Wiktionary)
- Fallback: epitran rule-based G2P (MIT license)

IPA modes now use language-appropriate dictionaries:
- auto/en: English IPA (Britfone + eng_to_ipa)
- de: German IPA (wiki-pronunciation-dict + epitran)
- all: EN column gets English IPA, other columns get German IPA
- none: disabled

Frontend shows CC-BY-SA attribution when German IPA is active.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-25 22:18:20 +01:00
parent a73ddce43d
commit f860eb66e6
7 changed files with 637123 additions and 34 deletions
--- a/klausur-service/backend/cv_ipa_german.py
+++ b/klausur-service/backend/cv_ipa_german.py
@@ -0,0 +1,135 @@
+"""German IPA insertion for grid editor cells.
+
+Hybrid approach:
+  1. Primary lookup: wiki-pronunciation-dict (636k entries, CC-BY-SA)
+  2. Fallback: epitran rule-based G2P (MIT license)
+
+German IPA data sourced from Wiktionary contributors (CC-BY-SA 4.0).
+Attribution required — see grid editor UI.
+
+License: code Apache-2.0, IPA data CC-BY-SA 4.0 (Wiktionary)
+PRIVACY: all processing happens locally.
+"""
+
+import logging
+import re
+from typing import Dict, List, Optional, Set
+
+logger = logging.getLogger(__name__)
+
+# Matches any single character that signals a phonetic transcription is
+# already present: square brackets, the stress marks ˈ ˌ, the length mark ː,
+# and a set of IPA letters. _insert_ipa_for_text() leaves matching text
+# untouched. Because '[' and ']' are part of the class, any text containing
+# a bracket counts as a match.
+_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
+
+
+def _lookup_ipa_de(word: str) -> Optional[str]:
+    """Look up the German IPA transcription of a single word.
+
+    Surrounding whitespace is stripped, then the word is resolved in two
+    steps:
+
+      1. Lookup of the lower-cased word in the pronunciation dictionary
+         (``_de_ipa_dict``).
+      2. Rule-based transliteration via epitran (``_epitran_de``), if an
+         epitran instance is loaded.
+
+    Args:
+        word: The word to transcribe.
+
+    Returns:
+        The IPA string, or None when no IPA source is available, the word
+        is empty after stripping, or no transcription could be produced.
+    """
+    from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE
+
+    if not DE_IPA_AVAILABLE and _epitran_de is None:
+        return None
+
+    stripped = word.strip()
+    lower = stripped.lower()
+    if not lower:
+        return None
+
+    # 1. Dictionary lookup. The truthiness guard skips an empty dictionary
+    #    and avoids an AttributeError should it be None when only epitran
+    #    is available (NOTE(review): confirm how cv_vocab_types initialises
+    #    _de_ipa_dict when the data file is missing).
+    if _de_ipa_dict:
+        ipa = _de_ipa_dict.get(lower)
+        if ipa:
+            return ipa
+
+    # 2. epitran fallback (rule-based)
+    if _epitran_de is None:
+        return None
+    try:
+        result = _epitran_de.transliterate(stripped)
+    except Exception:
+        # Best effort: a failing fallback must not break IPA insertion,
+        # but the failure is recorded instead of being silently dropped.
+        logger.debug("epitran failed for %r", stripped, exc_info=True)
+        return None
+
+    # An output identical to the lower-cased input is treated as
+    # "no transcription produced".
+    if result and result != lower:
+        return result
+    return None
+
+
+def _insert_ipa_for_text(text: str) -> str:
+    """Insert German IPA after each recognized word in a text string.
+
+    Handles comma-separated lists:
+      "bildschön, blendend" → "bildschön [bɪltʃøn], blendend [blɛndənt]"
+
+    Text that is empty or already matches ``_IPA_RE`` (brackets or IPA
+    characters) is returned unchanged. Separators and all whitespace,
+    including line breaks and repeated spaces, are kept exactly as they
+    appear in the input; only " [ipa]" suffixes are added.
+
+    Args:
+        text: Cell text to annotate.
+
+    Returns:
+        The annotated text, or the original ``text`` object when no word
+        received a transcription.
+    """
+    if not text or _IPA_RE.search(text):
+        return text
+
+    # Split on runs of comma/semicolon/colon (plus trailing whitespace).
+    # The capturing group makes re.split keep the separators in the result.
+    pieces = re.split(r'([,;:]+\s*)', text)
+    out = []
+    changed = False
+
+    for piece in pieces:
+        # Keep separators as-is
+        if not piece or re.match(r'^[,;:\s]+$', piece):
+            out.append(piece)
+            continue
+
+        # Split into words and whitespace runs, keeping the whitespace so
+        # that multi-line cells and indentation survive the rewrite.
+        for part in re.split(r'(\s+)', piece):
+            if not part or part.isspace():
+                out.append(part)
+                continue
+
+            # Strip everything but German letters for the lookup
+            clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', part)
+            # Words with fewer than three letters are not transcribed
+            ipa = _lookup_ipa_de(clean) if len(clean) >= 3 else None
+            if ipa:
+                out.append(f"{part} [{ipa}]")
+                changed = True
+            else:
+                out.append(part)
+
+    return ''.join(out) if changed else text
+
+
+def insert_german_ipa(
+    cells: List[Dict],
+    target_cols: Set[str],
+) -> int:
+    """Insert German IPA transcriptions into cells of target columns.
+
+    Cells whose "col_type" is not in ``target_cols``, and cells whose
+    "text" is missing, not a string, or blank, are skipped. A modified
+    cell gets its "text" replaced and "_ipa_corrected" set to True.
+
+    Args:
+        cells: Flat list of all cells (modified in-place).
+        target_cols: Set of col_type values to process.
+
+    Returns:
+        Number of cells modified (0 when no German IPA source is
+        available).
+    """
+    from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de
+
+    if not DE_IPA_AVAILABLE and _epitran_de is None:
+        logger.warning("German IPA not available — skipping")
+        return 0
+
+    count = 0
+    for cell in cells:
+        if cell.get("col_type", "") not in target_cols:
+            continue
+
+        text = cell.get("text")
+        # "text" may be present but None (or not a string); calling
+        # .strip() on it would raise, so such cells are skipped.
+        if not isinstance(text, str) or not text.strip():
+            continue
+
+        new_text = _insert_ipa_for_text(text)
+        if new_text != text:
+            cell["text"] = new_text
+            cell["_ipa_corrected"] = True
+            count += 1
+
+    if count:
+        logger.info("German IPA inserted in %d cells", count)
+    return count