"""German IPA insertion for grid editor cells. Hybrid approach: 1. Primary lookup: wiki-pronunciation-dict (636k entries, CC-BY-SA) 2. Fallback: epitran rule-based G2P (MIT license) German IPA data sourced from Wiktionary contributors (CC-BY-SA 4.0). Attribution required — see grid editor UI. Lizenz: Code Apache-2.0, IPA-Daten CC-BY-SA 4.0 (Wiktionary) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re from typing import Dict, List, Optional, Set logger = logging.getLogger(__name__) # IPA/phonetic characters — skip cells that already contain IPA _IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]') def _lookup_ipa_de(word: str) -> Optional[str]: """Look up German IPA for a single word. Returns IPA string or None if not found. """ from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE if not DE_IPA_AVAILABLE and _epitran_de is None: return None lower = word.lower().strip() if not lower: return None # 1. Dictionary lookup (636k entries) ipa = _de_ipa_dict.get(lower) if ipa: return ipa # 2. epitran fallback (rule-based) if _epitran_de is not None: try: result = _epitran_de.transliterate(word) if result and result != word.lower(): return result except Exception: pass return None def _insert_ipa_for_text(text: str) -> str: """Insert German IPA after each recognized word in a text string. Handles comma-separated lists: "bildschön, blendend" → "bildschön [bɪltʃøn], blendend [blɛndənt]" Skips cells already containing IPA brackets. """ if not text or _IPA_RE.search(text): return text # Split on comma/semicolon sequences, keeping separators tokens = re.split(r'([,;:]+\s*)', text) result = [] changed = False for tok in tokens: # Keep separators as-is if not tok or re.match(r'^[,;:\s]+$', tok): result.append(tok) continue # Process words within this token words = tok.split() new_words = [] for w in words: # Strip punctuation for lookup clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', w) if len(clean) < 3: new_words.append(w) continue ipa = _lookup_ipa_de(clean) if ipa: new_words.append(f"{w} [{ipa}]") changed = True else: new_words.append(w) result.append(' '.join(new_words)) return ''.join(result) if changed else text def insert_german_ipa( cells: List[Dict], target_cols: Set[str], ) -> int: """Insert German IPA transcriptions into cells of target columns. Args: cells: Flat list of all cells (modified in-place). target_cols: Set of col_type values to process. Returns: Number of cells modified. """ from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de if not DE_IPA_AVAILABLE and _epitran_de is None: logger.warning("German IPA not available — skipping") return 0 count = 0 for cell in cells: ct = cell.get("col_type", "") if ct not in target_cols: continue text = cell.get("text", "") if not text.strip(): continue new_text = _insert_ipa_for_text(text) if new_text != text: cell["text"] = new_text cell["_ipa_corrected"] = True count += 1 if count: logger.info(f"German IPA inserted in {count} cells") return count