Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m12s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
Hybrid approach mirroring English IPA: - Primary: wiki-pronunciation-dict (636k entries, CC-BY-SA, Wiktionary) - Fallback: epitran rule-based G2P (MIT license) IPA modes now use language-appropriate dictionaries: - auto/en: English IPA (Britfone + eng_to_ipa) - de: German IPA (wiki-pronunciation-dict + epitran) - all: EN column gets English IPA, other columns get German IPA - none: disabled Frontend shows CC-BY-SA attribution when German IPA is active. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
136 lines
3.6 KiB
Python
136 lines
3.6 KiB
Python
"""German IPA insertion for grid editor cells.
|
||
|
||
Hybrid approach:
|
||
1. Primary lookup: wiki-pronunciation-dict (636k entries, CC-BY-SA)
|
||
2. Fallback: epitran rule-based G2P (MIT license)
|
||
|
||
German IPA data sourced from Wiktionary contributors (CC-BY-SA 4.0).
|
||
Attribution required — see grid editor UI.
|
||
|
||
Lizenz: Code Apache-2.0, IPA-Daten CC-BY-SA 4.0 (Wiktionary)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
import re
|
||
from typing import Dict, List, Optional, Set
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||
|
||
# IPA/phonetic characters — skip cells that already contain IPA
|
||
# Matches a square bracket or any one of the listed IPA symbols. Note that
# plain "[" / "]" count as a hit, so bracketed text of any kind matches.
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
|
||
|
||
|
||
def _lookup_ipa_de(word: str) -> Optional[str]:
    """Look up the German IPA transcription for a single word.

    The pronunciation dictionary is consulted first, keyed by the
    lower-cased word. If it has no entry, the rule-based epitran
    transliterator is tried when one is loaded.

    Args:
        word: The word to transcribe. Surrounding whitespace is ignored.

    Returns:
        The IPA string, or None if neither source yields a transcription.
    """
    # Function-scope import of the shared IPA state
    # (presumably deferred so this module imports without the data — confirm).
    from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        return None

    stripped = word.strip()
    lower = stripped.lower()
    if not lower:
        return None

    # 1. Dictionary lookup (636k entries). The guard above lets execution
    #    reach this point with only epitran loaded, so tolerate a missing or
    #    empty dictionary instead of failing on ``.get``.
    ipa = _de_ipa_dict.get(lower) if _de_ipa_dict else None
    if ipa:
        return ipa

    # 2. epitran fallback (rule-based)
    if _epitran_de is not None:
        try:
            result = _epitran_de.transliterate(stripped)
        except Exception:
            # Best effort: a transliteration failure must not break cell
            # processing, but leave a trace for debugging.
            logger.debug(
                "epitran transliteration failed for %r", word, exc_info=True
            )
            return None
        # A result equal to the lower-cased input is treated as
        # "no transcription produced".
        if result and result != lower:
            return result

    return None
|
||
|
||
|
||
def _insert_ipa_for_text(text: str) -> str:
    """Insert German IPA after each recognized word in a text string.

    Handles separator-delimited lists:
      "bildschön, blendend" → "bildschön [bɪltʃøn], blendend [blɛndənt]"

    Text that already matches ``_IPA_RE`` (IPA symbols or square brackets)
    is returned untouched. Whitespace between words, including newlines and
    leading/trailing blanks, is preserved exactly; only the bracketed
    transcriptions are added.

    Args:
        text: Cell text to annotate.

    Returns:
        The annotated text, or the original ``text`` object when no word
        received a transcription.
    """
    if not text or _IPA_RE.search(text):
        return text

    changed = False

    def _annotate(match: "re.Match") -> str:
        """Return the matched word, followed by its IPA when one is found."""
        nonlocal changed
        raw = match.group(0)
        # Strip punctuation/digits for the lookup; the original spelling
        # (``raw``) is what ends up in the output.
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', raw)
        if len(clean) < 3:
            return raw
        ipa = _lookup_ipa_de(clean)
        if not ipa:
            return raw
        changed = True
        return f"{raw} [{ipa}]"

    # Split on comma/semicolon/colon runs, keeping the separators
    # (the capturing group makes re.split return them).
    parts = []
    for tok in re.split(r'([,;:]+\s*)', text):
        if not tok or re.match(r'^[,;:\s]+$', tok):
            # Separators and whitespace-only pieces pass through as-is.
            parts.append(tok)
        else:
            # Annotate each whitespace-delimited word in place so the
            # surrounding whitespace is left exactly as it was.
            parts.append(re.sub(r'\S+', _annotate, tok))

    return ''.join(parts) if changed else text
|
||
|
||
|
||
def insert_german_ipa(
    cells: List[Dict],
    target_cols: Set[str],
) -> int:
    """Insert German IPA transcriptions into cells of target columns.

    Each cell whose ``col_type`` is in ``target_cols`` and whose ``text`` is
    a non-blank string is passed through ``_insert_ipa_for_text``. When the
    text changes, the cell's ``text`` is overwritten and
    ``_ipa_corrected`` is set to True.

    Args:
        cells: Flat list of all cells (modified in-place).
        target_cols: Set of col_type values to process.

    Returns:
        Number of cells modified (0 when no German IPA source is available).
    """
    from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        logger.warning("German IPA not available — skipping")
        return 0

    count = 0
    for cell in cells:
        if cell.get("col_type", "") not in target_cols:
            continue

        text = cell.get("text", "")
        # ``text`` may be present but None (or another non-string); skip such
        # cells rather than failing on ``.strip()``.
        if not isinstance(text, str) or not text.strip():
            continue

        new_text = _insert_ipa_for_text(text)
        if new_text != text:
            cell["text"] = new_text
            cell["_ipa_corrected"] = True
            count += 1

    if count:
        logger.info(f"German IPA inserted in {count} cells")
    return count
|