Files
breakpilot-lehrer/klausur-service/backend/cv_ipa_german.py
Benjamin Admin f860eb66e6
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m12s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
Add German IPA support (wiki-pronunciation-dict + epitran)
Hybrid approach mirroring English IPA:
- Primary: wiki-pronunciation-dict (636k entries, CC-BY-SA, Wiktionary)
- Fallback: epitran rule-based G2P (MIT license)

IPA modes now use language-appropriate dictionaries:
- auto/en: English IPA (Britfone + eng_to_ipa)
- de: German IPA (wiki-pronunciation-dict + epitran)
- all: EN column gets English IPA, other columns get German IPA
- none: disabled

Frontend shows CC-BY-SA attribution when German IPA is active.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-25 22:18:20 +01:00

136 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""German IPA insertion for grid editor cells.
Hybrid approach:
1. Primary lookup: wiki-pronunciation-dict (636k entries, CC-BY-SA)
2. Fallback: epitran rule-based G2P (MIT license)
German IPA data sourced from Wiktionary contributors (CC-BY-SA 4.0).
Attribution required — see grid editor UI.
Lizenz: Code Apache-2.0, IPA-Daten CC-BY-SA 4.0 (Wiktionary)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Dict, List, Optional, Set
logger = logging.getLogger(__name__)
# Characters taken as a sign that a text already carries a phonetic
# transcription: square brackets, the stress and length marks (ˈ ˌ ː) and a
# set of phonetic letters. Text matching this pattern is skipped so that IPA
# is not inserted a second time.
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
def _lookup_ipa_de(word: str) -> Optional[str]:
    """Look up the German IPA transcription for a single word.

    The lookup ignores surrounding whitespace. The pronunciation dictionary
    is consulted first (with the lower-cased word); if it has no entry, the
    rule-based epitran transliterator is used as a fallback when it is loaded.

    Args:
        word: A single word, expected to be free of punctuation.

    Returns:
        The IPA string, or None when the word is empty, no backend is
        available, or neither backend yields a transcription.
    """
    # Function-scope import: the backend objects are read on every call.
    from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        return None

    stripped = word.strip() if word else ""
    lower = stripped.lower()
    if not lower:
        return None

    # 1. Dictionary lookup. The dictionary can be missing or empty when only
    #    the epitran fallback is loaded, so guard before calling .get().
    if _de_ipa_dict:
        ipa = _de_ipa_dict.get(lower)
        if ipa:
            return ipa

    # 2. epitran fallback (rule-based).
    if _epitran_de is None:
        return None
    try:
        # Use the stripped word so stray whitespace never ends up in the IPA.
        result = _epitran_de.transliterate(stripped)
    except Exception:
        # Best effort: a failing fallback must not break cell processing,
        # but leave a trace for debugging instead of swallowing it silently.
        logging.getLogger(__name__).debug(
            "epitran transliteration failed for %r", stripped, exc_info=True
        )
        return None

    # A result equal to the lower-cased input means nothing was transcribed.
    if result and result != lower:
        return result
    return None
def _insert_ipa_for_text(text: str) -> str:
    """Insert German IPA after each recognized word in a text string.

    Words are the runs of characters between whitespace and the separators
    ``,``, ``;`` and ``:``. Each word is looked up by its letters only
    (ASCII letters, umlauts and ß); words with fewer than three letters are
    skipped. The transcription is appended in square brackets directly after
    the original word. Separators and whitespace (including line breaks) are
    preserved exactly as they appear in the input.

    Example:
        "bildschön, blendend" -> "bildschön [bɪltʃøn], blendend [blɛndənt]"

    Args:
        text: Cell text to annotate.

    Returns:
        The annotated text, or ``text`` unchanged when it is empty, already
        matches the IPA pattern, or no word could be transcribed.
    """
    if not text or _IPA_RE.search(text):
        return text

    # Splitting on a single capture group yields an alternating list:
    # even indices hold words, odd indices hold the delimiter runs. Keeping
    # the delimiters verbatim avoids collapsing newlines or repeated spaces.
    pieces = re.split(r'([,;:\s]+)', text)
    changed = False
    for idx in range(0, len(pieces), 2):
        piece = pieces[idx]
        # Strip punctuation and digits so only letters are used for lookup.
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', piece)
        if len(clean) < 3:
            continue
        ipa = _lookup_ipa_de(clean)
        if ipa:
            pieces[idx] = f"{piece} [{ipa}]"
            changed = True

    return ''.join(pieces) if changed else text
def insert_german_ipa(
    cells: List[Dict],
    target_cols: Set[str],
) -> int:
    """Insert German IPA transcriptions into cells of the target columns.

    Cells whose ``col_type`` is in ``target_cols`` and whose ``text`` is a
    non-blank string are passed through the IPA inserter. When the text
    changes, the cell's ``text`` is replaced and ``_ipa_corrected`` is set to
    True.

    Args:
        cells: Flat list of all cells (modified in-place).
        target_cols: Set of col_type values to process.

    Returns:
        Number of cells modified; 0 when no German IPA backend is available.
    """
    # Function-scope import: availability is checked on every call.
    from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        logger.warning("German IPA not available — skipping")
        return 0

    count = 0
    for cell in cells:
        if cell.get("col_type", "") not in target_cols:
            continue
        text = cell.get("text", "")
        # The key may exist with a None (or other non-string) value; such
        # cells have nothing to transcribe and must not crash the loop.
        if not isinstance(text, str) or not text.strip():
            continue
        new_text = _insert_ipa_for_text(text)
        if new_text != text:
            cell["text"] = new_text
            cell["_ipa_corrected"] = True
            count += 1

    if count:
        logger.info("German IPA inserted in %d cells", count)
    return count