Add German IPA support (wiki-pronunciation-dict + epitran)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m12s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m12s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
Hybrid approach mirroring English IPA:
- Primary: wiki-pronunciation-dict (636k entries, CC-BY-SA, Wiktionary)
- Fallback: epitran rule-based G2P (MIT license)

IPA modes now use language-appropriate dictionaries:
- auto/en: English IPA (Britfone + eng_to_ipa)
- de: German IPA (wiki-pronunciation-dict + epitran)
- all: EN column gets English IPA, other columns get German IPA
- none: disabled

Frontend shows CC-BY-SA attribution when German IPA is active.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,7 @@ interface GridToolbarProps {
|
|||||||
const IPA_LABELS: Record<IpaMode, string> = {
|
const IPA_LABELS: Record<IpaMode, string> = {
|
||||||
auto: 'IPA: Auto',
|
auto: 'IPA: Auto',
|
||||||
en: 'IPA: nur EN',
|
en: 'IPA: nur EN',
|
||||||
|
de: 'IPA: nur DE',
|
||||||
all: 'IPA: Alle',
|
all: 'IPA: Alle',
|
||||||
none: 'IPA: Aus',
|
none: 'IPA: Aus',
|
||||||
}
|
}
|
||||||
@@ -93,16 +94,26 @@ export function GridToolbar({
|
|||||||
</button>
|
</button>
|
||||||
|
|
||||||
{/* IPA mode */}
|
{/* IPA mode */}
|
||||||
|
<div className="flex items-center gap-1">
|
||||||
<select
|
<select
|
||||||
value={ipaMode}
|
value={ipaMode}
|
||||||
onChange={(e) => onIpaModeChange(e.target.value as IpaMode)}
|
onChange={(e) => onIpaModeChange(e.target.value as IpaMode)}
|
||||||
className="px-2 py-1.5 text-xs rounded-md border border-gray-200 dark:border-gray-700 bg-white dark:bg-gray-800 text-gray-600 dark:text-gray-400"
|
className="px-2 py-1.5 text-xs rounded-md border border-gray-200 dark:border-gray-700 bg-white dark:bg-gray-800 text-gray-600 dark:text-gray-400"
|
||||||
title="Lautschrift (IPA): Auto = nur bei erkannten englischen Woertern, Alle = fuer alle Vokabeln, Aus = keine"
|
title="Lautschrift (IPA): Auto = nur erkannte EN-Woerter, DE = deutsches IPA (Wiktionary), Alle = EN + DE, Aus = keine"
|
||||||
>
|
>
|
||||||
{(Object.keys(IPA_LABELS) as IpaMode[]).map((m) => (
|
{(Object.keys(IPA_LABELS) as IpaMode[]).map((m) => (
|
||||||
<option key={m} value={m}>{IPA_LABELS[m]}</option>
|
<option key={m} value={m}>{IPA_LABELS[m]}</option>
|
||||||
))}
|
))}
|
||||||
</select>
|
</select>
|
||||||
|
{(ipaMode === 'de' || ipaMode === 'all') && (
|
||||||
|
<span
|
||||||
|
className="text-[9px] text-gray-400 dark:text-gray-500 cursor-help"
|
||||||
|
title="DE-Lautschrift: Wiktionary (CC-BY-SA 4.0) + epitran (MIT). EN-Lautschrift: Britfone (MIT) + eng_to_ipa (MIT)."
|
||||||
|
>
|
||||||
|
CC-BY-SA
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
|
||||||
{/* Syllable mode */}
|
{/* Syllable mode */}
|
||||||
<select
|
<select
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ export interface GridEditorState {
|
|||||||
selectedZone: number | null
|
selectedZone: number | null
|
||||||
}
|
}
|
||||||
|
|
||||||
export type IpaMode = 'auto' | 'all' | 'en' | 'none'
|
export type IpaMode = 'auto' | 'all' | 'de' | 'en' | 'none'
|
||||||
export type SyllableMode = 'auto' | 'all' | 'de' | 'en' | 'none'
|
export type SyllableMode = 'auto' | 'all' | 'de' | 'en' | 'none'
|
||||||
|
|
||||||
export function useGridEditor(sessionId: string | null) {
|
export function useGridEditor(sessionId: string | null) {
|
||||||
|
|||||||
135
klausur-service/backend/cv_ipa_german.py
Normal file
135
klausur-service/backend/cv_ipa_german.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
"""German IPA insertion for grid editor cells.
|
||||||
|
|
||||||
|
Hybrid approach:
|
||||||
|
1. Primary lookup: wiki-pronunciation-dict (636k entries, CC-BY-SA)
|
||||||
|
2. Fallback: epitran rule-based G2P (MIT license)
|
||||||
|
|
||||||
|
German IPA data sourced from Wiktionary contributors (CC-BY-SA 4.0).
|
||||||
|
Attribution required — see grid editor UI.
|
||||||
|
|
||||||
|
Lizenz: Code Apache-2.0, IPA-Daten CC-BY-SA 4.0 (Wiktionary)
|
||||||
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Dict, List, Optional, Set
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# IPA/phonetic characters — skip cells that already contain IPA
|
||||||
|
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
|
||||||
|
|
||||||
|
|
||||||
|
def _lookup_ipa_de(word: str) -> Optional[str]:
    """Look up the German IPA transcription for a single word.

    The word is first searched in the pronunciation dictionary and, if it is
    not found there, handed to the epitran transliterator (when one is
    available).

    Args:
        word: A single word; surrounding whitespace is ignored.

    Returns:
        The IPA string, or None when neither source is available, the word
        is empty, or no transcription could be produced.
    """
    from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        return None

    stripped = word.strip()
    lower = stripped.lower()
    if not lower:
        return None

    # 1. Dictionary lookup.  The lowercase key is tried first (the original
    #    behaviour).  The spelling as written and the capitalised form are
    #    additional fallbacks: the loader stores keys verbatim, so entries for
    #    capitalised German nouns would otherwise never match.
    for key in dict.fromkeys((lower, stripped, lower.capitalize())):
        ipa = _de_ipa_dict.get(key)
        if ipa:
            return ipa

    # 2. epitran fallback (rule-based)
    if _epitran_de is not None:
        try:
            result = _epitran_de.transliterate(stripped)
        except Exception:
            # Best effort: a failing fallback must not break grid building,
            # but leave a trace instead of swallowing the error silently.
            logger.debug("epitran failed for %r", stripped, exc_info=True)
            return None
        # An output identical to the lowercased input means nothing was
        # actually transcribed, so treat it as "not found".
        if result and result != lower:
            return result

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _insert_ipa_for_text(text: str) -> str:
    """Insert German IPA after each recognized word in a text string.

    Handles comma-separated lists:
      "bildschön, blendend" → "bildschön [bɪltʃøn], blendend [blɛndənt]"

    Text that already matches ``_IPA_RE`` (brackets or phonetic characters)
    is returned untouched.  Separators (``,``, ``;``, ``:``) and all
    whitespace — including newlines and repeated spaces — are preserved
    exactly; only `` [ipa]`` suffixes are added.

    Args:
        text: Cell text to annotate.

    Returns:
        The annotated text, or the original ``text`` object when nothing
        was inserted.
    """
    if not text or _IPA_RE.search(text):
        return text

    # Split on comma/semicolon/colon runs; the capturing group keeps the
    # separators in the result so the text can be reassembled verbatim.
    pieces = re.split(r'([,;:]+\s*)', text)
    out = []
    changed = False

    for piece in pieces:
        # Keep separators (and empty split artefacts) as-is
        if not piece or re.match(r'^[,;:\s]+$', piece):
            out.append(piece)
            continue

        # Split on whitespace runs but keep them, so that line breaks and
        # multiple spaces inside a cell are not flattened to single spaces.
        for part in re.split(r'(\s+)', piece):
            if not part or part.isspace():
                out.append(part)
                continue

            # Strip punctuation/digits for the lookup; very short words
            # (fewer than 3 letters) are not annotated.
            clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', part)
            ipa = _lookup_ipa_de(clean) if len(clean) >= 3 else None
            if ipa:
                out.append(f"{part} [{ipa}]")
                changed = True
            else:
                out.append(part)

    return ''.join(out) if changed else text
|
||||||
|
|
||||||
|
|
||||||
|
def insert_german_ipa(
    cells: List[Dict],
    target_cols: Set[str],
) -> int:
    """Insert German IPA transcriptions into cells of target columns.

    Each modified cell gets its ``"text"`` replaced and ``"_ipa_corrected"``
    set to True.  Cells whose text is missing, not a string, or blank are
    skipped.

    Args:
        cells: Flat list of all cells (modified in-place).
        target_cols: Set of col_type values to process.

    Returns:
        Number of cells modified (0 when neither the dictionary nor the
        epitran fallback is available).
    """
    from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        logger.warning("German IPA not available — skipping")
        return 0

    count = 0
    for cell in cells:
        if cell.get("col_type", "") not in target_cols:
            continue

        # The key may be present with a None (or other non-string) value;
        # .get()'s default does not cover that case.
        text = cell.get("text")
        if not isinstance(text, str) or not text.strip():
            continue

        new_text = _insert_ipa_for_text(text)
        if new_text != text:
            cell["text"] = new_text
            cell["_ipa_corrected"] = True
            count += 1

    if count:
        logger.info(f"German IPA inserted in {count} cells")
    return count
|
||||||
@@ -65,6 +65,38 @@ if os.path.exists(_britfone_path):
|
|||||||
else:
|
else:
|
||||||
logger.info("Britfone not found — British IPA disabled")
|
logger.info("Britfone not found — British IPA disabled")
|
||||||
|
|
||||||
|
# --- German IPA Dictionary (CC-BY-SA, Wiktionary) ---
# Loaded once at import time from a tab-separated file with one
# "<word>\t<ipa>" pair per line.  Keys are stored exactly as written.

DE_IPA_AVAILABLE = False
_de_ipa_dict: Dict[str, str] = {}

_de_ipa_path = os.path.join(os.path.dirname(__file__), 'data', 'de_ipa.tsv')
if os.path.exists(_de_ipa_path):
    try:
        with open(_de_ipa_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.rstrip('\n').split('\t', 1)
                # Skip malformed rows and rows with an empty word or IPA field
                if len(parts) == 2 and parts[0] and parts[1]:
                    _de_ipa_dict[parts[0]] = parts[1]
    except Exception as e:
        # Do not leave a half-loaded dictionary behind while the flag says
        # "unavailable".
        _de_ipa_dict.clear()
        logger.warning(f"Failed to load German IPA: {e}")
    else:
        # Only advertise availability when at least one entry was parsed.
        if _de_ipa_dict:
            DE_IPA_AVAILABLE = True
            logger.info(f"German IPA loaded — {len(_de_ipa_dict)} entries (CC-BY-SA, Wiktionary)")
        else:
            logger.warning("German IPA file contains no entries — German IPA disabled")
else:
    logger.info("German IPA not found — German IPA disabled")

# --- epitran German fallback (MIT license) ---
# Optional dependency: _epitran_de stays None when the package is missing or
# cannot be initialised.

_epitran_de = None
try:
    import epitran as _epitran_module
    _epitran_de = _epitran_module.Epitran('deu-Latn')
    logger.info("epitran loaded — German rule-based IPA fallback enabled")
except ImportError:
    logger.info("epitran not installed — German IPA fallback disabled")
except Exception as e:
    logger.warning(f"Failed to init epitran: {e}")
|
||||||
|
|
||||||
# --- Language Detection Constants ---
|
# --- Language Detection Constants ---
|
||||||
|
|
||||||
GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht',
|
GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht',
|
||||||
|
|||||||
636901
klausur-service/backend/data/de_ipa.tsv
Normal file
636901
klausur-service/backend/data/de_ipa.tsv
Normal file
File diff suppressed because it is too large
Load Diff
@@ -900,42 +900,49 @@ async def _build_grid_core(
|
|||||||
en_col_type = max(col_cell_count, key=col_cell_count.get)
|
en_col_type = max(col_cell_count, key=col_cell_count.get)
|
||||||
|
|
||||||
# Decide which columns to process based on ipa_mode:
|
# Decide which columns to process based on ipa_mode:
|
||||||
# auto/en: only the detected EN headword column
|
# auto/en: only the detected EN headword column (English IPA)
|
||||||
# de: all content columns EXCEPT the EN column
|
# de: all content columns EXCEPT the EN column (German IPA)
|
||||||
# all: all content columns
|
# all: EN column gets English IPA, other columns get German IPA
|
||||||
ipa_target_cols: set = set()
|
en_ipa_target_cols: set = set()
|
||||||
|
de_ipa_target_cols: set = set()
|
||||||
if ipa_mode in ("auto", "en"):
|
if ipa_mode in ("auto", "en"):
|
||||||
if en_col_type:
|
if en_col_type:
|
||||||
ipa_target_cols.add(en_col_type)
|
en_ipa_target_cols.add(en_col_type)
|
||||||
elif ipa_mode == "de":
|
elif ipa_mode == "de":
|
||||||
ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
|
de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
|
||||||
elif ipa_mode == "all":
|
elif ipa_mode == "all":
|
||||||
ipa_target_cols = all_content_cols
|
if en_col_type:
|
||||||
|
en_ipa_target_cols.add(en_col_type)
|
||||||
|
de_ipa_target_cols = all_content_cols - en_ipa_target_cols
|
||||||
|
|
||||||
if ipa_target_cols:
|
# --- English IPA (Britfone + eng_to_ipa) ---
|
||||||
|
if en_ipa_target_cols:
|
||||||
for cell in all_cells:
|
for cell in all_cells:
|
||||||
ct = cell.get("col_type")
|
ct = cell.get("col_type")
|
||||||
if ct in ipa_target_cols:
|
if ct in en_ipa_target_cols:
|
||||||
cell["_orig_col_type"] = ct
|
cell["_orig_col_type"] = ct
|
||||||
# Full IPA processing (incl. insertion) only for the
|
|
||||||
# detected English column; other columns get light
|
|
||||||
# processing (bracket replacement only) — our IPA
|
|
||||||
# dictionary is English-only, so inserting IPA into
|
|
||||||
# German text would corrupt it.
|
|
||||||
if ct == en_col_type:
|
|
||||||
cell["col_type"] = "column_en"
|
cell["col_type"] = "column_en"
|
||||||
else:
|
|
||||||
cell["col_type"] = "column_text"
|
|
||||||
# Snapshot text before IPA fix to detect which cells were modified
|
|
||||||
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
||||||
fix_cell_phonetics(all_cells, pronunciation="british")
|
fix_cell_phonetics(all_cells, pronunciation="british")
|
||||||
for cell in all_cells:
|
for cell in all_cells:
|
||||||
orig = cell.pop("_orig_col_type", None)
|
orig = cell.pop("_orig_col_type", None)
|
||||||
if orig:
|
if orig:
|
||||||
cell["col_type"] = orig
|
cell["col_type"] = orig
|
||||||
|
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
||||||
|
cell["_ipa_corrected"] = True
|
||||||
|
|
||||||
|
# --- German IPA (wiki-pronunciation-dict + epitran) ---
|
||||||
|
if de_ipa_target_cols:
|
||||||
|
from cv_ipa_german import insert_german_ipa
|
||||||
|
insert_german_ipa(all_cells, de_ipa_target_cols)
|
||||||
|
|
||||||
|
ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
|
||||||
|
|
||||||
# Mark cells whose text was changed by IPA correction so that
|
# Mark cells whose text was changed by IPA correction so that
|
||||||
# later steps (5i) don't overwrite the corrected text when
|
# later steps (5i) don't overwrite the corrected text when
|
||||||
# reconstructing from word_boxes.
|
# reconstructing from word_boxes. (Already done inline above
|
||||||
|
# for English; insert_german_ipa sets _ipa_corrected too.)
|
||||||
|
for cell in all_cells:
|
||||||
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
||||||
cell["_ipa_corrected"] = True
|
cell["_ipa_corrected"] = True
|
||||||
|
|
||||||
@@ -1593,7 +1600,7 @@ async def _build_grid_core(
|
|||||||
@router.post("/sessions/{session_id}/build-grid")
|
@router.post("/sessions/{session_id}/build-grid")
|
||||||
async def build_grid(
|
async def build_grid(
|
||||||
session_id: str,
|
session_id: str,
|
||||||
ipa_mode: str = Query("auto", pattern="^(auto|all|en|none)$"),
|
ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
|
||||||
syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
|
syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
|
||||||
):
|
):
|
||||||
"""Build a structured, zone-aware grid from existing Kombi word results.
|
"""Build a structured, zone-aware grid from existing Kombi word results.
|
||||||
|
|||||||
@@ -35,6 +35,9 @@ onnxruntime
|
|||||||
# IPA pronunciation dictionary lookup (MIT license, bundled CMU dict ~134k words)
|
# IPA pronunciation dictionary lookup (MIT license, bundled CMU dict ~134k words)
|
||||||
eng-to-ipa
|
eng-to-ipa
|
||||||
|
|
||||||
|
# German IPA rule-based fallback for OOV words (MIT license)
|
||||||
|
epitran
|
||||||
|
|
||||||
# Spell-checker for rule-based OCR correction (MIT license)
|
# Spell-checker for rule-based OCR correction (MIT license)
|
||||||
pyspellchecker>=0.8.1
|
pyspellchecker>=0.8.1
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user