Add German IPA support (wiki-pronunciation-dict + epitran)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m12s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s

Hybrid approach (dictionary lookup with a rule-based fallback), mirroring the English IPA pipeline:
- Primary: wiki-pronunciation-dict (636k entries, CC-BY-SA, Wiktionary)
- Fallback: epitran rule-based G2P (MIT license)

IPA modes now use language-appropriate dictionaries:
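- auto/en: English IPA (Britfone + eng_to_ipa), applied only to the detected EN headword column
- de: German IPA (wiki-pronunciation-dict + epitran), applied to all content columns except the EN column
- all: the EN column gets English IPA; all other content columns get German IPA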
- none: disabled

Frontend shows CC-BY-SA attribution when German IPA is active.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-25 22:18:20 +01:00
parent a73ddce43d
commit f860eb66e6
7 changed files with 637123 additions and 34 deletions

View File

@@ -900,42 +900,49 @@ async def _build_grid_core(
en_col_type = max(col_cell_count, key=col_cell_count.get)
# Decide which columns to process based on ipa_mode:
# auto/en: only the detected EN headword column
# de: all content columns EXCEPT the EN column
# all: all content columns
ipa_target_cols: set = set()
# auto/en: only the detected EN headword column (English IPA)
# de: all content columns EXCEPT the EN column (German IPA)
# all: EN column gets English IPA, other columns get German IPA
en_ipa_target_cols: set = set()
de_ipa_target_cols: set = set()
if ipa_mode in ("auto", "en"):
if en_col_type:
ipa_target_cols.add(en_col_type)
en_ipa_target_cols.add(en_col_type)
elif ipa_mode == "de":
ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
elif ipa_mode == "all":
ipa_target_cols = all_content_cols
if en_col_type:
en_ipa_target_cols.add(en_col_type)
de_ipa_target_cols = all_content_cols - en_ipa_target_cols
if ipa_target_cols:
# --- English IPA (Britfone + eng_to_ipa) ---
if en_ipa_target_cols:
for cell in all_cells:
ct = cell.get("col_type")
if ct in ipa_target_cols:
if ct in en_ipa_target_cols:
cell["_orig_col_type"] = ct
# Full IPA processing (incl. insertion) only for the
# detected English column; other columns get light
# processing (bracket replacement only) — our IPA
# dictionary is English-only, so inserting IPA into
# German text would corrupt it.
if ct == en_col_type:
cell["col_type"] = "column_en"
else:
cell["col_type"] = "column_text"
# Snapshot text before IPA fix to detect which cells were modified
cell["col_type"] = "column_en"
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
fix_cell_phonetics(all_cells, pronunciation="british")
for cell in all_cells:
orig = cell.pop("_orig_col_type", None)
if orig:
cell["col_type"] = orig
# Mark cells whose text was changed by IPA correction so that
# later steps (5i) don't overwrite the corrected text when
# reconstructing from word_boxes.
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
cell["_ipa_corrected"] = True
# --- German IPA (wiki-pronunciation-dict + epitran) ---
if de_ipa_target_cols:
from cv_ipa_german import insert_german_ipa
insert_german_ipa(all_cells, de_ipa_target_cols)
ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
# Mark cells whose text was changed by IPA correction so that
# later steps (5i) don't overwrite the corrected text when
# reconstructing from word_boxes. (Already done inline above
# for English; insert_german_ipa sets _ipa_corrected too.)
for cell in all_cells:
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
cell["_ipa_corrected"] = True
@@ -1593,7 +1600,7 @@ async def _build_grid_core(
@router.post("/sessions/{session_id}/build-grid")
async def build_grid(
session_id: str,
ipa_mode: str = Query("auto", pattern="^(auto|all|en|none)$"),
ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
):
"""Build a structured, zone-aware grid from existing Kombi word results.