Add German IPA support (wiki-pronunciation-dict + epitran)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m12s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m12s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
Hybrid approach mirroring English IPA:
- Primary: wiki-pronunciation-dict (636k entries, CC-BY-SA, Wiktionary)
- Fallback: epitran rule-based G2P (MIT license)

IPA modes now use language-appropriate dictionaries:
- auto/en: English IPA (Britfone + eng_to_ipa)
- de: German IPA (wiki-pronunciation-dict + epitran)
- all: EN column gets English IPA, other columns get German IPA
- none: disabled

Frontend shows CC-BY-SA attribution when German IPA is active.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -900,42 +900,49 @@ async def _build_grid_core(
|
||||
en_col_type = max(col_cell_count, key=col_cell_count.get)
|
||||
|
||||
# Decide which columns to process based on ipa_mode:
|
||||
# auto/en: only the detected EN headword column
|
||||
# de: all content columns EXCEPT the EN column
|
||||
# all: all content columns
|
||||
ipa_target_cols: set = set()
|
||||
# auto/en: only the detected EN headword column (English IPA)
|
||||
# de: all content columns EXCEPT the EN column (German IPA)
|
||||
# all: EN column gets English IPA, other columns get German IPA
|
||||
en_ipa_target_cols: set = set()
|
||||
de_ipa_target_cols: set = set()
|
||||
if ipa_mode in ("auto", "en"):
|
||||
if en_col_type:
|
||||
ipa_target_cols.add(en_col_type)
|
||||
en_ipa_target_cols.add(en_col_type)
|
||||
elif ipa_mode == "de":
|
||||
ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
|
||||
de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
|
||||
elif ipa_mode == "all":
|
||||
ipa_target_cols = all_content_cols
|
||||
if en_col_type:
|
||||
en_ipa_target_cols.add(en_col_type)
|
||||
de_ipa_target_cols = all_content_cols - en_ipa_target_cols
|
||||
|
||||
if ipa_target_cols:
|
||||
# --- English IPA (Britfone + eng_to_ipa) ---
|
||||
if en_ipa_target_cols:
|
||||
for cell in all_cells:
|
||||
ct = cell.get("col_type")
|
||||
if ct in ipa_target_cols:
|
||||
if ct in en_ipa_target_cols:
|
||||
cell["_orig_col_type"] = ct
|
||||
# Full IPA processing (incl. insertion) only for the
|
||||
# detected English column; other columns get light
|
||||
# processing (bracket replacement only) — our IPA
|
||||
# dictionary is English-only, so inserting IPA into
|
||||
# German text would corrupt it.
|
||||
if ct == en_col_type:
|
||||
cell["col_type"] = "column_en"
|
||||
else:
|
||||
cell["col_type"] = "column_text"
|
||||
# Snapshot text before IPA fix to detect which cells were modified
|
||||
cell["col_type"] = "column_en"
|
||||
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
||||
fix_cell_phonetics(all_cells, pronunciation="british")
|
||||
for cell in all_cells:
|
||||
orig = cell.pop("_orig_col_type", None)
|
||||
if orig:
|
||||
cell["col_type"] = orig
|
||||
# Mark cells whose text was changed by IPA correction so that
|
||||
# later steps (5i) don't overwrite the corrected text when
|
||||
# reconstructing from word_boxes.
|
||||
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
||||
cell["_ipa_corrected"] = True
|
||||
|
||||
# --- German IPA (wiki-pronunciation-dict + epitran) ---
|
||||
if de_ipa_target_cols:
|
||||
from cv_ipa_german import insert_german_ipa
|
||||
insert_german_ipa(all_cells, de_ipa_target_cols)
|
||||
|
||||
ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
|
||||
|
||||
# Mark cells whose text was changed by IPA correction so that
|
||||
# later steps (5i) don't overwrite the corrected text when
|
||||
# reconstructing from word_boxes. (Already done inline above
|
||||
# for English; insert_german_ipa sets _ipa_corrected too.)
|
||||
for cell in all_cells:
|
||||
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
||||
cell["_ipa_corrected"] = True
|
||||
|
||||
@@ -1593,7 +1600,7 @@ async def _build_grid_core(
|
||||
@router.post("/sessions/{session_id}/build-grid")
|
||||
async def build_grid(
|
||||
session_id: str,
|
||||
ipa_mode: str = Query("auto", pattern="^(auto|all|en|none)$"),
|
||||
ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
|
||||
syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
|
||||
):
|
||||
"""Build a structured, zone-aware grid from existing Kombi word results.
|
||||
|
||||
Reference in New Issue
Block a user