Add German IPA support (wiki-pronunciation-dict + epitran)

Hybrid approach mirroring English IPA:
- Primary: wiki-pronunciation-dict (636k entries, CC-BY-SA, Wiktionary)
- Fallback: epitran rule-based G2P (MIT license)

IPA modes now use language-appropriate dictionaries:
- auto/en: English IPA (Britfone + eng_to_ipa)
- de: German IPA (wiki-pronunciation-dict + epitran)
- all: EN column gets English IPA, other columns get German IPA
- none: disabled

Frontend shows CC-BY-SA attribution when German IPA is active.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-25 22:18:20 +01:00
parent a73ddce43d
commit f860eb66e6
7 changed files with 637123 additions and 34 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -900,42 +900,49 @@ async def _build_grid_core(
                en_col_type = max(col_cell_count, key=col_cell_count.get)

        # Decide which columns to process based on ipa_mode:
-        # auto/en: only the detected EN headword column
-        # de: all content columns EXCEPT the EN column
-        # all: all content columns
-        ipa_target_cols: set = set()
+        # auto/en: only the detected EN headword column (English IPA)
+        # de: all content columns EXCEPT the EN column (German IPA)
+        # all: EN column gets English IPA, other columns get German IPA
+        en_ipa_target_cols: set = set()
+        de_ipa_target_cols: set = set()
        if ipa_mode in ("auto", "en"):
            if en_col_type:
-                ipa_target_cols.add(en_col_type)
+                en_ipa_target_cols.add(en_col_type)
        elif ipa_mode == "de":
-            ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
+            de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
        elif ipa_mode == "all":
-            ipa_target_cols = all_content_cols
+            if en_col_type:
+                en_ipa_target_cols.add(en_col_type)
+            de_ipa_target_cols = all_content_cols - en_ipa_target_cols

-        if ipa_target_cols:
+        # --- English IPA (Britfone + eng_to_ipa) ---
+        if en_ipa_target_cols:
            for cell in all_cells:
                ct = cell.get("col_type")
-                if ct in ipa_target_cols:
+                if ct in en_ipa_target_cols:
                    cell["_orig_col_type"] = ct
-                    # Full IPA processing (incl. insertion) only for the
-                    # detected English column; other columns get light
-                    # processing (bracket replacement only) — our IPA
-                    # dictionary is English-only, so inserting IPA into
-                    # German text would corrupt it.
-                    if ct == en_col_type:
-                        cell["col_type"] = "column_en"
-                    else:
-                        cell["col_type"] = "column_text"
-        # Snapshot text before IPA fix to detect which cells were modified
+                    cell["col_type"] = "column_en"
        _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
        fix_cell_phonetics(all_cells, pronunciation="british")
        for cell in all_cells:
            orig = cell.pop("_orig_col_type", None)
            if orig:
                cell["col_type"] = orig
-            # Mark cells whose text was changed by IPA correction so that
-            # later steps (5i) don't overwrite the corrected text when
-            # reconstructing from word_boxes.
+            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
+                cell["_ipa_corrected"] = True
+
+        # --- German IPA (wiki-pronunciation-dict + epitran) ---
+        if de_ipa_target_cols:
+            from cv_ipa_german import insert_german_ipa
+            insert_german_ipa(all_cells, de_ipa_target_cols)
+
+        ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
+
+        # Mark cells whose text was changed by IPA correction so that
+        # later steps (5i) don't overwrite the corrected text when
+        # reconstructing from word_boxes.  (Already done inline above
+        # for English; insert_german_ipa sets _ipa_corrected too.)
+        for cell in all_cells:
            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
                cell["_ipa_corrected"] = True

@@ -1593,7 +1600,7 @@ async def _build_grid_core(
@router.post("/sessions/{session_id}/build-grid")
 async def build_grid(
    session_id: str,
-    ipa_mode: str = Query("auto", pattern="^(auto|all|en|none)$"),
+    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
 ):
    """Build a structured, zone-aware grid from existing Kombi word results.