Add language-specific IPA and syllable modes (de/en)

Extend ipa_mode and syllable_mode toggles with language options:

- auto: smart detection (default)
- en: only English headword column
- de: only German definition columns
- all: all content columns
- none: skip entirely

Also improve English column auto-detection: use garbled IPA patterns (apostrophes, colons) in addition to bracket patterns. This correctly identifies English dictionary pages where OCR produces garbled ASCII instead of bracket IPA.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-25 08:16:29 +01:00
parent 34680732f8
commit 83c058e400
4 changed files with 68 additions and 31 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -80,9 +80,11 @@ async def _build_grid_core(
        session_id: Session identifier (for logging and image loading).
        session: Full session dict from get_session_db().
        ipa_mode: "auto" (only when English headwords detected), "all"
-            (force IPA on all content columns), or "none" (skip IPA entirely).
+            (force IPA on all content columns), "en" (English column only),
+            "de" (German/definition columns only), or "none" (skip entirely).
        syllable_mode: "auto" (only when original has pipe dividers),
-            "all" (force syllabification on all words), or "none" (skip).
+            "all" (force syllabification on all words), "en" (English only),
+            "de" (German only), or "none" (skip).

    Returns:
        StructuredGrid result dict.
@@ -869,32 +871,51 @@ async def _build_grid_core(
    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    en_col_type = None
+    ipa_target_cols: set = set()
+    all_content_cols: set = set()
    skip_ipa = (ipa_mode == "none")
    if not skip_ipa and total_cols >= 3:
-        # Find the column that contains IPA brackets → English headwords.
-        # Count cells with bracket patterns per col_type.  The column with
-        # the most brackets is the headword column (IPA sits after or below
-        # headwords).
-        col_bracket_count: Dict[str, int] = {}
+        # Detect English headword column via IPA signals (brackets or garbled).
+        col_ipa_count: Dict[str, int] = {}
+        all_content_cols: set = set()
        for cell in all_cells:
            ct = cell.get("col_type", "")
+            if not ct.startswith("column_"):
+                continue
            txt = cell.get("text", "") or ""
-            if ct.startswith("column_") and '[' in txt:
-                col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
-        # Pick column with most bracket IPA patterns.
-        # ipa_mode="auto": only when OCR already found bracket IPA (no fallback).
-        # ipa_mode="all": fallback to headword_col_index from dictionary detection.
-        if col_bracket_count:
-            en_col_type = max(col_bracket_count, key=col_bracket_count.get)
-        elif ipa_mode == "all":
-            # Force IPA: use headword column from dictionary detection
-            hw_idx = dict_detection.get("headword_col_index")
-            if hw_idx is not None:
-                en_col_type = f"column_{hw_idx + 1}"
-        if en_col_type:
+            if txt.strip():
+                all_content_cols.add(ct)
+            if '[' in txt or _text_has_garbled_ipa(txt):
+                col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
+        if col_ipa_count:
+            en_col_type = max(col_ipa_count, key=col_ipa_count.get)
+        elif ipa_mode in ("all", "de", "en"):
+            # Force mode without auto-detection: pick column with most cells
+            col_cell_count: Dict[str, int] = {}
            for cell in all_cells:
-                if cell.get("col_type") == en_col_type:
-                    cell["_orig_col_type"] = en_col_type
+                ct = cell.get("col_type", "")
+                if ct.startswith("column_") and (cell.get("text") or "").strip():
+                    col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
+            if col_cell_count:
+                en_col_type = max(col_cell_count, key=col_cell_count.get)
+
+        # Decide which columns to process based on ipa_mode:
+        # auto/en: only the detected EN headword column
+        # de: all content columns EXCEPT the EN column
+        # all: all content columns
+        ipa_target_cols: set = set()
+        if ipa_mode in ("auto", "en"):
+            if en_col_type:
+                ipa_target_cols.add(en_col_type)
+        elif ipa_mode == "de":
+            ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
+        elif ipa_mode == "all":
+            ipa_target_cols = all_content_cols
+
+        if ipa_target_cols:
+            for cell in all_cells:
+                if cell.get("col_type") in ipa_target_cols:
+                    cell["_orig_col_type"] = cell["col_type"]
                    cell["col_type"] = "column_en"
        # Snapshot text before IPA fix to detect which cells were modified
        _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
@@ -1476,24 +1497,31 @@ async def _build_grid_core(

    # --- Syllable divider insertion for dictionary pages ---
    # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
-    #   "all" = force syllabification on all content words,
-    #   "none" = skip entirely.
+    #   "all" = force on all content words, "en" = English column only,
+    #   "de" = German columns only, "none" = skip entirely.
    syllable_insertions = 0
    if syllable_mode != "none" and img_bgr is not None:
        _syllable_eligible = False
-        if syllable_mode == "all":
+        if syllable_mode in ("all", "de", "en"):
            _syllable_eligible = True
        elif (dict_detection.get("is_dictionary")
                and dict_detection.get("article_col_index") is not None):
            # auto: only on dictionary pages with article columns
            _syllable_eligible = True
+        # For language-specific modes, determine allowed columns
+        _syllable_col_filter: Optional[set] = None  # None = all columns
+        if syllable_mode == "en" and en_col_type:
+            _syllable_col_filter = {en_col_type}
+        elif syllable_mode == "de" and en_col_type:
+            _syllable_col_filter = all_content_cols - {en_col_type} if total_cols >= 3 else None
        if _syllable_eligible:
            try:
                from cv_syllable_detect import insert_syllable_dividers
-                force_syllables = (syllable_mode == "all")
+                force_syllables = (syllable_mode in ("all", "de", "en"))
                syllable_insertions = insert_syllable_dividers(
                    zones_data, img_bgr, session_id,
                    force=force_syllables,
+                    col_filter=_syllable_col_filter,
                )
            except Exception as e:
                logger.warning("Syllable insertion failed: %s", e)
@@ -1538,7 +1566,7 @@ async def _build_grid_core(
        "processing_modes": {
            "ipa_mode": ipa_mode,
            "syllable_mode": syllable_mode,
-            "ipa_applied": en_col_type is not None and not skip_ipa,
+            "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
            "syllables_applied": syllable_insertions > 0,
        },
        "duration_seconds": round(duration, 2),
@@ -1554,8 +1582,8 @@ async def _build_grid_core(
@router.post("/sessions/{session_id}/build-grid")
 async def build_grid(
    session_id: str,
-    ipa_mode: str = Query("auto", pattern="^(auto|all|none)$"),
-    syllable_mode: str = Query("auto", pattern="^(auto|all|none)$"),
+    ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
+    syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
 ):
    """Build a structured, zone-aware grid from existing Kombi word results.