Fix IPA strip: match all square brackets, not just Unicode IPA

OCR text contains ASCII approximations of IPA such as [kompa'tifn] instead of Unicode IPA such as [kˈɒmpətɪʃən]. The strip regex required at least one Unicode IPA character inside the brackets, so it missed the ASCII variants. The regex now strips all [bracket] content from the excluded columns, since square brackets in vocab columns are always IPA.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 09:53:16 +02:00
parent 584e07eb21
commit a6c5f56003
1 changed file with 4 additions and 2 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1005,7 +1005,9 @@ async def _build_grid_core(
        # --- Strip IPA from columns NOT in the target set ---
        # When user selects "nur DE", English IPA from the OCR scan must
        # be removed.  When "none", all IPA is removed.
-        _IPA_BRACKET_STRIP_RE = re.compile(r'\s*\[[^\]]*[ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ][^\]]*\]')
+        # In vocab columns, square brackets [...] are always IPA (both
+        # Unicode like [ˈgrænˌdæd] and ASCII OCR like [kompa'tifn]).
+        _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
        strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
        if strip_en_ipa or ipa_mode == "none":
            strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
@@ -1015,7 +1017,7 @@ async def _build_grid_core(
                    continue
                text = cell.get("text", "")
                if "[" in text:
-                    stripped = _IPA_BRACKET_STRIP_RE.sub("", text)
+                    stripped = _SQUARE_BRACKET_RE.sub("", text)
                    if stripped != text:
                        cell["text"] = stripped.strip()
                        cell["_ipa_corrected"] = True