From a6c5f56003cef5591cb380187c336dda5f77ba45 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 12 Apr 2026 09:53:16 +0200 Subject: [PATCH] Fix IPA strip: match all square brackets, not just Unicode IPA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OCR text contains ASCII IPA approximations like [kompa'tifn] instead of Unicode [kˈɒmpətɪʃən]. The strip regex required Unicode IPA chars inside brackets and missed the ASCII ones. Now strips all [bracket] content from excluded columns since square brackets in vocab columns are always IPA. Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/grid_editor_api.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 308a2b6..6748d68 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1005,7 +1005,9 @@ async def _build_grid_core( # --- Strip IPA from columns NOT in the target set --- # When user selects "nur DE", English IPA from the OCR scan must # be removed. When "none", all IPA is removed. - _IPA_BRACKET_STRIP_RE = re.compile(r'\s*\[[^\]]*[ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ][^\]]*\]') + # In vocab columns, square brackets [...] are always IPA (both + # Unicode like [ˈgrænˌdæd] and ASCII OCR like [kompa'tifn]). + _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]') strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols if strip_en_ipa or ipa_mode == "none": strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols @@ -1015,7 +1017,7 @@ async def _build_grid_core( continue text = cell.get("text", "") if "[" in text: - stripped = _IPA_BRACKET_STRIP_RE.sub("", text) + stripped = _SQUARE_BRACKET_RE.sub("", text) if stripped != text: cell["text"] = stripped.strip() cell["_ipa_corrected"] = True