From 5eff4cf877ae1741496fa952e8158b8fdc0af9a5 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Sat, 11 Apr 2026 22:01:25 +0200
Subject: [PATCH] Fix page refs deleted as artifacts + IPA spacing for DE mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Step 5j-pre wrongly classified "p.43", "p.50", etc. as artifacts
   (mixed digits+letters, <=5 chars). Added an exception for page
   reference patterns (p.XX, pXX, S.XX).

2. IPA spacing regex was too narrow (only matched Unicode IPA chars).
   Now matches any [bracket] content >=2 chars directly after a letter,
   fixing German IPA like "Opa[oːpa]" → "Opa [oːpa]".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index ddc962b..1e4f022 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1507,8 +1507,10 @@ async def _build_grid_core(
                 is_artifact = True
             elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
                 is_artifact = True
-            elif len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core):
+            elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
+                  and not re.match(r'^[pPsS]\.?\d+$', core)):
                 # Mixed digits + letters in short text (e.g. "7 EN", "a=3")
+                # but NOT page references like "p.43", "p50", "S.12"
                 is_artifact = True
             if is_artifact:
                 kept.append(None)  # placeholder
@@ -1717,8 +1719,10 @@ async def _build_grid_core(
     except ImportError:
         pass
 
-    # --- Ensure space before IPA brackets: "word[ipa]" → "word [ipa]" ---
-    _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]*[ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾ][^\]]*\])')
+    # --- Ensure space before IPA/phonetic brackets: "word[ipa]" → "word [ipa]" ---
+    # Matches any [bracket] holding >=2 characters directly after a letter.
+    # NOTE: this also spaces out plain annotations such as "[adj]" or "[noun]".
+    _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
     for z in zones_data:
         for cell in z.get("cells", []):
             text = cell.get("text", "")