diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index ddc962b..1e4f022 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1507,8 +1507,10 @@ async def _build_grid_core( is_artifact = True elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS: is_artifact = True - elif len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core): + elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core) + and not re.match(r'^[pPsS]\.?\d+$', core)): # Mixed digits + letters in short text (e.g. "7 EN", "a=3") + # but NOT page references like "p.43", "p50", "S.12" is_artifact = True if is_artifact: kept.append(None) # placeholder @@ -1717,8 +1719,10 @@ async def _build_grid_core( except ImportError: pass - # --- Ensure space before IPA brackets: "word[ipa]" → "word [ipa]" --- - _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]*[ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾ][^\]]*\])') + # --- Ensure space before IPA/phonetic brackets: "word[ipa]" → "word [ipa]" --- + # Matches any [bracket] holding two or more characters directly after a letter. + # NOTE: the pattern alone also matches annotations such as "[adj]" or "[noun]". + _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])') for z in zones_data: for cell in z.get("cells", []): text = cell.get("text", "")