From ad78e261439dbb3b466e3a93ad2e58d205895990 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 12 Apr 2026 09:13:02 +0200 Subject: [PATCH] Fix word-split: handle IPA brackets, contractions, and tiebreaker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Strip IPA brackets [ipa] before attempting word split, so "makeadecision[dɪsˈɪʒən]" is processed as "makeadecision" 2. Handle contractions: "solet's" → split "solet" → "so let" + "'s" 3. DP tiebreaker: prefer longer first word when scores are equal ("task is" over "ta skis") Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/grid_editor_api.py | 23 ++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 02f105a..844e179 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1752,8 +1752,27 @@ async def _build_grid_core( changed = False for token in text.split(): # Try splitting pure-alpha tokens >= 4 chars - clean = token.rstrip(".,!?;:'\")") - suffix = token[len(clean):] + # Strip trailing punctuation AND IPA brackets + clean = token + # Remove trailing IPA like [dɪsˈɪʒən] first + bracket_pos = clean.find('[') + suffix_ipa = "" + if bracket_pos > 0: + suffix_ipa = clean[bracket_pos:] + clean = clean[:bracket_pos] + suffix_punct = "" + stripped = clean.rstrip(".,!?;:'\")") + if stripped != clean: + suffix_punct = clean[len(stripped):] + clean = stripped + suffix = suffix_punct + suffix_ipa + # Handle contractions: "solet's" → try "solet" + "'s" + contraction = "" + if "'" in clean and clean.index("'") >= 2: + apos_pos = clean.index("'") + contraction = clean[apos_pos:] + clean = clean[:apos_pos] + suffix = contraction + suffix if len(clean) >= 4 and clean.isalpha(): split = _try_split_merged_word(clean) if split: