From 1f527fcd494c28140b4e0669a0eb39066182ac82 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 12 Mar 2026 17:46:17 +0100 Subject: [PATCH] fix: split PaddleOCR boxes at leading ! for overlay word positioning When PaddleOCR returns "!Betonung" as a single word box, the overlay positions text starting at the "!" instead of the actual word. Split such boxes into ["!", "Betonung"] with proportional position splitting, matching the existing IPA bracket splitting logic. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_words_first.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/klausur-service/backend/cv_words_first.py b/klausur-service/backend/cv_words_first.py index 307d2ba..e5dd9ed 100644 --- a/klausur-service/backend/cv_words_first.py +++ b/klausur-service/backend/cv_words_first.py @@ -190,10 +190,11 @@ def _build_cells( word_boxes = [] for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])): raw_text = w.get('text', '').strip() - # Split by whitespace AND at "[" boundaries (IPA without space) + # Split by whitespace, at "[" boundaries (IPA), and after leading "!" # e.g. "badge[bxd3]" → ["badge", "[bxd3]"] # e.g. "profit['proft]" → ["profit", "['proft]"] - tokens = re.split(r'\s+|(?=\[)', raw_text) + # e.g. "!Betonung" → ["!", "Betonung"] + tokens = re.split(r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text) tokens = [t for t in tokens if t] # remove empty strings if len(tokens) <= 1: # Single word — keep as-is