From 1f527fcd494c28140b4e0669a0eb39066182ac82 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Thu, 12 Mar 2026 17:46:17 +0100
Subject: [PATCH] fix: split PaddleOCR boxes at leading ! for overlay word
 positioning

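When PaddleOCR returns "!Betonung" as a single word box, the overlay
positions the text starting at the "!" instead of at the actual word.
Split such boxes into ["!", "Betonung"], dividing the box position
proportionally, in the same way as the existing IPA bracket splitting
logic. Note that the split pattern matches any "!" that is directly
followed by a Latin letter, not only a "!" at the start of the box.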

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_words_first.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/klausur-service/backend/cv_words_first.py b/klausur-service/backend/cv_words_first.py
index 307d2ba..e5dd9ed 100644
--- a/klausur-service/backend/cv_words_first.py
+++ b/klausur-service/backend/cv_words_first.py
@@ -190,10 +190,11 @@ def _build_cells(
         word_boxes = []
         for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
             raw_text = w.get('text', '').strip()
-            # Split by whitespace AND at "[" boundaries (IPA without space)
+            # Split by whitespace, at "[" boundaries (IPA), and after leading "!"
             # e.g. "badge[bxd3]" → ["badge", "[bxd3]"]
             # e.g. "profit['proft]" → ["profit", "['proft]"]
-            tokens = re.split(r'\s+|(?=\[)', raw_text)
+            # e.g. "!Betonung" → ["!", "Betonung"]
+            tokens = re.split(r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text)
             tokens = [t for t in tokens if t]  # remove empty strings
             if len(tokens) <= 1:
                 # Single word — keep as-is