From 3e65b14b83182ae290ceae096b7693bf3b80ffe7 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 12 Mar 2026 16:08:17 +0100 Subject: [PATCH] fix: split PaddleOCR boxes at IPA brackets for overlay positioning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PaddleOCR returns "badge[bxd3]" without space, but the IPA fixer produces "badge [bˈædʒ]" with space, creating a token count mismatch between cell.text and word_boxes. Now also split at "[" boundaries so each IPA bracket gets its own sub-box. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_words_first.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/klausur-service/backend/cv_words_first.py b/klausur-service/backend/cv_words_first.py index 2c78d45..307d2ba 100644 --- a/klausur-service/backend/cv_words_first.py +++ b/klausur-service/backend/cv_words_first.py @@ -15,6 +15,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging +import re import statistics from typing import Any, Dict, List, Tuple @@ -185,10 +186,15 @@ def _build_cells( # PaddleOCR returns phrase-level boxes (e.g. "competition [kompa'tifn]"), # but the overlay slide mechanism expects one box per word. Split multi-word # boxes into individual word positions proportional to character length. + # Also split at "[" boundaries (IPA patterns like "badge[bxd3]"). word_boxes = [] for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])): raw_text = w.get('text', '').strip() - tokens = raw_text.split() + # Split by whitespace AND at "[" boundaries (IPA without space) + # e.g. "badge[bxd3]" → ["badge", "[bxd3]"] + # e.g. "profit['proft]" → ["profit", "['proft]"] + tokens = re.split(r'\s+|(?=\[)', raw_text) + tokens = [t for t in tokens if t] # remove empty strings if len(tokens) <= 1: # Single word — keep as-is word_boxes.append({