fix: split PaddleOCR boxes at IPA brackets for overlay positioning

PaddleOCR returns "badge[bxd3]" without a space, but the IPA fixer produces "badge [bˈædʒ]" with a space, creating a token-count mismatch between cell.text and word_boxes. Tokens are now also split at "[" boundaries so that each IPA bracket gets its own sub-box.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 16:08:17 +01:00
parent 40ac593d28
commit 3e65b14b83
1 changed files with 7 additions and 1 deletions
@@ -15,6 +15,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """

 import logging
+import re
 import statistics
 from typing import Any, Dict, List, Tuple

@@ -185,10 +186,15 @@ def _build_cells(
        # PaddleOCR returns phrase-level boxes (e.g. "competition [kompa'tifn]"),
        # but the overlay slide mechanism expects one box per word. Split multi-word
        # boxes into individual word positions proportional to character length.
+        # Also split at "[" boundaries (IPA patterns like "badge[bxd3]").
        word_boxes = []
        for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
            raw_text = w.get('text', '').strip()
-            tokens = raw_text.split()
+            # Split by whitespace AND at "[" boundaries (IPA without space)
+            # e.g. "badge[bxd3]" → ["badge", "[bxd3]"]
+            # e.g. "profit['proft]" → ["profit", "['proft]"]
+            tokens = re.split(r'\s+|(?=\[)', raw_text)
+            tokens = [t for t in tokens if t]  # remove empty strings
            if len(tokens) <= 1:
                # Single word — keep as-is
                word_boxes.append({