From 40ac593d28b30f8b189b456e841fb49ef293a59d Mon Sep 17 00:00:00 2001
From: Benjamin Admin
 <benjaminadmin@bdc0df9b-4470-4f27-9760-8f96308c9820.fritz.box>
Date: Thu, 12 Mar 2026 16:00:06 +0100
Subject: [PATCH] fix: split PaddleOCR phrase boxes into per-word boxes for
 overlay slide

PaddleOCR returns phrase-level bounding boxes (e.g. "competition
[kompa'tifn]" as one box) but the overlay slide mechanism expects
one box per word for accurate positioning. Multi-word boxes are now
split horizontally into per-word boxes whose widths are proportional to
each word's character count, separated by small gaps (2% of the phrase
box width per gap).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_words_first.py | 46 ++++++++++++++++++-----
 1 file changed, 37 insertions(+), 9 deletions(-)

diff --git a/klausur-service/backend/cv_words_first.py b/klausur-service/backend/cv_words_first.py
index f1ca0bc..2c78d45 100644
--- a/klausur-service/backend/cv_words_first.py
+++ b/klausur-service/backend/cv_words_first.py
@@ -181,17 +181,45 @@ def _build_cells(
         confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
         avg_conf = sum(confs) / len(confs) if confs else 0.0
 
-        # Word boxes with absolute pixel coordinates (consistent with cv_cell_grid.py)
+        # Word boxes with absolute pixel coordinates (consistent with cv_cell_grid.py).
+        # PaddleOCR returns phrase-level boxes (e.g. "competition [kompa'tifn]"),
+        # but the overlay slide mechanism expects one box per word. Split multi-word
+        # boxes into individual word positions proportional to character length.
         word_boxes = []
         for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
-            word_boxes.append({
-                'text': w.get('text', ''),
-                'left': w['left'],
-                'top': w['top'],
-                'width': w['width'],
-                'height': w['height'],
-                'conf': w.get('conf', 0),
-            })
+            raw_text = w.get('text', '').strip()
+            tokens = raw_text.split()
+            if len(tokens) <= 1:
+                # Single token (or empty text) — keep the box geometry; text is stripped
+                word_boxes.append({
+                    'text': raw_text,
+                    'left': w['left'],
+                    'top': w['top'],
+                    'width': w['width'],
+                    'height': w['height'],
+                    'conf': w.get('conf', 0),
+                })
+            else:
+                # Multi-word phrase — split proportionally by character count
+                total_chars = sum(len(t) for t in tokens)
+                if total_chars == 0:
+                    continue
+                # Small gap between words (2% of box width per gap)
+                n_gaps = len(tokens) - 1
+                gap_px = w['width'] * 0.02
+                usable_w = w['width'] - gap_px * n_gaps
+                cursor = w['left']
+                for t in tokens:
+                    token_w = max(1, usable_w * len(t) / total_chars)
+                    word_boxes.append({
+                        'text': t,
+                        'left': round(cursor),
+                        'top': w['top'],
+                        'width': round(token_w),
+                        'height': w['height'],
+                        'conf': w.get('conf', 0),
+                    })
+                    cursor += token_w + gap_px
 
         cells.append({
             'cell_id': f"R{ri:02d}_C{ci}",