feat: Full-Row OCR mit Spacing fuer Box-Sub-Sessions

Sub-Sessions ueberspringen Spaltenerkennung und nutzen stattdessen eine Pseudo-Spalte ueber die volle Breite. Text wird mit proportionalem Spacing aus Wort-Positionen rekonstruiert, um raeumliches Layout zu erhalten.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 08:28:29 +01:00
parent 34adb437d0
commit 23b7840ea7
4 changed files with 91 additions and 1 deletions
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -124,6 +124,40 @@ def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) ->
    return '\n'.join(lines)


+def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
+    """Join OCR words into text, preserving proportional horizontal spacing.
+
+    Words are grouped into lines via ``_group_words_into_lines``; within each
+    line they are ordered by their ``left`` coordinate. Instead of a single
+    space between words, the pixel gap between neighbours is divided by the
+    line's average character width and that many spaces are inserted.
+    Useful for box sub-sessions where spatial layout matters.
+
+    Args:
+        words: OCR word dicts. Each is read for ``left``, ``width`` and
+            ``text``.
+        y_tolerance_px: Passed through to ``_group_words_into_lines``.
+
+    Returns:
+        The reconstructed lines joined with newlines. Empty line groups are
+        skipped.
+    """
+    # Fixed estimate used when a line yields no usable character width.
+    fallback_char_width = 10
+    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
+    result_lines = []
+
+    for line_words in lines:
+        if not line_words:
+            continue
+        sorted_words = sorted(line_words, key=lambda w: w['left'])
+
+        # Average character width over the words that actually carry text.
+        texted = [w for w in sorted_words if w.get('text')]
+        total_chars = sum(len(w['text']) for w in texted)
+        total_width = sum(w['width'] for w in texted)
+        avg_char_width = total_width / total_chars if total_chars > 0 else 0
+        if avg_char_width <= 0:
+            # Zero-width boxes would make the division below raise
+            # ZeroDivisionError; a negative sum would invert the gaps.
+            avg_char_width = fallback_char_width
+
+        parts = []
+        for i, word in enumerate(sorted_words):
+            # ``or ''`` also covers an explicit ``text: None`` entry, which
+            # would otherwise break the final join.
+            parts.append(word.get('text') or '')
+            if i < len(sorted_words) - 1:
+                next_word = sorted_words[i + 1]
+                gap_px = next_word['left'] - (word['left'] + word['width'])
+                # Always keep at least one space, even for overlapping boxes.
+                num_spaces = max(1, round(gap_px / avg_char_width))
+                parts.append(' ' * num_spaces)
+
+        result_lines.append(''.join(parts))
+
+    return '\n'.join(result_lines)
+
+
 # --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---

 _rapid_engine = None