Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
134
klausur-service/backend/cv_ocr_word_assembly.py
Normal file
134
klausur-service/backend/cv_ocr_word_assembly.py
Normal file
@@ -0,0 +1,134 @@
|
||||
"""
|
||||
Word assembly helpers for OCR output.
|
||||
|
||||
Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys)
|
||||
into visual lines, rejoins hyphenated words, and produces reading-order
|
||||
text. All functions are pure standard-library; no NumPy or project
|
||||
imports required.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
|
||||
"""Group words by Y position into lines, sorted by X within each line."""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
|
||||
lines: List[List[Dict]] = []
|
||||
current_line: List[Dict] = [sorted_words[0]]
|
||||
current_y = sorted_words[0]['top']
|
||||
|
||||
for word in sorted_words[1:]:
|
||||
if abs(word['top'] - current_y) <= y_tolerance_px:
|
||||
current_line.append(word)
|
||||
else:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
lines.append(current_line)
|
||||
current_line = [word]
|
||||
current_y = word['top']
|
||||
|
||||
if current_line:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
lines.append(current_line)
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Group OCR words into visual lines in reading order.

    Returns one string per visual line in the cell; words within a line
    are separated by single spaces.
    """
    if not words:
        return []

    grouped = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    result = []
    for line in grouped:
        result.append(' '.join(word['text'] for word in line))
    return result
|
||||
|
||||
|
||||
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
|
||||
"""Rejoin words split by line-break hyphenation.
|
||||
|
||||
E.g. ['Fu\u00df-', 'boden'] \u2192 ['Fu\u00dfboden']
|
||||
['some text-', 'thing here'] \u2192 ['something here']
|
||||
"""
|
||||
if len(lines) <= 1:
|
||||
return lines
|
||||
|
||||
result = []
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
line = lines[i]
|
||||
# If line ends with '-' and there's a next line, rejoin
|
||||
if i + 1 < len(lines) and line.rstrip().endswith('-'):
|
||||
stripped = line.rstrip()
|
||||
# Get the word fragment before hyphen (last word)
|
||||
prefix = stripped[:-1] # remove trailing hyphen
|
||||
next_line = lines[i + 1]
|
||||
# Join: last word of this line + first word of next line
|
||||
prefix_words = prefix.rsplit(' ', 1)
|
||||
next_words = next_line.split(' ', 1)
|
||||
if len(prefix_words) > 1:
|
||||
joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
|
||||
else:
|
||||
joined = prefix_words[0] + next_words[0]
|
||||
remainder = next_words[1] if len(next_words) > 1 else ''
|
||||
if remainder:
|
||||
result.append(joined + ' ' + remainder)
|
||||
else:
|
||||
result.append(joined)
|
||||
i += 2
|
||||
else:
|
||||
result.append(line)
|
||||
i += 1
|
||||
return result
|
||||
|
||||
|
||||
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Render OCR words as newline-separated text in reading order.

    Words are clustered into visual lines by Y tolerance, each line is
    ordered left-to-right, hyphenated line breaks are rejoined, and the
    resulting lines are joined with newlines.
    """
    assembled = _rejoin_hyphenated(
        _words_to_reading_order_lines(words, y_tolerance_px)
    )
    return '\n'.join(assembled)
|
||||
|
||||
|
||||
def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words preserving proportional horizontal spacing.

    Instead of single spaces between words, inserts multiple spaces based
    on the pixel gap between adjacent words relative to the average
    character width of the line. Useful for box sub-sections where the
    spatial layout matters.

    Args:
        words: OCR word dicts with 'top', 'left', 'width' and 'text' keys.
        y_tolerance_px: vertical tolerance used to group words into lines.

    Returns:
        Newline-joined text with gap-proportional spacing inside each line.
    """
    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    result_lines = []

    for line_words in lines:
        if not line_words:
            continue
        sorted_words = sorted(line_words, key=lambda w: w['left'])

        # Average character width, computed over the words carrying text.
        total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
        total_width = sum(w['width'] for w in sorted_words if w.get('text'))
        avg_char_width = total_width / total_chars if total_chars > 0 else 10
        # Fix: zero-width OCR boxes (total_width == 0 with text present)
        # previously caused a ZeroDivisionError in the gap conversion below;
        # fall back to the same default as the no-text case.
        if avg_char_width <= 0:
            avg_char_width = 10

        parts = []
        for i, word in enumerate(sorted_words):
            parts.append(word.get('text', ''))
            if i < len(sorted_words) - 1:
                next_word = sorted_words[i + 1]
                gap_px = next_word['left'] - (word['left'] + word['width'])
                # At least one space, even for touching/overlapping boxes.
                num_spaces = max(1, round(gap_px / avg_char_width))
                parts.append(' ' * num_spaces)

        result_lines.append(''.join(parts))

    return '\n'.join(result_lines)
|
||||
Reference in New Issue
Block a user