feat: Full-Row OCR mit Spacing fuer Box-Sub-Sessions

Sub-Sessions ueberspringen Spaltenerkennung und nutzen stattdessen eine Pseudo-Spalte ueber die volle Breite. Text wird mit proportionalem Spacing aus Wort-Positionen rekonstruiert, um raeumliches Layout zu erhalten.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 08:28:29 +01:00
parent 34adb437d0
commit 23b7840ea7
4 changed files with 91 additions and 1 deletions
@@ -45,6 +45,7 @@ export function StepColumnDetection({ sessionId, onNext, onBoxSessionsCreated }:
  const [savedGtColumns, setSavedGtColumns] = useState<PageRegion[] | null>(null)
  const [creatingBoxSessions, setCreatingBoxSessions] = useState(false)
  const [existingSubSessions, setExistingSubSessions] = useState<SubSession[] | null>(null)
+  const [isSubSession, setIsSubSession] = useState(false)

  // Fetch session info (image dimensions) + check for cached column result
  useEffect(() => {
@@ -58,14 +59,31 @@ export function StepColumnDetection({ sessionId, onNext, onBoxSessionsCreated }:
          if (info.image_width && info.image_height) {
            setImageDimensions({ width: info.image_width, height: info.image_height })
          }
+          const isSub = !!info.parent_session_id
+          setIsSubSession(isSub)
          if (info.sub_sessions && info.sub_sessions.length > 0) {
            setExistingSubSessions(info.sub_sessions)
            onBoxSessionsCreated?.(info.sub_sessions)
          }
          if (info.column_result) {
            setColumnResult(info.column_result)
+            // Sub-session with pseudo-column already set → auto-advance
+            if (isSub) {
+              onNext()
+              return
+            }
            return
          }
+          // Sub-session without columns → auto-detect (creates pseudo-column)
+          if (isSub) {
+            const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/columns`, { method: 'POST' })
+            if (res.ok) {
+              const data: ColumnResult = await res.json()
+              setColumnResult(data)
+              onNext()
+              return
+            }
+          }
        }
      } catch (e) {
        console.error('Failed to fetch session info:', e)
@@ -24,6 +24,7 @@ from cv_ocr_engines import (
    _fix_phonetic_brackets,
    _split_comma_entries,
    _words_to_reading_order_text,
+    _words_to_spaced_text,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
@@ -371,7 +372,16 @@ def build_cell_grid_v2(

                if words:
                    y_tol = max(15, row.height)
-                    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
+                    # Single full-width column (box sub-session): preserve spacing
+                    is_single_full_column = (
+                        len(relevant_cols) == 1
+                        and img_w > 0
+                        and relevant_cols[0].width / img_w > 0.9
+                    )
+                    if is_single_full_column:
+                        text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
+                    else:
+                        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
@@ -124,6 +124,40 @@ def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) ->
    return '\n'.join(lines)


+def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
+    """Join OCR words preserving proportional horizontal spacing.
+
+    Instead of single spaces between words, inserts multiple spaces based on
+    the pixel gap between words relative to average character width.
+    Useful for box sub-sessions where spatial layout matters.
+    """
+    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
+    result_lines = []
+
+    for line_words in lines:
+        if not line_words:
+            continue
+        sorted_words = sorted(line_words, key=lambda w: w['left'])
+
+        # Calculate average character width from all words in line
+        total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
+        total_width = sum(w['width'] for w in sorted_words if w.get('text'))
+        avg_char_width = total_width / total_chars if total_chars > 0 else 10
+
+        parts = []
+        for i, word in enumerate(sorted_words):
+            parts.append(word.get('text', ''))
+            if i < len(sorted_words) - 1:
+                next_word = sorted_words[i + 1]
+                gap_px = next_word['left'] - (word['left'] + word['width'])
+                num_spaces = max(1, round(gap_px / avg_char_width))
+                parts.append(' ' * num_spaces)
+
+        result_lines.append(''.join(parts))
+
+    return '\n'.join(result_lines)
+
+
 # --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---

 _rapid_engine = None
@@ -1209,6 +1209,34 @@ async def detect_columns(session_id: str):
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection")

+    # Sub-sessions: skip column detection, create single pseudo-column
+    session = await get_session_db(session_id)
+    if session and session.get("parent_session_id"):
+        h, w = img_bgr.shape[:2]
+        column_result = {
+            "columns": [{
+                "type": "column_text",
+                "x": 0, "y": 0,
+                "width": w, "height": h,
+            }],
+            "zones": None,
+            "boxes_detected": 0,
+            "duration_seconds": 0,
+            "method": "sub_session_pseudo_column",
+        }
+        await update_session_db(
+            session_id,
+            column_result=column_result,
+            row_result=None,
+            word_result=None,
+            current_step=6,
+        )
+        cached["column_result"] = column_result
+        cached.pop("row_result", None)
+        cached.pop("word_result", None)
+        logger.info(f"OCR Pipeline: sub-session {session_id}: pseudo-column {w}x{h}px")
+        return {"session_id": session_id, **column_result}
+
    t0 = time.time()

    # Binarized image for layout analysis