feat: Full-Row OCR mit Spacing fuer Box-Sub-Sessions

Sub-Sessions ueberspringen die Spaltenerkennung und nutzen stattdessen eine Pseudo-Spalte ueber die volle Breite. Text wird mit proportionalem Spacing aus Wort-Positionen rekonstruiert, um das raeumliche Layout zu erhalten.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 08:28:29 +01:00
parent 34adb437d0
commit 23b7840ea7
4 changed files with 91 additions and 1 deletions
@@ -45,6 +45,7 @@ export function StepColumnDetection({ sessionId, onNext, onBoxSessionsCreated }:
  const [savedGtColumns, setSavedGtColumns] = useState<PageRegion[] | null>(null)
  const [creatingBoxSessions, setCreatingBoxSessions] = useState(false)
  const [existingSubSessions, setExistingSubSessions] = useState<SubSession[] | null>(null)
  const [isSubSession, setIsSubSession] = useState(false)
  // Fetch session info (image dimensions) + check for cached column result
  useEffect(() => {
@@ -58,14 +59,31 @@ export function StepColumnDetection({ sessionId, onNext, onBoxSessionsCreated }:
          if (info.image_width && info.image_height) {
            setImageDimensions({ width: info.image_width, height: info.image_height })
          }
          const isSub = !!info.parent_session_id
          setIsSubSession(isSub)
          if (info.sub_sessions && info.sub_sessions.length > 0) {
            setExistingSubSessions(info.sub_sessions)
            onBoxSessionsCreated?.(info.sub_sessions)
          }
          if (info.column_result) {
            setColumnResult(info.column_result)
            // Sub-session with pseudo-column already set → auto-advance
            if (isSub) {
              onNext()
              return
            }
            return
          }
          // Sub-session without columns → auto-detect (creates pseudo-column)
          if (isSub) {
            const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/columns`, { method: 'POST' })
            if (res.ok) {
              const data: ColumnResult = await res.json()
              setColumnResult(data)
              onNext()
              return
            }
          }
        }
      } catch (e) {
        console.error('Failed to fetch session info:', e)
@@ -24,6 +24,7 @@ from cv_ocr_engines import (
    _fix_phonetic_brackets,
    _split_comma_entries,
    _words_to_reading_order_text,
    _words_to_spaced_text,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
@@ -371,7 +372,16 @@ def build_cell_grid_v2(
                if words:
                    y_tol = max(15, row.height)
-                    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
+                    # Single full-width column (box sub-session): preserve spacing
                    is_single_full_column = (
                        len(relevant_cols) == 1
                        and img_w > 0
                        and relevant_cols[0].width / img_w > 0.9
                    )
                    if is_single_full_column:
                        text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
                    else:
                        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
@@ -124,6 +124,40 @@ def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) ->
    return '\n'.join(lines)
 def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words preserving proportional horizontal spacing.
    Instead of single spaces between words, inserts multiple spaces based on
    the pixel gap between words relative to average character width.
    Useful for box sub-sessions where spatial layout matters.
    """
    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    result_lines = []
    for line_words in lines:
        if not line_words:
            continue
        sorted_words = sorted(line_words, key=lambda w: w['left'])
        # Calculate average character width from all words in line
        total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
        total_width = sum(w['width'] for w in sorted_words if w.get('text'))
        avg_char_width = total_width / total_chars if total_chars > 0 else 10
        parts = []
        for i, word in enumerate(sorted_words):
            parts.append(word.get('text', ''))
            if i < len(sorted_words) - 1:
                next_word = sorted_words[i + 1]
                gap_px = next_word['left'] - (word['left'] + word['width'])
                num_spaces = max(1, round(gap_px / avg_char_width))
                parts.append(' ' * num_spaces)
        result_lines.append(''.join(parts))
    return '\n'.join(result_lines)
 # --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
 _rapid_engine = None
@@ -1209,6 +1209,34 @@ async def detect_columns(session_id: str):
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection")
    # Sub-sessions: skip column detection, create single pseudo-column
    session = await get_session_db(session_id)
    if session and session.get("parent_session_id"):
        h, w = img_bgr.shape[:2]
        column_result = {
            "columns": [{
                "type": "column_text",
                "x": 0, "y": 0,
                "width": w, "height": h,
            }],
            "zones": None,
            "boxes_detected": 0,
            "duration_seconds": 0,
            "method": "sub_session_pseudo_column",
        }
        await update_session_db(
            session_id,
            column_result=column_result,
            row_result=None,
            word_result=None,
            current_step=6,
        )
        cached["column_result"] = column_result
        cached.pop("row_result", None)
        cached.pop("word_result", None)
        logger.info(f"OCR Pipeline: sub-session {session_id}: pseudo-column {w}x{h}px")
        return {"session_id": session_id, **column_result}
    t0 = time.time()
    # Binarized image for layout analysis