feat: Paddle Direct — 1-click OCR without deskew/dewarp/crop

New 2-step mode (Upload → PaddleOCR+Overlay) alongside the existing 7-step pipeline. The backend endpoint runs PaddleOCR on the original image and clusters words into rows/cells directly. The frontend adds a mode toggle and a PaddleDirectStep component.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 16:41:55 +01:00
parent 06d63d18f9
commit 90c1efd9b0
4 changed files with 403 additions and 16 deletions
@@ -10,11 +10,13 @@ import { StepCrop } from '@/components/ocr-pipeline/StepCrop'
 import { StepRowDetection } from '@/components/ocr-pipeline/StepRowDetection'
 import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecognition'
 import { OverlayReconstruction } from '@/components/ocr-overlay/OverlayReconstruction'
-import { OVERLAY_PIPELINE_STEPS, DOCUMENT_CATEGORIES, dbStepToOverlayUi, type PipelineStep, type SessionListItem, type DocumentCategory } from './types'
+import { PaddleDirectStep } from '@/components/ocr-overlay/PaddleDirectStep'
+import { OVERLAY_PIPELINE_STEPS, PADDLE_DIRECT_STEPS, DOCUMENT_CATEGORIES, dbStepToOverlayUi, type PipelineStep, type SessionListItem, type DocumentCategory } from './types'

 const KLAUSUR_API = '/klausur-api'

 export default function OcrOverlayPage() {
+  const [mode, setMode] = useState<'pipeline' | 'paddle-direct'>('pipeline')
  const [currentStep, setCurrentStep] = useState(0)
  const [sessionId, setSessionId] = useState<string | null>(null)
  const [sessionName, setSessionName] = useState<string>('')
@@ -61,17 +63,32 @@ export default function OcrOverlayPage() {
      setSessionName(data.name || data.filename || '')
      setActiveCategory(data.document_category || undefined)

-      // Map DB step to overlay UI step
-      const dbStep = data.current_step || 1
-      const uiStep = dbStepToOverlayUi(dbStep)
+      // Check if this session was processed with paddle_direct
+      const isPaddleDirect = data.word_result?.ocr_engine === 'paddle_direct'

-      setSteps(
-        OVERLAY_PIPELINE_STEPS.map((s, i) => ({
-          ...s,
-          status: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
-        })),
-      )
-      setCurrentStep(uiStep)
+      if (isPaddleDirect) {
+        setMode('paddle-direct')
+        setSteps(
+          PADDLE_DIRECT_STEPS.map((s, i) => ({
+            ...s,
+            status: i < 1 ? 'completed' : i === 1 ? 'active' : 'pending',
+          })),
+        )
+        setCurrentStep(1)
+      } else {
+        setMode('pipeline')
+        // Map DB step to overlay UI step
+        const dbStep = data.current_step || 1
+        const uiStep = dbStepToOverlayUi(dbStep)
+
+        setSteps(
+          OVERLAY_PIPELINE_STEPS.map((s, i) => ({
+            ...s,
+            status: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
+          })),
+        )
+        setCurrentStep(uiStep)
+      }
    } catch (e) {
      console.error('Failed to open session:', e)
    }
@@ -84,12 +101,13 @@ export default function OcrOverlayPage() {
      if (sessionId === sid) {
        setSessionId(null)
        setCurrentStep(0)
-        setSteps(OVERLAY_PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
+        const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
+        setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
      }
    } catch (e) {
      console.error('Failed to delete session:', e)
    }
-  }, [sessionId])
+  }, [sessionId, mode])

  const renameSession = useCallback(async (sid: string, newName: string) => {
    try {
@@ -140,7 +158,8 @@ export default function OcrOverlayPage() {
  const handleNext = () => {
    if (currentStep >= steps.length - 1) {
      // Last step completed — return to session list
-      setSteps(OVERLAY_PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
+      const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
+      setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
      setCurrentStep(0)
      setSessionId(null)
      loadSessions()
@@ -168,7 +187,8 @@ export default function OcrOverlayPage() {
    setSessionId(null)
    setSessionName('')
    setCurrentStep(0)
-    setSteps(OVERLAY_PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
+    const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
+    setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
  }

  const stepNames: Record<number, string> = {
@@ -206,6 +226,16 @@ export default function OcrOverlayPage() {
  }, [sessionId, goToStep])

  const renderStep = () => {
+    if (mode === 'paddle-direct') {
+      switch (currentStep) {
+        case 0:
+          return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} />
+        case 1:
+          return <PaddleDirectStep sessionId={sessionId} onNext={handleNext} />
+        default:
+          return null
+      }
+    }
    switch (currentStep) {
      case 0:
        return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} />
@@ -410,11 +440,47 @@ export default function OcrOverlayPage() {
        </div>
      )}

+      {/* Mode Toggle */}
+      <div className="flex items-center gap-1 bg-gray-100 dark:bg-gray-800 rounded-lg p-1 w-fit">
+        <button
+          onClick={() => {
+            if (mode === 'pipeline') return
+            setMode('pipeline')
+            setCurrentStep(0)
+            setSessionId(null)
+            setSteps(OVERLAY_PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
+          }}
+          className={`px-3 py-1.5 text-xs font-medium rounded-md transition-colors ${
+            mode === 'pipeline'
+              ? 'bg-white dark:bg-gray-700 text-gray-700 dark:text-gray-200 shadow-sm'
+              : 'text-gray-500 dark:text-gray-400 hover:text-gray-700 dark:hover:text-gray-300'
+          }`}
+        >
+          Pipeline (7 Schritte)
+        </button>
+        <button
+          onClick={() => {
+            if (mode === 'paddle-direct') return
+            setMode('paddle-direct')
+            setCurrentStep(0)
+            setSessionId(null)
+            setSteps(PADDLE_DIRECT_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
+          }}
+          className={`px-3 py-1.5 text-xs font-medium rounded-md transition-colors ${
+            mode === 'paddle-direct'
+              ? 'bg-white dark:bg-gray-700 text-gray-700 dark:text-gray-200 shadow-sm'
+              : 'text-gray-500 dark:text-gray-400 hover:text-gray-700 dark:hover:text-gray-300'
+          }`}
+        >
+          Paddle Direct (2 Schritte)
+        </button>
+      </div>
+
      <PipelineStepper
        steps={steps}
        currentStep={currentStep}
        onStepClick={handleStepClick}
-        onReprocess={sessionId ? reprocessFromStep : undefined}
+        onReprocess={mode === 'pipeline' && sessionId ? reprocessFromStep : undefined}
      />

      <div className="min-h-[400px]">{renderStep()}</div>
@@ -48,6 +48,15 @@ export const OVERLAY_UI_TO_DB: Record<number, number> = {
  6: 9,  // reconstruction
 }

+/**
+ * 2-step pipeline for Paddle Direct mode.
+ * Upload → PaddleOCR+Overlay (skips deskew/dewarp/crop/rows)
+ *
+ * Index 0 is the upload step (it reuses the 'orientation' step id) and
+ * index 1 is the combined OCR + overlay step. Every entry is declared as
+ * 'pending'; consumers are expected to copy the list and set the live
+ * status per step rather than mutate this constant.
+ */
+export const PADDLE_DIRECT_STEPS: PipelineStep[] = [
+  { id: 'orientation', name: 'Upload', icon: '📤', status: 'pending' },
+  { id: 'paddle-direct', name: 'PaddleOCR + Overlay', icon: '⚡', status: 'pending' },
+]
+
 /** Map from DB step to overlay UI step index */
 export function dbStepToOverlayUi(dbStep: number): number {
  // DB: 1=start, 2=orient, 3=deskew, 4=dewarp, 5=crop, 6=columns, 7=rows, 8=words, 9=recon, 10=gt
@@ -0,0 +1,129 @@
+'use client'
+
+import { useCallback, useEffect, useState } from 'react'
+import { OverlayReconstruction } from './OverlayReconstruction'
+
+const KLAUSUR_API = '/klausur-api'
+
+/** UI phase: waiting for the user, OCR request in flight, or overlay shown. */
+type Phase = 'idle' | 'running' | 'overlay'
+
+interface PaddleDirectStepProps {
+  /** Session to run OCR on; `null` while no session is selected. */
+  sessionId: string | null
+  /** Forwarded unchanged to OverlayReconstruction as its `onNext` callback. */
+  onNext: () => void
+}
+
+/**
+ * Second step of Paddle Direct mode: triggers the `paddle-direct` backend
+ * endpoint for the session and then renders OverlayReconstruction.
+ *
+ * If the session already carries a `paddle_direct` word result, the overlay
+ * is shown immediately without re-running OCR.
+ */
+export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) {
+  const [phase, setPhase] = useState<Phase>('idle')
+  const [error, setError] = useState<string | null>(null)
+  const [stats, setStats] = useState<{ cells: number; rows: number; duration: number } | null>(null)
+
+  // Auto-detect: if session already has paddle_direct word_result → show overlay
+  useEffect(() => {
+    // All local state belongs to one session. Reset it whenever the session
+    // changes so the previous session's overlay, stats or error is never
+    // displayed for the new one while this component stays mounted.
+    setPhase('idle')
+    setStats(null)
+    setError(null)
+
+    if (!sessionId) return
+    let cancelled = false
+    ;(async () => {
+      try {
+        const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
+        if (!res.ok || cancelled) return
+        const data = await res.json()
+        // The session may have changed while the body was being parsed.
+        if (cancelled) return
+        if (data.word_result?.ocr_engine === 'paddle_direct') {
+          // Only leave 'idle': never override a run the user started meanwhile.
+          setPhase((prev) => (prev === 'idle' ? 'overlay' : prev))
+        }
+      } catch (e: unknown) {
+        // Best effort: the user can still start OCR manually, so only log.
+        if (!cancelled) {
+          console.error('Failed to check session for Paddle Direct result:', e)
+        }
+      }
+    })()
+    return () => { cancelled = true }
+  }, [sessionId])
+
+  /** POSTs to the paddle-direct endpoint and switches to the overlay on success. */
+  const runPaddleDirect = useCallback(async () => {
+    if (!sessionId) return
+    setPhase('running')
+    setError(null)
+    try {
+      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/paddle-direct`, {
+        method: 'POST',
+      })
+      if (!res.ok) {
+        // Error bodies may not be JSON; fall back to the bare status code.
+        const data = await res.json().catch(() => ({}))
+        throw new Error(data.detail || `HTTP ${res.status}`)
+      }
+      const data = await res.json()
+      setStats({
+        cells: data.summary?.total_cells || 0,
+        rows: data.grid_shape?.rows || 0,
+        duration: data.duration_seconds || 0,
+      })
+      setPhase('overlay')
+    } catch (e: unknown) {
+      setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
+      setPhase('idle')
+    }
+  }, [sessionId])
+
+  if (!sessionId) {
+    return (
+      <div className="text-sm text-gray-400 py-8 text-center">
+        Bitte zuerst ein Bild hochladen.
+      </div>
+    )
+  }
+
+  if (phase === 'overlay') {
+    return (
+      <div className="space-y-3">
+        {/* Stats exist only when OCR ran in this mount, not for pre-existing results. */}
+        {stats && (
+          <div className="flex items-center gap-4 text-xs text-gray-500 dark:text-gray-400">
+            <span>{stats.cells} Woerter erkannt</span>
+            <span>{stats.rows} Zeilen</span>
+            <span>{stats.duration.toFixed(1)}s</span>
+          </div>
+        )}
+        <OverlayReconstruction sessionId={sessionId} onNext={onNext} />
+      </div>
+    )
+  }
+
+  return (
+    <div className="flex flex-col items-center justify-center py-16 space-y-6">
+      {phase === 'running' ? (
+        <>
+          <div className="w-10 h-10 border-4 border-teal-200 dark:border-teal-800 border-t-teal-600 dark:border-t-teal-400 rounded-full animate-spin" />
+          <div className="text-center space-y-1">
+            <p className="text-sm font-medium text-gray-700 dark:text-gray-300">
+              PaddleOCR laeuft...
+            </p>
+            <p className="text-xs text-gray-400">
+              Bild wird analysiert (ca. 5-30s)
+            </p>
+          </div>
+        </>
+      ) : (
+        <>
+          <div className="text-center space-y-2">
+            <div className="text-4xl">⚡</div>
+            <h3 className="text-lg font-medium text-gray-700 dark:text-gray-300">
+              Paddle Direct
+            </h3>
+            <p className="text-sm text-gray-500 dark:text-gray-400 max-w-md">
+              PaddleOCR erkennt alle Woerter direkt auf dem Originalbild — ohne Begradigung, Entzerrung oder Zuschnitt.
+            </p>
+          </div>
+
+          {error && (
+            <div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 px-4 py-2 rounded-lg">
+              {error}
+            </div>
+          )}
+
+          <button
+            type="button"
+            onClick={runPaddleDirect}
+            className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
+          >
+            PaddleOCR starten
+          </button>
+        </>
+      )}
+    </div>
+  )
+}
@@ -2509,6 +2509,189 @@ async def _word_stream_generator(
    yield f"data: {json.dumps(complete_event)}\n\n"


+@router.post("/sessions/{session_id}/paddle-direct")
+async def paddle_direct(session_id: str):
+    """Run PaddleOCR on the original image and build a word grid directly.
+
+    Skips deskew/dewarp/crop/rows — just Upload → PaddleOCR → Overlay.
+    The original image is stored as cropped_png so OverlayReconstruction
+    can display it as the background.
+    """
+    original_png = await get_session_image(session_id, "original")
+    if not original_png:
+        raise HTTPException(status_code=404, detail="No original image found for this session")
+
+    img_arr = np.frombuffer(original_png, dtype=np.uint8)
+    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
+    if img_bgr is None:
+        raise HTTPException(status_code=400, detail="Failed to decode original image")
+
+    img_h, img_w = img_bgr.shape[:2]
+
+    from cv_ocr_engines import ocr_region_paddle
+
+    t0 = time.time()
+    word_dicts = await ocr_region_paddle(img_bgr, region=None)
+    if not word_dicts:
+        raise HTTPException(status_code=400, detail="PaddleOCR returned no words")
+
+    cells, columns_meta = _paddle_words_to_grid_cells(word_dicts, img_w, img_h)
+    duration = time.time() - t0
+
+    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
+    n_cols = len(columns_meta)
+
+    word_result = {
+        "cells": cells,
+        "grid_shape": {
+            "rows": n_rows,
+            "cols": n_cols,
+            "total_cells": len(cells),
+        },
+        "columns_used": columns_meta,
+        "layout": "generic",
+        "image_width": img_w,
+        "image_height": img_h,
+        "duration_seconds": round(duration, 2),
+        "ocr_engine": "paddle_direct",
+        "grid_method": "paddle_direct",
+        "summary": {
+            "total_cells": len(cells),
+            "non_empty_cells": sum(1 for c in cells if c.get("text")),
+            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
+        },
+    }
+
+    # Store original image as cropped_png so OverlayReconstruction shows it
+    await update_session_db(
+        session_id,
+        word_result=word_result,
+        cropped_png=original_png,
+        current_step=8,
+    )
+
+    logger.info(
+        "paddle_direct session %s: %d cells (%d rows, %d cols) in %.2fs",
+        session_id, len(cells), n_rows, n_cols, duration,
+    )
+
+    await _append_pipeline_log(session_id, "paddle_direct", {
+        "total_cells": len(cells),
+        "non_empty_cells": word_result["summary"]["non_empty_cells"],
+        "ocr_engine": "paddle_direct",
+    }, duration_ms=int(duration * 1000))
+
+    return {"session_id": session_id, **word_result}
+
+
+def _paddle_words_to_grid_cells(
+    word_dicts: List[Dict[str, Any]],
+    img_w: int,
+    img_h: int,
+) -> tuple:
+    """Convert PaddleOCR word dicts into GridCell dicts + columns_meta.
+
+    Steps:
+    1. Sort words by (top, left).
+    2. Cluster into rows by Y-proximity: a word opens a new row when its
+       vertical center lies further than ``max(0.5 * median word height, 8)``
+       from the running mean center of the current row.
+    3. Within each row, sort left→right and assign col_index.
+    4. Each word → 1 GridCell with word_boxes and bbox_pct.
+
+    ``column_index`` is the word's position inside its own row, so the
+    entries of ``columns_meta`` are pseudo-columns that span the n-th word
+    of every row rather than detected layout columns.
+
+    Args:
+        word_dicts: Word dicts. ``top``, ``left``, ``width`` and ``height``
+            are read by key; ``text`` and ``confidence`` are optional.
+        img_w: Image width, the denominator for ``bbox_pct`` x and w.
+        img_h: Image height, the denominator for ``bbox_pct`` y and h and
+            the height assigned to every pseudo-column.
+
+    Returns:
+        ``(cells, columns_meta)``; ``([], [])`` when ``word_dicts`` is empty.
+        Intended to match the format of build_grid_from_words (that
+        function is not visible here — verify against it).
+
+    Raises:
+        KeyError: If a word dict lacks ``top``, ``left``, ``width`` or
+            ``height``.
+        ZeroDivisionError: If ``img_w`` or ``img_h`` is 0.
+    """
+    if not word_dicts:
+        return [], []
+
+    # Order words top-to-bottom, breaking ties left-to-right
+    sorted_words = sorted(word_dicts, key=lambda w: (w["top"], w["left"]))
+
+    # Row clustering threshold is derived from the median positive word height
+    heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0]
+    median_h = sorted(heights)[len(heights) // 2] if heights else 30  # upper median; 30 if no positive heights
+    row_threshold = max(median_h * 0.5, 8)  # never below 8
+
+    # Cluster into rows
+    rows: List[List[Dict]] = []
+    current_row: List[Dict] = []
+    current_y = -9999.0  # placeholder; not compared while current_row is empty
+
+    for w in sorted_words:
+        center_y = w["top"] + w["height"] / 2
+        if current_row and abs(center_y - current_y) > row_threshold:
+            rows.append(current_row)
+            current_row = []
+        current_row.append(w)
+        # Mean Y center of the row so far, recomputed after every appended word
+        current_y = sum(ww["top"] + ww["height"] / 2 for ww in current_row) / len(current_row)
+
+    if current_row:
+        rows.append(current_row)
+
+    # Sort each row left→right and emit one cell per word
+    cells: List[Dict[str, Any]] = []
+    max_col = 0
+
+    for row_idx, row_words in enumerate(rows):
+        row_words.sort(key=lambda w: w["left"])
+        for col_idx, w in enumerate(row_words):
+            left = w["left"]
+            top = w["top"]
+            width = w["width"]
+            height = w["height"]
+            conf = w.get("confidence", 0)
+            if isinstance(conf, float) and conf <= 1.0:  # floats <= 1.0 are treated as a 0-1 score
+                conf = conf * 100  # normalize to 0-100
+
+            cell = {
+                "cell_id": f"PD_R{row_idx:02d}_W{col_idx:02d}",
+                "x": left,
+                "y": top,
+                "width": width,
+                "height": height,
+                "text": w.get("text", ""),
+                "confidence": round(conf, 1),
+                "column_index": col_idx,
+                "row_index": row_idx,
+                "zone_index": 0,
+                "ocr_engine": "paddle_direct",
+                "word_boxes": [{
+                    "text": w.get("text", ""),
+                    "left": left,
+                    "top": top,
+                    "width": width,
+                    "height": height,
+                    "confidence": round(conf, 1),
+                }],
+                "bbox_pct": {
+                    "x": round(left / img_w * 100, 3),
+                    "y": round(top / img_h * 100, 3),
+                    "w": round(width / img_w * 100, 3),
+                    "h": round(height / img_h * 100, 3),
+                },
+            }
+            cells.append(cell)
+            if col_idx > max_col:
+                max_col = col_idx
+
+    # Build columns_meta — one pseudo-column per column index, spanning its cells horizontally
+    columns_meta = []
+    for ci in range(max_col + 1):
+        col_cells = [c for c in cells if c["column_index"] == ci]
+        if col_cells:
+            min_x = min(c["x"] for c in col_cells)
+            max_right = max(c["x"] + c["width"] for c in col_cells)
+            columns_meta.append({
+                "type": "column_text",
+                "x": min_x,
+                "y": 0,
+                "width": max_right - min_x,
+                "height": img_h,
+                "classification_confidence": 1.0,
+                "classification_method": "paddle_direct",
+            })
+
+    return cells, columns_meta
+
+
 class WordGroundTruthRequest(BaseModel):
    is_correct: bool
    corrected_entries: Optional[List[Dict[str, Any]]] = None