feat: add Kombi-Modus (PaddleOCR + Tesseract) for OCR Overlay
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 35s
CI / test-go-edu-search (push) Successful in 33s
CI / test-python-klausur (push) Failing after 2m20s
CI / test-python-agent-core (push) Successful in 22s
CI / test-nodejs-website (push) Successful in 41s

Runs both OCR engines on the preprocessed image and merges results:
word boxes matched by IoU, coordinates averaged by confidence weight.
Unmatched Tesseract words (bullets, symbols) are added for better coverage.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-12 20:05:50 +01:00
parent d335a7bbf3
commit e9ccd1e35c
4 changed files with 279 additions and 26 deletions

View File

@@ -11,12 +11,12 @@ import { StepRowDetection } from '@/components/ocr-pipeline/StepRowDetection'
import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecognition'
import { OverlayReconstruction } from '@/components/ocr-overlay/OverlayReconstruction'
import { PaddleDirectStep } from '@/components/ocr-overlay/PaddleDirectStep'
import { OVERLAY_PIPELINE_STEPS, PADDLE_DIRECT_STEPS, DOCUMENT_CATEGORIES, dbStepToOverlayUi, type PipelineStep, type SessionListItem, type DocumentCategory } from './types'
import { OVERLAY_PIPELINE_STEPS, PADDLE_DIRECT_STEPS, KOMBI_STEPS, DOCUMENT_CATEGORIES, dbStepToOverlayUi, type PipelineStep, type SessionListItem, type DocumentCategory } from './types'
const KLAUSUR_API = '/klausur-api'
export default function OcrOverlayPage() {
const [mode, setMode] = useState<'pipeline' | 'paddle-direct'>('pipeline')
const [mode, setMode] = useState<'pipeline' | 'paddle-direct' | 'kombi'>('pipeline')
const [currentStep, setCurrentStep] = useState(0)
const [sessionId, setSessionId] = useState<string | null>(null)
const [sessionName, setSessionName] = useState<string>('')
@@ -63,13 +63,17 @@ export default function OcrOverlayPage() {
setSessionName(data.name || data.filename || '')
setActiveCategory(data.document_category || undefined)
// Check if this session was processed with paddle_direct
const isPaddleDirect = data.word_result?.ocr_engine === 'paddle_direct'
// Check if this session was processed with paddle_direct or kombi
const ocrEngine = data.word_result?.ocr_engine
const isPaddleDirect = ocrEngine === 'paddle_direct'
const isKombi = ocrEngine === 'kombi'
if (isPaddleDirect) {
setMode('paddle-direct')
if (isPaddleDirect || isKombi) {
const m = isKombi ? 'kombi' : 'paddle-direct'
const baseSteps = isKombi ? KOMBI_STEPS : PADDLE_DIRECT_STEPS
setMode(m)
setSteps(
PADDLE_DIRECT_STEPS.map((s, i) => ({
baseSteps.map((s, i) => ({
...s,
status: i < 4 ? 'completed' : i === 4 ? 'active' : 'pending',
})),
@@ -101,7 +105,7 @@ export default function OcrOverlayPage() {
if (sessionId === sid) {
setSessionId(null)
setCurrentStep(0)
const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
}
} catch (e) {
@@ -158,7 +162,7 @@ export default function OcrOverlayPage() {
const handleNext = () => {
if (currentStep >= steps.length - 1) {
// Last step completed — return to session list
const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
setCurrentStep(0)
setSessionId(null)
@@ -187,7 +191,7 @@ export default function OcrOverlayPage() {
setSessionId(null)
setSessionName('')
setCurrentStep(0)
const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
}
@@ -226,7 +230,7 @@ export default function OcrOverlayPage() {
}, [sessionId, goToStep])
const renderStep = () => {
if (mode === 'paddle-direct') {
if (mode === 'paddle-direct' || mode === 'kombi') {
switch (currentStep) {
case 0:
return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} />
@@ -237,7 +241,21 @@ export default function OcrOverlayPage() {
case 3:
return <StepCrop sessionId={sessionId} onNext={handleNext} />
case 4:
return <PaddleDirectStep sessionId={sessionId} onNext={handleNext} />
return mode === 'kombi' ? (
<PaddleDirectStep
sessionId={sessionId}
onNext={handleNext}
endpoint="paddle-kombi"
title="Kombi-Modus"
description="PaddleOCR und Tesseract laufen parallel. Koordinaten werden gewichtet gemittelt fuer optimale Positionierung."
icon="🔀"
buttonLabel="Paddle + Tesseract starten"
runningLabel="Paddle + Tesseract laufen..."
engineKey="kombi"
/>
) : (
<PaddleDirectStep sessionId={sessionId} onNext={handleNext} />
)
default:
return null
}
@@ -480,13 +498,29 @@ export default function OcrOverlayPage() {
>
Paddle Direct (5 Schritte)
</button>
<button
onClick={() => {
if (mode === 'kombi') return
setMode('kombi')
setCurrentStep(0)
setSessionId(null)
setSteps(KOMBI_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
}}
className={`px-3 py-1.5 text-xs font-medium rounded-md transition-colors ${
mode === 'kombi'
? 'bg-white dark:bg-gray-700 text-gray-700 dark:text-gray-200 shadow-sm'
: 'text-gray-500 dark:text-gray-400 hover:text-gray-700 dark:hover:text-gray-300'
}`}
>
Kombi (5 Schritte)
</button>
</div>
<PipelineStepper
steps={steps}
currentStep={currentStep}
onStepClick={handleStepClick}
onReprocess={mode === 'pipeline' && sessionId ? reprocessFromStep : undefined}
onReprocess={mode === 'pipeline' && sessionId != null ? reprocessFromStep : undefined}
/>
<div className="min-h-[400px]">{renderStep()}</div>

View File

@@ -60,6 +60,18 @@ export const PADDLE_DIRECT_STEPS: PipelineStep[] = [
{ id: 'paddle-direct', name: 'PaddleOCR + Overlay', icon: '⚡', status: 'pending' },
]
/**
 * 5-step pipeline for Kombi mode (PaddleOCR + Tesseract).
 * Same preprocessing, then both engines run and results are merged.
 *
 * The first four ids cover the shared preprocessing stages; the final
 * 'kombi' step triggers the backend `paddle-kombi` endpoint, which runs
 * both engines and merges their word boxes.
 */
export const KOMBI_STEPS: PipelineStep[] = [
  { id: 'orientation', name: 'Orientierung', icon: '🔄', status: 'pending' },
  { id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
  { id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },
  { id: 'crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' },
  { id: 'kombi', name: 'Paddle + Tesseract', icon: '🔀', status: 'pending' },
]
/** Map from DB step to overlay UI step index */
export function dbStepToOverlayUi(dbStep: number): number {
// DB: 1=start, 2=orient, 3=deskew, 4=dewarp, 5=crop, 6=columns, 7=rows, 8=words, 9=recon, 10=gt

View File

@@ -10,14 +10,38 @@ type Phase = 'idle' | 'running' | 'overlay'
interface PaddleDirectStepProps {
sessionId: string | null
onNext: () => void
/** Backend endpoint suffix, default: 'paddle-direct' */
endpoint?: string
/** Title shown in idle state */
title?: string
/** Description shown in idle state */
description?: string
/** Icon shown in idle state */
icon?: string
/** Button label */
buttonLabel?: string
/** Running label */
runningLabel?: string
/** OCR engine key to check for auto-detect */
engineKey?: string
}
export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) {
export function PaddleDirectStep({
sessionId,
onNext,
endpoint = 'paddle-direct',
title = 'Paddle Direct',
description = 'PaddleOCR erkennt alle Woerter direkt auf dem Originalbild — ohne Begradigung, Entzerrung oder Zuschnitt.',
icon = '⚡',
buttonLabel = 'PaddleOCR starten',
runningLabel = 'PaddleOCR laeuft...',
engineKey = 'paddle_direct',
}: PaddleDirectStepProps) {
const [phase, setPhase] = useState<Phase>('idle')
const [error, setError] = useState<string | null>(null)
const [stats, setStats] = useState<{ cells: number; rows: number; duration: number } | null>(null)
// Auto-detect: if session already has paddle_direct word_result → show overlay
// Auto-detect: if session already has matching word_result → show overlay
useEffect(() => {
if (!sessionId) return
let cancelled = false
@@ -26,7 +50,7 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
if (!res.ok || cancelled) return
const data = await res.json()
if (data.word_result?.ocr_engine === 'paddle_direct') {
if (data.word_result?.ocr_engine === engineKey) {
setPhase('overlay')
}
} catch {
@@ -34,14 +58,14 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) {
}
})()
return () => { cancelled = true }
}, [sessionId])
}, [sessionId, engineKey])
const runPaddleDirect = useCallback(async () => {
const runOcr = useCallback(async () => {
if (!sessionId) return
setPhase('running')
setError(null)
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/paddle-direct`, {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/${endpoint}`, {
method: 'POST',
})
if (!res.ok) {
@@ -59,7 +83,7 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) {
setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
setPhase('idle')
}
}, [sessionId])
}, [sessionId, endpoint])
if (!sessionId) {
return (
@@ -91,7 +115,7 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) {
<div className="w-10 h-10 border-4 border-teal-200 dark:border-teal-800 border-t-teal-600 dark:border-t-teal-400 rounded-full animate-spin" />
<div className="text-center space-y-1">
<p className="text-sm font-medium text-gray-700 dark:text-gray-300">
PaddleOCR laeuft...
{runningLabel}
</p>
<p className="text-xs text-gray-400">
Bild wird analysiert (ca. 5-30s)
@@ -101,12 +125,12 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) {
) : (
<>
<div className="text-center space-y-2">
<div className="text-4xl"></div>
<div className="text-4xl">{icon}</div>
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300">
Paddle Direct
{title}
</h3>
<p className="text-sm text-gray-500 dark:text-gray-400 max-w-md">
PaddleOCR erkennt alle Woerter direkt auf dem Originalbild ohne Begradigung, Entzerrung oder Zuschnitt.
{description}
</p>
</div>
@@ -117,10 +141,10 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) {
)}
<button
onClick={runPaddleDirect}
onClick={runOcr}
className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
>
PaddleOCR starten
{buttonLabel}
</button>
</>
)}

View File

@@ -2599,6 +2599,189 @@ async def paddle_direct(session_id: str):
return {"session_id": session_id, **word_result}
def _box_iou(a: dict, b: dict) -> float:
"""Compute IoU between two word boxes (each has left, top, width, height)."""
ax1, ay1 = a["left"], a["top"]
ax2, ay2 = ax1 + a["width"], ay1 + a["height"]
bx1, by1 = b["left"], b["top"]
bx2, by2 = bx1 + b["width"], by1 + b["height"]
ix1, iy1 = max(ax1, bx1), max(ay1, by1)
ix2, iy2 = min(ax2, bx2), min(ay2, by2)
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
if inter == 0:
return 0.0
area_a = (ax2 - ax1) * (ay2 - ay1)
area_b = (bx2 - bx1) * (by2 - by1)
return inter / (area_a + area_b - inter) if (area_a + area_b - inter) > 0 else 0.0
def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
    """Merge word boxes from PaddleOCR and Tesseract.

    Matching: IoU > 0.3 between bounding boxes.
    Merging: Weighted average of coordinates by confidence.
    Unmatched Tesseract words with conf >= 40 are appended so symbols
    Paddle missed (bullets etc.) still appear in the result.
    """
    result = []
    claimed: set = set()  # indices of tess_words already paired with a Paddle word

    for pw in paddle_words:
        # Greedy search: highest-IoU Tesseract box not yet claimed.
        best_idx = -1
        best_score = 0.0
        for idx, tw in enumerate(tess_words):
            if idx in claimed:
                continue
            score = _box_iou(pw, tw)
            if score > best_score:
                best_score = score
                best_idx = idx

        if best_score > 0.3 and best_idx >= 0:
            tw = tess_words[best_idx]
            claimed.add(best_idx)
            p_conf = pw.get("conf", 80)
            t_conf = tw.get("conf", 50)
            weight_sum = p_conf + t_conf or 1  # avoid division by zero

            def _weighted(key):
                return round((pw[key] * p_conf + tw[key] * t_conf) / weight_sum)

            result.append({
                "text": pw["text"],  # Paddle text usually better
                "left": _weighted("left"),
                "top": _weighted("top"),
                "width": _weighted("width"),
                "height": _weighted("height"),
                "conf": max(p_conf, t_conf),
            })
        else:
            # No sufficiently overlapping Tesseract box: keep Paddle word as-is.
            result.append(pw)

    # Add unmatched Tesseract words (bullet points, symbols, etc.)
    for idx, tw in enumerate(tess_words):
        if idx not in claimed and tw.get("conf", 0) >= 40:
            result.append(tw)
    return result
@router.post("/sessions/{session_id}/paddle-kombi")
async def paddle_kombi(session_id: str):
    """Run PaddleOCR + Tesseract on the preprocessed image and merge results.

    Both engines run on the same preprocessed (cropped/dewarped) image.
    Word boxes are matched by IoU and coordinates are averaged weighted by
    confidence. Unmatched Tesseract words (bullets, symbols) are added.

    Raises:
        HTTPException 404: no stored image exists for the session.
        HTTPException 400: image cannot be decoded, or both engines return
            zero words.
    """
    import asyncio

    # Prefer the most-processed image available: cropped → dewarped → original.
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")
    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode image")
    img_h, img_w = img_bgr.shape[:2]
    from cv_ocr_engines import ocr_region_paddle
    t0 = time.time()
    # --- PaddleOCR ---
    paddle_words = await ocr_region_paddle(img_bgr, region=None)
    if not paddle_words:
        paddle_words = []
    # --- Tesseract ---
    from PIL import Image
    import pytesseract
    pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    # image_to_data is CPU-bound and blocking; run it in a worker thread so
    # the event loop stays responsive while Tesseract works (can take seconds).
    data = await asyncio.to_thread(
        pytesseract.image_to_data,
        pil_img,
        lang="eng+deu",
        config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i in range(len(data["text"])):
        text = str(data["text"][i]).strip()
        # Tesseract reports conf as int or float depending on version; parse
        # tolerantly (e.g. "96.5") and treat unparseable values as -1 so they
        # fall below the threshold and are discarded.
        try:
            conf = int(float(data["conf"][i]))
        except (TypeError, ValueError):
            conf = -1
        if not text or conf < 20:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i],
            "top": data["top"][i],
            "width": data["width"][i],
            "height": data["height"][i],
            "conf": conf,
        })
    # --- Merge ---
    if not paddle_words and not tess_words:
        raise HTTPException(status_code=400, detail="Both OCR engines returned no words")
    merged_words = _merge_paddle_tesseract(paddle_words, tess_words)
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
    duration = time.time() - t0
    # Tag every cell so the frontend can auto-detect kombi sessions on reload.
    for cell in cells:
        cell["ocr_engine"] = "kombi"
    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    col_types = {c.get("type") for c in columns_meta}
    # Vocabulary layout when the grid contains EN/DE column types.
    is_vocab = bool(col_types & {"column_en", "column_de"})
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": "kombi",
        "grid_method": "kombi",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            "paddle_words": len(paddle_words),
            "tesseract_words": len(tess_words),
            "merged_words": len(merged_words),
        },
    }
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )
    logger.info(
        "paddle_kombi session %s: %d cells (%d rows, %d cols) in %.2fs "
        "[paddle=%d, tess=%d, merged=%d]",
        session_id, len(cells), n_rows, n_cols, duration,
        len(paddle_words), len(tess_words), len(merged_words),
    )
    await _append_pipeline_log(session_id, "paddle_kombi", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "paddle_words": len(paddle_words),
        "tesseract_words": len(tess_words),
        "merged_words": len(merged_words),
        "ocr_engine": "kombi",
    }, duration_ms=int(duration * 1000))
    return {"session_id": session_id, **word_result}
class WordGroundTruthRequest(BaseModel):
is_correct: bool
corrected_entries: Optional[List[Dict[str, Any]]] = None