feat(ocr-pipeline): 6 systematic improvements for robustness, performance & UX

1. Unit tests: 76 new parametrized tests for noise filter, phonetic detection, cell text cleaning, and row merging (116 total, all green)
2. Continuation-row merge: detect multi-line vocab entries where text wraps (lowercase EN + empty DE) and merge into previous entry
3. Empty DE fallback: secondary PSM=7 OCR pass for cells missed by PSM=6
4. Batch-OCR: collect empty cells per column, run single Tesseract call on column strip instead of per-cell (~66% fewer calls for 3+ empty cells)
5. StepReconstruction UI: font scaling via naturalHeight, empty EN/DE field highlighting, undo/redo (Ctrl+Z), per-cell reset button
6. Session reprocess: POST /sessions/{id}/reprocess endpoint to re-run from any step, with reprocess button on completed pipeline steps

Also fixes pre-existing dewarp_image tuple unpacking bug in run_cv_pipeline and updates dewarp tests to match current (image, info) return signature.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 14:46:38 +01:00
parent c3a924a620
commit e718353d9f
6 changed files with 775 additions and 79 deletions
@@ -160,6 +160,29 @@ export default function OcrPipelinePage() {
    8: 'Validierung',
  }

+  /**
+   * Ask the backend to reset the session to the given step and, on success,
+   * move the UI back to that step. Does nothing without a session or when the
+   * user declines the confirmation dialog. Failures are only logged to the
+   * console.
+   *
+   * NOTE(review): UI index 7 maps to from_step 8; confirm the endpoint accepts
+   * that value, otherwise the request fails and only a console error is shown.
+   */
+  const reprocessFromStep = useCallback(async (uiStep: number) => {
+    if (!sessionId) return
+
+    // The stepper hands over a 0-based index; the API takes a 1-based step number.
+    const dbStep = uiStep + 1
+    const question = `Ab Schritt ${dbStep} (${stepNames[dbStep] || '?'}) neu verarbeiten? Nachfolgende Daten werden geloescht.`
+    if (!confirm(question)) return
+
+    const endpoint = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/reprocess`
+    try {
+      const response = await fetch(endpoint, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ from_step: dbStep }),
+      })
+      if (response.ok) {
+        // Backend accepted the reset: show the step we restarted from.
+        goToStep(uiStep)
+        return
+      }
+      // The error body may not be JSON; fall back to the HTTP status.
+      const payload = await response.json().catch(() => ({}))
+      console.error('Reprocess failed:', payload.detail || response.status)
+    } catch (e) {
+      console.error('Reprocess error:', e)
+    }
+  // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [sessionId, goToStep])
+
  const renderStep = () => {
    switch (currentStep) {
      case 0:
@@ -291,7 +314,7 @@ export default function OcrPipelinePage() {
        </div>
      )}

-      <PipelineStepper steps={steps} currentStep={currentStep} onStepClick={handleStepClick} />
+      <PipelineStepper steps={steps} currentStep={currentStep} onStepClick={handleStepClick} onReprocess={sessionId ? reprocessFromStep : undefined} />

      <div className="min-h-[400px]">{renderStep()}</div>
    </div>
@@ -6,9 +6,10 @@ interface PipelineStepperProps {
  steps: PipelineStep[]
  currentStep: number
  onStepClick: (index: number) => void
+  onReprocess?: (index: number) => void
 }

-export function PipelineStepper({ steps, currentStep, onStepClick }: PipelineStepperProps) {
+export function PipelineStepper({ steps, currentStep, onStepClick, onReprocess }: PipelineStepperProps) {
  return (
    <div className="flex items-center justify-between px-4 py-3 bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700">
      {steps.map((step, index) => {
@@ -26,25 +27,37 @@ export function PipelineStepper({ steps, currentStep, onStepClick }: PipelineSte
                }`}
              />
            )}
-            <button
-              onClick={() => isClickable && onStepClick(index)}
-              disabled={!isClickable}
-              className={`flex items-center gap-1.5 px-3 py-1.5 rounded-full text-sm font-medium transition-all ${
-                isActive
-                  ? 'bg-teal-100 text-teal-700 dark:bg-teal-900/40 dark:text-teal-300 ring-2 ring-teal-400'
-                  : isCompleted
-                    ? 'bg-green-100 text-green-700 dark:bg-green-900/40 dark:text-green-300'
-                    : isFailed
-                      ? 'bg-red-100 text-red-700 dark:bg-red-900/40 dark:text-red-300'
-                      : 'text-gray-400 dark:text-gray-500'
-              } ${isClickable ? 'cursor-pointer hover:opacity-80' : 'cursor-default'}`}
-            >
-              <span className="text-base">
-                {isCompleted ? '✓' : isFailed ? '✗' : step.icon}
-              </span>
-              <span className="hidden sm:inline">{step.name}</span>
-              <span className="sm:hidden">{index + 1}</span>
-            </button>
+            <div className="relative group">
+              <button
+                onClick={() => isClickable && onStepClick(index)}
+                disabled={!isClickable}
+                className={`flex items-center gap-1.5 px-3 py-1.5 rounded-full text-sm font-medium transition-all ${
+                  isActive
+                    ? 'bg-teal-100 text-teal-700 dark:bg-teal-900/40 dark:text-teal-300 ring-2 ring-teal-400'
+                    : isCompleted
+                      ? 'bg-green-100 text-green-700 dark:bg-green-900/40 dark:text-green-300'
+                      : isFailed
+                        ? 'bg-red-100 text-red-700 dark:bg-red-900/40 dark:text-red-300'
+                        : 'text-gray-400 dark:text-gray-500'
+                } ${isClickable ? 'cursor-pointer hover:opacity-80' : 'cursor-default'}`}
+              >
+                <span className="text-base">
+                  {isCompleted ? '\u2713' : isFailed ? '\u2717' : step.icon}
+                </span>
+                <span className="hidden sm:inline">{step.name}</span>
+                <span className="sm:hidden">{index + 1}</span>
+              </button>
+              {/* Reprocess button — shown on completed steps on hover */}
+              {isCompleted && onReprocess && (
+                <button
+                  onClick={(e) => { e.stopPropagation(); onReprocess(index) }}
+                  className="absolute -top-1 -right-1 w-4 h-4 bg-orange-500 text-white rounded-full text-[9px] leading-none opacity-0 group-hover:opacity-100 transition-opacity flex items-center justify-center"
+                  title={`Ab hier neu verarbeiten`}
+                >
+                  &#x21BB;
+                </button>
+              )}
+            </div>
          </div>
        )
      })}
@@ -20,13 +20,23 @@ interface EditableCell {
  colIndex: number
 }

+/** One reversible cell edit: the affected cell id and its text before and after the change. */
+type UndoAction = { cellId: string; oldText: string; newText: string }
+
 export function StepReconstruction({ sessionId, onNext }: StepReconstructionProps) {
  const [status, setStatus] = useState<'loading' | 'ready' | 'saving' | 'saved' | 'error'>('loading')
  const [error, setError] = useState('')
  const [cells, setCells] = useState<EditableCell[]>([])
  const [editedTexts, setEditedTexts] = useState<Map<string, string>>(new Map())
  const [zoom, setZoom] = useState(100)
-  const [containerSize, setContainerSize] = useState<{ w: number; h: number } | null>(null)
+  const [imageNaturalH, setImageNaturalH] = useState(0)
+  const [showEmptyHighlight, setShowEmptyHighlight] = useState(true)
+
+  // Undo/Redo stacks
+  const [undoStack, setUndoStack] = useState<UndoAction[]>([])
+  const [redoStack, setRedoStack] = useState<UndoAction[]>([])
+
+  // All cells including empty ones (for empty field highlighting)
+  const [allCells, setAllCells] = useState<EditableCell[]>([])

  const containerRef = useRef<HTMLDivElement>(null)
  const imageRef = useRef<HTMLImageElement>(null)
@@ -38,16 +48,11 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
  // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [sessionId])

-  // Track container size for font scaling
-  useEffect(() => {
-    if (!containerRef.current) return
-    const observer = new ResizeObserver((entries) => {
-      for (const entry of entries) {
-        setContainerSize({ w: entry.contentRect.width, h: entry.contentRect.height })
-      }
-    })
-    observer.observe(containerRef.current)
-    return () => observer.disconnect()
+  // Image onLoad handler: store the image's intrinsic pixel height
+  // (naturalHeight), which getFontSize uses as the base for font scaling.
+  const handleImageLoad = useCallback(() => {
+    if (imageRef.current) {
+      setImageNaturalH(imageRef.current.naturalHeight)
+    }
  }, [])

  const loadSessionData = async () => {
@@ -67,19 +72,21 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp

      // Build editable cells from grid cells
      const gridCells: GridCell[] = wordResult.cells || []
-      const editableCells: EditableCell[] = gridCells
-        .filter(c => c.text.trim() !== '')
-        .map(c => ({
-          cellId: c.cell_id,
-          text: c.text,
-          originalText: c.text,
-          bboxPct: c.bbox_pct,
-          colType: c.col_type,
-          rowIndex: c.row_index,
-          colIndex: c.col_index,
-        }))
+      const allEditableCells: EditableCell[] = gridCells.map(c => ({
+        cellId: c.cell_id,
+        text: c.text,
+        originalText: c.text,
+        bboxPct: c.bbox_pct,
+        colType: c.col_type,
+        rowIndex: c.row_index,
+        colIndex: c.col_index,
+      }))

-      setCells(editableCells)
+      setAllCells(allEditableCells)
+      setCells(allEditableCells.filter(c => c.text.trim() !== ''))
+      setEditedTexts(new Map())
+      setUndoStack([])
+      setRedoStack([])
      setStatus('ready')
    } catch (e: unknown) {
      setError(e instanceof Error ? e.message : String(e))
@@ -89,12 +96,80 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp

  const handleTextChange = useCallback((cellId: string, newText: string) => {
+    // Apply a typed change to one cell and record it for undo.
+    // The "before" text is the pending edit if one exists, otherwise the
+    // text loaded for the cell.
+    const cell = cells.find(c => c.cellId === cellId)
+    const prevText = editedTexts.get(cellId) ?? cell?.text ?? ''
+
+    // Push to the undo stack and clear redo here, not inside the
+    // setEditedTexts updater: updater functions must be pure, and React may
+    // invoke them twice (Strict Mode), which would record the edit twice.
+    setUndoStack(stack => [...stack, { cellId, oldText: prevText, newText }])
+    setRedoStack([])
+
    setEditedTexts(prev => {
      const next = new Map(prev)
      next.set(cellId, newText)
      return next
    })
+  }, [cells, editedTexts])
+
+  // Revert the most recent edit: restore the cell's previous text and move
+  // the action onto the redo stack. No-op when there is nothing to undo.
+  const undo = useCallback(() => {
+    const action = undoStack[undoStack.length - 1]
+    if (!action) return
+
+    // All three updates are issued side by side. Nesting them inside a state
+    // updater would make that updater impure, and React may run updaters
+    // twice (Strict Mode), duplicating the redo entry.
+    setUndoStack(undoStack.slice(0, -1))
+    setRedoStack(rs => [...rs, action])
+    setEditedTexts(prev => {
+      const next = new Map(prev)
+      next.set(action.cellId, action.oldText)
+      return next
+    })
-  }, [])
+  }, [undoStack])

+  // Re-apply the most recently undone edit: set the cell to the edit's new
+  // text and move the action back onto the undo stack. No-op when the redo
+  // stack is empty.
+  const redo = useCallback(() => {
+    const action = redoStack[redoStack.length - 1]
+    if (!action) return
+
+    // Issued side by side rather than nested in a state updater: updaters
+    // must be pure, and React may run them twice (Strict Mode), which would
+    // duplicate the undo entry.
+    setRedoStack(redoStack.slice(0, -1))
+    setUndoStack(us => [...us, action])
+    setEditedTexts(prev => {
+      const next = new Map(prev)
+      next.set(action.cellId, action.newText)
+      return next
+    })
+  }, [redoStack])
+
+  // Drop the pending edit for one cell so it shows its loaded text again.
+  // Ids that do not belong to a displayed cell are ignored. The undo/redo
+  // stacks are not touched.
+  const resetCell = useCallback((cellId: string) => {
+    const isKnownCell = cells.some(c => c.cellId === cellId)
+    if (!isKnownCell) return
+    setEditedTexts(prev => {
+      const remaining = new Map(prev)
+      remaining.delete(cellId)
+      return remaining
+    })
+  }, [cells])
+
+  // Global keyboard shortcuts: Ctrl/Cmd+Z undoes, Ctrl/Cmd+Shift+Z redoes.
+  // The listener is re-registered whenever undo or redo changes identity.
+  useEffect(() => {
+    const handler = (e: KeyboardEvent) => {
+      // With Shift held the key is reported as uppercase 'Z', so both cases
+      // must be accepted; matching only 'z' would never trigger redo.
+      const isZ = e.key === 'z' || e.key === 'Z'
+      if ((e.metaKey || e.ctrlKey) && isZ) {
+        // Replace the browser's native undo with the cell-level history.
+        e.preventDefault()
+        if (e.shiftKey) {
+          redo()
+        } else {
+          undo()
+        }
+      }
+    }
+    document.addEventListener('keydown', handler)
+    return () => document.removeEventListener('keydown', handler)
+  }, [undo, redo])
+
  const getDisplayText = useCallback((cell: EditableCell): string => {
    return editedTexts.get(cell.cellId) ?? cell.text
  }, [editedTexts])
@@ -112,6 +187,18 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
    return count
  }, [cells, isEdited])

+  // Ids of cells in the EN or DE column whose loaded text is blank
+  // (whitespace only). Computed from allCells, so pending edits are not
+  // taken into account.
+  const emptyCellIds = useMemo(() => {
+    const requiredColTypes = ['column_en', 'column_de']
+    return new Set(
+      allCells
+        .filter(c => requiredColTypes.includes(c.colType) && !c.text.trim())
+        .map(c => c.cellId),
+    )
+  }, [allCells])
+
  // Sort cells for tab navigation: by row, then by column
  const sortedCellIds = useMemo(() => {
    return [...cells]
@@ -181,6 +268,13 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
    return colors[colType] || 'border-gray-400/40 focus:border-gray-500'
  }

+  // Font size in px for a cell whose bbox height is given as a percentage of
+  // the image height: 55% of the cell's pixel height at the image's natural
+  // size, multiplied by the zoom factor and limited to 8-18 px. Falls back
+  // to a height of 800 until the image's natural height is known.
+  const getFontSize = useCallback((bboxH: number): number => {
+    const referenceHeight = imageNaturalH || 800
+    const unscaled = (bboxH / 100) * referenceHeight * 0.55
+    const scaled = unscaled * (zoom / 100)
+    const capped = Math.min(18, scaled)
+    return Math.max(8, capped)
+  }, [imageNaturalH, zoom])
+
  if (!sessionId) {
    return <div className="text-center py-12 text-gray-400">Bitte zuerst eine Session auswaehlen.</div>
  }
@@ -197,7 +291,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
  if (status === 'error') {
    return (
      <div className="flex flex-col items-center justify-center py-12 text-center">
-        <div className="text-5xl mb-4">⚠️</div>
+        <div className="text-5xl mb-4">&#x26A0;&#xFE0F;</div>
        <h3 className="text-lg font-medium text-red-600 dark:text-red-400 mb-2">Fehler</h3>
        <p className="text-sm text-gray-500 dark:text-gray-400 max-w-lg mb-4">{error}</p>
        <div className="flex gap-3">
@@ -207,7 +301,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
          </button>
          <button onClick={onNext}
            className="px-5 py-2 bg-gray-200 dark:bg-gray-700 text-gray-700 dark:text-gray-300 rounded-lg hover:bg-gray-300 dark:hover:bg-gray-600 transition-colors text-sm">
-            Ueberspringen →
+            Ueberspringen &rarr;
          </button>
        </div>
      </div>
@@ -217,14 +311,14 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
  if (status === 'saved') {
    return (
      <div className="flex flex-col items-center justify-center py-12 text-center">
-        <div className="text-5xl mb-4">✅</div>
+        <div className="text-5xl mb-4">&#x2705;</div>
        <h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">Rekonstruktion gespeichert</h3>
        <p className="text-sm text-gray-500 dark:text-gray-400 mb-6">
          {changedCount > 0 ? `${changedCount} Zellen wurden aktualisiert.` : 'Keine Aenderungen vorgenommen.'}
        </p>
        <button onClick={onNext}
          className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium">
-          Weiter →
+          Weiter &rarr;
        </button>
      </div>
    )
@@ -239,16 +333,54 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
            Schritt 7: Rekonstruktion
          </h3>
          <span className="text-xs text-gray-400">
-            {cells.length} Zellen · {changedCount} geaendert
+            {cells.length} Zellen &middot; {changedCount} geaendert
+            {emptyCellIds.size > 0 && showEmptyHighlight && (
+              <span className="text-red-400 ml-1">&middot; {emptyCellIds.size} leer</span>
+            )}
          </span>
        </div>
        <div className="flex items-center gap-2">
+          {/* Undo/Redo */}
+          <button
+            onClick={undo}
+            disabled={undoStack.length === 0}
+            className="px-2 py-1 text-xs border border-gray-300 dark:border-gray-600 rounded hover:bg-gray-50 dark:hover:bg-gray-700 disabled:opacity-30"
+            title="Rueckgaengig (Ctrl+Z)"
+          >
+            &#x21A9;
+          </button>
+          <button
+            onClick={redo}
+            disabled={redoStack.length === 0}
+            className="px-2 py-1 text-xs border border-gray-300 dark:border-gray-600 rounded hover:bg-gray-50 dark:hover:bg-gray-700 disabled:opacity-30"
+            title="Wiederholen (Ctrl+Shift+Z)"
+          >
+            &#x21AA;
+          </button>
+
+          <div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" />
+
+          {/* Empty field toggle */}
+          <button
+            onClick={() => setShowEmptyHighlight(v => !v)}
+            className={`px-2 py-1 text-xs border rounded transition-colors ${
+              showEmptyHighlight
+                ? 'border-red-300 bg-red-50 text-red-600 dark:border-red-700 dark:bg-red-900/30 dark:text-red-400'
+                : 'border-gray-300 dark:border-gray-600 hover:bg-gray-50 dark:hover:bg-gray-700'
+            }`}
+            title="Leere Pflichtfelder markieren"
+          >
+            Leer
+          </button>
+
+          <div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" />
+
          {/* Zoom controls */}
          <button
            onClick={() => setZoom(z => Math.max(50, z - 25))}
            className="px-2 py-1 text-xs border border-gray-300 dark:border-gray-600 rounded hover:bg-gray-50 dark:hover:bg-gray-700"
          >
-            −
+            &minus;
          </button>
          <span className="text-xs text-gray-500 w-10 text-center">{zoom}%</span>
          <button
@@ -291,34 +423,63 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
            alt="Dewarped"
            className="block"
            style={{ opacity: 0.3 }}
+            onLoad={handleImageLoad}
          />

+          {/* Empty field markers */}
+          {showEmptyHighlight && allCells
+            .filter(c => emptyCellIds.has(c.cellId))
+            .map(cell => (
+              <div
+                key={`empty-${cell.cellId}`}
+                className="absolute border-2 border-dashed border-red-400/60 rounded pointer-events-none"
+                style={{
+                  left: `${cell.bboxPct.x}%`,
+                  top: `${cell.bboxPct.y}%`,
+                  width: `${cell.bboxPct.w}%`,
+                  height: `${cell.bboxPct.h}%`,
+                }}
+              />
+            ))}
+
          {/* Editable text fields at bbox positions */}
          {cells.map((cell) => {
            const displayText = getDisplayText(cell)
            const edited = isEdited(cell)

            return (
-              <input
-                key={cell.cellId}
-                id={`cell-${cell.cellId}`}
-                type="text"
-                value={displayText}
-                onChange={(e) => handleTextChange(cell.cellId, e.target.value)}
-                onKeyDown={(e) => handleKeyDown(e, cell.cellId)}
-                className={`absolute bg-transparent text-black dark:text-white border px-0.5 outline-none transition-colors ${
-                  colTypeColor(cell.colType)
-                } ${edited ? 'border-green-500 bg-green-50/30 dark:bg-green-900/20' : ''}`}
-                style={{
-                  left: `${cell.bboxPct.x}%`,
-                  top: `${cell.bboxPct.y}%`,
-                  width: `${cell.bboxPct.w}%`,
-                  height: `${cell.bboxPct.h}%`,
-                  fontSize: `${Math.max(8, Math.min(16, (cell.bboxPct.h / 100) * (containerSize?.h || 800) * 0.6))}px`,
-                  lineHeight: '1',
-                }}
-                title={`${cell.cellId} (${cell.colType})`}
-              />
+              <div key={cell.cellId} className="absolute group" style={{
+                left: `${cell.bboxPct.x}%`,
+                top: `${cell.bboxPct.y}%`,
+                width: `${cell.bboxPct.w}%`,
+                height: `${cell.bboxPct.h}%`,
+              }}>
+                <input
+                  id={`cell-${cell.cellId}`}
+                  type="text"
+                  value={displayText}
+                  onChange={(e) => handleTextChange(cell.cellId, e.target.value)}
+                  onKeyDown={(e) => handleKeyDown(e, cell.cellId)}
+                  className={`w-full h-full bg-transparent text-black dark:text-white border px-0.5 outline-none transition-colors ${
+                    colTypeColor(cell.colType)
+                  } ${edited ? 'border-green-500 bg-green-50/30 dark:bg-green-900/20' : ''}`}
+                  style={{
+                    fontSize: `${getFontSize(cell.bboxPct.h)}px`,
+                    lineHeight: '1',
+                  }}
+                  title={`${cell.cellId} (${cell.colType})`}
+                />
+                {/* Per-cell reset button (X) — only shown for edited cells on hover */}
+                {edited && (
+                  <button
+                    onClick={() => resetCell(cell.cellId)}
+                    className="absolute -top-1 -right-1 w-4 h-4 bg-red-500 text-white rounded-full text-[9px] leading-none opacity-0 group-hover:opacity-100 transition-opacity flex items-center justify-center"
+                    title="Zuruecksetzen"
+                  >
+                    &times;
+                  </button>
+                )}
+              </div>
            )
          })}
        </div>
@@ -336,7 +497,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
          }}
          className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium text-sm"
        >
-          {changedCount > 0 ? 'Speichern & Weiter →' : 'Weiter →'}
+          {changedCount > 0 ? 'Speichern & Weiter \u2192' : 'Weiter \u2192'}
        </button>
      </div>
    </div>
@@ -3503,6 +3503,21 @@ def _ocr_single_cell(
                )
                used_engine = 'cell_ocr_fallback'

+        # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
+        if not text.strip() and _run_fallback and not use_rapid:
+            cell_lang = lang_map.get(col.type, lang)
+            psm7_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7)
+            if psm7_words:
+                psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
+            if psm7_words:
+                p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
+                if p7_text.strip():
+                    text = p7_text
+                    avg_conf = round(
+                        sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
+                    )
+                    used_engine = 'cell_ocr_psm7'
+
    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
@@ -3628,6 +3643,79 @@ def build_cell_grid(
            )
            cells.append(cell)

+    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
+    # Collect cells that are still empty but have visible pixels.
+    # Instead of calling Tesseract once per cell (expensive), crop an entire
+    # column strip and run OCR once, then assign words to cells by Y position.
+    empty_by_col: Dict[int, List[int]] = {}  # col_idx → [cell list indices]
+    for ci, cell in enumerate(cells):
+        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
+            bpx = cell['bbox_px']
+            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
+            if w > 0 and h > 0 and ocr_img is not None:
+                crop = ocr_img[y:y + h, x:x + w]
+                if crop.size > 0:
+                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
+                    if dark_ratio > 0.005:
+                        empty_by_col.setdefault(cell['col_index'], []).append(ci)
+
+    for col_idx, cell_indices in empty_by_col.items():
+        if len(cell_indices) < 3:
+            continue  # Not worth batching for < 3 cells
+
+        # Find the column strip bounding box (union of all empty cell bboxes)
+        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
+        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
+        col_x = cells[cell_indices[0]]['bbox_px']['x']
+        col_w = cells[cell_indices[0]]['bbox_px']['w']
+
+        strip_region = PageRegion(
+            type=relevant_cols[col_idx].type,
+            x=col_x, y=min_y,
+            width=col_w, height=max_y_h - min_y,
+        )
+        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)
+
+        if use_rapid and img_bgr is not None:
+            strip_words = ocr_region_rapid(img_bgr, strip_region)
+        else:
+            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)
+
+        if not strip_words:
+            continue
+
+        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
+        if not strip_words:
+            continue
+
+        # Assign words to cells by Y overlap
+        for ci in cell_indices:
+            cell_y = cells[ci]['bbox_px']['y']
+            cell_h = cells[ci]['bbox_px']['h']
+            cell_mid_y = cell_y + cell_h / 2
+
+            matched_words = [
+                w for w in strip_words
+                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
+            ]
+            if matched_words:
+                matched_words.sort(key=lambda w: w['left'])
+                batch_text = ' '.join(w['text'] for w in matched_words)
+                batch_text = _clean_cell_text(batch_text)
+                if batch_text.strip():
+                    cells[ci]['text'] = batch_text
+                    cells[ci]['confidence'] = round(
+                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
+                    )
+                    cells[ci]['ocr_engine'] = 'batch_column_ocr'
+
+        batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
+        if batch_filled > 0:
+            logger.info(
+                f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
+                f"empty cells in column {col_idx}"
+            )
+
    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name}")
@@ -3869,6 +3957,69 @@ def _merge_phonetic_continuation_rows(
    return merged


+def _merge_continuation_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge multi-line vocabulary entries where text wraps to the next row.
+
+    A row is a continuation of the previous entry when:
+    - EN has text, but DE is empty
+    - EN starts with a lowercase letter (not a new vocab entry)
+    - Previous entry's EN does NOT end with a sentence terminator (.!?)
+    - The continuation text has fewer than 4 words (not an example sentence)
+    - The row was not already merged as phonetic
+
+    Example:
+      Row 5: EN="to put up"       DE="aufstellen"
+      Row 6: EN="with sth."       DE=""
+      → Merged: EN="to put up with sth."  DE="aufstellen"
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+
+        if merged and en and not de:
+            # Check: not phonetic (already handled)
+            if _is_phonetic_only_text(en):
+                merged.append(entry)
+                continue
+
+            # Check: starts with lowercase
+            first_alpha = next((c for c in en if c.isalpha()), '')
+            starts_lower = first_alpha and first_alpha.islower()
+
+            # Check: fewer than 4 words (not an example sentence)
+            word_count = len(en.split())
+            is_short = word_count < 4
+
+            # Check: previous entry doesn't end with sentence terminator
+            prev = merged[-1]
+            prev_en = (prev.get('english') or '').strip()
+            prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
+
+            if starts_lower and is_short and not prev_ends_sentence:
+                # Merge into previous entry
+                prev['english'] = (prev_en + ' ' + en).strip()
+                # Merge example if present
+                ex = (entry.get('example') or '').strip()
+                if ex:
+                    prev_ex = (prev.get('example') or '').strip()
+                    prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+                logger.debug(
+                    f"Merged continuation row {entry.get('row_index')} "
+                    f"into previous entry: {prev['english']!r}"
+                )
+                continue
+
+        merged.append(entry)
+
+    return merged
+
+
 def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
@@ -3920,9 +4071,12 @@ def build_word_grid(
    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)

-    # 0. Merge phonetic-only continuation rows into previous entry
+    # 0a. Merge phonetic-only continuation rows into previous entry
    entries = _merge_phonetic_continuation_rows(entries)

+    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
+    entries = _merge_continuation_rows(entries)
+
    # 1. Fix character confusion (I/1/l based on context)
    entries = _fix_character_confusion(entries)

@@ -4361,7 +4515,7 @@ async def run_cv_pipeline(
        # Stage 3: Dewarp
        if enable_dewarp:
            t = time.time()
-            img = dewarp_image(img)
+            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation
@@ -1623,6 +1623,69 @@ async def save_reconstruction(session_id: str, request: Request):
    }


+@router.post("/sessions/{session_id}/reprocess")
+async def reprocess_session(session_id: str, request: Request):
+    """Re-run pipeline from a specific step, clearing downstream data.
+
+    Body: {"from_step": 5}  (1-indexed step number)
+
+    Clears downstream results:
+    - from_step <= 1: deskew_result, dewarp_result, column_result, row_result, word_result
+    - from_step <= 2: dewarp_result, column_result, row_result, word_result
+    - from_step <= 3: column_result, row_result, word_result
+    - from_step <= 4: row_result, word_result
+    - from_step <= 5: word_result (cells, vocab_entries)
+    - from_step <= 6: word_result.llm_review only
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    body = await request.json()
+    from_step = body.get("from_step", 1)
+    if not isinstance(from_step, int) or from_step < 1 or from_step > 7:
+        raise HTTPException(status_code=400, detail="from_step must be between 1 and 7")
+
+    update_kwargs: Dict[str, Any] = {"current_step": from_step}
+
+    # Clear downstream data based on from_step
+    if from_step <= 5:
+        update_kwargs["word_result"] = None
+    elif from_step == 6:
+        # Only clear LLM review from word_result
+        word_result = session.get("word_result")
+        if word_result:
+            word_result.pop("llm_review", None)
+            word_result.pop("llm_corrections", None)
+            update_kwargs["word_result"] = word_result
+
+    if from_step <= 4:
+        update_kwargs["row_result"] = None
+    if from_step <= 3:
+        update_kwargs["column_result"] = None
+    if from_step <= 2:
+        update_kwargs["dewarp_result"] = None
+    if from_step <= 1:
+        update_kwargs["deskew_result"] = None
+
+    await update_session_db(session_id, **update_kwargs)
+
+    # Also clear cache
+    if session_id in _cache:
+        for key in list(update_kwargs.keys()):
+            if key != "current_step":
+                _cache[session_id][key] = update_kwargs[key]
+        _cache[session_id]["current_step"] = from_step
+
+    logger.info(f"Session {session_id} reprocessing from step {from_step}")
+
+    return {
+        "session_id": session_id,
+        "from_step": from_step,
+        "cleared": [k for k in update_kwargs if k != "current_step"],
+    }
+
+
 async def _get_rows_overlay(session_id: str) -> Response:
    """Generate dewarped image with row bands drawn on it."""
    session = await get_session_db(session_id)
@@ -9,6 +9,9 @@ Tests cover:
 - Stage 5: Layout analysis (content bounds, projection profiles, column detection)
 - Stage 6: Multi-pass OCR region handling
 - Stage 7: Line grouping and vocabulary matching
+- Noise filter functions (_is_noise_tail_token, _clean_cell_text)
+- Phonetic detection (_is_phonetic_only_text)
+- Phonetic & continuation row merging
 - Orchestrator (run_cv_pipeline)

 DSGVO Note: All tests run locally with synthetic data. No external API calls.
@@ -36,6 +39,11 @@ from cv_vocab_pipeline import (
    CV2_AVAILABLE,
    TESSERACT_AVAILABLE,
    CV_PIPELINE_AVAILABLE,
+    _is_noise_tail_token,
+    _clean_cell_text,
+    _is_phonetic_only_text,
+    _merge_phonetic_continuation_rows,
+    _merge_continuation_rows,
 )


@@ -202,16 +210,28 @@ class TestDeskew:

@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
 class TestDewarp:
-    """Test dewarp (pass-through) stage."""
+    """Test dewarp stage (returns (image, info) tuple)."""

-    def test_dewarp_passthrough(self, white_image):
-        """Current dewarp should return the same image (pass-through)."""
+    def test_dewarp_returns_tuple(self, white_image):
+        """dewarp_image must return (image, dewarp_info) tuple."""
        result = dewarp_image(white_image)
-        np.testing.assert_array_equal(result, white_image)
+        assert isinstance(result, tuple)
+        assert len(result) == 2
+        img_out, info = result
+        assert isinstance(img_out, np.ndarray)
+        assert isinstance(info, dict)
+        assert "shear_degrees" in info  # the info dict must expose the shear value under this key

    def test_dewarp_preserves_shape(self, text_like_image):
-        result = dewarp_image(text_like_image)
-        assert result.shape == text_like_image.shape
+        """Output image should have same shape as input."""
+        img_out, _ = dewarp_image(text_like_image)  # the info element of the tuple is not needed here
+        assert img_out.shape == text_like_image.shape
+
+    def test_dewarp_white_image_no_correction(self, white_image):
+        """A uniform white image should get no shear correction."""
+        img_out, info = dewarp_image(white_image)
+        assert abs(info["shear_degrees"]) < 0.5  # "no correction" is tested as |shear_degrees| below 0.5
+        assert img_out.shape == white_image.shape


 # =============================================
@@ -561,6 +581,268 @@ class TestStageIntegration:
        assert layout_img.shape[:2] == corrected.shape[:2]


+# =============================================
+# NOISE FILTER TESTS
+# =============================================
+
+class TestNoiseFilter:
+    """Test _is_noise_tail_token for trailing OCR noise detection."""
+
+    # --- Tokens that should be KEPT (return False) ---
+
+    @pytest.mark.parametrize("token", [
+        # Hyphenated compounds (including a dangling trailing hyphen)
+        "money-saver",
+        "under-",
+        "well-known",
+        # Words with parenthesized parts, including unbalanced fragments such as "selbst)" and "(wir"
+        "Schild(chen)",
+        "(Salat-)Gurke",
+        "(auf)",
+        "(on)",
+        "selbst)",
+        "(wir",
+        "Tanz(veranstaltung)",
+        "(zer)brechen",
+        # Phonetic brackets, complete or with only one bracket present
+        "serva]",
+        "['mani",
+        "[eg]",
+        "[maus]",
+        # Plain words, with or without a trailing period
+        "cupcakes.",
+        "sister.",
+        "mice",
+        # Abbreviations
+        "e.g.",
+        "sth.",
+        "usw.",
+        "adj.",
+        # Ellipsis (three ASCII dots and the single U+2026 character)
+        "...",
+        "\u2026",
+        # Short regular words (three letters each)
+        "the",
+        "cat",
+        "big",
+        "run",
+        "set",
+        "ago",
+    ])
+    def test_keep_real_tokens(self, token):
+        """Real words, dictionary punctuation, and phonetic brackets are kept."""
+        assert _is_noise_tail_token(token) is False, f"Should keep {token!r}"
+
+    # --- Tokens that should be FILTERED (return True) ---
+
+    @pytest.mark.parametrize("token", [
+        # Lone symbols, lone digits, and short letter+digit / letter+symbol mixes
+        "B|",
+        "3d",
+        "x7",
+        ")",
+        "|",
+        "@",
+        "3",
+        # Letter-only fragments of one or two characters that are not words
+        "ee",
+        "k",
+        "zz",
+        "qq",
+        # Empty or whitespace-only
+        "",
+        "  ",
+    ])
+    def test_filter_noise_tokens(self, token):
+        """OCR noise fragments are filtered."""
+        assert _is_noise_tail_token(token) is True, f"Should filter {token!r}"
+
+
+class TestCleanCellText:
+    """Test _clean_cell_text integration (full text → cleaned text)."""
+
+    def test_empty_returns_empty(self):
+        assert _clean_cell_text("") == ""
+        assert _clean_cell_text("   ") == ""  # whitespace-only input is expected to come back empty
+
+    def test_real_word_unchanged(self):
+        assert _clean_cell_text("cupcakes") == "cupcakes"  # a single clean word passes through untouched
+
+    def test_strips_trailing_noise(self):
+        """Trailing noise tokens should be removed."""
+        result = _clean_cell_text("cupcakes B|")  # "B|" is the noise tail to be dropped
+        assert result == "cupcakes"
+
+    def test_keeps_trailing_real_word(self):
+        """Trailing real words should be kept."""
+        result = _clean_cell_text("big cat")
+        assert result == "big cat"
+
+    def test_abbreviation_kept(self):
+        """Known abbreviations should not be cleared."""
+        result = _clean_cell_text("e.g.")
+        assert result == "e.g."
+
+    def test_pure_garbage_cleared(self):
+        """OCR garbage without real words should be cleared."""
+        result = _clean_cell_text("3d |x")  # no token in this input is a real word
+        assert result == ""
+
+    def test_compound_word_preserved(self):
+        """Compound words with hyphens should be preserved."""
+        result = _clean_cell_text("money-saver")
+        assert result == "money-saver"
+
+    def test_parenthesized_word_preserved(self):
+        result = _clean_cell_text("(Salat-)Gurke")  # the parenthesized prefix must survive cleaning
+        assert result == "(Salat-)Gurke"
+
+    def test_multiple_trailing_noise(self):
+        """Multiple trailing noise tokens should all be removed."""
+        result = _clean_cell_text("achieve 3 |")  # one real word followed by two noise tokens
+        assert result == "achieve"
+
+
+class TestPhoneticOnlyText:
+    """Test _is_phonetic_only_text for phonetic transcription detection."""
+
+    @pytest.mark.parametrize("text,expected", [
+        # Fully bracketed phonetic transcriptions → True
+        ("['mani serva]", True),
+        ("[dɑːns]", True),
+        ("[\"a:mand]", True),
+        ("['wɜːkʃɒp]", True),
+        # Fragment with only a closing bracket ("serva]", five letters) is expected NOT to count as phonetic-only
+        ("serva]", False),
+        # Plain words, the empty string, or a word followed by phonetics → False
+        ("almond ['a:mand]", False),
+        ("Mandel", False),
+        ("cupcakes", False),
+        ("", False),
+        ("achieve", False),
+        ("money-saver ['mani]", False),
+    ])
+    def test_phonetic_detection(self, text, expected):
+        assert _is_phonetic_only_text(text) is expected, \
+            f"_is_phonetic_only_text({text!r}) should be {expected}"
+
+
+class TestMergePhoneticContinuationRows:
+    """Test _merge_phonetic_continuation_rows for phonetic row merging."""
+
+    def test_empty_list(self):
+        assert _merge_phonetic_continuation_rows([]) == []  # empty input yields an empty list
+
+    def test_single_entry(self):
+        entries = [{"english": "cat", "german": "Katze", "example": ""}]
+        result = _merge_phonetic_continuation_rows(entries)
+        assert len(result) == 1
+        assert result[0]["english"] == "cat"  # a lone entry comes back with its English text unchanged
+
+    def test_merges_phonetic_row(self):
+        """Phonetic-only row should merge into previous entry."""
+        entries = [
+            {"english": "money-saver", "german": "Sparfuchs", "example": "", "row_index": 0},
+            {"english": "['mani serva]", "german": "", "example": "", "row_index": 1},
+        ]
+        result = _merge_phonetic_continuation_rows(entries)
+        assert len(result) == 1
+        assert result[0]["english"] == "money-saver ['mani serva]"  # phonetics appended after a single space
+        assert result[0]["german"] == "Sparfuchs"  # German text of the first row stays as it was
+
+    def test_no_merge_when_de_present(self):
+        """Row with DE text should NOT be merged even if EN looks phonetic."""
+        entries = [
+            {"english": "cat", "german": "Katze", "example": ""},
+            {"english": "[kæt]", "german": "some text", "example": ""},
+        ]
+        result = _merge_phonetic_continuation_rows(entries)
+        assert len(result) == 2  # both rows are still present
+
+    def test_no_merge_regular_rows(self):
+        """Normal vocab rows should not be merged."""
+        entries = [
+            {"english": "cat", "german": "Katze", "example": ""},
+            {"english": "dog", "german": "Hund", "example": ""},
+        ]
+        result = _merge_phonetic_continuation_rows(entries)
+        assert len(result) == 2  # both rows are still present
+
+    def test_merges_example_too(self):
+        """If phonetic row has example text, it should merge into previous."""
+        entries = [
+            {"english": "dance", "german": "tanzen", "example": "", "row_index": 0},
+            {"english": "[dɑːns]", "german": "", "example": "Let's dance.", "row_index": 1},
+        ]
+        result = _merge_phonetic_continuation_rows(entries)
+        assert len(result) == 1
+        assert result[0]["english"] == "dance [dɑːns]"
+        assert result[0]["example"] == "Let's dance."  # example text carried over from the phonetic row
+
+
+class TestMergeContinuationRows:
+    """Test _merge_continuation_rows for multi-line entry merging."""
+
+    def test_empty_list(self):
+        assert _merge_continuation_rows([]) == []  # empty input yields an empty list
+
+    def test_no_merge_independent_rows(self):
+        """Rows with both EN and DE should not be merged."""
+        entries = [
+            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
+            {"english": "dog", "german": "Hund", "example": "", "row_index": 1},
+        ]
+        result = _merge_continuation_rows(entries)
+        assert len(result) == 2  # both rows are still present
+
+    def test_merge_lowercase_continuation(self):
+        """Lowercase EN with empty DE should merge into previous."""
+        entries = [
+            {"english": "to put up", "german": "aufstellen", "example": "", "row_index": 0},
+            {"english": "with sth.", "german": "", "example": "", "row_index": 1},
+        ]
+        result = _merge_continuation_rows(entries)
+        assert len(result) == 1
+        assert result[0]["english"] == "to put up with sth."  # continuation joined after a single space
+        assert result[0]["german"] == "aufstellen"  # German text of the first row stays as it was
+
+    def test_no_merge_uppercase_start(self):
+        """EN starting with uppercase and empty DE is likely its own entry, not a continuation."""
+        entries = [
+            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
+            {"english": "Dog", "german": "", "example": "", "row_index": 1},
+        ]
+        result = _merge_continuation_rows(entries)
+        assert len(result) == 2  # both rows are still present
+
+    def test_no_merge_when_previous_ends_with_period(self):
+        """If previous entry ends with sentence terminator, next is not continuation."""
+        entries = [
+            {"english": "That's great.", "german": "Das ist toll.", "example": "", "row_index": 0},
+            {"english": "really nice", "german": "", "example": "", "row_index": 1},
+        ]
+        result = _merge_continuation_rows(entries)
+        assert len(result) == 2  # both rows are still present
+
+    def test_no_merge_long_text(self):
+        """Text with 4+ words is likely an example sentence, not continuation."""
+        entries = [
+            {"english": "achieve", "german": "erreichen", "example": "", "row_index": 0},
+            {"english": "she achieved her goals", "german": "", "example": "", "row_index": 1},  # exactly four words
+        ]
+        result = _merge_continuation_rows(entries)
+        assert len(result) == 2  # both rows are still present
+
+    def test_first_entry_not_merged(self):
+        """First entry with empty DE should not crash (no previous)."""
+        entries = [
+            {"english": "something", "german": "", "example": "", "row_index": 0},
+            {"english": "cat", "german": "Katze", "example": "", "row_index": 1},
+        ]
+        result = _merge_continuation_rows(entries)
+        assert len(result) == 2  # no crash, and both rows are still present
+
+
 # =============================================
 # RUN TESTS
 # =============================================