feat(ocr-pipeline): add Step 5 word recognition (grid from columns × rows)

Backend: build_word_grid() intersects column regions with content rows,
OCRs each cell with language-specific Tesseract, and returns vocabulary
entries with percent-based bounding boxes. New endpoints: POST /words,
GET /image/words-overlay, ground-truth save/retrieve for words.
Frontend: StepWordRecognition with overview + step-through labeling modes,
goToStep callback for row correction feedback loop.
MkDocs: OCR Pipeline documentation added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-28 02:18:29 +01:00
parent 47dc2e6f7a
commit 954103cdf2
9 changed files with 1429 additions and 21 deletions

View File

@@ -112,6 +112,16 @@ export default function OcrPipelinePage() {
}
}
// Jump directly to an arbitrary step and recompute every step's status:
// everything before the target is completed, the target is active, the rest pending.
const goToStep = (step: number) => {
  setCurrentStep(step)
  setSteps((prev) =>
    prev.map((item, idx) => {
      const status = idx < step ? 'completed' : idx === step ? 'active' : 'pending'
      return { ...item, status }
    }),
  )
}
const handleNext = () => {
if (currentStep < steps.length - 1) {
setSteps((prev) =>
@@ -161,7 +171,7 @@ export default function OcrPipelinePage() {
case 3:
return <StepRowDetection sessionId={sessionId} onNext={handleNext} />
case 4:
return <StepWordRecognition />
return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} />
case 5:
return <StepCoordinates />
case 6:

View File

@@ -29,6 +29,7 @@ export interface SessionInfo {
dewarp_result?: DewarpResult
column_result?: ColumnResult
row_result?: RowResult
word_result?: WordResult
}
export interface DeskewResult {
@@ -116,6 +117,46 @@ export interface RowGroundTruth {
notes?: string
}
/**
 * Bounding box expressed in percent (0-100) of the dewarped image size,
 * so the frontend can position overlays independently of image resolution.
 */
export interface WordBbox {
  x: number // left edge, percent of image width
  y: number // top edge, percent of image height
  w: number // width, percent of image width
  h: number // height, percent of image height
}
/** One recognized vocabulary entry (one content row of the word grid). */
export interface WordEntry {
  row_index: number // index within the content rows (header/footer rows excluded)
  english: string // OCR text of the EN cell ('' when empty/not recognized)
  german: string // OCR text of the DE cell
  example: string // OCR text of the example cell
  confidence: number // OCR confidence in percent (0-100)
  bbox: WordBbox // full row bounds, percent-based
  bbox_en: WordBbox | null // EN cell bounds; null when the column is absent
  bbox_de: WordBbox | null // DE cell bounds; null when the column is absent
  bbox_ex: WordBbox | null // example cell bounds; null when the column is absent
  // Labeling state used by the frontend step-through editor only.
  status?: 'pending' | 'confirmed' | 'edited' | 'skipped'
}
/** Result payload of POST /sessions/{id}/words (Step 5 word recognition). */
export interface WordResult {
  entries: WordEntry[] // one entry per content row
  entry_count: number // equals entries.length, precomputed by the backend
  image_width: number // dewarped image width in pixels
  image_height: number // dewarped image height in pixels
  duration_seconds: number // backend processing time
  summary: {
    total_entries: number // all recognized entries
    with_english: number // entries with a non-empty EN cell
    with_german: number // entries with a non-empty DE cell
    low_confidence: number // entries below the backend confidence threshold
  }
}
/** Ground-truth feedback for Step 5, sent to /ground-truth/words. */
export interface WordGroundTruth {
  is_correct: boolean // true = result accepted as-is
  corrected_entries?: WordEntry[] // manual corrections; only sent when is_correct is false
  notes?: string // optional free-text reviewer note
}
export const PIPELINE_STEPS: PipelineStep[] = [
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },

View File

@@ -1,19 +1,602 @@
'use client'
export function StepWordRecognition() {
return (
<div className="flex flex-col items-center justify-center py-16 text-center">
<div className="text-5xl mb-4">🔤</div>
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">
Schritt 4: Worterkennung
</h3>
<p className="text-gray-500 dark:text-gray-400 max-w-md">
OCR mit Bounding Boxes fuer jedes erkannte Wort.
Dieser Schritt wird in einer zukuenftigen Version implementiert.
</p>
<div className="mt-6 px-4 py-2 bg-amber-100 dark:bg-amber-900/30 text-amber-700 dark:text-amber-400 rounded-full text-sm font-medium">
Kommt bald
import { useCallback, useEffect, useRef, useState } from 'react'
import type { WordResult, WordEntry, WordGroundTruth } from '@/app/(admin)/ai/ocr-pipeline/types'
const KLAUSUR_API = '/klausur-api'
interface StepWordRecognitionProps {
  /** Active pipeline session id; null until a session has been created. */
  sessionId: string | null
  /** Advance the stepper to the next pipeline step. */
  onNext: () => void
  /** Jump to an arbitrary step (0-based), e.g. back to row correction (step 3). */
  goToStep: (step: number) => void
}
/**
 * Step 5: word recognition (grid of columns × rows, OCR per cell).
 *
 * Two UI modes:
 *  - 'overview': grid-overlay image next to the clean dewarped image plus a
 *    table of all recognized entries.
 *  - 'labeling': step-through editor for one entry at a time, with per-cell
 *    image crops and keyboard shortcuts.
 *
 * Fixes over the previous revision:
 *  - handleGroundTruth now checks `res.ok`; before, a 4xx/5xx response still
 *    flipped the UI to "Ground Truth gespeichert" (only network failures hit
 *    the catch block).
 *  - The overlay image cache-buster is a piece of state bumped only when a new
 *    word result arrives; previously `Date.now()` was evaluated inside JSX on
 *    every render, so each keystroke in labeling mode refetched the image.
 */
export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRecognitionProps) {
  const [wordResult, setWordResult] = useState<WordResult | null>(null)
  const [detecting, setDetecting] = useState(false)
  const [error, setError] = useState<string | null>(null)
  const [gtNotes, setGtNotes] = useState('')
  const [gtSaved, setGtSaved] = useState(false)
  // Cache-buster for the overlay image; bumped only when a fresh result arrives.
  const [overlayStamp, setOverlayStamp] = useState(() => Date.now())
  // Step-through labeling state
  const [activeIndex, setActiveIndex] = useState(0)
  const [editedEntries, setEditedEntries] = useState<WordEntry[]>([])
  const [mode, setMode] = useState<'overview' | 'labeling'>('overview')
  const enRef = useRef<HTMLInputElement>(null)

  // On mount / session change: reuse a stored word result if the session has
  // one, otherwise kick off automatic detection.
  useEffect(() => {
    if (!sessionId) return
    const fetchSession = async () => {
      try {
        const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
        if (res.ok) {
          const info = await res.json()
          if (info.word_result) {
            setWordResult(info.word_result)
            initEntries(info.word_result.entries)
            setOverlayStamp(Date.now())
            return
          }
        }
      } catch (e) {
        console.error('Failed to fetch session info:', e)
      }
      runAutoDetection()
    }
    fetchSession()
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [sessionId])

  // Seed the editable entry list; entries without a status default to 'pending'.
  const initEntries = (entries: WordEntry[]) => {
    setEditedEntries(entries.map(e => ({ ...e, status: e.status || 'pending' })))
    setActiveIndex(0)
  }

  // Run (or re-run) word recognition on the backend.
  const runAutoDetection = useCallback(async () => {
    if (!sessionId) return
    setDetecting(true)
    setError(null)
    try {
      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words`, {
        method: 'POST',
      })
      if (!res.ok) {
        const err = await res.json().catch(() => ({ detail: res.statusText }))
        throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
      }
      const data: WordResult = await res.json()
      setWordResult(data)
      initEntries(data.entries)
      setOverlayStamp(Date.now())
    } catch (e) {
      setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
    } finally {
      setDetecting(false)
    }
  }, [sessionId])

  // Persist ground-truth feedback for this step.
  const handleGroundTruth = useCallback(async (isCorrect: boolean) => {
    if (!sessionId) return
    const gt: WordGroundTruth = {
      is_correct: isCorrect,
      corrected_entries: isCorrect ? undefined : editedEntries,
      notes: gtNotes || undefined,
    }
    try {
      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/ground-truth/words`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(gt),
      })
      // Only report success for a 2xx response (bug fix, see component docs).
      if (!res.ok) {
        throw new Error(`HTTP ${res.status}`)
      }
      setGtSaved(true)
    } catch (e) {
      console.error('Ground truth save failed:', e)
      setError('Ground Truth konnte nicht gespeichert werden')
    }
  }, [sessionId, gtNotes, editedEntries])

  // Step-through: update one text field of an entry; marks the entry 'edited'.
  const updateEntry = (index: number, field: 'english' | 'german' | 'example', value: string) => {
    setEditedEntries(prev => prev.map((e, i) =>
      i === index ? { ...e, [field]: value, status: 'edited' as const } : e
    ))
  }

  // Step-through: confirm current entry and advance (edited entries keep 'edited').
  const confirmEntry = () => {
    setEditedEntries(prev => prev.map((e, i) =>
      i === activeIndex ? { ...e, status: e.status === 'edited' ? 'edited' : 'confirmed' } : e
    ))
    if (activeIndex < editedEntries.length - 1) {
      setActiveIndex(activeIndex + 1)
    }
  }

  // Step-through: skip current entry and advance.
  const skipEntry = () => {
    setEditedEntries(prev => prev.map((e, i) =>
      i === activeIndex ? { ...e, status: 'skipped' as const } : e
    ))
    if (activeIndex < editedEntries.length - 1) {
      setActiveIndex(activeIndex + 1)
    }
  }

  // Focus english input when active entry changes in labeling mode
  useEffect(() => {
    if (mode === 'labeling' && enRef.current) {
      enRef.current.focus()
    }
  }, [activeIndex, mode])

  // Keyboard shortcuts in labeling mode (Enter fires even while an input is
  // focused — intentional, see the shortcuts hint rendered below).
  useEffect(() => {
    if (mode !== 'labeling') return
    const handler = (e: KeyboardEvent) => {
      if (e.key === 'Enter' && !e.shiftKey) {
        e.preventDefault()
        confirmEntry()
      } else if (e.key === 'Tab' && !e.shiftKey) {
        // Let Tab move between fields naturally unless on last field
      } else if (e.key === 'ArrowDown' && e.ctrlKey) {
        e.preventDefault()
        skipEntry()
      } else if (e.key === 'ArrowUp' && e.ctrlKey) {
        e.preventDefault()
        if (activeIndex > 0) setActiveIndex(activeIndex - 1)
      }
    }
    window.addEventListener('keydown', handler)
    return () => window.removeEventListener('keydown', handler)
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [mode, activeIndex, editedEntries])

  // Guard: this step needs a session produced by steps 1-4.
  if (!sessionId) {
    return (
      <div className="flex flex-col items-center justify-center py-16 text-center">
        <div className="text-5xl mb-4">🔤</div>
        <h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">
          Schritt 5: Worterkennung
        </h3>
        <p className="text-gray-500 dark:text-gray-400 max-w-md">
          Bitte zuerst Schritte 1-4 abschliessen.
        </p>
      </div>
    )
  }

  const overlayUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/words-overlay`
  const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped`

  // Traffic-light text color for a confidence percentage.
  const confColor = (conf: number) => {
    if (conf >= 70) return 'text-green-600 dark:text-green-400'
    if (conf >= 50) return 'text-yellow-600 dark:text-yellow-400'
    return 'text-red-600 dark:text-red-400'
  }

  // Badge classes for an entry status (falls back to 'pending').
  const statusBadge = (status?: string) => {
    const map: Record<string, string> = {
      pending: 'bg-gray-100 dark:bg-gray-700 text-gray-500',
      confirmed: 'bg-green-100 dark:bg-green-900/30 text-green-700 dark:text-green-400',
      edited: 'bg-blue-100 dark:bg-blue-900/30 text-blue-700 dark:text-blue-400',
      skipped: 'bg-orange-100 dark:bg-orange-900/30 text-orange-700 dark:text-orange-400',
    }
    return map[status || 'pending'] || map.pending
  }

  const summary = wordResult?.summary
  const confirmedCount = editedEntries.filter(e => e.status === 'confirmed' || e.status === 'edited').length
  const totalCount = editedEntries.length

  return (
    <div className="space-y-4">
      {/* Loading */}
      {detecting && (
        <div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
          <div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
          Worterkennung laeuft...
        </div>
      )}

      {/* Mode toggle */}
      {wordResult && (
        <div className="flex items-center gap-2">
          <button
            onClick={() => setMode('overview')}
            className={`px-3 py-1.5 text-xs rounded-lg font-medium transition-colors ${
              mode === 'overview'
                ? 'bg-teal-600 text-white'
                : 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-300 hover:bg-gray-200 dark:hover:bg-gray-600'
            }`}
          >
            Uebersicht
          </button>
          <button
            onClick={() => setMode('labeling')}
            className={`px-3 py-1.5 text-xs rounded-lg font-medium transition-colors ${
              mode === 'labeling'
                ? 'bg-teal-600 text-white'
                : 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-300 hover:bg-gray-200 dark:hover:bg-gray-600'
            }`}
          >
            Labeling ({confirmedCount}/{totalCount})
          </button>
        </div>
      )}

      {/* Overview mode: side-by-side images + entry list */}
      {mode === 'overview' && (
        <>
          {/* Images: overlay vs clean */}
          <div className="grid grid-cols-2 gap-4">
            <div>
              <div className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-1">
                Mit Grid-Overlay
              </div>
              <div className="border rounded-lg overflow-hidden dark:border-gray-700 bg-gray-50 dark:bg-gray-900">
                {wordResult ? (
                  // eslint-disable-next-line @next/next/no-img-element
                  <img
                    src={`${overlayUrl}?t=${overlayStamp}`}
                    alt="Wort-Overlay"
                    className="w-full h-auto"
                  />
                ) : (
                  <div className="aspect-[3/4] flex items-center justify-center text-gray-400 text-sm">
                    {detecting ? 'Erkenne Woerter...' : 'Keine Daten'}
                  </div>
                )}
              </div>
            </div>
            <div>
              <div className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-1">
                Entzerrtes Bild
              </div>
              <div className="border rounded-lg overflow-hidden dark:border-gray-700 bg-gray-50 dark:bg-gray-900">
                {/* eslint-disable-next-line @next/next/no-img-element */}
                <img
                  src={dewarpedUrl}
                  alt="Entzerrt"
                  className="w-full h-auto"
                />
              </div>
            </div>
          </div>

          {/* Result summary */}
          {wordResult && summary && (
            <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
              <div className="flex items-center justify-between">
                <h4 className="text-sm font-medium text-gray-700 dark:text-gray-300">
                  Ergebnis: {summary.total_entries} Eintraege erkannt
                </h4>
                <span className="text-xs text-gray-400">
                  {wordResult.duration_seconds}s
                </span>
              </div>

              {/* Summary badges */}
              <div className="flex gap-2 flex-wrap">
                <span className="px-2 py-0.5 rounded text-xs font-medium bg-blue-100 dark:bg-blue-900/30 text-blue-700 dark:text-blue-300">
                  EN: {summary.with_english}
                </span>
                <span className="px-2 py-0.5 rounded text-xs font-medium bg-green-100 dark:bg-green-900/30 text-green-700 dark:text-green-300">
                  DE: {summary.with_german}
                </span>
                {summary.low_confidence > 0 && (
                  <span className="px-2 py-0.5 rounded text-xs font-medium bg-red-100 dark:bg-red-900/30 text-red-700 dark:text-red-300">
                    Unsicher: {summary.low_confidence}
                  </span>
                )}
              </div>

              {/* Entry table; clicking a row jumps into labeling mode */}
              <div className="max-h-80 overflow-y-auto">
                <table className="w-full text-xs">
                  <thead className="sticky top-0 bg-white dark:bg-gray-800">
                    <tr className="text-left text-gray-500 dark:text-gray-400 border-b dark:border-gray-700">
                      <th className="py-1 pr-2 w-8">#</th>
                      <th className="py-1 pr-2">English</th>
                      <th className="py-1 pr-2">Deutsch</th>
                      <th className="py-1 pr-2">Example</th>
                      <th className="py-1 w-12 text-right">Conf</th>
                    </tr>
                  </thead>
                  <tbody>
                    {editedEntries.map((entry, idx) => (
                      <tr
                        key={idx}
                        className={`border-b dark:border-gray-700/50 ${
                          idx === activeIndex ? 'bg-teal-50 dark:bg-teal-900/20' : ''
                        }`}
                        onClick={() => { setActiveIndex(idx); setMode('labeling') }}
                      >
                        <td className="py-1 pr-2 text-gray-400">{idx + 1}</td>
                        <td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
                          {entry.english || <span className="text-gray-300 dark:text-gray-600"></span>}
                        </td>
                        <td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
                          {entry.german || <span className="text-gray-300 dark:text-gray-600"></span>}
                        </td>
                        <td className="py-1 pr-2 font-mono text-gray-500 dark:text-gray-400 cursor-pointer max-w-[200px] truncate">
                          {entry.example || <span className="text-gray-300 dark:text-gray-600"></span>}
                        </td>
                        <td className={`py-1 text-right font-mono ${confColor(entry.confidence)}`}>
                          {entry.confidence}%
                        </td>
                      </tr>
                    ))}
                  </tbody>
                </table>
              </div>
            </div>
          )}
        </>
      )}

      {/* Labeling mode: image crop + editable fields */}
      {mode === 'labeling' && editedEntries.length > 0 && (
        <div className="grid grid-cols-3 gap-4">
          {/* Left 2/3: Image with highlighted active row */}
          <div className="col-span-2">
            <div className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-1">
              Eintrag {activeIndex + 1} von {editedEntries.length}
            </div>
            <div className="border rounded-lg overflow-hidden dark:border-gray-700 bg-gray-50 dark:bg-gray-900 relative">
              {/* eslint-disable-next-line @next/next/no-img-element */}
              <img
                src={`${overlayUrl}?t=${overlayStamp}`}
                alt="Wort-Overlay"
                className="w-full h-auto"
              />
              {/* Highlight overlay for active entry bbox (percent-based) */}
              {editedEntries[activeIndex]?.bbox && (
                <div
                  className="absolute border-2 border-yellow-400 bg-yellow-400/10 pointer-events-none"
                  style={{
                    left: `${editedEntries[activeIndex].bbox.x}%`,
                    top: `${editedEntries[activeIndex].bbox.y}%`,
                    width: `${editedEntries[activeIndex].bbox.w}%`,
                    height: `${editedEntries[activeIndex].bbox.h}%`,
                  }}
                />
              )}
            </div>
          </div>

          {/* Right 1/3: Editable entry fields */}
          <div className="space-y-3">
            {/* Navigation */}
            <div className="flex items-center justify-between">
              <button
                onClick={() => setActiveIndex(Math.max(0, activeIndex - 1))}
                disabled={activeIndex === 0}
                className="px-2 py-1 text-xs border rounded hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 disabled:opacity-30"
              >
                Zurueck
              </button>
              <span className="text-xs text-gray-500">{activeIndex + 1} / {editedEntries.length}</span>
              <button
                onClick={() => setActiveIndex(Math.min(editedEntries.length - 1, activeIndex + 1))}
                disabled={activeIndex >= editedEntries.length - 1}
                className="px-2 py-1 text-xs border rounded hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 disabled:opacity-30"
              >
                Weiter
              </button>
            </div>

            {/* Status badge */}
            <div className="flex items-center gap-2">
              <span className={`px-2 py-0.5 rounded text-[10px] uppercase font-semibold ${statusBadge(editedEntries[activeIndex]?.status)}`}>
                {editedEntries[activeIndex]?.status || 'pending'}
              </span>
              <span className={`text-xs font-mono ${confColor(editedEntries[activeIndex]?.confidence || 0)}`}>
                {editedEntries[activeIndex]?.confidence}% Konfidenz
              </span>
            </div>

            {/* Cell crops */}
            {editedEntries[activeIndex]?.bbox_en && (
              <div>
                <div className="text-[10px] font-medium text-blue-500 mb-0.5">EN-Zelle</div>
                <div className="border rounded dark:border-gray-700 overflow-hidden bg-white dark:bg-gray-900 h-10 relative">
                  <CellCrop
                    imageUrl={dewarpedUrl}
                    bbox={editedEntries[activeIndex].bbox_en!}
                  />
                </div>
              </div>
            )}
            {editedEntries[activeIndex]?.bbox_de && (
              <div>
                <div className="text-[10px] font-medium text-green-500 mb-0.5">DE-Zelle</div>
                <div className="border rounded dark:border-gray-700 overflow-hidden bg-white dark:bg-gray-900 h-10 relative">
                  <CellCrop
                    imageUrl={dewarpedUrl}
                    bbox={editedEntries[activeIndex].bbox_de!}
                  />
                </div>
              </div>
            )}

            {/* Editable fields */}
            <div className="space-y-2">
              <div>
                <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">English</label>
                <input
                  ref={enRef}
                  type="text"
                  value={editedEntries[activeIndex]?.english || ''}
                  onChange={(e) => updateEntry(activeIndex, 'english', e.target.value)}
                  className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
                />
              </div>
              <div>
                <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Deutsch</label>
                <input
                  type="text"
                  value={editedEntries[activeIndex]?.german || ''}
                  onChange={(e) => updateEntry(activeIndex, 'german', e.target.value)}
                  className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
                />
              </div>
              <div>
                <label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Example</label>
                <input
                  type="text"
                  value={editedEntries[activeIndex]?.example || ''}
                  onChange={(e) => updateEntry(activeIndex, 'example', e.target.value)}
                  className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
                />
              </div>
            </div>

            {/* Action buttons */}
            <div className="flex gap-2">
              <button
                onClick={confirmEntry}
                className="flex-1 px-3 py-1.5 text-xs bg-green-600 text-white rounded-lg hover:bg-green-700 font-medium"
              >
                Bestaetigen (Enter)
              </button>
              <button
                onClick={skipEntry}
                className="px-3 py-1.5 text-xs border rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600"
              >
                Skip
              </button>
            </div>

            {/* Shortcuts hint */}
            <div className="text-[10px] text-gray-400 space-y-0.5">
              <div>Enter = Bestaetigen & weiter</div>
              <div>Ctrl+ = Ueberspringen</div>
              <div>Ctrl+ = Zurueck</div>
            </div>

            {/* Entry list (compact) */}
            <div className="border-t dark:border-gray-700 pt-2 mt-2">
              <div className="text-[10px] font-medium text-gray-500 dark:text-gray-400 mb-1">
                Alle Eintraege
              </div>
              <div className="max-h-48 overflow-y-auto space-y-0.5">
                {editedEntries.map((entry, idx) => (
                  <div
                    key={idx}
                    onClick={() => setActiveIndex(idx)}
                    className={`flex items-center gap-1 px-2 py-1 rounded text-[10px] cursor-pointer transition-colors ${
                      idx === activeIndex
                        ? 'bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700'
                        : 'hover:bg-gray-50 dark:hover:bg-gray-700/50'
                    }`}
                  >
                    <span className="w-4 text-right text-gray-400">{idx + 1}</span>
                    <span className={`w-2 h-2 rounded-full ${
                      entry.status === 'confirmed' ? 'bg-green-500' :
                      entry.status === 'edited' ? 'bg-blue-500' :
                      entry.status === 'skipped' ? 'bg-orange-400' :
                      'bg-gray-300 dark:bg-gray-600'
                    }`} />
                    <span className="truncate text-gray-600 dark:text-gray-400 font-mono">
                      {entry.english || '—'} {entry.german || '—'}
                    </span>
                  </div>
                ))}
              </div>
            </div>
          </div>
        </div>
      )}

      {/* Controls */}
      {wordResult && (
        <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
          <div className="flex items-center gap-3 flex-wrap">
            <button
              onClick={() => runAutoDetection()}
              disabled={detecting}
              className="px-3 py-1.5 text-xs border rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 disabled:opacity-50"
            >
              Erneut erkennen
            </button>
            <button
              onClick={() => goToStep(3)}
              className="px-3 py-1.5 text-xs border rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 dark:border-gray-600 text-orange-600 dark:text-orange-400 border-orange-300 dark:border-orange-700"
            >
              Zeilen korrigieren (Step 4)
            </button>
            <div className="flex-1" />

            {/* Ground truth */}
            {!gtSaved ? (
              <>
                <input
                  type="text"
                  placeholder="Notizen (optional)"
                  value={gtNotes}
                  onChange={(e) => setGtNotes(e.target.value)}
                  className="px-2 py-1 text-xs border rounded dark:bg-gray-700 dark:border-gray-600 w-48"
                />
                <button
                  onClick={() => handleGroundTruth(true)}
                  className="px-3 py-1.5 text-xs bg-green-600 text-white rounded-lg hover:bg-green-700"
                >
                  Korrekt
                </button>
                <button
                  onClick={() => handleGroundTruth(false)}
                  className="px-3 py-1.5 text-xs bg-red-600 text-white rounded-lg hover:bg-red-700"
                >
                  Fehlerhaft
                </button>
              </>
            ) : (
              <span className="text-xs text-green-600 dark:text-green-400">
                Ground Truth gespeichert
              </span>
            )}
            <button
              onClick={onNext}
              className="px-4 py-1.5 text-xs bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium"
            >
              Weiter
            </button>
          </div>
        </div>
      )}

      {error && (
        <div className="p-3 bg-red-50 dark:bg-red-900/20 text-red-600 dark:text-red-400 rounded-lg text-sm">
          {error}
        </div>
      )}
    </div>
  )
}
/**
 * CellCrop: shows a cropped portion of the dewarped image based on a
 * percent bbox. Uses CSS background positioning for efficient cropping.
 *
 * Bug fix: CSS `background-position` percentages are NOT linear offsets —
 * `P%` aligns the point at P% of the *image* with the point at P% of the
 * *container*. With background-size = s·100%, placing the bbox's left edge
 * (x% of the image) at the container origin requires
 *     P = x · s / (s − 1)    (for s > 1; at s ≤ 1 the image fits, use 0).
 * The previous `-x * s` formula was only correct for the top-left cell.
 *
 * NOTE(review): the single-value background-size scales width only (height
 * auto); the vertical formula assumes the container roughly preserves the
 * cell's aspect ratio — confirm against actual cell/container dimensions.
 */
function CellCrop({ imageUrl, bbox }: { imageUrl: string; bbox: { x: number; y: number; w: number; h: number } }) {
  // Zoom so the cell fills the container; cap at 8x to avoid extreme blur.
  const scaleX = 100 / bbox.w
  const scaleY = 100 / bbox.h
  const scale = Math.min(scaleX, scaleY, 8)
  // Convert a percent offset into the equivalent background-position percentage.
  const toPosition = (offsetPercent: number) =>
    scale > 1 ? (offsetPercent * scale) / (scale - 1) : 0
  return (
    <div
      className="w-full h-full"
      style={{
        backgroundImage: `url(${imageUrl})`,
        backgroundSize: `${scale * 100}%`,
        backgroundPosition: `${toPosition(bbox.x)}% ${toPosition(bbox.y)}%`,
        backgroundRepeat: 'no-repeat',
      }}
    />
  )
}

View File

@@ -0,0 +1,373 @@
# OCR Pipeline - Schrittweise Seitenrekonstruktion
**Version:** 1.0.0
**Status:** In Entwicklung
**URL:** https://macmini:3002/ai/ocr-pipeline
## Uebersicht
Die OCR Pipeline zerlegt den OCR-Prozess in **8 einzelne Schritte**, um eingescannte Vokabelseiten Wort fuer Wort zu rekonstruieren. Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten versehen werden.
**Ziel:** 10 Vokabelseiten fehlerfrei rekonstruieren.
### Pipeline-Schritte
| Schritt | Name | Beschreibung | Status |
|---------|------|--------------|--------|
| 1 | Begradigung (Deskew) | Scan begradigen (Hough Lines + Word Alignment) | Implementiert |
| 2 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert |
| 3 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile) | Implementiert |
| 4 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation | Implementiert |
| 5 | Worterkennung | Grid aus Spalten x Zeilen, OCR pro Zelle | Implementiert |
| 6 | Koordinatenzuweisung | Exakte Positionen innerhalb Zellen | Geplant |
| 7 | Seitenrekonstruktion | Seite nachbauen aus Koordinaten | Geplant |
| 8 | Ground Truth Validierung | Gesamtpruefung aller Schritte | Geplant |
---
## Architektur
```
Admin-Lehrer (Next.js) klausur-service (FastAPI :8086)
┌────────────────────┐ ┌─────────────────────────────┐
│ /ai/ocr-pipeline │ │ /api/v1/ocr-pipeline/ │
│ │ REST │ │
│ PipelineStepper │◄────────►│ Sessions CRUD │
│ StepDeskew │ │ Image Serving │
│ StepDewarp │ │ Deskew/Dewarp/Columns/Rows │
│ StepColumnDetection│ │ Word Recognition │
│ StepRowDetection │ │ Ground Truth │
│ StepWordRecognition│ │ Overlay Images │
└────────────────────┘ └─────────────────────────────┘
┌─────────────────────┐
│ PostgreSQL │
│ ocr_pipeline_sessions│
│ (Images + JSONB) │
└─────────────────────┘
```
### Dateistruktur
```
klausur-service/backend/
├── ocr_pipeline_api.py # FastAPI Router (alle Endpoints)
├── ocr_pipeline_session_store.py # PostgreSQL Persistence
├── cv_vocab_pipeline.py # Computer Vision Algorithmen
└── migrations/
├── 002_ocr_pipeline_sessions.sql # Basis-Schema
├── 003_add_row_result.sql # Row-Result Spalte
└── 004_add_word_result.sql # Word-Result Spalte
admin-lehrer/
├── app/(admin)/ai/ocr-pipeline/
│ ├── page.tsx # Haupt-Page mit Session-Management
│ └── types.ts # TypeScript Interfaces
└── components/ocr-pipeline/
├── PipelineStepper.tsx # Fortschritts-Stepper
├── StepDeskew.tsx # Schritt 1
├── StepDewarp.tsx # Schritt 2
├── StepColumnDetection.tsx # Schritt 3
├── StepRowDetection.tsx # Schritt 4
├── StepWordRecognition.tsx # Schritt 5
├── StepCoordinates.tsx # Schritt 6 (Platzhalter)
├── StepReconstruction.tsx # Schritt 7 (Platzhalter)
└── StepGroundTruth.tsx # Schritt 8 (Platzhalter)
```
---
## API-Referenz
Alle Endpoints unter `/api/v1/ocr-pipeline/`.
### Sessions
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions` | Neue Session erstellen (Bild hochladen) |
| `GET` | `/sessions` | Alle Sessions auflisten |
| `GET` | `/sessions/{id}` | Session-Info mit allen Step-Results |
| `PUT` | `/sessions/{id}` | Session umbenennen |
| `DELETE` | `/sessions/{id}` | Session loeschen |
### Bilder
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `GET` | `/sessions/{id}/image/original` | Originalbild |
| `GET` | `/sessions/{id}/image/deskewed` | Begradigtes Bild |
| `GET` | `/sessions/{id}/image/dewarped` | Entzerrtes Bild |
| `GET` | `/sessions/{id}/image/binarized` | Binarisiertes Bild |
| `GET` | `/sessions/{id}/image/columns-overlay` | Spalten-Overlay |
| `GET` | `/sessions/{id}/image/rows-overlay` | Zeilen-Overlay |
| `GET` | `/sessions/{id}/image/words-overlay` | Wort-Grid-Overlay |
### Schritt 1: Begradigung
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/deskew` | Automatische Begradigung |
| `POST` | `/sessions/{id}/deskew/manual` | Manuelle Winkelkorrektur |
| `POST` | `/sessions/{id}/ground-truth/deskew` | Ground Truth speichern |
### Schritt 2: Entzerrung
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/dewarp` | Automatische Entzerrung |
| `POST` | `/sessions/{id}/dewarp/manual` | Manueller Scherwinkel |
| `POST` | `/sessions/{id}/ground-truth/dewarp` | Ground Truth speichern |
### Schritt 3: Spalten
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/columns` | Automatische Spaltenerkennung |
| `POST` | `/sessions/{id}/columns/manual` | Manuelle Spalten-Definition |
| `POST` | `/sessions/{id}/ground-truth/columns` | Ground Truth speichern |
### Schritt 4: Zeilen
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/rows` | Automatische Zeilenerkennung |
| `POST` | `/sessions/{id}/rows/manual` | Manuelle Zeilen-Definition |
| `POST` | `/sessions/{id}/ground-truth/rows` | Ground Truth speichern |
| `GET` | `/sessions/{id}/ground-truth/rows` | Ground Truth abrufen |
### Schritt 5: Worterkennung
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/words` | Wort-Grid aus Spalten x Zeilen erstellen |
| `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern |
| `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen |
---
## Schritt 5: Worterkennung (Detail)
### Algorithmus: `build_word_grid()`
Schritt 5 nutzt die Ergebnisse von Schritt 3 (Spalten) und Schritt 4 (Zeilen), um ein Grid zu erstellen und jede Zelle per OCR auszulesen.
```
Spalten (Step 3): column_en | column_de | column_example
───────────┼─────────────┼────────────────
Zeilen (Step 4): R0 │ hello │ hallo │ Hello, World!
R1 │ world │ Welt │ The whole world
R2 │ book │ Buch │ Read a book
───────────┼─────────────┼────────────────
```
**Ablauf:**
1. **Filterung**: Nur `content`-Zeilen (kein Header/Footer) und relevante Spalten (`column_en`, `column_de`, `column_example`)
2. **Zell-Bildung**: Pro content-Zeile x pro relevante Spalte eine `PageRegion` berechnen
3. **OCR**: `ocr_region()` mit PSM 7 (Single Line) pro Zelle aufrufen
4. **Sprache**: `eng` fuer EN-Spalte, `deu` fuer DE-Spalte, `eng+deu` fuer Beispiele
5. **Gruppierung**: Zellen zu Vokabel-Eintraegen zusammenfuehren
### Response-Format
```json
{
"entries": [
{
"row_index": 0,
"english": "hello",
"german": "hallo",
"example": "Hello, how are you?",
"confidence": 85.3,
"bbox": {"x": 5.2, "y": 12.1, "w": 90.0, "h": 2.8},
"bbox_en": {"x": 5.2, "y": 12.1, "w": 30.0, "h": 2.8},
"bbox_de": {"x": 35.5, "y": 12.1, "w": 25.0, "h": 2.8},
"bbox_ex": {"x": 61.0, "y": 12.1, "w": 34.2, "h": 2.8}
}
],
"entry_count": 25,
"image_width": 2480,
"image_height": 3508,
"duration_seconds": 3.2,
"summary": {
"total_entries": 25,
"with_english": 24,
"with_german": 22,
"low_confidence": 3
}
}
```
!!! info "Bounding Boxes in Prozent"
Alle `bbox`-Werte sind Prozent (0-100) relativ zur Bildgroesse.
Das erleichtert die Darstellung im Frontend unabhaengig von der Bildaufloesung.
### Frontend: StepWordRecognition
Die Komponente bietet zwei Modi:
**Uebersicht-Modus:**
- Zwei Bilder nebeneinander: Grid-Overlay vs. sauberes Bild
- Tabelle aller erkannten Eintraege mit Konfidenz-Werten
- Klick auf Eintrag wechselt zum Labeling-Modus
**Labeling-Modus (Step-Through):**
- Links (2/3): Bild mit hervorgehobenem aktiven Eintrag (gelber Rahmen)
- Rechts (1/3): Zell-Ausschnitte + editierbare Felder (English, Deutsch, Example)
- Tastaturkuerzel:
- `Enter` = Bestaetigen und weiter
- `Ctrl+Pfeil runter` = Ueberspringen
- `Ctrl+Pfeil hoch` = Zurueck
**Feedback-Loop:**
- "Zeilen korrigieren" springt zurueck zu Schritt 4
- Nach Korrektur der Zeilen kann Schritt 5 erneut ausgefuehrt werden
---
## Datenbank-Schema
```sql
CREATE TABLE ocr_pipeline_sessions (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name VARCHAR(255),
filename VARCHAR(255),
status VARCHAR(50) DEFAULT 'active',
current_step INT DEFAULT 1,
-- Bilder (BYTEA)
original_png BYTEA,
deskewed_png BYTEA,
binarized_png BYTEA,
dewarped_png BYTEA,
-- Step-Results (JSONB)
deskew_result JSONB,
dewarp_result JSONB,
column_result JSONB,
row_result JSONB,
word_result JSONB,
-- Ground Truth + Meta
ground_truth JSONB,
auto_shear_degrees REAL,
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW()
);
```
### Migrationen
| Datei | Beschreibung |
|-------|--------------|
| `002_ocr_pipeline_sessions.sql` | Basis-Schema (Steps 1-3) |
| `003_add_row_result.sql` | `row_result JSONB` fuer Step 4 |
| `004_add_word_result.sql` | `word_result JSONB` fuer Step 5 |
---
## TypeScript Interfaces
Die wichtigsten Typen in `types.ts`:
```typescript
interface WordEntry {
row_index: number
english: string
german: string
example: string
confidence: number
bbox: WordBbox // Gesamte Zeile
bbox_en: WordBbox | null // EN-Zelle
bbox_de: WordBbox | null // DE-Zelle
bbox_ex: WordBbox | null // Example-Zelle
status?: 'pending' | 'confirmed' | 'edited' | 'skipped'
}
interface WordResult {
entries: WordEntry[]
entry_count: number
image_width: number
image_height: number
duration_seconds: number
summary: {
total_entries: number
with_english: number
with_german: number
low_confidence: number
}
}
```
---
## Ground Truth System
Jeder Schritt kann mit Ground-Truth-Feedback versehen werden:
```json
{
"is_correct": false,
"corrected_entries": [...],
"notes": "Zeile 5 falsch erkannt",
"saved_at": "2026-02-28T10:30:00"
}
```
Ground-Truth-Daten werden in der `ground_truth` JSONB-Spalte gespeichert, gruppiert nach Schritt:
```json
{
"deskew": { "is_correct": true, ... },
"dewarp": { "is_correct": true, ... },
"columns": { "is_correct": false, ... },
"rows": { "is_correct": true, ... },
"words": { "is_correct": false, ... }
}
```
---
## Deployment
```bash
# 1. Git push
git push origin main && git push gitea main
# 2. Mac Mini pull + build
ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && git pull --no-rebase origin main"
# klausur-service (Backend)
ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && \
/usr/local/bin/docker compose build --no-cache klausur-service && \
/usr/local/bin/docker compose up -d klausur-service"
# admin-lehrer (Frontend)
ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && \
/usr/local/bin/docker compose build --no-cache admin-lehrer && \
/usr/local/bin/docker compose up -d admin-lehrer"
# 3. Migration ausfuehren
ssh macmini "/usr/local/bin/docker exec bp-lehrer-klausur-service \
python -c \"import asyncio; from ocr_pipeline_session_store import *; asyncio.run(init_ocr_pipeline_tables())\""
# 4. Testen unter:
# https://macmini:3002/ai/ocr-pipeline
```
---
## Aenderungshistorie
| Datum | Version | Aenderung |
|-------|---------|----------|
| 2026-02-28 | 1.0.0 | Schritt 5 (Worterkennung) implementiert |
| 2026-02-22 | 0.4.0 | Schritt 4 (Zeilenerkennung) implementiert |
| 2026-02-20 | 0.3.0 | Schritt 3 (Spaltenerkennung) mit Typ-Klassifikation |
| 2026-02-15 | 0.2.0 | Schritt 2 (Entzerrung/Dewarp) |
| 2026-02-12 | 0.1.0 | Schritt 1 (Begradigung/Deskew) + Session-Management |

View File

@@ -2169,6 +2169,142 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
return regions
# =============================================================================
# Pipeline Step 5: Word Grid from Columns × Rows
# =============================================================================
def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
) -> List[Dict[str, Any]]:
    """Build a word grid by intersecting columns and rows, then OCR each cell.

    Each (content row × vocabulary column) intersection is OCRed as a single
    text line (Tesseract psm=7) with a language matched to the column type.

    Args:
        ocr_img: Binarized full-page image.
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Fallback Tesseract language for unmapped column types.

    Returns:
        List of entry dicts with english/german/example text, a per-entry
        mean OCR confidence, and bounding boxes in percent of image size.
        Rows whose cells are all empty after OCR are omitted.
    """
    # Only content rows carry vocabulary; header/footer rows are skipped.
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_word_grid: no content rows found")
        return []

    # Columns that can hold vocabulary data; anything else is ignored.
    VOCAB_COLUMN_TYPES = {'column_en', 'column_de', 'column_example'}
    relevant_cols = [c for c in column_regions if c.type in VOCAB_COLUMN_TYPES]
    if not relevant_cols:
        logger.warning("build_word_grid: no relevant vocabulary columns found")
        return []

    # Process columns left-to-right for deterministic cell order.
    relevant_cols.sort(key=lambda c: c.x)

    # Language-specific OCR per column type.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    entries: List[Dict[str, Any]] = []
    for row_idx, row in enumerate(content_rows):
        entry: Dict[str, Any] = {
            'row_index': row_idx,
            'english': '',
            'german': '',
            'example': '',
            'confidence': 0.0,
            # Full-row bbox in percent of image dimensions.
            'bbox': {
                'x': round(row.x / img_w * 100, 2),
                'y': round(row.y / img_h * 100, 2),
                'w': round(row.width / img_w * 100, 2),
                'h': round(row.height / img_h * 100, 2),
            },
            'bbox_en': None,
            'bbox_de': None,
            'bbox_ex': None,
        }
        confidences: List[float] = []
        for col in relevant_cols:
            # Cell = column x/width ∩ row y/height, clamped to image bounds.
            # Clamping via corner coordinates also shrinks the cell when the
            # origin is negative — the previous max(0, x) alone shifted the
            # cell right without reducing its width, OCRing the wrong span.
            x0 = max(0, col.x)
            y0 = max(0, row.y)
            x1 = min(img_w, col.x + col.width)
            y1 = min(img_h, row.y + row.height)
            cell_w = x1 - x0
            cell_h = y1 - y0
            if cell_w <= 0 or cell_h <= 0:
                continue

            cell_region = PageRegion(
                type=col.type,
                x=x0, y=y0,
                width=cell_w, height=cell_h,
            )
            cell_lang = lang_map.get(col.type, lang)
            # psm=7: treat the cell as one single text line.
            words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7)

            # Reading order: left to right.
            words.sort(key=lambda w: w['left'])
            text = ' '.join(w['text'] for w in words)
            if words:
                avg_conf = sum(w['conf'] for w in words) / len(words)
                confidences.append(avg_conf)

            # Cell bbox in percent (same convention as the row bbox).
            cell_bbox = {
                'x': round(x0 / img_w * 100, 2),
                'y': round(y0 / img_h * 100, 2),
                'w': round(cell_w / img_w * 100, 2),
                'h': round(cell_h / img_h * 100, 2),
            }
            if col.type == 'column_en':
                entry['english'] = text
                entry['bbox_en'] = cell_bbox
            elif col.type == 'column_de':
                entry['german'] = text
                entry['bbox_de'] = cell_bbox
            elif col.type == 'column_example':
                entry['example'] = text
                entry['bbox_ex'] = cell_bbox

        # Mean confidence over cells that produced at least one word.
        entry['confidence'] = round(
            sum(confidences) / len(confidences), 1
        ) if confidences else 0.0

        # Drop rows where OCR found nothing in any column.
        if entry['english'] or entry['german'] or entry['example']:
            entries.append(entry)

    logger.info(f"build_word_grid: {len(entries)} entries from "
                f"{len(content_rows)} content rows × {len(relevant_cols)} columns")
    return entries
# =============================================================================
# Stage 6: Multi-Pass OCR
# =============================================================================

View File

@@ -0,0 +1,4 @@
-- Migration 004: Add word_result column for OCR Pipeline Step 5
-- Stores the word recognition grid result (entries with english/german/example + bboxes)
-- IF NOT EXISTS keeps this migration idempotent on repeated runs.
ALTER TABLE ocr_pipeline_sessions ADD COLUMN IF NOT EXISTS word_result JSONB;

View File

@@ -29,8 +29,11 @@ from fastapi.responses import Response
from pydantic import BaseModel
from cv_vocab_pipeline import (
PageRegion,
RowGeometry,
analyze_layout,
analyze_layout_by_words,
build_word_grid,
classify_column_types,
create_layout_image,
create_ocr_image,
@@ -261,6 +264,10 @@ async def get_session_info(session_id: str):
result["dewarp_result"] = session["dewarp_result"]
if session.get("column_result"):
result["column_result"] = session["column_result"]
if session.get("row_result"):
result["row_result"] = session["row_result"]
if session.get("word_result"):
result["word_result"] = session["word_result"]
return result
@@ -291,7 +298,7 @@ async def delete_session(session_id: str):
@router.get("/sessions/{session_id}/image/{image_type}")
async def get_image(session_id: str, image_type: str):
"""Serve session images: original, deskewed, dewarped, binarized, columns-overlay, or rows-overlay."""
valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay"}
valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay", "words-overlay"}
if image_type not in valid_types:
raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}")
@@ -301,6 +308,9 @@ async def get_image(session_id: str, image_type: str):
if image_type == "rows-overlay":
return await _get_rows_overlay(session_id)
if image_type == "words-overlay":
return await _get_words_overlay(session_id)
# Try cache first for fast serving
cached = _cache.get(session_id)
if cached:
@@ -992,6 +1002,153 @@ async def get_row_ground_truth(session_id: str):
}
# ---------------------------------------------------------------------------
# Word Recognition Endpoints (Step 5)
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/words")
async def detect_words(session_id: str):
    """Build word grid from columns × rows, OCR each cell."""
    # Make sure the session images are resident in the in-memory cache.
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Step 5 operates on the dewarped page; bail out if Step 2 never ran.
    dewarped_bgr = cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        raise HTTPException(status_code=400, detail="Dewarp must be completed before word detection")

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Both prerequisite steps (columns, rows) must have produced results.
    column_result = session.get("column_result")
    row_result = session.get("row_result")
    if not column_result or not column_result.get("columns"):
        raise HTTPException(status_code=400, detail="Column detection must be completed first")
    if not row_result or not row_result.get("rows"):
        raise HTTPException(status_code=400, detail="Row detection must be completed first")

    t_start = time.time()

    # Binarize for OCR and capture pixel dimensions for percent bboxes.
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    # Rehydrate the persisted column dicts into PageRegion objects.
    col_regions: List[PageRegion] = []
    for c in column_result["columns"]:
        col_regions.append(PageRegion(
            type=c["type"],
            x=c["x"],
            y=c["y"],
            width=c["width"],
            height=c["height"],
            classification_confidence=c.get("classification_confidence", 1.0),
            classification_method=c.get("classification_method", ""),
        ))

    # Rehydrate the persisted row dicts into RowGeometry objects.
    row_geoms: List[RowGeometry] = []
    for r in row_result["rows"]:
        row_geoms.append(RowGeometry(
            index=r["index"],
            x=r["x"],
            y=r["y"],
            width=r["width"],
            height=r["height"],
            word_count=r.get("word_count", 0),
            words=[],
            row_type=r.get("row_type", "content"),
            gap_before=r.get("gap_before", 0),
        ))

    # Intersect columns × rows and OCR each cell.
    entries = build_word_grid(ocr_img, col_regions, row_geoms, img_w, img_h)
    duration = time.time() - t_start

    # Aggregate counts shown in the frontend overview.
    summary = {
        "total_entries": len(entries),
        "with_english": sum(1 for e in entries if e.get("english")),
        "with_german": sum(1 for e in entries if e.get("german")),
        "low_confidence": sum(1 for e in entries if e.get("confidence", 0) < 50),
    }
    word_result = {
        "entries": entries,
        "entry_count": len(entries),
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "summary": summary,
    }

    # Persist result and advance the pipeline to step 5; mirror into cache.
    await update_session_db(
        session_id,
        word_result=word_result,
        current_step=5,
    )
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words session {session_id}: "
                f"{len(entries)} entries ({duration:.2f}s), summary: {summary}")
    return {
        "session_id": session_id,
        **word_result,
    }
class WordGroundTruthRequest(BaseModel):
    # Ground-truth feedback payload for Step 5 (word recognition).
    # True when the automatically detected word grid was fully correct.
    is_correct: bool
    # Manually corrected vocabulary entries (same shape as word_result
    # entries); typically only provided when is_correct is False.
    corrected_entries: Optional[List[Dict[str, Any]]] = None
    # Free-form reviewer note (e.g. which row/cell was misrecognized).
    notes: Optional[str] = None
@router.post("/sessions/{session_id}/ground-truth/words")
async def save_word_ground_truth(session_id: str, req: WordGroundTruthRequest):
    """Save ground truth feedback for the word recognition step."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Snapshot the automatic result alongside the human feedback so later
    # analysis can compare auto vs. corrected output.
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
    # migrating to datetime.now(timezone.utc) would change the saved_at
    # format for every step, so it should be done for all steps at once.
    record = {
        "is_correct": req.is_correct,
        "corrected_entries": req.corrected_entries,
        "notes": req.notes,
        "saved_at": datetime.utcnow().isoformat(),
        "word_result": session.get("word_result"),
    }

    # Merge into the per-step ground-truth map under the "words" key.
    ground_truth = session.get("ground_truth") or {}
    ground_truth["words"] = record
    await update_session_db(session_id, ground_truth=ground_truth)

    # Keep the in-memory cache consistent with the database.
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth
    return {"session_id": session_id, "ground_truth": record}
@router.get("/sessions/{session_id}/ground-truth/words")
async def get_word_ground_truth(session_id: str):
    """Retrieve saved ground truth for word recognition."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Ground truth is stored per step; only the "words" entry matters here.
    saved = (session.get("ground_truth") or {}).get("words")
    if not saved:
        raise HTTPException(status_code=404, detail="No word ground truth saved")

    # Return both the human-labeled and the automatic result for comparison.
    return {
        "session_id": session_id,
        "words_gt": saved,
        "words_auto": session.get("word_result"),
    }
async def _get_rows_overlay(session_id: str) -> Response:
"""Generate dewarped image with row bands drawn on it."""
session = await get_session_db(session_id)
@@ -1049,3 +1206,106 @@ async def _get_rows_overlay(session_id: str) -> Response:
raise HTTPException(status_code=500, detail="Failed to encode overlay image")
return Response(content=result_png.tobytes(), media_type="image/png")
async def _get_words_overlay(session_id: str) -> Response:
    """Generate dewarped image with word grid cells drawn on it.

    Renders on top of the dewarped page:
      - vertical divider lines at both edges of each vocabulary column,
      - horizontal divider lines at the top of each content row,
      - for every recognized cell: a semi-transparent column-colored fill,
        a confidence-colored border, and a truncated OCR text label.

    Raises:
        HTTPException: 404 if the session, word data, or dewarped image is
            missing; 500 if the PNG cannot be decoded or encoded.

    Returns:
        PNG image response.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    # Word detection (Step 5) must have run and produced entries.
    word_result = session.get("word_result")
    if not word_result or not word_result.get("entries"):
        raise HTTPException(status_code=404, detail="No word data available")
    # Column/row results are optional here: divider lines are simply skipped
    # when they are absent.
    column_result = session.get("column_result")
    row_result = session.get("row_result")
    # Load dewarped image
    dewarped_png = await get_session_image(session_id, "dewarped")
    if not dewarped_png:
        raise HTTPException(status_code=404, detail="Dewarped image not available")
    arr = np.frombuffer(dewarped_png, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=500, detail="Failed to decode image")
    img_h, img_w = img.shape[:2]
    # Color map for cell types (BGR)
    cell_colors = {
        "column_en": (255, 180, 0),  # Blue
        "column_de": (0, 200, 0),  # Green
        "column_example": (0, 140, 255),  # Orange
    }
    # Snapshot taken BEFORE any lines/borders are drawn: cell fills painted
    # onto this copy blend against the clean page, not over divider lines.
    overlay = img.copy()
    # Draw column divider lines (vertical)
    if column_result and column_result.get("columns"):
        for col in column_result["columns"]:
            col_type = col.get("type", "")
            if col_type in cell_colors:
                # Left and right edge of the column, full image height.
                cx = col["x"]
                cv2.line(img, (cx, 0), (cx, img_h), cell_colors[col_type], 1)
                cx_end = col["x"] + col["width"]
                cv2.line(img, (cx_end, 0), (cx_end, img_h), cell_colors[col_type], 1)
    # Draw row divider lines (horizontal) for content rows
    if row_result and row_result.get("rows"):
        for row in row_result["rows"]:
            if row.get("row_type") == "content":
                # Gray line at the top edge of each content row.
                ry = row["y"]
                cv2.line(img, (0, ry), (img_w, ry), (180, 180, 180), 1)
    # Draw entry cells with text labels
    entries = word_result["entries"]
    for entry in entries:
        conf = entry.get("confidence", 0)
        # Color by confidence: green > 70, yellow 50-70, red < 50
        if conf >= 70:
            text_color = (0, 180, 0)
        elif conf >= 50:
            text_color = (0, 180, 220)
        else:
            text_color = (0, 0, 220)
        # One pass per vocabulary field; cells without bbox or text are
        # skipped (e.g. rows where only one column produced OCR output).
        for bbox_key, field_key, col_type in [
            ("bbox_en", "english", "column_en"),
            ("bbox_de", "german", "column_de"),
            ("bbox_ex", "example", "column_example"),
        ]:
            bbox = entry.get(bbox_key)
            text = entry.get(field_key, "")
            if not bbox or not text:
                continue
            # Convert percent to pixels (bboxes are stored in percent).
            bx = int(bbox["x"] / 100 * img_w)
            by = int(bbox["y"] / 100 * img_h)
            bw = int(bbox["w"] / 100 * img_w)
            bh = int(bbox["h"] / 100 * img_h)
            color = cell_colors.get(col_type, (200, 200, 200))
            # Semi-transparent fill (drawn on the snapshot, blended below)
            cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), color, -1)
            # Border (drawn directly on img, so it stays near full opacity)
            cv2.rectangle(img, (bx, by), (bx + bw, by + bh), text_color, 1)
            # Text label (truncate if too long)
            label = text[:30] if len(text) > 30 else text
            font_scale = 0.35
            cv2.putText(img, label, (bx + 3, by + bh - 4),
                        cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, 1)
    # Blend overlay at 10% opacity; dst=img writes the result in place.
    cv2.addWeighted(overlay, 0.1, img, 0.9, 0, img)
    success, result_png = cv2.imencode(".png", img)
    if not success:
        raise HTTPException(status_code=500, detail="Failed to encode overlay image")
    return Response(content=result_png.tobytes(), media_type="image/png")

View File

@@ -80,7 +80,7 @@ async def create_session_db(
) VALUES ($1, $2, $3, $4, 'active', 1)
RETURNING id, name, filename, status, current_step,
deskew_result, dewarp_result, column_result, row_result,
ground_truth, auto_shear_degrees,
word_result, ground_truth, auto_shear_degrees,
created_at, updated_at
""", uuid.UUID(session_id), name, filename, original_png)
@@ -94,7 +94,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
row = await conn.fetchrow("""
SELECT id, name, filename, status, current_step,
deskew_result, dewarp_result, column_result, row_result,
ground_truth, auto_shear_degrees,
word_result, ground_truth, auto_shear_degrees,
created_at, updated_at
FROM ocr_pipeline_sessions WHERE id = $1
""", uuid.UUID(session_id))
@@ -136,10 +136,10 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
'name', 'filename', 'status', 'current_step',
'original_png', 'deskewed_png', 'binarized_png', 'dewarped_png',
'deskew_result', 'dewarp_result', 'column_result', 'row_result',
'ground_truth', 'auto_shear_degrees',
'word_result', 'ground_truth', 'auto_shear_degrees',
}
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'ground_truth'}
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth'}
for key, value in kwargs.items():
if key in allowed_fields:
@@ -164,7 +164,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
WHERE id = ${param_idx}
RETURNING id, name, filename, status, current_step,
deskew_result, dewarp_result, column_result, row_result,
ground_truth, auto_shear_degrees,
word_result, ground_truth, auto_shear_degrees,
created_at, updated_at
""", *values)
@@ -220,7 +220,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
result[key] = result[key].isoformat()
# JSONB → parsed (asyncpg returns str for JSONB)
for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'ground_truth']:
for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth']:
if key in result and result[key] is not None:
if isinstance(result[key], str):
result[key] = json.loads(result[key])

View File

@@ -65,6 +65,7 @@ nav:
- BYOEH Architektur: services/klausur-service/BYOEH-Architecture.md
- BYOEH Developer Guide: services/klausur-service/BYOEH-Developer-Guide.md
- NiBiS Pipeline: services/klausur-service/NiBiS-Ingestion-Pipeline.md
- OCR Pipeline: services/klausur-service/OCR-Pipeline.md
- OCR Labeling: services/klausur-service/OCR-Labeling-Spec.md
- OCR Vergleich: services/klausur-service/OCR-Compare.md
- RAG Admin: services/klausur-service/RAG-Admin-Spec.md