feat: cell-first OCR + document type detection + dynamic pipeline steps

Cell-First OCR (v2): Each cell is cropped and OCR'd in isolation,
eliminating neighbour bleeding (e.g. "to", "ps" in marker columns).
Uses ThreadPoolExecutor for parallel Tesseract calls.

Document type detection: Classifies pages as vocab_table, full_text,
or generic_table using projection profiles (<2s, no OCR needed).
Frontend dynamically skips columns/rows steps for full-text pages.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 13:52:38 +01:00
parent 00a74b3144
commit 29c74a9962
7 changed files with 1001 additions and 75 deletions

View File

@@ -11,7 +11,7 @@ import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecogniti
import { StepLlmReview } from '@/components/ocr-pipeline/StepLlmReview'
import { StepReconstruction } from '@/components/ocr-pipeline/StepReconstruction'
import { StepGroundTruth } from '@/components/ocr-pipeline/StepGroundTruth'
import { PIPELINE_STEPS, type PipelineStep, type SessionListItem } from './types'
import { PIPELINE_STEPS, type PipelineStep, type SessionListItem, type DocumentTypeResult } from './types'
const KLAUSUR_API = '/klausur-api'
@@ -23,6 +23,7 @@ export default function OcrPipelinePage() {
const [loadingSessions, setLoadingSessions] = useState(true)
const [editingName, setEditingName] = useState<string | null>(null)
const [editNameValue, setEditNameValue] = useState('')
const [docTypeResult, setDocTypeResult] = useState<DocumentTypeResult | null>(null)
const [steps, setSteps] = useState<PipelineStep[]>(
PIPELINE_STEPS.map((s, i) => ({
...s,
@@ -59,16 +60,23 @@ export default function OcrPipelinePage() {
setSessionId(sid)
setSessionName(data.name || data.filename || '')
// Restore doc type result if available
const savedDocType: DocumentTypeResult | null = data.doc_type_result || null
setDocTypeResult(savedDocType)
// Determine which step to jump to based on current_step
const dbStep = data.current_step || 1
// Steps: 1=deskew, 2=dewarp, 3=columns, ...
// UI steps are 0-indexed: 0=deskew, 1=dewarp, 2=columns, ...
const uiStep = Math.max(0, dbStep - 1)
const skipSteps = savedDocType?.skip_steps || []
setSteps(
PIPELINE_STEPS.map((s, i) => ({
...s,
status: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
status: skipSteps.includes(s.id)
? 'skipped'
: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
})),
)
setCurrentStep(uiStep)
@@ -84,6 +92,7 @@ export default function OcrPipelinePage() {
if (sessionId === sid) {
setSessionId(null)
setCurrentStep(0)
setDocTypeResult(null)
setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
}
} catch (e) {
@@ -123,16 +132,28 @@ export default function OcrPipelinePage() {
}
const handleNext = () => {
if (currentStep < steps.length - 1) {
setSteps((prev) =>
prev.map((s, i) => {
if (i === currentStep) return { ...s, status: 'completed' }
if (i === currentStep + 1) return { ...s, status: 'active' }
return s
}),
)
setCurrentStep((prev) => prev + 1)
if (currentStep >= steps.length - 1) return
// Find the next non-skipped step
const skipSteps = docTypeResult?.skip_steps || []
let nextStep = currentStep + 1
while (nextStep < steps.length && skipSteps.includes(PIPELINE_STEPS[nextStep]?.id)) {
nextStep++
}
if (nextStep >= steps.length) nextStep = steps.length - 1
setSteps((prev) =>
prev.map((s, i) => {
if (i === currentStep) return { ...s, status: 'completed' }
if (i === nextStep) return { ...s, status: 'active' }
// Mark skipped steps between current and next
if (i > currentStep && i < nextStep && skipSteps.includes(PIPELINE_STEPS[i]?.id)) {
return { ...s, status: 'skipped' }
}
return s
}),
)
setCurrentStep(nextStep)
}
const handleDeskewComplete = (sid: string) => {
@@ -142,10 +163,69 @@ export default function OcrPipelinePage() {
handleNext()
}
/**
 * Called when the dewarp step finishes: asks the backend to classify the
 * document type, then advances to the next step that is not skipped.
 *
 * The next step is computed here from the freshly fetched result instead of
 * delegating to `handleNext()`. `setDocTypeResult` does not update the
 * `docTypeResult` value captured by this render, so `handleNext()` would still
 * see the old (usually null) result and activate a step that was just marked
 * as skipped.
 *
 * Detection is best-effort: on a network error or non-OK response the
 * previously known skip list (if any) is used and the pipeline still advances.
 */
const handleDewarpNext = async () => {
  // Start from whatever is already known; replaced below on successful detection.
  let skipSteps: string[] = docTypeResult?.skip_steps ?? []

  if (sessionId) {
    try {
      const res = await fetch(
        `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/detect-type`,
        { method: 'POST' },
      )
      if (res.ok) {
        const data: DocumentTypeResult = await res.json()
        setDocTypeResult(data)
        skipSteps = data.skip_steps ?? []
      }
    } catch (e) {
      console.error('Doc type detection failed:', e)
      // Not critical — continue without it
    }
  }

  // Same guard as handleNext: nothing to advance to from the last step.
  if (currentStep >= steps.length - 1) return

  // First step after the current one that is not skipped; fall back to the
  // last step when every remaining step is skipped (mirrors handleNext).
  const firstOpen = steps.findIndex((s, i) => i > currentStep && !skipSteps.includes(s.id))
  const nextStep = firstOpen === -1 ? steps.length - 1 : firstOpen

  setSteps((prev) =>
    prev.map((s, i) => {
      if (i === currentStep) return { ...s, status: 'completed' }
      if (i === nextStep) return { ...s, status: 'active' }
      if (skipSteps.includes(s.id)) return { ...s, status: 'skipped' }
      return s
    }),
  )
  setCurrentStep(nextStep)
}
/**
 * Applies a manual document-type override chosen by the user.
 *
 * Does nothing when no detection result exists yet. Otherwise the stored
 * result is replaced with one carrying the new type, its skip list
 * ('columns' and 'rows' for full text, nothing for the table types) and the
 * matching pipeline name. Step statuses are then re-synchronised: steps on
 * the skip list become 'skipped', previously skipped steps that are no longer
 * on the list go back to 'pending', and all other steps are left untouched.
 */
const handleDocTypeChange = (newDocType: DocumentTypeResult['doc_type']) => {
  if (!docTypeResult) return

  const isFullText = newDocType === 'full_text'
  // Only full-text pages skip the layout steps; table types run everything.
  const stepsToSkip: string[] = isFullText ? ['columns', 'rows'] : []

  setDocTypeResult({
    ...docTypeResult,
    doc_type: newDocType,
    skip_steps: stepsToSkip,
    pipeline: isFullText ? 'full_page' : 'cell_first',
  })

  setSteps((current) =>
    current.map((step) => {
      const shouldSkip = stepsToSkip.includes(step.id)
      // Neither newly skipped nor previously skipped: keep the step as is.
      if (!shouldSkip && step.status !== 'skipped') return step
      return shouldSkip
        ? { ...step, status: 'skipped' as const }
        : { ...step, status: 'pending' as const }
    }),
  )
}
/**
 * Returns the page to its initial state for a fresh session: clears the
 * session id, name and document-type result, and rewinds the stepper so that
 * only the first step is active and all others are pending.
 */
const handleNewSession = () => {
  setSessionId(null)
  setSessionName('')
  setDocTypeResult(null)
  setCurrentStep(0)
  setSteps(
    PIPELINE_STEPS.map((step, index) => ({
      ...step,
      status: index === 0 ? 'active' : 'pending',
    })),
  )
}
@@ -188,7 +268,7 @@ export default function OcrPipelinePage() {
case 0:
return <StepDeskew sessionId={sessionId} onNext={handleDeskewComplete} />
case 1:
return <StepDewarp sessionId={sessionId} onNext={handleNext} />
return <StepDewarp sessionId={sessionId} onNext={handleDewarpNext} />
case 2:
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} />
case 3:
@@ -314,7 +394,14 @@ export default function OcrPipelinePage() {
</div>
)}
<PipelineStepper steps={steps} currentStep={currentStep} onStepClick={handleStepClick} onReprocess={sessionId ? reprocessFromStep : undefined} />
<PipelineStepper
steps={steps}
currentStep={currentStep}
onStepClick={handleStepClick}
onReprocess={sessionId ? reprocessFromStep : undefined}
docTypeResult={docTypeResult}
onDocTypeChange={handleDocTypeChange}
/>
<div className="min-h-[400px]">{renderStep()}</div>
</div>

View File

@@ -1,4 +1,4 @@
export type PipelineStepStatus = 'pending' | 'active' | 'completed' | 'failed'
export type PipelineStepStatus = 'pending' | 'active' | 'completed' | 'failed' | 'skipped'
export interface PipelineStep {
id: string
@@ -17,6 +17,15 @@ export interface SessionListItem {
updated_at?: string
}
/**
 * Result of the document type detection for a page, as returned by the
 * `detect-type` endpoint and stored on the session as `doc_type_result`.
 */
export interface DocumentTypeResult {
/** Detected (or manually overridden) page category. */
doc_type: 'vocab_table' | 'full_text' | 'generic_table'
/** Detection confidence; the UI renders it as `confidence * 100` percent, so presumably in the range 0–1 — TODO confirm against the backend. */
confidence: number
/** Processing pipeline to use; the page sets 'full_page' for full-text documents and 'cell_first' otherwise when the type is overridden. */
pipeline: 'cell_first' | 'full_page'
/** Ids of pipeline steps (matched against `PipelineStep.id`) that are marked as skipped for this document. */
skip_steps: string[]
/** Optional detector output; presumably the raw measurements behind the classification — verify against the backend. */
features?: Record<string, unknown>
/** Optional timing value; presumably how long the detection took, in seconds — verify against the backend. */
duration_seconds?: number
}
export interface SessionInfo {
session_id: string
filename: string
@@ -30,6 +39,7 @@ export interface SessionInfo {
column_result?: ColumnResult
row_result?: RowResult
word_result?: GridResult
doc_type_result?: DocumentTypeResult
}
export interface DeskewResult {

View File

@@ -1,66 +1,115 @@
'use client'
import { PipelineStep } from '@/app/(admin)/ai/ocr-pipeline/types'
import { PipelineStep, DocumentTypeResult } from '@/app/(admin)/ai/ocr-pipeline/types'
// German display labels per document type. Used for the read-only badge when
// no change handler is supplied; unknown types fall back to the raw type string.
const DOC_TYPE_LABELS: Record<string, string> = {
vocab_table: 'Vokabeltabelle',
full_text: 'Volltext',
generic_table: 'Tabelle',
}
/** Props for the pipeline stepper bar and its optional document-type badge. */
interface PipelineStepperProps {
/** Steps to render, in display order; each step's `status` drives its styling. */
steps: PipelineStep[]
/** Index into `steps` of the step currently shown. */
currentStep: number
/** Invoked with the step index when a clickable (non-skipped) step is clicked. */
onStepClick: (index: number) => void
/** When provided, completed steps show a reprocess button that invokes this with the step index. */
onReprocess?: (index: number) => void
/** Detection result; the document-type badge is rendered only when this is set. */
docTypeResult?: DocumentTypeResult | null
/** When provided, the badge shows a dropdown and reports the chosen type here; otherwise a static label is shown. */
onDocTypeChange?: (docType: DocumentTypeResult['doc_type']) => void
}
export function PipelineStepper({ steps, currentStep, onStepClick, onReprocess }: PipelineStepperProps) {
export function PipelineStepper({
steps,
currentStep,
onStepClick,
onReprocess,
docTypeResult,
onDocTypeChange,
}: PipelineStepperProps) {
return (
<div className="flex items-center justify-between px-4 py-3 bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700">
{steps.map((step, index) => {
const isActive = index === currentStep
const isCompleted = step.status === 'completed'
const isFailed = step.status === 'failed'
const isClickable = index <= currentStep || isCompleted
<div className="space-y-2">
<div className="flex items-center justify-between px-4 py-3 bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700">
{steps.map((step, index) => {
const isActive = index === currentStep
const isCompleted = step.status === 'completed'
const isFailed = step.status === 'failed'
const isSkipped = step.status === 'skipped'
const isClickable = (index <= currentStep || isCompleted) && !isSkipped
return (
<div key={step.id} className="flex items-center">
{index > 0 && (
<div
className={`h-0.5 w-8 mx-1 ${
index <= currentStep ? 'bg-teal-400' : 'bg-gray-300 dark:bg-gray-600'
}`}
/>
)}
<div className="relative group">
<button
onClick={() => isClickable && onStepClick(index)}
disabled={!isClickable}
className={`flex items-center gap-1.5 px-3 py-1.5 rounded-full text-sm font-medium transition-all ${
isActive
? 'bg-teal-100 text-teal-700 dark:bg-teal-900/40 dark:text-teal-300 ring-2 ring-teal-400'
: isCompleted
? 'bg-green-100 text-green-700 dark:bg-green-900/40 dark:text-green-300'
: isFailed
? 'bg-red-100 text-red-700 dark:bg-red-900/40 dark:text-red-300'
: 'text-gray-400 dark:text-gray-500'
} ${isClickable ? 'cursor-pointer hover:opacity-80' : 'cursor-default'}`}
>
<span className="text-base">
{isCompleted ? '\u2713' : isFailed ? '\u2717' : step.icon}
</span>
<span className="hidden sm:inline">{step.name}</span>
<span className="sm:hidden">{index + 1}</span>
</button>
{/* Reprocess button — shown on completed steps on hover */}
{isCompleted && onReprocess && (
<button
onClick={(e) => { e.stopPropagation(); onReprocess(index) }}
className="absolute -top-1 -right-1 w-4 h-4 bg-orange-500 text-white rounded-full text-[9px] leading-none opacity-0 group-hover:opacity-100 transition-opacity flex items-center justify-center"
title={`Ab hier neu verarbeiten`}
>
&#x21BB;
</button>
return (
<div key={step.id} className="flex items-center">
{index > 0 && (
<div
className={`h-0.5 w-8 mx-1 ${
isSkipped
? 'bg-gray-200 dark:bg-gray-700 border-t border-dashed border-gray-400'
: index <= currentStep ? 'bg-teal-400' : 'bg-gray-300 dark:bg-gray-600'
}`}
/>
)}
<div className="relative group">
<button
onClick={() => isClickable && onStepClick(index)}
disabled={!isClickable}
className={`flex items-center gap-1.5 px-3 py-1.5 rounded-full text-sm font-medium transition-all ${
isSkipped
? 'bg-gray-100 text-gray-400 dark:bg-gray-800 dark:text-gray-600 line-through'
: isActive
? 'bg-teal-100 text-teal-700 dark:bg-teal-900/40 dark:text-teal-300 ring-2 ring-teal-400'
: isCompleted
? 'bg-green-100 text-green-700 dark:bg-green-900/40 dark:text-green-300'
: isFailed
? 'bg-red-100 text-red-700 dark:bg-red-900/40 dark:text-red-300'
: 'text-gray-400 dark:text-gray-500'
} ${isClickable ? 'cursor-pointer hover:opacity-80' : 'cursor-default'}`}
>
<span className="text-base">
{isSkipped ? '-' : isCompleted ? '\u2713' : isFailed ? '\u2717' : step.icon}
</span>
<span className="hidden sm:inline">{step.name}</span>
<span className="sm:hidden">{index + 1}</span>
</button>
{/* Reprocess button — shown on completed steps on hover */}
{isCompleted && onReprocess && (
<button
onClick={(e) => { e.stopPropagation(); onReprocess(index) }}
className="absolute -top-1 -right-1 w-4 h-4 bg-orange-500 text-white rounded-full text-[9px] leading-none opacity-0 group-hover:opacity-100 transition-opacity flex items-center justify-center"
title={`Ab hier neu verarbeiten`}
>
&#x21BB;
</button>
)}
</div>
</div>
</div>
)
})}
)
})}
</div>
{/* Document type badge */}
{docTypeResult && (
<div className="flex items-center gap-2 px-4 py-2 bg-blue-50 dark:bg-blue-900/20 rounded-lg border border-blue-200 dark:border-blue-800 text-sm">
<span className="text-blue-600 dark:text-blue-400 font-medium">
Dokumenttyp:
</span>
{onDocTypeChange ? (
<select
value={docTypeResult.doc_type}
onChange={(e) => onDocTypeChange(e.target.value as DocumentTypeResult['doc_type'])}
className="bg-white dark:bg-gray-800 border border-blue-300 dark:border-blue-700 rounded px-2 py-0.5 text-sm text-blue-700 dark:text-blue-300"
>
<option value="vocab_table">Vokabeltabelle</option>
<option value="generic_table">Tabelle (generisch)</option>
<option value="full_text">Volltext</option>
</select>
) : (
<span className="text-blue-700 dark:text-blue-300">
{DOC_TYPE_LABELS[docTypeResult.doc_type] || docTypeResult.doc_type}
</span>
)}
<span className="text-blue-400 dark:text-blue-500 text-xs">
({Math.round(docTypeResult.confidence * 100)}% Konfidenz)
</span>
</div>
)}
</div>
)
}