feat: cell-first OCR + document type detection + dynamic pipeline steps
Cell-first OCR (v2): each cell is cropped and OCR'd in isolation, which eliminates bleed-through from neighbouring cells (e.g. stray "to" or "ps" fragments showing up in marker columns). The Tesseract calls run in parallel via ThreadPoolExecutor.

Document type detection: pages are classified as vocab_table, full_text, or generic_table using projection profiles (<2s, no OCR needed). The frontend dynamically skips the columns/rows steps for full-text pages.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -11,7 +11,7 @@ import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecogniti
|
||||
import { StepLlmReview } from '@/components/ocr-pipeline/StepLlmReview'
|
||||
import { StepReconstruction } from '@/components/ocr-pipeline/StepReconstruction'
|
||||
import { StepGroundTruth } from '@/components/ocr-pipeline/StepGroundTruth'
|
||||
import { PIPELINE_STEPS, type PipelineStep, type SessionListItem } from './types'
|
||||
import { PIPELINE_STEPS, type PipelineStep, type SessionListItem, type DocumentTypeResult } from './types'
|
||||
|
||||
const KLAUSUR_API = '/klausur-api'
|
||||
|
||||
@@ -23,6 +23,7 @@ export default function OcrPipelinePage() {
|
||||
const [loadingSessions, setLoadingSessions] = useState(true)
|
||||
const [editingName, setEditingName] = useState<string | null>(null)
|
||||
const [editNameValue, setEditNameValue] = useState('')
|
||||
const [docTypeResult, setDocTypeResult] = useState<DocumentTypeResult | null>(null)
|
||||
const [steps, setSteps] = useState<PipelineStep[]>(
|
||||
PIPELINE_STEPS.map((s, i) => ({
|
||||
...s,
|
||||
@@ -59,16 +60,23 @@ export default function OcrPipelinePage() {
|
||||
setSessionId(sid)
|
||||
setSessionName(data.name || data.filename || '')
|
||||
|
||||
// Restore doc type result if available
|
||||
const savedDocType: DocumentTypeResult | null = data.doc_type_result || null
|
||||
setDocTypeResult(savedDocType)
|
||||
|
||||
// Determine which step to jump to based on current_step
|
||||
const dbStep = data.current_step || 1
|
||||
// Steps: 1=deskew, 2=dewarp, 3=columns, ...
|
||||
// UI steps are 0-indexed: 0=deskew, 1=dewarp, 2=columns, ...
|
||||
const uiStep = Math.max(0, dbStep - 1)
|
||||
const skipSteps = savedDocType?.skip_steps || []
|
||||
|
||||
setSteps(
|
||||
PIPELINE_STEPS.map((s, i) => ({
|
||||
...s,
|
||||
status: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
|
||||
status: skipSteps.includes(s.id)
|
||||
? 'skipped'
|
||||
: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
|
||||
})),
|
||||
)
|
||||
setCurrentStep(uiStep)
|
||||
@@ -84,6 +92,7 @@ export default function OcrPipelinePage() {
|
||||
if (sessionId === sid) {
|
||||
setSessionId(null)
|
||||
setCurrentStep(0)
|
||||
setDocTypeResult(null)
|
||||
setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
|
||||
}
|
||||
} catch (e) {
|
||||
@@ -123,16 +132,28 @@ export default function OcrPipelinePage() {
|
||||
}
|
||||
|
||||
const handleNext = () => {
|
||||
if (currentStep < steps.length - 1) {
|
||||
setSteps((prev) =>
|
||||
prev.map((s, i) => {
|
||||
if (i === currentStep) return { ...s, status: 'completed' }
|
||||
if (i === currentStep + 1) return { ...s, status: 'active' }
|
||||
return s
|
||||
}),
|
||||
)
|
||||
setCurrentStep((prev) => prev + 1)
|
||||
if (currentStep >= steps.length - 1) return
|
||||
|
||||
// Find the next non-skipped step
|
||||
const skipSteps = docTypeResult?.skip_steps || []
|
||||
let nextStep = currentStep + 1
|
||||
while (nextStep < steps.length && skipSteps.includes(PIPELINE_STEPS[nextStep]?.id)) {
|
||||
nextStep++
|
||||
}
|
||||
if (nextStep >= steps.length) nextStep = steps.length - 1
|
||||
|
||||
setSteps((prev) =>
|
||||
prev.map((s, i) => {
|
||||
if (i === currentStep) return { ...s, status: 'completed' }
|
||||
if (i === nextStep) return { ...s, status: 'active' }
|
||||
// Mark skipped steps between current and next
|
||||
if (i > currentStep && i < nextStep && skipSteps.includes(PIPELINE_STEPS[i]?.id)) {
|
||||
return { ...s, status: 'skipped' }
|
||||
}
|
||||
return s
|
||||
}),
|
||||
)
|
||||
setCurrentStep(nextStep)
|
||||
}
|
||||
|
||||
const handleDeskewComplete = (sid: string) => {
|
||||
@@ -142,10 +163,69 @@ export default function OcrPipelinePage() {
|
||||
handleNext()
|
||||
}
|
||||
|
||||
/**
 * Completes the dewarp step: asks the backend to classify the document type,
 * marks the steps that classification says to skip, and moves on to the first
 * step that is not skipped.
 *
 * Detection is best-effort. If the request fails or returns a non-OK status,
 * the pipeline still advances, using whatever skip list is already stored in
 * `docTypeResult` (or none).
 */
const handleDewarpNext = async () => {
  // Start from the skip list already in state so a failed detection behaves
  // like a plain "next".
  let skipSteps: string[] = docTypeResult?.skip_steps || []

  if (sessionId) {
    try {
      const res = await fetch(
        `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/detect-type`,
        { method: 'POST' },
      )
      if (res.ok) {
        const data: DocumentTypeResult = await res.json()
        setDocTypeResult(data)
        // Use the fresh response directly. The `docTypeResult` captured by this
        // render does not change after `setDocTypeResult`, so advancing based on
        // state would land on a step that was just marked as skipped.
        skipSteps = data.skip_steps || []
      } else {
        console.error('Doc type detection failed with status:', res.status)
      }
    } catch (e) {
      console.error('Doc type detection failed:', e)
      // Not critical — continue without it
    }
  }

  const atLastStep = currentStep >= steps.length - 1

  // Find the next non-skipped step, clamped to the final step.
  let nextStep = currentStep
  if (!atLastStep) {
    nextStep = currentStep + 1
    while (nextStep < steps.length && skipSteps.includes(PIPELINE_STEPS[nextStep]?.id)) {
      nextStep++
    }
    if (nextStep >= steps.length) nextStep = steps.length - 1
  }

  // Apply skip marks and the completed/active transition in one update so the
  // stepper never shows a skipped step as active.
  setSteps((prev) =>
    prev.map((s, i) => {
      if (!atLastStep && i === currentStep) return { ...s, status: 'completed' }
      if (!atLastStep && i === nextStep) return { ...s, status: 'active' }
      if (skipSteps.includes(s.id)) return { ...s, status: 'skipped' }
      return s
    }),
  )

  if (!atLastStep) setCurrentStep(nextStep)
}
|
||||
|
||||
/**
 * Applies a manual override of the detected document type.
 *
 * Does nothing when no detection result is stored. Otherwise it rewrites the
 * stored result (doc type, skip list, pipeline) and re-syncs step statuses:
 * steps in the new skip list become 'skipped', and steps that were skipped
 * but are no longer in the list go back to 'pending'.
 *
 * @param newDocType - The document type to switch to.
 */
const handleDocTypeChange = (newDocType: DocumentTypeResult['doc_type']) => {
  if (docTypeResult == null) return

  const isFullText = newDocType === 'full_text'

  // Only full-text pages skip steps; vocab_table and generic_table skip none.
  const stepsToSkip: string[] = isFullText ? ['columns', 'rows'] : []

  setDocTypeResult({
    ...docTypeResult,
    doc_type: newDocType,
    skip_steps: stepsToSkip,
    pipeline: isFullText ? 'full_page' : 'cell_first',
  })

  setSteps((prev) =>
    prev.map((step) => {
      const shouldSkip = stepsToSkip.includes(step.id)
      // Steps that are neither newly skipped nor previously skipped stay as they are.
      if (!shouldSkip && step.status !== 'skipped') return step
      return shouldSkip
        ? { ...step, status: 'skipped' as const }
        : { ...step, status: 'pending' as const }
    }),
  )
}
|
||||
|
||||
/**
 * Resets the page to a blank session: clears the session id, name and stored
 * document-type result, and rewinds the stepper so only the first step is active.
 */
const handleNewSession = () => {
  const initialSteps: PipelineStep[] = PIPELINE_STEPS.map((step, index) => ({
    ...step,
    status: index === 0 ? 'active' : 'pending',
  }))

  setSessionId(null)
  setSessionName('')
  setCurrentStep(0)
  setDocTypeResult(null)
  setSteps(initialSteps)
}
|
||||
|
||||
@@ -188,7 +268,7 @@ export default function OcrPipelinePage() {
|
||||
case 0:
|
||||
return <StepDeskew sessionId={sessionId} onNext={handleDeskewComplete} />
|
||||
case 1:
|
||||
return <StepDewarp sessionId={sessionId} onNext={handleNext} />
|
||||
return <StepDewarp sessionId={sessionId} onNext={handleDewarpNext} />
|
||||
case 2:
|
||||
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} />
|
||||
case 3:
|
||||
@@ -314,7 +394,14 @@ export default function OcrPipelinePage() {
|
||||
</div>
|
||||
)}
|
||||
|
||||
<PipelineStepper steps={steps} currentStep={currentStep} onStepClick={handleStepClick} onReprocess={sessionId ? reprocessFromStep : undefined} />
|
||||
<PipelineStepper
|
||||
steps={steps}
|
||||
currentStep={currentStep}
|
||||
onStepClick={handleStepClick}
|
||||
onReprocess={sessionId ? reprocessFromStep : undefined}
|
||||
docTypeResult={docTypeResult}
|
||||
onDocTypeChange={handleDocTypeChange}
|
||||
/>
|
||||
|
||||
<div className="min-h-[400px]">{renderStep()}</div>
|
||||
</div>
|
||||
|
||||
Reference in New Issue
Block a user