Add OCR Kombi Pipeline: modular 11-step architecture with multi-page support
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m24s
CI / test-python-agent-core (push) Successful in 22s
CI / test-nodejs-website (push) Successful in 20s

Phase 1 of the clean architecture refactor: Replaces the 751-line ocr-overlay
monolith with a modular pipeline. Each step gets its own component file.

Frontend: /ai/ocr-kombi route with 11 steps (Upload, Orientation, PageSplit,
Deskew, Dewarp, ContentCrop, OCR, Structure, GridBuild, GridReview, GroundTruth).
Session list supports document grouping for multi-page uploads.

Backend: New ocr_kombi/ module with multi-page PDF upload (splits PDF into N
sessions with shared document_group_id). DB migration adds document_group_id
and page_number columns.

Old /ai/ocr-overlay remains fully functional for A/B testing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-26 15:55:28 +01:00
parent d26233b5b3
commit d26a9f60ab
25 changed files with 1935 additions and 7 deletions

View File

@@ -0,0 +1,174 @@
'use client'
import { Suspense } from 'react'
import { PagePurpose } from '@/components/common/PagePurpose'
import { BoxSessionTabs } from '@/components/ocr-pipeline/BoxSessionTabs'
import { KombiStepper } from '@/components/ocr-kombi/KombiStepper'
import { SessionList } from '@/components/ocr-kombi/SessionList'
import { SessionHeader } from '@/components/ocr-kombi/SessionHeader'
import { StepUpload } from '@/components/ocr-kombi/StepUpload'
import { StepOrientation } from '@/components/ocr-kombi/StepOrientation'
import { StepPageSplit } from '@/components/ocr-kombi/StepPageSplit'
import { StepDeskew } from '@/components/ocr-kombi/StepDeskew'
import { StepDewarp } from '@/components/ocr-kombi/StepDewarp'
import { StepContentCrop } from '@/components/ocr-kombi/StepContentCrop'
import { StepOcr } from '@/components/ocr-kombi/StepOcr'
import { StepStructure } from '@/components/ocr-kombi/StepStructure'
import { StepGridBuild } from '@/components/ocr-kombi/StepGridBuild'
import { StepGridReview } from '@/components/ocr-kombi/StepGridReview'
import { StepGroundTruth } from '@/components/ocr-kombi/StepGroundTruth'
import { useKombiPipeline } from './useKombiPipeline'
function OcrKombiContent() {
const {
currentStep,
sessionId,
sessionName,
loadingSessions,
activeCategory,
isGroundTruth,
subSessions,
parentSessionId,
steps,
gridSaveRef,
groupedSessions,
loadSessions,
openSession,
handleStepClick,
handleNext,
handleNewSession,
deleteSession,
renameSession,
updateCategory,
handleOrientationComplete,
handleSessionChange,
setSessionId,
setSubSessions,
setParentSessionId,
setIsGroundTruth,
} = useKombiPipeline()
const renderStep = () => {
switch (currentStep) {
case 0:
return (
<StepUpload
onUploaded={(sid) => {
setSessionId(sid)
loadSessions()
openSession(sid)
handleNext()
}}
/>
)
case 1:
return (
<StepOrientation
sessionId={sessionId}
onNext={handleOrientationComplete}
onSessionList={() => { loadSessions(); handleNewSession() }}
/>
)
case 2:
return (
<StepPageSplit
sessionId={sessionId}
onNext={handleNext}
onSubSessionsCreated={(subs) => {
setSubSessions(subs)
if (sessionId) setParentSessionId(sessionId)
}}
/>
)
case 3:
return <StepDeskew sessionId={sessionId} onNext={handleNext} />
case 4:
return <StepDewarp sessionId={sessionId} onNext={handleNext} />
case 5:
return <StepContentCrop sessionId={sessionId} onNext={handleNext} />
case 6:
return <StepOcr sessionId={sessionId} onNext={handleNext} />
case 7:
return <StepStructure sessionId={sessionId} onNext={handleNext} />
case 8:
return <StepGridBuild sessionId={sessionId} onNext={handleNext} />
case 9:
return <StepGridReview sessionId={sessionId} onNext={handleNext} saveRef={gridSaveRef} />
case 10:
return (
<StepGroundTruth
sessionId={sessionId}
isGroundTruth={isGroundTruth}
onMarked={() => setIsGroundTruth(true)}
gridSaveRef={gridSaveRef}
/>
)
default:
return null
}
}
return (
<div className="space-y-6">
<PagePurpose
title="OCR Kombi Pipeline"
purpose="Modulare 11-Schritt-Pipeline: Upload, Vorverarbeitung, Dual-Engine-OCR (PP-OCRv5 + Tesseract), Strukturerkennung, Grid-Aufbau und Review. Multi-Page-Dokument-Unterstuetzung."
audience={['Entwickler']}
architecture={{
services: ['klausur-service (FastAPI)', 'OpenCV', 'Tesseract', 'PaddleOCR'],
databases: ['PostgreSQL Sessions'],
}}
relatedPages={[
{ name: 'OCR Overlay (Legacy)', href: '/ai/ocr-overlay', description: 'Alter 3-Modi-Monolith' },
{ name: 'OCR Regression', href: '/ai/ocr-regression', description: 'Regressionstests' },
]}
defaultCollapsed
/>
<SessionList
items={groupedSessions()}
loading={loadingSessions}
activeSessionId={sessionId}
onOpenSession={(sid) => openSession(sid)}
onNewSession={handleNewSession}
onDeleteSession={deleteSession}
onRenameSession={renameSession}
onUpdateCategory={updateCategory}
/>
{sessionId && sessionName && (
<SessionHeader
sessionName={sessionName}
activeCategory={activeCategory}
isGroundTruth={isGroundTruth}
onUpdateCategory={(cat) => updateCategory(sessionId, cat)}
/>
)}
<KombiStepper
steps={steps}
currentStep={currentStep}
onStepClick={handleStepClick}
/>
{subSessions.length > 0 && parentSessionId && sessionId && (
<BoxSessionTabs
parentSessionId={parentSessionId}
subSessions={subSessions}
activeSessionId={sessionId}
onSessionChange={handleSessionChange}
/>
)}
<div className="min-h-[400px]">{renderStep()}</div>
</div>
)
}
export default function OcrKombiPage() {
return (
<Suspense fallback={<div className="p-4 text-sm text-gray-400">Lade...</div>}>
<OcrKombiContent />
</Suspense>
)
}

View File

@@ -0,0 +1,118 @@
import type { PipelineStep, PipelineStepStatus, DocumentCategory } from '../ocr-pipeline/types'
// Re-export shared types
export type { PipelineStep, PipelineStepStatus, DocumentCategory }
export { DOCUMENT_CATEGORIES } from '../ocr-pipeline/types'
// Re-export grid/structure types used by later steps
export type {
SessionListItem,
SessionInfo,
SubSession,
OrientationResult,
CropResult,
DeskewResult,
DewarpResult,
GridResult,
GridCell,
OcrWordBox,
WordBbox,
ColumnMeta,
StructureResult,
StructureBox,
StructureZone,
StructureGraphic,
ExcludeRegion,
} from '../ocr-pipeline/types'
/**
* 11-step Kombi V2 pipeline.
* Each step has its own component file in components/ocr-kombi/.
*/
export const KOMBI_V2_STEPS: PipelineStep[] = [
{ id: 'upload', name: 'Upload', icon: '📤', status: 'pending' },
{ id: 'orientation', name: 'Orientierung', icon: '🔄', status: 'pending' },
{ id: 'page-split', name: 'Seitentrennung', icon: '📖', status: 'pending' },
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },
{ id: 'content-crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' },
{ id: 'ocr', name: 'OCR', icon: '🔀', status: 'pending' },
{ id: 'structure', name: 'Strukturerkennung', icon: '🔍', status: 'pending' },
{ id: 'grid-build', name: 'Grid-Aufbau', icon: '🧱', status: 'pending' },
{ id: 'grid-review', name: 'Grid-Review', icon: '📊', status: 'pending' },
{ id: 'ground-truth', name: 'Ground Truth', icon: '✅', status: 'pending' },
]
/** Map from Kombi V2 UI step index to DB step number */
export const KOMBI_V2_UI_TO_DB: Record<number, number> = {
0: 1, // upload
1: 2, // orientation
2: 2, // page-split (same DB step as orientation)
3: 3, // deskew
4: 4, // dewarp
5: 5, // content-crop
6: 8, // ocr (word_result)
7: 9, // structure
8: 10, // grid-build
9: 11, // grid-review
10: 12, // ground-truth
}
/** Map from DB step to Kombi V2 UI step index */
export function dbStepToKombiV2Ui(dbStep: number): number {
if (dbStep <= 1) return 0 // upload
if (dbStep === 2) return 1 // orientation
if (dbStep === 3) return 3 // deskew
if (dbStep === 4) return 4 // dewarp
if (dbStep === 5) return 5 // content-crop
if (dbStep <= 8) return 6 // ocr
if (dbStep === 9) return 7 // structure
if (dbStep === 10) return 8 // grid-build
if (dbStep === 11) return 9 // grid-review
return 10 // ground-truth
}
/** Document group: groups multiple sessions from a multi-page upload */
export interface DocumentGroup {
group_id: string
title: string
page_count: number
sessions: DocumentGroupSession[]
}
export interface DocumentGroupSession {
id: string
name: string
page_number: number
current_step: number
status: string
document_category?: DocumentCategory
created_at: string
}
/** Engine source for OCR transparency */
export type OcrEngineSource = 'both' | 'paddle_only' | 'tesseract_only' | 'conflict_paddle' | 'conflict_tesseract'
export interface OcrTransparentWord {
text: string
left: number
top: number
width: number
height: number
conf: number
engine_source: OcrEngineSource
}
export interface OcrTransparentResult {
raw_tesseract: { words: OcrTransparentWord[] }
raw_paddle: { words: OcrTransparentWord[] }
merged: { words: OcrTransparentWord[] }
stats: {
total_words: number
both_agree: number
paddle_only: number
tesseract_only: number
conflict_paddle_wins: number
conflict_tesseract_wins: number
}
}

View File

@@ -0,0 +1,361 @@
'use client'
import { useCallback, useEffect, useState, useRef } from 'react'
import { useSearchParams } from 'next/navigation'
import type { PipelineStep, DocumentCategory } from './types'
import { KOMBI_V2_STEPS, dbStepToKombiV2Ui } from './types'
import type { SubSession, SessionListItem } from '../ocr-pipeline/types'
export type { SessionListItem }
const KLAUSUR_API = '/klausur-api'
/** Groups sessions by document_group_id for the session list */
export interface DocumentGroupView {
group_id: string
title: string
sessions: SessionListItem[]
page_count: number
}
function initSteps(): PipelineStep[] {
return KOMBI_V2_STEPS.map((s, i) => ({
...s,
status: i === 0 ? 'active' : 'pending',
}))
}
export function useKombiPipeline() {
const [currentStep, setCurrentStep] = useState(0)
const [sessionId, setSessionId] = useState<string | null>(null)
const [sessionName, setSessionName] = useState('')
const [sessions, setSessions] = useState<SessionListItem[]>([])
const [loadingSessions, setLoadingSessions] = useState(true)
const [activeCategory, setActiveCategory] = useState<DocumentCategory | undefined>(undefined)
const [isGroundTruth, setIsGroundTruth] = useState(false)
const [subSessions, setSubSessions] = useState<SubSession[]>([])
const [parentSessionId, setParentSessionId] = useState<string | null>(null)
const [steps, setSteps] = useState<PipelineStep[]>(initSteps())
const searchParams = useSearchParams()
const deepLinkHandled = useRef(false)
const gridSaveRef = useRef<(() => Promise<void>) | null>(null)
// ---- Session loading ----
const loadSessions = useCallback(async () => {
setLoadingSessions(true)
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`)
if (res.ok) {
const data = await res.json()
setSessions((data.sessions || []).filter((s: SessionListItem) => !s.parent_session_id))
}
} catch (e) {
console.error('Failed to load sessions:', e)
} finally {
setLoadingSessions(false)
}
}, [])
useEffect(() => { loadSessions() }, [loadSessions])
// ---- Group sessions by document_group_id ----
const groupedSessions = useCallback((): (SessionListItem | DocumentGroupView)[] => {
const groups = new Map<string, SessionListItem[]>()
const ungrouped: SessionListItem[] = []
for (const s of sessions) {
if (s.document_group_id) {
const existing = groups.get(s.document_group_id) || []
existing.push(s)
groups.set(s.document_group_id, existing)
} else {
ungrouped.push(s)
}
}
const result: (SessionListItem | DocumentGroupView)[] = []
// Sort groups by earliest created_at
const sortedGroups = Array.from(groups.entries()).sort((a, b) => {
const aTime = Math.min(...a[1].map(s => new Date(s.created_at).getTime()))
const bTime = Math.min(...b[1].map(s => new Date(s.created_at).getTime()))
return bTime - aTime
})
for (const [groupId, groupSessions] of sortedGroups) {
groupSessions.sort((a, b) => (a.page_number || 0) - (b.page_number || 0))
// Extract base title (remove " — S. X" suffix)
const baseName = groupSessions[0]?.name?.replace(/ — S\. \d+$/, '') || 'Dokument'
result.push({
group_id: groupId,
title: baseName,
sessions: groupSessions,
page_count: groupSessions.length,
})
}
for (const s of ungrouped) {
result.push(s)
}
// Sort by creation time (most recent first)
const getTime = (item: SessionListItem | DocumentGroupView): number => {
if ('group_id' in item) {
return Math.min(...item.sessions.map((s: SessionListItem) => new Date(s.created_at).getTime()))
}
return new Date(item.created_at).getTime()
}
result.sort((a, b) => getTime(b) - getTime(a))
return result
}, [sessions])
// ---- Open session ----
const openSession = useCallback(async (sid: string, keepSubSessions?: boolean) => {
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`)
if (!res.ok) return
const data = await res.json()
setSessionId(sid)
setSessionName(data.name || data.filename || '')
setActiveCategory(data.document_category || undefined)
setIsGroundTruth(!!data.ground_truth?.build_grid_reference)
// Sub-session handling
if (data.sub_sessions?.length > 0) {
setSubSessions(data.sub_sessions)
setParentSessionId(sid)
} else if (data.parent_session_id) {
setParentSessionId(data.parent_session_id)
} else if (!keepSubSessions) {
setSubSessions([])
setParentSessionId(null)
}
// Determine UI step from DB state
const dbStep = data.current_step || 1
const hasGrid = !!data.grid_editor_result
const hasStructure = !!data.structure_result
const hasWords = !!data.word_result
let uiStep: number
if (hasGrid) {
uiStep = 9 // grid-review
} else if (hasStructure) {
uiStep = 8 // grid-build
} else if (hasWords) {
uiStep = 7 // structure
} else {
uiStep = dbStepToKombiV2Ui(dbStep)
}
// For sessions that already have an upload, skip the upload step
if (uiStep === 0 && dbStep >= 2) {
uiStep = 1
}
const skipIds: string[] = []
const isSubSession = !!data.parent_session_id
if (isSubSession && dbStep >= 5) {
skipIds.push('upload', 'orientation', 'page-split', 'deskew', 'dewarp', 'content-crop')
if (uiStep < 6) uiStep = 6
} else if (isSubSession && dbStep >= 2) {
skipIds.push('upload', 'orientation')
if (uiStep < 2) uiStep = 2
}
setSteps(
KOMBI_V2_STEPS.map((s, i) => ({
...s,
status: skipIds.includes(s.id)
? 'skipped'
: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
})),
)
setCurrentStep(uiStep)
} catch (e) {
console.error('Failed to open session:', e)
}
}, [])
// ---- Deep link handling ----
useEffect(() => {
if (deepLinkHandled.current) return
const urlSession = searchParams.get('session')
const urlStep = searchParams.get('step')
if (urlSession) {
deepLinkHandled.current = true
openSession(urlSession).then(() => {
if (urlStep) {
const stepIdx = parseInt(urlStep, 10)
if (!isNaN(stepIdx) && stepIdx >= 0 && stepIdx < KOMBI_V2_STEPS.length) {
setCurrentStep(stepIdx)
}
}
})
}
}, [searchParams, openSession])
// ---- Step navigation ----
const goToStep = useCallback((step: number) => {
setCurrentStep(step)
setSteps(prev =>
prev.map((s, i) => ({
...s,
status: i < step ? 'completed' : i === step ? 'active' : 'pending',
})),
)
}, [])
const handleStepClick = useCallback((index: number) => {
if (index <= currentStep || steps[index].status === 'completed') {
setCurrentStep(index)
}
}, [currentStep, steps])
const handleNext = useCallback(() => {
if (currentStep >= steps.length - 1) {
// Last step → return to session list
setSteps(initSteps())
setCurrentStep(0)
setSessionId(null)
setSubSessions([])
setParentSessionId(null)
loadSessions()
return
}
const nextStep = currentStep + 1
setSteps(prev =>
prev.map((s, i) => {
if (i === currentStep) return { ...s, status: 'completed' }
if (i === nextStep) return { ...s, status: 'active' }
return s
}),
)
setCurrentStep(nextStep)
}, [currentStep, steps, loadSessions])
// ---- Session CRUD ----
const handleNewSession = useCallback(() => {
setSessionId(null)
setSessionName('')
setCurrentStep(0)
setSubSessions([])
setParentSessionId(null)
setSteps(initSteps())
}, [])
const deleteSession = useCallback(async (sid: string) => {
try {
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`, { method: 'DELETE' })
setSessions(prev => prev.filter(s => s.id !== sid))
if (sessionId === sid) handleNewSession()
} catch (e) {
console.error('Failed to delete session:', e)
}
}, [sessionId, handleNewSession])
const renameSession = useCallback(async (sid: string, newName: string) => {
try {
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`, {
method: 'PUT',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: newName }),
})
setSessions(prev => prev.map(s => s.id === sid ? { ...s, name: newName } : s))
if (sessionId === sid) setSessionName(newName)
} catch (e) {
console.error('Failed to rename session:', e)
}
}, [sessionId])
const updateCategory = useCallback(async (sid: string, category: DocumentCategory) => {
try {
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`, {
method: 'PUT',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ document_category: category }),
})
setSessions(prev => prev.map(s => s.id === sid ? { ...s, document_category: category } : s))
if (sessionId === sid) setActiveCategory(category)
} catch (e) {
console.error('Failed to update category:', e)
}
}, [sessionId])
// ---- Orientation completion (checks for page-split sub-sessions) ----
const handleOrientationComplete = useCallback(async (sid: string) => {
setSessionId(sid)
loadSessions()
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`)
if (res.ok) {
const data = await res.json()
if (data.sub_sessions?.length > 0) {
const subs: SubSession[] = data.sub_sessions.map((s: SubSession) => ({
id: s.id,
name: s.name,
box_index: s.box_index,
current_step: s.current_step,
}))
setSubSessions(subs)
setParentSessionId(sid)
openSession(subs[0].id, true)
return
}
}
} catch (e) {
console.error('Failed to check for sub-sessions:', e)
}
handleNext()
}, [loadSessions, openSession, handleNext])
const handleSessionChange = useCallback((newSessionId: string) => {
openSession(newSessionId, true)
}, [openSession])
return {
// State
currentStep,
sessionId,
sessionName,
sessions,
loadingSessions,
activeCategory,
isGroundTruth,
subSessions,
parentSessionId,
steps,
gridSaveRef,
// Computed
groupedSessions,
// Actions
loadSessions,
openSession,
goToStep,
handleStepClick,
handleNext,
handleNewSession,
deleteSession,
renameSession,
updateCategory,
handleOrientationComplete,
handleSessionChange,
setSessionId,
setSubSessions,
setParentSessionId,
setIsGroundTruth,
}
}

View File

@@ -33,6 +33,9 @@ export interface SessionListItem {
current_step: number
document_category?: DocumentCategory
doc_type?: string
parent_session_id?: string
document_group_id?: string
page_number?: number
created_at: string
updated_at?: string
}
@@ -108,6 +111,8 @@ export interface SessionInfo {
sub_sessions?: SubSession[]
parent_session_id?: string
box_index?: number
document_group_id?: string
page_number?: number
}
export interface DeskewResult {