Compare commits

...

4 Commits

Author SHA1 Message Date
Benjamin Admin
4feec7c7b7 Lower syllable pipe-ratio threshold from 5% to 1%
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m58s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Real dictionary pages have only ~3% OCR-detected pipes because the thin
syllable divider lines are hard for OCR to read. The primary false-positive
guard (article_col_index check) already blocks synonym dictionaries.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 23:17:08 +01:00
Benjamin Admin
ed7fc99fc4 Improve syllable divider insertion for dictionary pages
Rewrite cv_syllable_detect.py with pyphen-first approach:
- Remove unreliable CV gate (morphological pipe detection)
- Strip existing pipes and re-syllabify via pyphen (DE then EN)
- Merge pipe-gap spaces where OCR split words at divider positions
- Guard merges with function word blacklist and punctuation checks

Add false-positive prevention:
- Pre-check: skip if <5% of cells have existing | from OCR
- Call-site check: require article_col_index (der/die/das column)
- Prevents syllabification of synonym dictionaries and word lists

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 19:44:29 +01:00
Benjamin Admin
7fbcae954b fix: auto-trigger orientation for page-split sessions without result
Page-split sessions (start_step=1) have no orientation_result stored.
StepOrientation now auto-runs orientation detection when loading an
existing session that lacks a result.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 17:19:56 +01:00
Benjamin Admin
f931091b57 refactor: independent sessions for page-split + URL-based pipeline navigation
Page-split now creates independent sessions (no parent_session_id),
parent marked as status='split' and hidden from list. Navigation uses
useSearchParams for URL-based step tracking (browser back/forward works).
page.tsx reduced from 684 to 443 lines via usePipelineNavigation hook.

Box sub-sessions (column detection) remain unchanged.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 17:05:33 +01:00
11 changed files with 611 additions and 504 deletions

View File

@@ -383,7 +383,7 @@ export default function OcrOverlayPage() {
if (mode === 'paddle-direct' || mode === 'kombi') { if (mode === 'paddle-direct' || mode === 'kombi') {
switch (currentStep) { switch (currentStep) {
case 0: case 0:
return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} /> return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSessionList={() => { loadSessions(); setSessionId(null) }} />
case 1: case 1:
return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} /> return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} />
case 2: case 2:
@@ -421,7 +421,7 @@ export default function OcrOverlayPage() {
} }
switch (currentStep) { switch (currentStep) {
case 0: case 0:
return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} /> return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSessionList={() => { loadSessions(); setSessionId(null) }} />
case 1: case 1:
return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} /> return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} />
case 2: case 2:

View File

@@ -1,6 +1,6 @@
'use client' 'use client'
import { useCallback, useEffect, useState } from 'react' import { Suspense, useCallback, useEffect, useState } from 'react'
import { PagePurpose } from '@/components/common/PagePurpose' import { PagePurpose } from '@/components/common/PagePurpose'
import { PipelineStepper } from '@/components/ocr-pipeline/PipelineStepper' import { PipelineStepper } from '@/components/ocr-pipeline/PipelineStepper'
import { StepOrientation } from '@/components/ocr-pipeline/StepOrientation' import { StepOrientation } from '@/components/ocr-pipeline/StepOrientation'
@@ -14,37 +14,28 @@ import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecogniti
import { StepLlmReview } from '@/components/ocr-pipeline/StepLlmReview' import { StepLlmReview } from '@/components/ocr-pipeline/StepLlmReview'
import { StepReconstruction } from '@/components/ocr-pipeline/StepReconstruction' import { StepReconstruction } from '@/components/ocr-pipeline/StepReconstruction'
import { StepGroundTruth } from '@/components/ocr-pipeline/StepGroundTruth' import { StepGroundTruth } from '@/components/ocr-pipeline/StepGroundTruth'
import { BoxSessionTabs } from '@/components/ocr-pipeline/BoxSessionTabs' import { DOCUMENT_CATEGORIES, type SessionListItem, type DocumentTypeResult, type DocumentCategory, type SubSession } from './types'
import { PIPELINE_STEPS, DOCUMENT_CATEGORIES, type PipelineStep, type SessionListItem, type DocumentTypeResult, type DocumentCategory, type SubSession } from './types' import { usePipelineNavigation } from './usePipelineNavigation'
const KLAUSUR_API = '/klausur-api' const KLAUSUR_API = '/klausur-api'
export default function OcrPipelinePage() { const STEP_NAMES: Record<number, string> = {
const [currentStep, setCurrentStep] = useState(0) 1: 'Orientierung', 2: 'Begradigung', 3: 'Entzerrung', 4: 'Zuschneiden',
const [sessionId, setSessionId] = useState<string | null>(null) 5: 'Spalten', 6: 'Zeilen', 7: 'Woerter', 8: 'Struktur',
const [sessionName, setSessionName] = useState<string>('') 9: 'Korrektur', 10: 'Rekonstruktion', 11: 'Validierung',
}
function OcrPipelineContent() {
const nav = usePipelineNavigation()
const [sessions, setSessions] = useState<SessionListItem[]>([]) const [sessions, setSessions] = useState<SessionListItem[]>([])
const [loadingSessions, setLoadingSessions] = useState(true) const [loadingSessions, setLoadingSessions] = useState(true)
const [editingName, setEditingName] = useState<string | null>(null) const [editingName, setEditingName] = useState<string | null>(null)
const [editNameValue, setEditNameValue] = useState('') const [editNameValue, setEditNameValue] = useState('')
const [editingCategory, setEditingCategory] = useState<string | null>(null) const [editingCategory, setEditingCategory] = useState<string | null>(null)
const [docTypeResult, setDocTypeResult] = useState<DocumentTypeResult | null>(null) const [sessionName, setSessionName] = useState('')
const [activeCategory, setActiveCategory] = useState<DocumentCategory | undefined>(undefined) const [activeCategory, setActiveCategory] = useState<DocumentCategory | undefined>(undefined)
const [subSessions, setSubSessions] = useState<SubSession[]>([])
const [parentSessionId, setParentSessionId] = useState<string | null>(null)
const [steps, setSteps] = useState<PipelineStep[]>(
PIPELINE_STEPS.map((s, i) => ({
...s,
status: i === 0 ? 'active' : 'pending',
})),
)
// Load session list on mount const loadSessions = useCallback(async () => {
useEffect(() => {
loadSessions()
}, [])
const loadSessions = async () => {
setLoadingSessions(true) setLoadingSessions(true)
try { try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`) const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`)
@@ -57,103 +48,42 @@ export default function OcrPipelinePage() {
} finally { } finally {
setLoadingSessions(false) setLoadingSessions(false)
} }
} }, [])
const openSession = useCallback(async (sid: string, keepSubSessions?: boolean) => { useEffect(() => { loadSessions() }, [loadSessions])
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`)
if (!res.ok) return
const data = await res.json()
setSessionId(sid) // Sync session name when nav.sessionId changes
setSessionName(data.name || data.filename || '') useEffect(() => {
setActiveCategory(data.document_category || undefined) if (!nav.sessionId) {
setSessionName('')
// Sub-session handling setActiveCategory(undefined)
if (data.sub_sessions && data.sub_sessions.length > 0) {
setSubSessions(data.sub_sessions)
setParentSessionId(sid)
// Parent has sub-sessions — open the first incomplete one (or most advanced if all done)
const incomplete = data.sub_sessions.find(
(s: SubSession) => !s.current_step || s.current_step < 10,
)
const target = incomplete || [...data.sub_sessions].sort(
(a: SubSession, b: SubSession) => (b.current_step || 0) - (a.current_step || 0),
)[0]
if (target) {
openSession(target.id, true)
return return
} }
} else if (data.parent_session_id) { const load = async () => {
// This is a sub-session — keep parent info but don't reset sub-session list try {
setParentSessionId(data.parent_session_id) const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${nav.sessionId}`)
} else if (!keepSubSessions) { if (!res.ok) return
setSubSessions([]) const data = await res.json()
setParentSessionId(null) setSessionName(data.name || data.filename || '')
setActiveCategory(data.document_category || undefined)
} catch { /* ignore */ }
} }
load()
}, [nav.sessionId])
// Restore doc type result if available const openSession = useCallback((sid: string) => {
const savedDocType: DocumentTypeResult | null = data.doc_type_result || null nav.goToSession(sid)
setDocTypeResult(savedDocType) }, [nav])
// Determine which step to jump to based on current_step
const dbStep = data.current_step || 1
// DB steps: 1=start, 2=orientation, 3=deskew, 4=dewarp, 5=crop, 6=columns, ...
// UI steps are 0-indexed: 0=orientation, 1=deskew, 2=dewarp, 3=crop, 4=columns, ...
let uiStep = Math.max(0, dbStep - 1)
const skipSteps = [...(savedDocType?.skip_steps || [])]
// Sub-session handling depends on how they were created:
// - Crop-based (current_step >= 5): image already cropped, skip all pre-processing
// - Page-split (current_step 2): orientation done on parent, skip only orientation
// - Page-split from original (current_step 1): needs full pipeline
const isSubSession = !!data.parent_session_id
if (isSubSession) {
if (dbStep >= 5) {
// Crop-based sub-sessions: image already cropped
const SUB_SESSION_SKIP = ['orientation', 'deskew', 'dewarp', 'crop']
for (const s of SUB_SESSION_SKIP) {
if (!skipSteps.includes(s)) skipSteps.push(s)
}
if (uiStep < 4) uiStep = 4 // columns step (index 4)
} else if (dbStep >= 2) {
// Page-split sub-session: parent orientation applied, skip only orientation
if (!skipSteps.includes('orientation')) skipSteps.push('orientation')
if (uiStep < 1) uiStep = 1 // advance past skipped orientation to deskew
}
// dbStep === 1: page-split from original image, needs full pipeline
}
setSteps(
PIPELINE_STEPS.map((s, i) => ({
...s,
status: skipSteps.includes(s.id)
? 'skipped'
: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
})),
)
setCurrentStep(uiStep)
} catch (e) {
console.error('Failed to open session:', e)
}
}, [])
const deleteSession = useCallback(async (sid: string) => { const deleteSession = useCallback(async (sid: string) => {
try { try {
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`, { method: 'DELETE' }) await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`, { method: 'DELETE' })
setSessions((prev) => prev.filter((s) => s.id !== sid)) setSessions(prev => prev.filter(s => s.id !== sid))
if (sessionId === sid) { if (nav.sessionId === sid) nav.goToSessionList()
setSessionId(null)
setCurrentStep(0)
setDocTypeResult(null)
setSubSessions([])
setParentSessionId(null)
setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
}
} catch (e) { } catch (e) {
console.error('Failed to delete session:', e) console.error('Failed to delete session:', e)
} }
}, [sessionId]) }, [nav])
const renameSession = useCallback(async (sid: string, newName: string) => { const renameSession = useCallback(async (sid: string, newName: string) => {
try { try {
@@ -162,13 +92,13 @@ export default function OcrPipelinePage() {
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: newName }), body: JSON.stringify({ name: newName }),
}) })
setSessions((prev) => prev.map((s) => (s.id === sid ? { ...s, name: newName } : s))) setSessions(prev => prev.map(s => (s.id === sid ? { ...s, name: newName } : s)))
if (sessionId === sid) setSessionName(newName) if (nav.sessionId === sid) setSessionName(newName)
} catch (e) { } catch (e) {
console.error('Failed to rename session:', e) console.error('Failed to rename session:', e)
} }
setEditingName(null) setEditingName(null)
}, [sessionId]) }, [nav.sessionId])
const updateCategory = useCallback(async (sid: string, category: DocumentCategory) => { const updateCategory = useCallback(async (sid: string, category: DocumentCategory) => {
try { try {
@@ -177,275 +107,107 @@ export default function OcrPipelinePage() {
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ document_category: category }), body: JSON.stringify({ document_category: category }),
}) })
setSessions((prev) => prev.map((s) => (s.id === sid ? { ...s, document_category: category } : s))) setSessions(prev => prev.map(s => (s.id === sid ? { ...s, document_category: category } : s)))
if (sessionId === sid) setActiveCategory(category) if (nav.sessionId === sid) setActiveCategory(category)
} catch (e) { } catch (e) {
console.error('Failed to update category:', e) console.error('Failed to update category:', e)
} }
setEditingCategory(null) setEditingCategory(null)
}, [sessionId]) }, [nav.sessionId])
const deleteAllSessions = useCallback(async () => { const deleteAllSessions = useCallback(async () => {
if (!confirm('Alle Sessions loeschen? Dies kann nicht rueckgaengig gemacht werden.')) return if (!confirm('Alle Sessions loeschen? Dies kann nicht rueckgaengig gemacht werden.')) return
try { try {
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, { method: 'DELETE' }) await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, { method: 'DELETE' })
setSessions([]) setSessions([])
setSessionId(null) nav.goToSessionList()
setCurrentStep(0)
setDocTypeResult(null)
setActiveCategory(undefined)
setSubSessions([])
setParentSessionId(null)
setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
} catch (e) { } catch (e) {
console.error('Failed to delete all sessions:', e) console.error('Failed to delete all sessions:', e)
} }
}, []) }, [nav])
const handleStepClick = (index: number) => { const handleStepClick = (index: number) => {
if (index <= currentStep || steps[index].status === 'completed') { if (index <= nav.currentStepIndex || nav.steps[index].status === 'completed') {
setCurrentStep(index) nav.goToStep(index)
} }
} }
const goToStep = (step: number) => { // Orientation: after upload, navigate to session at deskew step
setCurrentStep(step) const handleOrientationComplete = useCallback(async (sid: string) => {
setSteps((prev) =>
prev.map((s, i) => ({
...s,
status: i < step ? 'completed' : i === step ? 'active' : 'pending',
})),
)
}
const handleNext = () => {
if (currentStep >= steps.length - 1) {
// Last step completed
if (parentSessionId && sessionId !== parentSessionId) {
// Sub-session completed — mark it and find next incomplete one
const updatedSubs = subSessions.map((s) =>
s.id === sessionId ? { ...s, status: 'completed' as const, current_step: 10 } : s,
)
setSubSessions(updatedSubs)
// Find next incomplete sub-session
const nextIncomplete = updatedSubs.find(
(s) => s.id !== sessionId && (!s.current_step || s.current_step < 10),
)
if (nextIncomplete) {
// Open next incomplete sub-session
openSession(nextIncomplete.id, true)
} else {
// All sub-sessions done — return to session list
setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
setCurrentStep(0)
setSessionId(null)
setSubSessions([])
setParentSessionId(null)
loadSessions() loadSessions()
} // Navigate directly to deskew step (index 1) for this session
return nav.goToSession(sid)
} }, [nav, loadSessions])
// Main session: return to session list
setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
setCurrentStep(0)
setSessionId(null)
setSubSessions([])
setParentSessionId(null)
loadSessions()
return
}
// Find the next non-skipped step // Crop: detect doc type then advance
const skipSteps = docTypeResult?.skip_steps || [] const handleCropNext = useCallback(async () => {
let nextStep = currentStep + 1 if (nav.sessionId) {
while (nextStep < steps.length && skipSteps.includes(PIPELINE_STEPS[nextStep]?.id)) {
nextStep++
}
if (nextStep >= steps.length) nextStep = steps.length - 1
setSteps((prev) =>
prev.map((s, i) => {
if (i === currentStep) return { ...s, status: 'completed' }
if (i === nextStep) return { ...s, status: 'active' }
// Mark skipped steps between current and next
if (i > currentStep && i < nextStep && skipSteps.includes(PIPELINE_STEPS[i]?.id)) {
return { ...s, status: 'skipped' }
}
return s
}),
)
setCurrentStep(nextStep)
}
const handleOrientationComplete = async (sid: string) => {
setSessionId(sid)
loadSessions()
// Check for page-split sub-sessions directly from API
// (React state may not be committed yet due to batching)
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`)
if (res.ok) {
const data = await res.json()
if (data.sub_sessions?.length > 0) {
const subs: SubSession[] = data.sub_sessions.map((s: SubSession) => ({
id: s.id,
name: s.name,
box_index: s.box_index,
current_step: s.current_step,
}))
setSubSessions(subs)
setParentSessionId(sid)
openSession(subs[0].id, true)
return
}
}
} catch (e) {
console.error('Failed to check for sub-sessions:', e)
}
handleNext()
}
const handleCropNext = async () => {
// Auto-detect document type after crop (last image-processing step), then advance
if (sessionId) {
try { try {
const res = await fetch( const res = await fetch(
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/detect-type`, `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${nav.sessionId}/detect-type`,
{ method: 'POST' }, { method: 'POST' },
) )
if (res.ok) { if (res.ok) {
const data: DocumentTypeResult = await res.json() const data: DocumentTypeResult = await res.json()
setDocTypeResult(data) nav.setDocType(data)
// Mark skipped steps immediately
const skipSteps = data.skip_steps || []
if (skipSteps.length > 0) {
setSteps((prev) =>
prev.map((s) =>
skipSteps.includes(s.id) ? { ...s, status: 'skipped' } : s,
),
)
}
} }
} catch (e) { } catch (e) {
console.error('Doc type detection failed:', e) console.error('Doc type detection failed:', e)
// Not critical — continue without it
} }
} }
handleNext() nav.goToNextStep()
} }, [nav])
const handleDocTypeChange = (newDocType: DocumentTypeResult['doc_type']) => { const handleDocTypeChange = (newDocType: DocumentTypeResult['doc_type']) => {
if (!docTypeResult) return if (!nav.docTypeResult) return
// Build new skip_steps based on doc type
let skipSteps: string[] = [] let skipSteps: string[] = []
if (newDocType === 'full_text') { if (newDocType === 'full_text') skipSteps = ['columns', 'rows']
skipSteps = ['columns', 'rows']
}
// vocab_table and generic_table: no skips
const updated: DocumentTypeResult = { nav.setDocType({
...docTypeResult, ...nav.docTypeResult,
doc_type: newDocType, doc_type: newDocType,
skip_steps: skipSteps, skip_steps: skipSteps,
pipeline: newDocType === 'full_text' ? 'full_page' : 'cell_first', pipeline: newDocType === 'full_text' ? 'full_page' : 'cell_first',
}
setDocTypeResult(updated)
// Update step statuses
setSteps((prev) =>
prev.map((s) => {
if (skipSteps.includes(s.id)) return { ...s, status: 'skipped' as const }
if (s.status === 'skipped') return { ...s, status: 'pending' as const }
return s
}),
)
}
const handleNewSession = () => {
setSessionId(null)
setSessionName('')
setCurrentStep(0)
setDocTypeResult(null)
setSubSessions([])
setParentSessionId(null)
setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
}
const handleSessionChange = useCallback((newSessionId: string) => {
openSession(newSessionId, true)
}, [openSession])
const handleBoxSessionsCreated = useCallback((subs: SubSession[]) => {
setSubSessions(subs)
if (sessionId) setParentSessionId(sessionId)
}, [sessionId])
const stepNames: Record<number, string> = {
1: 'Orientierung',
2: 'Begradigung',
3: 'Entzerrung',
4: 'Zuschneiden',
5: 'Spalten',
6: 'Zeilen',
7: 'Woerter',
8: 'Struktur',
9: 'Korrektur',
10: 'Rekonstruktion',
11: 'Validierung',
}
const reprocessFromStep = useCallback(async (uiStep: number) => {
if (!sessionId) return
const dbStep = uiStep + 1 // UI is 0-indexed, DB is 1-indexed
if (!confirm(`Ab Schritt ${dbStep} (${stepNames[dbStep] || '?'}) neu verarbeiten? Nachfolgende Daten werden geloescht.`)) return
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/reprocess`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ from_step: dbStep }),
}) })
if (!res.ok) {
const data = await res.json().catch(() => ({}))
console.error('Reprocess failed:', data.detail || res.status)
return
} }
// Reset UI steps
goToStep(uiStep) // Box sub-sessions (column detection) — still supported
} catch (e) { const handleBoxSessionsCreated = useCallback((_subs: SubSession[]) => {
console.error('Reprocess error:', e) // Box sub-sessions are tracked by the backend; no client-side state needed anymore
} }, [])
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId, goToStep])
const renderStep = () => { const renderStep = () => {
switch (currentStep) { const sid = nav.sessionId
switch (nav.currentStepIndex) {
case 0: case 0:
return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} /> return (
<StepOrientation
key={sid}
sessionId={sid}
onNext={handleOrientationComplete}
onSessionList={() => { loadSessions(); nav.goToSessionList() }}
/>
)
case 1: case 1:
return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} /> return <StepDeskew key={sid} sessionId={sid} onNext={nav.goToNextStep} />
case 2: case 2:
return <StepDewarp key={sessionId} sessionId={sessionId} onNext={handleNext} /> return <StepDewarp key={sid} sessionId={sid} onNext={nav.goToNextStep} />
case 3: case 3:
return <StepCrop key={sessionId} sessionId={sessionId} onNext={handleCropNext} /> return <StepCrop key={sid} sessionId={sid} onNext={handleCropNext} />
case 4: case 4:
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} onBoxSessionsCreated={handleBoxSessionsCreated} /> return <StepColumnDetection sessionId={sid} onNext={nav.goToNextStep} onBoxSessionsCreated={handleBoxSessionsCreated} />
case 5: case 5:
return <StepRowDetection sessionId={sessionId} onNext={handleNext} /> return <StepRowDetection sessionId={sid} onNext={nav.goToNextStep} />
case 6: case 6:
return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} /> return <StepWordRecognition sessionId={sid} onNext={nav.goToNextStep} goToStep={nav.goToStep} />
case 7: case 7:
return <StepStructureDetection sessionId={sessionId} onNext={handleNext} /> return <StepStructureDetection sessionId={sid} onNext={nav.goToNextStep} />
case 8: case 8:
return <StepLlmReview sessionId={sessionId} onNext={handleNext} /> return <StepLlmReview sessionId={sid} onNext={nav.goToNextStep} />
case 9: case 9:
return <StepReconstruction sessionId={sessionId} onNext={handleNext} /> return <StepReconstruction sessionId={sid} onNext={nav.goToNextStep} />
case 10: case 10:
return <StepGroundTruth sessionId={sessionId} onNext={handleNext} /> return <StepGroundTruth sessionId={sid} onNext={nav.goToNextStep} />
default: default:
return null return null
} }
@@ -485,7 +247,7 @@ export default function OcrPipelinePage() {
</button> </button>
)} )}
<button <button
onClick={handleNewSession} onClick={() => nav.goToSessionList()}
className="text-xs px-3 py-1.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors" className="text-xs px-3 py-1.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors"
> >
+ Neue Session + Neue Session
@@ -505,7 +267,7 @@ export default function OcrPipelinePage() {
<div <div
key={s.id} key={s.id}
className={`relative flex items-start gap-3 px-3 py-2.5 rounded-lg text-sm transition-colors cursor-pointer ${ className={`relative flex items-start gap-3 px-3 py-2.5 rounded-lg text-sm transition-colors cursor-pointer ${
sessionId === s.id nav.sessionId === s.id
? 'bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700' ? 'bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700'
: 'hover:bg-gray-50 dark:hover:bg-gray-700/50' : 'hover:bg-gray-50 dark:hover:bg-gray-700/50'
}`} }`}
@@ -561,13 +323,12 @@ export default function OcrPipelinePage() {
</button> </button>
<div className="text-xs text-gray-400 flex gap-2 mt-0.5"> <div className="text-xs text-gray-400 flex gap-2 mt-0.5">
<span>{new Date(s.created_at).toLocaleDateString('de-DE', { day: '2-digit', month: '2-digit', year: '2-digit', hour: '2-digit', minute: '2-digit' })}</span> <span>{new Date(s.created_at).toLocaleDateString('de-DE', { day: '2-digit', month: '2-digit', year: '2-digit', hour: '2-digit', minute: '2-digit' })}</span>
<span>Schritt {s.current_step}: {stepNames[s.current_step] || '?'}</span> <span>Schritt {s.current_step}: {STEP_NAMES[s.current_step] || '?'}</span>
</div> </div>
</div> </div>
{/* Badges */} {/* Badges */}
<div className="flex flex-col gap-1 items-end flex-shrink-0" onClick={(e) => e.stopPropagation()}> <div className="flex flex-col gap-1 items-end flex-shrink-0" onClick={(e) => e.stopPropagation()}>
{/* Category Badge */}
<button <button
onClick={() => setEditingCategory(editingCategory === s.id ? null : s.id)} onClick={() => setEditingCategory(editingCategory === s.id ? null : s.id)}
className={`text-[10px] px-1.5 py-0.5 rounded-full border transition-colors ${ className={`text-[10px] px-1.5 py-0.5 rounded-full border transition-colors ${
@@ -579,7 +340,6 @@ export default function OcrPipelinePage() {
> >
{catInfo ? `${catInfo.icon} ${catInfo.label}` : '+ Kategorie'} {catInfo ? `${catInfo.icon} ${catInfo.label}` : '+ Kategorie'}
</button> </button>
{/* Doc Type Badge (read-only) */}
{s.doc_type && ( {s.doc_type && (
<span className="text-[10px] px-1.5 py-0.5 rounded-full bg-gray-100 dark:bg-gray-700 text-gray-500 dark:text-gray-400 border border-gray-200 dark:border-gray-600"> <span className="text-[10px] px-1.5 py-0.5 rounded-full bg-gray-100 dark:bg-gray-700 text-gray-500 dark:text-gray-400 border border-gray-200 dark:border-gray-600">
{s.doc_type} {s.doc_type}
@@ -616,7 +376,7 @@ export default function OcrPipelinePage() {
</button> </button>
</div> </div>
{/* Category dropdown (inline) */} {/* Category dropdown */}
{editingCategory === s.id && ( {editingCategory === s.id && (
<div <div
className="absolute right-0 top-full mt-1 z-20 bg-white dark:bg-gray-800 border border-gray-200 dark:border-gray-700 rounded-lg shadow-lg p-2 grid grid-cols-2 gap-1 w-64" className="absolute right-0 top-full mt-1 z-20 bg-white dark:bg-gray-800 border border-gray-200 dark:border-gray-700 rounded-lg shadow-lg p-2 grid grid-cols-2 gap-1 w-64"
@@ -645,40 +405,39 @@ export default function OcrPipelinePage() {
</div> </div>
{/* Active session info */} {/* Active session info */}
{sessionId && sessionName && ( {nav.sessionId && sessionName && (
<div className="flex items-center gap-3 text-sm text-gray-500 dark:text-gray-400"> <div className="flex items-center gap-3 text-sm text-gray-500 dark:text-gray-400">
<span>Aktive Session: <span className="font-medium text-gray-700 dark:text-gray-300">{sessionName}</span></span> <span>Aktive Session: <span className="font-medium text-gray-700 dark:text-gray-300">{sessionName}</span></span>
{activeCategory && (() => { {activeCategory && (() => {
const cat = DOCUMENT_CATEGORIES.find(c => c.value === activeCategory) const cat = DOCUMENT_CATEGORIES.find(c => c.value === activeCategory)
return cat ? <span className="text-xs px-2 py-0.5 rounded-full bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700 text-teal-700 dark:text-teal-300">{cat.icon} {cat.label}</span> : null return cat ? <span className="text-xs px-2 py-0.5 rounded-full bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700 text-teal-700 dark:text-teal-300">{cat.icon} {cat.label}</span> : null
})()} })()}
{docTypeResult && ( {nav.docTypeResult && (
<span className="text-xs px-2 py-0.5 rounded-full bg-gray-100 dark:bg-gray-700 text-gray-500 dark:text-gray-400 border border-gray-200 dark:border-gray-600"> <span className="text-xs px-2 py-0.5 rounded-full bg-gray-100 dark:bg-gray-700 text-gray-500 dark:text-gray-400 border border-gray-200 dark:border-gray-600">
{docTypeResult.doc_type} {nav.docTypeResult.doc_type}
</span> </span>
)} )}
</div> </div>
)} )}
<PipelineStepper <PipelineStepper
steps={steps} steps={nav.steps}
currentStep={currentStep} currentStep={nav.currentStepIndex}
onStepClick={handleStepClick} onStepClick={handleStepClick}
onReprocess={sessionId ? reprocessFromStep : undefined} onReprocess={nav.sessionId ? nav.reprocessFromStep : undefined}
docTypeResult={docTypeResult} docTypeResult={nav.docTypeResult}
onDocTypeChange={handleDocTypeChange} onDocTypeChange={handleDocTypeChange}
/> />
{subSessions.length > 0 && parentSessionId && sessionId && (
<BoxSessionTabs
parentSessionId={parentSessionId}
subSessions={subSessions}
activeSessionId={sessionId}
onSessionChange={handleSessionChange}
/>
)}
<div className="min-h-[400px]">{renderStep()}</div> <div className="min-h-[400px]">{renderStep()}</div>
</div> </div>
) )
} }
export default function OcrPipelinePage() {
return (
<Suspense fallback={<div className="p-8 text-gray-400">Lade Pipeline...</div>}>
<OcrPipelineContent />
</Suspense>
)
}

View File

@@ -35,10 +35,9 @@ export interface SessionListItem {
doc_type?: string doc_type?: string
created_at: string created_at: string
updated_at?: string updated_at?: string
parent_session_id?: string | null
box_index?: number | null
} }
/** Box sub-session (from column detection zone_type='box') */
export interface SubSession { export interface SubSession {
id: string id: string
name: string name: string

View File

@@ -0,0 +1,225 @@
'use client'
import { useCallback, useEffect, useState } from 'react'
import { useRouter, useSearchParams } from 'next/navigation'
import { PIPELINE_STEPS, type PipelineStep, type PipelineStepStatus, type DocumentTypeResult } from './types'
const KLAUSUR_API = '/klausur-api'
/**
 * Navigation state and actions exposed by the `usePipelineNavigation` hook.
 *
 * The hook reads the `session` and `step` URL search params, so the values
 * below reflect the session/step addressed by the current URL.
 */
export interface PipelineNav {
/** Id of the session being worked on, or null when no `session` param is present. */
sessionId: string | null
/** Zero-based index of the active step within PIPELINE_STEPS. */
currentStepIndex: number
/** NOTE(review): presumably the `id` of PIPELINE_STEPS[currentStepIndex] — confirm against the hook's return value. */
currentStepId: string
/** PIPELINE_STEPS entries, each annotated with a status (skipped / completed / active / pending). */
steps: PipelineStep[]
/** Document-type result restored from the session's `doc_type_result`, or null when none is stored. */
docTypeResult: DocumentTypeResult | null
/** NOTE(review): presumably advances to the next non-skipped step — confirm in the hook body. */
goToNextStep: () => void
/** Navigate to the step at the given index (NOTE(review): presumably zero-based like currentStepIndex — confirm). */
goToStep: (index: number) => void
/** Navigate to the given session. */
goToSession: (sessionId: string) => void
/** Leave the current session and return to the session list. */
goToSessionList: () => void
/** Store a new document-type result (NOTE(review): presumably also refreshes skipped steps — confirm). */
setDocType: (result: DocumentTypeResult) => void
/** Re-run the pipeline starting at the given UI step (NOTE(review): index base and backend call are not visible here — confirm). */
reprocessFromStep: (uiStep: number) => Promise<void>
}
/**
 * German display names for the eleven pipeline steps, keyed by step number 1–11.
 * NOTE(review): keys presumably match the backend's 1-based `current_step`
 * (UI step index + 1) — confirm at the call sites.
 */
const STEP_NAMES: Record<number, string> = {
  1: 'Orientierung',
  2: 'Begradigung',
  3: 'Entzerrung',
  4: 'Zuschneiden',
  5: 'Spalten',
  6: 'Zeilen',
  7: 'Woerter',
  8: 'Struktur',
  9: 'Korrektur',
  10: 'Rekonstruktion',
  11: 'Validierung',
}
/**
 * Build the stepper model for a given active step.
 *
 * Every entry of PIPELINE_STEPS is copied and given a `status`:
 * - `'skipped'` when its id is listed in `skipSteps` (this wins over position),
 * - `'completed'` when it lies before `uiStep`,
 * - `'active'` when it is at `uiStep`,
 * - `'pending'` otherwise.
 *
 * @param uiStep Zero-based index of the active step.
 * @param skipSteps Ids of steps to mark as skipped.
 * @returns A new array with one entry per PIPELINE_STEPS element, in the same order.
 */
function buildSteps(uiStep: number, skipSteps: string[]): PipelineStep[] {
  // Resolve the status of a single step; guard order encodes the precedence.
  const statusAt = (stepId: string, position: number): PipelineStepStatus => {
    if (skipSteps.includes(stepId)) return 'skipped'
    if (position < uiStep) return 'completed'
    if (position === uiStep) return 'active'
    return 'pending'
  }

  return PIPELINE_STEPS.map((template, position) => ({
    ...template,
    status: statusAt(template.id, position),
  }))
}
/**
 * Clamp an arbitrary step index into the valid range of PIPELINE_STEPS.
 * Non-finite input (NaN, Infinity) maps to 0.
 */
function clampStepIndex(index: number): number {
  if (!Number.isFinite(index)) return 0
  return Math.max(0, Math.min(Math.trunc(index), PIPELINE_STEPS.length - 1))
}

/**
 * URL-driven navigation state for the OCR pipeline page.
 *
 * The `session` and `step` query parameters are the source of truth: whenever
 * they change, the session is (re)loaded from the backend and the step list is
 * rebuilt. The returned actions update local state and push the matching URL.
 *
 * @returns The current navigation state plus the actions described on PipelineNav.
 */
export function usePipelineNavigation(): PipelineNav {
  const router = useRouter()
  const searchParams = useSearchParams()
  const paramSession = searchParams.get('session')
  const paramStep = searchParams.get('step')

  const [sessionId, setSessionId] = useState<string | null>(paramSession)
  const [currentStepIndex, setCurrentStepIndex] = useState(0)
  const [docTypeResult, setDocTypeResult] = useState<DocumentTypeResult | null>(null)
  // Lazy initializer: build the initial step list once, not on every render.
  const [steps, setSteps] = useState<PipelineStep[]>(() => buildSteps(0, []))

  /** Reset all navigation state to the "no session open" defaults. */
  const resetState = useCallback(() => {
    setSessionId(null)
    setCurrentStepIndex(0)
    setDocTypeResult(null)
    setSteps(buildSteps(0, []))
  }, [])

  // Load session info when the session/step URL params change.
  useEffect(() => {
    if (!paramSession) {
      resetState()
      return
    }

    // Set by the cleanup below so a response that arrives after the URL has
    // changed again (or after unmount) cannot overwrite newer state.
    let cancelled = false

    const load = async () => {
      try {
        // The id comes from the URL, so encode it before using it as a path segment.
        const res = await fetch(
          `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${encodeURIComponent(paramSession)}`,
        )
        if (cancelled) return
        if (!res.ok) {
          console.error('Failed to load session:', res.status)
          return
        }
        const data = await res.json()
        if (cancelled) return

        setSessionId(paramSession)

        const savedDocType: DocumentTypeResult | null = data.doc_type_result || null
        setDocTypeResult(savedDocType)

        // Backend steps are 1-based, UI steps are 0-based.
        const dbStep = data.current_step || 1
        let uiStep = clampStepIndex(dbStep - 1)
        const skipSteps = [...(savedDocType?.skip_steps || [])]

        // Box sub-sessions (from column detection) skip pre-processing
        const isBoxSubSession = !!data.parent_session_id
        if (isBoxSubSession && dbStep >= 5) {
          const SUB_SESSION_SKIP = ['orientation', 'deskew', 'dewarp', 'crop']
          for (const s of SUB_SESSION_SKIP) {
            if (!skipSteps.includes(s)) skipSteps.push(s)
          }
          if (uiStep < 4) uiStep = clampStepIndex(4)
        }

        // If URL has a (known) step param, use that instead
        if (paramStep) {
          const stepIdx = PIPELINE_STEPS.findIndex(s => s.id === paramStep)
          if (stepIdx >= 0) uiStep = stepIdx
        }

        setCurrentStepIndex(uiStep)
        setSteps(buildSteps(uiStep, skipSteps))
      } catch (e) {
        if (!cancelled) console.error('Failed to load session:', e)
      }
    }

    void load()
    return () => {
      cancelled = true
    }
  }, [paramSession, paramStep, resetState])

  /** Push the URL for the given session (and optional step index); null goes to the list. */
  const updateUrl = useCallback((sid: string | null, stepIdx?: number) => {
    if (!sid) {
      router.push('/ai/ocr-pipeline')
      return
    }
    const stepId = stepIdx !== undefined ? PIPELINE_STEPS[stepIdx]?.id : undefined
    const params = new URLSearchParams()
    params.set('session', sid)
    if (stepId) params.set('step', stepId)
    router.push(`/ai/ocr-pipeline?${params.toString()}`)
  }, [router])

  const goToSessionList = useCallback(() => {
    resetState()
    router.push('/ai/ocr-pipeline')
  }, [resetState, router])

  const goToNextStep = useCallback(() => {
    if (currentStepIndex >= steps.length - 1) {
      // Last step — return to session list
      goToSessionList()
      return
    }

    // Advance past any steps the document type says to skip.
    const skipSteps = docTypeResult?.skip_steps || []
    let nextStep = currentStepIndex + 1
    while (nextStep < steps.length && skipSteps.includes(PIPELINE_STEPS[nextStep]?.id)) {
      nextStep++
    }
    if (nextStep >= steps.length) nextStep = steps.length - 1

    setSteps(prev =>
      prev.map((s, i) => {
        if (i === currentStepIndex) return { ...s, status: 'completed' as PipelineStepStatus }
        if (i === nextStep) return { ...s, status: 'active' as PipelineStepStatus }
        if (i > currentStepIndex && i < nextStep && skipSteps.includes(PIPELINE_STEPS[i]?.id)) {
          return { ...s, status: 'skipped' as PipelineStepStatus }
        }
        return s
      }),
    )
    setCurrentStepIndex(nextStep)
    if (sessionId) updateUrl(sessionId, nextStep)
  }, [currentStepIndex, steps.length, docTypeResult, sessionId, updateUrl, goToSessionList])

  const goToStep = useCallback((index: number) => {
    // Guard against out-of-range indices from callers.
    const target = clampStepIndex(index)
    setCurrentStepIndex(target)
    setSteps(prev =>
      prev.map((s, i) => {
        // Skipped steps keep their status regardless of position.
        if (s.status === 'skipped') return s
        const status = (
          i < target ? 'completed'
            : i === target ? 'active'
              : 'pending'
        ) as PipelineStepStatus
        return { ...s, status }
      }),
    )
    if (sessionId) updateUrl(sessionId, target)
  }, [sessionId, updateUrl])

  const goToSession = useCallback((sid: string) => {
    updateUrl(sid)
  }, [updateUrl])

  const setDocType = useCallback((result: DocumentTypeResult) => {
    setDocTypeResult(result)
    const skipSteps = result.skip_steps || []
    if (skipSteps.length > 0) {
      setSteps(prev =>
        prev.map(s =>
          skipSteps.includes(s.id) ? { ...s, status: 'skipped' as PipelineStepStatus } : s,
        ),
      )
    }
  }, [])

  const reprocessFromStep = useCallback(async (uiStep: number) => {
    if (!sessionId) return
    const dbStep = uiStep + 1
    if (!confirm(`Ab Schritt ${dbStep} (${STEP_NAMES[dbStep] || '?'}) neu verarbeiten? Nachfolgende Daten werden geloescht.`)) return

    try {
      const res = await fetch(
        `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${encodeURIComponent(sessionId)}/reprocess`,
        {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({ from_step: dbStep }),
        },
      )
      if (!res.ok) {
        // The error body may not be JSON; fall back to the status code.
        const data = await res.json().catch(() => ({}))
        console.error('Reprocess failed:', data.detail || res.status)
        return
      }
      goToStep(uiStep)
    } catch (e) {
      console.error('Reprocess error:', e)
    }
  }, [sessionId, goToStep])

  return {
    sessionId,
    currentStepIndex,
    currentStepId: PIPELINE_STEPS[currentStepIndex]?.id || 'orientation',
    steps,
    docTypeResult,
    goToNextStep,
    goToStep,
    goToSession,
    goToSessionList,
    setDocType,
    reprocessFromStep,
  }
}

View File

@@ -21,6 +21,7 @@ function getStatusIcon(sub: SubSession): string {
return STATUS_ICONS.pending return STATUS_ICONS.pending
} }
/** Tabs for box sub-sessions (from column detection zone_type='box'). */
export function BoxSessionTabs({ parentSessionId, subSessions, activeSessionId, onSessionChange }: BoxSessionTabsProps) { export function BoxSessionTabs({ parentSessionId, subSessions, activeSessionId, onSessionChange }: BoxSessionTabsProps) {
if (subSessions.length === 0) return null if (subSessions.length === 0) return null
@@ -28,7 +29,6 @@ export function BoxSessionTabs({ parentSessionId, subSessions, activeSessionId,
return ( return (
<div className="flex items-center gap-1.5 px-1 py-1.5 bg-gray-50 dark:bg-gray-800/50 rounded-xl border border-gray-200 dark:border-gray-700"> <div className="flex items-center gap-1.5 px-1 py-1.5 bg-gray-50 dark:bg-gray-800/50 rounded-xl border border-gray-200 dark:border-gray-700">
{/* Main session tab */}
<button <button
onClick={() => onSessionChange(parentSessionId)} onClick={() => onSessionChange(parentSessionId)}
className={`px-3 py-1.5 rounded-lg text-xs font-medium transition-colors ${ className={`px-3 py-1.5 rounded-lg text-xs font-medium transition-colors ${
@@ -42,7 +42,6 @@ export function BoxSessionTabs({ parentSessionId, subSessions, activeSessionId,
<div className="w-px h-5 bg-gray-200 dark:bg-gray-700" /> <div className="w-px h-5 bg-gray-200 dark:bg-gray-700" />
{/* Sub-session tabs */}
{subSessions.map((sub) => { {subSessions.map((sub) => {
const isActive = activeSessionId === sub.id const isActive = activeSessionId === sub.id
const icon = getStatusIcon(sub) const icon = getStatusIcon(sub)
@@ -59,7 +58,7 @@ export function BoxSessionTabs({ parentSessionId, subSessions, activeSessionId,
title={sub.name} title={sub.name}
> >
<span className="mr-1">{icon}</span> <span className="mr-1">{icon}</span>
Seite {sub.box_index + 1} Box {sub.box_index + 1}
</button> </button>
) )
})} })}

View File

@@ -1,7 +1,7 @@
'use client' 'use client'
import { useCallback, useEffect, useState } from 'react' import { useCallback, useEffect, useState } from 'react'
import type { OrientationResult, SessionInfo, SubSession } from '@/app/(admin)/ai/ocr-pipeline/types' import type { OrientationResult, SessionInfo } from '@/app/(admin)/ai/ocr-pipeline/types'
import { ImageCompareView } from './ImageCompareView' import { ImageCompareView } from './ImageCompareView'
const KLAUSUR_API = '/klausur-api' const KLAUSUR_API = '/klausur-api'
@@ -17,10 +17,10 @@ interface PageSplitResult {
interface StepOrientationProps { interface StepOrientationProps {
sessionId?: string | null sessionId?: string | null
onNext: (sessionId: string) => void onNext: (sessionId: string) => void
onSubSessionsCreated?: (subs: SubSession[]) => void onSessionList?: () => void
} }
export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSessionsCreated }: StepOrientationProps) { export function StepOrientation({ sessionId: existingSessionId, onNext, onSessionList }: StepOrientationProps) {
const [session, setSession] = useState<SessionInfo | null>(null) const [session, setSession] = useState<SessionInfo | null>(null)
const [orientationResult, setOrientationResult] = useState<OrientationResult | null>(null) const [orientationResult, setOrientationResult] = useState<OrientationResult | null>(null)
const [pageSplitResult, setPageSplitResult] = useState<PageSplitResult | null>(null) const [pageSplitResult, setPageSplitResult] = useState<PageSplitResult | null>(null)
@@ -30,7 +30,7 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
const [dragOver, setDragOver] = useState(false) const [dragOver, setDragOver] = useState(false)
const [sessionName, setSessionName] = useState('') const [sessionName, setSessionName] = useState('')
// Reload session data when navigating back // Reload session data when navigating back — auto-trigger orientation if missing
useEffect(() => { useEffect(() => {
if (!existingSessionId || session) return if (!existingSessionId || session) return
@@ -51,6 +51,28 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
if (data.orientation_result) { if (data.orientation_result) {
setOrientationResult(data.orientation_result) setOrientationResult(data.orientation_result)
} else {
// Session exists but orientation not yet run (e.g. page-split session)
// Auto-trigger orientation detection
setDetecting(true)
try {
const orientRes = await fetch(
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${existingSessionId}/orientation`,
{ method: 'POST' },
)
if (orientRes.ok) {
const orientData = await orientRes.json()
setOrientationResult({
orientation_degrees: orientData.orientation_degrees,
corrected: orientData.corrected,
duration_seconds: orientData.duration_seconds,
})
}
} catch (e) {
console.error('Auto-orientation failed:', e)
} finally {
setDetecting(false)
}
} }
} catch (e) { } catch (e) {
console.error('Failed to reload session:', e) console.error('Failed to reload session:', e)
@@ -112,16 +134,6 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
if (splitRes.ok) { if (splitRes.ok) {
const splitData: PageSplitResult = await splitRes.json() const splitData: PageSplitResult = await splitRes.json()
setPageSplitResult(splitData) setPageSplitResult(splitData)
if (splitData.multi_page && splitData.sub_sessions && onSubSessionsCreated) {
onSubSessionsCreated(
splitData.sub_sessions.map((s) => ({
id: s.id,
name: s.name,
box_index: s.page_index,
current_step: splitData.used_original ? 1 : 2,
}))
)
}
} }
} catch (e) { } catch (e) {
console.error('Page-split detection failed:', e) console.error('Page-split detection failed:', e)
@@ -133,7 +145,7 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
setUploading(false) setUploading(false)
setDetecting(false) setDetecting(false)
} }
}, [sessionName, onSubSessionsCreated]) }, [sessionName])
const handleDrop = useCallback((e: React.DragEvent) => { const handleDrop = useCallback((e: React.DragEvent) => {
e.preventDefault() e.preventDefault()
@@ -264,10 +276,10 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
{pageSplitResult?.multi_page && ( {pageSplitResult?.multi_page && (
<div className="bg-blue-50 dark:bg-blue-900/20 rounded-lg border border-blue-200 dark:border-blue-700 p-4"> <div className="bg-blue-50 dark:bg-blue-900/20 rounded-lg border border-blue-200 dark:border-blue-700 p-4">
<div className="text-sm font-medium text-blue-700 dark:text-blue-300"> <div className="text-sm font-medium text-blue-700 dark:text-blue-300">
Doppelseite erkannt {pageSplitResult.page_count} Seiten Doppelseite erkannt {pageSplitResult.page_count} unabhaengige Sessions erstellt
</div> </div>
<p className="text-xs text-blue-600 dark:text-blue-400 mt-1"> <p className="text-xs text-blue-600 dark:text-blue-400 mt-1">
Jede Seite wird einzeln durch die Pipeline (Begradigung, Entzerrung, Zuschnitt, ...) verarbeitet. Jede Seite wird als eigene Session durch die Pipeline verarbeitet.
{pageSplitResult.used_original && ' (Seitentrennung auf dem Originalbild, da die Orientierung die Doppelseite gedreht hat.)'} {pageSplitResult.used_original && ' (Seitentrennung auf dem Originalbild, da die Orientierung die Doppelseite gedreht hat.)'}
</p> </p>
<div className="flex gap-2 mt-2"> <div className="flex gap-2 mt-2">
@@ -286,12 +298,21 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
{/* Next button */} {/* Next button */}
{orientationResult && ( {orientationResult && (
<div className="flex justify-end"> <div className="flex justify-end">
{pageSplitResult?.multi_page ? (
<button
onClick={() => onSessionList?.()}
className="px-6 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium transition-colors"
>
Zur Session-Liste &rarr;
</button>
) : (
<button <button
onClick={() => onNext(session.session_id)} onClick={() => onNext(session.session_id)}
className="px-6 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium transition-colors" className="px-6 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium transition-colors"
> >
{pageSplitResult?.multi_page ? 'Seiten verarbeiten' : 'Weiter'} &rarr; Weiter &rarr;
</button> </button>
)}
</div> </div>
)} )}

View File

@@ -1,11 +1,15 @@
""" """
CV-based syllable divider detection and insertion for dictionary pages. Syllable divider insertion for dictionary pages.
Two-step approach: For confirmed dictionary pages (is_dictionary=True), processes all content
1. CV: morphological vertical line detection checks if a word_box image column cells:
contains thin, isolated pipe-like vertical lines (syllable dividers). 1. Strips existing | dividers for clean normalization
2. pyphen: inserts syllable breaks at linguistically correct positions 2. Merges pipe-gap spaces (where OCR split a word at a divider position)
for words where CV confirmed the presence of dividers. 3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN)
4. Only modifies words that pyphen recognizes — garbled OCR stays as-is
No CV gate needed — the dictionary detection confidence is sufficient.
pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable.
Lizenz: Apache 2.0 (kommerziell nutzbar) Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
@@ -13,94 +17,222 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging import logging
import re import re
from typing import Any, Dict, List from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy as np import numpy as np
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# IPA/phonetic characters — skip cells containing these
def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
"""CV check: does this word_box image show thin vertical pipe dividers?
Uses morphological opening with a tall thin kernel to isolate vertical
structures, then filters for thin (≤4px), isolated contours that are
NOT at the word edges (those would be l, I, 1 etc.).
"""
x = wb.get("left", 0)
y = wb.get("top", 0)
w = wb.get("width", 0)
h = wb.get("height", 0)
if w < 30 or h < 12:
return False
ih, iw = img_gray.shape[:2]
y1, y2 = max(0, y), min(ih, y + h)
x1, x2 = max(0, x), min(iw, x + w)
roi = img_gray[y1:y2, x1:x2]
if roi.size == 0:
return False
rh, rw = roi.shape
# Binarize (ink = white on black background)
_, binary = cv2.threshold(
roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
)
# Morphological opening: keep only tall vertical structures (≥55% height)
kern_h = max(int(rh * 0.55), 8)
kernel = np.ones((kern_h, 1), np.uint8)
vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
# Find surviving contours
contours, _ = cv2.findContours(
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
margin = max(int(rw * 0.08), 3)
for cnt in contours:
cx, cy, cw, ch = cv2.boundingRect(cnt)
if cw > 4:
continue # too wide for a pipe
if cx < margin or cx + cw > rw - margin:
continue # at word edge — likely l, I, 1
# Check isolation: adjacent columns should be mostly empty (ink-free)
left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
left_ink = np.mean(left_zone) if left_zone.size else 255
right_ink = np.mean(right_zone) if right_zone.size else 255
if left_ink < 80 and right_ink < 80:
return True # isolated thin vertical line = pipe divider
return False
# IPA/phonetic bracket pattern — don't hyphenate transcriptions
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]') _IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
# Common German words that should NOT be merged with adjacent tokens.
# These are function words that appear as standalone words between
# headwords/definitions on dictionary pages.
_STOP_WORDS = frozenset([
# Articles
'der', 'die', 'das', 'dem', 'den', 'des',
'ein', 'eine', 'einem', 'einen', 'einer',
# Pronouns
'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
# Prepositions
'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
'zwischen', 'ohne', 'gegen',
# Conjunctions
'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
# Adverbs
'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
# Verbs
'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
'sein', 'haben',
# Other
'kein', 'keine', 'keinem', 'keinen', 'keiner',
])
# Cached hyphenators
_hyph_de = None
_hyph_en = None
def _get_hyphenators():
    """Return the cached ``(de, en)`` pyphen hyphenators, creating them on first use.

    Returns:
        A tuple ``(hyph_de, hyph_en)`` of ``pyphen.Pyphen`` instances for
        ``de_DE`` and ``en_US``, or ``(None, None)`` when pyphen cannot be
        imported.
    """
    global _hyph_de, _hyph_en
    if _hyph_de is not None and _hyph_en is not None:
        return _hyph_de, _hyph_en
    try:
        import pyphen
    except ImportError:
        return None, None
    # Build both hyphenators before publishing either one. If the second
    # constructor raises, the cache stays empty instead of holding a
    # half-initialized (de, None) pair that later calls would hand out.
    de = pyphen.Pyphen(lang='de_DE')
    en = pyphen.Pyphen(lang='en_US')
    _hyph_de, _hyph_en = de, en
    return de, en
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
"""Try to hyphenate a word using DE then EN dictionary.
Returns word with | separators, or None if not recognized.
"""
hyph = hyph_de.inserted(word, hyphen='|')
if '|' in hyph:
return hyph
hyph = hyph_en.inserted(word, hyphen='|')
if '|' in hyph:
return hyph
return None
# Letters considered part of a word when looking for pipe-gap fragments.
_MERGE_ALPHA = 'a-zA-ZäöüÄÖÜßẞ'
_MERGE_NON_ALPHA_RE = re.compile(f'[^{_MERGE_ALPHA}]')
# A continuation fragment: letters first, then only trailing non-letters.
_MERGE_TAIL_RE = re.compile(f'[{_MERGE_ALPHA}]+[^{_MERGE_ALPHA}]*')


def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
    """Merge fragments separated by single spaces where OCR split at a pipe.

    Example: "Kaf fee" -> "Kaffee" (pyphen recognizes the merged word).
    Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".

    Guards against false merges:
    - The FIRST token must be pure alpha (word start — no attached punctuation)
    - The second token must start with letters and may only carry trailing
      punctuation (comma, period), which stays attached to the merged word:
      "Kä" + "fer," -> "Käfer,". A token with leading or embedded punctuation
      (e.g. "(fee") is never merged.
    - Common German function words (der, die, das, ...) are never merged
    - At least one fragment must be very short (<=3 alpha chars)
    - The combined alpha length must be >= 4

    Args:
        text: Cell text, already stripped of ``|`` characters by the caller.
        hyph_de: German hyphenator (object with ``inserted(word, hyphen=...)``);
            a merge is accepted only if it splits the merged word.

    Returns:
        The text with accepted fragment pairs joined; all other spacing is kept.
    """
    parts = text.split(' ')
    if len(parts) < 2:
        return text

    result = [parts[0]]
    for curr in parts[1:]:
        # Compare against the last emitted token so multi-step merges chain.
        prev = result[-1]

        # Alpha-only cores used for the guards and the pyphen lookup
        prev_alpha = _MERGE_NON_ALPHA_RE.sub('', prev)
        curr_alpha = _MERGE_NON_ALPHA_RE.sub('', curr)

        should_try = bool(
            prev_alpha and curr_alpha
            and prev == prev_alpha                        # word-start fragment
            and _MERGE_TAIL_RE.fullmatch(curr)            # letters + trailing punctuation only
            and prev_alpha.lower() not in _STOP_WORDS
            and curr_alpha.lower() not in _STOP_WORDS
            and min(len(prev_alpha), len(curr_alpha)) <= 3  # pipe-gap signal
            and len(prev_alpha) + len(curr_alpha) >= 4
        )

        if should_try:
            hyph = hyph_de.inserted(prev_alpha + curr_alpha, hyphen='-')
            if '-' in hyph:
                # pyphen recognizes merged word — collapse the space
                result[-1] = prev + curr
                continue

        result.append(curr)

    return ' '.join(result)
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
    """Return ``text`` with ``|`` syllable dividers (re)inserted word by word.

    Processing order:
    1. Empty text and text containing IPA transcription characters are
       returned unchanged.
    2. Existing ``|`` dividers are removed.
    3. Pipe-gap fragments are merged via ``_try_merge_pipe_gaps``.
    4. Each token whose core is at least 3 characters long and contains a
       letter is passed to ``_hyphenate_word``; tokens the hyphenators do not
       split are emitted without dividers.

    Args:
        text: Cell text to process.
        hyph_de: German hyphenator.
        hyph_en: English hyphenator.

    Returns:
        The processed text.
    """
    if not text or _IPA_RE.search(text):
        return text

    # Drop existing dividers, then rejoin words the OCR split at a divider.
    merged = _try_merge_pipe_gaps(text.replace('|', ''), hyph_de)

    pieces = []
    # The capturing group keeps whitespace / comma / semicolon / colon
    # separators in the token stream so the text can be reassembled exactly.
    for tok in re.split(r'(\s+|[,;:]+\s*)', merged):
        rebuilt = tok
        if tok and not re.match(r'^[\s,;:]+$', tok):
            # Peel leading/trailing non-letters off the token for the lookup.
            m = re.match(r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', tok)
            if m:
                lead, word, trail = m.groups()
                if len(word) >= 3 and re.search(r'[a-zA-ZäöüÄÖÜß]', word):
                    hyph = _hyphenate_word(word, hyph_de, hyph_en)
                    if hyph:
                        rebuilt = lead + hyph + trail
        pieces.append(rebuilt)

    return ''.join(pieces)
def insert_syllable_dividers( def insert_syllable_dividers(
zones_data: List[Dict], zones_data: List[Dict],
img_bgr: np.ndarray, img_bgr: np.ndarray,
session_id: str, session_id: str,
) -> int: ) -> int:
"""Insert pipe syllable dividers into dictionary cells where CV confirms them. """Insert pipe syllable dividers into dictionary cells.
For each cell on a dictionary page: For dictionary pages: process all content column cells, strip existing
1. Check if ANY word_box has CV-detected pipe lines pipes, merge pipe-gap spaces, and re-syllabify using pyphen.
2. If yes, apply pyphen to EACH word (≥4 chars) in the cell
3. Try DE hyphenation first, then EN Pre-check: at least 1% of content cells must already contain ``|`` from
OCR. This guards against pages with zero pipe characters (the primary
guard — article_col_index — is checked at the call site).
Returns the number of cells modified. Returns the number of cells modified.
""" """
try: hyph_de, hyph_en = _get_hyphenators()
import pyphen if hyph_de is None:
except ImportError:
logger.warning("pyphen not installed — skipping syllable insertion") logger.warning("pyphen not installed — skipping syllable insertion")
return 0 return 0
_hyph_de = pyphen.Pyphen(lang='de_DE') # Pre-check: count cells that already have | from OCR.
_hyph_en = pyphen.Pyphen(lang='en_US') # Real dictionary pages with printed syllable dividers will have OCR-
img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) # detected pipes in many cells. Pages without syllable dividers will
# have zero — skip those to avoid false syllabification.
total_col_cells = 0
cells_with_pipes = 0
for z in zones_data:
for cell in z.get("cells", []):
if cell.get("col_type", "").startswith("column_"):
total_col_cells += 1
if "|" in cell.get("text", ""):
cells_with_pipes += 1
if total_col_cells > 0:
pipe_ratio = cells_with_pipes / total_col_cells
if pipe_ratio < 0.01:
logger.info(
"build-grid session %s: skipping syllable insertion — "
"only %.1f%% of cells have existing pipes (need >=1%%)",
session_id, pipe_ratio * 100,
)
return 0
insertions = 0 insertions = 0
for z in zones_data: for z in zones_data:
@@ -109,47 +241,18 @@ def insert_syllable_dividers(
if not ct.startswith("column_"): if not ct.startswith("column_"):
continue continue
text = cell.get("text", "") text = cell.get("text", "")
if not text or "|" in text: if not text:
continue
if _IPA_RE.search(text):
continue continue
# CV gate: check if ANY word_box in this cell has pipe lines new_text = _syllabify_text(text, hyph_de, hyph_en)
wbs = cell.get("word_boxes") or [] if new_text != text:
if not any(_word_has_pipe_lines(img_gray, wb) for wb in wbs): cell["text"] = new_text
continue
# Apply pyphen to each significant word in the cell
tokens = re.split(r'(\s+|[,;]+\s*)', text)
new_tokens = []
changed = False
for tok in tokens:
# Skip whitespace/punctuation separators
if re.match(r'^[\s,;]+$', tok):
new_tokens.append(tok)
continue
# Only hyphenate words ≥ 4 alpha chars
clean = re.sub(r'[().\-]', '', tok)
if len(clean) < 4 or not re.search(r'[a-zA-ZäöüÄÖÜß]', clean):
new_tokens.append(tok)
continue
# Try DE first, then EN
hyph = _hyph_de.inserted(tok, hyphen='|')
if '|' not in hyph:
hyph = _hyph_en.inserted(tok, hyphen='|')
if '|' in hyph and hyph != tok:
new_tokens.append(hyph)
changed = True
else:
new_tokens.append(tok)
if changed:
cell["text"] = ''.join(new_tokens)
insertions += 1 insertions += 1
if insertions: if insertions:
logger.info( logger.info(
"build-grid session %s: inserted syllable dividers in %d cells " "build-grid session %s: syllable dividers inserted/normalized "
"(CV-validated)", "in %d cells (pyphen)",
session_id, insertions, session_id, insertions,
) )
return insertions return insertions

View File

@@ -1456,10 +1456,15 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
logger.warning("Dictionary detection failed: %s", e) logger.warning("Dictionary detection failed: %s", e)
# --- Syllable divider insertion for dictionary pages --- # --- Syllable divider insertion for dictionary pages ---
# CV-validated: only inserts "|" where image shows thin vertical lines. # Only on confirmed dictionary pages with article columns (der/die/das).
# See cv_syllable_detect.py for the detection + insertion logic. # The article_col_index check avoids false positives on synonym lists,
# word frequency tables, and other alphabetically sorted non-dictionary pages.
# Additionally, insert_syllable_dividers has its own pre-check for existing
# pipe characters in cells (OCR must have already found some).
syllable_insertions = 0 syllable_insertions = 0
if dict_detection.get("is_dictionary") and img_bgr is not None: if (dict_detection.get("is_dictionary")
and dict_detection.get("article_col_index") is not None
and img_bgr is not None):
try: try:
from cv_syllable_detect import insert_syllable_dividers from cv_syllable_detect import insert_syllable_dividers
syllable_insertions = insert_syllable_dividers( syllable_insertions = insert_syllable_dividers(

View File

@@ -238,7 +238,7 @@ async def list_sessions_db(
""" """
pool = await get_pool() pool = await get_pool()
async with pool.acquire() as conn: async with pool.acquire() as conn:
where = "" if include_sub_sessions else "WHERE parent_session_id IS NULL" where = "" if include_sub_sessions else "WHERE parent_session_id IS NULL AND (status IS NULL OR status != 'split')"
rows = await conn.fetch(f""" rows = await conn.fetch(f"""
SELECT id, name, filename, status, current_step, SELECT id, name, filename, status, current_step,
document_category, doc_type, document_category, doc_type,

View File

@@ -191,12 +191,12 @@ async def get_session_info(session_id: str):
if session.get("ground_truth"): if session.get("ground_truth"):
result["ground_truth"] = session["ground_truth"] result["ground_truth"] = session["ground_truth"]
# Sub-session info # Box sub-session info (zone_type='box' from column detection — NOT page-split)
if session.get("parent_session_id"): if session.get("parent_session_id"):
result["parent_session_id"] = session["parent_session_id"] result["parent_session_id"] = session["parent_session_id"]
result["box_index"] = session.get("box_index") result["box_index"] = session.get("box_index")
else: else:
# Check for sub-sessions # Check for box sub-sessions (column detection creates these)
subs = await get_sub_sessions(session_id) subs = await get_sub_sessions(session_id)
if subs: if subs:
result["sub_sessions"] = [ result["sub_sessions"] = [

View File

@@ -238,8 +238,8 @@ async def detect_page_split(session_id: str):
"duration_seconds": round(duration, 2), "duration_seconds": round(duration, 2),
} }
# Mark parent session as split (store info in crop_result for backward compat) # Mark parent session as split and hidden from session list
await update_session_db(session_id, crop_result=split_info) await update_session_db(session_id, crop_result=split_info, status='split')
cached["crop_result"] = split_info cached["crop_result"] = split_info
await _append_pipeline_log(session_id, "page_split", { await _append_pipeline_log(session_id, "page_split", {
@@ -346,6 +346,7 @@ async def auto_crop(session_id: str):
cropped_png=png_buf.tobytes() if ok else b"", cropped_png=png_buf.tobytes() if ok else b"",
crop_result=crop_info, crop_result=crop_info,
current_step=5, current_step=5,
status='split',
) )
logger.info( logger.info(
@@ -461,8 +462,6 @@ async def _create_page_sub_sessions(
name=sub_name, name=sub_name,
filename=parent_filename, filename=parent_filename,
original_png=page_png, original_png=page_png,
parent_session_id=parent_session_id,
box_index=pi,
) )
# Pre-populate: set cropped = original (already cropped) # Pre-populate: set cropped = original (already cropped)
@@ -540,8 +539,6 @@ async def _create_page_sub_sessions_full(
name=sub_name, name=sub_name,
filename=parent_filename, filename=parent_filename,
original_png=page_png, original_png=page_png,
parent_session_id=parent_session_id,
box_index=pi,
) )
# start_step=2 → ready for deskew (orientation already done on spread) # start_step=2 → ready for deskew (orientation already done on spread)
@@ -553,7 +550,6 @@ async def _create_page_sub_sessions_full(
"id": sub_id, "id": sub_id,
"filename": parent_filename, "filename": parent_filename,
"name": sub_name, "name": sub_name,
"parent_session_id": parent_session_id,
"original_bgr": page_bgr, "original_bgr": page_bgr,
"oriented_bgr": None, "oriented_bgr": None,
"cropped_bgr": None, "cropped_bgr": None,