Lower syllable pipe-ratio threshold from 5% to 1%

Real dictionary pages have only ~3% OCR-detected pipes because the thin syllable divider lines are hard for OCR to read. The primary false-positive guard (article_col_index check) already blocks synonym dictionaries.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Improve syllable divider insertion for dictionary pages
2026-03-24 23:17:08 +01:00 · 2026-03-24 19:44:29 +01:00 · 2026-03-24 17:19:56 +01:00 · 2026-03-24 17:05:33 +01:00
11 changed files with 611 additions and 504 deletions
--- a/admin-lehrer/app/(admin)/ai/ocr-overlay/page.tsx
+++ b/admin-lehrer/app/(admin)/ai/ocr-overlay/page.tsx
@@ -383,7 +383,7 @@ export default function OcrOverlayPage() {
    if (mode === 'paddle-direct' || mode === 'kombi') {
      switch (currentStep) {
        case 0:
-          return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} />
+          return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSessionList={() => { loadSessions(); setSessionId(null) }} />
        case 1:
          return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} />
        case 2:
@@ -421,7 +421,7 @@ export default function OcrOverlayPage() {
    }
    switch (currentStep) {
      case 0:
-        return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} />
+        return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSessionList={() => { loadSessions(); setSessionId(null) }} />
      case 1:
        return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} />
      case 2:
--- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx
+++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx
@@ -1,6 +1,6 @@
 'use client'

-import { useCallback, useEffect, useState } from 'react'
+import { Suspense, useCallback, useEffect, useState } from 'react'
 import { PagePurpose } from '@/components/common/PagePurpose'
 import { PipelineStepper } from '@/components/ocr-pipeline/PipelineStepper'
 import { StepOrientation } from '@/components/ocr-pipeline/StepOrientation'
@@ -14,37 +14,28 @@ import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecogniti
 import { StepLlmReview } from '@/components/ocr-pipeline/StepLlmReview'
 import { StepReconstruction } from '@/components/ocr-pipeline/StepReconstruction'
 import { StepGroundTruth } from '@/components/ocr-pipeline/StepGroundTruth'
-import { BoxSessionTabs } from '@/components/ocr-pipeline/BoxSessionTabs'
-import { PIPELINE_STEPS, DOCUMENT_CATEGORIES, type PipelineStep, type SessionListItem, type DocumentTypeResult, type DocumentCategory, type SubSession } from './types'
+import { DOCUMENT_CATEGORIES, type SessionListItem, type DocumentTypeResult, type DocumentCategory, type SubSession } from './types'
+import { usePipelineNavigation } from './usePipelineNavigation'

 const KLAUSUR_API = '/klausur-api'

-export default function OcrPipelinePage() {
-  const [currentStep, setCurrentStep] = useState(0)
-  const [sessionId, setSessionId] = useState<string | null>(null)
-  const [sessionName, setSessionName] = useState<string>('')
+const STEP_NAMES: Record<number, string> = {
+  1: 'Orientierung', 2: 'Begradigung', 3: 'Entzerrung', 4: 'Zuschneiden',
+  5: 'Spalten', 6: 'Zeilen', 7: 'Woerter', 8: 'Struktur',
+  9: 'Korrektur', 10: 'Rekonstruktion', 11: 'Validierung',
+}
+
+function OcrPipelineContent() {
+  const nav = usePipelineNavigation()
  const [sessions, setSessions] = useState<SessionListItem[]>([])
  const [loadingSessions, setLoadingSessions] = useState(true)
  const [editingName, setEditingName] = useState<string | null>(null)
  const [editNameValue, setEditNameValue] = useState('')
  const [editingCategory, setEditingCategory] = useState<string | null>(null)
-  const [docTypeResult, setDocTypeResult] = useState<DocumentTypeResult | null>(null)
+  const [sessionName, setSessionName] = useState('')
  const [activeCategory, setActiveCategory] = useState<DocumentCategory | undefined>(undefined)
-  const [subSessions, setSubSessions] = useState<SubSession[]>([])
-  const [parentSessionId, setParentSessionId] = useState<string | null>(null)
-  const [steps, setSteps] = useState<PipelineStep[]>(
-    PIPELINE_STEPS.map((s, i) => ({
-      ...s,
-      status: i === 0 ? 'active' : 'pending',
-    })),
-  )

-  // Load session list on mount
-  useEffect(() => {
-    loadSessions()
-  }, [])
-
-  const loadSessions = async () => {
+  const loadSessions = useCallback(async () => {
    setLoadingSessions(true)
    try {
      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`)
@@ -57,103 +48,42 @@ export default function OcrPipelinePage() {
    } finally {
      setLoadingSessions(false)
    }
-  }
-
-  const openSession = useCallback(async (sid: string, keepSubSessions?: boolean) => {
-    try {
-      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`)
-      if (!res.ok) return
-      const data = await res.json()
-
-      setSessionId(sid)
-      setSessionName(data.name || data.filename || '')
-      setActiveCategory(data.document_category || undefined)
-
-      // Sub-session handling
-      if (data.sub_sessions && data.sub_sessions.length > 0) {
-        setSubSessions(data.sub_sessions)
-        setParentSessionId(sid)
-        // Parent has sub-sessions — open the first incomplete one (or most advanced if all done)
-        const incomplete = data.sub_sessions.find(
-          (s: SubSession) => !s.current_step || s.current_step < 10,
-        )
-        const target = incomplete || [...data.sub_sessions].sort(
-          (a: SubSession, b: SubSession) => (b.current_step || 0) - (a.current_step || 0),
-        )[0]
-        if (target) {
-          openSession(target.id, true)
-          return
-        }
-      } else if (data.parent_session_id) {
-        // This is a sub-session — keep parent info but don't reset sub-session list
-        setParentSessionId(data.parent_session_id)
-      } else if (!keepSubSessions) {
-        setSubSessions([])
-        setParentSessionId(null)
-      }
-
-      // Restore doc type result if available
-      const savedDocType: DocumentTypeResult | null = data.doc_type_result || null
-      setDocTypeResult(savedDocType)
-
-      // Determine which step to jump to based on current_step
-      const dbStep = data.current_step || 1
-      // DB steps: 1=start, 2=orientation, 3=deskew, 4=dewarp, 5=crop, 6=columns, ...
-      // UI steps are 0-indexed: 0=orientation, 1=deskew, 2=dewarp, 3=crop, 4=columns, ...
-      let uiStep = Math.max(0, dbStep - 1)
-      const skipSteps = [...(savedDocType?.skip_steps || [])]
-
-      // Sub-session handling depends on how they were created:
-      // - Crop-based (current_step >= 5): image already cropped, skip all pre-processing
-      // - Page-split (current_step 2): orientation done on parent, skip only orientation
-      // - Page-split from original (current_step 1): needs full pipeline
-      const isSubSession = !!data.parent_session_id
-      if (isSubSession) {
-        if (dbStep >= 5) {
-          // Crop-based sub-sessions: image already cropped
-          const SUB_SESSION_SKIP = ['orientation', 'deskew', 'dewarp', 'crop']
-          for (const s of SUB_SESSION_SKIP) {
-            if (!skipSteps.includes(s)) skipSteps.push(s)
-          }
-          if (uiStep < 4) uiStep = 4 // columns step (index 4)
-        } else if (dbStep >= 2) {
-          // Page-split sub-session: parent orientation applied, skip only orientation
-          if (!skipSteps.includes('orientation')) skipSteps.push('orientation')
-          if (uiStep < 1) uiStep = 1 // advance past skipped orientation to deskew
-        }
-        // dbStep === 1: page-split from original image, needs full pipeline
-      }
-
-      setSteps(
-        PIPELINE_STEPS.map((s, i) => ({
-          ...s,
-          status: skipSteps.includes(s.id)
-            ? 'skipped'
-            : i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
-        })),
-      )
-      setCurrentStep(uiStep)
-    } catch (e) {
-      console.error('Failed to open session:', e)
-    }
  }, [])

+  useEffect(() => { loadSessions() }, [loadSessions])
+
+  // Sync session name when nav.sessionId changes
+  useEffect(() => {
+    if (!nav.sessionId) {
+      setSessionName('')
+      setActiveCategory(undefined)
+      return
+    }
+    const load = async () => {
+      try {
+        const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${nav.sessionId}`)
+        if (!res.ok) return
+        const data = await res.json()
+        setSessionName(data.name || data.filename || '')
+        setActiveCategory(data.document_category || undefined)
+      } catch { /* ignore */ }
+    }
+    load()
+  }, [nav.sessionId])
+
+  const openSession = useCallback((sid: string) => {
+    nav.goToSession(sid)
+  }, [nav])
+
  const deleteSession = useCallback(async (sid: string) => {
    try {
      await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`, { method: 'DELETE' })
-      setSessions((prev) => prev.filter((s) => s.id !== sid))
-      if (sessionId === sid) {
-        setSessionId(null)
-        setCurrentStep(0)
-        setDocTypeResult(null)
-        setSubSessions([])
-        setParentSessionId(null)
-        setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
-      }
+      setSessions(prev => prev.filter(s => s.id !== sid))
+      if (nav.sessionId === sid) nav.goToSessionList()
    } catch (e) {
      console.error('Failed to delete session:', e)
    }
-  }, [sessionId])
+  }, [nav])

  const renameSession = useCallback(async (sid: string, newName: string) => {
    try {
@@ -162,13 +92,13 @@ export default function OcrPipelinePage() {
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ name: newName }),
      })
-      setSessions((prev) => prev.map((s) => (s.id === sid ? { ...s, name: newName } : s)))
-      if (sessionId === sid) setSessionName(newName)
+      setSessions(prev => prev.map(s => (s.id === sid ? { ...s, name: newName } : s)))
+      if (nav.sessionId === sid) setSessionName(newName)
    } catch (e) {
      console.error('Failed to rename session:', e)
    }
    setEditingName(null)
-  }, [sessionId])
+  }, [nav.sessionId])

  const updateCategory = useCallback(async (sid: string, category: DocumentCategory) => {
    try {
@@ -177,275 +107,107 @@ export default function OcrPipelinePage() {
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ document_category: category }),
      })
-      setSessions((prev) => prev.map((s) => (s.id === sid ? { ...s, document_category: category } : s)))
-      if (sessionId === sid) setActiveCategory(category)
+      setSessions(prev => prev.map(s => (s.id === sid ? { ...s, document_category: category } : s)))
+      if (nav.sessionId === sid) setActiveCategory(category)
    } catch (e) {
      console.error('Failed to update category:', e)
    }
    setEditingCategory(null)
-  }, [sessionId])
+  }, [nav.sessionId])

  const deleteAllSessions = useCallback(async () => {
    if (!confirm('Alle Sessions loeschen? Dies kann nicht rueckgaengig gemacht werden.')) return
    try {
      await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, { method: 'DELETE' })
      setSessions([])
-      setSessionId(null)
-      setCurrentStep(0)
-      setDocTypeResult(null)
-      setActiveCategory(undefined)
-      setSubSessions([])
-      setParentSessionId(null)
-      setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
+      nav.goToSessionList()
    } catch (e) {
      console.error('Failed to delete all sessions:', e)
    }
-  }, [])
+  }, [nav])

  const handleStepClick = (index: number) => {
-    if (index <= currentStep || steps[index].status === 'completed') {
-      setCurrentStep(index)
+    if (index <= nav.currentStepIndex || nav.steps[index].status === 'completed') {
+      nav.goToStep(index)
    }
  }

-  const goToStep = (step: number) => {
-    setCurrentStep(step)
-    setSteps((prev) =>
-      prev.map((s, i) => ({
-        ...s,
-        status: i < step ? 'completed' : i === step ? 'active' : 'pending',
-      })),
-    )
-  }
-
-  const handleNext = () => {
-    if (currentStep >= steps.length - 1) {
-      // Last step completed
-      if (parentSessionId && sessionId !== parentSessionId) {
-        // Sub-session completed — mark it and find next incomplete one
-        const updatedSubs = subSessions.map((s) =>
-          s.id === sessionId ? { ...s, status: 'completed' as const, current_step: 10 } : s,
-        )
-        setSubSessions(updatedSubs)
-
-        // Find next incomplete sub-session
-        const nextIncomplete = updatedSubs.find(
-          (s) => s.id !== sessionId && (!s.current_step || s.current_step < 10),
-        )
-        if (nextIncomplete) {
-          // Open next incomplete sub-session
-          openSession(nextIncomplete.id, true)
-        } else {
-          // All sub-sessions done — return to session list
-          setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
-          setCurrentStep(0)
-          setSessionId(null)
-          setSubSessions([])
-          setParentSessionId(null)
-          loadSessions()
-        }
-        return
-      }
-      // Main session: return to session list
-      setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
-      setCurrentStep(0)
-      setSessionId(null)
-      setSubSessions([])
-      setParentSessionId(null)
-      loadSessions()
-      return
-    }
-
-    // Find the next non-skipped step
-    const skipSteps = docTypeResult?.skip_steps || []
-    let nextStep = currentStep + 1
-    while (nextStep < steps.length && skipSteps.includes(PIPELINE_STEPS[nextStep]?.id)) {
-      nextStep++
-    }
-    if (nextStep >= steps.length) nextStep = steps.length - 1
-
-    setSteps((prev) =>
-      prev.map((s, i) => {
-        if (i === currentStep) return { ...s, status: 'completed' }
-        if (i === nextStep) return { ...s, status: 'active' }
-        // Mark skipped steps between current and next
-        if (i > currentStep && i < nextStep && skipSteps.includes(PIPELINE_STEPS[i]?.id)) {
-          return { ...s, status: 'skipped' }
-        }
-        return s
-      }),
-    )
-    setCurrentStep(nextStep)
-  }
-
-  const handleOrientationComplete = async (sid: string) => {
-    setSessionId(sid)
+  // Orientation: after upload, navigate to session at deskew step
+  const handleOrientationComplete = useCallback(async (sid: string) => {
    loadSessions()
+    // Navigate directly to deskew step (index 1) for this session
+    nav.goToSession(sid)
+  }, [nav, loadSessions])

-    // Check for page-split sub-sessions directly from API
-    // (React state may not be committed yet due to batching)
-    try {
-      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`)
-      if (res.ok) {
-        const data = await res.json()
-        if (data.sub_sessions?.length > 0) {
-          const subs: SubSession[] = data.sub_sessions.map((s: SubSession) => ({
-            id: s.id,
-            name: s.name,
-            box_index: s.box_index,
-            current_step: s.current_step,
-          }))
-          setSubSessions(subs)
-          setParentSessionId(sid)
-          openSession(subs[0].id, true)
-          return
-        }
-      }
-    } catch (e) {
-      console.error('Failed to check for sub-sessions:', e)
-    }
-
-    handleNext()
-  }
-
-  const handleCropNext = async () => {
-    // Auto-detect document type after crop (last image-processing step), then advance
-    if (sessionId) {
+  // Crop: detect doc type then advance
+  const handleCropNext = useCallback(async () => {
+    if (nav.sessionId) {
      try {
        const res = await fetch(
-          `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/detect-type`,
+          `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${nav.sessionId}/detect-type`,
          { method: 'POST' },
        )
        if (res.ok) {
          const data: DocumentTypeResult = await res.json()
-          setDocTypeResult(data)
-
-          // Mark skipped steps immediately
-          const skipSteps = data.skip_steps || []
-          if (skipSteps.length > 0) {
-            setSteps((prev) =>
-              prev.map((s) =>
-                skipSteps.includes(s.id) ? { ...s, status: 'skipped' } : s,
-              ),
-            )
-          }
+          nav.setDocType(data)
        }
      } catch (e) {
        console.error('Doc type detection failed:', e)
-        // Not critical — continue without it
      }
    }
-    handleNext()
-  }
+    nav.goToNextStep()
+  }, [nav])

  const handleDocTypeChange = (newDocType: DocumentTypeResult['doc_type']) => {
-    if (!docTypeResult) return
-
-    // Build new skip_steps based on doc type
+    if (!nav.docTypeResult) return
    let skipSteps: string[] = []
-    if (newDocType === 'full_text') {
-      skipSteps = ['columns', 'rows']
-    }
-    // vocab_table and generic_table: no skips
+    if (newDocType === 'full_text') skipSteps = ['columns', 'rows']

-    const updated: DocumentTypeResult = {
-      ...docTypeResult,
+    nav.setDocType({
+      ...nav.docTypeResult,
      doc_type: newDocType,
      skip_steps: skipSteps,
      pipeline: newDocType === 'full_text' ? 'full_page' : 'cell_first',
-    }
-    setDocTypeResult(updated)
-
-    // Update step statuses
-    setSteps((prev) =>
-      prev.map((s) => {
-        if (skipSteps.includes(s.id)) return { ...s, status: 'skipped' as const }
-        if (s.status === 'skipped') return { ...s, status: 'pending' as const }
-        return s
-      }),
-    )
+    })
  }

-  const handleNewSession = () => {
-    setSessionId(null)
-    setSessionName('')
-    setCurrentStep(0)
-    setDocTypeResult(null)
-    setSubSessions([])
-    setParentSessionId(null)
-    setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
-  }
-
-  const handleSessionChange = useCallback((newSessionId: string) => {
-    openSession(newSessionId, true)
-  }, [openSession])
-
-  const handleBoxSessionsCreated = useCallback((subs: SubSession[]) => {
-    setSubSessions(subs)
-    if (sessionId) setParentSessionId(sessionId)
-  }, [sessionId])
-
-  const stepNames: Record<number, string> = {
-    1: 'Orientierung',
-    2: 'Begradigung',
-    3: 'Entzerrung',
-    4: 'Zuschneiden',
-    5: 'Spalten',
-    6: 'Zeilen',
-    7: 'Woerter',
-    8: 'Struktur',
-    9: 'Korrektur',
-    10: 'Rekonstruktion',
-    11: 'Validierung',
-  }
-
-  const reprocessFromStep = useCallback(async (uiStep: number) => {
-    if (!sessionId) return
-    const dbStep = uiStep + 1 // UI is 0-indexed, DB is 1-indexed
-    if (!confirm(`Ab Schritt ${dbStep} (${stepNames[dbStep] || '?'}) neu verarbeiten? Nachfolgende Daten werden geloescht.`)) return
-    try {
-      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/reprocess`, {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({ from_step: dbStep }),
-      })
-      if (!res.ok) {
-        const data = await res.json().catch(() => ({}))
-        console.error('Reprocess failed:', data.detail || res.status)
-        return
-      }
-      // Reset UI steps
-      goToStep(uiStep)
-    } catch (e) {
-      console.error('Reprocess error:', e)
-    }
-  // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [sessionId, goToStep])
+  // Box sub-sessions (column detection) — still supported
+  const handleBoxSessionsCreated = useCallback((_subs: SubSession[]) => {
+    // Box sub-sessions are tracked by the backend; no client-side state needed anymore
+  }, [])

  const renderStep = () => {
-    switch (currentStep) {
+    const sid = nav.sessionId
+    switch (nav.currentStepIndex) {
      case 0:
-        return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} />
+        return (
+          <StepOrientation
+            key={sid}
+            sessionId={sid}
+            onNext={handleOrientationComplete}
+            onSessionList={() => { loadSessions(); nav.goToSessionList() }}
+          />
+        )
      case 1:
-        return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} />
+        return <StepDeskew key={sid} sessionId={sid} onNext={nav.goToNextStep} />
      case 2:
-        return <StepDewarp key={sessionId} sessionId={sessionId} onNext={handleNext} />
+        return <StepDewarp key={sid} sessionId={sid} onNext={nav.goToNextStep} />
      case 3:
-        return <StepCrop key={sessionId} sessionId={sessionId} onNext={handleCropNext} />
+        return <StepCrop key={sid} sessionId={sid} onNext={handleCropNext} />
      case 4:
-        return <StepColumnDetection sessionId={sessionId} onNext={handleNext} onBoxSessionsCreated={handleBoxSessionsCreated} />
+        return <StepColumnDetection sessionId={sid} onNext={nav.goToNextStep} onBoxSessionsCreated={handleBoxSessionsCreated} />
      case 5:
-        return <StepRowDetection sessionId={sessionId} onNext={handleNext} />
+        return <StepRowDetection sessionId={sid} onNext={nav.goToNextStep} />
      case 6:
-        return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} />
+        return <StepWordRecognition sessionId={sid} onNext={nav.goToNextStep} goToStep={nav.goToStep} />
      case 7:
-        return <StepStructureDetection sessionId={sessionId} onNext={handleNext} />
+        return <StepStructureDetection sessionId={sid} onNext={nav.goToNextStep} />
      case 8:
-        return <StepLlmReview sessionId={sessionId} onNext={handleNext} />
+        return <StepLlmReview sessionId={sid} onNext={nav.goToNextStep} />
      case 9:
-        return <StepReconstruction sessionId={sessionId} onNext={handleNext} />
+        return <StepReconstruction sessionId={sid} onNext={nav.goToNextStep} />
      case 10:
-        return <StepGroundTruth sessionId={sessionId} onNext={handleNext} />
+        return <StepGroundTruth sessionId={sid} onNext={nav.goToNextStep} />
      default:
        return null
    }
@@ -485,7 +247,7 @@ export default function OcrPipelinePage() {
              </button>
            )}
            <button
-              onClick={handleNewSession}
+              onClick={() => nav.goToSessionList()}
              className="text-xs px-3 py-1.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors"
            >
              + Neue Session
@@ -505,7 +267,7 @@ export default function OcrPipelinePage() {
                <div
                  key={s.id}
                  className={`relative flex items-start gap-3 px-3 py-2.5 rounded-lg text-sm transition-colors cursor-pointer ${
-                    sessionId === s.id
+                    nav.sessionId === s.id
                      ? 'bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700'
                      : 'hover:bg-gray-50 dark:hover:bg-gray-700/50'
                  }`}
@@ -561,13 +323,12 @@ export default function OcrPipelinePage() {
                    </button>
                    <div className="text-xs text-gray-400 flex gap-2 mt-0.5">
                      <span>{new Date(s.created_at).toLocaleDateString('de-DE', { day: '2-digit', month: '2-digit', year: '2-digit', hour: '2-digit', minute: '2-digit' })}</span>
-                      <span>Schritt {s.current_step}: {stepNames[s.current_step] || '?'}</span>
+                      <span>Schritt {s.current_step}: {STEP_NAMES[s.current_step] || '?'}</span>
                    </div>
                  </div>

                  {/* Badges */}
                  <div className="flex flex-col gap-1 items-end flex-shrink-0" onClick={(e) => e.stopPropagation()}>
-                    {/* Category Badge */}
                    <button
                      onClick={() => setEditingCategory(editingCategory === s.id ? null : s.id)}
                      className={`text-[10px] px-1.5 py-0.5 rounded-full border transition-colors ${
@@ -579,7 +340,6 @@ export default function OcrPipelinePage() {
                    >
                      {catInfo ? `${catInfo.icon} ${catInfo.label}` : '+ Kategorie'}
                    </button>
-                    {/* Doc Type Badge (read-only) */}
                    {s.doc_type && (
                      <span className="text-[10px] px-1.5 py-0.5 rounded-full bg-gray-100 dark:bg-gray-700 text-gray-500 dark:text-gray-400 border border-gray-200 dark:border-gray-600">
                        {s.doc_type}
@@ -616,7 +376,7 @@ export default function OcrPipelinePage() {
                    </button>
                  </div>

-                  {/* Category dropdown (inline) */}
+                  {/* Category dropdown */}
                  {editingCategory === s.id && (
                    <div
                      className="absolute right-0 top-full mt-1 z-20 bg-white dark:bg-gray-800 border border-gray-200 dark:border-gray-700 rounded-lg shadow-lg p-2 grid grid-cols-2 gap-1 w-64"
@@ -645,40 +405,39 @@ export default function OcrPipelinePage() {
      </div>

      {/* Active session info */}
-      {sessionId && sessionName && (
+      {nav.sessionId && sessionName && (
        <div className="flex items-center gap-3 text-sm text-gray-500 dark:text-gray-400">
          <span>Aktive Session: <span className="font-medium text-gray-700 dark:text-gray-300">{sessionName}</span></span>
          {activeCategory && (() => {
            const cat = DOCUMENT_CATEGORIES.find(c => c.value === activeCategory)
            return cat ? <span className="text-xs px-2 py-0.5 rounded-full bg-teal-50 dark:bg-teal-900/30 border border-teal-200 dark:border-teal-700 text-teal-700 dark:text-teal-300">{cat.icon} {cat.label}</span> : null
          })()}
-          {docTypeResult && (
+          {nav.docTypeResult && (
            <span className="text-xs px-2 py-0.5 rounded-full bg-gray-100 dark:bg-gray-700 text-gray-500 dark:text-gray-400 border border-gray-200 dark:border-gray-600">
-              {docTypeResult.doc_type}
+              {nav.docTypeResult.doc_type}
            </span>
          )}
        </div>
      )}

      <PipelineStepper
-        steps={steps}
-        currentStep={currentStep}
+        steps={nav.steps}
+        currentStep={nav.currentStepIndex}
        onStepClick={handleStepClick}
-        onReprocess={sessionId ? reprocessFromStep : undefined}
-        docTypeResult={docTypeResult}
+        onReprocess={nav.sessionId ? nav.reprocessFromStep : undefined}
+        docTypeResult={nav.docTypeResult}
        onDocTypeChange={handleDocTypeChange}
      />

-      {subSessions.length > 0 && parentSessionId && sessionId && (
-        <BoxSessionTabs
-          parentSessionId={parentSessionId}
-          subSessions={subSessions}
-          activeSessionId={sessionId}
-          onSessionChange={handleSessionChange}
-        />
-      )}
-
      <div className="min-h-[400px]">{renderStep()}</div>
    </div>
  )
 }
+
+export default function OcrPipelinePage() {
+  return (
+    <Suspense fallback={<div className="p-8 text-gray-400">Lade Pipeline...</div>}>
+      <OcrPipelineContent />
+    </Suspense>
+  )
+}
--- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts
+++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts
@@ -35,10 +35,9 @@ export interface SessionListItem {
  doc_type?: string
  created_at: string
  updated_at?: string
-  parent_session_id?: string | null
-  box_index?: number | null
 }

+/** Box sub-session (from column detection zone_type='box') */
 export interface SubSession {
  id: string
  name: string
--- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/usePipelineNavigation.ts
+++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/usePipelineNavigation.ts
@@ -0,0 +1,225 @@
+'use client'
+
+import { useCallback, useEffect, useState } from 'react'
+import { useRouter, useSearchParams } from 'next/navigation'
+import { PIPELINE_STEPS, type PipelineStep, type PipelineStepStatus, type DocumentTypeResult } from './types'
+
+const KLAUSUR_API = '/klausur-api'
+
+/** State and actions returned by `usePipelineNavigation`. */
+export interface PipelineNav {
+  /** Id of the active session, or null when no session is selected. */
+  sessionId: string | null
+  /** Zero-based index of the active step within PIPELINE_STEPS. */
+  currentStepIndex: number
+  /** Id of the active step; falls back to 'orientation' if the index has no entry. */
+  currentStepId: string
+  /** PIPELINE_STEPS entries annotated with their current status. */
+  steps: PipelineStep[]
+  /** Document type result loaded from the session or set via setDocType; null if none. */
+  docTypeResult: DocumentTypeResult | null
+
+  /**
+   * Advance to the next step that is not listed in docTypeResult.skip_steps.
+   * On the last step, resets the state and navigates back to /ai/ocr-pipeline.
+   */
+  goToNextStep: () => void
+  /** Jump to the step at the given zero-based index (URL is updated when a session is active). */
+  goToStep: (index: number) => void
+  /** Navigate to the given session by setting the `session` URL parameter. */
+  goToSession: (sessionId: string) => void
+  /** Reset all navigation state and navigate to /ai/ocr-pipeline without a session. */
+  goToSessionList: () => void
+  /** Store the document type result and mark its skip_steps as 'skipped'. */
+  setDocType: (result: DocumentTypeResult) => void
+  /**
+   * After user confirmation, request reprocessing starting at the given
+   * zero-based UI step (sent to the API as from_step = uiStep + 1) and jump
+   * to that step on success.
+   */
+  reprocessFromStep: (uiStep: number) => Promise<void>
+}
+
+// German display names for the pipeline steps, keyed by the 1-based step
+// number (UI step index + 1) that is shown in the reprocess confirmation.
+const STEP_NAMES: Record<number, string> = {
+  1: 'Orientierung', 2: 'Begradigung', 3: 'Entzerrung', 4: 'Zuschneiden',
+  5: 'Spalten', 6: 'Zeilen', 7: 'Woerter', 8: 'Struktur',
+  9: 'Korrektur', 10: 'Rekonstruktion', 11: 'Validierung',
+}
+
+/**
+ * Build the stepper state for a given active step.
+ *
+ * @param uiStep Zero-based index of the step that should be 'active'.
+ * @param skipSteps Ids of steps to mark as 'skipped' regardless of position.
+ * @returns One entry per PIPELINE_STEPS element with its status filled in:
+ *   'skipped' wins, then 'completed' (before uiStep), 'active' (at uiStep),
+ *   otherwise 'pending'.
+ */
+function buildSteps(uiStep: number, skipSteps: string[]): PipelineStep[] {
+  const statusFor = (stepId: string, position: number): PipelineStepStatus => {
+    if (skipSteps.includes(stepId)) return 'skipped'
+    if (position < uiStep) return 'completed'
+    if (position === uiStep) return 'active'
+    return 'pending'
+  }
+
+  return PIPELINE_STEPS.map((step, position) => ({
+    ...step,
+    status: statusFor(step.id, position),
+  }))
+}
+
+/**
+ * URL-driven navigation state for the OCR pipeline page.
+ *
+ * The `session` and `step` query parameters are the source of truth: whenever
+ * they change, the session is (re)loaded from the API and the step list is
+ * rebuilt. The returned actions update local state and push a new URL.
+ *
+ * @returns The current navigation state plus the actions to change it.
+ */
+export function usePipelineNavigation(): PipelineNav {
+  const router = useRouter()
+  const searchParams = useSearchParams()
+
+  const paramSession = searchParams.get('session')
+  const paramStep = searchParams.get('step')
+
+  const [sessionId, setSessionId] = useState<string | null>(paramSession)
+  const [currentStepIndex, setCurrentStepIndex] = useState(0)
+  const [docTypeResult, setDocTypeResult] = useState<DocumentTypeResult | null>(null)
+  const [steps, setSteps] = useState<PipelineStep[]>(buildSteps(0, []))
+
+  // Load session info when the session or step param changes
+  useEffect(() => {
+    if (!paramSession) {
+      setSessionId(null)
+      setCurrentStepIndex(0)
+      setDocTypeResult(null)
+      setSteps(buildSteps(0, []))
+      return undefined
+    }
+
+    // Set by the cleanup below. Without it, a slow response for a previous
+    // session/step could arrive after the URL changed (or after unmount) and
+    // overwrite the state that belongs to the newer navigation.
+    let cancelled = false
+
+    const load = async () => {
+      try {
+        const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${paramSession}`)
+        if (cancelled) return
+        if (!res.ok) {
+          console.error('Failed to load session:', res.status)
+          return
+        }
+        const data = await res.json()
+        if (cancelled) return
+
+        setSessionId(paramSession)
+
+        const savedDocType: DocumentTypeResult | null = data.doc_type_result || null
+        setDocTypeResult(savedDocType)
+
+        // The API step is 1-based, the UI index is 0-based
+        const dbStep = data.current_step || 1
+        let uiStep = Math.max(0, dbStep - 1)
+        const skipSteps = [...(savedDocType?.skip_steps || [])]
+
+        // Box sub-sessions (from column detection) skip pre-processing
+        const isBoxSubSession = !!data.parent_session_id
+        if (isBoxSubSession && dbStep >= 5) {
+          const SUB_SESSION_SKIP = ['orientation', 'deskew', 'dewarp', 'crop']
+          for (const s of SUB_SESSION_SKIP) {
+            if (!skipSteps.includes(s)) skipSteps.push(s)
+          }
+          if (uiStep < 4) uiStep = 4
+        }
+
+        // If URL has a (known) step param, use that instead
+        if (paramStep) {
+          const stepIdx = PIPELINE_STEPS.findIndex(s => s.id === paramStep)
+          if (stepIdx >= 0) uiStep = stepIdx
+        }
+
+        setCurrentStepIndex(uiStep)
+        setSteps(buildSteps(uiStep, skipSteps))
+      } catch (e) {
+        if (!cancelled) console.error('Failed to load session:', e)
+      }
+    }
+
+    void load()
+
+    return () => {
+      cancelled = true
+    }
+  }, [paramSession, paramStep])
+
+  // Push a URL for the given session (and optionally step); null goes to the list
+  const updateUrl = useCallback((sid: string | null, stepIdx?: number) => {
+    if (!sid) {
+      router.push('/ai/ocr-pipeline')
+      return
+    }
+    const stepId = stepIdx !== undefined ? PIPELINE_STEPS[stepIdx]?.id : undefined
+    const params = new URLSearchParams()
+    params.set('session', sid)
+    if (stepId) params.set('step', stepId)
+    router.push(`/ai/ocr-pipeline?${params.toString()}`)
+  }, [router])
+
+  const goToNextStep = useCallback(() => {
+    if (currentStepIndex >= steps.length - 1) {
+      // Last step — return to session list
+      setSessionId(null)
+      setCurrentStepIndex(0)
+      setDocTypeResult(null)
+      setSteps(buildSteps(0, []))
+      router.push('/ai/ocr-pipeline')
+      return
+    }
+
+    // Advance past every step the document type marks as skippable,
+    // but never beyond the last step
+    const skipSteps = docTypeResult?.skip_steps || []
+    let nextStep = currentStepIndex + 1
+    while (nextStep < steps.length && skipSteps.includes(PIPELINE_STEPS[nextStep]?.id)) {
+      nextStep++
+    }
+    if (nextStep >= steps.length) nextStep = steps.length - 1
+
+    setSteps(prev =>
+      prev.map((s, i) => {
+        if (i === currentStepIndex) return { ...s, status: 'completed' as PipelineStepStatus }
+        if (i === nextStep) return { ...s, status: 'active' as PipelineStepStatus }
+        if (i > currentStepIndex && i < nextStep && skipSteps.includes(PIPELINE_STEPS[i]?.id)) {
+          return { ...s, status: 'skipped' as PipelineStepStatus }
+        }
+        return s
+      }),
+    )
+    setCurrentStepIndex(nextStep)
+    if (sessionId) updateUrl(sessionId, nextStep)
+  }, [currentStepIndex, steps.length, docTypeResult, sessionId, updateUrl, router])
+
+  const goToStep = useCallback((index: number) => {
+    setCurrentStepIndex(index)
+    // Steps already marked 'skipped' keep that status; all others are
+    // re-derived from their position relative to the target index
+    setSteps(prev =>
+      prev.map((s, i) => ({
+        ...s,
+        status: s.status === 'skipped' ? 'skipped'
+          : i < index ? 'completed'
+            : i === index ? 'active'
+              : 'pending' as PipelineStepStatus,
+      })),
+    )
+    if (sessionId) updateUrl(sessionId, index)
+  }, [sessionId, updateUrl])
+
+  // Only the URL changes here; the effect above loads the session state
+  const goToSession = useCallback((sid: string) => {
+    updateUrl(sid)
+  }, [updateUrl])
+
+  const goToSessionList = useCallback(() => {
+    setSessionId(null)
+    setCurrentStepIndex(0)
+    setDocTypeResult(null)
+    setSteps(buildSteps(0, []))
+    router.push('/ai/ocr-pipeline')
+  }, [router])
+
+  const setDocType = useCallback((result: DocumentTypeResult) => {
+    setDocTypeResult(result)
+    const skipSteps = result.skip_steps || []
+    if (skipSteps.length > 0) {
+      setSteps(prev =>
+        prev.map(s =>
+          skipSteps.includes(s.id) ? { ...s, status: 'skipped' as PipelineStepStatus } : s,
+        ),
+      )
+    }
+  }, [])
+
+  const reprocessFromStep = useCallback(async (uiStep: number) => {
+    if (!sessionId) return
+    // The API expects the 1-based step number
+    const dbStep = uiStep + 1
+    if (!confirm(`Ab Schritt ${dbStep} (${STEP_NAMES[dbStep] || '?'}) neu verarbeiten? Nachfolgende Daten werden geloescht.`)) return
+    try {
+      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/reprocess`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ from_step: dbStep }),
+      })
+      if (!res.ok) {
+        const data = await res.json().catch(() => ({}))
+        console.error('Reprocess failed:', data.detail || res.status)
+        return
+      }
+      goToStep(uiStep)
+    } catch (e) {
+      console.error('Reprocess error:', e)
+    }
+  }, [sessionId, goToStep])
+
+  return {
+    sessionId,
+    currentStepIndex,
+    currentStepId: PIPELINE_STEPS[currentStepIndex]?.id || 'orientation',
+    steps,
+    docTypeResult,
+    goToNextStep,
+    goToStep,
+    goToSession,
+    goToSessionList,
+    setDocType,
+    reprocessFromStep,
+  }
+}
--- a/admin-lehrer/components/ocr-pipeline/BoxSessionTabs.tsx
+++ b/admin-lehrer/components/ocr-pipeline/BoxSessionTabs.tsx
@@ -21,6 +21,7 @@ function getStatusIcon(sub: SubSession): string {
  return STATUS_ICONS.pending
 }

+/** Tabs for box sub-sessions (from column detection zone_type='box'). */
 export function BoxSessionTabs({ parentSessionId, subSessions, activeSessionId, onSessionChange }: BoxSessionTabsProps) {
  if (subSessions.length === 0) return null

@@ -28,7 +29,6 @@ export function BoxSessionTabs({ parentSessionId, subSessions, activeSessionId,

  return (
    <div className="flex items-center gap-1.5 px-1 py-1.5 bg-gray-50 dark:bg-gray-800/50 rounded-xl border border-gray-200 dark:border-gray-700">
-      {/* Main session tab */}
      <button
        onClick={() => onSessionChange(parentSessionId)}
        className={`px-3 py-1.5 rounded-lg text-xs font-medium transition-colors ${
@@ -42,7 +42,6 @@ export function BoxSessionTabs({ parentSessionId, subSessions, activeSessionId,

      <div className="w-px h-5 bg-gray-200 dark:bg-gray-700" />

-      {/* Sub-session tabs */}
      {subSessions.map((sub) => {
        const isActive = activeSessionId === sub.id
        const icon = getStatusIcon(sub)
@@ -59,7 +58,7 @@ export function BoxSessionTabs({ parentSessionId, subSessions, activeSessionId,
            title={sub.name}
          >
            <span className="mr-1">{icon}</span>
-            Seite {sub.box_index + 1}
+            Box {sub.box_index + 1}
          </button>
        )
      })}
--- a/admin-lehrer/components/ocr-pipeline/StepOrientation.tsx
+++ b/admin-lehrer/components/ocr-pipeline/StepOrientation.tsx
@@ -1,7 +1,7 @@
 'use client'

 import { useCallback, useEffect, useState } from 'react'
-import type { OrientationResult, SessionInfo, SubSession } from '@/app/(admin)/ai/ocr-pipeline/types'
+import type { OrientationResult, SessionInfo } from '@/app/(admin)/ai/ocr-pipeline/types'
 import { ImageCompareView } from './ImageCompareView'

 const KLAUSUR_API = '/klausur-api'
@@ -17,10 +17,10 @@ interface PageSplitResult {
 interface StepOrientationProps {
  sessionId?: string | null
  onNext: (sessionId: string) => void
-  onSubSessionsCreated?: (subs: SubSession[]) => void
+  onSessionList?: () => void
 }

-export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSessionsCreated }: StepOrientationProps) {
+export function StepOrientation({ sessionId: existingSessionId, onNext, onSessionList }: StepOrientationProps) {
  const [session, setSession] = useState<SessionInfo | null>(null)
  const [orientationResult, setOrientationResult] = useState<OrientationResult | null>(null)
  const [pageSplitResult, setPageSplitResult] = useState<PageSplitResult | null>(null)
@@ -30,7 +30,7 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
  const [dragOver, setDragOver] = useState(false)
  const [sessionName, setSessionName] = useState('')

-  // Reload session data when navigating back
+  // Reload session data when navigating back — auto-trigger orientation if missing
  useEffect(() => {
    if (!existingSessionId || session) return

@@ -51,6 +51,28 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes

        if (data.orientation_result) {
          setOrientationResult(data.orientation_result)
+        } else {
+          // Session exists but orientation not yet run (e.g. page-split session)
+          // Auto-trigger orientation detection
+          setDetecting(true)
+          try {
+            const orientRes = await fetch(
+              `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${existingSessionId}/orientation`,
+              { method: 'POST' },
+            )
+            if (orientRes.ok) {
+              const orientData = await orientRes.json()
+              setOrientationResult({
+                orientation_degrees: orientData.orientation_degrees,
+                corrected: orientData.corrected,
+                duration_seconds: orientData.duration_seconds,
+              })
+            }
+          } catch (e) {
+            console.error('Auto-orientation failed:', e)
+          } finally {
+            setDetecting(false)
+          }
        }
      } catch (e) {
        console.error('Failed to reload session:', e)
@@ -112,16 +134,6 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
        if (splitRes.ok) {
          const splitData: PageSplitResult = await splitRes.json()
          setPageSplitResult(splitData)
-          if (splitData.multi_page && splitData.sub_sessions && onSubSessionsCreated) {
-            onSubSessionsCreated(
-              splitData.sub_sessions.map((s) => ({
-                id: s.id,
-                name: s.name,
-                box_index: s.page_index,
-                current_step: splitData.used_original ? 1 : 2,
-              }))
-            )
-          }
        }
      } catch (e) {
        console.error('Page-split detection failed:', e)
@@ -133,7 +145,7 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
      setUploading(false)
      setDetecting(false)
    }
-  }, [sessionName, onSubSessionsCreated])
+  }, [sessionName])

  const handleDrop = useCallback((e: React.DragEvent) => {
    e.preventDefault()
@@ -264,10 +276,10 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
      {pageSplitResult?.multi_page && (
        <div className="bg-blue-50 dark:bg-blue-900/20 rounded-lg border border-blue-200 dark:border-blue-700 p-4">
          <div className="text-sm font-medium text-blue-700 dark:text-blue-300">
-            Doppelseite erkannt — {pageSplitResult.page_count} Seiten
+            Doppelseite erkannt — {pageSplitResult.page_count} unabhaengige Sessions erstellt
          </div>
          <p className="text-xs text-blue-600 dark:text-blue-400 mt-1">
-            Jede Seite wird einzeln durch die Pipeline (Begradigung, Entzerrung, Zuschnitt, ...) verarbeitet.
+            Jede Seite wird als eigene Session durch die Pipeline verarbeitet.
            {pageSplitResult.used_original && ' (Seitentrennung auf dem Originalbild, da die Orientierung die Doppelseite gedreht hat.)'}
          </p>
          <div className="flex gap-2 mt-2">
@@ -286,12 +298,21 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
      {/* Next button */}
      {orientationResult && (
        <div className="flex justify-end">
-          <button
-            onClick={() => onNext(session.session_id)}
-            className="px-6 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium transition-colors"
-          >
-            {pageSplitResult?.multi_page ? 'Seiten verarbeiten' : 'Weiter'} &rarr;
-          </button>
+          {pageSplitResult?.multi_page ? (
+            <button
+              onClick={() => onSessionList?.()}
+              className="px-6 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium transition-colors"
+            >
+              Zur Session-Liste &rarr;
+            </button>
+          ) : (
+            <button
+              onClick={() => onNext(session.session_id)}
+              className="px-6 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium transition-colors"
+            >
+              Weiter &rarr;
+            </button>
+          )}
        </div>
      )}

--- a/klausur-service/backend/cv_syllable_detect.py
+++ b/klausur-service/backend/cv_syllable_detect.py
@@ -1,11 +1,15 @@
 """
-CV-based syllable divider detection and insertion for dictionary pages.
+Syllable divider insertion for dictionary pages.

-Two-step approach:
-  1. CV: morphological vertical line detection checks if a word_box image
-     contains thin, isolated pipe-like vertical lines (syllable dividers).
-  2. pyphen: inserts syllable breaks at linguistically correct positions
-     for words where CV confirmed the presence of dividers.
+For confirmed dictionary pages (is_dictionary=True), processes all content
+column cells:
+  1. Strips existing | dividers for clean normalization
+  2. Merges pipe-gap spaces (where OCR split a word at a divider position)
+  3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN)
+  4. Only modifies words that pyphen recognizes — garbled OCR stays as-is
+
+No CV gate needed — the dictionary detection confidence is sufficient.
+pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable.

 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
@@ -13,94 +17,222 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.

 import logging
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple

-import cv2
 import numpy as np

 logger = logging.getLogger(__name__)

-
-def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
-    """CV check: does this word_box image show thin vertical pipe dividers?
-
-    Uses morphological opening with a tall thin kernel to isolate vertical
-    structures, then filters for thin (≤4px), isolated contours that are
-    NOT at the word edges (those would be l, I, 1 etc.).
-    """
-    x = wb.get("left", 0)
-    y = wb.get("top", 0)
-    w = wb.get("width", 0)
-    h = wb.get("height", 0)
-    if w < 30 or h < 12:
-        return False
-    ih, iw = img_gray.shape[:2]
-    y1, y2 = max(0, y), min(ih, y + h)
-    x1, x2 = max(0, x), min(iw, x + w)
-    roi = img_gray[y1:y2, x1:x2]
-    if roi.size == 0:
-        return False
-    rh, rw = roi.shape
-
-    # Binarize (ink = white on black background)
-    _, binary = cv2.threshold(
-        roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
-    )
-
-    # Morphological opening: keep only tall vertical structures (≥55% height)
-    kern_h = max(int(rh * 0.55), 8)
-    kernel = np.ones((kern_h, 1), np.uint8)
-    vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
-
-    # Find surviving contours
-    contours, _ = cv2.findContours(
-        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
-    )
-
-    margin = max(int(rw * 0.08), 3)
-    for cnt in contours:
-        cx, cy, cw, ch = cv2.boundingRect(cnt)
-        if cw > 4:
-            continue  # too wide for a pipe
-        if cx < margin or cx + cw > rw - margin:
-            continue  # at word edge — likely l, I, 1
-        # Check isolation: adjacent columns should be mostly empty (ink-free)
-        left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
-        right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
-        left_ink = np.mean(left_zone) if left_zone.size else 255
-        right_ink = np.mean(right_zone) if right_zone.size else 255
-        if left_ink < 80 and right_ink < 80:
-            return True  # isolated thin vertical line = pipe divider
-    return False
-
-
-# IPA/phonetic bracket pattern — don't hyphenate transcriptions
+# IPA/phonetic characters — skip cells containing these
 _IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')

+# Common German words that should NOT be merged with adjacent tokens.
+# These are function words that appear as standalone words between
+# headwords/definitions on dictionary pages.
+_STOP_WORDS = frozenset([
+    # Articles
+    'der', 'die', 'das', 'dem', 'den', 'des',
+    'ein', 'eine', 'einem', 'einen', 'einer',
+    # Pronouns
+    'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
+    # Prepositions
+    'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
+    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
+    'zwischen', 'ohne', 'gegen',
+    # Conjunctions
+    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
+    # Adverbs
+    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
+    # Verbs
+    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
+    'sein', 'haben',
+    # Other
+    'kein', 'keine', 'keinem', 'keinen', 'keiner',
+])
+
+# Cached hyphenators
+_hyph_de = None
+_hyph_en = None
+
+
+def _get_hyphenators():
+    """Lazy-load the pyphen hyphenators and cache them at module level.
+
+    Returns:
+        A ``(hyph_de, hyph_en)`` tuple of ``pyphen.Pyphen`` instances for
+        ``de_DE`` and ``en_US``, or ``(None, None)`` when pyphen is not
+        installed.
+    """
+    global _hyph_de, _hyph_en
+    if _hyph_de is not None and _hyph_en is not None:
+        return _hyph_de, _hyph_en
+    try:
+        import pyphen
+    except ImportError:
+        return None, None
+    # Build both hyphenators before publishing either one: if the second
+    # constructor raises, the cache must not be left half-initialised
+    # (DE set, EN still None), which later calls would hand out as valid.
+    de = pyphen.Pyphen(lang='de_DE')
+    en = pyphen.Pyphen(lang='en_US')
+    _hyph_de, _hyph_en = de, en
+    return _hyph_de, _hyph_en
+
+
+def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
+    """Syllabify ``word`` with the German hyphenator, falling back to English.
+
+    The first hyphenator whose output contains at least one ``|`` wins; the
+    English one is only consulted when the German one inserts nothing.
+
+    Returns:
+        The word with ``|`` separators, or ``None`` if neither hyphenator
+        inserted a break.
+    """
+    for hyphenator in (hyph_de, hyph_en):
+        candidate = hyphenator.inserted(word, hyphen='|')
+        if '|' in candidate:
+            return candidate
+    return None
+
+
+def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
+    """Re-join word fragments that OCR split at a syllable pipe.
+
+    The text is split on single spaces and walked left to right. Each
+    fragment is either glued onto the previous output fragment or appended
+    as a new one, so chains collapse step by step:
+    "Ka bel jau" -> "Kabel jau" -> "Kabeljau".
+
+    Two neighbours are merged only when all of these hold:
+    - the left fragment is purely alphabetic (no attached punctuation);
+      the right one may carry punctuation, which stays attached
+      ("Kä" + "fer," -> "Käfer,")
+    - both fragments contain at least one letter
+    - neither letter core is a German function word from ``_STOP_WORDS``
+    - the shorter letter core has at most 3 letters
+    - both letter cores together have at least 4 letters
+    - ``hyph_de`` inserts at least one break into the joined letter cores
+
+    Returns:
+        The text with accepted merges applied, fragments joined by a
+        single space.
+    """
+    fragments = text.split(' ')
+    if len(fragments) < 2:
+        return text
+
+    def _alpha_core(token: str) -> str:
+        # Letters only (incl. German umlauts and sharp s), for the lookups
+        return re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', token)
+
+    merged = [fragments[0]]
+    for right in fragments[1:]:
+        left = merged[-1]
+        left_core = _alpha_core(left)
+        right_core = _alpha_core(right)
+
+        # Guard clauses: any failed precondition keeps the fragment separate
+        if not left_core or not right_core or left != left_core:
+            merged.append(right)
+            continue
+        if left_core.lower() in _STOP_WORDS or right_core.lower() in _STOP_WORDS:
+            merged.append(right)
+            continue
+        if min(len(left_core), len(right_core)) > 3:
+            merged.append(right)
+            continue
+        if len(left_core) + len(right_core) < 4:
+            merged.append(right)
+            continue
+
+        # Accept the merge only if the hyphenator can break the joined word
+        if '-' in hyph_de.inserted(left_core + right_core, hyphen='-'):
+            merged[-1] = left + right
+        else:
+            merged.append(right)
+
+    return ' '.join(merged)
+
+
+def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
+    """Normalize and re-insert ``|`` syllable dividers in a cell text.
+
+    Steps:
+      1. Remove every existing ``|``.
+      2. Merge pipe-gap fragments via ``_try_merge_pipe_gaps``.
+      3. Split into tokens (separators kept) and syllabify each token whose
+         letter core has at least 3 characters.
+      4. Tokens no hyphenator can break are left unchanged.
+
+    Empty text and text containing IPA characters is returned untouched.
+    """
+    if not text or _IPA_RE.search(text):
+        return text
+
+    normalized = _try_merge_pipe_gaps(text.replace('|', ''), hyph_de)
+
+    def _render(token: str) -> str:
+        # Separators (whitespace / , ; :) and empty split artefacts pass through
+        if not token or re.match(r'^[\s,;:]+$', token):
+            return token
+
+        # Peel non-letter characters off both ends; only the core is looked up
+        parts = re.match(r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', token)
+        if not parts:
+            return token
+        prefix, core, suffix = parts.groups()
+
+        if len(core) < 3 or not re.search(r'[a-zA-ZäöüÄÖÜß]', core):
+            return token
+
+        syllabified = _hyphenate_word(core, hyph_de, hyph_en)
+        return prefix + syllabified + suffix if syllabified else token
+
+    pieces = re.split(r'(\s+|[,;:]+\s*)', normalized)
+    return ''.join(_render(piece) for piece in pieces)
+

 def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
 ) -> int:
-    """Insert pipe syllable dividers into dictionary cells where CV confirms them.
+    """Insert pipe syllable dividers into dictionary cells.

-    For each cell on a dictionary page:
-      1. Check if ANY word_box has CV-detected pipe lines
-      2. If yes, apply pyphen to EACH word (≥4 chars) in the cell
-      3. Try DE hyphenation first, then EN
+    For dictionary pages: process all content column cells, strip existing
+    pipes, merge pipe-gap spaces, and re-syllabify using pyphen.
+
+    Pre-check: at least 1% of content cells must already contain ``|`` from
+    OCR.  This guards against pages with zero pipe characters (the primary
+    guard — article_col_index — is checked at the call site).

    Returns the number of cells modified.
    """
-    try:
-        import pyphen
-    except ImportError:
+    hyph_de, hyph_en = _get_hyphenators()
+    if hyph_de is None:
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

-    _hyph_de = pyphen.Pyphen(lang='de_DE')
-    _hyph_en = pyphen.Pyphen(lang='en_US')
-    img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+    # Pre-check: count cells that already have | from OCR.
+    # Real dictionary pages with printed syllable dividers will have OCR-
+    # detected pipes in many cells.  Pages without syllable dividers will
+    # have zero — skip those to avoid false syllabification.
+    total_col_cells = 0
+    cells_with_pipes = 0
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            if cell.get("col_type", "").startswith("column_"):
+                total_col_cells += 1
+                if "|" in cell.get("text", ""):
+                    cells_with_pipes += 1
+
+    if total_col_cells > 0:
+        pipe_ratio = cells_with_pipes / total_col_cells
+        if pipe_ratio < 0.01:
+            logger.info(
+                "build-grid session %s: skipping syllable insertion — "
+                "only %.1f%% of cells have existing pipes (need >=1%%)",
+                session_id, pipe_ratio * 100,
+            )
+            return 0

    insertions = 0
    for z in zones_data:
@@ -109,47 +241,18 @@ def insert_syllable_dividers(
            if not ct.startswith("column_"):
                continue
            text = cell.get("text", "")
-            if not text or "|" in text:
-                continue
-            if _IPA_RE.search(text):
+            if not text:
                continue

-            # CV gate: check if ANY word_box in this cell has pipe lines
-            wbs = cell.get("word_boxes") or []
-            if not any(_word_has_pipe_lines(img_gray, wb) for wb in wbs):
-                continue
-
-            # Apply pyphen to each significant word in the cell
-            tokens = re.split(r'(\s+|[,;]+\s*)', text)
-            new_tokens = []
-            changed = False
-            for tok in tokens:
-                # Skip whitespace/punctuation separators
-                if re.match(r'^[\s,;]+$', tok):
-                    new_tokens.append(tok)
-                    continue
-                # Only hyphenate words ≥ 4 alpha chars
-                clean = re.sub(r'[().\-]', '', tok)
-                if len(clean) < 4 or not re.search(r'[a-zA-ZäöüÄÖÜß]', clean):
-                    new_tokens.append(tok)
-                    continue
-                # Try DE first, then EN
-                hyph = _hyph_de.inserted(tok, hyphen='|')
-                if '|' not in hyph:
-                    hyph = _hyph_en.inserted(tok, hyphen='|')
-                if '|' in hyph and hyph != tok:
-                    new_tokens.append(hyph)
-                    changed = True
-                else:
-                    new_tokens.append(tok)
-            if changed:
-                cell["text"] = ''.join(new_tokens)
+            new_text = _syllabify_text(text, hyph_de, hyph_en)
+            if new_text != text:
+                cell["text"] = new_text
                insertions += 1

    if insertions:
        logger.info(
-            "build-grid session %s: inserted syllable dividers in %d cells "
-            "(CV-validated)",
+            "build-grid session %s: syllable dividers inserted/normalized "
+            "in %d cells (pyphen)",
            session_id, insertions,
        )
    return insertions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1456,10 +1456,15 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
        logger.warning("Dictionary detection failed: %s", e)

    # --- Syllable divider insertion for dictionary pages ---
-    # CV-validated: only inserts "|" where image shows thin vertical lines.
-    # See cv_syllable_detect.py for the detection + insertion logic.
+    # Only on confirmed dictionary pages with article columns (der/die/das).
+    # The article_col_index check avoids false positives on synonym lists,
+    # word frequency tables, and other alphabetically sorted non-dictionary pages.
+    # Additionally, insert_syllable_dividers has its own pre-check for existing
+    # pipe characters in cells (OCR must have already found some).
    syllable_insertions = 0
-    if dict_detection.get("is_dictionary") and img_bgr is not None:
+    if (dict_detection.get("is_dictionary")
+            and dict_detection.get("article_col_index") is not None
+            and img_bgr is not None):
        try:
            from cv_syllable_detect import insert_syllable_dividers
            syllable_insertions = insert_syllable_dividers(
--- a/klausur-service/backend/ocr_pipeline_session_store.py
+++ b/klausur-service/backend/ocr_pipeline_session_store.py
@@ -238,7 +238,7 @@ async def list_sessions_db(
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
-        where = "" if include_sub_sessions else "WHERE parent_session_id IS NULL"
+        where = "" if include_sub_sessions else "WHERE parent_session_id IS NULL AND (status IS NULL OR status != 'split')"
        rows = await conn.fetch(f"""
            SELECT id, name, filename, status, current_step,
                   document_category, doc_type,
--- a/klausur-service/backend/ocr_pipeline_sessions.py
+++ b/klausur-service/backend/ocr_pipeline_sessions.py
@@ -191,12 +191,12 @@ async def get_session_info(session_id: str):
    if session.get("ground_truth"):
        result["ground_truth"] = session["ground_truth"]

-    # Sub-session info
+    # Box sub-session info (zone_type='box' from column detection — NOT page-split)
    if session.get("parent_session_id"):
        result["parent_session_id"] = session["parent_session_id"]
        result["box_index"] = session.get("box_index")
    else:
-        # Check for sub-sessions
+        # Check for box sub-sessions (column detection creates these)
        subs = await get_sub_sessions(session_id)
        if subs:
            result["sub_sessions"] = [
--- a/klausur-service/backend/orientation_crop_api.py
+++ b/klausur-service/backend/orientation_crop_api.py
@@ -238,8 +238,8 @@ async def detect_page_split(session_id: str):
        "duration_seconds": round(duration, 2),
    }

-    # Mark parent session as split (store info in crop_result for backward compat)
-    await update_session_db(session_id, crop_result=split_info)
+    # Mark parent session as split and hidden from session list
+    await update_session_db(session_id, crop_result=split_info, status='split')
    cached["crop_result"] = split_info

    await _append_pipeline_log(session_id, "page_split", {
@@ -346,6 +346,7 @@ async def auto_crop(session_id: str):
            cropped_png=png_buf.tobytes() if ok else b"",
            crop_result=crop_info,
            current_step=5,
+            status='split',
        )

        logger.info(
@@ -461,8 +462,6 @@ async def _create_page_sub_sessions(
            name=sub_name,
            filename=parent_filename,
            original_png=page_png,
-            parent_session_id=parent_session_id,
-            box_index=pi,
        )

        # Pre-populate: set cropped = original (already cropped)
@@ -540,8 +539,6 @@ async def _create_page_sub_sessions_full(
            name=sub_name,
            filename=parent_filename,
            original_png=page_png,
-            parent_session_id=parent_session_id,
-            box_index=pi,
        )

        # start_step=2 → ready for deskew (orientation already done on spread)
@@ -553,7 +550,6 @@ async def _create_page_sub_sessions_full(
            "id": sub_id,
            "filename": parent_filename,
            "name": sub_name,
-            "parent_session_id": parent_session_id,
            "original_bgr": page_bgr,
            "oriented_bgr": None,
            "cropped_bgr": None,
Author	SHA1	Message	Date
Benjamin Admin	4feec7c7b7	Lower syllable pipe-ratio threshold from 5% to 1%. [CI: some checks failed — go-lint (push): skipped; python-lint (push): skipped; nodejs-lint (push): skipped; test-go-school (push): successful in 24s; test-go-edu-search (push): successful in 25s; test-python-klausur (push): failing after 1m58s; test-python-agent-core (push): successful in 15s; test-nodejs-website (push): successful in 16s.] Real dictionary pages have only ~3% OCR-detected pipes because the thin syllable divider lines are hard for OCR to read. The primary false-positive guard (article_col_index check) already blocks synonym dictionaries. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-24 23:17:08 +01:00
Benjamin Admin	ed7fc99fc4	Improve syllable divider insertion for dictionary pages. Rewrite cv_syllable_detect.py with pyphen-first approach: - Remove unreliable CV gate (morphological pipe detection) - Strip existing pipes and re-syllabify via pyphen (DE then EN) - Merge pipe-gap spaces where OCR split words at divider positions - Guard merges with function word blacklist and punctuation checks. Add false-positive prevention: - Pre-check: skip if <5% of cells have an existing "|" from OCR - Call-site check: require article_col_index (der/die/das column) - Prevents syllabification of synonym dictionaries and word lists. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-24 19:44:29 +01:00
Benjamin Admin	7fbcae954b	fix: auto-trigger orientation for page-split sessions without result Page-split sessions (start_step=1) have no orientation_result stored. StepOrientation now auto-runs orientation detection when loading an existing session that lacks a result. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-24 17:19:56 +01:00
Benjamin Admin	f931091b57	refactor: independent sessions for page-split + URL-based pipeline navigation Page-split now creates independent sessions (no parent_session_id), parent marked as status='split' and hidden from list. Navigation uses useSearchParams for URL-based step tracking (browser back/forward works). page.tsx reduced from 684 to 443 lines via usePipelineNavigation hook. Box sub-sessions (column detection) remain unchanged. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-24 17:05:33 +01:00