Compare commits


10 Commits

Author SHA1 Message Date
Benjamin Admin
21b69e06be Fix cross-column word assignment by splitting OCR merge artifacts
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 36s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 23s
When OCR merges adjacent words from different columns into one word box
(e.g. "sichzie" spanning Col 1+2, "dasZimmer" crossing boundary), the
grid builder assigned the entire merged word to one column.

New _split_cross_column_words() function splits these at column
boundaries using case transitions and spellchecker validation to
avoid false positives on real words like "oder", "Kabel", "Zeitung".

Regression: 12/12 GT sessions pass with diff=+0.
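The splitting idea can be sketched as follows. This is a hypothetical stand-in for `_split_cross_column_words()`, covering only the case-transition variant; `is_known` stands in for the spellchecker lookup:

```python
def split_merged_word(text, is_known):
    """Return (left, right) if a plausible split exists, else None.

    is_known: callable(str) -> bool, e.g. a spellchecker lookup.
    """
    # Words that are already valid ("oder", "Kabel", "Zeitung")
    # must never be split: that is the false-positive guard.
    if is_known(text):
        return None
    for i in range(1, len(text)):
        # Case transition, as in the "dasZimmer" example above.
        if text[i - 1].islower() and text[i].isupper():
            left, right = text[:i], text[i:]
            # Accept the split only if both halves validate.
            if is_known(left) and is_known(right):
                return left, right
    return None
```

An all-lowercase merge like "sichzie" has no case transition, so a real implementation would also split at the geometric column boundary; the spellchecker gate is what keeps genuine words intact.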

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-28 10:54:41 +01:00
Benjamin Admin
0168ab1a67 Remove Hauptseite/Box tabs from Kombi pipeline
Page-split now creates independent sessions that appear directly in
the session list. After split, the UI switches to the first child
session. BoxSessionTabs, sub-session state, and parent-child tracking
removed from Kombi code. Legacy ocr-overlay still uses BoxSessionTabs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 17:43:58 +01:00
Benjamin Admin
925f4356ce Use spellchecker instead of pyphen for pipe autocorrect validation
pyphen is a pattern-based hyphenator that accepts nonsense strings
like "Zeplpelin". Switch to spellchecker (frequency-based word list)
which correctly rejects garbled words and can suggest corrections.
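The difference can be illustrated with a toy validator. The word set below is a stand-in for pyspellchecker's frequency dictionary, and the pattern check is a stand-in for pyphen:

```python
# Pattern-based hyphenators assign break points to almost any letter
# string, so "is hyphenatable" is a poor validity test; a word-list
# lookup rejects strings that are simply not words.
KNOWN_DE = {"zeppelin", "zimmer", "zeitung"}

def looks_valid_by_patterns(word):
    # Stand-in for pyphen: nearly any alphabetic string "passes".
    return len(word) >= 2 and word.isalpha()

def looks_valid_by_wordlist(word):
    # Stand-in for a frequency-based spellchecker lookup.
    return word.lower() in KNOWN_DE

print(looks_valid_by_patterns("Zeplpelin"))   # True, a false positive
print(looks_valid_by_wordlist("Zeplpelin"))   # False, correctly rejected
print(looks_valid_by_wordlist("Zeppelin"))    # True
```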

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 16:47:42 +01:00
Benjamin Admin
cc4cb3bc2f Add pipe auto-correction and graphic artifact filter for grid builder
- autocorrect_pipe_artifacts(): strips OCR pipe artifacts from printed
  syllable dividers, validates with pyphen, tries char-deletion near
  pipe positions for garbled words (e.g. "Ze|plpe|lin" → "Zeppelin")
- Rule (a2): filters isolated non-alphanumeric word boxes (≤2 chars,
  no letters/digits) — catches small icons OCR'd as ">", "<" etc.
- Both fixes are generic: pyphen-validated, no session-specific logic
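The char-deletion repair can be sketched like this. The function name is hypothetical, and a plain `is_known` callable stands in for the pyphen validation the commit describes:

```python
def repair_piped_word(raw, is_known):
    """Strip OCR pipes; if the result is unknown, try deleting one
    character near each former pipe position."""
    pipe_positions = [i for i, c in enumerate(raw) if c == '|']
    word = raw.replace('|', '')
    if is_known(word):
        return word
    # Pipe indices shift left once earlier pipes are removed.
    clean_pos = [p - k for k, p in enumerate(pipe_positions)]
    for p in clean_pos:
        # Try deleting a single character adjacent to the pipe site.
        for j in (p - 1, p, p + 1):
            if 0 <= j < len(word):
                candidate = word[:j] + word[j + 1:]
                if is_known(candidate):
                    return candidate
    return None
```

With a validator that accepts "Zeppelin", the garbled "Ze|plpe|lin" repairs to "Zeppelin" by deleting the stray "l" next to the first pipe.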

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 16:33:38 +01:00
Benjamin Admin
0685fb12da Fix Bug 3: recover OCR-lost prefixes via overlap merge + chain merging
When OCR merge expands a prefix word box (e.g. "zer" w=42 → w=104),
it heavily overlaps (>75%) with the next fragment ("brech"). The grid
builder's overlap filter previously removed the prefix as a duplicate.

Fix: when overlap > 75% but both boxes are alphabetic with different
text and one is ≤ 4 chars, merge instead of removing. Also enable
chain merging via merge_parent tracking so "zer" + "brech" + "lich"
→ "zerbrechlich" in a single pass.
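The merge rule reads roughly like this in sketch form (assumed dict-based boxes with `x`/`w`/`text`; the grid builder's real types differ, and chain merging via merge_parent is elided):

```python
def overlap_ratio(a, b):
    """Horizontal overlap relative to the smaller box."""
    left = max(a['x'], b['x'])
    right = min(a['x'] + a['w'], b['x'] + b['w'])
    inter = max(0, right - left)
    return inter / min(a['w'], b['w'])

def should_merge(a, b):
    """Merge instead of dropping one box as a duplicate."""
    return (overlap_ratio(a, b) > 0.75
            and a['text'].isalpha() and b['text'].isalpha()
            and a['text'] != b['text']
            and min(len(a['text']), len(b['text'])) <= 4)

zer = {'x': 0, 'w': 104, 'text': 'zer'}      # prefix box widened by OCR merge
brech = {'x': 40, 'w': 70, 'text': 'brech'}  # heavily overlapped fragment
print(should_merge(zer, brech))  # True: "zer" + "brech" merge
```

Identical text still falls through to the old duplicate-removal path, which is what the `a['text'] != b['text']` term preserves.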

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 15:49:52 +01:00
Benjamin Admin
96ea23164d Fix word-gap merge: add missing pronouns to stop words, reduce threshold
- Add du/dich/dir/mich/mir/uns/euch/ihm/ihn to _STOP_WORDS to prevent
  false merges like "du" + "zerlegst" → "duzerlegst"
- Reduce max_short threshold from 6 to 5 to prevent merging multi-word
  phrases like "ziehen lassen" → "ziehenlassen"
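Both guards can be sketched together. The function name is hypothetical; only the added stop words and the 5-char threshold come from this commit:

```python
STOP_WORDS = {"du", "dich", "dir", "mich", "mir", "uns", "euch",
              "ihm", "ihn", "der", "die", "das"}  # excerpt of _STOP_WORDS
MAX_SHORT = 5  # reduced from 6 in this commit

def should_merge_gap(left, right):
    """Merge two adjacent fragments only when the left part looks
    like an OCR-split prefix rather than a standalone word."""
    if left.lower() in STOP_WORDS:
        return False  # "du" + "zerlegst" stays two words
    if len(left) > MAX_SHORT:
        return False  # "ziehen" + "lassen" stays a phrase
    return True

print(should_merge_gap("du", "zerlegst"))    # False
print(should_merge_gap("ziehen", "lassen"))  # False (6 chars > 5)
print(should_merge_gap("zer", "knittert"))   # True
```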

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 15:35:12 +01:00
Benjamin Admin
a8773d5b00 Fix 4 Grid Editor bugs: syllable modes, heading detection, word gaps
1. Syllable "Original" (auto) mode: only normalize cells that already
   have | from OCR — don't add new syllable marks via pyphen to words
   without printed dividers on the original scan.

2. Syllable "Aus" (none) mode: strip residual | chars from OCR text
   so cells display clean (e.g. "Zel|le" → "Zelle").

3. Heading detection: add text length guard in single-cell heuristic —
   words > 4 alpha chars starting lowercase (like "zentral") are regular
   vocabulary, not section headings.

4. Word-gap merge: new merge_word_gaps_in_zones() step with relaxed
   threshold (6 chars) fixes OCR splits like "zerknit tert" → "zerknittert".
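Fixes 1 and 2 amount to a small mode switch, sketched here with plain-string cells (hypothetical names; the Grid Editor's real cell type and normalization differ):

```python
def apply_syllable_mode(cell, mode, rehyphenate=lambda w: w):
    if mode == "none":    # "Aus": strip residual OCR pipes
        return cell.replace("|", "")
    if mode == "auto":    # "Original": only touch cells that already
        if "|" in cell:   # carry printed dividers from the scan
            return rehyphenate(cell.replace("|", ""))
        return cell       # never add new marks via pyphen
    return cell

print(apply_syllable_mode("Zel|le", "none"))  # "Zelle"
print(apply_syllable_mode("Wort", "auto"))    # "Wort", no marks invented
```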

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 15:24:35 +01:00
Benjamin Admin
9f68bd3425 feat: Implement page-split step with auto-detection and sub-session naming
StepPageSplit now:
- Auto-calls POST /page-split on step entry
- Shows oriented image + detection result
- If double page: creates sub-sessions named "Title — S. 1/2"
- If single page: green badge "keine Trennung noetig"
- Manual "Weiter" button (no auto-advance)

Also:
- StepOrientation wrapper simplified (no page-split in orientation)
- StepUpload passes name back via onUploaded(sid, name)
- page.tsx: after page-split "Weiter" switches to first sub-session
- useKombiPipeline exposes setSessionName

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 17:56:45 +01:00
Benjamin Admin
469f09d1e1 fix: Redesign StepUpload for manual step control
StepUpload now has 3 phases:
1. File selection: drop zone / file picker → shows preview
2. Review: title input, category, file info → "Hochladen" button
3. Uploaded: shows session image → "Weiter" button

No more auto-advance after upload. User controls every step.
openSession() removed from onUploaded callback to prevent
step-reset race condition.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 17:35:36 +01:00
Benjamin Admin
3bb04b25ab fix: OCR Kombi upload race condition — openSession was resetting step to 0
openSession mapped dbStep=1 to uiStep=0 (upload), overriding handleNext's
advancement to step 1. Fix: sessions always exist post-upload, so always
skip past the upload step in openSession.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 17:10:04 +01:00
9 changed files with 844 additions and 227 deletions

View File

@@ -2,7 +2,6 @@
import { Suspense } from 'react'
import { PagePurpose } from '@/components/common/PagePurpose'
import { BoxSessionTabs } from '@/components/ocr-pipeline/BoxSessionTabs'
import { KombiStepper } from '@/components/ocr-kombi/KombiStepper'
import { SessionList } from '@/components/ocr-kombi/SessionList'
import { SessionHeader } from '@/components/ocr-kombi/SessionHeader'
@@ -27,8 +26,6 @@ function OcrKombiContent() {
loadingSessions,
activeCategory,
isGroundTruth,
subSessions,
parentSessionId,
steps,
gridSaveRef,
groupedSessions,
@@ -40,11 +37,8 @@ function OcrKombiContent() {
deleteSession,
renameSession,
updateCategory,
handleOrientationComplete,
handleSessionChange,
setSessionId,
setSubSessions,
setParentSessionId,
setSessionName,
setIsGroundTruth,
} = useKombiPipeline()
@@ -53,19 +47,20 @@ function OcrKombiContent() {
case 0:
return (
<StepUpload
onUploaded={(sid) => {
sessionId={sessionId}
onUploaded={(sid, name) => {
setSessionId(sid)
setSessionName(name)
loadSessions()
openSession(sid)
handleNext()
}}
onNext={handleNext}
/>
)
case 1:
return (
<StepOrientation
sessionId={sessionId}
onNext={handleOrientationComplete}
onNext={() => handleNext()}
onSessionList={() => { loadSessions(); handleNewSession() }}
/>
)
@@ -73,10 +68,13 @@ function OcrKombiContent() {
return (
<StepPageSplit
sessionId={sessionId}
sessionName={sessionName}
onNext={handleNext}
onSubSessionsCreated={(subs) => {
setSubSessions(subs)
if (sessionId) setParentSessionId(sessionId)
onSplitComplete={(childId, childName) => {
// Switch to the first child session and refresh the list
setSessionId(childId)
setSessionName(childName)
loadSessions()
}}
/>
)
@@ -151,15 +149,6 @@ function OcrKombiContent() {
onStepClick={handleStepClick}
/>
{subSessions.length > 0 && parentSessionId && sessionId && (
<BoxSessionTabs
parentSessionId={parentSessionId}
subSessions={subSessions}
activeSessionId={sessionId}
onSessionChange={handleSessionChange}
/>
)}
<div className="min-h-[400px]">{renderStep()}</div>
</div>
)

View File

@@ -8,7 +8,6 @@ export { DOCUMENT_CATEGORIES } from '../ocr-pipeline/types'
export type {
SessionListItem,
SessionInfo,
SubSession,
OrientationResult,
CropResult,
DeskewResult,

View File

@@ -4,7 +4,7 @@ import { useCallback, useEffect, useState, useRef } from 'react'
import { useSearchParams } from 'next/navigation'
import type { PipelineStep, DocumentCategory } from './types'
import { KOMBI_V2_STEPS, dbStepToKombiV2Ui } from './types'
import type { SubSession, SessionListItem } from '../ocr-pipeline/types'
import type { SessionListItem } from '../ocr-pipeline/types'
export type { SessionListItem }
@@ -33,8 +33,6 @@ export function useKombiPipeline() {
const [loadingSessions, setLoadingSessions] = useState(true)
const [activeCategory, setActiveCategory] = useState<DocumentCategory | undefined>(undefined)
const [isGroundTruth, setIsGroundTruth] = useState(false)
const [subSessions, setSubSessions] = useState<SubSession[]>([])
const [parentSessionId, setParentSessionId] = useState<string | null>(null)
const [steps, setSteps] = useState<PipelineStep[]>(initSteps())
const searchParams = useSearchParams()
@@ -115,7 +113,7 @@ export function useKombiPipeline() {
// ---- Open session ----
const openSession = useCallback(async (sid: string, keepSubSessions?: boolean) => {
const openSession = useCallback(async (sid: string) => {
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`)
if (!res.ok) return
@@ -126,17 +124,6 @@ export function useKombiPipeline() {
setActiveCategory(data.document_category || undefined)
setIsGroundTruth(!!data.ground_truth?.build_grid_reference)
// Sub-session handling
if (data.sub_sessions?.length > 0) {
setSubSessions(data.sub_sessions)
setParentSessionId(sid)
} else if (data.parent_session_id) {
setParentSessionId(data.parent_session_id)
} else if (!keepSubSessions) {
setSubSessions([])
setParentSessionId(null)
}
// Determine UI step from DB state
const dbStep = data.current_step || 1
const hasGrid = !!data.grid_editor_result
@@ -154,27 +141,15 @@ export function useKombiPipeline() {
uiStep = dbStepToKombiV2Ui(dbStep)
}
// For sessions that already have an upload, skip the upload step
if (uiStep === 0 && dbStep >= 2) {
// Sessions only exist after upload, so always skip the upload step
if (uiStep === 0) {
uiStep = 1
}
const skipIds: string[] = []
const isSubSession = !!data.parent_session_id
if (isSubSession && dbStep >= 5) {
skipIds.push('upload', 'orientation', 'page-split', 'deskew', 'dewarp', 'content-crop')
if (uiStep < 6) uiStep = 6
} else if (isSubSession && dbStep >= 2) {
skipIds.push('upload', 'orientation')
if (uiStep < 2) uiStep = 2
}
setSteps(
KOMBI_V2_STEPS.map((s, i) => ({
...s,
status: skipIds.includes(s.id)
? 'skipped'
: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
status: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
})),
)
setCurrentStep(uiStep)
@@ -226,8 +201,6 @@ export function useKombiPipeline() {
setSteps(initSteps())
setCurrentStep(0)
setSessionId(null)
setSubSessions([])
setParentSessionId(null)
loadSessions()
return
}
@@ -249,8 +222,6 @@ export function useKombiPipeline() {
setSessionId(null)
setSessionName('')
setCurrentStep(0)
setSubSessions([])
setParentSessionId(null)
setSteps(initSteps())
}, [])
@@ -292,40 +263,6 @@ export function useKombiPipeline() {
}
}, [sessionId])
// ---- Orientation completion (checks for page-split sub-sessions) ----
const handleOrientationComplete = useCallback(async (sid: string) => {
setSessionId(sid)
loadSessions()
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`)
if (res.ok) {
const data = await res.json()
if (data.sub_sessions?.length > 0) {
const subs: SubSession[] = data.sub_sessions.map((s: SubSession) => ({
id: s.id,
name: s.name,
box_index: s.box_index,
current_step: s.current_step,
}))
setSubSessions(subs)
setParentSessionId(sid)
openSession(subs[0].id, true)
return
}
}
} catch (e) {
console.error('Failed to check for sub-sessions:', e)
}
handleNext()
}, [loadSessions, openSession, handleNext])
const handleSessionChange = useCallback((newSessionId: string) => {
openSession(newSessionId, true)
}, [openSession])
return {
// State
currentStep,
@@ -335,8 +272,6 @@ export function useKombiPipeline() {
loadingSessions,
activeCategory,
isGroundTruth,
subSessions,
parentSessionId,
steps,
gridSaveRef,
// Computed
@@ -351,11 +286,8 @@ export function useKombiPipeline() {
deleteSession,
renameSession,
updateCategory,
handleOrientationComplete,
handleSessionChange,
setSessionId,
setSubSessions,
setParentSessionId,
setSessionName,
setIsGroundTruth,
}
}

View File

@@ -4,17 +4,17 @@ import { StepOrientation as BaseStepOrientation } from '@/components/ocr-pipelin
interface StepOrientationProps {
sessionId: string | null
onNext: (sessionId: string) => void
onNext: () => void
onSessionList: () => void
}
/** Thin wrapper around the shared StepOrientation component */
/** Thin wrapper — adapts the shared StepOrientation to the Kombi pipeline's simpler onNext() */
export function StepOrientation({ sessionId, onNext, onSessionList }: StepOrientationProps) {
return (
<BaseStepOrientation
key={sessionId}
sessionId={sessionId}
onNext={onNext}
onNext={() => onNext()}
onSessionList={onSessionList}
/>
)

View File

@@ -1,123 +1,198 @@
'use client'
import { useState, useEffect } from 'react'
import type { SubSession } from '@/app/(admin)/ai/ocr-pipeline/types'
import { useState, useEffect, useRef } from 'react'
const KLAUSUR_API = '/klausur-api'
interface PageSplitResult {
multi_page: boolean
page_count?: number
page_splits?: { x: number; y: number; width: number; height: number; page_index: number }[]
sub_sessions?: { id: string; name: string; page_index: number }[]
used_original?: boolean
duration_seconds?: number
}
interface StepPageSplitProps {
sessionId: string | null
sessionName: string
onNext: () => void
onSubSessionsCreated: (subs: SubSession[]) => void
onSplitComplete: (firstChildId: string, firstChildName: string) => void
}
/**
* Step 3: Page split detection.
* Checks if the image is a double-page spread and offers to split it.
* If no split needed, auto-advances.
*/
export function StepPageSplit({ sessionId, onNext, onSubSessionsCreated }: StepPageSplitProps) {
const [checking, setChecking] = useState(false)
const [splitResult, setSplitResult] = useState<{ is_double_page: boolean; pages?: number } | null>(null)
const [splitting, setSplitting] = useState(false)
export function StepPageSplit({ sessionId, sessionName, onNext, onSplitComplete }: StepPageSplitProps) {
const [detecting, setDetecting] = useState(false)
const [splitResult, setSplitResult] = useState<PageSplitResult | null>(null)
const [error, setError] = useState('')
const didDetect = useRef(false)
// Auto-detect page split when step opens
useEffect(() => {
if (!sessionId) return
// Auto-check for page split
checkPageSplit()
if (!sessionId || didDetect.current) return
didDetect.current = true
detectPageSplit()
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId])
const checkPageSplit = async () => {
const detectPageSplit = async () => {
if (!sessionId) return
setChecking(true)
setDetecting(true)
setError('')
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
if (!res.ok) throw new Error('Session nicht gefunden')
const data = await res.json()
// If sub-sessions already exist, this was already split
if (data.sub_sessions?.length > 0) {
onSubSessionsCreated(data.sub_sessions)
onNext()
return
// First check if this session was already split (status='split')
const sessionRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
if (sessionRes.ok) {
const sessionData = await sessionRes.json()
if (sessionData.status === 'split' && sessionData.crop_result?.multi_page) {
// Already split — find the child sessions in the session list
const listRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`)
if (listRes.ok) {
const listData = await listRes.json()
// Child sessions have names like "ParentName — Seite N"
const baseName = sessionName || sessionData.name || ''
const children = (listData.sessions || [])
.filter((s: { name?: string }) => s.name?.startsWith(baseName + ' — '))
.sort((a: { name: string }, b: { name: string }) => a.name.localeCompare(b.name))
if (children.length > 0) {
setSplitResult({
multi_page: true,
page_count: children.length,
sub_sessions: children.map((s: { id: string; name: string }, i: number) => ({
id: s.id, name: s.name, page_index: i,
})),
})
onSplitComplete(children[0].id, children[0].name)
setDetecting(false)
return
}
}
}
}
// Check aspect ratio to guess if double-page
// For now, just auto-advance (page-split detection happens in orientation step)
setSplitResult({ is_double_page: false })
// Auto-advance if single page
onNext()
} catch (e) {
setError(e instanceof Error ? e.message : String(e))
} finally {
setChecking(false)
}
}
const handleSplit = async () => {
if (!sessionId) return
setSplitting(true)
setError('')
try {
// Run page-split detection
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/page-split`, {
method: 'POST',
})
if (!res.ok) {
const data = await res.json().catch(() => ({}))
throw new Error(data.detail || 'Split fehlgeschlagen')
throw new Error(data.detail || 'Seitentrennung fehlgeschlagen')
}
const data = await res.json()
if (data.sub_sessions?.length > 0) {
onSubSessionsCreated(data.sub_sessions)
const data: PageSplitResult = await res.json()
setSplitResult(data)
if (data.multi_page && data.sub_sessions?.length) {
// Rename sub-sessions to "Title — S. 1", "Title — S. 2"
const baseName = sessionName || 'Dokument'
for (let i = 0; i < data.sub_sessions.length; i++) {
const sub = data.sub_sessions[i]
const newName = `${baseName} — S. ${i + 1}`
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sub.id}`, {
method: 'PUT',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: newName }),
}).catch(() => {})
sub.name = newName
}
// Signal parent to switch to the first child session
onSplitComplete(data.sub_sessions[0].id, data.sub_sessions[0].name)
}
onNext()
} catch (e) {
setError(e instanceof Error ? e.message : String(e))
} finally {
setSplitting(false)
setDetecting(false)
}
}
if (checking) {
return <div className="text-sm text-gray-500 py-8 text-center">Pruefe Seitenformat...</div>
}
if (!sessionId) return null
if (splitResult?.is_double_page) {
return (
<div className="space-y-4 p-6 bg-blue-50 dark:bg-blue-900/20 rounded-xl border border-blue-200 dark:border-blue-800">
<h3 className="text-sm font-medium text-blue-700 dark:text-blue-300">
Doppelseite erkannt
</h3>
<p className="text-sm text-blue-600 dark:text-blue-400">
Das Bild scheint eine Doppelseite zu sein. Soll es in zwei Einzelseiten aufgeteilt werden?
</p>
<div className="flex gap-2">
<button
onClick={handleSplit}
disabled={splitting}
className="px-4 py-2 bg-blue-600 text-white text-sm rounded-lg hover:bg-blue-700 disabled:opacity-50"
>
{splitting ? 'Wird aufgeteilt...' : 'Aufteilen'}
</button>
<button
onClick={onNext}
className="px-4 py-2 bg-gray-200 dark:bg-gray-700 text-sm rounded-lg hover:bg-gray-300"
>
Einzelseite beibehalten
</button>
</div>
{error && <div className="text-sm text-red-500">{error}</div>}
</div>
)
}
const imageUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented`
return (
<div className="text-sm text-gray-500 py-8 text-center">
Einzelseite erkannt — weiter zum naechsten Schritt.
{error && <div className="text-sm text-red-500 mt-2">{error}</div>}
<div className="space-y-4">
{/* Image */}
<div className="relative rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700">
{/* eslint-disable-next-line @next/next/no-img-element */}
<img
src={imageUrl}
alt="Orientiertes Bild"
className="w-full object-contain max-h-[500px]"
onError={(e) => {
// Fallback to non-oriented image
(e.target as HTMLImageElement).src =
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image`
}}
/>
</div>
{/* Detection status */}
{detecting && (
<div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
<div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
Doppelseiten-Erkennung laeuft...
</div>
)}
{/* Detection result */}
{splitResult && !detecting && (
splitResult.multi_page ? (
<div className="bg-blue-50 dark:bg-blue-900/20 rounded-lg border border-blue-200 dark:border-blue-700 p-4 space-y-2">
<div className="text-sm font-medium text-blue-700 dark:text-blue-300">
Doppelseite erkannt — {splitResult.page_count} Seiten getrennt
</div>
<p className="text-xs text-blue-600 dark:text-blue-400">
Jede Seite wird als eigene Session weiterverarbeitet (eigene Begradigung, Entzerrung, etc.).
{splitResult.used_original && ' Trennung auf Originalbild, da Orientierung die Doppelseite gedreht hat.'}
</p>
<div className="flex gap-2 mt-2">
{splitResult.sub_sessions?.map(s => (
<span
key={s.id}
className="text-xs px-2.5 py-1 rounded-md bg-blue-100 dark:bg-blue-800/40 text-blue-700 dark:text-blue-300 font-medium"
>
{s.name}
</span>
))}
</div>
{splitResult.duration_seconds != null && (
<div className="text-xs text-gray-400">{splitResult.duration_seconds.toFixed(1)}s</div>
)}
</div>
) : (
<div className="bg-green-50 dark:bg-green-900/20 rounded-lg border border-green-200 dark:border-green-800 p-4">
<div className="flex items-center gap-2 text-sm font-medium text-green-700 dark:text-green-300">
<span>&#10003;</span> Einzelseite — keine Trennung noetig
</div>
{splitResult.duration_seconds != null && (
<div className="text-xs text-gray-400 mt-1">{splitResult.duration_seconds.toFixed(1)}s</div>
)}
</div>
)
)}
{/* Error */}
{error && (
<div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 p-3 rounded-lg">
{error}
<button
onClick={() => { didDetect.current = false; detectPageSplit() }}
className="ml-2 text-teal-600 hover:underline"
>
Erneut versuchen
</button>
</div>
)}
{/* Next button — only show when detection is done */}
{(splitResult || error) && !detecting && (
<div className="flex justify-end">
<button
onClick={onNext}
className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
>
Weiter &rarr;
</button>
</div>
)}
</div>
)
}

View File

@@ -1,28 +1,52 @@
'use client'
import { useState, useCallback } from 'react'
import { useState, useCallback, useEffect } from 'react'
import { DOCUMENT_CATEGORIES, type DocumentCategory } from '@/app/(admin)/ai/ocr-pipeline/types'
const KLAUSUR_API = '/klausur-api'
interface StepUploadProps {
onUploaded: (sessionId: string) => void
sessionId: string | null
onUploaded: (sessionId: string, name: string) => void
onNext: () => void
}
export function StepUpload({ onUploaded }: StepUploadProps) {
export function StepUpload({ sessionId, onUploaded, onNext }: StepUploadProps) {
const [dragging, setDragging] = useState(false)
const [uploading, setUploading] = useState(false)
const [selectedFile, setSelectedFile] = useState<File | null>(null)
const [preview, setPreview] = useState<string | null>(null)
const [title, setTitle] = useState('')
const [category, setCategory] = useState<DocumentCategory>('vokabelseite')
const [error, setError] = useState('')
const handleUpload = useCallback(async (file: File) => {
// Clean up preview URL on unmount
useEffect(() => {
return () => { if (preview) URL.revokeObjectURL(preview) }
}, [preview])
const handleFileSelect = useCallback((file: File) => {
setSelectedFile(file)
setError('')
if (file.type.startsWith('image/')) {
setPreview(URL.createObjectURL(file))
} else {
setPreview(null)
}
// Auto-fill title from filename if empty
if (!title.trim()) {
setTitle(file.name.replace(/\.[^.]+$/, ''))
}
}, [title])
const handleUpload = useCallback(async () => {
if (!selectedFile) return
setUploading(true)
setError('')
try {
const formData = new FormData()
formData.append('file', file)
formData.append('file', selectedFile)
if (title.trim()) formData.append('name', title.trim())
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, {
@@ -47,26 +71,164 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
})
}
onUploaded(sid)
onUploaded(sid, title.trim() || selectedFile.name)
} catch (e) {
setError(e instanceof Error ? e.message : String(e))
} finally {
setUploading(false)
}
}, [title, category, onUploaded])
}, [selectedFile, title, category, onUploaded])
const handleDrop = useCallback((e: React.DragEvent) => {
e.preventDefault()
setDragging(false)
const file = e.dataTransfer.files[0]
if (file) handleUpload(file)
}, [handleUpload])
if (file) handleFileSelect(file)
}, [handleFileSelect])
const handleFileSelect = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
const handleInputChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
const file = e.target.files?.[0]
if (file) handleUpload(file)
}, [handleUpload])
if (file) handleFileSelect(file)
}, [handleFileSelect])
const clearFile = useCallback(() => {
setSelectedFile(null)
if (preview) URL.revokeObjectURL(preview)
setPreview(null)
}, [preview])
// ---- Phase 2: Uploaded → show result + "Weiter" ----
if (sessionId) {
return (
<div className="space-y-4">
<div className="bg-green-50 dark:bg-green-900/20 border border-green-200 dark:border-green-800 rounded-lg p-4">
<div className="flex items-center gap-2 text-green-700 dark:text-green-300 text-sm font-medium mb-3">
<span>&#10003;</span> Dokument hochgeladen
</div>
<div className="flex gap-4">
<div className="w-48 h-64 rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700 flex-shrink-0 border border-gray-200 dark:border-gray-600">
{/* eslint-disable-next-line @next/next/no-img-element */}
<img
src={`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image`}
alt="Hochgeladenes Dokument"
className="w-full h-full object-contain"
onError={(e) => { (e.target as HTMLImageElement).style.display = 'none' }}
/>
</div>
<div className="text-sm text-gray-600 dark:text-gray-400">
<div className="font-medium text-gray-700 dark:text-gray-300 mb-1">
{title || 'Dokument'}
</div>
<div className="text-xs text-gray-400 mt-1">
Kategorie: {DOCUMENT_CATEGORIES.find(c => c.value === category)?.label || category}
</div>
<div className="text-xs font-mono text-gray-400 mt-1">
Session: {sessionId.slice(0, 8)}...
</div>
</div>
</div>
</div>
<div className="flex justify-end">
<button
onClick={onNext}
className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
>
Weiter &rarr;
</button>
</div>
</div>
)
}
// ---- Phase 1b: File selected → preview + "Hochladen" ----
if (selectedFile) {
return (
<div className="space-y-4">
{/* Title input */}
<div>
<label className="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-1">
Titel
</label>
<input
type="text"
value={title}
onChange={(e) => setTitle(e.target.value)}
placeholder="z.B. Vokabeln Unit 3"
className="w-full px-3 py-2 border border-gray-300 dark:border-gray-600 rounded-lg bg-white dark:bg-gray-800 text-sm"
/>
</div>
{/* Category selector */}
<div>
<label className="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-1">
Kategorie
</label>
<div className="grid grid-cols-4 gap-1.5">
{DOCUMENT_CATEGORIES.map(cat => (
<button
key={cat.value}
onClick={() => setCategory(cat.value)}
className={`text-xs px-2 py-1.5 rounded-md text-left transition-colors ${
category === cat.value
? 'bg-teal-100 dark:bg-teal-900/40 text-teal-700 dark:text-teal-300 ring-1 ring-teal-400'
: 'bg-gray-50 dark:bg-gray-700 text-gray-600 dark:text-gray-400 hover:bg-gray-100'
}`}
>
{cat.icon} {cat.label}
</button>
))}
</div>
</div>
{/* File preview */}
<div className="border border-gray-200 dark:border-gray-700 rounded-xl p-4">
<div className="flex items-start gap-4">
{preview ? (
<div className="w-36 h-48 rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700 flex-shrink-0 border border-gray-200 dark:border-gray-600">
{/* eslint-disable-next-line @next/next/no-img-element */}
<img src={preview} alt="Vorschau" className="w-full h-full object-contain" />
</div>
) : (
<div className="w-36 h-48 rounded-lg bg-gray-100 dark:bg-gray-700 flex-shrink-0 flex items-center justify-center border border-gray-200 dark:border-gray-600">
<span className="text-3xl">&#128196;</span>
</div>
)}
<div className="flex-1 min-w-0">
<div className="font-medium text-sm text-gray-700 dark:text-gray-300 truncate">
{selectedFile.name}
</div>
<div className="text-xs text-gray-400 mt-1">
{(selectedFile.size / 1024 / 1024).toFixed(1)} MB
</div>
<button
onClick={clearFile}
className="text-xs text-red-500 hover:text-red-700 mt-2"
>
Andere Datei waehlen
</button>
</div>
</div>
<button
onClick={handleUpload}
disabled={uploading}
className="mt-4 w-full px-4 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed transition-colors"
>
{uploading ? 'Wird hochgeladen...' : 'Hochladen'}
</button>
</div>
{error && (
<div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 p-3 rounded-lg">
{error}
</div>
)}
</div>
)
}
// ---- Phase 1a: No file → drop zone ----
return (
<div className="space-y-4">
{/* Title input */}
@@ -116,25 +278,19 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
: 'border-gray-300 dark:border-gray-600 hover:border-gray-400'
}`}
>
{uploading ? (
<div className="text-sm text-gray-500">Wird hochgeladen...</div>
) : (
<>
<div className="text-4xl mb-3">📤</div>
<div className="text-sm text-gray-600 dark:text-gray-400 mb-2">
Bild oder PDF hierher ziehen
</div>
<label className="inline-block px-4 py-2 bg-teal-600 text-white text-sm rounded-lg cursor-pointer hover:bg-teal-700">
Datei auswaehlen
<input
type="file"
accept="image/*,.pdf"
onChange={handleFileSelect}
className="hidden"
/>
</label>
</>
)}
<div className="text-4xl mb-3">&#128228;</div>
<div className="text-sm text-gray-600 dark:text-gray-400 mb-2">
Bild oder PDF hierher ziehen
</div>
<label className="inline-block px-4 py-2 bg-teal-600 text-white text-sm rounded-lg cursor-pointer hover:bg-teal-700">
Datei auswaehlen
<input
type="file"
accept="image/*,.pdf"
onChange={handleInputChange}
className="hidden"
/>
</label>
</div>
{error && (

View File

@@ -34,7 +34,8 @@ _STOP_WORDS = frozenset([
'der', 'die', 'das', 'dem', 'den', 'des',
'ein', 'eine', 'einem', 'einen', 'einer',
# Pronouns
'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
# Prepositions
'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
@@ -54,6 +55,9 @@ _STOP_WORDS = frozenset([
_hyph_de = None
_hyph_en = None
# Cached spellchecker (for autocorrect_pipe_artifacts)
_spell_de = None
def _get_hyphenators():
"""Lazy-load pyphen hyphenators (cached across calls)."""
@@ -69,6 +73,35 @@ def _get_hyphenators():
return _hyph_de, _hyph_en
def _get_spellchecker():
"""Lazy-load German spellchecker (cached across calls)."""
global _spell_de
if _spell_de is not None:
return _spell_de
try:
from spellchecker import SpellChecker
except ImportError:
return None
_spell_de = SpellChecker(language='de')
return _spell_de
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
"""Check whether pyphen recognises a word (DE or EN)."""
if len(word) < 2:
return False
return ('|' in hyph_de.inserted(word, hyphen='|')
or '|' in hyph_en.inserted(word, hyphen='|'))
def _is_real_word(word: str) -> bool:
"""Check whether spellchecker knows this word (case-insensitive)."""
spell = _get_spellchecker()
if spell is None:
return False
return word.lower() in spell
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
"""Try to hyphenate a word using DE then EN dictionary.
@@ -83,6 +116,139 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
return None
def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
"""Try to correct a word that has OCR pipe artifacts.
Printed syllable divider lines on dictionary pages confuse OCR:
the vertical stroke is often read as an extra character (commonly
``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
Sometimes OCR reads one divider as ``|`` and another as a letter,
so the garbled character may be far from any detected pipe.
Uses ``spellchecker`` (frequency-based word list) for validation —
unlike pyphen which is a pattern-based hyphenator and accepts
nonsense strings like "Zeplpelin".
Strategy:
1. Strip ``|`` — if spellchecker knows the result, done.
2. Try deleting each pipe-like character (l, I, 1, i, t).
OCR inserts extra chars that resemble vertical strokes.
3. Fall back to spellchecker's own ``correction()`` method.
4. Preserve the original casing of the first letter.
"""
stripped = word_with_pipes.replace('|', '')
if not stripped or len(stripped) < 3:
return stripped # too short to validate
# Step 1: if the stripped word is already a real word, done
if _is_real_word(stripped):
return stripped
# Step 2: try deleting pipe-like characters (most likely artifacts)
_PIPE_LIKE = frozenset('lI1it')
for idx in range(len(stripped)):
if stripped[idx] not in _PIPE_LIKE:
continue
candidate = stripped[:idx] + stripped[idx + 1:]
if len(candidate) >= 3 and _is_real_word(candidate):
return candidate
# Step 3: use spellchecker's built-in correction
spell = _get_spellchecker()
if spell is not None:
suggestion = spell.correction(stripped.lower())
if suggestion and suggestion != stripped.lower():
# Preserve original first-letter case
if stripped[0].isupper():
suggestion = suggestion[0].upper() + suggestion[1:]
return suggestion
return None # could not fix
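The strip-then-delete strategy in steps 1–2 can be sketched in isolation; the small word set below stands in for the frequency-based spellchecker and is an assumption for illustration only:

```python
from typing import Optional

# Toy lexicon standing in for the spellchecker's word list (assumption).
_TOY_LEXICON = {"zelle", "zeppelin", "zeitung"}
_PIPE_LIKE = "lI1it"  # characters OCR commonly emits for a vertical divider stroke

def fix_piped(word: str) -> Optional[str]:
    """Strip '|'; if the result is unknown, try deleting one pipe-like char."""
    stripped = word.replace("|", "")
    if len(stripped) < 3:
        return stripped  # too short to validate
    if stripped.lower() in _TOY_LEXICON:
        return stripped
    for idx, ch in enumerate(stripped):
        if ch not in _PIPE_LIKE:
            continue
        candidate = stripped[:idx] + stripped[idx + 1:]
        if len(candidate) >= 3 and candidate.lower() in _TOY_LEXICON:
            return candidate
    return None  # could not fix

print(fix_piped("Zel|le"))       # → Zelle
print(fix_piped("Ze|plpe|lin"))  # → Zeppelin
```

With a real spellchecker the lookup set is simply much larger; the control flow is the same.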
def autocorrect_pipe_artifacts(
zones_data: List[Dict], session_id: str,
) -> int:
"""Strip OCR pipe artifacts and correct garbled words in-place.
Printed syllable divider lines on dictionary scans are read by OCR
as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
This function:
1. Strips ``|`` from every word in content cells.
2. Validates with spellchecker (real dictionary lookup).
3. If not recognised, tries deleting pipe-like characters or uses
spellchecker's correction (e.g. ``Zeplpelin`` → ``Zeppelin``).
4. Updates both word-box texts and cell text.
Returns the number of cells modified.
"""
spell = _get_spellchecker()
if spell is None:
logger.warning("spellchecker not available — pipe autocorrect limited")
# Fall back: still strip pipes even without spellchecker
pass
modified = 0
for z in zones_data:
for cell in z.get("cells", []):
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
cell_changed = False
# --- Fix word boxes ---
for wb in cell.get("word_boxes", []):
wb_text = wb.get("text", "")
if "|" not in wb_text:
continue
# Separate trailing punctuation
m = re.match(
r'^([^a-zA-ZäöüÄÖÜßẞ]*)'
r'(.*?)'
r'([^a-zA-ZäöüÄÖÜßẞ]*)$',
wb_text,
)
if not m:
continue
lead, core, trail = m.group(1), m.group(2), m.group(3)
if "|" not in core:
continue
corrected = _autocorrect_piped_word(core)
if corrected is not None and corrected != core:
wb["text"] = lead + corrected + trail
cell_changed = True
# --- Rebuild cell text from word boxes ---
if cell_changed:
wbs = cell.get("word_boxes", [])
if wbs:
cell["text"] = " ".join(
(wb.get("text") or "") for wb in wbs
)
modified += 1
# --- Fallback: strip residual | from cell text ---
# (covers cases where word_boxes don't exist or weren't fixed)
text = cell.get("text", "")
if "|" in text:
clean = text.replace("|", "")
if clean != text:
cell["text"] = clean
if not cell_changed:
modified += 1
if modified:
logger.info(
"build-grid session %s: autocorrected pipe artifacts in %d cells",
session_id, modified,
)
return modified
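A minimal illustration of the in-place cell rewrite: pipes are stripped from each word box, then the cell text is rebuilt by joining the box texts. The cell dict below is hypothetical sample data in the zones_data shape used above:

```python
# Hypothetical content cell; real cells also carry geometry and confidence.
cell = {
    "col_type": "column_1",
    "text": "Zel|le und Kern",
    "word_boxes": [{"text": "Zel|le"}, {"text": "und"}, {"text": "Kern"}],
}

cell_changed = False
for wb in cell["word_boxes"]:
    if "|" in wb["text"]:
        wb["text"] = wb["text"].replace("|", "")
        cell_changed = True

if cell_changed:
    # Rebuild cell text from the corrected word boxes.
    cell["text"] = " ".join(wb["text"] for wb in cell["word_boxes"])

print(cell["text"])  # → Zelle und Kern
```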
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
"""Merge fragments separated by single spaces where OCR split at a pipe.
@@ -139,6 +305,93 @@ def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
return ' '.join(result)
def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
"""Merge OCR word-gap fragments in cell texts using pyphen validation.
OCR often splits words at syllable boundaries into separate word_boxes,
producing text like "zerknit tert" instead of "zerknittert". This
function tries to merge adjacent fragments in every content cell.
More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3)
but still guarded by pyphen dictionary lookup and stop-word exclusion.
Returns the number of cells modified.
"""
hyph_de, _ = _get_hyphenators()
if hyph_de is None:
return 0
modified = 0
for z in zones_data:
for cell in z.get("cells", []):
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
text = cell.get("text", "")
if not text or " " not in text:
continue
# Skip IPA cells
text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
if _IPA_RE.search(text_no_brackets):
continue
new_text = _try_merge_word_gaps(text, hyph_de)
if new_text != text:
cell["text"] = new_text
modified += 1
if modified:
logger.info(
"build-grid session %s: merged word gaps in %d cells",
session_id, modified,
)
return modified
def _try_merge_word_gaps(text: str, hyph_de) -> str:
"""Merge OCR word fragments with relaxed threshold (max_short=5).
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
(max_short=5 instead of 3). Still requires pyphen to recognize the
merged word.
"""
parts = text.split(' ')
if len(parts) < 2:
return text
result = [parts[0]]
i = 1
while i < len(parts):
prev = result[-1]
curr = parts[i]
prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
should_try = (
prev == prev_alpha
and prev_alpha and curr_alpha
and prev_alpha.lower() not in _STOP_WORDS
and curr_alpha.lower() not in _STOP_WORDS
and min(len(prev_alpha), len(curr_alpha)) <= 5
and len(prev_alpha) + len(curr_alpha) >= 4
)
if should_try:
merged_alpha = prev_alpha + curr_alpha
hyph = hyph_de.inserted(merged_alpha, hyphen='-')
if '-' in hyph:
result[-1] = prev + curr
i += 1
continue
result.append(curr)
i += 1
return ' '.join(result)
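The fragment-merge loop can be sketched standalone; a tiny known-word set stands in for the pyphen dictionary lookup (an assumption for illustration):

```python
import re

_KNOWN = {"zerknittert"}          # stand-in for the pyphen lookup (assumption)
_STOP = {"der", "die", "das", "und"}

def merge_gaps(text: str, max_short: int = 5) -> str:
    """Merge adjacent fragments when the concatenation is a known word."""
    parts = text.split(" ")
    result = [parts[0]]
    for curr in parts[1:]:
        prev = result[-1]
        pa = re.sub(r"[^a-zA-ZäöüÄÖÜß]", "", prev)
        ca = re.sub(r"[^a-zA-ZäöüÄÖÜß]", "", curr)
        if (prev == pa and pa and ca
                and pa.lower() not in _STOP and ca.lower() not in _STOP
                and min(len(pa), len(ca)) <= max_short
                and (pa + ca).lower() in _KNOWN):
            result[-1] = prev + curr  # merge into previous fragment
        else:
            result.append(curr)
    return " ".join(result)

print(merge_gaps("zerknit tert"))  # → zerknittert
print(merge_gaps("der Hund"))      # → der Hund (stop word, untouched)
```

The stop-word and length guards are what keep legitimate two-word phrases from being glued together.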
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
"""Syllabify all significant words in a text string.
@@ -259,6 +512,12 @@ def insert_syllable_dividers(
if not text:
continue
# In auto mode (force=False), only normalize cells that already
# have | from OCR (i.e. printed syllable dividers on the original
# scan). Don't add new syllable marks to other words.
if not force and "|" not in text:
continue
new_text = _syllabify_text(text, hyph_de, hyph_en)
if new_text != text:
cell["text"] = new_text

View File

@@ -1323,6 +1323,14 @@ async def _build_grid_core(
and wb.get("conf", 100) < 85):
to_remove.add(i)
# Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
# Small images/icons next to words get OCR'd as ">", "<", "~", etc.
# Remove word boxes that contain NO letters or digits.
for i, wb in enumerate(wbs):
t = (wb.get("text") or "").strip()
if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
to_remove.add(i)
# Rule (b) + (c): overlap and duplicate detection
# Sort by x for pairwise comparison
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
@@ -1353,6 +1361,19 @@ async def _build_grid_core(
to_merge.append((i1, i2))
continue
# High overlap (>75%) with different alphabetic text:
# OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104)
# causing it to heavily overlap with the next fragment ("brech").
# Merge instead of removing when one is a short prefix (≤4 chars)
# and the texts are different.
if (overlap_pct > 0.75
and _ALPHA_WORD_RE.match(t1)
and _ALPHA_WORD_RE.match(t2)
and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
to_merge.append((i1, i2))
continue
if overlap_pct <= 0.40:
continue # too little overlap and not alphabetic merge
@@ -1393,15 +1414,22 @@ async def _build_grid_core(
c2 = w2.get("conf", 50)
to_remove.add(i1 if c1 <= c2 else i2)
# Execute merges first (syllable-split words)
# Execute merges first (syllable-split words).
# Use merge_parent to support chain merging: if "zer" absorbed
# "brech" and then "brech"+"lich" is a merge pair, redirect to
# merge "lich" into "zer" → "zerbrechlich".
if to_merge:
merged_indices: set = set()
merge_parent: Dict[int, int] = {} # absorbed → absorber
for mi1, mi2 in to_merge:
if mi1 in to_remove or mi2 in to_remove:
continue # don't merge if one is being removed
if mi1 in merged_indices or mi2 in merged_indices:
continue # already merged
mw1, mw2 = wbs[mi1], wbs[mi2]
# Follow chain: if mi1 was absorbed, find root absorber
actual_mi1 = mi1
while actual_mi1 in merge_parent:
actual_mi1 = merge_parent[actual_mi1]
if actual_mi1 in to_remove or mi2 in to_remove:
continue
if mi2 in merge_parent:
continue # mi2 already absorbed
mw1, mw2 = wbs[actual_mi1], wbs[mi2]
# Concatenate text (no space — they're parts of one word)
mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
mt2 = (mw2.get("text") or "").strip()
@@ -1419,9 +1447,8 @@ async def _build_grid_core(
mw1["width"] = mr - mx
mw1["height"] = mb - my
mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
to_remove.add(mi2) # remove the second one
merged_indices.add(mi1)
merged_indices.add(mi2)
to_remove.add(mi2)
merge_parent[mi2] = actual_mi1
bullet_removed -= 1 # net: merge, not removal
if to_remove:
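The merge_parent chain redirection can be sketched on its own; the indices, texts, and merge pairs below are toy data:

```python
from typing import Dict

def resolve_root(idx: int, merge_parent: Dict[int, int]) -> int:
    """Follow absorbed→absorber links to the surviving box index."""
    while idx in merge_parent:
        idx = merge_parent[idx]
    return idx

# Toy word-box texts and the merge pairs an overlap pass might emit.
texts = {0: "zer", 1: "brech", 2: "lich"}
merge_parent: Dict[int, int] = {}
for a, b in [(0, 1), (1, 2)]:
    root = resolve_root(a, merge_parent)  # redirect if a was already absorbed
    if b in merge_parent:
        continue  # b already absorbed elsewhere
    texts[root] += texts[b]               # concatenate: parts of one word
    merge_parent[b] = root

print(texts[0])  # → zerbrechlich
```

Without the chain lookup, the second pair would try to merge "lich" into the already-absorbed index 1 and the final word would never reach the surviving box.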
@@ -1593,6 +1620,22 @@ async def _build_grid_core(
except Exception as e:
logger.warning("Dictionary detection failed: %s", e)
# --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
try:
from cv_syllable_detect import merge_word_gaps_in_zones
merge_word_gaps_in_zones(zones_data, session_id)
except Exception as e:
logger.warning("Word-gap merge failed: %s", e)
# --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
# Strips | from words, validates with pyphen, tries char-deletion for garbled
# words like "Ze|plpe|lin" → "Zeppelin".
try:
from cv_syllable_detect import autocorrect_pipe_artifacts
autocorrect_pipe_artifacts(zones_data, session_id)
except Exception as e:
logger.warning("Pipe autocorrect failed: %s", e)
# --- Syllable divider insertion for dictionary pages ---
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
# "all" = force on all content words, "en" = English column only,
@@ -1626,6 +1669,15 @@ async def _build_grid_core(
except Exception as e:
logger.warning("Syllable insertion failed: %s", e)
# When syllable mode is "none", strip any residual | from OCR so
# that the displayed text is clean (e.g. "Zel|le" → "Zelle").
if syllable_mode == "none":
for z in zones_data:
for cell in z.get("cells", []):
t = cell.get("text", "")
if "|" in t:
cell["text"] = t.replace("|", "")
# Clean up internal flags before returning
for z in zones_data:
for cell in z.get("cells", []):

View File

@@ -22,6 +22,148 @@ from cv_ocr_engines import _text_has_garbled_ipa
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Cross-column word splitting
# ---------------------------------------------------------------------------
_spell_cache: Optional[Any] = None
_spell_loaded = False
def _is_recognized_word(text: str) -> bool:
"""Check if *text* is a recognized German or English word.
Uses the spellchecker library (same as cv_syllable_detect.py).
Returns True for real words like "oder", "Kabel", "Zeitung".
Returns False for OCR merge artifacts like "sichzie", "dasZimmer".
"""
global _spell_cache, _spell_loaded
if not text or len(text) < 2:
return False
if not _spell_loaded:
_spell_loaded = True
try:
from spellchecker import SpellChecker
_spell_cache = SpellChecker(language="de")
except Exception:
pass
if _spell_cache is None:
return False
return text.lower() in _spell_cache
def _split_cross_column_words(
words: List[Dict],
columns: List[Dict],
) -> List[Dict]:
"""Split word boxes that span across column boundaries.
When OCR merges adjacent words from different columns (e.g. "sichzie"
spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
split the word box at the column boundary so each piece is assigned
to the correct column.
Only splits when:
- The word has significant overlap (>15% of its width) on both sides
- AND the word is not a recognized real word (OCR merge artifact), OR
the word contains a case transition (lowercase→uppercase) near the
boundary indicating two merged words like "dasZimmer".
"""
if len(columns) < 2:
return words
# Column boundaries = midpoints between adjacent column edges
boundaries = []
for i in range(len(columns) - 1):
boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2
boundaries.append(boundary)
new_words: List[Dict] = []
split_count = 0
for w in words:
w_left = w["left"]
w_width = w["width"]
w_right = w_left + w_width
text = (w.get("text") or "").strip()
if not text or len(text) < 4 or w_width < 10:
new_words.append(w)
continue
# Find the first boundary this word straddles significantly
split_boundary = None
for b in boundaries:
if w_left < b < w_right:
left_part = b - w_left
right_part = w_right - b
# Both sides must have at least 15% of the word width
if left_part > w_width * 0.15 and right_part > w_width * 0.15:
split_boundary = b
break
if split_boundary is None:
new_words.append(w)
continue
# Compute approximate split position in the text.
left_width = split_boundary - w_left
split_ratio = left_width / w_width
approx_pos = len(text) * split_ratio
# Strategy 1: look for a case transition (lowercase→uppercase) near
# the approximate split point — e.g. "dasZimmer" splits at 'Z'.
split_char = None
search_lo = max(1, int(approx_pos) - 3)
search_hi = min(len(text), int(approx_pos) + 2)
for i in range(search_lo, search_hi):
if text[i - 1].islower() and text[i].isupper():
split_char = i
break
# Strategy 2: if no case transition, only split if the whole word
# is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
# Real words like "oder", "Kabel", "Zeitung" must not be split.
if split_char is None:
clean = re.sub(r"[,;:.!?]+$", "", text) # strip trailing punct
if _is_recognized_word(clean):
new_words.append(w)
continue
# Not a real word — use floor of proportional position
split_char = max(1, min(len(text) - 1, int(approx_pos)))
left_text = text[:split_char].rstrip()
right_text = text[split_char:].lstrip()
if len(left_text) < 2 or len(right_text) < 2:
new_words.append(w)
continue
right_width = w_width - round(left_width)
new_words.append({
**w,
"text": left_text,
"width": round(left_width),
})
new_words.append({
**w,
"text": right_text,
"left": round(split_boundary),
"width": right_width,
})
split_count += 1
logger.info(
"split cross-column word %r → %r + %r at boundary %.0f",
text, left_text, right_text, split_boundary,
)
if split_count:
logger.info("split %d cross-column word(s)", split_count)
return new_words
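Strategy 1 (the case-transition search near the proportional split point) can be sketched as a standalone helper; the function name is hypothetical:

```python
from typing import Optional, Tuple

def split_at_case_transition(text: str, ratio: float) -> Optional[Tuple[str, str]]:
    """Find a lowercase→uppercase transition near len(text) * ratio."""
    approx = len(text) * ratio
    lo = max(1, int(approx) - 3)
    hi = min(len(text), int(approx) + 2)
    for i in range(lo, hi):
        if text[i - 1].islower() and text[i].isupper():
            return text[:i], text[i:]
    return None

print(split_at_case_transition("dasZimmer", 0.35))  # → ('das', 'Zimmer')
print(split_at_case_transition("Zeitung", 0.5))     # → None
```

When this returns None, the code above falls back to the spellchecker check before resorting to a purely proportional split.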
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
"""Remove page-border decoration strip words BEFORE column detection.
@@ -912,6 +1054,13 @@ def _detect_heading_rows_by_single_cell(
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
continue
# Guard: dictionary section headings are short (1-4 alpha chars
# like "A", "Ab", "Zi", "Sch"). Longer text that starts
# lowercase is a regular vocabulary word (e.g. "zentral") that
# happens to appear alone in its row.
alpha_only = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', text)
if len(alpha_only) > 4 and text[0].islower():
continue
heading_row_indices.append(ri)
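The heading guard reduces to a small predicate (simplified sketch; the name is hypothetical and assumes non-empty input):

```python
import re

def rejected_as_heading(text: str) -> bool:
    """True if the guard skips this candidate: more than 4 alphabetic chars
    AND a lowercase first letter → ordinary vocabulary word, not a heading."""
    alpha_only = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', text)
    return len(alpha_only) > 4 and text[0].islower()

print(rejected_as_heading("zentral"))  # → True  (vocabulary word, skipped)
print(rejected_as_heading("Sch"))      # → False (stays a heading candidate)
```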
# Guard: if >25% of eligible rows would become headings, the
@@ -1104,6 +1253,12 @@ def _build_zone_grid(
"header_rows": [],
}
# Split word boxes that straddle column boundaries (e.g. "sichzie"
# spanning Col 1 + Col 2). Must happen after column detection and
# before cell assignment.
if len(columns) >= 2:
zone_words = _split_cross_column_words(zone_words, columns)
# Build cells
cells = _build_cells(zone_words, columns, rows, img_w, img_h)