Fix word-gap merge: add missing pronouns to stop words, reduce threshold

- Add du/dich/dir/mich/mir/uns/euch/ihm/ihn to _STOP_WORDS to prevent false merges like "du" + "zerlegst" → "duzerlegst" - Reduce max_short threshold from 6 to 5 to prevent merging multi-word phrases like "ziehen lassen" → "ziehenlassen" Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Fix 4 Grid Editor bugs: syllable modes, heading detection, word gaps
2026-03-27 15:35:12 +01:00 · 2026-03-27 15:24:35 +01:00 · 2026-03-26 17:56:45 +01:00 · 2026-03-26 17:35:36 +01:00 · 2026-03-26 17:10:04 +01:00
8 changed files with 486 additions and 124 deletions
--- a/admin-lehrer/app/(admin)/ai/ocr-kombi/page.tsx
+++ b/admin-lehrer/app/(admin)/ai/ocr-kombi/page.tsx
@@ -40,9 +40,9 @@ function OcrKombiContent() {
    deleteSession,
    renameSession,
    updateCategory,
    handleOrientationComplete,
    handleSessionChange,
    setSessionId,
    setSessionName,
    setSubSessions,
    setParentSessionId,
    setIsGroundTruth,
@@ -53,19 +53,20 @@ function OcrKombiContent() {
      case 0:
        return (
          <StepUpload
-            onUploaded={(sid) => {
+            sessionId={sessionId}
            onUploaded={(sid, name) => {
              setSessionId(sid)
              setSessionName(name)
              loadSessions()
              openSession(sid)
              handleNext()
            }}
            onNext={handleNext}
          />
        )
      case 1:
        return (
          <StepOrientation
            sessionId={sessionId}
-            onNext={handleOrientationComplete}
+            onNext={() => handleNext()}
            onSessionList={() => { loadSessions(); handleNewSession() }}
          />
        )
@@ -73,10 +74,19 @@ function OcrKombiContent() {
        return (
          <StepPageSplit
            sessionId={sessionId}
-            onNext={handleNext}
+            sessionName={sessionName}
            onNext={() => {
              // If sub-sessions were created, switch to the first one
              if (subSessions.length > 0) {
                setSessionId(subSessions[0].id)
                setSessionName(subSessions[0].name)
              }
              handleNext()
            }}
            onSubSessionsCreated={(subs) => {
              setSubSessions(subs)
              if (sessionId) setParentSessionId(sessionId)
              loadSessions()
            }}
          />
        )
--- a/admin-lehrer/app/(admin)/ai/ocr-kombi/useKombiPipeline.ts
+++ b/admin-lehrer/app/(admin)/ai/ocr-kombi/useKombiPipeline.ts
@@ -154,8 +154,8 @@ export function useKombiPipeline() {
        uiStep = dbStepToKombiV2Ui(dbStep)
      }
-      // For sessions that already have an upload, skip the upload step
+      // Sessions only exist after upload, so always skip the upload step
-      if (uiStep === 0 && dbStep >= 2) {
+      if (uiStep === 0) {
        uiStep = 1
      }
@@ -356,6 +356,7 @@ export function useKombiPipeline() {
    setSessionId,
    setSubSessions,
    setParentSessionId,
    setSessionName,
    setIsGroundTruth,
  }
 }
--- a/admin-lehrer/components/ocr-kombi/StepOrientation.tsx
+++ b/admin-lehrer/components/ocr-kombi/StepOrientation.tsx
@@ -4,17 +4,17 @@ import { StepOrientation as BaseStepOrientation } from '@/components/ocr-pipelin
 interface StepOrientationProps {
  sessionId: string | null
-  onNext: (sessionId: string) => void
+  onNext: () => void
  onSessionList: () => void
 }
-/** Thin wrapper around the shared StepOrientation component */
+/** Thin wrapper — adapts the shared StepOrientation to the Kombi pipeline's simpler onNext() */
 export function StepOrientation({ sessionId, onNext, onSessionList }: StepOrientationProps) {
  return (
    <BaseStepOrientation
      key={sessionId}
      sessionId={sessionId}
-      onNext={onNext}
+      onNext={() => onNext()}
      onSessionList={onSessionList}
    />
  )
--- a/admin-lehrer/components/ocr-kombi/StepPageSplit.tsx
+++ b/admin-lehrer/components/ocr-kombi/StepPageSplit.tsx
@@ -1,123 +1,201 @@
 'use client'
-import { useState, useEffect } from 'react'
+import { useState, useEffect, useRef } from 'react'
 import type { SubSession } from '@/app/(admin)/ai/ocr-pipeline/types'
 const KLAUSUR_API = '/klausur-api'
 interface PageSplitResult {
  multi_page: boolean
  page_count?: number
  page_splits?: { x: number; y: number; width: number; height: number; page_index: number }[]
  sub_sessions?: { id: string; name: string; page_index: number }[]
  used_original?: boolean
  duration_seconds?: number
 }
 interface StepPageSplitProps {
  sessionId: string | null
  sessionName: string
  onNext: () => void
  onSubSessionsCreated: (subs: SubSession[]) => void
 }
-/**
+export function StepPageSplit({ sessionId, sessionName, onNext, onSubSessionsCreated }: StepPageSplitProps) {
- * Step 3: Page split detection.
+  const [detecting, setDetecting] = useState(false)
- * Checks if the image is a double-page spread and offers to split it.
+  const [splitResult, setSplitResult] = useState<PageSplitResult | null>(null)
 * If no split needed, auto-advances.
 */
 export function StepPageSplit({ sessionId, onNext, onSubSessionsCreated }: StepPageSplitProps) {
  const [checking, setChecking] = useState(false)
  const [splitResult, setSplitResult] = useState<{ is_double_page: boolean; pages?: number } | null>(null)
  const [splitting, setSplitting] = useState(false)
  const [error, setError] = useState('')
  const didDetect = useRef(false)
  // Auto-detect page split when step opens
  useEffect(() => {
-    if (!sessionId) return
+    if (!sessionId || didDetect.current) return
-    // Auto-check for page split
+    didDetect.current = true
-    checkPageSplit()
+    detectPageSplit()
  // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [sessionId])
-  const checkPageSplit = async () => {
+  const detectPageSplit = async () => {
    if (!sessionId) return
-    setChecking(true)
+    setDetecting(true)
    setError('')
    try {
-      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
+      // First check if sub-sessions already exist
-      if (!res.ok) throw new Error('Session nicht gefunden')
+      const sessionRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
-      const data = await res.json()
+      if (sessionRes.ok) {
-
+        const sessionData = await sessionRes.json()
-      // If sub-sessions already exist, this was already split
+        if (sessionData.sub_sessions?.length > 0) {
-      if (data.sub_sessions?.length > 0) {
+          // Already split — show existing sub-sessions
-        onSubSessionsCreated(data.sub_sessions)
+          const subs = sessionData.sub_sessions as { id: string; name: string; page_index?: number; box_index?: number; current_step?: number }[]
-        onNext()
+          setSplitResult({
-        return
+            multi_page: true,
            page_count: subs.length,
            sub_sessions: subs.map((s: { id: string; name: string; page_index?: number; box_index?: number }) => ({
              id: s.id,
              name: s.name,
              page_index: s.page_index ?? s.box_index ?? 0,
            })),
          })
          onSubSessionsCreated(subs.map((s: { id: string; name: string; page_index?: number; box_index?: number; current_step?: number }) => ({
            id: s.id,
            name: s.name,
            box_index: s.page_index ?? s.box_index ?? 0,
            current_step: s.current_step ?? 2,
          })))
          setDetecting(false)
          return
        }
      }
-      // Check aspect ratio to guess if double-page
+      // Run page-split detection
      // For now, just auto-advance (page-split detection happens in orientation step)
      setSplitResult({ is_double_page: false })
      // Auto-advance if single page
      onNext()
    } catch (e) {
      setError(e instanceof Error ? e.message : String(e))
    } finally {
      setChecking(false)
    }
  }
  const handleSplit = async () => {
    if (!sessionId) return
    setSplitting(true)
    setError('')
    try {
      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/page-split`, {
        method: 'POST',
      })
      if (!res.ok) {
        const data = await res.json().catch(() => ({}))
-        throw new Error(data.detail || 'Split fehlgeschlagen')
+        throw new Error(data.detail || 'Seitentrennung fehlgeschlagen')
      }
-      const data = await res.json()
+      const data: PageSplitResult = await res.json()
-      if (data.sub_sessions?.length > 0) {
+      setSplitResult(data)
-        onSubSessionsCreated(data.sub_sessions)
+
      if (data.multi_page && data.sub_sessions?.length) {
        // Rename sub-sessions to "Title — S. 1", "Title — S. 2"
        const baseName = sessionName || 'Dokument'
        for (let i = 0; i < data.sub_sessions.length; i++) {
          const sub = data.sub_sessions[i]
          const newName = `${baseName} — S. ${i + 1}`
          await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sub.id}`, {
            method: 'PUT',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ name: newName }),
          }).catch(() => {})
          sub.name = newName
        }
        onSubSessionsCreated(data.sub_sessions.map(s => ({
          id: s.id,
          name: s.name,
          box_index: s.page_index,
          current_step: 2,
        })))
      }
      onNext()
    } catch (e) {
      setError(e instanceof Error ? e.message : String(e))
    } finally {
-      setSplitting(false)
+      setDetecting(false)
    }
  }
-  if (checking) {
+  if (!sessionId) return null
    return <div className="text-sm text-gray-500 py-8 text-center">Pruefe Seitenformat...</div>
  }
-  if (splitResult?.is_double_page) {
+  const imageUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented`
    return (
      <div className="space-y-4 p-6 bg-blue-50 dark:bg-blue-900/20 rounded-xl border border-blue-200 dark:border-blue-800">
        <h3 className="text-sm font-medium text-blue-700 dark:text-blue-300">
          Doppelseite erkannt
        </h3>
        <p className="text-sm text-blue-600 dark:text-blue-400">
          Das Bild scheint eine Doppelseite zu sein. Soll es in zwei Einzelseiten aufgeteilt werden?
        </p>
        <div className="flex gap-2">
          <button
            onClick={handleSplit}
            disabled={splitting}
            className="px-4 py-2 bg-blue-600 text-white text-sm rounded-lg hover:bg-blue-700 disabled:opacity-50"
          >
            {splitting ? 'Wird aufgeteilt...' : 'Aufteilen'}
          </button>
          <button
            onClick={onNext}
            className="px-4 py-2 bg-gray-200 dark:bg-gray-700 text-sm rounded-lg hover:bg-gray-300"
          >
            Einzelseite beibehalten
          </button>
        </div>
        {error && <div className="text-sm text-red-500">{error}</div>}
      </div>
    )
  }
  return (
-    <div className="text-sm text-gray-500 py-8 text-center">
+    <div className="space-y-4">
-      Einzelseite erkannt — weiter zum naechsten Schritt.
+      {/* Image */}
-      {error && <div className="text-sm text-red-500 mt-2">{error}</div>}
+      <div className="relative rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700">
        {/* eslint-disable-next-line @next/next/no-img-element */}
        <img
          src={imageUrl}
          alt="Orientiertes Bild"
          className="w-full object-contain max-h-[500px]"
          onError={(e) => {
            // Fallback to non-oriented image
            (e.target as HTMLImageElement).src =
              `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image`
          }}
        />
      </div>
      {/* Detection status */}
      {detecting && (
        <div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
          <div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
          Doppelseiten-Erkennung laeuft...
        </div>
      )}
      {/* Detection result */}
      {splitResult && !detecting && (
        splitResult.multi_page ? (
          <div className="bg-blue-50 dark:bg-blue-900/20 rounded-lg border border-blue-200 dark:border-blue-700 p-4 space-y-2">
            <div className="text-sm font-medium text-blue-700 dark:text-blue-300">
              Doppelseite erkannt — {splitResult.page_count} Seiten getrennt
            </div>
            <p className="text-xs text-blue-600 dark:text-blue-400">
              Jede Seite wird als eigene Session weiterverarbeitet (eigene Begradigung, Entzerrung, etc.).
              {splitResult.used_original && ' Trennung auf Originalbild, da Orientierung die Doppelseite gedreht hat.'}
            </p>
            <div className="flex gap-2 mt-2">
              {splitResult.sub_sessions?.map(s => (
                <span
                  key={s.id}
                  className="text-xs px-2.5 py-1 rounded-md bg-blue-100 dark:bg-blue-800/40 text-blue-700 dark:text-blue-300 font-medium"
                >
                  {s.name}
                </span>
              ))}
            </div>
            {splitResult.duration_seconds != null && (
              <div className="text-xs text-gray-400">{splitResult.duration_seconds.toFixed(1)}s</div>
            )}
          </div>
        ) : (
          <div className="bg-green-50 dark:bg-green-900/20 rounded-lg border border-green-200 dark:border-green-800 p-4">
            <div className="flex items-center gap-2 text-sm font-medium text-green-700 dark:text-green-300">
              <span>&#10003;</span> Einzelseite — keine Trennung noetig
            </div>
            {splitResult.duration_seconds != null && (
              <div className="text-xs text-gray-400 mt-1">{splitResult.duration_seconds.toFixed(1)}s</div>
            )}
          </div>
        )
      )}
      {/* Error */}
      {error && (
        <div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 p-3 rounded-lg">
          {error}
          <button
            onClick={() => { didDetect.current = false; detectPageSplit() }}
            className="ml-2 text-teal-600 hover:underline"
          >
            Erneut versuchen
          </button>
        </div>
      )}
      {/* Next button — only show when detection is done */}
      {(splitResult || error) && !detecting && (
        <div className="flex justify-end">
          <button
            onClick={onNext}
            className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
          >
            Weiter &rarr;
          </button>
        </div>
      )}
    </div>
  )
 }
--- a/admin-lehrer/components/ocr-kombi/StepUpload.tsx
+++ b/admin-lehrer/components/ocr-kombi/StepUpload.tsx
@@ -1,28 +1,52 @@
 'use client'
-import { useState, useCallback } from 'react'
+import { useState, useCallback, useEffect } from 'react'
 import { DOCUMENT_CATEGORIES, type DocumentCategory } from '@/app/(admin)/ai/ocr-pipeline/types'
 const KLAUSUR_API = '/klausur-api'
 interface StepUploadProps {
-  onUploaded: (sessionId: string) => void
+  sessionId: string | null
  onUploaded: (sessionId: string, name: string) => void
  onNext: () => void
 }
-export function StepUpload({ onUploaded }: StepUploadProps) {
+export function StepUpload({ sessionId, onUploaded, onNext }: StepUploadProps) {
  const [dragging, setDragging] = useState(false)
  const [uploading, setUploading] = useState(false)
  const [selectedFile, setSelectedFile] = useState<File | null>(null)
  const [preview, setPreview] = useState<string | null>(null)
  const [title, setTitle] = useState('')
  const [category, setCategory] = useState<DocumentCategory>('vokabelseite')
  const [error, setError] = useState('')
-  const handleUpload = useCallback(async (file: File) => {
+  // Clean up preview URL on unmount
  useEffect(() => {
    return () => { if (preview) URL.revokeObjectURL(preview) }
  }, [preview])
  const handleFileSelect = useCallback((file: File) => {
    setSelectedFile(file)
    setError('')
    if (file.type.startsWith('image/')) {
      setPreview(URL.createObjectURL(file))
    } else {
      setPreview(null)
    }
    // Auto-fill title from filename if empty
    if (!title.trim()) {
      setTitle(file.name.replace(/\.[^.]+$/, ''))
    }
  }, [title])
  const handleUpload = useCallback(async () => {
    if (!selectedFile) return
    setUploading(true)
    setError('')
    try {
      const formData = new FormData()
-      formData.append('file', file)
+      formData.append('file', selectedFile)
      if (title.trim()) formData.append('name', title.trim())
      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, {
@@ -47,26 +71,164 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
        })
      }
-      onUploaded(sid)
+      onUploaded(sid, title.trim() || selectedFile.name)
    } catch (e) {
      setError(e instanceof Error ? e.message : String(e))
    } finally {
      setUploading(false)
    }
-  }, [title, category, onUploaded])
+  }, [selectedFile, title, category, onUploaded])
  const handleDrop = useCallback((e: React.DragEvent) => {
    e.preventDefault()
    setDragging(false)
    const file = e.dataTransfer.files[0]
-    if (file) handleUpload(file)
+    if (file) handleFileSelect(file)
-  }, [handleUpload])
+  }, [handleFileSelect])
-  const handleFileSelect = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
+  const handleInputChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0]
-    if (file) handleUpload(file)
+    if (file) handleFileSelect(file)
-  }, [handleUpload])
+  }, [handleFileSelect])
  const clearFile = useCallback(() => {
    setSelectedFile(null)
    if (preview) URL.revokeObjectURL(preview)
    setPreview(null)
  }, [preview])
  // ---- Phase 2: Uploaded → show result + "Weiter" ----
  if (sessionId) {
    return (
      <div className="space-y-4">
        <div className="bg-green-50 dark:bg-green-900/20 border border-green-200 dark:border-green-800 rounded-lg p-4">
          <div className="flex items-center gap-2 text-green-700 dark:text-green-300 text-sm font-medium mb-3">
            <span>&#10003;</span> Dokument hochgeladen
          </div>
          <div className="flex gap-4">
            <div className="w-48 h-64 rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700 flex-shrink-0 border border-gray-200 dark:border-gray-600">
              {/* eslint-disable-next-line @next/next/no-img-element */}
              <img
                src={`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image`}
                alt="Hochgeladenes Dokument"
                className="w-full h-full object-contain"
                onError={(e) => { (e.target as HTMLImageElement).style.display = 'none' }}
              />
            </div>
            <div className="text-sm text-gray-600 dark:text-gray-400">
              <div className="font-medium text-gray-700 dark:text-gray-300 mb-1">
                {title || 'Dokument'}
              </div>
              <div className="text-xs text-gray-400 mt-1">
                Kategorie: {DOCUMENT_CATEGORIES.find(c => c.value === category)?.label || category}
              </div>
              <div className="text-xs font-mono text-gray-400 mt-1">
                Session: {sessionId.slice(0, 8)}...
              </div>
            </div>
          </div>
        </div>
        <div className="flex justify-end">
          <button
            onClick={onNext}
            className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
          >
            Weiter &rarr;
          </button>
        </div>
      </div>
    )
  }
  // ---- Phase 1b: File selected → preview + "Hochladen" ----
  if (selectedFile) {
    return (
      <div className="space-y-4">
        {/* Title input */}
        <div>
          <label className="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-1">
            Titel
          </label>
          <input
            type="text"
            value={title}
            onChange={(e) => setTitle(e.target.value)}
            placeholder="z.B. Vokabeln Unit 3"
            className="w-full px-3 py-2 border border-gray-300 dark:border-gray-600 rounded-lg bg-white dark:bg-gray-800 text-sm"
          />
        </div>
        {/* Category selector */}
        <div>
          <label className="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-1">
            Kategorie
          </label>
          <div className="grid grid-cols-4 gap-1.5">
            {DOCUMENT_CATEGORIES.map(cat => (
              <button
                key={cat.value}
                onClick={() => setCategory(cat.value)}
                className={`text-xs px-2 py-1.5 rounded-md text-left transition-colors ${
                  category === cat.value
                    ? 'bg-teal-100 dark:bg-teal-900/40 text-teal-700 dark:text-teal-300 ring-1 ring-teal-400'
                    : 'bg-gray-50 dark:bg-gray-700 text-gray-600 dark:text-gray-400 hover:bg-gray-100'
                }`}
              >
                {cat.icon} {cat.label}
              </button>
            ))}
          </div>
        </div>
        {/* File preview */}
        <div className="border border-gray-200 dark:border-gray-700 rounded-xl p-4">
          <div className="flex items-start gap-4">
            {preview ? (
              <div className="w-36 h-48 rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700 flex-shrink-0 border border-gray-200 dark:border-gray-600">
                {/* eslint-disable-next-line @next/next/no-img-element */}
                <img src={preview} alt="Vorschau" className="w-full h-full object-contain" />
              </div>
            ) : (
              <div className="w-36 h-48 rounded-lg bg-gray-100 dark:bg-gray-700 flex-shrink-0 flex items-center justify-center border border-gray-200 dark:border-gray-600">
                <span className="text-3xl">&#128196;</span>
              </div>
            )}
            <div className="flex-1 min-w-0">
              <div className="font-medium text-sm text-gray-700 dark:text-gray-300 truncate">
                {selectedFile.name}
              </div>
              <div className="text-xs text-gray-400 mt-1">
                {(selectedFile.size / 1024 / 1024).toFixed(1)} MB
              </div>
              <button
                onClick={clearFile}
                className="text-xs text-red-500 hover:text-red-700 mt-2"
              >
                Andere Datei waehlen
              </button>
            </div>
          </div>
          <button
            onClick={handleUpload}
            disabled={uploading}
            className="mt-4 w-full px-4 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed transition-colors"
          >
            {uploading ? 'Wird hochgeladen...' : 'Hochladen'}
          </button>
        </div>
        {error && (
          <div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 p-3 rounded-lg">
            {error}
          </div>
        )}
      </div>
    )
  }
  // ---- Phase 1a: No file → drop zone ----
  return (
    <div className="space-y-4">
      {/* Title input */}
@@ -116,25 +278,19 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
            : 'border-gray-300 dark:border-gray-600 hover:border-gray-400'
        }`}
      >
-        {uploading ? (
+        <div className="text-4xl mb-3">&#128228;</div>
-          <div className="text-sm text-gray-500">Wird hochgeladen...</div>
+        <div className="text-sm text-gray-600 dark:text-gray-400 mb-2">
-        ) : (
+          Bild oder PDF hierher ziehen
-          <>
+        </div>
-            <div className="text-4xl mb-3">📤</div>
+        <label className="inline-block px-4 py-2 bg-teal-600 text-white text-sm rounded-lg cursor-pointer hover:bg-teal-700">
-            <div className="text-sm text-gray-600 dark:text-gray-400 mb-2">
+          Datei auswaehlen
-              Bild oder PDF hierher ziehen
+          <input
-            </div>
+            type="file"
-            <label className="inline-block px-4 py-2 bg-teal-600 text-white text-sm rounded-lg cursor-pointer hover:bg-teal-700">
+            accept="image/*,.pdf"
-              Datei auswaehlen
+            onChange={handleInputChange}
-              <input
+            className="hidden"
-                type="file"
+          />
-                accept="image/*,.pdf"
+        </label>
                onChange={handleFileSelect}
                className="hidden"
              />
            </label>
          </>
        )}
      </div>
      {error && (
--- a/klausur-service/backend/cv_syllable_detect.py
+++ b/klausur-service/backend/cv_syllable_detect.py
@@ -34,7 +34,8 @@ _STOP_WORDS = frozenset([
    'der', 'die', 'das', 'dem', 'den', 'des',
    'ein', 'eine', 'einem', 'einen', 'einer',
    # Pronouns
-    'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
+    'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
    'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
    # Prepositions
    'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
@@ -139,6 +140,93 @@ def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
    return ' '.join(result)
 def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
    """Merge OCR word-gap fragments in cell texts using pyphen validation.

    OCR often splits words at syllable boundaries into separate word_boxes,
    producing text like "zerknit tert" instead of "zerknittert".  This
    function tries to merge adjacent fragments in every content cell
    (cells whose ``col_type`` starts with ``"column_"``).

    More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3)
    but still guarded by a pyphen check and stop-word exclusion (both
    applied inside ``_try_merge_word_gaps``).

    Args:
        zones_data: List of zone dicts; each zone's ``"cells"`` list is
            scanned and a cell's ``"text"`` is rewritten in place when a
            merge changes it.
        session_id: Only used in the summary log message.

    Returns:
        The number of cells modified.  Returns 0 without touching any cell
        when no German hyphenator is available.
    """
    hyph_de, _ = _get_hyphenators()
    # Without a German hyphenator there is nothing to validate merges against.
    if hyph_de is None:
        return 0
    modified = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            # Only cells typed "column_*" are considered; everything else is skipped.
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            # A text without a space has no gap that could be merged.
            text = cell.get("text", "")
            if not text or " " not in text:
                continue
            # Skip IPA cells: bracketed segments are removed first, so only
            # IPA matches outside [...] cause the cell to be skipped.
            text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
            if _IPA_RE.search(text_no_brackets):
                continue
            new_text = _try_merge_word_gaps(text, hyph_de)
            if new_text != text:
                cell["text"] = new_text
                modified += 1
    # Log a single summary line, and only when something actually changed.
    if modified:
        logger.info(
            "build-grid session %s: merged word gaps in %d cells",
            session_id, modified,
        )
    return modified
 def _try_merge_word_gaps(text: str, hyph_de) -> str:
    """Merge OCR word fragments with relaxed threshold (max_short=5).

    Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
    (max_short=5 instead of 3).  Still requires pyphen to recognize the
    merged word.

    Two adjacent space-separated parts are joined when all of these hold:
    the left part consists only of letters, neither part (reduced to its
    letters, lower-cased) is in ``_STOP_WORDS``, the shorter part has at
    most 5 letters, the two parts together have at least 4 letters, and
    ``hyph_de.inserted`` places at least one hyphen in the joined letters.

    Args:
        text: Cell text whose parts are separated by single spaces.
        hyph_de: German hyphenator exposing ``inserted(word, hyphen=...)``.

    Returns:
        The text with qualifying neighbours joined; the input text is
        returned unchanged when it contains fewer than two parts.
    """
    parts = text.split(' ')
    if len(parts) < 2:
        return text
    result = [parts[0]]
    i = 1
    while i < len(parts):
        # ``prev`` is the last emitted item, so an already merged word can
        # absorb the next fragment as well.
        prev = result[-1]
        curr = parts[i]
        # Letters-only views (ASCII letters plus German umlauts and sharp s).
        prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
        curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
        should_try = (
            # The left fragment must be purely alphabetic (no punctuation or digits).
            prev == prev_alpha
            and prev_alpha and curr_alpha
            and prev_alpha.lower() not in _STOP_WORDS
            and curr_alpha.lower() not in _STOP_WORDS
            # max_short: the shorter fragment may have at most 5 letters.
            and min(len(prev_alpha), len(curr_alpha)) <= 5
            and len(prev_alpha) + len(curr_alpha) >= 4
        )
        if should_try:
            merged_alpha = prev_alpha + curr_alpha
            # A hyphenation point in the joined letters is taken as the
            # sign that the merged word is valid.
            # NOTE(review): confirm this is a strong enough validity signal
            # for the hyphenator in use.
            hyph = hyph_de.inserted(merged_alpha, hyphen='-')
            if '-' in hyph:
                # Join the raw fragments so non-letter characters attached
                # to ``curr`` are preserved.
                result[-1] = prev + curr
                i += 1
                continue
        result.append(curr)
        i += 1
    return ' '.join(result)
 def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
    """Syllabify all significant words in a text string.
@@ -259,6 +347,12 @@ def insert_syllable_dividers(
            if not text:
                continue
            # In auto mode (force=False), only normalize cells that already
            # have | from OCR (i.e. printed syllable dividers on the original
            # scan).  Don't add new syllable marks to other words.
            if not force and "|" not in text:
                continue
            new_text = _syllabify_text(text, hyph_de, hyph_en)
            if new_text != text:
                cell["text"] = new_text
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1593,6 +1593,13 @@ async def _build_grid_core(
    except Exception as e:
        logger.warning("Dictionary detection failed: %s", e)
    # --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as e:
        logger.warning("Word-gap merge failed: %s", e)
    # --- Syllable divider insertion for dictionary pages ---
    # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
    #   "all" = force on all content words, "en" = English column only,
@@ -1626,6 +1633,15 @@ async def _build_grid_core(
            except Exception as e:
                logger.warning("Syllable insertion failed: %s", e)
    # When syllable mode is "none", strip any residual | from OCR so
    # that the displayed text is clean (e.g. "Zel|le" → "Zelle").
    if syllable_mode == "none":
        for z in zones_data:
            for cell in z.get("cells", []):
                t = cell.get("text", "")
                if "|" in t:
                    cell["text"] = t.replace("|", "")
    # Clean up internal flags before returning
    for z in zones_data:
        for cell in z.get("cells", []):
--- a/klausur-service/backend/grid_editor_helpers.py
+++ b/klausur-service/backend/grid_editor_helpers.py
@@ -912,6 +912,13 @@ def _detect_heading_rows_by_single_cell(
            _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
            if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
                continue
            # Guard: dictionary section headings are short (1-4 alpha chars
            # like "A", "Ab", "Zi", "Sch").  Longer text that starts
            # lowercase is a regular vocabulary word (e.g. "zentral") that
            # happens to appear alone in its row.
            alpha_only = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', text)
            if len(alpha_only) > 4 and text[0].islower():
                continue
            heading_row_indices.append(ri)
        # Guard: if >25% of eligible rows would become headings, the
Author	SHA1	Message	Date
Benjamin Admin	96ea23164d	Fix word-gap merge: add missing pronouns to stop words, reduce threshold Some checks failed CI / go-lint (push) Has been skipped Details CI / python-lint (push) Has been skipped Details CI / nodejs-lint (push) Has been skipped Details CI / test-go-school (push) Successful in 38s Details CI / test-go-edu-search (push) Successful in 26s Details CI / test-python-klausur (push) Failing after 2m13s Details CI / test-python-agent-core (push) Successful in 18s Details CI / test-nodejs-website (push) Successful in 22s Details - Add du/dich/dir/mich/mir/uns/euch/ihm/ihn to _STOP_WORDS to prevent false merges like "du" + "zerlegst" → "duzerlegst" - Reduce max_short threshold from 6 to 5 to prevent merging multi-word phrases like "ziehen lassen" → "ziehenlassen" Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-27 15:35:12 +01:00
Benjamin Admin	a8773d5b00	Fix 4 Grid Editor bugs: syllable modes, heading detection, word gaps 1. Syllable "Original" (auto) mode: only normalize cells that already have \| from OCR — don't add new syllable marks via pyphen to words without printed dividers on the original scan. 2. Syllable "Aus" (none) mode: strip residual \| chars from OCR text so cells display clean (e.g. "Zel\|le" → "Zelle"). 3. Heading detection: add text length guard in single-cell heuristic — words > 4 alpha chars starting lowercase (like "zentral") are regular vocabulary, not section headings. 4. Word-gap merge: new merge_word_gaps_in_zones() step with relaxed threshold (6 chars) fixes OCR splits like "zerknit tert" → "zerknittert". Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-27 15:24:35 +01:00
Benjamin Admin	9f68bd3425	feat: Implement page-split step with auto-detection and sub-session naming StepPageSplit now: - Auto-calls POST /page-split on step entry - Shows oriented image + detection result - If double page: creates sub-sessions named "Title — S. 1/2" - If single page: green badge "keine Trennung noetig" - Manual "Weiter" button (no auto-advance) Also: - StepOrientation wrapper simplified (no page-split in orientation) - StepUpload passes name back via onUploaded(sid, name) - page.tsx: after page-split "Weiter" switches to first sub-session - useKombiPipeline exposes setSessionName Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-26 17:56:45 +01:00
Benjamin Admin	469f09d1e1	fix: Redesign StepUpload for manual step control StepUpload now has 3 phases: 1. File selection: drop zone / file picker → shows preview 2. Review: title input, category, file info → "Hochladen" button 3. Uploaded: shows session image → "Weiter" button No more auto-advance after upload. User controls every step. openSession() removed from onUploaded callback to prevent step-reset race condition. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-26 17:35:36 +01:00
Benjamin Admin	3bb04b25ab	fix: OCR Kombi upload race condition — openSession was resetting step to 0 openSession mapped dbStep=1 to uiStep=0 (upload), overriding handleNext's advancement to step 1. Fix: sessions always exist post-upload, so always skip past the upload step in openSession. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-26 17:10:04 +01:00