Compare commits

..

5 Commits

Author SHA1 Message Date
Benjamin Admin
96ea23164d Fix word-gap merge: add missing pronouns to stop words, reduce threshold
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 38s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 2m13s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 22s
- Add du/dich/dir/mich/mir/uns/euch/ihm/ihn to _STOP_WORDS to prevent
  false merges like "du" + "zerlegst" → "duzerlegst"
- Reduce max_short threshold from 6 to 5 to prevent merging multi-word
  phrases like "ziehen lassen" → "ziehenlassen"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 15:35:12 +01:00
Benjamin Admin
a8773d5b00 Fix 4 Grid Editor bugs: syllable modes, heading detection, word gaps
1. Syllable "Original" (auto) mode: only normalize cells that already
   have | from OCR — don't add new syllable marks via pyphen to words
   without printed dividers on the original scan.

2. Syllable "Aus" (none) mode: strip residual | chars from OCR text
   so cells display clean (e.g. "Zel|le" → "Zelle").

3. Heading detection: add text length guard in single-cell heuristic —
   words > 4 alpha chars starting lowercase (like "zentral") are regular
   vocabulary, not section headings.

4. Word-gap merge: new merge_word_gaps_in_zones() step with relaxed
   threshold (6 chars) fixes OCR splits like "zerknit tert" → "zerknittert".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 15:24:35 +01:00
Benjamin Admin
9f68bd3425 feat: Implement page-split step with auto-detection and sub-session naming
StepPageSplit now:
- Auto-calls POST /page-split on step entry
- Shows oriented image + detection result
- If double page: creates sub-sessions named "Title — S. 1/2"
- If single page: green badge "keine Trennung noetig"
- Manual "Weiter" button (no auto-advance)

Also:
- StepOrientation wrapper simplified (no page-split in orientation)
- StepUpload passes name back via onUploaded(sid, name)
- page.tsx: after page-split "Weiter" switches to first sub-session
- useKombiPipeline exposes setSessionName

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 17:56:45 +01:00
Benjamin Admin
469f09d1e1 fix: Redesign StepUpload for manual step control
StepUpload now has 3 phases:
1. File selection: drop zone / file picker → shows preview
2. Review: title input, category, file info → "Hochladen" button
3. Uploaded: shows session image → "Weiter" button

No more auto-advance after upload. User controls every step.
openSession() removed from onUploaded callback to prevent
step-reset race condition.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 17:35:36 +01:00
Benjamin Admin
3bb04b25ab fix: OCR Kombi upload race condition — openSession was resetting step to 0
openSession mapped dbStep=1 to uiStep=0 (upload), overriding handleNext's
advancement to step 1. Fix: sessions always exist post-upload, so always
skip past the upload step in openSession.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 17:10:04 +01:00
8 changed files with 486 additions and 124 deletions

View File

@@ -40,9 +40,9 @@ function OcrKombiContent() {
deleteSession, deleteSession,
renameSession, renameSession,
updateCategory, updateCategory,
handleOrientationComplete,
handleSessionChange, handleSessionChange,
setSessionId, setSessionId,
setSessionName,
setSubSessions, setSubSessions,
setParentSessionId, setParentSessionId,
setIsGroundTruth, setIsGroundTruth,
@@ -53,19 +53,20 @@ function OcrKombiContent() {
case 0: case 0:
return ( return (
<StepUpload <StepUpload
onUploaded={(sid) => { sessionId={sessionId}
onUploaded={(sid, name) => {
setSessionId(sid) setSessionId(sid)
setSessionName(name)
loadSessions() loadSessions()
openSession(sid)
handleNext()
}} }}
onNext={handleNext}
/> />
) )
case 1: case 1:
return ( return (
<StepOrientation <StepOrientation
sessionId={sessionId} sessionId={sessionId}
onNext={handleOrientationComplete} onNext={() => handleNext()}
onSessionList={() => { loadSessions(); handleNewSession() }} onSessionList={() => { loadSessions(); handleNewSession() }}
/> />
) )
@@ -73,10 +74,19 @@ function OcrKombiContent() {
return ( return (
<StepPageSplit <StepPageSplit
sessionId={sessionId} sessionId={sessionId}
onNext={handleNext} sessionName={sessionName}
onNext={() => {
// If sub-sessions were created, switch to the first one
if (subSessions.length > 0) {
setSessionId(subSessions[0].id)
setSessionName(subSessions[0].name)
}
handleNext()
}}
onSubSessionsCreated={(subs) => { onSubSessionsCreated={(subs) => {
setSubSessions(subs) setSubSessions(subs)
if (sessionId) setParentSessionId(sessionId) if (sessionId) setParentSessionId(sessionId)
loadSessions()
}} }}
/> />
) )

View File

@@ -154,8 +154,8 @@ export function useKombiPipeline() {
uiStep = dbStepToKombiV2Ui(dbStep) uiStep = dbStepToKombiV2Ui(dbStep)
} }
// For sessions that already have an upload, skip the upload step // Sessions only exist after upload, so always skip the upload step
if (uiStep === 0 && dbStep >= 2) { if (uiStep === 0) {
uiStep = 1 uiStep = 1
} }
@@ -356,6 +356,7 @@ export function useKombiPipeline() {
setSessionId, setSessionId,
setSubSessions, setSubSessions,
setParentSessionId, setParentSessionId,
setSessionName,
setIsGroundTruth, setIsGroundTruth,
} }
} }

View File

@@ -4,17 +4,17 @@ import { StepOrientation as BaseStepOrientation } from '@/components/ocr-pipelin
interface StepOrientationProps { interface StepOrientationProps {
sessionId: string | null sessionId: string | null
onNext: (sessionId: string) => void onNext: () => void
onSessionList: () => void onSessionList: () => void
} }
/** Thin wrapper around the shared StepOrientation component */ /** Thin wrapper — adapts the shared StepOrientation to the Kombi pipeline's simpler onNext() */
export function StepOrientation({ sessionId, onNext, onSessionList }: StepOrientationProps) { export function StepOrientation({ sessionId, onNext, onSessionList }: StepOrientationProps) {
return ( return (
<BaseStepOrientation <BaseStepOrientation
key={sessionId} key={sessionId}
sessionId={sessionId} sessionId={sessionId}
onNext={onNext} onNext={() => onNext()}
onSessionList={onSessionList} onSessionList={onSessionList}
/> />
) )

View File

@@ -1,123 +1,201 @@
'use client' 'use client'
import { useState, useEffect } from 'react' import { useState, useEffect, useRef } from 'react'
import type { SubSession } from '@/app/(admin)/ai/ocr-pipeline/types' import type { SubSession } from '@/app/(admin)/ai/ocr-pipeline/types'
const KLAUSUR_API = '/klausur-api' const KLAUSUR_API = '/klausur-api'
interface PageSplitResult {
multi_page: boolean
page_count?: number
page_splits?: { x: number; y: number; width: number; height: number; page_index: number }[]
sub_sessions?: { id: string; name: string; page_index: number }[]
used_original?: boolean
duration_seconds?: number
}
interface StepPageSplitProps { interface StepPageSplitProps {
sessionId: string | null sessionId: string | null
sessionName: string
onNext: () => void onNext: () => void
onSubSessionsCreated: (subs: SubSession[]) => void onSubSessionsCreated: (subs: SubSession[]) => void
} }
/** export function StepPageSplit({ sessionId, sessionName, onNext, onSubSessionsCreated }: StepPageSplitProps) {
* Step 3: Page split detection. const [detecting, setDetecting] = useState(false)
* Checks if the image is a double-page spread and offers to split it. const [splitResult, setSplitResult] = useState<PageSplitResult | null>(null)
* If no split needed, auto-advances.
*/
export function StepPageSplit({ sessionId, onNext, onSubSessionsCreated }: StepPageSplitProps) {
const [checking, setChecking] = useState(false)
const [splitResult, setSplitResult] = useState<{ is_double_page: boolean; pages?: number } | null>(null)
const [splitting, setSplitting] = useState(false)
const [error, setError] = useState('') const [error, setError] = useState('')
const didDetect = useRef(false)
// Auto-detect page split when step opens
useEffect(() => { useEffect(() => {
if (!sessionId) return if (!sessionId || didDetect.current) return
// Auto-check for page split didDetect.current = true
checkPageSplit() detectPageSplit()
// eslint-disable-next-line react-hooks/exhaustive-deps // eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId]) }, [sessionId])
const checkPageSplit = async () => { const detectPageSplit = async () => {
if (!sessionId) return if (!sessionId) return
setChecking(true) setDetecting(true)
setError('') setError('')
try { try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`) // First check if sub-sessions already exist
if (!res.ok) throw new Error('Session nicht gefunden') const sessionRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
const data = await res.json() if (sessionRes.ok) {
const sessionData = await sessionRes.json()
// If sub-sessions already exist, this was already split if (sessionData.sub_sessions?.length > 0) {
if (data.sub_sessions?.length > 0) { // Already split — show existing sub-sessions
onSubSessionsCreated(data.sub_sessions) const subs = sessionData.sub_sessions as { id: string; name: string; page_index?: number; box_index?: number; current_step?: number }[]
onNext() setSplitResult({
return multi_page: true,
page_count: subs.length,
sub_sessions: subs.map((s: { id: string; name: string; page_index?: number; box_index?: number }) => ({
id: s.id,
name: s.name,
page_index: s.page_index ?? s.box_index ?? 0,
})),
})
onSubSessionsCreated(subs.map((s: { id: string; name: string; page_index?: number; box_index?: number; current_step?: number }) => ({
id: s.id,
name: s.name,
box_index: s.page_index ?? s.box_index ?? 0,
current_step: s.current_step ?? 2,
})))
setDetecting(false)
return
}
} }
// Check aspect ratio to guess if double-page // Run page-split detection
// For now, just auto-advance (page-split detection happens in orientation step)
setSplitResult({ is_double_page: false })
// Auto-advance if single page
onNext()
} catch (e) {
setError(e instanceof Error ? e.message : String(e))
} finally {
setChecking(false)
}
}
const handleSplit = async () => {
if (!sessionId) return
setSplitting(true)
setError('')
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/page-split`, { const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/page-split`, {
method: 'POST', method: 'POST',
}) })
if (!res.ok) { if (!res.ok) {
const data = await res.json().catch(() => ({})) const data = await res.json().catch(() => ({}))
throw new Error(data.detail || 'Split fehlgeschlagen') throw new Error(data.detail || 'Seitentrennung fehlgeschlagen')
} }
const data = await res.json() const data: PageSplitResult = await res.json()
if (data.sub_sessions?.length > 0) { setSplitResult(data)
onSubSessionsCreated(data.sub_sessions)
if (data.multi_page && data.sub_sessions?.length) {
// Rename sub-sessions to "Title — S. 1", "Title — S. 2"
const baseName = sessionName || 'Dokument'
for (let i = 0; i < data.sub_sessions.length; i++) {
const sub = data.sub_sessions[i]
const newName = `${baseName} — S. ${i + 1}`
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sub.id}`, {
method: 'PUT',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: newName }),
}).catch(() => {})
sub.name = newName
}
onSubSessionsCreated(data.sub_sessions.map(s => ({
id: s.id,
name: s.name,
box_index: s.page_index,
current_step: 2,
})))
} }
onNext()
} catch (e) { } catch (e) {
setError(e instanceof Error ? e.message : String(e)) setError(e instanceof Error ? e.message : String(e))
} finally { } finally {
setSplitting(false) setDetecting(false)
} }
} }
if (checking) { if (!sessionId) return null
return <div className="text-sm text-gray-500 py-8 text-center">Pruefe Seitenformat...</div>
}
if (splitResult?.is_double_page) { const imageUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented`
return (
<div className="space-y-4 p-6 bg-blue-50 dark:bg-blue-900/20 rounded-xl border border-blue-200 dark:border-blue-800">
<h3 className="text-sm font-medium text-blue-700 dark:text-blue-300">
Doppelseite erkannt
</h3>
<p className="text-sm text-blue-600 dark:text-blue-400">
Das Bild scheint eine Doppelseite zu sein. Soll es in zwei Einzelseiten aufgeteilt werden?
</p>
<div className="flex gap-2">
<button
onClick={handleSplit}
disabled={splitting}
className="px-4 py-2 bg-blue-600 text-white text-sm rounded-lg hover:bg-blue-700 disabled:opacity-50"
>
{splitting ? 'Wird aufgeteilt...' : 'Aufteilen'}
</button>
<button
onClick={onNext}
className="px-4 py-2 bg-gray-200 dark:bg-gray-700 text-sm rounded-lg hover:bg-gray-300"
>
Einzelseite beibehalten
</button>
</div>
{error && <div className="text-sm text-red-500">{error}</div>}
</div>
)
}
return ( return (
<div className="text-sm text-gray-500 py-8 text-center"> <div className="space-y-4">
Einzelseite erkannt weiter zum naechsten Schritt. {/* Image */}
{error && <div className="text-sm text-red-500 mt-2">{error}</div>} <div className="relative rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700">
{/* eslint-disable-next-line @next/next/no-img-element */}
<img
src={imageUrl}
alt="Orientiertes Bild"
className="w-full object-contain max-h-[500px]"
onError={(e) => {
// Fallback to non-oriented image
(e.target as HTMLImageElement).src =
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image`
}}
/>
</div>
{/* Detection status */}
{detecting && (
<div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
<div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
Doppelseiten-Erkennung laeuft...
</div>
)}
{/* Detection result */}
{splitResult && !detecting && (
splitResult.multi_page ? (
<div className="bg-blue-50 dark:bg-blue-900/20 rounded-lg border border-blue-200 dark:border-blue-700 p-4 space-y-2">
<div className="text-sm font-medium text-blue-700 dark:text-blue-300">
Doppelseite erkannt {splitResult.page_count} Seiten getrennt
</div>
<p className="text-xs text-blue-600 dark:text-blue-400">
Jede Seite wird als eigene Session weiterverarbeitet (eigene Begradigung, Entzerrung, etc.).
{splitResult.used_original && ' Trennung auf Originalbild, da Orientierung die Doppelseite gedreht hat.'}
</p>
<div className="flex gap-2 mt-2">
{splitResult.sub_sessions?.map(s => (
<span
key={s.id}
className="text-xs px-2.5 py-1 rounded-md bg-blue-100 dark:bg-blue-800/40 text-blue-700 dark:text-blue-300 font-medium"
>
{s.name}
</span>
))}
</div>
{splitResult.duration_seconds != null && (
<div className="text-xs text-gray-400">{splitResult.duration_seconds.toFixed(1)}s</div>
)}
</div>
) : (
<div className="bg-green-50 dark:bg-green-900/20 rounded-lg border border-green-200 dark:border-green-800 p-4">
<div className="flex items-center gap-2 text-sm font-medium text-green-700 dark:text-green-300">
<span>&#10003;</span> Einzelseite keine Trennung noetig
</div>
{splitResult.duration_seconds != null && (
<div className="text-xs text-gray-400 mt-1">{splitResult.duration_seconds.toFixed(1)}s</div>
)}
</div>
)
)}
{/* Error */}
{error && (
<div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 p-3 rounded-lg">
{error}
<button
onClick={() => { didDetect.current = false; detectPageSplit() }}
className="ml-2 text-teal-600 hover:underline"
>
Erneut versuchen
</button>
</div>
)}
{/* Next button — only show when detection is done */}
{(splitResult || error) && !detecting && (
<div className="flex justify-end">
<button
onClick={onNext}
className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
>
Weiter &rarr;
</button>
</div>
)}
</div> </div>
) )
} }

View File

@@ -1,28 +1,52 @@
'use client' 'use client'
import { useState, useCallback } from 'react' import { useState, useCallback, useEffect } from 'react'
import { DOCUMENT_CATEGORIES, type DocumentCategory } from '@/app/(admin)/ai/ocr-pipeline/types' import { DOCUMENT_CATEGORIES, type DocumentCategory } from '@/app/(admin)/ai/ocr-pipeline/types'
const KLAUSUR_API = '/klausur-api' const KLAUSUR_API = '/klausur-api'
interface StepUploadProps { interface StepUploadProps {
onUploaded: (sessionId: string) => void sessionId: string | null
onUploaded: (sessionId: string, name: string) => void
onNext: () => void
} }
export function StepUpload({ onUploaded }: StepUploadProps) { export function StepUpload({ sessionId, onUploaded, onNext }: StepUploadProps) {
const [dragging, setDragging] = useState(false) const [dragging, setDragging] = useState(false)
const [uploading, setUploading] = useState(false) const [uploading, setUploading] = useState(false)
const [selectedFile, setSelectedFile] = useState<File | null>(null)
const [preview, setPreview] = useState<string | null>(null)
const [title, setTitle] = useState('') const [title, setTitle] = useState('')
const [category, setCategory] = useState<DocumentCategory>('vokabelseite') const [category, setCategory] = useState<DocumentCategory>('vokabelseite')
const [error, setError] = useState('') const [error, setError] = useState('')
const handleUpload = useCallback(async (file: File) => { // Clean up preview URL on unmount
useEffect(() => {
return () => { if (preview) URL.revokeObjectURL(preview) }
}, [preview])
const handleFileSelect = useCallback((file: File) => {
setSelectedFile(file)
setError('')
if (file.type.startsWith('image/')) {
setPreview(URL.createObjectURL(file))
} else {
setPreview(null)
}
// Auto-fill title from filename if empty
if (!title.trim()) {
setTitle(file.name.replace(/\.[^.]+$/, ''))
}
}, [title])
const handleUpload = useCallback(async () => {
if (!selectedFile) return
setUploading(true) setUploading(true)
setError('') setError('')
try { try {
const formData = new FormData() const formData = new FormData()
formData.append('file', file) formData.append('file', selectedFile)
if (title.trim()) formData.append('name', title.trim()) if (title.trim()) formData.append('name', title.trim())
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, { const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, {
@@ -47,26 +71,164 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
}) })
} }
onUploaded(sid) onUploaded(sid, title.trim() || selectedFile.name)
} catch (e) { } catch (e) {
setError(e instanceof Error ? e.message : String(e)) setError(e instanceof Error ? e.message : String(e))
} finally { } finally {
setUploading(false) setUploading(false)
} }
}, [title, category, onUploaded]) }, [selectedFile, title, category, onUploaded])
const handleDrop = useCallback((e: React.DragEvent) => { const handleDrop = useCallback((e: React.DragEvent) => {
e.preventDefault() e.preventDefault()
setDragging(false) setDragging(false)
const file = e.dataTransfer.files[0] const file = e.dataTransfer.files[0]
if (file) handleUpload(file) if (file) handleFileSelect(file)
}, [handleUpload]) }, [handleFileSelect])
const handleFileSelect = useCallback((e: React.ChangeEvent<HTMLInputElement>) => { const handleInputChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
const file = e.target.files?.[0] const file = e.target.files?.[0]
if (file) handleUpload(file) if (file) handleFileSelect(file)
}, [handleUpload]) }, [handleFileSelect])
const clearFile = useCallback(() => {
setSelectedFile(null)
if (preview) URL.revokeObjectURL(preview)
setPreview(null)
}, [preview])
// ---- Phase 2: Uploaded → show result + "Weiter" ----
if (sessionId) {
return (
<div className="space-y-4">
<div className="bg-green-50 dark:bg-green-900/20 border border-green-200 dark:border-green-800 rounded-lg p-4">
<div className="flex items-center gap-2 text-green-700 dark:text-green-300 text-sm font-medium mb-3">
<span>&#10003;</span> Dokument hochgeladen
</div>
<div className="flex gap-4">
<div className="w-48 h-64 rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700 flex-shrink-0 border border-gray-200 dark:border-gray-600">
{/* eslint-disable-next-line @next/next/no-img-element */}
<img
src={`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image`}
alt="Hochgeladenes Dokument"
className="w-full h-full object-contain"
onError={(e) => { (e.target as HTMLImageElement).style.display = 'none' }}
/>
</div>
<div className="text-sm text-gray-600 dark:text-gray-400">
<div className="font-medium text-gray-700 dark:text-gray-300 mb-1">
{title || 'Dokument'}
</div>
<div className="text-xs text-gray-400 mt-1">
Kategorie: {DOCUMENT_CATEGORIES.find(c => c.value === category)?.label || category}
</div>
<div className="text-xs font-mono text-gray-400 mt-1">
Session: {sessionId.slice(0, 8)}...
</div>
</div>
</div>
</div>
<div className="flex justify-end">
<button
onClick={onNext}
className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
>
Weiter &rarr;
</button>
</div>
</div>
)
}
// ---- Phase 1b: File selected → preview + "Hochladen" ----
if (selectedFile) {
return (
<div className="space-y-4">
{/* Title input */}
<div>
<label className="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-1">
Titel
</label>
<input
type="text"
value={title}
onChange={(e) => setTitle(e.target.value)}
placeholder="z.B. Vokabeln Unit 3"
className="w-full px-3 py-2 border border-gray-300 dark:border-gray-600 rounded-lg bg-white dark:bg-gray-800 text-sm"
/>
</div>
{/* Category selector */}
<div>
<label className="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-1">
Kategorie
</label>
<div className="grid grid-cols-4 gap-1.5">
{DOCUMENT_CATEGORIES.map(cat => (
<button
key={cat.value}
onClick={() => setCategory(cat.value)}
className={`text-xs px-2 py-1.5 rounded-md text-left transition-colors ${
category === cat.value
? 'bg-teal-100 dark:bg-teal-900/40 text-teal-700 dark:text-teal-300 ring-1 ring-teal-400'
: 'bg-gray-50 dark:bg-gray-700 text-gray-600 dark:text-gray-400 hover:bg-gray-100'
}`}
>
{cat.icon} {cat.label}
</button>
))}
</div>
</div>
{/* File preview */}
<div className="border border-gray-200 dark:border-gray-700 rounded-xl p-4">
<div className="flex items-start gap-4">
{preview ? (
<div className="w-36 h-48 rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700 flex-shrink-0 border border-gray-200 dark:border-gray-600">
{/* eslint-disable-next-line @next/next/no-img-element */}
<img src={preview} alt="Vorschau" className="w-full h-full object-contain" />
</div>
) : (
<div className="w-36 h-48 rounded-lg bg-gray-100 dark:bg-gray-700 flex-shrink-0 flex items-center justify-center border border-gray-200 dark:border-gray-600">
<span className="text-3xl">&#128196;</span>
</div>
)}
<div className="flex-1 min-w-0">
<div className="font-medium text-sm text-gray-700 dark:text-gray-300 truncate">
{selectedFile.name}
</div>
<div className="text-xs text-gray-400 mt-1">
{(selectedFile.size / 1024 / 1024).toFixed(1)} MB
</div>
<button
onClick={clearFile}
className="text-xs text-red-500 hover:text-red-700 mt-2"
>
Andere Datei waehlen
</button>
</div>
</div>
<button
onClick={handleUpload}
disabled={uploading}
className="mt-4 w-full px-4 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed transition-colors"
>
{uploading ? 'Wird hochgeladen...' : 'Hochladen'}
</button>
</div>
{error && (
<div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 p-3 rounded-lg">
{error}
</div>
)}
</div>
)
}
// ---- Phase 1a: No file → drop zone ----
return ( return (
<div className="space-y-4"> <div className="space-y-4">
{/* Title input */} {/* Title input */}
@@ -116,25 +278,19 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
: 'border-gray-300 dark:border-gray-600 hover:border-gray-400' : 'border-gray-300 dark:border-gray-600 hover:border-gray-400'
}`} }`}
> >
{uploading ? ( <div className="text-4xl mb-3">&#128228;</div>
<div className="text-sm text-gray-500">Wird hochgeladen...</div> <div className="text-sm text-gray-600 dark:text-gray-400 mb-2">
) : ( Bild oder PDF hierher ziehen
<> </div>
<div className="text-4xl mb-3">📤</div> <label className="inline-block px-4 py-2 bg-teal-600 text-white text-sm rounded-lg cursor-pointer hover:bg-teal-700">
<div className="text-sm text-gray-600 dark:text-gray-400 mb-2"> Datei auswaehlen
Bild oder PDF hierher ziehen <input
</div> type="file"
<label className="inline-block px-4 py-2 bg-teal-600 text-white text-sm rounded-lg cursor-pointer hover:bg-teal-700"> accept="image/*,.pdf"
Datei auswaehlen onChange={handleInputChange}
<input className="hidden"
type="file" />
accept="image/*,.pdf" </label>
onChange={handleFileSelect}
className="hidden"
/>
</label>
</>
)}
</div> </div>
{error && ( {error && (

View File

@@ -34,7 +34,8 @@ _STOP_WORDS = frozenset([
'der', 'die', 'das', 'dem', 'den', 'des', 'der', 'die', 'das', 'dem', 'den', 'des',
'ein', 'eine', 'einem', 'einen', 'einer', 'ein', 'eine', 'einem', 'einen', 'einer',
# Pronouns # Pronouns
'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich', 'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
# Prepositions # Prepositions
'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im', 'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter', 'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
@@ -139,6 +140,93 @@ def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
return ' '.join(result) return ' '.join(result)
def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
    """Merge OCR word-gap fragments in cell texts using pyphen validation.

    OCR often splits words at syllable boundaries into separate word_boxes,
    producing text like "zerknit tert" instead of "zerknittert".  This
    function tries to merge adjacent fragments in every content cell.

    More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3)
    but still guarded by a pyphen hyphenation check and stop-word exclusion
    (both performed by ``_try_merge_word_gaps``).

    Args:
        zones_data: Zone dicts.  Every cell in a zone's ``"cells"`` list
            whose ``"col_type"`` starts with ``"column_"`` is examined, and
            its ``"text"`` is rewritten in place when a merge happens.
        session_id: Only used in the summary log message.

    Returns:
        The number of cells whose text was modified.  Returns 0 when
        ``_get_hyphenators()`` provides no German hyphenator.
    """
    hyph_de, _ = _get_hyphenators()
    if hyph_de is None:
        return 0

    modified = 0
    for zone in zones_data:
        # ``or`` fallbacks: a key that is present but None must not abort
        # the whole pass (the caller only logs a warning on exceptions).
        for cell in zone.get("cells") or []:
            col_type = cell.get("col_type") or ""
            if not col_type.startswith("column_"):
                continue

            text = cell.get("text", "")
            # Nothing to merge without at least one space-separated gap.
            if not text or " " not in text:
                continue

            # Skip IPA cells: test the text with [bracketed] spans removed.
            text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
            if _IPA_RE.search(text_no_brackets):
                continue

            new_text = _try_merge_word_gaps(text, hyph_de)
            if new_text != text:
                cell["text"] = new_text
                modified += 1

    if modified:
        logger.info(
            "build-grid session %s: merged word gaps in %d cells",
            session_id, modified,
        )
    return modified
def _try_merge_word_gaps(text: str, hyph_de) -> str:
"""Merge OCR word fragments with relaxed threshold (max_short=6).
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
(max_short=5 instead of 3). Still requires pyphen to recognize the
merged word.
"""
parts = text.split(' ')
if len(parts) < 2:
return text
result = [parts[0]]
i = 1
while i < len(parts):
prev = result[-1]
curr = parts[i]
prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
should_try = (
prev == prev_alpha
and prev_alpha and curr_alpha
and prev_alpha.lower() not in _STOP_WORDS
and curr_alpha.lower() not in _STOP_WORDS
and min(len(prev_alpha), len(curr_alpha)) <= 5
and len(prev_alpha) + len(curr_alpha) >= 4
)
if should_try:
merged_alpha = prev_alpha + curr_alpha
hyph = hyph_de.inserted(merged_alpha, hyphen='-')
if '-' in hyph:
result[-1] = prev + curr
i += 1
continue
result.append(curr)
i += 1
return ' '.join(result)
def _syllabify_text(text: str, hyph_de, hyph_en) -> str: def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
"""Syllabify all significant words in a text string. """Syllabify all significant words in a text string.
@@ -259,6 +347,12 @@ def insert_syllable_dividers(
if not text: if not text:
continue continue
# In auto mode (force=False), only normalize cells that already
# have | from OCR (i.e. printed syllable dividers on the original
# scan). Don't add new syllable marks to other words.
if not force and "|" not in text:
continue
new_text = _syllabify_text(text, hyph_de, hyph_en) new_text = _syllabify_text(text, hyph_de, hyph_en)
if new_text != text: if new_text != text:
cell["text"] = new_text cell["text"] = new_text

View File

@@ -1593,6 +1593,13 @@ async def _build_grid_core(
except Exception as e: except Exception as e:
logger.warning("Dictionary detection failed: %s", e) logger.warning("Dictionary detection failed: %s", e)
# --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
try:
from cv_syllable_detect import merge_word_gaps_in_zones
merge_word_gaps_in_zones(zones_data, session_id)
except Exception as e:
logger.warning("Word-gap merge failed: %s", e)
# --- Syllable divider insertion for dictionary pages --- # --- Syllable divider insertion for dictionary pages ---
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold), # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
# "all" = force on all content words, "en" = English column only, # "all" = force on all content words, "en" = English column only,
@@ -1626,6 +1633,15 @@ async def _build_grid_core(
except Exception as e: except Exception as e:
logger.warning("Syllable insertion failed: %s", e) logger.warning("Syllable insertion failed: %s", e)
# When syllable mode is "none", strip any residual | from OCR so
# that the displayed text is clean (e.g. "Zel|le" → "Zelle").
if syllable_mode == "none":
for z in zones_data:
for cell in z.get("cells", []):
t = cell.get("text", "")
if "|" in t:
cell["text"] = t.replace("|", "")
# Clean up internal flags before returning # Clean up internal flags before returning
for z in zones_data: for z in zones_data:
for cell in z.get("cells", []): for cell in z.get("cells", []):

View File

@@ -912,6 +912,13 @@ def _detect_heading_rows_by_single_cell(
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text): if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
continue continue
# Guard: dictionary section headings are short (1-4 alpha chars
# like "A", "Ab", "Zi", "Sch"). Longer text that starts
# lowercase is a regular vocabulary word (e.g. "zentral") that
# happens to appear alone in its row.
alpha_only = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', text)
if len(alpha_only) > 4 and text[0].islower():
continue
heading_row_indices.append(ri) heading_row_indices.append(ri)
# Guard: if >25% of eligible rows would become headings, the # Guard: if >25% of eligible rows would become headings, the