Compare commits

..

5 Commits

Author SHA1 Message Date
Benjamin Admin
96ea23164d Fix word-gap merge: add missing pronouns to stop words, reduce threshold
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 38s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 2m13s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 22s
- Add du/dich/dir/mich/mir/uns/euch/ihm/ihn to _STOP_WORDS to prevent
  false merges like "du" + "zerlegst" → "duzerlegst"
- Reduce max_short threshold from 6 to 5 to prevent merging multi-word
  phrases like "ziehen lassen" → "ziehenlassen"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 15:35:12 +01:00
Benjamin Admin
a8773d5b00 Fix 4 Grid Editor bugs: syllable modes, heading detection, word gaps
1. Syllable "Original" (auto) mode: only normalize cells that already
   have | from OCR — don't add new syllable marks via pyphen to words
   without printed dividers on the original scan.

2. Syllable "Aus" (none) mode: strip residual | chars from OCR text
   so cells display clean (e.g. "Zel|le" → "Zelle").

3. Heading detection: add text length guard in single-cell heuristic —
   words > 4 alpha chars starting lowercase (like "zentral") are regular
   vocabulary, not section headings.

4. Word-gap merge: new merge_word_gaps_in_zones() step with relaxed
   threshold (6 chars) fixes OCR splits like "zerknit tert" → "zerknittert".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 15:24:35 +01:00
Benjamin Admin
9f68bd3425 feat: Implement page-split step with auto-detection and sub-session naming
StepPageSplit now:
- Auto-calls POST /page-split on step entry
- Shows oriented image + detection result
- If double page: creates sub-sessions named "Title — S. 1/2"
- If single page: green badge "keine Trennung noetig"
- Manual "Weiter" button (no auto-advance)

Also:
- StepOrientation wrapper simplified (no page-split in orientation)
- StepUpload passes name back via onUploaded(sid, name)
- page.tsx: after page-split "Weiter" switches to first sub-session
- useKombiPipeline exposes setSessionName

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 17:56:45 +01:00
Benjamin Admin
469f09d1e1 fix: Redesign StepUpload for manual step control
StepUpload now has 3 phases:
1. File selection: drop zone / file picker → shows preview
2. Review: title input, category, file info → "Hochladen" button
3. Uploaded: shows session image → "Weiter" button

No more auto-advance after upload. User controls every step.
openSession() removed from onUploaded callback to prevent
step-reset race condition.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 17:35:36 +01:00
Benjamin Admin
3bb04b25ab fix: OCR Kombi upload race condition — openSession was resetting step to 0
openSession mapped dbStep=1 to uiStep=0 (upload), overriding handleNext's
advancement to step 1. Fix: sessions always exist post-upload, so always
skip past the upload step in openSession.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 17:10:04 +01:00
8 changed files with 486 additions and 124 deletions

View File

@@ -40,9 +40,9 @@ function OcrKombiContent() {
deleteSession,
renameSession,
updateCategory,
handleOrientationComplete,
handleSessionChange,
setSessionId,
setSessionName,
setSubSessions,
setParentSessionId,
setIsGroundTruth,
@@ -53,19 +53,20 @@ function OcrKombiContent() {
case 0:
return (
<StepUpload
onUploaded={(sid) => {
sessionId={sessionId}
onUploaded={(sid, name) => {
setSessionId(sid)
setSessionName(name)
loadSessions()
openSession(sid)
handleNext()
}}
onNext={handleNext}
/>
)
case 1:
return (
<StepOrientation
sessionId={sessionId}
onNext={handleOrientationComplete}
onNext={() => handleNext()}
onSessionList={() => { loadSessions(); handleNewSession() }}
/>
)
@@ -73,10 +74,19 @@ function OcrKombiContent() {
return (
<StepPageSplit
sessionId={sessionId}
onNext={handleNext}
sessionName={sessionName}
onNext={() => {
// If sub-sessions were created, switch to the first one
if (subSessions.length > 0) {
setSessionId(subSessions[0].id)
setSessionName(subSessions[0].name)
}
handleNext()
}}
onSubSessionsCreated={(subs) => {
setSubSessions(subs)
if (sessionId) setParentSessionId(sessionId)
loadSessions()
}}
/>
)

View File

@@ -154,8 +154,8 @@ export function useKombiPipeline() {
uiStep = dbStepToKombiV2Ui(dbStep)
}
// For sessions that already have an upload, skip the upload step
if (uiStep === 0 && dbStep >= 2) {
// Sessions only exist after upload, so always skip the upload step
if (uiStep === 0) {
uiStep = 1
}
@@ -356,6 +356,7 @@ export function useKombiPipeline() {
setSessionId,
setSubSessions,
setParentSessionId,
setSessionName,
setIsGroundTruth,
}
}

View File

@@ -4,17 +4,17 @@ import { StepOrientation as BaseStepOrientation } from '@/components/ocr-pipelin
interface StepOrientationProps {
sessionId: string | null
onNext: (sessionId: string) => void
onNext: () => void
onSessionList: () => void
}
/** Thin wrapper around the shared StepOrientation component */
/** Thin wrapper — adapts the shared StepOrientation to the Kombi pipeline's simpler onNext() */
export function StepOrientation({ sessionId, onNext, onSessionList }: StepOrientationProps) {
return (
<BaseStepOrientation
key={sessionId}
sessionId={sessionId}
onNext={onNext}
onNext={() => onNext()}
onSessionList={onSessionList}
/>
)

View File

@@ -1,123 +1,201 @@
'use client'
import { useState, useEffect } from 'react'
import { useState, useEffect, useRef } from 'react'
import type { SubSession } from '@/app/(admin)/ai/ocr-pipeline/types'
const KLAUSUR_API = '/klausur-api'
interface PageSplitResult {
multi_page: boolean
page_count?: number
page_splits?: { x: number; y: number; width: number; height: number; page_index: number }[]
sub_sessions?: { id: string; name: string; page_index: number }[]
used_original?: boolean
duration_seconds?: number
}
interface StepPageSplitProps {
sessionId: string | null
sessionName: string
onNext: () => void
onSubSessionsCreated: (subs: SubSession[]) => void
}
/**
* Step 3: Page split detection.
* Checks if the image is a double-page spread and offers to split it.
* If no split needed, auto-advances.
*/
export function StepPageSplit({ sessionId, onNext, onSubSessionsCreated }: StepPageSplitProps) {
const [checking, setChecking] = useState(false)
const [splitResult, setSplitResult] = useState<{ is_double_page: boolean; pages?: number } | null>(null)
const [splitting, setSplitting] = useState(false)
export function StepPageSplit({ sessionId, sessionName, onNext, onSubSessionsCreated }: StepPageSplitProps) {
const [detecting, setDetecting] = useState(false)
const [splitResult, setSplitResult] = useState<PageSplitResult | null>(null)
const [error, setError] = useState('')
const didDetect = useRef(false)
// Auto-detect page split when step opens
useEffect(() => {
if (!sessionId) return
// Auto-check for page split
checkPageSplit()
if (!sessionId || didDetect.current) return
didDetect.current = true
detectPageSplit()
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId])
const checkPageSplit = async () => {
const detectPageSplit = async () => {
if (!sessionId) return
setChecking(true)
setDetecting(true)
setError('')
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
if (!res.ok) throw new Error('Session nicht gefunden')
const data = await res.json()
// If sub-sessions already exist, this was already split
if (data.sub_sessions?.length > 0) {
onSubSessionsCreated(data.sub_sessions)
onNext()
// First check if sub-sessions already exist
const sessionRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
if (sessionRes.ok) {
const sessionData = await sessionRes.json()
if (sessionData.sub_sessions?.length > 0) {
// Already split — show existing sub-sessions
const subs = sessionData.sub_sessions as { id: string; name: string; page_index?: number; box_index?: number; current_step?: number }[]
setSplitResult({
multi_page: true,
page_count: subs.length,
sub_sessions: subs.map((s: { id: string; name: string; page_index?: number; box_index?: number }) => ({
id: s.id,
name: s.name,
page_index: s.page_index ?? s.box_index ?? 0,
})),
})
onSubSessionsCreated(subs.map((s: { id: string; name: string; page_index?: number; box_index?: number; current_step?: number }) => ({
id: s.id,
name: s.name,
box_index: s.page_index ?? s.box_index ?? 0,
current_step: s.current_step ?? 2,
})))
setDetecting(false)
return
}
// Check aspect ratio to guess if double-page
// For now, just auto-advance (page-split detection happens in orientation step)
setSplitResult({ is_double_page: false })
// Auto-advance if single page
onNext()
} catch (e) {
setError(e instanceof Error ? e.message : String(e))
} finally {
setChecking(false)
}
}
const handleSplit = async () => {
if (!sessionId) return
setSplitting(true)
setError('')
try {
// Run page-split detection
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/page-split`, {
method: 'POST',
})
if (!res.ok) {
const data = await res.json().catch(() => ({}))
throw new Error(data.detail || 'Split fehlgeschlagen')
throw new Error(data.detail || 'Seitentrennung fehlgeschlagen')
}
const data = await res.json()
if (data.sub_sessions?.length > 0) {
onSubSessionsCreated(data.sub_sessions)
const data: PageSplitResult = await res.json()
setSplitResult(data)
if (data.multi_page && data.sub_sessions?.length) {
// Rename sub-sessions to "Title — S. 1", "Title — S. 2"
const baseName = sessionName || 'Dokument'
for (let i = 0; i < data.sub_sessions.length; i++) {
const sub = data.sub_sessions[i]
const newName = `${baseName} — S. ${i + 1}`
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sub.id}`, {
method: 'PUT',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ name: newName }),
}).catch(() => {})
sub.name = newName
}
onSubSessionsCreated(data.sub_sessions.map(s => ({
id: s.id,
name: s.name,
box_index: s.page_index,
current_step: 2,
})))
}
onNext()
} catch (e) {
setError(e instanceof Error ? e.message : String(e))
} finally {
setSplitting(false)
setDetecting(false)
}
}
if (checking) {
return <div className="text-sm text-gray-500 py-8 text-center">Pruefe Seitenformat...</div>
}
if (!sessionId) return null
const imageUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented`
if (splitResult?.is_double_page) {
return (
<div className="space-y-4 p-6 bg-blue-50 dark:bg-blue-900/20 rounded-xl border border-blue-200 dark:border-blue-800">
<h3 className="text-sm font-medium text-blue-700 dark:text-blue-300">
Doppelseite erkannt
</h3>
<p className="text-sm text-blue-600 dark:text-blue-400">
Das Bild scheint eine Doppelseite zu sein. Soll es in zwei Einzelseiten aufgeteilt werden?
<div className="space-y-4">
{/* Image */}
<div className="relative rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700">
{/* eslint-disable-next-line @next/next/no-img-element */}
<img
src={imageUrl}
alt="Orientiertes Bild"
className="w-full object-contain max-h-[500px]"
onError={(e) => {
// Fallback to non-oriented image
(e.target as HTMLImageElement).src =
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image`
}}
/>
</div>
{/* Detection status */}
{detecting && (
<div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
<div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
Doppelseiten-Erkennung laeuft...
</div>
)}
{/* Detection result */}
{splitResult && !detecting && (
splitResult.multi_page ? (
<div className="bg-blue-50 dark:bg-blue-900/20 rounded-lg border border-blue-200 dark:border-blue-700 p-4 space-y-2">
<div className="text-sm font-medium text-blue-700 dark:text-blue-300">
Doppelseite erkannt {splitResult.page_count} Seiten getrennt
</div>
<p className="text-xs text-blue-600 dark:text-blue-400">
Jede Seite wird als eigene Session weiterverarbeitet (eigene Begradigung, Entzerrung, etc.).
{splitResult.used_original && ' Trennung auf Originalbild, da Orientierung die Doppelseite gedreht hat.'}
</p>
<div className="flex gap-2">
<button
onClick={handleSplit}
disabled={splitting}
className="px-4 py-2 bg-blue-600 text-white text-sm rounded-lg hover:bg-blue-700 disabled:opacity-50"
<div className="flex gap-2 mt-2">
{splitResult.sub_sessions?.map(s => (
<span
key={s.id}
className="text-xs px-2.5 py-1 rounded-md bg-blue-100 dark:bg-blue-800/40 text-blue-700 dark:text-blue-300 font-medium"
>
{splitting ? 'Wird aufgeteilt...' : 'Aufteilen'}
{s.name}
</span>
))}
</div>
{splitResult.duration_seconds != null && (
<div className="text-xs text-gray-400">{splitResult.duration_seconds.toFixed(1)}s</div>
)}
</div>
) : (
<div className="bg-green-50 dark:bg-green-900/20 rounded-lg border border-green-200 dark:border-green-800 p-4">
<div className="flex items-center gap-2 text-sm font-medium text-green-700 dark:text-green-300">
<span>&#10003;</span> Einzelseite keine Trennung noetig
</div>
{splitResult.duration_seconds != null && (
<div className="text-xs text-gray-400 mt-1">{splitResult.duration_seconds.toFixed(1)}s</div>
)}
</div>
)
)}
{/* Error */}
{error && (
<div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 p-3 rounded-lg">
{error}
<button
onClick={() => { didDetect.current = false; detectPageSplit() }}
className="ml-2 text-teal-600 hover:underline"
>
Erneut versuchen
</button>
</div>
)}
{/* Next button — only show when detection is done */}
{(splitResult || error) && !detecting && (
<div className="flex justify-end">
<button
onClick={onNext}
className="px-4 py-2 bg-gray-200 dark:bg-gray-700 text-sm rounded-lg hover:bg-gray-300"
className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
>
Einzelseite beibehalten
Weiter &rarr;
</button>
</div>
{error && <div className="text-sm text-red-500">{error}</div>}
</div>
)
}
return (
<div className="text-sm text-gray-500 py-8 text-center">
Einzelseite erkannt weiter zum naechsten Schritt.
{error && <div className="text-sm text-red-500 mt-2">{error}</div>}
)}
</div>
)
}

View File

@@ -1,28 +1,52 @@
'use client'
import { useState, useCallback } from 'react'
import { useState, useCallback, useEffect } from 'react'
import { DOCUMENT_CATEGORIES, type DocumentCategory } from '@/app/(admin)/ai/ocr-pipeline/types'
const KLAUSUR_API = '/klausur-api'
interface StepUploadProps {
onUploaded: (sessionId: string) => void
sessionId: string | null
onUploaded: (sessionId: string, name: string) => void
onNext: () => void
}
export function StepUpload({ onUploaded }: StepUploadProps) {
export function StepUpload({ sessionId, onUploaded, onNext }: StepUploadProps) {
const [dragging, setDragging] = useState(false)
const [uploading, setUploading] = useState(false)
const [selectedFile, setSelectedFile] = useState<File | null>(null)
const [preview, setPreview] = useState<string | null>(null)
const [title, setTitle] = useState('')
const [category, setCategory] = useState<DocumentCategory>('vokabelseite')
const [error, setError] = useState('')
const handleUpload = useCallback(async (file: File) => {
// Clean up preview URL on unmount
useEffect(() => {
return () => { if (preview) URL.revokeObjectURL(preview) }
}, [preview])
const handleFileSelect = useCallback((file: File) => {
setSelectedFile(file)
setError('')
if (file.type.startsWith('image/')) {
setPreview(URL.createObjectURL(file))
} else {
setPreview(null)
}
// Auto-fill title from filename if empty
if (!title.trim()) {
setTitle(file.name.replace(/\.[^.]+$/, ''))
}
}, [title])
const handleUpload = useCallback(async () => {
if (!selectedFile) return
setUploading(true)
setError('')
try {
const formData = new FormData()
formData.append('file', file)
formData.append('file', selectedFile)
if (title.trim()) formData.append('name', title.trim())
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, {
@@ -47,26 +71,164 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
})
}
onUploaded(sid)
onUploaded(sid, title.trim() || selectedFile.name)
} catch (e) {
setError(e instanceof Error ? e.message : String(e))
} finally {
setUploading(false)
}
}, [title, category, onUploaded])
}, [selectedFile, title, category, onUploaded])
const handleDrop = useCallback((e: React.DragEvent) => {
e.preventDefault()
setDragging(false)
const file = e.dataTransfer.files[0]
if (file) handleUpload(file)
}, [handleUpload])
if (file) handleFileSelect(file)
}, [handleFileSelect])
const handleFileSelect = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
const handleInputChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
const file = e.target.files?.[0]
if (file) handleUpload(file)
}, [handleUpload])
if (file) handleFileSelect(file)
}, [handleFileSelect])
const clearFile = useCallback(() => {
setSelectedFile(null)
if (preview) URL.revokeObjectURL(preview)
setPreview(null)
}, [preview])
// ---- Phase 2: Uploaded → show result + "Weiter" ----
if (sessionId) {
return (
<div className="space-y-4">
<div className="bg-green-50 dark:bg-green-900/20 border border-green-200 dark:border-green-800 rounded-lg p-4">
<div className="flex items-center gap-2 text-green-700 dark:text-green-300 text-sm font-medium mb-3">
<span>&#10003;</span> Dokument hochgeladen
</div>
<div className="flex gap-4">
<div className="w-48 h-64 rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700 flex-shrink-0 border border-gray-200 dark:border-gray-600">
{/* eslint-disable-next-line @next/next/no-img-element */}
<img
src={`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image`}
alt="Hochgeladenes Dokument"
className="w-full h-full object-contain"
onError={(e) => { (e.target as HTMLImageElement).style.display = 'none' }}
/>
</div>
<div className="text-sm text-gray-600 dark:text-gray-400">
<div className="font-medium text-gray-700 dark:text-gray-300 mb-1">
{title || 'Dokument'}
</div>
<div className="text-xs text-gray-400 mt-1">
Kategorie: {DOCUMENT_CATEGORIES.find(c => c.value === category)?.label || category}
</div>
<div className="text-xs font-mono text-gray-400 mt-1">
Session: {sessionId.slice(0, 8)}...
</div>
</div>
</div>
</div>
<div className="flex justify-end">
<button
onClick={onNext}
className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
>
Weiter &rarr;
</button>
</div>
</div>
)
}
// ---- Phase 1b: File selected → preview + "Hochladen" ----
if (selectedFile) {
return (
<div className="space-y-4">
{/* Title input */}
<div>
<label className="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-1">
Titel
</label>
<input
type="text"
value={title}
onChange={(e) => setTitle(e.target.value)}
placeholder="z.B. Vokabeln Unit 3"
className="w-full px-3 py-2 border border-gray-300 dark:border-gray-600 rounded-lg bg-white dark:bg-gray-800 text-sm"
/>
</div>
{/* Category selector */}
<div>
<label className="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-1">
Kategorie
</label>
<div className="grid grid-cols-4 gap-1.5">
{DOCUMENT_CATEGORIES.map(cat => (
<button
key={cat.value}
onClick={() => setCategory(cat.value)}
className={`text-xs px-2 py-1.5 rounded-md text-left transition-colors ${
category === cat.value
? 'bg-teal-100 dark:bg-teal-900/40 text-teal-700 dark:text-teal-300 ring-1 ring-teal-400'
: 'bg-gray-50 dark:bg-gray-700 text-gray-600 dark:text-gray-400 hover:bg-gray-100'
}`}
>
{cat.icon} {cat.label}
</button>
))}
</div>
</div>
{/* File preview */}
<div className="border border-gray-200 dark:border-gray-700 rounded-xl p-4">
<div className="flex items-start gap-4">
{preview ? (
<div className="w-36 h-48 rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700 flex-shrink-0 border border-gray-200 dark:border-gray-600">
{/* eslint-disable-next-line @next/next/no-img-element */}
<img src={preview} alt="Vorschau" className="w-full h-full object-contain" />
</div>
) : (
<div className="w-36 h-48 rounded-lg bg-gray-100 dark:bg-gray-700 flex-shrink-0 flex items-center justify-center border border-gray-200 dark:border-gray-600">
<span className="text-3xl">&#128196;</span>
</div>
)}
<div className="flex-1 min-w-0">
<div className="font-medium text-sm text-gray-700 dark:text-gray-300 truncate">
{selectedFile.name}
</div>
<div className="text-xs text-gray-400 mt-1">
{(selectedFile.size / 1024 / 1024).toFixed(1)} MB
</div>
<button
onClick={clearFile}
className="text-xs text-red-500 hover:text-red-700 mt-2"
>
Andere Datei waehlen
</button>
</div>
</div>
<button
onClick={handleUpload}
disabled={uploading}
className="mt-4 w-full px-4 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed transition-colors"
>
{uploading ? 'Wird hochgeladen...' : 'Hochladen'}
</button>
</div>
{error && (
<div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 p-3 rounded-lg">
{error}
</div>
)}
</div>
)
}
// ---- Phase 1a: No file → drop zone ----
return (
<div className="space-y-4">
{/* Title input */}
@@ -116,11 +278,7 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
: 'border-gray-300 dark:border-gray-600 hover:border-gray-400'
}`}
>
{uploading ? (
<div className="text-sm text-gray-500">Wird hochgeladen...</div>
) : (
<>
<div className="text-4xl mb-3">📤</div>
<div className="text-4xl mb-3">&#128228;</div>
<div className="text-sm text-gray-600 dark:text-gray-400 mb-2">
Bild oder PDF hierher ziehen
</div>
@@ -129,12 +287,10 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
<input
type="file"
accept="image/*,.pdf"
onChange={handleFileSelect}
onChange={handleInputChange}
className="hidden"
/>
</label>
</>
)}
</div>
{error && (

View File

@@ -34,7 +34,8 @@ _STOP_WORDS = frozenset([
'der', 'die', 'das', 'dem', 'den', 'des',
'ein', 'eine', 'einem', 'einen', 'einer',
# Pronouns
'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
# Prepositions
'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
@@ -139,6 +140,93 @@ def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
return ' '.join(result)
def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
"""Merge OCR word-gap fragments in cell texts using pyphen validation.
OCR often splits words at syllable boundaries into separate word_boxes,
producing text like "zerknit tert" instead of "zerknittert". This
function tries to merge adjacent fragments in every content cell.
More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3)
but still guarded by pyphen dictionary lookup and stop-word exclusion.
Returns the number of cells modified.
"""
hyph_de, _ = _get_hyphenators()
if hyph_de is None:
return 0
modified = 0
for z in zones_data:
for cell in z.get("cells", []):
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
text = cell.get("text", "")
if not text or " " not in text:
continue
# Skip IPA cells
text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
if _IPA_RE.search(text_no_brackets):
continue
new_text = _try_merge_word_gaps(text, hyph_de)
if new_text != text:
cell["text"] = new_text
modified += 1
if modified:
logger.info(
"build-grid session %s: merged word gaps in %d cells",
session_id, modified,
)
return modified
def _try_merge_word_gaps(text: str, hyph_de) -> str:
"""Merge OCR word fragments with relaxed threshold (max_short=6).
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
(max_short=5 instead of 3). Still requires pyphen to recognize the
merged word.
"""
parts = text.split(' ')
if len(parts) < 2:
return text
result = [parts[0]]
i = 1
while i < len(parts):
prev = result[-1]
curr = parts[i]
prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
should_try = (
prev == prev_alpha
and prev_alpha and curr_alpha
and prev_alpha.lower() not in _STOP_WORDS
and curr_alpha.lower() not in _STOP_WORDS
and min(len(prev_alpha), len(curr_alpha)) <= 5
and len(prev_alpha) + len(curr_alpha) >= 4
)
if should_try:
merged_alpha = prev_alpha + curr_alpha
hyph = hyph_de.inserted(merged_alpha, hyphen='-')
if '-' in hyph:
result[-1] = prev + curr
i += 1
continue
result.append(curr)
i += 1
return ' '.join(result)
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
"""Syllabify all significant words in a text string.
@@ -259,6 +347,12 @@ def insert_syllable_dividers(
if not text:
continue
# In auto mode (force=False), only normalize cells that already
# have | from OCR (i.e. printed syllable dividers on the original
# scan). Don't add new syllable marks to other words.
if not force and "|" not in text:
continue
new_text = _syllabify_text(text, hyph_de, hyph_en)
if new_text != text:
cell["text"] = new_text

View File

@@ -1593,6 +1593,13 @@ async def _build_grid_core(
except Exception as e:
logger.warning("Dictionary detection failed: %s", e)
# --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
try:
from cv_syllable_detect import merge_word_gaps_in_zones
merge_word_gaps_in_zones(zones_data, session_id)
except Exception as e:
logger.warning("Word-gap merge failed: %s", e)
# --- Syllable divider insertion for dictionary pages ---
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
# "all" = force on all content words, "en" = English column only,
@@ -1626,6 +1633,15 @@ async def _build_grid_core(
except Exception as e:
logger.warning("Syllable insertion failed: %s", e)
# When syllable mode is "none", strip any residual | from OCR so
# that the displayed text is clean (e.g. "Zel|le" → "Zelle").
if syllable_mode == "none":
for z in zones_data:
for cell in z.get("cells", []):
t = cell.get("text", "")
if "|" in t:
cell["text"] = t.replace("|", "")
# Clean up internal flags before returning
for z in zones_data:
for cell in z.get("cells", []):

View File

@@ -912,6 +912,13 @@ def _detect_heading_rows_by_single_cell(
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
continue
# Guard: dictionary section headings are short (1-4 alpha chars
# like "A", "Ab", "Zi", "Sch"). Longer text that starts
# lowercase is a regular vocabulary word (e.g. "zentral") that
# happens to appear alone in its row.
alpha_only = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', text)
if len(alpha_only) > 4 and text[0].islower():
continue
heading_row_indices.append(ri)
# Guard: if >25% of eligible rows would become headings, the