Compare commits
10 Commits
85fe0a73d6
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
21b69e06be | ||
|
|
0168ab1a67 | ||
|
|
925f4356ce | ||
|
|
cc4cb3bc2f | ||
|
|
0685fb12da | ||
|
|
96ea23164d | ||
|
|
a8773d5b00 | ||
|
|
9f68bd3425 | ||
|
|
469f09d1e1 | ||
|
|
3bb04b25ab |
@@ -2,7 +2,6 @@
|
||||
|
||||
import { Suspense } from 'react'
|
||||
import { PagePurpose } from '@/components/common/PagePurpose'
|
||||
import { BoxSessionTabs } from '@/components/ocr-pipeline/BoxSessionTabs'
|
||||
import { KombiStepper } from '@/components/ocr-kombi/KombiStepper'
|
||||
import { SessionList } from '@/components/ocr-kombi/SessionList'
|
||||
import { SessionHeader } from '@/components/ocr-kombi/SessionHeader'
|
||||
@@ -27,8 +26,6 @@ function OcrKombiContent() {
|
||||
loadingSessions,
|
||||
activeCategory,
|
||||
isGroundTruth,
|
||||
subSessions,
|
||||
parentSessionId,
|
||||
steps,
|
||||
gridSaveRef,
|
||||
groupedSessions,
|
||||
@@ -40,11 +37,8 @@ function OcrKombiContent() {
|
||||
deleteSession,
|
||||
renameSession,
|
||||
updateCategory,
|
||||
handleOrientationComplete,
|
||||
handleSessionChange,
|
||||
setSessionId,
|
||||
setSubSessions,
|
||||
setParentSessionId,
|
||||
setSessionName,
|
||||
setIsGroundTruth,
|
||||
} = useKombiPipeline()
|
||||
|
||||
@@ -53,19 +47,20 @@ function OcrKombiContent() {
|
||||
case 0:
|
||||
return (
|
||||
<StepUpload
|
||||
onUploaded={(sid) => {
|
||||
sessionId={sessionId}
|
||||
onUploaded={(sid, name) => {
|
||||
setSessionId(sid)
|
||||
setSessionName(name)
|
||||
loadSessions()
|
||||
openSession(sid)
|
||||
handleNext()
|
||||
}}
|
||||
onNext={handleNext}
|
||||
/>
|
||||
)
|
||||
case 1:
|
||||
return (
|
||||
<StepOrientation
|
||||
sessionId={sessionId}
|
||||
onNext={handleOrientationComplete}
|
||||
onNext={() => handleNext()}
|
||||
onSessionList={() => { loadSessions(); handleNewSession() }}
|
||||
/>
|
||||
)
|
||||
@@ -73,10 +68,13 @@ function OcrKombiContent() {
|
||||
return (
|
||||
<StepPageSplit
|
||||
sessionId={sessionId}
|
||||
sessionName={sessionName}
|
||||
onNext={handleNext}
|
||||
onSubSessionsCreated={(subs) => {
|
||||
setSubSessions(subs)
|
||||
if (sessionId) setParentSessionId(sessionId)
|
||||
onSplitComplete={(childId, childName) => {
|
||||
// Switch to the first child session and refresh the list
|
||||
setSessionId(childId)
|
||||
setSessionName(childName)
|
||||
loadSessions()
|
||||
}}
|
||||
/>
|
||||
)
|
||||
@@ -151,15 +149,6 @@ function OcrKombiContent() {
|
||||
onStepClick={handleStepClick}
|
||||
/>
|
||||
|
||||
{subSessions.length > 0 && parentSessionId && sessionId && (
|
||||
<BoxSessionTabs
|
||||
parentSessionId={parentSessionId}
|
||||
subSessions={subSessions}
|
||||
activeSessionId={sessionId}
|
||||
onSessionChange={handleSessionChange}
|
||||
/>
|
||||
)}
|
||||
|
||||
<div className="min-h-[400px]">{renderStep()}</div>
|
||||
</div>
|
||||
)
|
||||
|
||||
@@ -8,7 +8,6 @@ export { DOCUMENT_CATEGORIES } from '../ocr-pipeline/types'
|
||||
export type {
|
||||
SessionListItem,
|
||||
SessionInfo,
|
||||
SubSession,
|
||||
OrientationResult,
|
||||
CropResult,
|
||||
DeskewResult,
|
||||
|
||||
@@ -4,7 +4,7 @@ import { useCallback, useEffect, useState, useRef } from 'react'
|
||||
import { useSearchParams } from 'next/navigation'
|
||||
import type { PipelineStep, DocumentCategory } from './types'
|
||||
import { KOMBI_V2_STEPS, dbStepToKombiV2Ui } from './types'
|
||||
import type { SubSession, SessionListItem } from '../ocr-pipeline/types'
|
||||
import type { SessionListItem } from '../ocr-pipeline/types'
|
||||
|
||||
export type { SessionListItem }
|
||||
|
||||
@@ -33,8 +33,6 @@ export function useKombiPipeline() {
|
||||
const [loadingSessions, setLoadingSessions] = useState(true)
|
||||
const [activeCategory, setActiveCategory] = useState<DocumentCategory | undefined>(undefined)
|
||||
const [isGroundTruth, setIsGroundTruth] = useState(false)
|
||||
const [subSessions, setSubSessions] = useState<SubSession[]>([])
|
||||
const [parentSessionId, setParentSessionId] = useState<string | null>(null)
|
||||
const [steps, setSteps] = useState<PipelineStep[]>(initSteps())
|
||||
|
||||
const searchParams = useSearchParams()
|
||||
@@ -115,7 +113,7 @@ export function useKombiPipeline() {
|
||||
|
||||
// ---- Open session ----
|
||||
|
||||
const openSession = useCallback(async (sid: string, keepSubSessions?: boolean) => {
|
||||
const openSession = useCallback(async (sid: string) => {
|
||||
try {
|
||||
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`)
|
||||
if (!res.ok) return
|
||||
@@ -126,17 +124,6 @@ export function useKombiPipeline() {
|
||||
setActiveCategory(data.document_category || undefined)
|
||||
setIsGroundTruth(!!data.ground_truth?.build_grid_reference)
|
||||
|
||||
// Sub-session handling
|
||||
if (data.sub_sessions?.length > 0) {
|
||||
setSubSessions(data.sub_sessions)
|
||||
setParentSessionId(sid)
|
||||
} else if (data.parent_session_id) {
|
||||
setParentSessionId(data.parent_session_id)
|
||||
} else if (!keepSubSessions) {
|
||||
setSubSessions([])
|
||||
setParentSessionId(null)
|
||||
}
|
||||
|
||||
// Determine UI step from DB state
|
||||
const dbStep = data.current_step || 1
|
||||
const hasGrid = !!data.grid_editor_result
|
||||
@@ -154,27 +141,15 @@ export function useKombiPipeline() {
|
||||
uiStep = dbStepToKombiV2Ui(dbStep)
|
||||
}
|
||||
|
||||
// For sessions that already have an upload, skip the upload step
|
||||
if (uiStep === 0 && dbStep >= 2) {
|
||||
// Sessions only exist after upload, so always skip the upload step
|
||||
if (uiStep === 0) {
|
||||
uiStep = 1
|
||||
}
|
||||
|
||||
const skipIds: string[] = []
|
||||
const isSubSession = !!data.parent_session_id
|
||||
if (isSubSession && dbStep >= 5) {
|
||||
skipIds.push('upload', 'orientation', 'page-split', 'deskew', 'dewarp', 'content-crop')
|
||||
if (uiStep < 6) uiStep = 6
|
||||
} else if (isSubSession && dbStep >= 2) {
|
||||
skipIds.push('upload', 'orientation')
|
||||
if (uiStep < 2) uiStep = 2
|
||||
}
|
||||
|
||||
setSteps(
|
||||
KOMBI_V2_STEPS.map((s, i) => ({
|
||||
...s,
|
||||
status: skipIds.includes(s.id)
|
||||
? 'skipped'
|
||||
: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
|
||||
status: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending',
|
||||
})),
|
||||
)
|
||||
setCurrentStep(uiStep)
|
||||
@@ -226,8 +201,6 @@ export function useKombiPipeline() {
|
||||
setSteps(initSteps())
|
||||
setCurrentStep(0)
|
||||
setSessionId(null)
|
||||
setSubSessions([])
|
||||
setParentSessionId(null)
|
||||
loadSessions()
|
||||
return
|
||||
}
|
||||
@@ -249,8 +222,6 @@ export function useKombiPipeline() {
|
||||
setSessionId(null)
|
||||
setSessionName('')
|
||||
setCurrentStep(0)
|
||||
setSubSessions([])
|
||||
setParentSessionId(null)
|
||||
setSteps(initSteps())
|
||||
}, [])
|
||||
|
||||
@@ -292,40 +263,6 @@ export function useKombiPipeline() {
|
||||
}
|
||||
}, [sessionId])
|
||||
|
||||
// ---- Orientation completion (checks for page-split sub-sessions) ----
|
||||
|
||||
const handleOrientationComplete = useCallback(async (sid: string) => {
|
||||
setSessionId(sid)
|
||||
loadSessions()
|
||||
|
||||
try {
|
||||
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`)
|
||||
if (res.ok) {
|
||||
const data = await res.json()
|
||||
if (data.sub_sessions?.length > 0) {
|
||||
const subs: SubSession[] = data.sub_sessions.map((s: SubSession) => ({
|
||||
id: s.id,
|
||||
name: s.name,
|
||||
box_index: s.box_index,
|
||||
current_step: s.current_step,
|
||||
}))
|
||||
setSubSessions(subs)
|
||||
setParentSessionId(sid)
|
||||
openSession(subs[0].id, true)
|
||||
return
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('Failed to check for sub-sessions:', e)
|
||||
}
|
||||
|
||||
handleNext()
|
||||
}, [loadSessions, openSession, handleNext])
|
||||
|
||||
const handleSessionChange = useCallback((newSessionId: string) => {
|
||||
openSession(newSessionId, true)
|
||||
}, [openSession])
|
||||
|
||||
return {
|
||||
// State
|
||||
currentStep,
|
||||
@@ -335,8 +272,6 @@ export function useKombiPipeline() {
|
||||
loadingSessions,
|
||||
activeCategory,
|
||||
isGroundTruth,
|
||||
subSessions,
|
||||
parentSessionId,
|
||||
steps,
|
||||
gridSaveRef,
|
||||
// Computed
|
||||
@@ -351,11 +286,8 @@ export function useKombiPipeline() {
|
||||
deleteSession,
|
||||
renameSession,
|
||||
updateCategory,
|
||||
handleOrientationComplete,
|
||||
handleSessionChange,
|
||||
setSessionId,
|
||||
setSubSessions,
|
||||
setParentSessionId,
|
||||
setSessionName,
|
||||
setIsGroundTruth,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,17 +4,17 @@ import { StepOrientation as BaseStepOrientation } from '@/components/ocr-pipelin
|
||||
|
||||
interface StepOrientationProps {
|
||||
sessionId: string | null
|
||||
onNext: (sessionId: string) => void
|
||||
onNext: () => void
|
||||
onSessionList: () => void
|
||||
}
|
||||
|
||||
/** Thin wrapper around the shared StepOrientation component */
|
||||
/** Thin wrapper — adapts the shared StepOrientation to the Kombi pipeline's simpler onNext() */
|
||||
export function StepOrientation({ sessionId, onNext, onSessionList }: StepOrientationProps) {
|
||||
return (
|
||||
<BaseStepOrientation
|
||||
key={sessionId}
|
||||
sessionId={sessionId}
|
||||
onNext={onNext}
|
||||
onNext={() => onNext()}
|
||||
onSessionList={onSessionList}
|
||||
/>
|
||||
)
|
||||
|
||||
@@ -1,123 +1,198 @@
|
||||
'use client'
|
||||
|
||||
import { useState, useEffect } from 'react'
|
||||
import type { SubSession } from '@/app/(admin)/ai/ocr-pipeline/types'
|
||||
|
||||
import { useState, useEffect, useRef } from 'react'
|
||||
const KLAUSUR_API = '/klausur-api'
|
||||
|
||||
interface PageSplitResult {
|
||||
multi_page: boolean
|
||||
page_count?: number
|
||||
page_splits?: { x: number; y: number; width: number; height: number; page_index: number }[]
|
||||
sub_sessions?: { id: string; name: string; page_index: number }[]
|
||||
used_original?: boolean
|
||||
duration_seconds?: number
|
||||
}
|
||||
|
||||
interface StepPageSplitProps {
|
||||
sessionId: string | null
|
||||
sessionName: string
|
||||
onNext: () => void
|
||||
onSubSessionsCreated: (subs: SubSession[]) => void
|
||||
onSplitComplete: (firstChildId: string, firstChildName: string) => void
|
||||
}
|
||||
|
||||
/**
|
||||
* Step 3: Page split detection.
|
||||
* Checks if the image is a double-page spread and offers to split it.
|
||||
* If no split needed, auto-advances.
|
||||
*/
|
||||
export function StepPageSplit({ sessionId, onNext, onSubSessionsCreated }: StepPageSplitProps) {
|
||||
const [checking, setChecking] = useState(false)
|
||||
const [splitResult, setSplitResult] = useState<{ is_double_page: boolean; pages?: number } | null>(null)
|
||||
const [splitting, setSplitting] = useState(false)
|
||||
export function StepPageSplit({ sessionId, sessionName, onNext, onSplitComplete }: StepPageSplitProps) {
|
||||
const [detecting, setDetecting] = useState(false)
|
||||
const [splitResult, setSplitResult] = useState<PageSplitResult | null>(null)
|
||||
const [error, setError] = useState('')
|
||||
const didDetect = useRef(false)
|
||||
|
||||
// Auto-detect page split when step opens
|
||||
useEffect(() => {
|
||||
if (!sessionId) return
|
||||
// Auto-check for page split
|
||||
checkPageSplit()
|
||||
if (!sessionId || didDetect.current) return
|
||||
didDetect.current = true
|
||||
detectPageSplit()
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [sessionId])
|
||||
|
||||
const checkPageSplit = async () => {
|
||||
const detectPageSplit = async () => {
|
||||
if (!sessionId) return
|
||||
setChecking(true)
|
||||
setDetecting(true)
|
||||
setError('')
|
||||
try {
|
||||
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
|
||||
if (!res.ok) throw new Error('Session nicht gefunden')
|
||||
const data = await res.json()
|
||||
|
||||
// If sub-sessions already exist, this was already split
|
||||
if (data.sub_sessions?.length > 0) {
|
||||
onSubSessionsCreated(data.sub_sessions)
|
||||
onNext()
|
||||
return
|
||||
// First check if this session was already split (status='split')
|
||||
const sessionRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
|
||||
if (sessionRes.ok) {
|
||||
const sessionData = await sessionRes.json()
|
||||
if (sessionData.status === 'split' && sessionData.crop_result?.multi_page) {
|
||||
// Already split — find the child sessions in the session list
|
||||
const listRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`)
|
||||
if (listRes.ok) {
|
||||
const listData = await listRes.json()
|
||||
// Child sessions have names like "ParentName — Seite N"
|
||||
const baseName = sessionName || sessionData.name || ''
|
||||
const children = (listData.sessions || [])
|
||||
.filter((s: { name?: string }) => s.name?.startsWith(baseName + ' — '))
|
||||
.sort((a: { name: string }, b: { name: string }) => a.name.localeCompare(b.name))
|
||||
if (children.length > 0) {
|
||||
setSplitResult({
|
||||
multi_page: true,
|
||||
page_count: children.length,
|
||||
sub_sessions: children.map((s: { id: string; name: string }, i: number) => ({
|
||||
id: s.id, name: s.name, page_index: i,
|
||||
})),
|
||||
})
|
||||
onSplitComplete(children[0].id, children[0].name)
|
||||
setDetecting(false)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check aspect ratio to guess if double-page
|
||||
// For now, just auto-advance (page-split detection happens in orientation step)
|
||||
setSplitResult({ is_double_page: false })
|
||||
// Auto-advance if single page
|
||||
onNext()
|
||||
} catch (e) {
|
||||
setError(e instanceof Error ? e.message : String(e))
|
||||
} finally {
|
||||
setChecking(false)
|
||||
}
|
||||
}
|
||||
|
||||
const handleSplit = async () => {
|
||||
if (!sessionId) return
|
||||
setSplitting(true)
|
||||
setError('')
|
||||
try {
|
||||
// Run page-split detection
|
||||
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/page-split`, {
|
||||
method: 'POST',
|
||||
})
|
||||
if (!res.ok) {
|
||||
const data = await res.json().catch(() => ({}))
|
||||
throw new Error(data.detail || 'Split fehlgeschlagen')
|
||||
throw new Error(data.detail || 'Seitentrennung fehlgeschlagen')
|
||||
}
|
||||
const data = await res.json()
|
||||
if (data.sub_sessions?.length > 0) {
|
||||
onSubSessionsCreated(data.sub_sessions)
|
||||
const data: PageSplitResult = await res.json()
|
||||
setSplitResult(data)
|
||||
|
||||
if (data.multi_page && data.sub_sessions?.length) {
|
||||
// Rename sub-sessions to "Title — S. 1", "Title — S. 2"
|
||||
const baseName = sessionName || 'Dokument'
|
||||
for (let i = 0; i < data.sub_sessions.length; i++) {
|
||||
const sub = data.sub_sessions[i]
|
||||
const newName = `${baseName} — S. ${i + 1}`
|
||||
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sub.id}`, {
|
||||
method: 'PUT',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ name: newName }),
|
||||
}).catch(() => {})
|
||||
sub.name = newName
|
||||
}
|
||||
|
||||
// Signal parent to switch to the first child session
|
||||
onSplitComplete(data.sub_sessions[0].id, data.sub_sessions[0].name)
|
||||
}
|
||||
onNext()
|
||||
} catch (e) {
|
||||
setError(e instanceof Error ? e.message : String(e))
|
||||
} finally {
|
||||
setSplitting(false)
|
||||
setDetecting(false)
|
||||
}
|
||||
}
|
||||
|
||||
if (checking) {
|
||||
return <div className="text-sm text-gray-500 py-8 text-center">Pruefe Seitenformat...</div>
|
||||
}
|
||||
if (!sessionId) return null
|
||||
|
||||
if (splitResult?.is_double_page) {
|
||||
return (
|
||||
<div className="space-y-4 p-6 bg-blue-50 dark:bg-blue-900/20 rounded-xl border border-blue-200 dark:border-blue-800">
|
||||
<h3 className="text-sm font-medium text-blue-700 dark:text-blue-300">
|
||||
Doppelseite erkannt
|
||||
</h3>
|
||||
<p className="text-sm text-blue-600 dark:text-blue-400">
|
||||
Das Bild scheint eine Doppelseite zu sein. Soll es in zwei Einzelseiten aufgeteilt werden?
|
||||
</p>
|
||||
<div className="flex gap-2">
|
||||
<button
|
||||
onClick={handleSplit}
|
||||
disabled={splitting}
|
||||
className="px-4 py-2 bg-blue-600 text-white text-sm rounded-lg hover:bg-blue-700 disabled:opacity-50"
|
||||
>
|
||||
{splitting ? 'Wird aufgeteilt...' : 'Aufteilen'}
|
||||
</button>
|
||||
<button
|
||||
onClick={onNext}
|
||||
className="px-4 py-2 bg-gray-200 dark:bg-gray-700 text-sm rounded-lg hover:bg-gray-300"
|
||||
>
|
||||
Einzelseite beibehalten
|
||||
</button>
|
||||
</div>
|
||||
{error && <div className="text-sm text-red-500">{error}</div>}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
const imageUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented`
|
||||
|
||||
return (
|
||||
<div className="text-sm text-gray-500 py-8 text-center">
|
||||
Einzelseite erkannt — weiter zum naechsten Schritt.
|
||||
{error && <div className="text-sm text-red-500 mt-2">{error}</div>}
|
||||
<div className="space-y-4">
|
||||
{/* Image */}
|
||||
<div className="relative rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700">
|
||||
{/* eslint-disable-next-line @next/next/no-img-element */}
|
||||
<img
|
||||
src={imageUrl}
|
||||
alt="Orientiertes Bild"
|
||||
className="w-full object-contain max-h-[500px]"
|
||||
onError={(e) => {
|
||||
// Fallback to non-oriented image
|
||||
(e.target as HTMLImageElement).src =
|
||||
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image`
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Detection status */}
|
||||
{detecting && (
|
||||
<div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
|
||||
<div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
|
||||
Doppelseiten-Erkennung laeuft...
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Detection result */}
|
||||
{splitResult && !detecting && (
|
||||
splitResult.multi_page ? (
|
||||
<div className="bg-blue-50 dark:bg-blue-900/20 rounded-lg border border-blue-200 dark:border-blue-700 p-4 space-y-2">
|
||||
<div className="text-sm font-medium text-blue-700 dark:text-blue-300">
|
||||
Doppelseite erkannt — {splitResult.page_count} Seiten getrennt
|
||||
</div>
|
||||
<p className="text-xs text-blue-600 dark:text-blue-400">
|
||||
Jede Seite wird als eigene Session weiterverarbeitet (eigene Begradigung, Entzerrung, etc.).
|
||||
{splitResult.used_original && ' Trennung auf Originalbild, da Orientierung die Doppelseite gedreht hat.'}
|
||||
</p>
|
||||
<div className="flex gap-2 mt-2">
|
||||
{splitResult.sub_sessions?.map(s => (
|
||||
<span
|
||||
key={s.id}
|
||||
className="text-xs px-2.5 py-1 rounded-md bg-blue-100 dark:bg-blue-800/40 text-blue-700 dark:text-blue-300 font-medium"
|
||||
>
|
||||
{s.name}
|
||||
</span>
|
||||
))}
|
||||
</div>
|
||||
{splitResult.duration_seconds != null && (
|
||||
<div className="text-xs text-gray-400">{splitResult.duration_seconds.toFixed(1)}s</div>
|
||||
)}
|
||||
</div>
|
||||
) : (
|
||||
<div className="bg-green-50 dark:bg-green-900/20 rounded-lg border border-green-200 dark:border-green-800 p-4">
|
||||
<div className="flex items-center gap-2 text-sm font-medium text-green-700 dark:text-green-300">
|
||||
<span>✓</span> Einzelseite — keine Trennung noetig
|
||||
</div>
|
||||
{splitResult.duration_seconds != null && (
|
||||
<div className="text-xs text-gray-400 mt-1">{splitResult.duration_seconds.toFixed(1)}s</div>
|
||||
)}
|
||||
</div>
|
||||
)
|
||||
)}
|
||||
|
||||
{/* Error */}
|
||||
{error && (
|
||||
<div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 p-3 rounded-lg">
|
||||
{error}
|
||||
<button
|
||||
onClick={() => { didDetect.current = false; detectPageSplit() }}
|
||||
className="ml-2 text-teal-600 hover:underline"
|
||||
>
|
||||
Erneut versuchen
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Next button — only show when detection is done */}
|
||||
{(splitResult || error) && !detecting && (
|
||||
<div className="flex justify-end">
|
||||
<button
|
||||
onClick={onNext}
|
||||
className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
|
||||
>
|
||||
Weiter →
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1,28 +1,52 @@
|
||||
'use client'
|
||||
|
||||
import { useState, useCallback } from 'react'
|
||||
import { useState, useCallback, useEffect } from 'react'
|
||||
import { DOCUMENT_CATEGORIES, type DocumentCategory } from '@/app/(admin)/ai/ocr-pipeline/types'
|
||||
|
||||
const KLAUSUR_API = '/klausur-api'
|
||||
|
||||
interface StepUploadProps {
|
||||
onUploaded: (sessionId: string) => void
|
||||
sessionId: string | null
|
||||
onUploaded: (sessionId: string, name: string) => void
|
||||
onNext: () => void
|
||||
}
|
||||
|
||||
export function StepUpload({ onUploaded }: StepUploadProps) {
|
||||
export function StepUpload({ sessionId, onUploaded, onNext }: StepUploadProps) {
|
||||
const [dragging, setDragging] = useState(false)
|
||||
const [uploading, setUploading] = useState(false)
|
||||
const [selectedFile, setSelectedFile] = useState<File | null>(null)
|
||||
const [preview, setPreview] = useState<string | null>(null)
|
||||
const [title, setTitle] = useState('')
|
||||
const [category, setCategory] = useState<DocumentCategory>('vokabelseite')
|
||||
const [error, setError] = useState('')
|
||||
|
||||
const handleUpload = useCallback(async (file: File) => {
|
||||
// Clean up preview URL on unmount
|
||||
useEffect(() => {
|
||||
return () => { if (preview) URL.revokeObjectURL(preview) }
|
||||
}, [preview])
|
||||
|
||||
const handleFileSelect = useCallback((file: File) => {
|
||||
setSelectedFile(file)
|
||||
setError('')
|
||||
if (file.type.startsWith('image/')) {
|
||||
setPreview(URL.createObjectURL(file))
|
||||
} else {
|
||||
setPreview(null)
|
||||
}
|
||||
// Auto-fill title from filename if empty
|
||||
if (!title.trim()) {
|
||||
setTitle(file.name.replace(/\.[^.]+$/, ''))
|
||||
}
|
||||
}, [title])
|
||||
|
||||
const handleUpload = useCallback(async () => {
|
||||
if (!selectedFile) return
|
||||
setUploading(true)
|
||||
setError('')
|
||||
|
||||
try {
|
||||
const formData = new FormData()
|
||||
formData.append('file', file)
|
||||
formData.append('file', selectedFile)
|
||||
if (title.trim()) formData.append('name', title.trim())
|
||||
|
||||
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, {
|
||||
@@ -47,26 +71,164 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
|
||||
})
|
||||
}
|
||||
|
||||
onUploaded(sid)
|
||||
onUploaded(sid, title.trim() || selectedFile.name)
|
||||
} catch (e) {
|
||||
setError(e instanceof Error ? e.message : String(e))
|
||||
} finally {
|
||||
setUploading(false)
|
||||
}
|
||||
}, [title, category, onUploaded])
|
||||
}, [selectedFile, title, category, onUploaded])
|
||||
|
||||
const handleDrop = useCallback((e: React.DragEvent) => {
|
||||
e.preventDefault()
|
||||
setDragging(false)
|
||||
const file = e.dataTransfer.files[0]
|
||||
if (file) handleUpload(file)
|
||||
}, [handleUpload])
|
||||
if (file) handleFileSelect(file)
|
||||
}, [handleFileSelect])
|
||||
|
||||
const handleFileSelect = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
|
||||
const handleInputChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
|
||||
const file = e.target.files?.[0]
|
||||
if (file) handleUpload(file)
|
||||
}, [handleUpload])
|
||||
if (file) handleFileSelect(file)
|
||||
}, [handleFileSelect])
|
||||
|
||||
const clearFile = useCallback(() => {
|
||||
setSelectedFile(null)
|
||||
if (preview) URL.revokeObjectURL(preview)
|
||||
setPreview(null)
|
||||
}, [preview])
|
||||
|
||||
// ---- Phase 2: Uploaded → show result + "Weiter" ----
|
||||
if (sessionId) {
|
||||
return (
|
||||
<div className="space-y-4">
|
||||
<div className="bg-green-50 dark:bg-green-900/20 border border-green-200 dark:border-green-800 rounded-lg p-4">
|
||||
<div className="flex items-center gap-2 text-green-700 dark:text-green-300 text-sm font-medium mb-3">
|
||||
<span>✓</span> Dokument hochgeladen
|
||||
</div>
|
||||
<div className="flex gap-4">
|
||||
<div className="w-48 h-64 rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700 flex-shrink-0 border border-gray-200 dark:border-gray-600">
|
||||
{/* eslint-disable-next-line @next/next/no-img-element */}
|
||||
<img
|
||||
src={`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image`}
|
||||
alt="Hochgeladenes Dokument"
|
||||
className="w-full h-full object-contain"
|
||||
onError={(e) => { (e.target as HTMLImageElement).style.display = 'none' }}
|
||||
/>
|
||||
</div>
|
||||
<div className="text-sm text-gray-600 dark:text-gray-400">
|
||||
<div className="font-medium text-gray-700 dark:text-gray-300 mb-1">
|
||||
{title || 'Dokument'}
|
||||
</div>
|
||||
<div className="text-xs text-gray-400 mt-1">
|
||||
Kategorie: {DOCUMENT_CATEGORIES.find(c => c.value === category)?.label || category}
|
||||
</div>
|
||||
<div className="text-xs font-mono text-gray-400 mt-1">
|
||||
Session: {sessionId.slice(0, 8)}...
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="flex justify-end">
|
||||
<button
|
||||
onClick={onNext}
|
||||
className="px-6 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 transition-colors"
|
||||
>
|
||||
Weiter →
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
// ---- Phase 1b: File selected → preview + "Hochladen" ----
|
||||
if (selectedFile) {
|
||||
return (
|
||||
<div className="space-y-4">
|
||||
{/* Title input */}
|
||||
<div>
|
||||
<label className="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-1">
|
||||
Titel
|
||||
</label>
|
||||
<input
|
||||
type="text"
|
||||
value={title}
|
||||
onChange={(e) => setTitle(e.target.value)}
|
||||
placeholder="z.B. Vokabeln Unit 3"
|
||||
className="w-full px-3 py-2 border border-gray-300 dark:border-gray-600 rounded-lg bg-white dark:bg-gray-800 text-sm"
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Category selector */}
|
||||
<div>
|
||||
<label className="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-1">
|
||||
Kategorie
|
||||
</label>
|
||||
<div className="grid grid-cols-4 gap-1.5">
|
||||
{DOCUMENT_CATEGORIES.map(cat => (
|
||||
<button
|
||||
key={cat.value}
|
||||
onClick={() => setCategory(cat.value)}
|
||||
className={`text-xs px-2 py-1.5 rounded-md text-left transition-colors ${
|
||||
category === cat.value
|
||||
? 'bg-teal-100 dark:bg-teal-900/40 text-teal-700 dark:text-teal-300 ring-1 ring-teal-400'
|
||||
: 'bg-gray-50 dark:bg-gray-700 text-gray-600 dark:text-gray-400 hover:bg-gray-100'
|
||||
}`}
|
||||
>
|
||||
{cat.icon} {cat.label}
|
||||
</button>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* File preview */}
|
||||
<div className="border border-gray-200 dark:border-gray-700 rounded-xl p-4">
|
||||
<div className="flex items-start gap-4">
|
||||
{preview ? (
|
||||
<div className="w-36 h-48 rounded-lg overflow-hidden bg-gray-100 dark:bg-gray-700 flex-shrink-0 border border-gray-200 dark:border-gray-600">
|
||||
{/* eslint-disable-next-line @next/next/no-img-element */}
|
||||
<img src={preview} alt="Vorschau" className="w-full h-full object-contain" />
|
||||
</div>
|
||||
) : (
|
||||
<div className="w-36 h-48 rounded-lg bg-gray-100 dark:bg-gray-700 flex-shrink-0 flex items-center justify-center border border-gray-200 dark:border-gray-600">
|
||||
<span className="text-3xl">📄</span>
|
||||
</div>
|
||||
)}
|
||||
<div className="flex-1 min-w-0">
|
||||
<div className="font-medium text-sm text-gray-700 dark:text-gray-300 truncate">
|
||||
{selectedFile.name}
|
||||
</div>
|
||||
<div className="text-xs text-gray-400 mt-1">
|
||||
{(selectedFile.size / 1024 / 1024).toFixed(1)} MB
|
||||
</div>
|
||||
<button
|
||||
onClick={clearFile}
|
||||
className="text-xs text-red-500 hover:text-red-700 mt-2"
|
||||
>
|
||||
Andere Datei waehlen
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<button
|
||||
onClick={handleUpload}
|
||||
disabled={uploading}
|
||||
className="mt-4 w-full px-4 py-2.5 bg-teal-600 text-white text-sm font-medium rounded-lg hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed transition-colors"
|
||||
>
|
||||
{uploading ? 'Wird hochgeladen...' : 'Hochladen'}
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{error && (
|
||||
<div className="text-sm text-red-500 bg-red-50 dark:bg-red-900/20 p-3 rounded-lg">
|
||||
{error}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
// ---- Phase 1a: No file → drop zone ----
|
||||
return (
|
||||
<div className="space-y-4">
|
||||
{/* Title input */}
|
||||
@@ -116,25 +278,19 @@ export function StepUpload({ onUploaded }: StepUploadProps) {
|
||||
: 'border-gray-300 dark:border-gray-600 hover:border-gray-400'
|
||||
}`}
|
||||
>
|
||||
{uploading ? (
|
||||
<div className="text-sm text-gray-500">Wird hochgeladen...</div>
|
||||
) : (
|
||||
<>
|
||||
<div className="text-4xl mb-3">📤</div>
|
||||
<div className="text-sm text-gray-600 dark:text-gray-400 mb-2">
|
||||
Bild oder PDF hierher ziehen
|
||||
</div>
|
||||
<label className="inline-block px-4 py-2 bg-teal-600 text-white text-sm rounded-lg cursor-pointer hover:bg-teal-700">
|
||||
Datei auswaehlen
|
||||
<input
|
||||
type="file"
|
||||
accept="image/*,.pdf"
|
||||
onChange={handleFileSelect}
|
||||
className="hidden"
|
||||
/>
|
||||
</label>
|
||||
</>
|
||||
)}
|
||||
<div className="text-4xl mb-3">📤</div>
|
||||
<div className="text-sm text-gray-600 dark:text-gray-400 mb-2">
|
||||
Bild oder PDF hierher ziehen
|
||||
</div>
|
||||
<label className="inline-block px-4 py-2 bg-teal-600 text-white text-sm rounded-lg cursor-pointer hover:bg-teal-700">
|
||||
Datei auswaehlen
|
||||
<input
|
||||
type="file"
|
||||
accept="image/*,.pdf"
|
||||
onChange={handleInputChange}
|
||||
className="hidden"
|
||||
/>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
{error && (
|
||||
|
||||
@@ -34,7 +34,8 @@ _STOP_WORDS = frozenset([
|
||||
'der', 'die', 'das', 'dem', 'den', 'des',
|
||||
'ein', 'eine', 'einem', 'einen', 'einer',
|
||||
# Pronouns
|
||||
'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
|
||||
'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
|
||||
'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
|
||||
# Prepositions
|
||||
'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
|
||||
'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
|
||||
@@ -54,6 +55,9 @@ _STOP_WORDS = frozenset([
|
||||
_hyph_de = None
|
||||
_hyph_en = None
|
||||
|
||||
# Cached spellchecker (for autocorrect_pipe_artifacts)
|
||||
_spell_de = None
|
||||
|
||||
|
||||
def _get_hyphenators():
|
||||
"""Lazy-load pyphen hyphenators (cached across calls)."""
|
||||
@@ -69,6 +73,35 @@ def _get_hyphenators():
|
||||
return _hyph_de, _hyph_en
|
||||
|
||||
|
||||
def _get_spellchecker():
|
||||
"""Lazy-load German spellchecker (cached across calls)."""
|
||||
global _spell_de
|
||||
if _spell_de is not None:
|
||||
return _spell_de
|
||||
try:
|
||||
from spellchecker import SpellChecker
|
||||
except ImportError:
|
||||
return None
|
||||
_spell_de = SpellChecker(language='de')
|
||||
return _spell_de
|
||||
|
||||
|
||||
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
|
||||
"""Check whether pyphen recognises a word (DE or EN)."""
|
||||
if len(word) < 2:
|
||||
return False
|
||||
return ('|' in hyph_de.inserted(word, hyphen='|')
|
||||
or '|' in hyph_en.inserted(word, hyphen='|'))
|
||||
|
||||
|
||||
def _is_real_word(word: str) -> bool:
|
||||
"""Check whether spellchecker knows this word (case-insensitive)."""
|
||||
spell = _get_spellchecker()
|
||||
if spell is None:
|
||||
return False
|
||||
return word.lower() in spell
|
||||
|
||||
|
||||
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
||||
"""Try to hyphenate a word using DE then EN dictionary.
|
||||
|
||||
@@ -83,6 +116,139 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
|
||||
"""Try to correct a word that has OCR pipe artifacts.
|
||||
|
||||
Printed syllable divider lines on dictionary pages confuse OCR:
|
||||
the vertical stroke is often read as an extra character (commonly
|
||||
``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
|
||||
Sometimes OCR reads one divider as ``|`` and another as a letter,
|
||||
so the garbled character may be far from any detected pipe.
|
||||
|
||||
Uses ``spellchecker`` (frequency-based word list) for validation —
|
||||
unlike pyphen which is a pattern-based hyphenator and accepts
|
||||
nonsense strings like "Zeplpelin".
|
||||
|
||||
Strategy:
|
||||
1. Strip ``|`` — if spellchecker knows the result, done.
|
||||
2. Try deleting each pipe-like character (l, I, 1, i, t).
|
||||
OCR inserts extra chars that resemble vertical strokes.
|
||||
3. Fall back to spellchecker's own ``correction()`` method.
|
||||
4. Preserve the original casing of the first letter.
|
||||
"""
|
||||
stripped = word_with_pipes.replace('|', '')
|
||||
if not stripped or len(stripped) < 3:
|
||||
return stripped # too short to validate
|
||||
|
||||
# Step 1: if the stripped word is already a real word, done
|
||||
if _is_real_word(stripped):
|
||||
return stripped
|
||||
|
||||
# Step 2: try deleting pipe-like characters (most likely artifacts)
|
||||
_PIPE_LIKE = frozenset('lI1it')
|
||||
for idx in range(len(stripped)):
|
||||
if stripped[idx] not in _PIPE_LIKE:
|
||||
continue
|
||||
candidate = stripped[:idx] + stripped[idx + 1:]
|
||||
if len(candidate) >= 3 and _is_real_word(candidate):
|
||||
return candidate
|
||||
|
||||
# Step 3: use spellchecker's built-in correction
|
||||
spell = _get_spellchecker()
|
||||
if spell is not None:
|
||||
suggestion = spell.correction(stripped.lower())
|
||||
if suggestion and suggestion != stripped.lower():
|
||||
# Preserve original first-letter case
|
||||
if stripped[0].isupper():
|
||||
suggestion = suggestion[0].upper() + suggestion[1:]
|
||||
return suggestion
|
||||
|
||||
return None # could not fix
|
||||
|
||||
|
||||
def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).

    For every cell whose ``col_type`` starts with ``column_``:

    1. Each word box whose alphabetic core contains ``|`` is passed to
       ``_autocorrect_piped_word``.  A successful correction replaces the
       core.  If correction fails (unknown word or no spellchecker), the
       pipes are still removed from the word box so it stays consistent
       with the cell text.
    2. If at least one word box was corrected, the cell text is rebuilt by
       joining all word-box texts with single spaces.
    3. Any ``|`` remaining in the cell text is removed.

    Args:
        zones_data: Zones, each optionally holding a ``cells`` list; cells
            are mutated in place.
        session_id: Only used in the log message.

    Returns:
        The number of cells modified.
    """
    if _get_spellchecker() is None:
        # No dictionary: nothing can be validated or corrected, so this
        # pass degrades to plain pipe stripping.
        logger.warning("spellchecker not available — pipe autocorrect limited")

    modified = 0
    for z in zones_data:
        for cell in z.get("cells") or []:
            ct = cell.get("col_type") or ""
            if not ct.startswith("column_"):
                continue

            word_boxes = cell.get("word_boxes") or []
            corrected_any = False  # a word box received a real correction
            stripped_any = False   # a word box only had its pipes removed

            # --- Fix word boxes ---
            for wb in word_boxes:
                wb_text = wb.get("text") or ""
                if "|" not in wb_text:
                    continue

                # Separate leading/trailing non-letters from the core word
                m = re.match(
                    r'^([^a-zA-ZäöüÄÖÜßẞ]*)'
                    r'(.*?)'
                    r'([^a-zA-ZäöüÄÖÜßẞ]*)$',
                    wb_text,
                )
                if not m:
                    continue
                lead, core, trail = m.group(1), m.group(2), m.group(3)
                if "|" not in core:
                    continue

                corrected = _autocorrect_piped_word(core)
                if corrected is None:
                    # Could not be fixed: keep the letters, drop the pipes.
                    wb["text"] = lead + core.replace("|", "") + trail
                    stripped_any = True
                elif corrected != core:
                    wb["text"] = lead + corrected + trail
                    corrected_any = True

            # --- Rebuild cell text from word boxes ---
            # Only when letters actually changed; a pure pipe strip is
            # covered by the fallback below and leaves the text otherwise
            # untouched.
            if corrected_any:
                cell["text"] = " ".join(
                    (wb.get("text") or "") for wb in word_boxes
                )

            # --- Fallback: strip residual | from cell text ---
            # (covers cells without word boxes or with uncorrected words)
            text = cell.get("text") or ""
            text_had_pipe = "|" in text
            if text_had_pipe:
                cell["text"] = text.replace("|", "")

            if corrected_any or stripped_any or text_had_pipe:
                modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified
|
||||
|
||||
|
||||
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
|
||||
"""Merge fragments separated by single spaces where OCR split at a pipe.
|
||||
|
||||
@@ -139,6 +305,93 @@ def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
|
||||
return ' '.join(result)
|
||||
|
||||
|
||||
def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
    """Re-join words that OCR split across a space, cell by cell.

    OCR often breaks a word at a syllable boundary, giving cell text such
    as "zerknit tert" instead of "zerknittert".  Every cell whose
    ``col_type`` starts with ``column_`` and whose text contains a space
    is handed to ``_try_merge_word_gaps``.  Cells whose text (ignoring
    ``[...]`` bracketed parts) matches ``_IPA_RE`` are left alone.

    Only ``cell["text"]`` is rewritten; word boxes are not touched here.

    Args:
        zones_data: Zones whose cells are updated in place.
        session_id: Only used in the log message.

    Returns:
        The number of cells modified (0 when no German hyphenator is
        available).
    """
    german_hyph, _unused_en = _get_hyphenators()
    if german_hyph is None:
        return 0

    content_cells = (
        cell
        for zone in zones_data
        for cell in zone.get("cells", [])
        if cell.get("col_type", "").startswith("column_")
    )

    merged_cells = 0
    for cell in content_cells:
        original = cell.get("text", "")
        if not original or " " not in original:
            continue

        # Skip IPA cells (bracketed transcriptions do not count)
        if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', original)):
            continue

        rewritten = _try_merge_word_gaps(original, german_hyph)
        if rewritten == original:
            continue
        cell["text"] = rewritten
        merged_cells += 1

    if merged_cells:
        logger.info(
            "build-grid session %s: merged word gaps in %d cells",
            session_id, merged_cells,
        )
    return merged_cells
|
||||
|
||||
|
||||
def _try_merge_word_gaps(text: str, hyph_de) -> str:
|
||||
"""Merge OCR word fragments with relaxed threshold (max_short=5).
|
||||
|
||||
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
|
||||
(max_short=5 instead of 3). Still requires pyphen to recognize the
|
||||
merged word.
|
||||
"""
|
||||
parts = text.split(' ')
|
||||
if len(parts) < 2:
|
||||
return text
|
||||
|
||||
result = [parts[0]]
|
||||
i = 1
|
||||
while i < len(parts):
|
||||
prev = result[-1]
|
||||
curr = parts[i]
|
||||
|
||||
prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
|
||||
curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
|
||||
|
||||
should_try = (
|
||||
prev == prev_alpha
|
||||
and prev_alpha and curr_alpha
|
||||
and prev_alpha.lower() not in _STOP_WORDS
|
||||
and curr_alpha.lower() not in _STOP_WORDS
|
||||
and min(len(prev_alpha), len(curr_alpha)) <= 5
|
||||
and len(prev_alpha) + len(curr_alpha) >= 4
|
||||
)
|
||||
|
||||
if should_try:
|
||||
merged_alpha = prev_alpha + curr_alpha
|
||||
hyph = hyph_de.inserted(merged_alpha, hyphen='-')
|
||||
if '-' in hyph:
|
||||
result[-1] = prev + curr
|
||||
i += 1
|
||||
continue
|
||||
|
||||
result.append(curr)
|
||||
i += 1
|
||||
|
||||
return ' '.join(result)
|
||||
|
||||
|
||||
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
|
||||
"""Syllabify all significant words in a text string.
|
||||
|
||||
@@ -259,6 +512,12 @@ def insert_syllable_dividers(
|
||||
if not text:
|
||||
continue
|
||||
|
||||
# In auto mode (force=False), only normalize cells that already
|
||||
# have | from OCR (i.e. printed syllable dividers on the original
|
||||
# scan). Don't add new syllable marks to other words.
|
||||
if not force and "|" not in text:
|
||||
continue
|
||||
|
||||
new_text = _syllabify_text(text, hyph_de, hyph_en)
|
||||
if new_text != text:
|
||||
cell["text"] = new_text
|
||||
|
||||
@@ -1323,6 +1323,14 @@ async def _build_grid_core(
|
||||
and wb.get("conf", 100) < 85):
|
||||
to_remove.add(i)
|
||||
|
||||
# Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
|
||||
# Small images/icons next to words get OCR'd as ">", "<", "~", etc.
|
||||
# Remove word boxes that contain NO letters or digits.
|
||||
for i, wb in enumerate(wbs):
|
||||
t = (wb.get("text") or "").strip()
|
||||
if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
|
||||
to_remove.add(i)
|
||||
|
||||
# Rule (b) + (c): overlap and duplicate detection
|
||||
# Sort by x for pairwise comparison
|
||||
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
|
||||
@@ -1353,6 +1361,19 @@ async def _build_grid_core(
|
||||
to_merge.append((i1, i2))
|
||||
continue
|
||||
|
||||
# High overlap (>75%) with different alphabetic text:
|
||||
# OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104)
|
||||
# causing it to heavily overlap with the next fragment ("brech").
|
||||
# Merge instead of removing when one is a short prefix (≤4 chars)
|
||||
# and the texts are different.
|
||||
if (overlap_pct > 0.75
|
||||
and _ALPHA_WORD_RE.match(t1)
|
||||
and _ALPHA_WORD_RE.match(t2)
|
||||
and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
|
||||
and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
|
||||
to_merge.append((i1, i2))
|
||||
continue
|
||||
|
||||
if overlap_pct <= 0.40:
|
||||
continue # too little overlap and not alphabetic merge
|
||||
|
||||
@@ -1393,15 +1414,22 @@ async def _build_grid_core(
|
||||
c2 = w2.get("conf", 50)
|
||||
to_remove.add(i1 if c1 <= c2 else i2)
|
||||
|
||||
# Execute merges first (syllable-split words)
|
||||
# Execute merges first (syllable-split words).
|
||||
# Use merge_parent to support chain merging: if "zer" absorbed
|
||||
# "brech" and then "brech"+"lich" is a merge pair, redirect to
|
||||
# merge "lich" into "zer" → "zerbrechlich".
|
||||
if to_merge:
|
||||
merged_indices: set = set()
|
||||
merge_parent: Dict[int, int] = {} # absorbed → absorber
|
||||
for mi1, mi2 in to_merge:
|
||||
if mi1 in to_remove or mi2 in to_remove:
|
||||
continue # don't merge if one is being removed
|
||||
if mi1 in merged_indices or mi2 in merged_indices:
|
||||
continue # already merged
|
||||
mw1, mw2 = wbs[mi1], wbs[mi2]
|
||||
# Follow chain: if mi1 was absorbed, find root absorber
|
||||
actual_mi1 = mi1
|
||||
while actual_mi1 in merge_parent:
|
||||
actual_mi1 = merge_parent[actual_mi1]
|
||||
if actual_mi1 in to_remove or mi2 in to_remove:
|
||||
continue
|
||||
if mi2 in merge_parent:
|
||||
continue # mi2 already absorbed
|
||||
mw1, mw2 = wbs[actual_mi1], wbs[mi2]
|
||||
# Concatenate text (no space — they're parts of one word)
|
||||
mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
|
||||
mt2 = (mw2.get("text") or "").strip()
|
||||
@@ -1419,9 +1447,8 @@ async def _build_grid_core(
|
||||
mw1["width"] = mr - mx
|
||||
mw1["height"] = mb - my
|
||||
mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
|
||||
to_remove.add(mi2) # remove the second one
|
||||
merged_indices.add(mi1)
|
||||
merged_indices.add(mi2)
|
||||
to_remove.add(mi2)
|
||||
merge_parent[mi2] = actual_mi1
|
||||
bullet_removed -= 1 # net: merge, not removal
|
||||
|
||||
if to_remove:
|
||||
@@ -1593,6 +1620,22 @@ async def _build_grid_core(
|
||||
except Exception as e:
|
||||
logger.warning("Dictionary detection failed: %s", e)
|
||||
|
||||
# --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
|
||||
try:
|
||||
from cv_syllable_detect import merge_word_gaps_in_zones
|
||||
merge_word_gaps_in_zones(zones_data, session_id)
|
||||
except Exception as e:
|
||||
logger.warning("Word-gap merge failed: %s", e)
|
||||
|
||||
# --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
|
||||
# Strips | from words, validates with spellchecker, tries char-deletion for garbled
|
||||
# words like "Ze|plpe|lin" → "Zeppelin".
|
||||
try:
|
||||
from cv_syllable_detect import autocorrect_pipe_artifacts
|
||||
autocorrect_pipe_artifacts(zones_data, session_id)
|
||||
except Exception as e:
|
||||
logger.warning("Pipe autocorrect failed: %s", e)
|
||||
|
||||
# --- Syllable divider insertion for dictionary pages ---
|
||||
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
||||
# "all" = force on all content words, "en" = English column only,
|
||||
@@ -1626,6 +1669,15 @@ async def _build_grid_core(
|
||||
except Exception as e:
|
||||
logger.warning("Syllable insertion failed: %s", e)
|
||||
|
||||
# When syllable mode is "none", strip any residual | from OCR so
|
||||
# that the displayed text is clean (e.g. "Zel|le" → "Zelle").
|
||||
if syllable_mode == "none":
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
t = cell.get("text", "")
|
||||
if "|" in t:
|
||||
cell["text"] = t.replace("|", "")
|
||||
|
||||
# Clean up internal flags before returning
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
|
||||
@@ -22,6 +22,148 @@ from cv_ocr_engines import _text_has_garbled_ipa
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cross-column word splitting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_spell_cache: Optional[Any] = None
|
||||
_spell_loaded = False
|
||||
|
||||
|
||||
def _is_recognized_word(text: str) -> bool:
|
||||
"""Check if *text* is a recognized German or English word.
|
||||
|
||||
Uses the spellchecker library (same as cv_syllable_detect.py).
|
||||
Returns True for real words like "oder", "Kabel", "Zeitung".
|
||||
Returns False for OCR merge artifacts like "sichzie", "dasZimmer".
|
||||
"""
|
||||
global _spell_cache, _spell_loaded
|
||||
if not text or len(text) < 2:
|
||||
return False
|
||||
|
||||
if not _spell_loaded:
|
||||
_spell_loaded = True
|
||||
try:
|
||||
from spellchecker import SpellChecker
|
||||
_spell_cache = SpellChecker(language="de")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if _spell_cache is None:
|
||||
return False
|
||||
|
||||
return text.lower() in _spell_cache
|
||||
|
||||
|
||||
def _split_cross_column_words(
|
||||
words: List[Dict],
|
||||
columns: List[Dict],
|
||||
) -> List[Dict]:
|
||||
"""Split word boxes that span across column boundaries.
|
||||
|
||||
When OCR merges adjacent words from different columns (e.g. "sichzie"
|
||||
spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
|
||||
split the word box at the column boundary so each piece is assigned
|
||||
to the correct column.
|
||||
|
||||
Only splits when:
|
||||
- The word has significant overlap (>15% of its width) on both sides
|
||||
- AND the word is not a recognized real word (OCR merge artifact), OR
|
||||
the word contains a case transition (lowercase→uppercase) near the
|
||||
boundary indicating two merged words like "dasZimmer".
|
||||
"""
|
||||
if len(columns) < 2:
|
||||
return words
|
||||
|
||||
# Column boundaries = midpoints between adjacent column edges
|
||||
boundaries = []
|
||||
for i in range(len(columns) - 1):
|
||||
boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2
|
||||
boundaries.append(boundary)
|
||||
|
||||
new_words: List[Dict] = []
|
||||
split_count = 0
|
||||
for w in words:
|
||||
w_left = w["left"]
|
||||
w_width = w["width"]
|
||||
w_right = w_left + w_width
|
||||
text = (w.get("text") or "").strip()
|
||||
|
||||
if not text or len(text) < 4 or w_width < 10:
|
||||
new_words.append(w)
|
||||
continue
|
||||
|
||||
# Find the first boundary this word straddles significantly
|
||||
split_boundary = None
|
||||
for b in boundaries:
|
||||
if w_left < b < w_right:
|
||||
left_part = b - w_left
|
||||
right_part = w_right - b
|
||||
# Both sides must have at least 15% of the word width
|
||||
if left_part > w_width * 0.15 and right_part > w_width * 0.15:
|
||||
split_boundary = b
|
||||
break
|
||||
|
||||
if split_boundary is None:
|
||||
new_words.append(w)
|
||||
continue
|
||||
|
||||
# Compute approximate split position in the text.
|
||||
left_width = split_boundary - w_left
|
||||
split_ratio = left_width / w_width
|
||||
approx_pos = len(text) * split_ratio
|
||||
|
||||
# Strategy 1: look for a case transition (lowercase→uppercase) near
|
||||
# the approximate split point — e.g. "dasZimmer" splits at 'Z'.
|
||||
split_char = None
|
||||
search_lo = max(1, int(approx_pos) - 3)
|
||||
search_hi = min(len(text), int(approx_pos) + 2)
|
||||
for i in range(search_lo, search_hi):
|
||||
if text[i - 1].islower() and text[i].isupper():
|
||||
split_char = i
|
||||
break
|
||||
|
||||
# Strategy 2: if no case transition, only split if the whole word
|
||||
# is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
|
||||
# Real words like "oder", "Kabel", "Zeitung" must not be split.
|
||||
if split_char is None:
|
||||
clean = re.sub(r"[,;:.!?]+$", "", text) # strip trailing punct
|
||||
if _is_recognized_word(clean):
|
||||
new_words.append(w)
|
||||
continue
|
||||
# Not a real word — use floor of proportional position
|
||||
split_char = max(1, min(len(text) - 1, int(approx_pos)))
|
||||
|
||||
left_text = text[:split_char].rstrip()
|
||||
right_text = text[split_char:].lstrip()
|
||||
|
||||
if len(left_text) < 2 or len(right_text) < 2:
|
||||
new_words.append(w)
|
||||
continue
|
||||
|
||||
right_width = w_width - round(left_width)
|
||||
new_words.append({
|
||||
**w,
|
||||
"text": left_text,
|
||||
"width": round(left_width),
|
||||
})
|
||||
new_words.append({
|
||||
**w,
|
||||
"text": right_text,
|
||||
"left": round(split_boundary),
|
||||
"width": right_width,
|
||||
})
|
||||
split_count += 1
|
||||
logger.info(
|
||||
"split cross-column word %r → %r + %r at boundary %.0f",
|
||||
text, left_text, right_text, split_boundary,
|
||||
)
|
||||
|
||||
if split_count:
|
||||
logger.info("split %d cross-column word(s)", split_count)
|
||||
return new_words
|
||||
|
||||
|
||||
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
|
||||
"""Remove page-border decoration strip words BEFORE column detection.
|
||||
|
||||
@@ -912,6 +1054,13 @@ def _detect_heading_rows_by_single_cell(
|
||||
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
||||
if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
|
||||
continue
|
||||
# Guard: dictionary section headings are short (1-4 alpha chars
|
||||
# like "A", "Ab", "Zi", "Sch"). Longer text that starts
|
||||
# lowercase is a regular vocabulary word (e.g. "zentral") that
|
||||
# happens to appear alone in its row.
|
||||
alpha_only = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', text)
|
||||
if len(alpha_only) > 4 and text[0].islower():
|
||||
continue
|
||||
heading_row_indices.append(ri)
|
||||
|
||||
# Guard: if >25% of eligible rows would become headings, the
|
||||
@@ -1104,6 +1253,12 @@ def _build_zone_grid(
|
||||
"header_rows": [],
|
||||
}
|
||||
|
||||
# Split word boxes that straddle column boundaries (e.g. "sichzie"
|
||||
# spanning Col 1 + Col 2). Must happen after column detection and
|
||||
# before cell assignment.
|
||||
if len(columns) >= 2:
|
||||
zone_words = _split_cross_column_words(zone_words, columns)
|
||||
|
||||
# Build cells
|
||||
cells = _build_cells(zone_words, columns, rows, img_w, img_h)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user