feat: OCR pipeline step 8 — validation view with image detection & generation
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
Replaces the stub StepGroundTruth with a full side-by-side Original vs Reconstruction view. Adds VLM-based image region detection (qwen2.5vl), mflux image generation proxy, sync scroll/zoom, manual region drawing, and score/notes persistence. New backend endpoints: detect-images, generate-image, validate, get validation. New standalone mflux-service (scripts/mflux-service.py) for Metal GPU generation. Dockerfile.base: adds fonts-liberation (Apache-2.0). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -313,7 +313,7 @@ export default function OcrPipelinePage() {
|
|||||||
case 6:
|
case 6:
|
||||||
return <StepReconstruction sessionId={sessionId} onNext={handleNext} />
|
return <StepReconstruction sessionId={sessionId} onNext={handleNext} />
|
||||||
case 7:
|
case 7:
|
||||||
return <StepGroundTruth />
|
return <StepGroundTruth sessionId={sessionId} onNext={handleNext} />
|
||||||
default:
|
default:
|
||||||
return null
|
return null
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -264,6 +264,24 @@ export interface WordGroundTruth {
|
|||||||
notes?: string
|
notes?: string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * One picture area on a page, together with the text used to (re)generate it.
 */
export interface ImageRegion {
  /**
   * Position and size of the region. The validation view feeds these values
   * straight into CSS `left`/`top`/`width`/`height` as percentages.
   */
  bbox_pct: {
    x: number
    y: number
    w: number
    h: number
  }
  /** Text sent to the image generation endpoint. */
  prompt: string
  /** Human-readable description of the region. */
  description: string
  /** Generated image, used directly as an `<img src>` value; `null` until generated. */
  image_b64: string | null
  /** Rendering style requested for the generated image. */
  style: 'educational' | 'cartoon' | 'sketch' | 'clipart' | 'realistic'
}
|
||||||
|
|
||||||
|
/** The set of style identifiers accepted by an `ImageRegion`. */
export type ImageStyle = ImageRegion['style']

// Style identifier paired with the label shown in the style dropdown.
const styleLabelPairs: [ImageStyle, string][] = [
  ['educational', 'Lehrbuch'],
  ['cartoon', 'Cartoon'],
  ['sketch', 'Skizze'],
  ['clipart', 'Clipart'],
  ['realistic', 'Realistisch'],
]

/** Dropdown options for choosing an image style, in display order. */
export const IMAGE_STYLES: { value: ImageStyle; label: string }[] = styleLabelPairs.map(
  ([value, label]) => ({ value, label }),
)
|
||||||
|
|
||||||
export const PIPELINE_STEPS: PipelineStep[] = [
|
export const PIPELINE_STEPS: PipelineStep[] = [
|
||||||
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
|
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
|
||||||
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },
|
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },
|
||||||
|
|||||||
@@ -1,18 +1,582 @@
|
|||||||
'use client'
|
'use client'
|
||||||
|
|
||||||
export function StepGroundTruth() {
|
import { useCallback, useEffect, useRef, useState } from 'react'
|
||||||
|
import type {
|
||||||
|
GridCell, ColumnMeta, ImageRegion, ImageStyle,
|
||||||
|
} from '@/app/(admin)/ai/ocr-pipeline/types'
|
||||||
|
import { IMAGE_STYLES as STYLES } from '@/app/(admin)/ai/ocr-pipeline/types'
|
||||||
|
|
||||||
|
// Path prefix for all requests to the klausur backend.
const KLAUSUR_API = '/klausur-api'

// Colour per column type, used for the column stripes and the cell text in the
// reconstruction panel. Types missing here fall back to a grey at the call site.
const COL_TYPE_COLORS: Record<string, string> = {
  'column_en': '#3b82f6',
  'column_de': '#22c55e',
  'column_example': '#f97316',
  'column_text': '#a855f7',
  'page_ref': '#06b6d4',
  'column_marker': '#6b7280',
}
|
||||||
|
|
||||||
|
/** Props accepted by the validation step component. */
interface StepGroundTruthProps {
  /** Pipeline session to validate; nothing is loaded while this is `null`. */
  sessionId: string | null

  /** Invoked when the user finishes this step. */
  onNext: () => void
}
|
||||||
|
|
||||||
|
/** The subset of the session response that the validation view renders. */
interface SessionData {
  /** Recognised grid cells (text plus percentage bounding box). */
  cells: GridCell[]

  /** Column metadata used to draw the background stripes. */
  columnsUsed: ColumnMeta[]

  /** Image width; column positions are divided by this to obtain percentages. */
  imageWidth: number

  /** Image height; together with the width it fixes the panel's aspect ratio. */
  imageHeight: number

  /** URL of the original scan shown in the left panel. */
  originalImageUrl: string
}
|
||||||
|
|
||||||
|
export function StepGroundTruth({ sessionId, onNext }: StepGroundTruthProps) {
|
||||||
|
const [status, setStatus] = useState<'loading' | 'ready' | 'saving' | 'saved' | 'error'>('loading')
|
||||||
|
const [error, setError] = useState('')
|
||||||
|
const [session, setSession] = useState<SessionData | null>(null)
|
||||||
|
const [imageRegions, setImageRegions] = useState<(ImageRegion & { generating?: boolean })[]>([])
|
||||||
|
const [detecting, setDetecting] = useState(false)
|
||||||
|
const [zoom, setZoom] = useState(100)
|
||||||
|
const [syncScroll, setSyncScroll] = useState(true)
|
||||||
|
const [notes, setNotes] = useState('')
|
||||||
|
const [score, setScore] = useState<number | null>(null)
|
||||||
|
const [drawingRegion, setDrawingRegion] = useState(false)
|
||||||
|
const [dragStart, setDragStart] = useState<{ x: number; y: number } | null>(null)
|
||||||
|
const [dragEnd, setDragEnd] = useState<{ x: number; y: number } | null>(null)
|
||||||
|
|
||||||
|
const leftPanelRef = useRef<HTMLDivElement>(null)
|
||||||
|
const rightPanelRef = useRef<HTMLDivElement>(null)
|
||||||
|
|
||||||
|
// (Re)load the session whenever the session id changes; without an id there is nothing to fetch.
useEffect(() => {
  if (sessionId) {
    loadSessionData()
  }
  // loadSessionData is intentionally left out of the dependency list.
  // eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId])
|
||||||
|
|
||||||
|
// Fetch the session plus any stored validation and copy both into component state.
// Ends in status 'ready' on success, or status 'error' with the message in `error`.
const loadSessionData = async () => {
  if (!sessionId) return
  setStatus('loading')
  // Drop the message of an earlier failed attempt so a successful (re)load
  // does not come up with an outdated error banner.
  setError('')
  try {
    const resp = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
    if (!resp.ok) throw new Error(`Failed to load session: ${resp.status}`)
    const data = await resp.json()

    const wordResult = data.word_result || {}
    setSession({
      cells: wordResult.cells || [],
      columnsUsed: wordResult.columns_used || [],
      // Dimensions: prefer the word result, then the session, then a fixed fallback.
      imageWidth: wordResult.image_width || data.image_width || 800,
      imageHeight: wordResult.image_height || data.image_height || 600,
      originalImageUrl: data.original_image_url || `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/original`,
    })

    // Load existing validation data. A non-OK response is treated as "nothing stored yet".
    const valResp = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/reconstruction/validation`)
    const validation = valResp.ok ? (await valResp.json()).validation : null

    // Always write all three values: when this component is reused for a different
    // session that has no validation yet, the previous session's regions, notes and
    // score must not remain on screen.
    setImageRegions(validation?.image_regions || [])
    setNotes(validation?.notes || '')
    setScore(validation?.score ?? null)

    setStatus('ready')
  } catch (e) {
    setError(e instanceof Error ? e.message : String(e))
    setStatus('error')
  }
}
|
||||||
|
|
||||||
|
// Mirror the scroll offsets of the panel that was scrolled onto the other panel.
// Does nothing while sync scrolling is switched off or a panel is not mounted.
const handleScroll = useCallback((source: 'left' | 'right') => {
  if (!syncScroll) return

  const scrolledLeft = source === 'left'
  const from = scrolledLeft ? leftPanelRef.current : rightPanelRef.current
  const to = scrolledLeft ? rightPanelRef.current : leftPanelRef.current
  if (!from || !to) return

  to.scrollTop = from.scrollTop
  to.scrollLeft = from.scrollLeft
}, [syncScroll])
|
||||||
|
|
||||||
|
// Ask the backend to detect image regions (via VLM) and replace the current
// region list with the result. `detecting` is true for the duration of the request.
const handleDetectImages = async () => {
  if (!sessionId) return

  const endpoint = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/reconstruction/detect-images`
  setDetecting(true)
  try {
    const resp = await fetch(endpoint, { method: 'POST' })
    if (!resp.ok) {
      throw new Error(`Detection failed: ${resp.status}`)
    }
    const { regions } = await resp.json()
    setImageRegions(regions || [])
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e)
    setError(message)
  } finally {
    setDetecting(false)
  }
}
|
||||||
|
|
||||||
|
// Generate an image for the region at `index` from its current prompt and style.
// The region carries `generating: true` while the request is in flight.
const handleGenerateImage = async (index: number) => {
  if (!sessionId) return
  const target = imageRegions[index]
  if (!target) return

  // Shallow-merge `patch` into the region at `index`; all other regions are kept as they are.
  const patchRegion = (patch: Partial<ImageRegion & { generating?: boolean }>) => {
    setImageRegions(prev => prev.map((r, i) => (i === index ? { ...r, ...patch } : r)))
  }

  patchRegion({ generating: true })

  try {
    const payload = JSON.stringify({
      region_index: index,
      prompt: target.prompt,
      style: target.style,
    })
    const resp = await fetch(
      `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/reconstruction/generate-image`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: payload,
      }
    )
    if (!resp.ok) throw new Error(`Generation failed: ${resp.status}`)
    const data = await resp.json()

    patchRegion({ image_b64: data.image_b64, generating: false })
  } catch (e) {
    patchRegion({ generating: false })
    setError(e instanceof Error ? e.message : String(e))
  }
}
|
||||||
|
|
||||||
|
// Save the reviewer's notes and score for the current session.
// Resolves to true when the backend accepted the save and to false otherwise
// (also when there is no session), so a caller can decide whether to continue.
const handleSave = async (): Promise<boolean> => {
  if (!sessionId) return false
  setStatus('saving')
  // A failed earlier attempt leaves its message in `error`; clear it so that a
  // successful retry is not displayed next to an outdated failure banner.
  setError('')
  try {
    const resp = await fetch(
      `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/reconstruction/validate`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ notes, score }),
      }
    )
    if (!resp.ok) throw new Error(`Save failed: ${resp.status}`)
    setStatus('saved')
    return true
  } catch (e) {
    setError(e instanceof Error ? e.message : String(e))
    setStatus('error')
    return false
  }
}
|
||||||
|
|
||||||
|
// Manual region drawing on the reconstruction: pressing the mouse while drawing
// mode is active anchors a new rectangle at the pointer position, expressed as a
// percentage of the container's width and height.
const handleReconMouseDown = (e: React.MouseEvent<HTMLDivElement>) => {
  if (!drawingRegion) return

  const { left, top, width, height } = e.currentTarget.getBoundingClientRect()
  const x = ((e.clientX - left) / width) * 100
  const y = ((e.clientY - top) / height) * 100

  // Start and end coincide until the pointer moves.
  setDragStart({ x, y })
  setDragEnd({ x, y })
}
|
||||||
|
|
||||||
|
// While a drag is in progress, move the rectangle's free corner to the pointer
// position (as a percentage of the container's size).
const handleReconMouseMove = (e: React.MouseEvent<HTMLDivElement>) => {
  if (dragStart) {
    const box = e.currentTarget.getBoundingClientRect()
    const pctX = ((e.clientX - box.left) / box.width) * 100
    const pctY = ((e.clientY - box.top) / box.height) * 100
    setDragEnd({ x: pctX, y: pctY })
  }
}
|
||||||
|
|
||||||
|
// Finish a drag: rectangles wider and taller than 2% become a new, empty image
// region; smaller ones are discarded. Drawing mode ends either way.
const handleReconMouseUp = () => {
  if (!(dragStart && dragEnd)) return

  // Normalise so that (x, y) is the top-left corner whatever the drag direction was.
  const bbox = {
    x: Math.min(dragStart.x, dragEnd.x),
    y: Math.min(dragStart.y, dragEnd.y),
    w: Math.abs(dragEnd.x - dragStart.x),
    h: Math.abs(dragEnd.y - dragStart.y),
  }

  const bigEnough = bbox.w > 2 && bbox.h > 2
  if (bigEnough) {
    setImageRegions(prev => [
      ...prev,
      {
        bbox_pct: bbox,
        prompt: '',
        description: 'Manually selected region',
        image_b64: null,
        style: 'educational' as ImageStyle,
      },
    ])
  }

  setDragStart(null)
  setDragEnd(null)
  setDrawingRegion(false)
}
|
||||||
|
|
||||||
|
// Remove the region at position `index` from the list; the other regions keep their order.
const handleRemoveRegion = (index: number) => {
  setImageRegions(current => {
    const remaining = current.filter((_region, position) => position !== index)
    return remaining
  })
}
|
||||||
|
|
||||||
|
if (status === 'loading') {
|
||||||
return (
|
return (
|
||||||
<div className="flex flex-col items-center justify-center py-16 text-center">
|
<div className="flex items-center justify-center py-16">
|
||||||
<div className="text-5xl mb-4">✅</div>
|
<div className="animate-spin rounded-full h-8 w-8 border-b-2 border-teal-500 mr-3" />
|
||||||
<h3 className="text-lg font-medium text-gray-700 dark:text-gray-300 mb-2">
|
<span className="text-gray-500 dark:text-gray-400">Session wird geladen...</span>
|
||||||
Schritt 7: Ground Truth Validierung
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (status === 'error' && !session) {
|
||||||
|
return (
|
||||||
|
<div className="text-center py-16">
|
||||||
|
<p className="text-red-500">{error}</p>
|
||||||
|
<button onClick={loadSessionData} className="mt-4 px-4 py-2 bg-teal-600 text-white rounded hover:bg-teal-700">
|
||||||
|
Erneut laden
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!session) return null
|
||||||
|
|
||||||
|
const aspect = session.imageHeight / session.imageWidth
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="space-y-4">
|
||||||
|
{/* Header / Controls */}
|
||||||
|
<div className="flex items-center justify-between flex-wrap gap-2">
|
||||||
|
<h3 className="text-lg font-medium text-gray-800 dark:text-gray-200">
|
||||||
|
Validierung — Original vs. Rekonstruktion
|
||||||
</h3>
|
</h3>
|
||||||
<p className="text-gray-500 dark:text-gray-400 max-w-md">
|
<div className="flex items-center gap-3">
|
||||||
Gesamtpruefung der rekonstruierten Seite gegen das Original.
|
<button
|
||||||
Dieser Schritt wird in einer zukuenftigen Version implementiert.
|
onClick={handleDetectImages}
|
||||||
</p>
|
disabled={detecting}
|
||||||
<div className="mt-6 px-4 py-2 bg-amber-100 dark:bg-amber-900/30 text-amber-700 dark:text-amber-400 rounded-full text-sm font-medium">
|
className="px-3 py-1.5 text-sm bg-indigo-600 text-white rounded hover:bg-indigo-700 disabled:opacity-50"
|
||||||
Kommt bald
|
>
|
||||||
|
{detecting ? 'Erkennung laeuft...' : 'Bilder erkennen'}
|
||||||
|
</button>
|
||||||
|
<label className="flex items-center gap-1.5 text-sm text-gray-600 dark:text-gray-400">
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
checked={syncScroll}
|
||||||
|
onChange={e => setSyncScroll(e.target.checked)}
|
||||||
|
className="rounded"
|
||||||
|
/>
|
||||||
|
Sync Scroll
|
||||||
|
</label>
|
||||||
|
<div className="flex items-center gap-1.5">
|
||||||
|
<button onClick={() => setZoom(z => Math.max(50, z - 25))} className="px-2 py-1 text-sm border rounded dark:border-gray-600 hover:bg-gray-100 dark:hover:bg-gray-700">-</button>
|
||||||
|
<span className="text-sm text-gray-600 dark:text-gray-400 w-12 text-center">{zoom}%</span>
|
||||||
|
<button onClick={() => setZoom(z => Math.min(200, z + 25))} className="px-2 py-1 text-sm border rounded dark:border-gray-600 hover:bg-gray-100 dark:hover:bg-gray-700">+</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{error && (
|
||||||
|
<div className="p-2 bg-red-50 dark:bg-red-900/20 text-red-600 dark:text-red-400 text-sm rounded">
|
||||||
|
{error}
|
||||||
|
<button onClick={() => setError('')} className="ml-2 underline">Schliessen</button>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{/* Side-by-side panels */}
|
||||||
|
<div className="grid grid-cols-2 gap-4" style={{ height: 'calc(100vh - 380px)', minHeight: 400 }}>
|
||||||
|
{/* Left: Original */}
|
||||||
|
<div className="border rounded-lg dark:border-gray-700 overflow-hidden flex flex-col">
|
||||||
|
<div className="px-3 py-1.5 bg-gray-50 dark:bg-gray-800 text-sm font-medium text-gray-600 dark:text-gray-400 border-b dark:border-gray-700">
|
||||||
|
Original
|
||||||
|
</div>
|
||||||
|
<div
|
||||||
|
ref={leftPanelRef}
|
||||||
|
className="flex-1 overflow-auto"
|
||||||
|
onScroll={() => handleScroll('left')}
|
||||||
|
>
|
||||||
|
<div style={{ width: `${zoom}%`, minWidth: '100%' }}>
|
||||||
|
<img
|
||||||
|
src={session.originalImageUrl}
|
||||||
|
alt="Original"
|
||||||
|
className="w-full h-auto"
|
||||||
|
draggable={false}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Right: Reconstruction */}
|
||||||
|
<div className="border rounded-lg dark:border-gray-700 overflow-hidden flex flex-col">
|
||||||
|
<div className="px-3 py-1.5 bg-gray-50 dark:bg-gray-800 text-sm font-medium text-gray-600 dark:text-gray-400 border-b dark:border-gray-700 flex items-center justify-between">
|
||||||
|
<span>Rekonstruktion</span>
|
||||||
|
<button
|
||||||
|
onClick={() => setDrawingRegion(!drawingRegion)}
|
||||||
|
className={`text-xs px-2 py-0.5 rounded ${drawingRegion ? 'bg-indigo-600 text-white' : 'bg-gray-200 dark:bg-gray-700 text-gray-600 dark:text-gray-400'}`}
|
||||||
|
>
|
||||||
|
{drawingRegion ? 'Region zeichnen...' : '+ Region'}
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div
|
||||||
|
ref={rightPanelRef}
|
||||||
|
className="flex-1 overflow-auto"
|
||||||
|
onScroll={() => handleScroll('right')}
|
||||||
|
>
|
||||||
|
<div style={{ width: `${zoom}%`, minWidth: '100%' }}>
|
||||||
|
{/* Reconstruction container */}
|
||||||
|
<div
|
||||||
|
className="relative bg-white"
|
||||||
|
style={{
|
||||||
|
paddingBottom: `${aspect * 100}%`,
|
||||||
|
cursor: drawingRegion ? 'crosshair' : 'default',
|
||||||
|
}}
|
||||||
|
onMouseDown={handleReconMouseDown}
|
||||||
|
onMouseMove={handleReconMouseMove}
|
||||||
|
onMouseUp={handleReconMouseUp}
|
||||||
|
>
|
||||||
|
{/* Column background stripes */}
|
||||||
|
{session.columnsUsed.map((col, i) => {
|
||||||
|
const color = COL_TYPE_COLORS[col.type] || '#9ca3af'
|
||||||
|
return (
|
||||||
|
<div
|
||||||
|
key={`col-${i}`}
|
||||||
|
className="absolute top-0 bottom-0"
|
||||||
|
style={{
|
||||||
|
left: `${(col.x / session.imageWidth) * 100}%`,
|
||||||
|
width: `${(col.width / session.imageWidth) * 100}%`,
|
||||||
|
backgroundColor: color,
|
||||||
|
opacity: 0.06,
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
)
|
||||||
|
})}
|
||||||
|
|
||||||
|
{/* Row separator lines — derive from cells */}
|
||||||
|
{(() => {
|
||||||
|
const rowYs = new Set<number>()
|
||||||
|
for (const cell of session.cells) {
|
||||||
|
if (cell.col_index === 0 && cell.bbox_pct) {
|
||||||
|
rowYs.add(cell.bbox_pct.y)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Array.from(rowYs).map((y, i) => (
|
||||||
|
<div
|
||||||
|
key={`row-${i}`}
|
||||||
|
className="absolute left-0 right-0"
|
||||||
|
style={{
|
||||||
|
top: `${y}%`,
|
||||||
|
height: '1px',
|
||||||
|
backgroundColor: 'rgba(0,0,0,0.08)',
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
))
|
||||||
|
})()}
|
||||||
|
|
||||||
|
{/* Cell texts */}
|
||||||
|
{session.cells.map(cell => {
|
||||||
|
if (!cell.bbox_pct || !cell.text) return null
|
||||||
|
const color = COL_TYPE_COLORS[cell.col_type] || '#374151'
|
||||||
|
return (
|
||||||
|
<span
|
||||||
|
key={cell.cell_id}
|
||||||
|
className="absolute text-[0.6em] leading-tight overflow-hidden"
|
||||||
|
style={{
|
||||||
|
left: `${cell.bbox_pct.x}%`,
|
||||||
|
top: `${cell.bbox_pct.y}%`,
|
||||||
|
width: `${cell.bbox_pct.w}%`,
|
||||||
|
height: `${cell.bbox_pct.h}%`,
|
||||||
|
color,
|
||||||
|
fontFamily: "'Liberation Sans', 'DejaVu Sans', sans-serif",
|
||||||
|
display: 'flex',
|
||||||
|
alignItems: 'center',
|
||||||
|
padding: '0 1px',
|
||||||
|
}}
|
||||||
|
title={`${cell.cell_id}: ${cell.text}`}
|
||||||
|
>
|
||||||
|
{cell.text}
|
||||||
|
</span>
|
||||||
|
)
|
||||||
|
})}
|
||||||
|
|
||||||
|
{/* Generated images at region positions */}
|
||||||
|
{imageRegions.map((region, i) => (
|
||||||
|
<div
|
||||||
|
key={`region-${i}`}
|
||||||
|
className="absolute border-2 border-dashed border-indigo-400"
|
||||||
|
style={{
|
||||||
|
left: `${region.bbox_pct.x}%`,
|
||||||
|
top: `${region.bbox_pct.y}%`,
|
||||||
|
width: `${region.bbox_pct.w}%`,
|
||||||
|
height: `${region.bbox_pct.h}%`,
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{region.image_b64 ? (
|
||||||
|
<img src={region.image_b64} alt={region.description} className="w-full h-full object-cover" />
|
||||||
|
) : (
|
||||||
|
<div className="w-full h-full flex items-center justify-center bg-indigo-50/50 text-indigo-400 text-[0.5em]">
|
||||||
|
{region.generating ? '...' : `Bild ${i + 1}`}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
|
||||||
|
{/* Drawing rectangle */}
|
||||||
|
{dragStart && dragEnd && (
|
||||||
|
<div
|
||||||
|
className="absolute border-2 border-dashed border-red-500 bg-red-100/20 pointer-events-none"
|
||||||
|
style={{
|
||||||
|
left: `${Math.min(dragStart.x, dragEnd.x)}%`,
|
||||||
|
top: `${Math.min(dragStart.y, dragEnd.y)}%`,
|
||||||
|
width: `${Math.abs(dragEnd.x - dragStart.x)}%`,
|
||||||
|
height: `${Math.abs(dragEnd.y - dragStart.y)}%`,
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Image regions panel */}
|
||||||
|
{imageRegions.length > 0 && (
|
||||||
|
<div className="border rounded-lg dark:border-gray-700 p-4">
|
||||||
|
<h4 className="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">
|
||||||
|
Bildbereiche ({imageRegions.length} gefunden)
|
||||||
|
</h4>
|
||||||
|
<div className="space-y-3">
|
||||||
|
{imageRegions.map((region, i) => (
|
||||||
|
<div key={i} className="flex items-start gap-3 p-3 bg-gray-50 dark:bg-gray-800 rounded-lg">
|
||||||
|
{/* Preview thumbnail */}
|
||||||
|
<div className="w-16 h-16 flex-shrink-0 border rounded dark:border-gray-600 overflow-hidden bg-white">
|
||||||
|
{region.image_b64 ? (
|
||||||
|
<img src={region.image_b64} alt="" className="w-full h-full object-cover" />
|
||||||
|
) : (
|
||||||
|
<div className="w-full h-full flex items-center justify-center text-gray-400 text-xs">
|
||||||
|
{Math.round(region.bbox_pct.w)}x{Math.round(region.bbox_pct.h)}%
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Prompt + controls */}
|
||||||
|
<div className="flex-1 min-w-0 space-y-2">
|
||||||
|
<div className="flex items-center gap-2">
|
||||||
|
<span className="text-xs text-gray-500 dark:text-gray-400 flex-shrink-0">
|
||||||
|
Bereich {i + 1}:
|
||||||
|
</span>
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
value={region.prompt}
|
||||||
|
onChange={e => {
|
||||||
|
setImageRegions(prev => prev.map((r, j) =>
|
||||||
|
j === i ? { ...r, prompt: e.target.value } : r
|
||||||
|
))
|
||||||
|
}}
|
||||||
|
placeholder="Beschreibung / Prompt..."
|
||||||
|
className="flex-1 text-sm px-2 py-1 border rounded dark:border-gray-600 dark:bg-gray-700 dark:text-white"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
<div className="flex items-center gap-2">
|
||||||
|
<select
|
||||||
|
value={region.style}
|
||||||
|
onChange={e => {
|
||||||
|
setImageRegions(prev => prev.map((r, j) =>
|
||||||
|
j === i ? { ...r, style: e.target.value as ImageStyle } : r
|
||||||
|
))
|
||||||
|
}}
|
||||||
|
className="text-sm px-2 py-1 border rounded dark:border-gray-600 dark:bg-gray-700 dark:text-white"
|
||||||
|
>
|
||||||
|
{STYLES.map(s => (
|
||||||
|
<option key={s.value} value={s.value}>{s.label}</option>
|
||||||
|
))}
|
||||||
|
</select>
|
||||||
|
<button
|
||||||
|
onClick={() => handleGenerateImage(i)}
|
||||||
|
disabled={!!region.generating || !region.prompt}
|
||||||
|
className="px-3 py-1 text-sm bg-teal-600 text-white rounded hover:bg-teal-700 disabled:opacity-50"
|
||||||
|
>
|
||||||
|
{region.generating ? 'Generiere...' : 'Generieren'}
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
onClick={() => handleRemoveRegion(i)}
|
||||||
|
className="px-2 py-1 text-sm text-red-600 hover:bg-red-50 dark:hover:bg-red-900/20 rounded"
|
||||||
|
>
|
||||||
|
Entfernen
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
{region.description && region.description !== region.prompt && (
|
||||||
|
<p className="text-xs text-gray-400">{region.description}</p>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{/* Notes and score */}
|
||||||
|
<div className="border rounded-lg dark:border-gray-700 p-4 space-y-3">
|
||||||
|
<div className="flex items-center gap-4">
|
||||||
|
<label className="text-sm font-medium text-gray-700 dark:text-gray-300">
|
||||||
|
Bewertung (1-10):
|
||||||
|
</label>
|
||||||
|
<input
|
||||||
|
type="number"
|
||||||
|
min={1}
|
||||||
|
max={10}
|
||||||
|
value={score ?? ''}
|
||||||
|
onChange={e => setScore(e.target.value ? parseInt(e.target.value) : null)}
|
||||||
|
className="w-20 text-sm px-2 py-1 border rounded dark:border-gray-600 dark:bg-gray-700 dark:text-white"
|
||||||
|
/>
|
||||||
|
<div className="flex gap-1">
|
||||||
|
{[1, 2, 3, 4, 5, 6, 7, 8, 9, 10].map(v => (
|
||||||
|
<button
|
||||||
|
key={v}
|
||||||
|
onClick={() => setScore(v)}
|
||||||
|
className={`w-7 h-7 text-xs rounded ${score === v ? 'bg-teal-600 text-white' : 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-400 hover:bg-gray-200 dark:hover:bg-gray-600'}`}
|
||||||
|
>
|
||||||
|
{v}
|
||||||
|
</button>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<label className="text-sm font-medium text-gray-700 dark:text-gray-300 block mb-1">
|
||||||
|
Notizen:
|
||||||
|
</label>
|
||||||
|
<textarea
|
||||||
|
value={notes}
|
||||||
|
onChange={e => setNotes(e.target.value)}
|
||||||
|
rows={3}
|
||||||
|
placeholder="Anmerkungen zur Qualitaet der Rekonstruktion..."
|
||||||
|
className="w-full text-sm px-3 py-2 border rounded dark:border-gray-600 dark:bg-gray-700 dark:text-white"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Actions */}
|
||||||
|
<div className="flex items-center justify-between">
|
||||||
|
<div className="text-sm text-gray-500 dark:text-gray-400">
|
||||||
|
{status === 'saved' && <span className="text-green-600 dark:text-green-400">Validierung gespeichert</span>}
|
||||||
|
{status === 'saving' && <span>Speichere...</span>}
|
||||||
|
</div>
|
||||||
|
<div className="flex items-center gap-3">
|
||||||
|
<button
|
||||||
|
onClick={handleSave}
|
||||||
|
disabled={status === 'saving'}
|
||||||
|
className="px-4 py-2 text-sm bg-gray-600 text-white rounded hover:bg-gray-700 disabled:opacity-50"
|
||||||
|
>
|
||||||
|
Speichern
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
onClick={async () => {
|
||||||
|
await handleSave()
|
||||||
|
onNext()
|
||||||
|
}}
|
||||||
|
disabled={status === 'saving'}
|
||||||
|
className="px-4 py-2 text-sm bg-teal-600 text-white rounded hover:bg-teal-700 disabled:opacity-50"
|
||||||
|
>
|
||||||
|
Abschliessen
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
# OCR Pipeline - Schrittweise Seitenrekonstruktion
|
# OCR Pipeline - Schrittweise Seitenrekonstruktion
|
||||||
|
|
||||||
**Version:** 2.0.0
|
**Version:** 3.0.0
|
||||||
**Status:** Produktiv (Schritte 1–8 implementiert)
|
**Status:** Produktiv (Schritte 1–8 implementiert)
|
||||||
**URL:** https://macmini:3002/ai/ocr-pipeline
|
**URL:** https://macmini:3002/ai/ocr-pipeline
|
||||||
|
|
||||||
## Uebersicht
|
## Uebersicht
|
||||||
|
|
||||||
Die OCR Pipeline zerlegt den OCR-Prozess in **8 einzelne Schritte**, um eingescannte Vokabelseiten
|
Die OCR Pipeline zerlegt den OCR-Prozess in **8 einzelne Schritte**, um eingescannte Seiten
|
||||||
aus mehrspaltig gedruckten Schulbuechern Wort fuer Wort zu rekonstruieren.
|
aus mehrspaltig gedruckten Schulbuechern Wort fuer Wort zu rekonstruieren.
|
||||||
Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten versehen werden.
|
Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten versehen werden.
|
||||||
|
|
||||||
@@ -20,13 +20,94 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v
|
|||||||
| 2 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert |
|
| 2 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert |
|
||||||
| 3 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert |
|
| 3 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert |
|
||||||
| 4 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert |
|
| 4 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert |
|
||||||
| 5 | Worterkennung | Grid aus Spalten x Zeilen, OCR pro Zelle, Post-Processing | Implementiert |
|
| 5 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert |
|
||||||
| 6 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert |
|
| 6 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert |
|
||||||
| 7 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund | Implementiert |
|
| 7 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert |
|
||||||
| 8 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert |
|
| 8 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Dokumenttyp-Erkennung und Pipeline-Pfade
|
||||||
|
|
||||||
|
### Automatische Weiche: `detect_document_type()`
|
||||||
|
|
||||||
|
Nicht jedes Dokument durchlaeuft denselben Pfad. Nach den gemeinsamen Vorverarbeitungsschritten
|
||||||
|
(Deskew, Dewarp, Binarisierung) analysiert `detect_document_type()` die Seitenstruktur
|
||||||
|
**ohne OCR** — rein ueber Projektionsprofile und Textdichte-Analyse (< 2 Sekunden).
|
||||||
|
|
||||||
|
```
|
||||||
|
detect_document_type(ocr_img, img_bgr) → DocumentTypeResult
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Entscheidungslogik
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
A[Bild-Input] --> B[Vertikales Projektionsprofil]
|
||||||
|
B --> C{Interne Spalten-Gaps >= 2?}
|
||||||
|
C -->|Ja| D{Zeilen-Gaps >= 5?}
|
||||||
|
D -->|Ja| E["vocab_table<br/>pipeline = cell_first<br/>confidence 0.7–0.95"]
|
||||||
|
D -->|Nein| F{Zeilen-Gaps >= 3?}
|
||||||
|
C -->|Nein| G{Interne Spalten-Gaps >= 1?}
|
||||||
|
G -->|Ja| F
|
||||||
|
G -->|Nein| H["full_text<br/>pipeline = full_page<br/>skip: columns, rows"]
|
||||||
|
F -->|Ja| I["generic_table<br/>pipeline = cell_first<br/>confidence 0.5–0.85"]
|
||||||
|
F -->|Nein| H
|
||||||
|
```
|
||||||
|
|
||||||
|
| Dokumenttyp | Spalten-Gaps | Zeilen-Gaps | Pipeline | Beispiel |
|
||||||
|
|-------------|-------------|-------------|----------|----------|
|
||||||
|
| `vocab_table` | ≥ 2 | ≥ 5 | `cell_first` | 3-spaltige Schulbuch-Vokabeltabelle |
|
||||||
|
| `generic_table` | ≥ 1 | ≥ 3 | `cell_first` | 2-spaltiges Glossar |
|
||||||
|
| `full_text` | 0 (oder ≥ 1 bei < 3 Zeilen-Gaps) | egal (bzw. < 3) | `full_page` | Fliesstext, Aufsatz, Buchseite |
|
||||||
|
|
||||||
|
### Komplett-Flussdiagramm
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ GEMEINSAME VORVERARBEITUNG (alle Dokumente) │
|
||||||
|
│ │
|
||||||
|
│ Stage 1: Render (432 DPI, 3× Zoom) │
|
||||||
|
│ Stage 2: Deskew (Hough Lines + Ensemble) │
|
||||||
|
│ Stage 3: Dewarp (Vertikalkanten-Drift, Ensemble Shear) │
|
||||||
|
│ Stage 4: Dual-Bild (ocr_img = binarisiert, layout_img = CLAHE) │
|
||||||
|
└─────────────────────────────────────┬───────────────────────────────┘
|
||||||
|
│
|
||||||
|
detect_document_type()
|
||||||
|
│
|
||||||
|
┌─────────────────┴──────────────────┐
|
||||||
|
▼ ▼
|
||||||
|
FULL-TEXT PFAD CELL-FIRST PFAD
|
||||||
|
(pipeline='full_page') (pipeline='cell_first')
|
||||||
|
│ │
|
||||||
|
Keine Spalten/Zeilen Spaltenerkennung
|
||||||
|
analyze_layout_by_words() detect_column_geometry()
|
||||||
|
Lese-Reihenfolge _detect_sub_columns()
|
||||||
|
│ expand_narrow_columns()
|
||||||
|
│ Zeilenerkennung
|
||||||
|
│ detect_row_geometry()
|
||||||
|
│ │
|
||||||
|
│ build_cell_grid_v2()
|
||||||
|
│ │
|
||||||
|
│ ┌─────────┴──────────┐
|
||||||
|
│ ▼ ▼
|
||||||
|
│ Breite Spalten Schmale Spalten
|
||||||
|
│ (>= 15% Breite) (< 15% Breite)
|
||||||
|
│ Full-Page Words Cell-Crop OCR
|
||||||
|
│ word_lookup cell_crop_v2
|
||||||
|
│ │ │
|
||||||
|
└───────────────────────────┴────────────────────┘
|
||||||
|
│
|
||||||
|
Post-Processing Pipeline
|
||||||
|
(Lautschrift, Komma-Split, etc.)
|
||||||
|
│
|
||||||
|
Schritt 6: Korrektur (Spell)
|
||||||
|
Schritt 7: Rekonstruktion
|
||||||
|
Schritt 8: Validierung
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Architektur
|
## Architektur
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -55,9 +136,11 @@ Admin-Lehrer (Next.js) klausur-service (FastAPI :8086)
|
|||||||
|
|
||||||
```
|
```
|
||||||
klausur-service/backend/
|
klausur-service/backend/
|
||||||
|
├── services/
|
||||||
|
│ └── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen
|
||||||
├── ocr_pipeline_api.py # FastAPI Router (alle Endpoints)
|
├── ocr_pipeline_api.py # FastAPI Router (alle Endpoints)
|
||||||
├── ocr_pipeline_session_store.py # PostgreSQL Persistence
|
├── ocr_pipeline_session_store.py # PostgreSQL Persistence
|
||||||
├── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen
|
├── layout_reconstruction_service.py # Fabric.js JSON + PDF/DOCX Export
|
||||||
└── migrations/
|
└── migrations/
|
||||||
├── 002_ocr_pipeline_sessions.sql # Basis-Schema
|
├── 002_ocr_pipeline_sessions.sql # Basis-Schema
|
||||||
├── 003_add_row_result.sql # Row-Result Spalte
|
├── 003_add_row_result.sql # Row-Result Spalte
|
||||||
@@ -76,6 +159,7 @@ admin-lehrer/
|
|||||||
├── StepWordRecognition.tsx # Schritt 5: Worterkennung
|
├── StepWordRecognition.tsx # Schritt 5: Worterkennung
|
||||||
├── StepLlmReview.tsx # Schritt 6: Korrektur (SSE-Stream)
|
├── StepLlmReview.tsx # Schritt 6: Korrektur (SSE-Stream)
|
||||||
├── StepReconstruction.tsx # Schritt 7: Rekonstruktion (Canvas)
|
├── StepReconstruction.tsx # Schritt 7: Rekonstruktion (Canvas)
|
||||||
|
├── FabricReconstructionCanvas.tsx # Fabric.js Editor
|
||||||
└── StepGroundTruth.tsx # Schritt 8: Validierung
|
└── StepGroundTruth.tsx # Schritt 8: Validierung
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -94,6 +178,7 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
|
|||||||
| `GET` | `/sessions/{id}` | Session-Info mit allen Step-Results |
|
| `GET` | `/sessions/{id}` | Session-Info mit allen Step-Results |
|
||||||
| `PUT` | `/sessions/{id}` | Session umbenennen |
|
| `PUT` | `/sessions/{id}` | Session umbenennen |
|
||||||
| `DELETE` | `/sessions/{id}` | Session loeschen |
|
| `DELETE` | `/sessions/{id}` | Session loeschen |
|
||||||
|
| `POST` | `/sessions/{id}/detect-type` | Dokumenttyp erkennen |
|
||||||
|
|
||||||
### Bilder
|
### Bilder
|
||||||
|
|
||||||
@@ -160,6 +245,34 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
|
|||||||
| Methode | Pfad | Beschreibung |
|
| Methode | Pfad | Beschreibung |
|
||||||
|---------|------|--------------|
|
|---------|------|--------------|
|
||||||
| `POST` | `/sessions/{id}/reconstruction` | Zellaenderungen speichern |
|
| `POST` | `/sessions/{id}/reconstruction` | Zellaenderungen speichern |
|
||||||
|
| `GET` | `/sessions/{id}/reconstruction/fabric-json` | Fabric.js Canvas-Daten |
|
||||||
|
| `GET` | `/sessions/{id}/reconstruction/export/pdf` | PDF-Export (reportlab) |
|
||||||
|
| `GET` | `/sessions/{id}/reconstruction/export/docx` | DOCX-Export (python-docx) |
|
||||||
|
| `POST` | `/sessions/{id}/reconstruction/detect-images` | Bildbereiche per VLM erkennen |
|
||||||
|
| `POST` | `/sessions/{id}/reconstruction/generate-image` | Bild per mflux generieren |
|
||||||
|
| `POST` | `/sessions/{id}/reconstruction/validate` | Validierung speichern (Step 8) |
|
||||||
|
| `GET` | `/sessions/{id}/reconstruction/validation` | Validierungsdaten abrufen |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Schritt 2: Entzerrung/Dewarp (Detail)
|
||||||
|
|
||||||
|
### Algorithmus: Vertikalkanten-Drift
|
||||||
|
|
||||||
|
Die Dewarp-Erkennung misst die **vertikale Spaltenkippung** (dx/dy) statt Textzeilen-Neigung:
|
||||||
|
|
||||||
|
1. Woerter werden nach X-Position in vertikale Spaltencluster gruppiert
|
||||||
|
2. Pro Cluster: Lineare Regression `x = a*y + b` → `a = dx/dy = tan(shear_angle)`
|
||||||
|
3. Ensemble aus drei Methoden: Textzeilen (1.5× Gewicht), Projektionsprofil (2-Pass), Vertikalkanten
|
||||||
|
4. Qualitaetspruefung: Horizontale Projektionsvarianz vor/nach Korrektur
|
||||||
|
|
||||||
|
**Schwellenwerte:**
|
||||||
|
|
||||||
|
| Parameter | Wert | Beschreibung |
|
||||||
|
|-----------|------|--------------|
|
||||||
|
| Min. Korrekturwinkel | 0.08° | Unter 0.08° wird nicht korrigiert |
|
||||||
|
| Ensemble Min-Confidence | 0.35 | Mindest-Konfidenz fuer Korrektur |
|
||||||
|
| Quality-Gate Skip | < 0.5° | Kleine Korrekturen ueberspringen Quality-Gate |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -180,6 +293,38 @@ Bild → Binarisierung → Vertikalprofil → Lueckenerkennung → Wort-Validier
|
|||||||
- **Phantom-Spalten-Filter (Step 9):** Spalten mit Breite < 3 % der Content-Breite UND < 3 Woerter werden als Artefakte entfernt; die angrenzenden Spalten schliessen die Luecke.
|
- **Phantom-Spalten-Filter (Step 9):** Spalten mit Breite < 3 % der Content-Breite UND < 3 Woerter werden als Artefakte entfernt; die angrenzenden Spalten schliessen die Luecke.
|
||||||
- **Spaltenzuweisung:** Woerter werden anhand des groessten horizontalen Ueberlappungsbereichs einer Spalte zugeordnet.
|
- **Spaltenzuweisung:** Woerter werden anhand des groessten horizontalen Ueberlappungsbereichs einer Spalte zugeordnet.
|
||||||
|
|
||||||
|
### Sub-Spalten-Erkennung: `_detect_sub_columns()`
|
||||||
|
|
||||||
|
Erkennt versteckte Sub-Spalten innerhalb breiter Spalten (z.B. Seitenzahl-Spalte links neben EN-Vokabeln).
|
||||||
|
|
||||||
|
**Algorithmus (Left-Edge Alignment Clustering):**
|
||||||
|
|
||||||
|
1. Fuer jede Spalte mit `width_ratio >= 0.15` und `word_count >= 5`:
|
||||||
|
2. Left-Edges aller Woerter mit `conf >= 30` sammeln
|
||||||
|
3. In Alignment-Bins clustern (8px Toleranz)
|
||||||
|
4. Linkester Bin mit >= 10% der Woerter = wahrer Spaltenanfang
|
||||||
|
5. Woerter links davon = Sub-Spalte, wenn >= 2 und < 35% Anteil
|
||||||
|
6. Neue ColumnGeometry-Objekte mit korrekten Indizes erzeugen
|
||||||
|
|
||||||
|
**Koordinatensystem:** Word `left`-Werte sind relativ zum Content-ROI (`left_x`), `ColumnGeometry.x` ist absolut. `left_x` wird als Parameter durchgereicht.
|
||||||
|
|
||||||
|
### Spalten-Erweiterung: `expand_narrow_columns()`
|
||||||
|
|
||||||
|
Laeuft **nach** `_detect_sub_columns()`. Erweitert sehr schmale Spalten (< 10% Content-Breite,
|
||||||
|
z.B. `page_ref`, `marker`) in den Weissraum zur Nachbar-Spalte hinein, aber nie ueber die
|
||||||
|
naechsten Woerter im Nachbarn hinaus (4px Sicherheitsabstand).
|
||||||
|
|
||||||
|
### Spaltentyp-Klassifikation: `classify_column_types()`
|
||||||
|
|
||||||
|
| Spaltentyp | Beschreibung | Erkennung |
|
||||||
|
|------------|--------------|-----------|
|
||||||
|
| `column_en` | Englische Vokabeln | EN-Funktionswoerter (the, a, is...) |
|
||||||
|
| `column_de` | Deutsche Uebersetzung | DE-Funktionswoerter (der, die, das...) |
|
||||||
|
| `column_example` | Beispielsaetze | Abkuerzungen, Grammatik-Marker |
|
||||||
|
| `page_ref` | Seitenzahlen | Schmal (< 20% Breite), wenige Woerter |
|
||||||
|
| `column_marker` | Dekorative Markierungen | Sehr schmal, spezielle Zeichen |
|
||||||
|
| `column_text` | Generischer Text | Fallback |
|
||||||
|
|
||||||
### Konfigurierbare Parameter
|
### Konfigurierbare Parameter
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -219,29 +364,95 @@ def _heal_row_gaps(rows, top_bound, bottom_bound):
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Schritt 5: Worterkennung (Detail)
|
## Schritt 5: Worterkennung — Hybrid-Grid (Detail)
|
||||||
|
|
||||||
### Algorithmus: `build_cell_grid()`
|
### Algorithmus: `build_cell_grid_v2()`
|
||||||
|
|
||||||
Schritt 5 nutzt die Ergebnisse von Schritt 3 (Spalten) und Schritt 4 (Zeilen), um ein Grid
|
Schritt 5 nutzt eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter,
|
||||||
zu erstellen und jede Zelle per OCR auszulesen.
|
schmale Spalten werden isoliert per Cell-Crop OCR verarbeitet.
|
||||||
|
|
||||||
```
|
!!! success "Warum Hybrid?"
|
||||||
Spalten (Step 3): column_en | column_de | column_example
|
Full-Page OCR liefert gute Ergebnisse fuer breite Spalten (Saetze, IPA-Klammern, Interpunktion).
|
||||||
───────────┼─────────────┼────────────────
|
Aber bei schmalen Spalten (Seitenzahlen, Marker) „bluten“ Woerter aus Nachbar-Spalten ein.
|
||||||
Zeilen (Step 4): R0 │ hello │ hallo │ Hello, World!
|
Cell-Crop isoliert jede Zelle und verhindert dieses Bleeding.
|
||||||
R1 │ world │ Welt │ The whole world
|
|
||||||
R2 │ book │ Buch │ Read a book
|
### Broad vs. Narrow — Die 15%-Schwelle
|
||||||
───────────┼─────────────┼────────────────
|
|
||||||
|
```python
|
||||||
|
_NARROW_COL_THRESHOLD_PCT = 15.0 # cv_vocab_pipeline.py
|
||||||
```
|
```
|
||||||
|
|
||||||
**Ablauf:**
|
| Eigenschaft | Breite Spalten (>= 15%) | Schmale Spalten (< 15%) |
|
||||||
|
|-------------|------------------------|------------------------|
|
||||||
|
| **OCR-Quelle** | Full-Page Tesseract (vorher gelaufen) | Isolierter Cell-Crop |
|
||||||
|
| **Wort-Zuweisung** | `_assign_row_words_to_columns()` | Direktes Zell-OCR |
|
||||||
|
| **Confidence-Filter** | `conf >= 30` | `conf >= 30` |
|
||||||
|
| **Text-Bereinigung** | `_clean_cell_text()` (mittel) | `_clean_cell_text_lite()` (aggressiv) |
|
||||||
|
| **Neighbour-Bleeding** | Risiko vorhanden | Verhindert (isoliert) |
|
||||||
|
| **Parallelisierung** | Sequentiell | Parallel (`max_workers=4`) |
|
||||||
|
| **OCR-Engine Label** | `word_lookup` | `cell_crop_v2` |
|
||||||
|
| **Typische Spalten** | EN-Vokabeln, DE-Uebersetzung, Beispielsaetze | Seitenzahlen, Marker |
|
||||||
|
|
||||||
1. **Initialer Scan:** Ganzes Bild einmal per Tesseract/RapidOCR → alle Wort-Bboxes
|
**Empirische Grundlage:** Typische breite Spalten liegen bei 20–40% Bildbreite,
|
||||||
2. **Zuweisung:** Jedes Wort der Spalte mit groesstem horizontalem Ueberlapp zuordnen
|
typische schmale bei 3–12%. Die 15%-Grenze trennt diese Gruppen sauber.
|
||||||
3. **Zell-OCR Fallback:** Leere Zellen bekommen eigenen Crop + erneuten OCR-Aufruf (PSM 6/7)
|
|
||||||
4. **Batch-Spalten-OCR:** Bei vielen leeren Zellen in einer Spalte: gesamte Spalte einmal OCR-en
|
!!! note "Offener Punkt: Schwellen-Validierung"
|
||||||
5. **Post-Processing:** Continuation-Rows zusammenfuehren, Lautschrift erkennen, Komma-Eintraege splitten
|
Die 15%-Schwelle wurde an Vokabeltabellen mit 3–5 Spalten validiert.
|
||||||
|
Fuer eine breitere Validierung werden diverse Schulbuchseiten mit unterschiedlichen
|
||||||
|
Layouts (2-, 3-, 4-, 5-spaltig, verschiedene Verlage) benoetigt. Aktuell gibt es
|
||||||
|
in der Datenbank nur Sessions mit demselben Arbeitsblatt-Typ.
|
||||||
|
|
||||||
|
### Cell-Crop OCR: `_ocr_cell_crop()`
|
||||||
|
|
||||||
|
Isolierte OCR einer einzelnen Zelle (Spalte × Zeile Schnittflaeche):
|
||||||
|
|
||||||
|
1. **Crop:** Exakte Spalten- × Zeilengrenzen mit 3px internem Padding
|
||||||
|
2. **Density-Check:** Ueberspringe leere Zellen (`dark_ratio < 0.005`)
|
||||||
|
3. **Upscaling:** Kleine Crops (Hoehe < 80px) werden 3× vergroessert
|
||||||
|
4. **OCR:** Engine-spezifisch (Tesseract, TrOCR, RapidOCR, LightON)
|
||||||
|
5. **Fallback:** Bei leerem Ergebnis → PSM 7 (Einzelzeile) statt PSM 6
|
||||||
|
6. **Bereinigung:** `_clean_cell_text_lite()` (aggressives Noise-Filtering)
|
||||||
|
|
||||||
|
### Ablauf von `build_cell_grid_v2()`
|
||||||
|
|
||||||
|
```
|
||||||
|
Eingabe: ocr_img, column_regions, row_geometries
|
||||||
|
│
|
||||||
|
┌───────────┴───────────┐
|
||||||
|
│ Filter │
|
||||||
|
│ • Phantom-Zeilen │
|
||||||
|
│ • Artefakt-Zeilen │
|
||||||
|
│ • Irrelevante Spalten │
|
||||||
|
│ (header, footer, │
|
||||||
|
│ margin, ignore) │
|
||||||
|
└───────────┬───────────┘
|
||||||
|
│
|
||||||
|
┌───────────┴───────────┐
|
||||||
|
│ Klassifizierung │
|
||||||
|
│ Spalte.width / img_w │
|
||||||
|
│ >= 15% → broad │
|
||||||
|
│ < 15% → narrow │
|
||||||
|
└───────────┬───────────┘
|
||||||
|
│
|
||||||
|
┌───────────┴────────────────┐
|
||||||
|
│ │
|
||||||
|
Phase 1: Broad Phase 2: Narrow
|
||||||
|
(sequentiell) (parallel, max_workers=4)
|
||||||
|
│ │
|
||||||
|
Pro (row, col): Pro (row, col):
|
||||||
|
1. Words aus Full-Page 1. _ocr_cell_crop()
|
||||||
|
2. Filter conf >= 30 2. Isoliertes Zell-Bild
|
||||||
|
3. _words_to_reading_order 3. Upscale wenn noetig
|
||||||
|
4. _clean_cell_text() 4. _clean_cell_text_lite()
|
||||||
|
│ │
|
||||||
|
└───────────┬────────────────┘
|
||||||
|
│
|
||||||
|
Merge + Sortierung
|
||||||
|
(row_index, col_index)
|
||||||
|
Leere Zeilen entfernen
|
||||||
|
│
|
||||||
|
Ausgabe: cells[], columns_meta[]
|
||||||
|
```
|
||||||
|
|
||||||
### Post-Processing Pipeline (in `build_vocab_pipeline_streaming`)
|
### Post-Processing Pipeline (in `build_vocab_pipeline_streaming`)
|
||||||
|
|
||||||
@@ -264,7 +475,7 @@ Zeilen (Step 4): R0 │ hello │ hallo │ Hello, World!
|
|||||||
|
|
||||||
### Korrektur-Engine
|
### Korrektur-Engine
|
||||||
|
|
||||||
Schritt 6 kombiniert zwei Korrektur-Stufen, beide als SSE-Stream:
|
Schritt 6 kombiniert drei Korrektur-Stufen, alle als SSE-Stream:
|
||||||
|
|
||||||
**Stufe 1 — Zeichenverwirrungskorrektur** (`_fix_character_confusion`):
|
**Stufe 1 — Zeichenverwirrungskorrektur** (`_fix_character_confusion`):
|
||||||
|
|
||||||
@@ -288,8 +499,9 @@ _SPELL_SUBS = {
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Logik: Kandidaten werden durch Woerterbuch-Lookup validiert. Strukturregel: Verdaechtiges
|
**Stufe 3 — Seitenzahl-Korrektur** (`page_ref`-Felder):
|
||||||
Zeichen an Position 0 + Rest klein → erstes Substitut (z.B. `8en` → `Ben`).
|
|
||||||
|
Korrigiert haeufige OCR-Fehler in Seitenverweisen (z.B. `p.5g` → `p.59`).
|
||||||
|
|
||||||
### Umgebungsvariablen
|
### Umgebungsvariablen
|
||||||
|
|
||||||
@@ -318,7 +530,11 @@ Change-Format:
|
|||||||
|
|
||||||
## Schritt 7: Rekonstruktion (Detail)
|
## Schritt 7: Rekonstruktion (Detail)
|
||||||
|
|
||||||
Interaktiver Canvas-Editor: Das entzerrte Originalbild wird mit 30 % Opazitaet als Hintergrund
|
Zwei Modi verfuegbar:
|
||||||
|
|
||||||
|
### Einfacher Modus
|
||||||
|
|
||||||
|
Das entzerrte Originalbild wird mit 30 % Opazitaet als Hintergrund
|
||||||
angezeigt, alle Grid-Zellen (auch leere!) werden als editierbare Textfelder darueber gelegt.
|
angezeigt, alle Grid-Zellen (auch leere!) werden als editierbare Textfelder darueber gelegt.
|
||||||
|
|
||||||
**Features:**
|
**Features:**
|
||||||
@@ -331,6 +547,14 @@ angezeigt, alle Grid-Zellen (auch leere!) werden als editierbare Textfelder daru
|
|||||||
- Zoom 50–200 %
|
- Zoom 50–200 %
|
||||||
- Per-Zell-Reset-Button bei geaenderten Zellen
|
- Per-Zell-Reset-Button bei geaenderten Zellen
|
||||||
|
|
||||||
|
### Fabric.js Editor
|
||||||
|
|
||||||
|
Erweiterter Canvas-Editor (`FabricReconstructionCanvas.tsx`):
|
||||||
|
|
||||||
|
- Drag & Drop fuer Zellen
|
||||||
|
- Freie Positionierung auf dem Canvas
|
||||||
|
- Export als PDF (reportlab) oder DOCX (python-docx)
|
||||||
|
|
||||||
```
|
```
|
||||||
POST /sessions/{id}/reconstruction
|
POST /sessions/{id}/reconstruction
|
||||||
Body: {"cells": [{"cell_id": "r5_c2", "text": "corrected text"}]}
|
Body: {"cells": [{"cell_id": "r5_c2", "text": "corrected text"}]}
|
||||||
@@ -338,6 +562,19 @@ Body: {"cells": [{"cell_id": "r5_c2", "text": "corrected text"}]}
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Wichtige Konstanten
|
||||||
|
|
||||||
|
| Konstante | Wert | Datei | Beschreibung |
|
||||||
|
|-----------|------|-------|--------------|
|
||||||
|
| `_NARROW_COL_THRESHOLD_PCT` | 15.0% | cv_vocab_pipeline.py | Schwelle breit/schmal fuer Hybrid-OCR |
|
||||||
|
| `_NARROW_THRESHOLD_PCT` | 10.0% | cv_vocab_pipeline.py | Schwelle fuer Spalten-Erweiterung |
|
||||||
|
| `_MIN_WORD_CONF` | 30 | cv_vocab_pipeline.py | Mindest-Confidence fuer OCR-Woerter |
|
||||||
|
| `_PAD` | 3px | cv_vocab_pipeline.py | Internes Padding bei Cell-Crop |
|
||||||
|
| `PDF_ZOOM` | 3.0 | cv_vocab_pipeline.py | PDF-Rendering (= 432 DPI) |
|
||||||
|
| `_MIN_WORD_MARGIN` | 4px | cv_vocab_pipeline.py | Sicherheitsabstand bei Spalten-Erweiterung |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Datenbank-Schema
|
## Datenbank-Schema
|
||||||
|
|
||||||
```sql
|
```sql
|
||||||
@@ -348,6 +585,10 @@ CREATE TABLE ocr_pipeline_sessions (
|
|||||||
status VARCHAR(50) DEFAULT 'active',
|
status VARCHAR(50) DEFAULT 'active',
|
||||||
current_step INT DEFAULT 1,
|
current_step INT DEFAULT 1,
|
||||||
|
|
||||||
|
-- Dokumenttyp-Erkennung
|
||||||
|
doc_type VARCHAR(50), -- 'vocab_table', 'generic_table', 'full_text'
|
||||||
|
doc_type_result JSONB, -- Vollstaendiges DetectionResult
|
||||||
|
|
||||||
-- Bilder (BYTEA)
|
-- Bilder (BYTEA)
|
||||||
original_png BYTEA,
|
original_png BYTEA,
|
||||||
deskewed_png BYTEA,
|
deskewed_png BYTEA,
|
||||||
@@ -374,7 +615,7 @@ CREATE TABLE ocr_pipeline_sessions (
|
|||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"vocab_entries": [...],
|
"vocab_entries": [...],
|
||||||
"cells": [{"cell_id": "r0_c0", "text": "hello", "bbox_pct": {...}, ...}],
|
"cells": [{"cell_id": "r0_c0", "text": "hello", "bbox_pct": {...}, "ocr_engine": "word_lookup", ...}],
|
||||||
"columns_used": [...],
|
"columns_used": [...],
|
||||||
"llm_review": {
|
"llm_review": {
|
||||||
"changes": [{"row_index": 5, "field": "english", "old": "...", "new": "..."}],
|
"changes": [{"row_index": 5, "field": "english", "old": "...", "new": "..."}],
|
||||||
@@ -399,10 +640,13 @@ CREATE TABLE ocr_pipeline_sessions (
|
|||||||
| `onnxruntime` | latest | MIT | ONNX-Inferenz fuer RapidOCR |
|
| `onnxruntime` | latest | MIT | ONNX-Inferenz fuer RapidOCR |
|
||||||
| `pyspellchecker` | ≥0.8.1 | MIT | Regel-basierte OCR-Korrektur (Schritt 6) |
|
| `pyspellchecker` | ≥0.8.1 | MIT | Regel-basierte OCR-Korrektur (Schritt 6) |
|
||||||
| `eng-to-ipa` | latest | MIT | IPA-Lautschrift-Lookup (Schritt 5) |
|
| `eng-to-ipa` | latest | MIT | IPA-Lautschrift-Lookup (Schritt 5) |
|
||||||
|
| `reportlab` | latest | BSD | PDF-Export (Schritt 7) |
|
||||||
|
| `python-docx` | ≥1.1.0 | MIT | DOCX-Export (Schritt 7) |
|
||||||
|
| `fabric` (JS) | ^6 | MIT | Canvas-Editor (Frontend) |
|
||||||
|
|
||||||
!!! info "pyspellchecker (neu seit 2026-03)"
|
!!! info "pyspellchecker (neu seit 2026-03)"
|
||||||
`pyspellchecker` (MIT-Lizenz) ersetzt die LLM-basierte Korrektur als Standard-Engine.
|
`pyspellchecker` (MIT-Lizenz) ersetzt die LLM-basierte Korrektur als Standard-Engine.
|
||||||
EN+DE-Woerterbuch, ~134k Woerter. Kein Ollama notig.
|
EN+DE-Woerterbuch, ~134k Woerter. Kein Ollama noetig.
|
||||||
Umschaltbar via `REVIEW_ENGINE=llm` fuer den LLM-Pfad.
|
Umschaltbar via `REVIEW_ENGINE=llm` fuer den LLM-Pfad.
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -413,8 +657,10 @@ CREATE TABLE ocr_pipeline_sessions (
|
|||||||
|---------|---------|------------|
|
|---------|---------|------------|
|
||||||
| Schraeg gedruckte Seiten | Deskew erkennt Text-Rotation, nicht Seiten-Rotation | Manueller Winkel |
|
| Schraeg gedruckte Seiten | Deskew erkennt Text-Rotation, nicht Seiten-Rotation | Manueller Winkel |
|
||||||
| Sehr kleine Schrift (< 8pt) | Tesseract PSM 7 braucht min. Zeichengroesse | Vorher zoomen |
|
| Sehr kleine Schrift (< 8pt) | Tesseract PSM 7 braucht min. Zeichengroesse | Vorher zoomen |
|
||||||
| Handgeschriebene Eintraege | Tesseract/RapidOCR sind fuer Druckschrift optimiert | TrOCR-Engine (geplant) |
|
| Handgeschriebene Eintraege | Tesseract/RapidOCR sind fuer Druckschrift optimiert | TrOCR-Engine |
|
||||||
| Mehr als 4 Spalten | Projektionsprofil kann verschmelzen | Manuelle Spalten |
|
| Mehr als 4 Spalten | Projektionsprofil kann verschmelzen | Manuelle Spalten |
|
||||||
|
| Farbige Marker (rot/blau) | HSV-Erkennung erzeugt False Positives | Manuell im Rekonstruktions-Editor |
|
||||||
|
| 15%-Schwelle nicht breit validiert | Nur an einem Arbeitsblatt-Typ getestet | Diverse Schulbuchseiten testen |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -425,17 +671,15 @@ CREATE TABLE ocr_pipeline_sessions (
|
|||||||
git push origin main
|
git push origin main
|
||||||
|
|
||||||
# 2. Mac Mini pull + build
|
# 2. Mac Mini pull + build
|
||||||
ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && git pull --no-rebase origin main"
|
ssh macmini "git -C /Users/benjaminadmin/Projekte/breakpilot-lehrer pull --no-rebase origin main"
|
||||||
|
|
||||||
# klausur-service (Backend) — bei requirements.txt Aenderungen: klausur-base neu bauen
|
# klausur-service (Backend)
|
||||||
ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && \
|
ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/breakpilot-lehrer/docker-compose.yml build klausur-service"
|
||||||
/usr/local/bin/docker compose build klausur-service && \
|
ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/breakpilot-lehrer/docker-compose.yml up -d klausur-service"
|
||||||
/usr/local/bin/docker compose up -d klausur-service"
|
|
||||||
|
|
||||||
# admin-lehrer (Frontend)
|
# admin-lehrer (Frontend)
|
||||||
ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && \
|
ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/breakpilot-lehrer/docker-compose.yml build admin-lehrer"
|
||||||
/usr/local/bin/docker compose build admin-lehrer && \
|
ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/breakpilot-lehrer/docker-compose.yml up -d admin-lehrer"
|
||||||
/usr/local/bin/docker compose up -d admin-lehrer"
|
|
||||||
|
|
||||||
# 3. Testen unter:
|
# 3. Testen unter:
|
||||||
# https://macmini:3002/ai/ocr-pipeline
|
# https://macmini:3002/ai/ocr-pipeline
|
||||||
@@ -445,9 +689,8 @@ ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && \
|
|||||||
Wenn `requirements.txt` geaendert wird (z.B. neues Paket hinzugefuegt), muss zuerst
|
Wenn `requirements.txt` geaendert wird (z.B. neues Paket hinzugefuegt), muss zuerst
|
||||||
das Base-Image neu gebaut werden:
|
das Base-Image neu gebaut werden:
|
||||||
```bash
|
```bash
|
||||||
ssh macmini "cd ~/Projekte/breakpilot-lehrer && \
|
ssh macmini "/usr/local/bin/docker build -f /Users/benjaminadmin/Projekte/breakpilot-lehrer/klausur-service/Dockerfile.base \
|
||||||
/usr/local/bin/docker build -f klausur-service/Dockerfile.base \
|
-t klausur-base:latest /Users/benjaminadmin/Projekte/breakpilot-lehrer/klausur-service/"
|
||||||
-t klausur-base:latest klausur-service/"
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -456,6 +699,9 @@ ssh macmini "cd /Users/benjaminadmin/Projekte/breakpilot-lehrer && \
|
|||||||
|
|
||||||
| Datum | Version | Aenderung |
|
| Datum | Version | Aenderung |
|
||||||
|-------|---------|----------|
|
|-------|---------|----------|
|
||||||
|
| 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade |
|
||||||
|
| 2026-03-04 | 2.2.0 | Dewarp: Vertikalkanten-Drift statt Textzeilen-Neigung, Schwellenwerte gesenkt |
|
||||||
|
| 2026-03-04 | 2.1.0 | Sub-Column-Detection, expand_narrow_columns, Fabric.js Editor, PDF/DOCX-Export |
|
||||||
| 2026-03-03 | 2.0.0 | Schritte 6–7 implementiert; Spell-Checker, Rekonstruktions-Canvas |
|
| 2026-03-03 | 2.0.0 | Schritte 6–7 implementiert; Spell-Checker, Rekonstruktions-Canvas |
|
||||||
| 2026-03-03 | 1.5.0 | Spaltenerkennung: volle Bildbreite fuer initialen Scan, Phantom-Filter |
|
| 2026-03-03 | 1.5.0 | Spaltenerkennung: volle Bildbreite fuer initialen Scan, Phantom-Filter |
|
||||||
| 2026-03-03 | 1.4.0 | Zeilenerkennung: Artefakt-Zeilen entfernen + Luecken-Heilung |
|
| 2026-03-03 | 1.4.0 | Zeilenerkennung: Artefakt-Zeilen entfernen + Luecken-Heilung |
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
tesseract-ocr-eng \
|
tesseract-ocr-eng \
|
||||||
libgl1 \
|
libgl1 \
|
||||||
libglib2.0-0 \
|
libglib2.0-0 \
|
||||||
|
fonts-liberation \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Python dependencies
|
# Python dependencies
|
||||||
|
|||||||
@@ -2238,6 +2238,271 @@ async def export_reconstruction_docx(session_id: str):
|
|||||||
raise HTTPException(status_code=501, detail="python-docx not installed")
|
raise HTTPException(status_code=501, detail="python-docx not installed")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Step 8: Validation — Original vs. Reconstruction
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Style keyword -> prompt suffix appended to the user prompt before it is
# sent to the image generator. The generate-image endpoint falls back to the
# "educational" entry when the requested style is not listed here.
STYLE_SUFFIXES = dict(
    educational="educational illustration, textbook style, clear, colorful",
    cartoon="cartoon, child-friendly, simple shapes",
    sketch="pencil sketch, hand-drawn, black and white",
    clipart="clipart, flat vector style, simple",
    realistic="photorealistic, high detail",
)
|
||||||
|
|
||||||
|
|
||||||
|
class ValidationRequest(BaseModel):
    """Request body for saving the step-8 validation result."""

    # Free-text reviewer notes; may be omitted.
    notes: Optional[str] = None
    # Reviewer score; may be omitted. No range is enforced by this model.
    score: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateImageRequest(BaseModel):
    """Request body for generating a replacement image for one detected region."""

    # Index into the session's stored ``image_regions`` list.
    region_index: int
    # Text prompt describing the image to generate (style suffix is added later).
    prompt: str
    # Key into STYLE_SUFFIXES; defaults to the "educational" style.
    style: str = "educational"
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_json_array(text):
    """Return the first parseable JSON array embedded in ``text``, else ``[]``.

    Tries a real JSON decode starting at every ``[`` instead of a regex, so
    brackets inside descriptions or nested arrays do not truncate the result.
    """
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            value, _end = decoder.raw_decode(text, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            value = None
        if isinstance(value, list):
            return value
        start = text.find("[", start + 1)
    return []


def _normalize_region(raw):
    """Convert one raw VLM entry into the stored region dict, or ``None`` if unusable.

    Coordinates are percentages of the page. The box is clamped so that it
    starts on the page and never extends past the right/bottom edge.
    """
    if not isinstance(raw, dict):
        return None
    try:
        x = float(raw.get("x", 0))
        y = float(raw.get("y", 0))
        w = float(raw.get("w", 10))
        h = float(raw.get("h", 10))
    except (TypeError, ValueError):
        # e.g. null or "10%" -- skip this entry instead of failing the whole request
        return None

    # Origin at most 99 % so that a box of the minimum size (1 %) still fits.
    x = max(0.0, min(99.0, x))
    y = max(0.0, min(99.0, y))
    w = max(1.0, min(100.0 - x, w))
    h = max(1.0, min(100.0 - y, h))

    description = raw.get("description")
    if not isinstance(description, str):
        description = ""

    return {
        "bbox_pct": {"x": x, "y": y, "w": w, "h": h},
        "description": description,
        "prompt": description,
        "image_b64": None,
        "style": "educational",
    }


@router.post("/sessions/{session_id}/reconstruction/detect-images")
async def detect_image_regions(session_id: str):
    """Detect illustration/image regions in the original scan using VLM.

    Sends the original image to qwen2.5vl to find non-text, non-table
    image areas, returning bounding boxes (in %) and descriptions.

    Returns:
        ``{"regions": [...], "count": n}`` on success. If the VLM cannot be
        reached or any other error occurs, an empty region list plus an
        ``"error"`` message is returned instead of raising.

    Raises:
        HTTPException: 404 if the session does not exist, 400 if it has no
            original image.
    """
    import base64
    import httpx

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Get original image bytes
    original_png = await get_session_image(session_id, "original")
    if not original_png:
        raise HTTPException(status_code=400, detail="No original image found")

    # Build context from vocab entries for richer descriptions
    word_result = session.get("word_result") or {}
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    vocab_context = ""
    if entries:
        sample = entries[:10]
        words = [f"{e.get('english', '')} / {e.get('german', '')}" for e in sample if e.get('english')]
        if words:
            vocab_context = f"\nContext: This is a vocabulary page with words like: {', '.join(words)}"

    ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
    model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")

    prompt = (
        "Analyze this scanned page. Find ALL illustration/image/picture regions "
        "(NOT text, NOT table cells, NOT blank areas). "
        "For each image region found, return its bounding box as percentage of page dimensions "
        "and a short English description of what the image shows. "
        "Reply with ONLY a JSON array like: "
        '[{"x": 10, "y": 20, "w": 30, "h": 25, "description": "drawing of a cat"}] '
        "where x, y, w, h are percentages (0-100) of the page width/height. "
        "If there are NO images on the page, return an empty array: []"
        f"{vocab_context}"
    )

    img_b64 = base64.b64encode(original_png).decode("utf-8")
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,
    }

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(f"{ollama_base}/api/generate", json=payload)
            resp.raise_for_status()
            # "response" may be present but null; treat that like an empty reply
            text = resp.json().get("response") or ""

        # Parse JSON array from response
        raw_regions = _extract_json_array(text)

        # Normalize to ImageRegion format, dropping malformed entries
        regions = []
        for raw in raw_regions:
            region = _normalize_region(raw)
            if region is not None:
                regions.append(region)

        # Enrich prompts with nearby vocab context
        if entries:
            for region in regions:
                ry = region["bbox_pct"]["y"]
                rh = region["bbox_pct"]["h"]
                nearby = [
                    e for e in entries
                    if e.get("bbox") and abs(e["bbox"].get("y", 0) - ry) < rh + 10
                ]
                if nearby:
                    en_words = [e.get("english", "") for e in nearby if e.get("english")]
                    de_words = [e.get("german", "") for e in nearby if e.get("german")]
                    if en_words or de_words:
                        context = f" (vocabulary context: {', '.join(en_words[:5])}"
                        if de_words:
                            context += f" / {', '.join(de_words[:5])}"
                        context += ")"
                        region["prompt"] = region["description"] + context

        # Save to ground_truth JSONB
        ground_truth = session.get("ground_truth") or {}
        validation = ground_truth.get("validation") or {}
        validation["image_regions"] = regions
        validation["detected_at"] = datetime.utcnow().isoformat()
        ground_truth["validation"] = validation
        await update_session_db(session_id, ground_truth=ground_truth)

        # Keep the in-memory session cache in sync with the database
        if session_id in _cache:
            _cache[session_id]["ground_truth"] = ground_truth

        logger.info(f"Detected {len(regions)} image regions for session {session_id}")

        return {"regions": regions, "count": len(regions)}

    except httpx.ConnectError:
        logger.warning(f"VLM not available at {ollama_base} for image detection")
        return {"regions": [], "count": 0, "error": "VLM not available"}
    except Exception as e:
        # Best-effort endpoint: report the failure to the client instead of a 500
        logger.error(f"Image detection failed for {session_id}: {e}")
        return {"regions": [], "count": 0, "error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sessions/{session_id}/reconstruction/generate-image")
async def generate_image_for_region(session_id: str, req: GenerateImageRequest):
    """Create a replacement picture for one stored image region via mflux.

    The user prompt is combined with the suffix of the requested style and
    posted to the mflux-service (runs natively on the Mac Mini, Metal GPU
    required). On success the generated image, prompt and style are written
    back into the session's ``ground_truth.validation.image_regions`` entry.

    Returns:
        ``{"image_b64": ..., "success": True}`` on success, otherwise
        ``{"image_b64": None, "success": False, "error": ...}``.

    Raises:
        HTTPException: 404 for an unknown session, 400 for a region index
            outside the stored region list.
    """
    import httpx

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    ground_truth = session.get("ground_truth") or {}
    validation = ground_truth.get("validation") or {}
    regions = validation.get("image_regions") or []

    idx = req.region_index
    if not (0 <= idx < len(regions)):
        raise HTTPException(status_code=400, detail=f"Invalid region_index {req.region_index}, have {len(regions)} regions")

    mflux_url = os.getenv("MFLUX_URL", "http://host.docker.internal:8095")
    style_suffix = STYLE_SUFFIXES.get(req.style, STYLE_SUFFIXES["educational"])
    full_prompt = f"{req.prompt}, {style_suffix}"

    # Pick one of three fixed output sizes (multiples of 64) from the
    # width/height ratio of the region: landscape, portrait or square.
    target = regions[idx]
    box = target["bbox_pct"]
    ratio = box["w"] / max(box["h"], 1)
    if ratio > 1.3:
        size = (768, 512)
    elif ratio < 0.7:
        size = (512, 768)
    else:
        size = (512, 512)
    width, height = size

    request_body = {
        "prompt": full_prompt,
        "width": width,
        "height": height,
        "steps": 4,
    }

    try:
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(f"{mflux_url}/generate", json=request_body)
            resp.raise_for_status()
            image_b64 = resp.json().get("image_b64")

        if not image_b64:
            return {"image_b64": None, "success": False, "error": "No image returned"}

        # Persist the result on the region; the raw user prompt is stored,
        # not the prompt with the style suffix appended.
        target["image_b64"] = image_b64
        target["prompt"] = req.prompt
        target["style"] = req.style
        validation["image_regions"] = regions
        ground_truth["validation"] = validation
        await update_session_db(session_id, ground_truth=ground_truth)

        # Mirror the change into the in-memory session cache, if present
        if session_id in _cache:
            _cache[session_id]["ground_truth"] = ground_truth

        logger.info(f"Generated image for session {session_id} region {req.region_index}")
        return {"image_b64": image_b64, "success": True}

    except httpx.ConnectError:
        logger.warning(f"mflux-service not available at {mflux_url}")
        return {"image_b64": None, "success": False, "error": f"mflux-service not available at {mflux_url}"}
    except Exception as e:
        logger.error(f"Image generation failed for {session_id}: {e}")
        return {"image_b64": None, "success": False, "error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sessions/{session_id}/reconstruction/validate")
async def save_validation(session_id: str, req: ValidationRequest):
    """Persist the reviewer's final verdict for step 8.

    Writes a timestamp, the notes and the score into
    ``ground_truth["validation"]`` while leaving every other key already
    stored there (e.g. image regions) untouched, and sets
    ``current_step = 8`` on the session.

    Raises:
        HTTPException: 404 if the session does not exist.

    Returns:
        ``{"session_id": ..., "validation": ...}`` with the updated record.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    ground_truth = session.get("ground_truth") or {}
    validation = ground_truth.get("validation") or {}
    # Merge the new fields into the existing record instead of replacing it.
    validation.update(
        validated_at=datetime.utcnow().isoformat(),
        notes=req.notes,
        score=req.score,
    )
    ground_truth["validation"] = validation

    await update_session_db(session_id, ground_truth=ground_truth, current_step=8)

    # Mirror the change into the in-memory cache when the session is cached.
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth

    logger.info(f"Validation saved for session {session_id}: score={req.score}")

    return {"session_id": session_id, "validation": validation}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sessions/{session_id}/reconstruction/validation")
async def get_validation(session_id: str):
    """Return the stored step-8 validation record for a session.

    The response also carries the session's ``word_result``. ``validation``
    is ``None`` when nothing has been saved yet.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    stored_ground_truth = session.get("ground_truth") or {}

    payload = {"session_id": session_id}
    payload["validation"] = stored_ground_truth.get("validation")
    payload["word_result"] = session.get("word_result")
    return payload
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/reprocess")
|
@router.post("/sessions/{session_id}/reprocess")
|
||||||
async def reprocess_session(session_id: str, request: Request):
|
async def reprocess_session(session_id: str, request: Request):
|
||||||
"""Re-run pipeline from a specific step, clearing downstream data.
|
"""Re-run pipeline from a specific step, clearing downstream data.
|
||||||
|
|||||||
121
scripts/mflux-service.py
Normal file
121
scripts/mflux-service.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
mflux-service — Standalone FastAPI wrapper for mflux image generation.
|
||||||
|
|
||||||
|
Runs NATIVELY on Mac Mini (requires Metal GPU, not Docker).
|
||||||
|
Generates images using Flux Schnell via the mflux library.
|
||||||
|
|
||||||
|
Setup:
|
||||||
|
python3 -m venv ~/mflux-env
|
||||||
|
source ~/mflux-env/bin/activate
|
||||||
|
pip install mflux fastapi uvicorn
|
||||||
|
|
||||||
|
Run:
|
||||||
|
source ~/mflux-env/bin/activate
|
||||||
|
python scripts/mflux-service.py
|
||||||
|
|
||||||
|
Or as a background service:
|
||||||
|
nohup ~/mflux-env/bin/python scripts/mflux-service.py > /tmp/mflux-service.log 2>&1 &
|
||||||
|
|
||||||
|
License: Apache-2.0
|
||||||
|
"""
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import uvicorn
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
# Root logging is configured at import time so messages emitted while the
# service starts up are formatted the same way as request-time messages.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("mflux-service")

# ASGI application object; it is handed to uvicorn in the __main__ block.
app = FastAPI(
    title="mflux Image Generation Service",
    version="1.0.0",
)
|
||||||
|
|
||||||
|
# Shared Flux1 instance; stays None until the first call to _get_flux().
_flux = None


def _get_flux():
    """Return the shared Flux model, constructing it on the first call.

    The ``mflux`` import is deferred to the first call, so importing this
    module does not load the library or the model.
    """
    global _flux
    if _flux is not None:
        return _flux

    logger.info("Loading Flux Schnell model (first call, may download ~12 GB)...")
    from mflux import Flux1

    _flux = Flux1(model_name="schnell", quantize=8)
    logger.info("Flux Schnell model loaded.")
    return _flux
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateRequest(BaseModel):
    """Request body accepted by ``POST /generate``."""

    # Text prompt handed to the model.
    prompt: str
    # Requested output size in pixels; the /generate handler rounds these
    # down to a multiple of 64 and clamps them to 256..1024.
    width: int = 512
    height: int = 512
    # Number of inference steps.
    steps: int = 4
    # Optional seed for the generator; omit to let the service choose one.
    seed: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateResponse(BaseModel):
    """Response body returned by ``POST /generate``."""

    # PNG as a ``data:image/png;base64,...`` URI; None when generation failed.
    image_b64: Optional[str] = None
    # False when generation raised; ``error`` then holds the message.
    success: bool = True
    error: Optional[str] = None
    # Wall-clock time spent handling the request, in milliseconds.
    duration_ms: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
async def health():
    """Liveness probe returning a fixed payload; it does not touch the model."""
    status_payload = {
        "status": "ok",
        "model": "flux-schnell",
        "gpu": "metal",
    }
    return status_payload
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/generate", response_model=GenerateResponse)
async def generate_image(req: GenerateRequest):
    """Generate an image from a text prompt using Flux Schnell.

    Args:
        req: Prompt, requested size, step count and optional seed.

    Returns:
        A ``GenerateResponse``. On success ``image_b64`` holds the PNG as a
        ``data:image/png;base64,...`` URI. Any exception raised during
        generation is caught and reported as ``success=False`` with the
        message in ``error``, so the endpoint does not surface generation
        failures as HTTP errors.
    """
    t0 = time.time()

    # Validate dimensions (must be multiples of 64 for Flux): round down to a
    # multiple of 64, then clamp to the 256..1024 range.
    width = max(256, min(1024, (req.width // 64) * 64))
    height = max(256, min(1024, (req.height // 64) * 64))

    # Honour an explicit seed of 0: only fall back to a time-derived seed
    # when the caller supplied none. A plain ``req.seed or ...`` would treat
    # 0 as missing and make that seed impossible to reproduce.
    seed = req.seed if req.seed is not None else int(time.time()) % 2**31

    try:
        from mflux import Config

        flux = _get_flux()
        image = flux.generate_image(
            seed=seed,
            prompt=req.prompt,
            config=Config(
                num_inference_steps=req.steps,
                height=height,
                width=width,
            ),
        )

        # Convert PIL image to base64
        buf = io.BytesIO()
        image.save(buf, format="PNG")
        buf.seek(0)
        img_b64 = "data:image/png;base64," + base64.b64encode(buf.read()).decode("utf-8")

        duration_ms = int((time.time() - t0) * 1000)
        logger.info(f"Generated {width}x{height} image in {duration_ms}ms: {req.prompt[:60]}...")

        return GenerateResponse(image_b64=img_b64, success=True, duration_ms=duration_ms)

    except Exception as e:
        duration_ms = int((time.time() - t0) * 1000)
        logger.error(f"Generation failed: {e}")
        return GenerateResponse(image_b64=None, success=False, error=str(e), duration_ms=duration_ms)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # The listening port can be overridden with MFLUX_PORT.
    listen_port = int(os.getenv("MFLUX_PORT", "8095"))
    logger.info(f"Starting mflux-service on port {listen_port}")
    # Bind to all interfaces rather than only loopback.
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
|
||||||
Reference in New Issue
Block a user