feat: Orientierung + Zuschneiden als Schritte 1-2 in OCR-Pipeline
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 18s

Zwei neue Wizard-Schritte vor Begradigung:
- Step 1: Orientierungserkennung (0/90/180/270° via Tesseract OSD)
- Step 2: Seitenrand-Erkennung und Zuschnitt (Scannerraender entfernen)

Backend:
- orientation_crop_api.py: POST /orientation, POST /crop, POST /crop/skip
- page_crop.py: detect_and_crop_page() mit Format-Erkennung (A4/A5/Letter)
- Session-Store: orientation_result, crop_result Felder
- Pipeline nutzt zugeschnittenes Bild fuer Deskew/Dewarp

Frontend:
- StepOrientation.tsx: Upload + Auto-Orientierung + Vorher/Nachher
- StepCrop.tsx: Auto-Crop + Format-Badge + Ueberspringen-Option
- Pipeline-Stepper: 10 Schritte (war 8)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-08 23:55:23 +01:00
parent 9a5a35bff1
commit 2763631711
12 changed files with 1247 additions and 259 deletions

View File

@@ -3,6 +3,8 @@
import { useCallback, useEffect, useState } from 'react'
import { PagePurpose } from '@/components/common/PagePurpose'
import { PipelineStepper } from '@/components/ocr-pipeline/PipelineStepper'
import { StepOrientation } from '@/components/ocr-pipeline/StepOrientation'
import { StepCrop } from '@/components/ocr-pipeline/StepCrop'
import { StepDeskew } from '@/components/ocr-pipeline/StepDeskew'
import { StepDewarp } from '@/components/ocr-pipeline/StepDewarp'
import { StepColumnDetection } from '@/components/ocr-pipeline/StepColumnDetection'
@@ -196,7 +198,7 @@ export default function OcrPipelinePage() {
setCurrentStep(nextStep)
}
const handleDeskewComplete = (sid: string) => {
const handleOrientationComplete = (sid: string) => {
setSessionId(sid)
// Reload session list to show the new session
loadSessions()
@@ -270,14 +272,16 @@ export default function OcrPipelinePage() {
}
const stepNames: Record<number, string> = {
1: 'Begradigung',
2: 'Entzerrung',
3: 'Spalten',
4: 'Zeilen',
5: 'Woerter',
6: 'Korrektur',
7: 'Rekonstruktion',
8: 'Validierung',
1: 'Orientierung',
2: 'Zuschneiden',
3: 'Begradigung',
4: 'Entzerrung',
5: 'Spalten',
6: 'Zeilen',
7: 'Woerter',
8: 'Korrektur',
9: 'Rekonstruktion',
10: 'Validierung',
}
const reprocessFromStep = useCallback(async (uiStep: number) => {
@@ -306,20 +310,24 @@ export default function OcrPipelinePage() {
const renderStep = () => {
switch (currentStep) {
case 0:
return <StepDeskew sessionId={sessionId} onNext={handleDeskewComplete} />
return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} />
case 1:
return <StepDewarp sessionId={sessionId} onNext={handleDewarpNext} />
return <StepCrop sessionId={sessionId} onNext={handleNext} />
case 2:
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} />
return <StepDeskew sessionId={sessionId} onNext={handleNext} />
case 3:
return <StepRowDetection sessionId={sessionId} onNext={handleNext} />
return <StepDewarp sessionId={sessionId} onNext={handleDewarpNext} />
case 4:
return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} />
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} />
case 5:
return <StepLlmReview sessionId={sessionId} onNext={handleNext} />
return <StepRowDetection sessionId={sessionId} onNext={handleNext} />
case 6:
return <StepReconstruction sessionId={sessionId} onNext={handleNext} />
return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} />
case 7:
return <StepLlmReview sessionId={sessionId} onNext={handleNext} />
case 8:
return <StepReconstruction sessionId={sessionId} onNext={handleNext} />
case 9:
return <StepGroundTruth sessionId={sessionId} onNext={handleNext} />
default:
return null

View File

@@ -57,6 +57,26 @@ export interface DocumentTypeResult {
duration_seconds?: number
}
/**
 * Result of the orientation-detection step as returned by the
 * `POST /sessions/{id}/orientation` endpoint and stored on the session.
 */
export interface OrientationResult {
// Rotation in degrees reported by the backend; shown in the UI as "<n>° korrigiert".
// NOTE(review): presumably one of 0/90/180/270 — confirm against the backend.
orientation_degrees: number
// True when the page was rotated; the UI shows the "no rotation needed" badge otherwise.
corrected: boolean
// Processing time; rendered in the UI with an "s" suffix.
duration_seconds: number
}
/**
 * Result of the page-crop step (scanner border removal) as returned by the
 * `POST /sessions/{id}/crop` and `POST /sessions/{id}/crop/skip` endpoints.
 */
export interface CropResult {
// True when the image was actually cropped; the UI shows "Kein Zuschnitt noetig" otherwise.
crop_applied: boolean
// Crop rectangle. NOTE(review): presumably in pixels of the input image — confirm.
crop_rect?: { x: number; y: number; width: number; height: number }
// Crop rectangle. NOTE(review): presumably in percent of the input image — confirm.
crop_rect_pct?: { x: number; y: number; width: number; height: number }
// Image dimensions before cropping (displayed as "<width>x<height>").
original_size: { width: number; height: number }
// Image dimensions after cropping (displayed as "<width>x<height>").
cropped_size: { width: number; height: number }
// Name of the detected paper format, displayed verbatim in the format badge.
detected_format?: string
// Confidence of the format detection; the UI multiplies it by 100 and shows it as a percentage.
format_confidence?: number
// NOTE(review): presumably width/height of the detected page — confirm orientation of the ratio.
aspect_ratio?: number
// Border share per side; the UI multiplies each value by 100 and shows it as a percentage.
border_fractions?: { top: number; bottom: number; left: number; right: number }
// NOTE(review): presumably set when the crop was skipped via the skip endpoint — confirm.
skipped?: boolean
// Processing time; rendered in the UI with an "s" suffix.
duration_seconds?: number
}
export interface SessionInfo {
session_id: string
filename: string
@@ -67,6 +87,8 @@ export interface SessionInfo {
current_step?: number
document_category?: DocumentCategory
doc_type?: string
orientation_result?: OrientationResult
crop_result?: CropResult
deskew_result?: DeskewResult
dewarp_result?: DewarpResult
column_result?: ColumnResult
@@ -85,7 +107,6 @@ export interface DeskewResult {
angle_applied: number
method_used: 'hough' | 'word_alignment' | 'manual' | 'iterative' | 'two_pass' | 'three_pass' | 'manual_combined'
confidence: number
orientation_degrees?: number
duration_seconds: number
deskewed_image_url: string
binarized_image_url: string
@@ -288,6 +309,8 @@ export const IMAGE_STYLES: { value: ImageStyle; label: string }[] = [
]
export const PIPELINE_STEPS: PipelineStep[] = [
{ id: 'orientation', name: 'Orientierung', icon: '🔄', status: 'pending' },
{ id: 'crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' },
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },
{ id: 'columns', name: 'Spalten', icon: '📊', status: 'pending' },

View File

@@ -59,11 +59,6 @@ export function DeskewControls({
{/* Results */}
{deskewResult && (
<div className="bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700 p-4">
{deskewResult.orientation_degrees ? (
<div className="flex items-center gap-2 mb-2 px-2 py-1 bg-amber-50 dark:bg-amber-900/20 text-amber-700 dark:text-amber-400 rounded text-xs">
Seite wurde um {deskewResult.orientation_degrees}° gedreht (Orientierungskorrektur)
</div>
) : null}
<div className="flex flex-wrap items-center gap-3 text-sm">
<div>
<span className="text-gray-500">Winkel:</span>{' '}

View File

@@ -0,0 +1,185 @@
'use client'
import { useEffect, useState } from 'react'
import type { CropResult } from '@/app/(admin)/ai/ocr-pipeline/types'
import { ImageCompareView } from './ImageCompareView'
const KLAUSUR_API = '/klausur-api'
interface StepCropProps {
sessionId: string | null
onNext: () => void
}
/**
 * Wizard step "Zuschneiden": shows the oriented page next to its cropped
 * version and lets the user accept the automatic crop or skip it.
 *
 * On mount (and whenever `sessionId` changes) the component first asks the
 * session endpoint for an existing `crop_result`; only when none is stored
 * does it trigger the automatic crop via `POST .../crop`.
 *
 * @param sessionId - Session to crop; `null` renders a placeholder.
 * @param onNext - Called when the user continues, or after a successful skip.
 */
export function StepCrop({ sessionId, onNext }: StepCropProps) {
  const [cropResult, setCropResult] = useState<CropResult | null>(null)
  const [cropping, setCropping] = useState(false)
  const [error, setError] = useState<string | null>(null)

  // Load an existing crop result or trigger the automatic crop.
  // Keyed on sessionId only, so switching sessions re-runs the step.
  useEffect(() => {
    if (!sessionId) return

    // Set by the cleanup below; prevents a late response from writing state
    // for a session that is no longer displayed (or an unmounted component).
    let cancelled = false

    const runCrop = async () => {
      setCropping(true)
      setError(null)
      // Drop the previous session's result so it is not shown for the new one.
      setCropResult(null)
      try {
        // Check if session already has crop result
        const sessionRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
        if (cancelled) return
        if (sessionRes.ok) {
          const sessionData = await sessionRes.json()
          if (cancelled) return
          if (sessionData.crop_result) {
            setCropResult(sessionData.crop_result)
            return
          }
        }

        const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/crop`, {
          method: 'POST',
        })
        if (!res.ok) {
          throw new Error('Zuschnitt fehlgeschlagen')
        }
        const data: CropResult = await res.json()
        if (!cancelled) setCropResult(data)
      } catch (e) {
        if (!cancelled) setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
      } finally {
        if (!cancelled) setCropping(false)
      }
    }

    void runCrop()

    return () => {
      cancelled = true
    }
  }, [sessionId])

  // Skip cropping. Only advances when the backend confirmed the skip;
  // otherwise the pipeline would silently continue with the cropped image.
  const handleSkip = async () => {
    if (!sessionId) return
    setError(null)
    try {
      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/crop/skip`, {
        method: 'POST',
      })
      if (!res.ok) {
        throw new Error('Ueberspringen fehlgeschlagen')
      }
      const data: CropResult = await res.json()
      setCropResult(data)
    } catch (e) {
      console.error('Skip crop failed:', e)
      setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
      return
    }
    onNext()
  }

  if (!sessionId) {
    return <div className="text-sm text-gray-400">Keine Session ausgewaehlt.</div>
  }

  const orientedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented`
  const croppedUrl = cropResult
    ? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped`
    : null

  // Offer the skip action after a failed auto-crop too, so the user is not stuck.
  const showActions = !cropping && (cropResult !== null || error !== null)

  return (
    <div className="space-y-4">
      {/* Loading indicator */}
      {cropping && (
        <div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
          <div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
          Scannerraender werden erkannt...
        </div>
      )}

      {/* Image comparison */}
      <ImageCompareView
        originalUrl={orientedUrl}
        deskewedUrl={croppedUrl}
        showGrid={false}
        showBinarized={false}
        binarizedUrl={null}
        leftLabel="Orientiert"
        rightLabel="Zugeschnitten"
      />

      {/* Crop result info */}
      {cropResult && (
        <div className="bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700 p-4">
          <div className="flex flex-wrap items-center gap-3 text-sm">
            {cropResult.crop_applied ? (
              <>
                <span className="inline-flex items-center gap-1.5 px-3 py-1 rounded-full bg-amber-50 dark:bg-amber-900/20 text-amber-700 dark:text-amber-400 text-xs font-medium">
                  Zugeschnitten
                </span>
                {cropResult.detected_format && (
                  <>
                    <div className="h-4 w-px bg-gray-300 dark:bg-gray-600" />
                    <span className="text-gray-600 dark:text-gray-400">
                      Format: <span className="font-medium">{cropResult.detected_format}</span>
                      {cropResult.format_confidence != null && (
                        <span className="text-gray-400 ml-1">
                          ({Math.round(cropResult.format_confidence * 100)}%)
                        </span>
                      )}
                    </span>
                  </>
                )}
                <div className="h-4 w-px bg-gray-300 dark:bg-gray-600" />
                <span className="text-gray-400 text-xs">
                  {cropResult.original_size.width}x{cropResult.original_size.height} &rarr; {cropResult.cropped_size.width}x{cropResult.cropped_size.height}
                </span>
                {cropResult.border_fractions && (
                  <>
                    <div className="h-4 w-px bg-gray-300 dark:bg-gray-600" />
                    <span className="text-gray-400 text-xs">
                      Raender: O={pct(cropResult.border_fractions.top)} U={pct(cropResult.border_fractions.bottom)} L={pct(cropResult.border_fractions.left)} R={pct(cropResult.border_fractions.right)}
                    </span>
                  </>
                )}
              </>
            ) : (
              <span className="inline-flex items-center gap-1.5 px-3 py-1 rounded-full bg-green-50 dark:bg-green-900/20 text-green-700 dark:text-green-400 text-xs font-medium">
                Kein Zuschnitt noetig
              </span>
            )}
            {cropResult.duration_seconds != null && (
              <span className="text-gray-400 text-xs ml-auto">
                {cropResult.duration_seconds}s
              </span>
            )}
          </div>
        </div>
      )}

      {/* Action buttons */}
      {showActions && (
        <div className="flex justify-between">
          <button
            onClick={handleSkip}
            className="px-4 py-2 text-sm text-gray-500 hover:text-gray-700 dark:text-gray-400 dark:hover:text-gray-200 transition-colors"
          >
            Ueberspringen
          </button>
          {/* Continuing is only possible once a crop result exists */}
          {cropResult && (
            <button
              onClick={onNext}
              className="px-6 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium transition-colors"
            >
              Weiter &rarr;
            </button>
          )}
        </div>
      )}

      {error && (
        <div className="p-3 bg-red-50 dark:bg-red-900/20 text-red-600 dark:text-red-400 rounded-lg text-sm">
          {error}
        </div>
      )}
    </div>
  )
}
/**
 * Formats a fraction as a percentage string with one decimal place.
 *
 * @param v - Fraction to format (e.g. `0.5` for fifty percent).
 * @returns The value scaled by 100, fixed to one decimal, followed by `%`.
 */
function pct(v: number): string {
  const scaled = v * 100
  return scaled.toFixed(1) + '%'
}

View File

@@ -8,29 +8,27 @@ import { ImageCompareView } from './ImageCompareView'
const KLAUSUR_API = '/klausur-api'
interface StepDeskewProps {
sessionId?: string | null
onNext: (sessionId: string) => void
sessionId: string | null
onNext: () => void
}
export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewProps) {
export function StepDeskew({ sessionId, onNext }: StepDeskewProps) {
const [session, setSession] = useState<SessionInfo | null>(null)
const [deskewResult, setDeskewResult] = useState<DeskewResult | null>(null)
const [uploading, setUploading] = useState(false)
const [deskewing, setDeskewing] = useState(false)
const [applying, setApplying] = useState(false)
const [showBinarized, setShowBinarized] = useState(false)
const [showGrid, setShowGrid] = useState(true)
const [error, setError] = useState<string | null>(null)
const [dragOver, setDragOver] = useState(false)
const [sessionName, setSessionName] = useState('')
const [hasAutoRun, setHasAutoRun] = useState(false)
// Reload session data when navigating back from a later step
// Load session and auto-trigger deskew
useEffect(() => {
if (!existingSessionId || session) return
if (!sessionId || session) return
const loadSession = async () => {
const loadAndDeskew = async () => {
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${existingSessionId}`)
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`)
if (!res.ok) return
const data = await res.json()
@@ -39,83 +37,56 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP
filename: data.filename,
image_width: data.image_width,
image_height: data.image_height,
original_image_url: `${KLAUSUR_API}${data.original_image_url}`,
// Use cropped image as "before" view
original_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped`,
}
setSession(sessionInfo)
// Reconstruct deskew result from session data
// If deskew result already exists, use it
if (data.deskew_result) {
const dr: DeskewResult = {
...data.deskew_result,
deskewed_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${existingSessionId}/image/deskewed`,
binarized_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${existingSessionId}/image/binarized`,
deskewed_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/deskewed`,
binarized_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/binarized`,
}
setDeskewResult(dr)
return
}
// Auto-trigger deskew if not already done
if (!hasAutoRun) {
setHasAutoRun(true)
setDeskewing(true)
const deskewRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/deskew`, {
method: 'POST',
})
if (!deskewRes.ok) {
throw new Error('Begradigung fehlgeschlagen')
}
const deskewData: DeskewResult = await deskewRes.json()
deskewData.deskewed_image_url = `${KLAUSUR_API}${deskewData.deskewed_image_url}`
deskewData.binarized_image_url = `${KLAUSUR_API}${deskewData.binarized_image_url}`
setDeskewResult(deskewData)
}
} catch (e) {
console.error('Failed to reload session:', e)
setError(e instanceof Error ? e.message : 'Fehler beim Laden')
} finally {
setDeskewing(false)
}
}
loadSession()
}, [existingSessionId, session])
const handleUpload = useCallback(async (file: File) => {
setUploading(true)
setError(null)
setDeskewResult(null)
try {
const formData = new FormData()
formData.append('file', file)
if (sessionName.trim()) {
formData.append('name', sessionName.trim())
}
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, {
method: 'POST',
body: formData,
})
if (!res.ok) {
const err = await res.json().catch(() => ({ detail: res.statusText }))
throw new Error(err.detail || 'Upload fehlgeschlagen')
}
const data: SessionInfo = await res.json()
// Prepend API prefix to relative URLs
data.original_image_url = `${KLAUSUR_API}${data.original_image_url}`
setSession(data)
// Auto-trigger deskew
setDeskewing(true)
const deskewRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${data.session_id}/deskew`, {
method: 'POST',
})
if (!deskewRes.ok) {
throw new Error('Begradigung fehlgeschlagen')
}
const deskewData: DeskewResult = await deskewRes.json()
deskewData.deskewed_image_url = `${KLAUSUR_API}${deskewData.deskewed_image_url}`
deskewData.binarized_image_url = `${KLAUSUR_API}${deskewData.binarized_image_url}`
setDeskewResult(deskewData)
} catch (e) {
setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
} finally {
setUploading(false)
setDeskewing(false)
}
}, [])
loadAndDeskew()
}, [sessionId, session, hasAutoRun])
const handleManualDeskew = useCallback(async (angle: number) => {
if (!session) return
if (!sessionId) return
setApplying(true)
setError(null)
try {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${session.session_id}/deskew/manual`, {
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/deskew/manual`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ angle }),
@@ -130,7 +101,6 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP
...prev,
angle_applied: data.angle_applied,
method_used: data.method_used,
// Force reload by appending timestamp
deskewed_image_url: `${KLAUSUR_API}${data.deskewed_image_url}?t=${Date.now()}`,
}
: null,
@@ -140,12 +110,12 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP
} finally {
setApplying(false)
}
}, [session])
}, [sessionId])
const handleGroundTruth = useCallback(async (gt: DeskewGroundTruth) => {
if (!session) return
if (!sessionId) return
try {
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${session.session_id}/ground-truth/deskew`, {
await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/ground-truth/deskew`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(gt),
@@ -153,89 +123,21 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP
} catch (e) {
console.error('Ground truth save failed:', e)
}
}, [session])
}, [sessionId])
const handleDrop = useCallback((e: React.DragEvent) => {
e.preventDefault()
setDragOver(false)
const file = e.dataTransfer.files[0]
if (file) handleUpload(file)
}, [handleUpload])
const handleFileInput = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
const file = e.target.files?.[0]
if (file) handleUpload(file)
}, [handleUpload])
// Upload area (no session yet)
if (!session) {
return (
<div className="space-y-4">
{/* Session name input */}
<div>
<label className="block text-sm font-medium text-gray-600 dark:text-gray-400 mb-1">
Session-Name (optional)
</label>
<input
type="text"
value={sessionName}
onChange={(e) => setSessionName(e.target.value)}
placeholder="z.B. Unit 3 Seite 42"
className="w-full max-w-sm px-3 py-2 text-sm border rounded-lg dark:bg-gray-800 dark:border-gray-600 dark:text-gray-200 focus:outline-none focus:ring-2 focus:ring-teal-500"
/>
</div>
<div
onDragOver={(e) => { e.preventDefault(); setDragOver(true) }}
onDragLeave={() => setDragOver(false)}
onDrop={handleDrop}
className={`border-2 border-dashed rounded-xl p-12 text-center transition-colors ${
dragOver
? 'border-teal-400 bg-teal-50 dark:bg-teal-900/20'
: 'border-gray-300 dark:border-gray-600 hover:border-teal-400'
}`}
>
{uploading ? (
<div className="text-gray-500">
<div className="animate-spin inline-block w-8 h-8 border-2 border-teal-500 border-t-transparent rounded-full mb-3" />
<p>Wird hochgeladen...</p>
</div>
) : (
<>
<div className="text-4xl mb-3">📄</div>
<p className="text-gray-600 dark:text-gray-400 mb-2">
PDF oder Bild hierher ziehen
</p>
<p className="text-sm text-gray-400 mb-4">oder</p>
<label className="inline-block px-4 py-2 bg-teal-600 text-white rounded-lg cursor-pointer hover:bg-teal-700 transition-colors">
Datei auswaehlen
<input
type="file"
accept=".pdf,.png,.jpg,.jpeg,.tiff,.tif"
onChange={handleFileInput}
className="hidden"
/>
</label>
</>
)}
</div>
{error && (
<div className="p-3 bg-red-50 dark:bg-red-900/20 text-red-600 dark:text-red-400 rounded-lg text-sm">
{error}
</div>
)}
</div>
)
if (!sessionId) {
return <div className="text-sm text-gray-400">Keine Session ausgewaehlt.</div>
}
// Session active: show comparison + controls
return (
<div className="space-y-4">
{/* Filename */}
<div className="text-sm text-gray-500 dark:text-gray-400">
Datei: <span className="font-medium text-gray-700 dark:text-gray-300">{session.filename}</span>
{' '}({session.image_width} x {session.image_height} px)
</div>
{session && (
<div className="text-sm text-gray-500 dark:text-gray-400">
Datei: <span className="font-medium text-gray-700 dark:text-gray-300">{session.filename}</span>
{' '}({session.image_width} x {session.image_height} px)
</div>
)}
{/* Loading indicator */}
{deskewing && (
@@ -246,13 +148,17 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP
)}
{/* Image comparison */}
<ImageCompareView
originalUrl={session.original_image_url}
deskewedUrl={deskewResult?.deskewed_image_url ?? null}
showGrid={showGrid}
showBinarized={showBinarized}
binarizedUrl={deskewResult?.binarized_image_url ?? null}
/>
{session && (
<ImageCompareView
originalUrl={session.original_image_url}
deskewedUrl={deskewResult?.deskewed_image_url ?? null}
showGrid={showGrid}
showBinarized={showBinarized}
binarizedUrl={deskewResult?.binarized_image_url ?? null}
leftLabel="Zugeschnitten"
rightLabel="Begradigt"
/>
)}
{/* Controls */}
<DeskewControls
@@ -263,7 +169,7 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP
onToggleGrid={() => setShowGrid((v) => !v)}
onManualDeskew={handleManualDeskew}
onGroundTruth={handleGroundTruth}
onNext={() => session && onNext(session.session_id)}
onNext={onNext}
isApplying={applying}
/>

View File

@@ -0,0 +1,247 @@
'use client'
import { useCallback, useEffect, useState } from 'react'
import type { OrientationResult, SessionInfo } from '@/app/(admin)/ai/ocr-pipeline/types'
import { ImageCompareView } from './ImageCompareView'
const KLAUSUR_API = '/klausur-api'
interface StepOrientationProps {
sessionId?: string | null
onNext: (sessionId: string) => void
}
/**
 * Wizard step "Orientierung": entry point of the OCR pipeline.
 *
 * Without a session it renders the upload area (optional session name plus
 * drag & drop / file picker). After a successful upload it triggers the
 * orientation detection via `POST .../orientation` and shows the original
 * next to the oriented image. When a `sessionId` is passed in (navigating
 * back, or selecting a session elsewhere) the session is loaded from the API.
 *
 * @param sessionId - Existing session to display, if any.
 * @param onNext - Called with the session id when the user continues.
 */
export function StepOrientation({ sessionId: existingSessionId, onNext }: StepOrientationProps) {
  const [session, setSession] = useState<SessionInfo | null>(null)
  const [orientationResult, setOrientationResult] = useState<OrientationResult | null>(null)
  const [uploading, setUploading] = useState(false)
  const [detecting, setDetecting] = useState(false)
  const [error, setError] = useState<string | null>(null)
  const [dragOver, setDragOver] = useState(false)
  const [sessionName, setSessionName] = useState('')

  const loadedSessionId = session?.session_id ?? null

  // (Re)load session data when the requested session is not the one shown.
  // Comparing ids (instead of "any session loaded") keeps the view in sync
  // when the sessionId prop changes while this step stays mounted.
  useEffect(() => {
    if (!existingSessionId || existingSessionId === loadedSessionId) return

    const targetId = existingSessionId
    // Set by the cleanup below so a late response cannot overwrite newer state.
    let cancelled = false

    const loadSession = async () => {
      try {
        const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${targetId}`)
        if (cancelled || !res.ok) return
        const data = await res.json()
        if (cancelled) return

        const sessionInfo: SessionInfo = {
          // Use the requested id so the "already loaded" check above always settles.
          session_id: targetId,
          filename: data.filename,
          image_width: data.image_width,
          image_height: data.image_height,
          original_image_url: `${KLAUSUR_API}${data.original_image_url}`,
        }
        setSession(sessionInfo)
        // Always assign, so a result from a previously shown session is cleared.
        setOrientationResult(data.orientation_result ?? null)
      } catch (e) {
        console.error('Failed to reload session:', e)
      }
    }

    void loadSession()

    return () => {
      cancelled = true
    }
  }, [existingSessionId, loadedSessionId])

  // Upload a file as a new session, then run the orientation detection on it.
  const handleUpload = useCallback(async (file: File) => {
    // Ignore additional drops/selections while a request is in flight;
    // otherwise each one would create its own session.
    if (uploading || detecting) return

    setUploading(true)
    setError(null)
    setOrientationResult(null)

    try {
      const formData = new FormData()
      formData.append('file', file)
      if (sessionName.trim()) {
        formData.append('name', sessionName.trim())
      }

      const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, {
        method: 'POST',
        body: formData,
      })
      if (!res.ok) {
        const err = await res.json().catch(() => ({ detail: res.statusText }))
        throw new Error(err.detail || 'Upload fehlgeschlagen')
      }

      const data: SessionInfo = await res.json()
      // The API returns a relative URL; prefix it with the proxy path.
      data.original_image_url = `${KLAUSUR_API}${data.original_image_url}`
      setSession(data)

      // Auto-trigger orientation detection
      setDetecting(true)
      const orientRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${data.session_id}/orientation`, {
        method: 'POST',
      })
      if (!orientRes.ok) {
        throw new Error('Orientierungserkennung fehlgeschlagen')
      }

      const orientData = await orientRes.json()
      setOrientationResult({
        orientation_degrees: orientData.orientation_degrees,
        corrected: orientData.corrected,
        duration_seconds: orientData.duration_seconds,
      })
    } catch (e) {
      setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
    } finally {
      setUploading(false)
      setDetecting(false)
    }
  }, [sessionName, uploading, detecting])

  const handleDrop = useCallback((e: React.DragEvent) => {
    e.preventDefault()
    setDragOver(false)
    const file = e.dataTransfer.files[0]
    if (file) handleUpload(file)
  }, [handleUpload])

  const handleFileInput = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0]
    // Reset the input so picking the same file again (e.g. after a failed
    // upload) still fires a change event.
    e.target.value = ''
    if (file) handleUpload(file)
  }, [handleUpload])

  // Upload area (no session yet)
  if (!session) {
    return (
      <div className="space-y-4">
        {/* Session name input */}
        <div>
          <label className="block text-sm font-medium text-gray-600 dark:text-gray-400 mb-1">
            Session-Name (optional)
          </label>
          <input
            type="text"
            value={sessionName}
            onChange={(e) => setSessionName(e.target.value)}
            placeholder="z.B. Unit 3 Seite 42"
            className="w-full max-w-sm px-3 py-2 text-sm border rounded-lg dark:bg-gray-800 dark:border-gray-600 dark:text-gray-200 focus:outline-none focus:ring-2 focus:ring-teal-500"
          />
        </div>

        <div
          onDragOver={(e) => { e.preventDefault(); setDragOver(true) }}
          onDragLeave={() => setDragOver(false)}
          onDrop={handleDrop}
          className={`border-2 border-dashed rounded-xl p-12 text-center transition-colors ${
            dragOver
              ? 'border-teal-400 bg-teal-50 dark:bg-teal-900/20'
              : 'border-gray-300 dark:border-gray-600 hover:border-teal-400'
          }`}
        >
          {uploading ? (
            <div className="text-gray-500">
              <div className="animate-spin inline-block w-8 h-8 border-2 border-teal-500 border-t-transparent rounded-full mb-3" />
              <p>Wird hochgeladen...</p>
            </div>
          ) : (
            <>
              <div className="text-4xl mb-3">📄</div>
              <p className="text-gray-600 dark:text-gray-400 mb-2">
                PDF oder Bild hierher ziehen
              </p>
              <p className="text-sm text-gray-400 mb-4">oder</p>
              <label className="inline-block px-4 py-2 bg-teal-600 text-white rounded-lg cursor-pointer hover:bg-teal-700 transition-colors">
                Datei auswaehlen
                <input
                  type="file"
                  accept=".pdf,.png,.jpg,.jpeg,.tiff,.tif"
                  onChange={handleFileInput}
                  className="hidden"
                />
              </label>
            </>
          )}
        </div>

        {error && (
          <div className="p-3 bg-red-50 dark:bg-red-900/20 text-red-600 dark:text-red-400 rounded-lg text-sm">
            {error}
          </div>
        )}
      </div>
    )
  }

  // Session active: show orientation result
  const orientedUrl = orientationResult
    ? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${session.session_id}/image/oriented`
    : null

  return (
    <div className="space-y-4">
      {/* Filename */}
      <div className="text-sm text-gray-500 dark:text-gray-400">
        Datei: <span className="font-medium text-gray-700 dark:text-gray-300">{session.filename}</span>
        {' '}({session.image_width} x {session.image_height} px)
      </div>

      {/* Loading indicator */}
      {detecting && (
        <div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
          <div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
          Orientierung wird erkannt...
        </div>
      )}

      {/* Image comparison */}
      <ImageCompareView
        originalUrl={session.original_image_url}
        deskewedUrl={orientedUrl}
        showGrid={false}
        showBinarized={false}
        binarizedUrl={null}
        leftLabel="Original"
        rightLabel="Orientiert"
      />

      {/* Orientation result badge */}
      {orientationResult && (
        <div className="bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700 p-4">
          <div className="flex items-center gap-3 text-sm">
            {orientationResult.corrected ? (
              <span className="inline-flex items-center gap-1.5 px-3 py-1 rounded-full bg-amber-50 dark:bg-amber-900/20 text-amber-700 dark:text-amber-400 text-xs font-medium">
                🔄 {orientationResult.orientation_degrees}° korrigiert
              </span>
            ) : (
              <span className="inline-flex items-center gap-1.5 px-3 py-1 rounded-full bg-green-50 dark:bg-green-900/20 text-green-700 dark:text-green-400 text-xs font-medium">
                0° (keine Drehung noetig)
              </span>
            )}
            <span className="text-gray-400 text-xs">
              {orientationResult.duration_seconds}s
            </span>
          </div>
        </div>
      )}

      {/* Next button */}
      {orientationResult && (
        <div className="flex justify-end">
          <button
            onClick={() => onNext(session.session_id)}
            className="px-6 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium transition-colors"
          >
            Weiter &rarr;
          </button>
        </div>
      )}

      {error && (
        <div className="p-3 bg-red-50 dark:bg-red-900/20 text-red-600 dark:text-red-400 rounded-lg text-sm">
          {error}
        </div>
      )}
    </div>
  )
}

View File

@@ -1,12 +1,12 @@
# OCR Pipeline - Schrittweise Seitenrekonstruktion
**Version:** 3.0.0
**Status:** Produktiv (Schritte 1–8 implementiert)
**Version:** 4.0.0
**Status:** Produktiv (Schritte 1–10 implementiert)
**URL:** https://macmini:3002/ai/ocr-pipeline
## Uebersicht
Die OCR Pipeline zerlegt den OCR-Prozess in **8 einzelne Schritte**, um eingescannte Seiten
Die OCR Pipeline zerlegt den OCR-Prozess in **10 einzelne Schritte**, um eingescannte Seiten
aus mehrspaltig gedruckten Schulbuechern Wort fuer Wort zu rekonstruieren.
Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten versehen werden.
@@ -16,14 +16,16 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v
| Schritt | Name | Beschreibung | Status |
|---------|------|--------------|--------|
| 1 | Begradigung (Deskew) | Scan begradigen (Hough Lines + Word Alignment) | Implementiert |
| 2 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert |
| 3 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert |
| 4 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert |
| 5 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert |
| 6 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert |
| 7 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert |
| 8 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert |
| 1 | Orientierung | 90/180/270° Drehungen von Scannern korrigieren | Implementiert |
| 2 | Zuschneiden (Crop) | Scannerraender entfernen, Papierformat (A4) erkennen | Implementiert |
| 3 | Begradigung (Deskew) | Scan begradigen (Hough Lines + Word Alignment) | Implementiert |
| 4 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert |
| 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert |
| 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert |
| 7 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert |
| 8 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert |
| 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert |
| 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert |
---
@@ -206,6 +208,7 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
|---------|------|--------------|
| `POST` | `/sessions/{id}/dewarp` | Automatische Entzerrung |
| `POST` | `/sessions/{id}/dewarp/manual` | Manueller Scherungswinkel |
| `POST` | `/sessions/{id}/adjust-combined` | Kombinierte Rotation + Shear Feinabstimmung |
| `POST` | `/sessions/{id}/ground-truth/dewarp` | Ground Truth speichern |
### Schritt 3: Spalten
@@ -274,16 +277,48 @@ Die Dewarp-Erkennung misst die **vertikale Spaltenkippung** (dx/dy) statt Textze
| Ensemble Min-Confidence | 0.35 | Mindest-Konfidenz fuer Korrektur |
| Quality-Gate Skip | < 0.5° | Kleine Korrekturen ueberspringen Quality-Gate |
### Feinabstimmung (Combined Adjust)
Der Endpoint `POST /sessions/{id}/adjust-combined` erlaubt die kombinierte Feinabstimmung von
Rotation und Shear in einem Schritt. Im Frontend stehen **7 Schieberegler** zur Verfuegung:
**Rotation (3 Paesse):**
| Slider | Bereich | Beschreibung |
|--------|---------|--------------|
| P1 Iterative | ±5° | Erster Deskew-Pass (Hough Lines) |
| P2 Word-Alignment | ±3° | Zweiter Pass (Wort-Ausrichtung) |
| P3 Textline | ±3° | Dritter Pass (Textzeilen-Regression) |
Die Summe aller drei ergibt den finalen Rotationswinkel.
**Shear (4 Methoden, Radio-Auswahl):**
| Slider | Bereich | Beschreibung |
|--------|---------|--------------|
| A: Textline Drift | ±5° | Textzeilen-Drift |
| B: Projection Profile | ±5° | 2-Pass Projektionsprofil |
| C: Vertical Edges | ±5° | Vertikalkanten-Analyse |
| D: Ensemble | ±5° | Gewichteter Ensemble-Wert |
Nur der per Radio-Button ausgewaehlte Shear-Wert wird verwendet.
```
POST /sessions/{id}/adjust-combined
Body: {"rotation_degrees": 1.23, "shear_degrees": -0.45}
Response: {"method_used": "manual_combined", "shear_degrees": -0.45, "dewarped_image_url": "..."}
```
---
## Schritt 3: Spaltenerkennung (Detail)
### Algorithmus: `detect_column_geometry()`
Zweistufige Erkennung: vertikale Projektionsprofile finden Luecken, Wort-Bounding-Boxes validieren.
Mehrstufige Erkennung: Seite segmentieren, vertikale Projektionsprofile finden Luecken, Wort-Bounding-Boxes validieren.
```
Bild → Binarisierung → Vertikalprofil → Lueckenerkennung → Wort-Validierung → ColumnGeometry
Bild → Binarisierung → Seiten-Segmentierung → Vertikalprofil → Lueckenerkennung → Wort-Validierung → ColumnGeometry
```
**Wichtige Implementierungsdetails:**
@@ -293,6 +328,54 @@ Bild → Binarisierung → Vertikalprofil → Lueckenerkennung → Wort-Validier
- **Phantom-Spalten-Filter (Step 9):** Spalten mit Breite < 3 % der Content-Breite UND < 3 Woerter werden als Artefakte entfernt; die angrenzenden Spalten schliessen die Luecke.
- **Spaltenzuweisung:** Woerter werden anhand des groessten horizontalen Ueberlappungsbereichs einer Spalte zugeordnet.
### Seiten-Segmentierung an Sub-Headern
Farbige Zwischenueberschriften (z.B. „Unit 4: Bonnie Scotland" mit blauem Hintergrund)
erzeugen nach Binarisierung Tinte ueber die gesamte Seitenbreite. Diese Baender fuellen
Spaltenluecken im vertikalen Projektionsprofil auf und fuehren zu fragmentierten Spalten
(z.B. 11 statt 5).
**Loesung: Horizontale Gap-Segmentierung (Step 2b)**
1. **Horizontales Projektionsprofil** berechnen: Zeilensummen ueber den Content-Bereich
2. **Leere Zeilen** erkennen: Zeilen mit < 2% Tinten-Dichte (`H_GAP_THRESH = 0.02`)
3. **Gaps sammeln**: Zusammenhaengende leere Zeilen zu Gaps buendeln (Mindestlaenge: `max(5, h/200)`)
4. **Grosse Gaps identifizieren**: Gaps > 1.8× Median-Gap-Hoehe = Sub-Header-Trennungen
5. **Segmente bilden**: Seite an grossen Gaps aufteilen
6. **Groesstes Segment waehlen**: Das hoechste Segment wird fuer die vertikale Projektion verwendet
```
┌─────────────────────────────────┐
│ Header / Titel │ ─── grosser Gap ───
├─────────────────────────────────┤
│ EN │ DE │ Example │ Page │ ← Segment 1 (groesster)
│ ... │ ... │ ... │ ... │
├─────────────────────────────────┤
│ Unit 4: Bonnie Scotland │ ─── grosser Gap ───
├─────────────────────────────────┤
│ EN │ DE │ Example │ Page │ ← Segment 2
│ ... │ ... │ ... │ ... │
└─────────────────────────────────┘
```
**Segment-gefilterte Wort-Validierung:**
Die Wort-Validierung (Step 5) nutzt nur Tesseract-Woerter **innerhalb des gewaehlten Segments**.
Woerter aus Sub-Header-Bereichen (die die volle Breite einnehmen) werden so ausgeschlossen
und koennen die Spaltenluecken-Validierung nicht verfaelschen.
### Word-Coverage Gap Detection (Fallback)
Wenn die pixel-basierte Projektion keine ausreichenden Spaltenluecken findet
(z.B. bei Seiten mit Illustrationen, die Spaltenluecken teilweise verdecken),
greift ein Fallback auf Basis der Tesseract-Wort-Bounding-Boxes:
1. X-Achse in 2px-Bins aufteilen
2. Pro Bin zaehlen, wie viele Segment-Woerter ihn ueberdecken
3. Zusammenhaengende Bins mit 0 Woertern = Gap-Kandidaten
4. Nur Gaps im inneren 90%-Bereich beruecksichtigen (Raender ignorieren)
5. Gaps mit Mindestbreite (`max(8px, content_w * 0.5%)`) werden als Spaltenluecken akzeptiert
### Sub-Spalten-Erkennung: `_detect_sub_columns()`
Erkennt versteckte Sub-Spalten innerhalb breiter Spalten (z.B. Seitenzahl-Spalte links neben EN-Vokabeln).
@@ -658,7 +741,7 @@ CREATE TABLE ocr_pipeline_sessions (
| Schraeg gedruckte Seiten | Deskew erkennt Text-Rotation, nicht Seiten-Rotation | Manueller Winkel |
| Sehr kleine Schrift (< 8pt) | Tesseract PSM 7 braucht min. Zeichengroesse | Vorher zoomen |
| Handgeschriebene Eintraege | Tesseract/RapidOCR sind fuer Druckschrift optimiert | TrOCR-Engine |
| Mehr als 4 Spalten | Projektionsprofil kann verschmelzen | Manuelle Spalten |
| Mehr als 5 Spalten | Projektionsprofil kann verschmelzen (Segmentierung hilft) | Manuelle Spalten |
| Farbige Marker (rot/blau) | HSV-Erkennung erzeugt False Positives | Manuell im Rekonstruktions-Editor |
| 15%-Schwelle nicht breit validiert | Nur an einem Arbeitsblatt-Typ getestet | Diverse Schulbuchseiten testen |
@@ -699,6 +782,8 @@ ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/brea
| Datum | Version | Aenderung |
|-------|---------|----------|
| 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung |
| 2026-03-05 | 3.0.1 | Dewarp: Feinabstimmung mit 7 Schiebereglern (3 Rotation + 4 Shear), Combined-Adjust-Endpoint |
| 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade |
| 2026-03-04 | 2.2.0 | Dewarp: Vertikalkanten-Drift statt Textzeilen-Neigung, Schwellenwerte gesenkt |
| 2026-03-04 | 2.1.0 | Sub-Column-Detection, expand_narrow_columns, Fabric.js Editor, PDF/DOCX-Export |

View File

@@ -42,7 +42,8 @@ try:
except ImportError:
trocr_router = None
from vocab_worksheet_api import router as vocab_router, set_db_pool as set_vocab_db_pool, _init_vocab_table, _load_all_sessions, DATABASE_URL as VOCAB_DATABASE_URL
from ocr_pipeline_api import router as ocr_pipeline_router
from ocr_pipeline_api import router as ocr_pipeline_router, _cache as ocr_pipeline_cache
from orientation_crop_api import router as orientation_crop_router, set_cache_ref as set_orientation_crop_cache
from ocr_pipeline_session_store import init_ocr_pipeline_tables
try:
from handwriting_htr_api import router as htr_router
@@ -177,6 +178,8 @@ if trocr_router:
app.include_router(trocr_router) # TrOCR Handwriting OCR
app.include_router(vocab_router) # Vocabulary Worksheet Generator
app.include_router(ocr_pipeline_router) # OCR Pipeline (step-by-step)
set_orientation_crop_cache(ocr_pipeline_cache)
app.include_router(orientation_crop_router) # OCR Pipeline: Orientation + Crop
if htr_router:
app.include_router(htr_router) # Handwriting HTR (Klausur)
if dsfa_rag_router:

View File

@@ -1,15 +1,17 @@
"""
OCR Pipeline API - Schrittweise Seitenrekonstruktion.
Zerlegt den OCR-Prozess in 8 einzelne Schritte:
1. Deskewing - Scan begradigen
2. Dewarping - Buchwoelbung entzerren
3. Spaltenerkennung - Unsichtbare Spalten finden
4. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
5. Worterkennung - OCR mit Bounding Boxes
6. LLM-Korrektur - OCR-Fehler per LLM korrigieren
7. Seitenrekonstruktion - Seite nachbauen
8. Ground Truth Validierung - Gesamtpruefung
Zerlegt den OCR-Prozess in 10 einzelne Schritte:
1. Orientierung - 90/180/270° Drehungen korrigieren (orientation_crop_api.py)
2. Zuschneiden - Scannerraender entfernen (orientation_crop_api.py)
3. Deskewing - Scan begradigen
4. Dewarping - Buchwoelbung entzerren
5. Spaltenerkennung - Unsichtbare Spalten finden
6. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
7. Worterkennung - OCR mit Bounding Boxes
8. LLM-Korrektur - OCR-Fehler per LLM korrigieren
9. Seitenrekonstruktion - Seite nachbauen
10. Ground Truth Validierung - Gesamtpruefung
Lizenz: Apache 2.0
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
@@ -54,7 +56,6 @@ from cv_vocab_pipeline import (
deskew_image_by_word_alignment,
deskew_image_iterative,
deskew_two_pass,
detect_and_fix_orientation,
detect_column_geometry,
detect_document_type,
detect_row_geometry,
@@ -103,6 +104,8 @@ async def _load_session_to_cache(session_id: str) -> Dict[str, Any]:
"id": session_id,
**session,
"original_bgr": None,
"oriented_bgr": None,
"cropped_bgr": None,
"deskewed_bgr": None,
"dewarped_bgr": None,
}
@@ -110,6 +113,8 @@ async def _load_session_to_cache(session_id: str) -> Dict[str, Any]:
# Decode images from DB into BGR numpy arrays
for img_type, bgr_key in [
("original", "original_bgr"),
("oriented", "oriented_bgr"),
("cropped", "cropped_bgr"),
("deskewed", "deskewed_bgr"),
("dewarped", "dewarped_bgr"),
]:
@@ -252,8 +257,12 @@ async def create_session(
"filename": filename,
"name": session_name,
"original_bgr": img_bgr,
"oriented_bgr": None,
"cropped_bgr": None,
"deskewed_bgr": None,
"dewarped_bgr": None,
"orientation_result": None,
"crop_result": None,
"deskew_result": None,
"dewarp_result": None,
"ground_truth": {},
@@ -301,6 +310,10 @@ async def get_session_info(session_id: str):
"doc_type": session.get("doc_type"),
}
if session.get("orientation_result"):
result["orientation_result"] = session["orientation_result"]
if session.get("crop_result"):
result["crop_result"] = session["crop_result"]
if session.get("deskew_result"):
result["deskew_result"] = session["deskew_result"]
if session.get("dewarp_result"):
@@ -427,7 +440,7 @@ async def _append_pipeline_log(
@router.get("/sessions/{session_id}/image/{image_type}")
async def get_image(session_id: str, image_type: str):
"""Serve session images: original, deskewed, dewarped, binarized, columns-overlay, or rows-overlay."""
valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay", "words-overlay", "clean"}
valid_types = {"original", "oriented", "cropped", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay", "words-overlay", "clean"}
if image_type not in valid_types:
raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}")
@@ -470,22 +483,13 @@ async def auto_deskew(session_id: str):
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
img_bgr = cached.get("original_bgr")
# Use cropped image as input (from step 2), fall back to oriented, then original
img_bgr = cached.get("cropped_bgr") or cached.get("oriented_bgr") or cached.get("original_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Original image not available")
raise HTTPException(status_code=400, detail="No image available for deskewing")
t0 = time.time()
# Orientation detection (fix 90/180/270° rotations from scanners)
img_bgr, orientation_deg = detect_and_fix_orientation(img_bgr)
if orientation_deg:
# Update original in cache + DB so all subsequent steps use corrected image
cached["original_bgr"] = img_bgr
success_ori, ori_buf = cv2.imencode(".png", img_bgr)
if success_ori:
await update_session_db(session_id, original_png=ori_buf.tobytes())
logger.info(f"OCR Pipeline: orientation corrected {orientation_deg}° for session {session_id}")
# Two-pass deskew: iterative (±5°) + word-alignment residual check
deskewed_bgr, angle_applied, two_pass_debug = deskew_two_pass(img_bgr.copy())
@@ -534,7 +538,6 @@ async def auto_deskew(session_id: str):
"angle_residual": round(angle_residual, 3),
"angle_textline": round(angle_textline, 3),
"angle_applied": round(angle_applied, 3),
"orientation_degrees": orientation_deg,
"method_used": method_used,
"confidence": round(confidence, 2),
"duration_seconds": round(duration, 2),
@@ -550,7 +553,7 @@ async def auto_deskew(session_id: str):
db_update = {
"deskewed_png": deskewed_png,
"deskew_result": deskew_result,
"current_step": 2,
"current_step": 4,
}
if binarized_png:
db_update["binarized_png"] = binarized_png
@@ -563,7 +566,6 @@ async def auto_deskew(session_id: str):
f"-> {method_used} total={angle_applied:.2f}")
await _append_pipeline_log(session_id, "deskew", {
"orientation": orientation_deg,
"angle_applied": round(angle_applied, 3),
"angle_iterative": round(angle_iterative, 3),
"angle_residual": round(angle_residual, 3),
@@ -582,14 +584,14 @@ async def auto_deskew(session_id: str):
@router.post("/sessions/{session_id}/deskew/manual")
async def manual_deskew(session_id: str, req: ManualDeskewRequest):
"""Apply a manual rotation angle to the original image."""
"""Apply a manual rotation angle to the cropped image."""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
img_bgr = cached.get("original_bgr")
img_bgr = cached.get("cropped_bgr") or cached.get("oriented_bgr") or cached.get("original_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Original image not available")
raise HTTPException(status_code=400, detail="No image available for deskewing")
angle = max(-5.0, min(5.0, req.angle))
@@ -797,7 +799,7 @@ async def auto_dewarp(
dewarped_png=dewarped_png,
dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=3,
current_step=5,
)
logger.info(f"OCR Pipeline: dewarp session {session_id}: "
@@ -1109,7 +1111,7 @@ async def detect_columns(session_id: str):
column_result=column_result,
row_result=None,
word_result=None,
current_step=3,
current_step=5,
)
# Update cache
@@ -1335,7 +1337,7 @@ async def detect_rows(session_id: str):
session_id,
row_result=row_result,
word_result=None,
current_step=4,
current_step=6,
)
cached["row_result"] = row_result
@@ -1601,7 +1603,7 @@ async def detect_words(
await update_session_db(
session_id,
word_result=word_result,
current_step=5,
current_step=7,
)
cached["word_result"] = word_result
@@ -1745,7 +1747,7 @@ async def _word_batch_stream_generator(
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
vocab_entries = entries
await update_session_db(session_id, word_result=word_result, current_step=5)
await update_session_db(session_id, word_result=word_result, current_step=7)
cached["word_result"] = word_result
logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
@@ -1892,7 +1894,7 @@ async def _word_stream_generator(
await update_session_db(
session_id,
word_result=word_result,
current_step=5,
current_step=7,
)
cached["word_result"] = word_result
@@ -2016,7 +2018,7 @@ async def run_llm_review(session_id: str, request: Request, stream: bool = False
"duration_ms": result["duration_ms"],
"entries_corrected": result["entries_corrected"],
}
await update_session_db(session_id, word_result=word_result, current_step=6)
await update_session_db(session_id, word_result=word_result, current_step=8)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2065,7 +2067,7 @@ async def _llm_review_stream_generator(
"duration_ms": event["duration_ms"],
"entries_corrected": event["entries_corrected"],
}
await update_session_db(session_id, word_result=word_result, current_step=6)
await update_session_db(session_id, word_result=word_result, current_step=8)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2153,7 +2155,7 @@ async def save_reconstruction(session_id: str, request: Request):
cell_updates = body.get("cells", [])
if not cell_updates:
await update_session_db(session_id, current_step=7)
await update_session_db(session_id, current_step=9)
return {"session_id": session_id, "updated": 0}
# Build update map: cell_id -> new text
@@ -2189,7 +2191,7 @@ async def save_reconstruction(session_id: str, request: Request):
if "entries" in word_result:
word_result["entries"] = entries
await update_session_db(session_id, word_result=word_result, current_step=7)
await update_session_db(session_id, word_result=word_result, current_step=9)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2572,7 +2574,7 @@ async def save_validation(session_id: str, req: ValidationRequest):
"""Save final validation results for step 8.
Stores notes, score, and preserves any detected/generated image regions.
Sets current_step = 8 to mark pipeline as complete.
Sets current_step = 10 to mark pipeline as complete.
"""
session = await get_session_db(session_id)
if not session:
@@ -2585,7 +2587,7 @@ async def save_validation(session_id: str, req: ValidationRequest):
validation["score"] = req.score
ground_truth["validation"] = validation
await update_session_db(session_id, ground_truth=ground_truth, current_step=8)
await update_session_db(session_id, ground_truth=ground_truth, current_step=10)
if session_id in _cache:
_cache[session_id]["ground_truth"] = ground_truth
@@ -2619,12 +2621,14 @@ async def reprocess_session(session_id: str, request: Request):
Body: {"from_step": 5} (1-indexed step number)
Clears downstream results:
- from_step <= 1: deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 2: dewarp_result, column_result, row_result, word_result
- from_step <= 3: column_result, row_result, word_result
- from_step <= 4: row_result, word_result
- from_step <= 5: word_result (cells, vocab_entries)
- from_step <= 6: word_result.llm_review only
- from_step <= 1: orientation_result, crop_result, deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 2: crop_result, deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 3: deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 4: dewarp_result, column_result, row_result, word_result
- from_step <= 5: column_result, row_result, word_result
- from_step <= 6: row_result, word_result
- from_step <= 7: word_result (cells, vocab_entries)
- from_step <= 8: word_result.llm_review only
"""
session = await get_session_db(session_id)
if not session:
@@ -2632,15 +2636,15 @@ async def reprocess_session(session_id: str, request: Request):
body = await request.json()
from_step = body.get("from_step", 1)
if not isinstance(from_step, int) or from_step < 1 or from_step > 7:
raise HTTPException(status_code=400, detail="from_step must be between 1 and 7")
if not isinstance(from_step, int) or from_step < 1 or from_step > 9:
raise HTTPException(status_code=400, detail="from_step must be between 1 and 9")
update_kwargs: Dict[str, Any] = {"current_step": from_step}
# Clear downstream data based on from_step
if from_step <= 5:
if from_step <= 7:
update_kwargs["word_result"] = None
elif from_step == 6:
elif from_step == 8:
# Only clear LLM review from word_result
word_result = session.get("word_result")
if word_result:
@@ -2648,14 +2652,18 @@ async def reprocess_session(session_id: str, request: Request):
word_result.pop("llm_corrections", None)
update_kwargs["word_result"] = word_result
if from_step <= 4:
if from_step <= 6:
update_kwargs["row_result"] = None
if from_step <= 3:
if from_step <= 5:
update_kwargs["column_result"] = None
if from_step <= 2:
if from_step <= 4:
update_kwargs["dewarp_result"] = None
if from_step <= 1:
if from_step <= 3:
update_kwargs["deskew_result"] = None
if from_step <= 2:
update_kwargs["crop_result"] = None
if from_step <= 1:
update_kwargs["orientation_result"] = None
await update_session_db(session_id, **update_kwargs)
@@ -3074,7 +3082,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
deskewed_png=deskewed_png,
deskew_result=deskew_result,
auto_rotation_degrees=float(angle_applied),
current_step=2,
current_step=4,
)
session = await get_session_db(session_id)
@@ -3137,7 +3145,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
dewarped_png=dewarped_png,
dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=3,
current_step=5,
)
session = await get_session_db(session_id)
@@ -3196,7 +3204,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
cached["column_result"] = column_result
await update_session_db(session_id, column_result=column_result,
row_result=None, word_result=None, current_step=4)
row_result=None, word_result=None, current_step=6)
session = await get_session_db(session_id)
steps_run.append("columns")
@@ -3273,7 +3281,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
}
cached["row_result"] = row_result
await update_session_db(session_id, row_result=row_result, current_step=5)
await update_session_db(session_id, row_result=row_result, current_step=7)
session = await get_session_db(session_id)
steps_run.append("rows")
@@ -3381,7 +3389,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
word_result_data["entry_count"] = len(entries)
word_result_data["summary"]["total_entries"] = len(entries)
await update_session_db(session_id, word_result=word_result_data, current_step=6)
await update_session_db(session_id, word_result=word_result_data, current_step=8)
cached["word_result"] = word_result_data
session = await get_session_db(session_id)
@@ -3426,7 +3434,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
word_result_updated["llm_reviewed"] = True
word_result_updated["llm_model"] = OLLAMA_REVIEW_MODEL
await update_session_db(session_id, word_result=word_result_updated, current_step=7)
await update_session_db(session_id, word_result=word_result_updated, current_step=9)
cached["word_result"] = word_result_updated
steps_run.append("llm_review")

View File

@@ -68,7 +68,11 @@ async def init_ocr_pipeline_tables():
ADD COLUMN IF NOT EXISTS doc_type VARCHAR(50),
ADD COLUMN IF NOT EXISTS doc_type_result JSONB,
ADD COLUMN IF NOT EXISTS document_category VARCHAR(50),
ADD COLUMN IF NOT EXISTS pipeline_log JSONB
ADD COLUMN IF NOT EXISTS pipeline_log JSONB,
ADD COLUMN IF NOT EXISTS oriented_png BYTEA,
ADD COLUMN IF NOT EXISTS cropped_png BYTEA,
ADD COLUMN IF NOT EXISTS orientation_result JSONB,
ADD COLUMN IF NOT EXISTS crop_result JSONB
""")
@@ -90,6 +94,7 @@ async def create_session_db(
id, name, filename, original_png, status, current_step
) VALUES ($1, $2, $3, $4, 'active', 1)
RETURNING id, name, filename, status, current_step,
orientation_result, crop_result,
deskew_result, dewarp_result, column_result, row_result,
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
@@ -106,6 +111,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
async with pool.acquire() as conn:
row = await conn.fetchrow("""
SELECT id, name, filename, status, current_step,
orientation_result, crop_result,
deskew_result, dewarp_result, column_result, row_result,
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
@@ -123,6 +129,8 @@ async def get_session_image(session_id: str, image_type: str) -> Optional[bytes]
"""Load a single image (BYTEA) from the session."""
column_map = {
"original": "original_png",
"oriented": "oriented_png",
"cropped": "cropped_png",
"deskewed": "deskewed_png",
"binarized": "binarized_png",
"dewarped": "dewarped_png",
@@ -150,15 +158,17 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
allowed_fields = {
'name', 'filename', 'status', 'current_step',
'original_png', 'deskewed_png', 'binarized_png', 'dewarped_png',
'original_png', 'oriented_png', 'cropped_png',
'deskewed_png', 'binarized_png', 'dewarped_png',
'clean_png', 'handwriting_removal_meta',
'orientation_result', 'crop_result',
'deskew_result', 'dewarp_result', 'column_result', 'row_result',
'word_result', 'ground_truth', 'auto_shear_degrees',
'doc_type', 'doc_type_result',
'document_category', 'pipeline_log',
}
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log'}
jsonb_fields = {'orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log'}
for key, value in kwargs.items():
if key in allowed_fields:
@@ -182,6 +192,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
SET {', '.join(fields)}
WHERE id = ${param_idx}
RETURNING id, name, filename, status, current_step,
orientation_result, crop_result,
deskew_result, dewarp_result, column_result, row_result,
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
@@ -254,7 +265,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
result[key] = result[key].isoformat()
# JSONB → parsed (asyncpg returns str for JSONB)
for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log']:
for key in ['orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log']:
if key in result and result[key] is not None:
if isinstance(result[key], str):
result[key] = json.loads(result[key])

View File

@@ -0,0 +1,330 @@
"""
Orientation & Crop API - Steps 1-2 of the OCR Pipeline.
Step 1: Orientation detection (fix 90/180/270 degree rotations)
Step 2: Page cropping (remove scanner borders, detect paper format)
These endpoints were extracted from the main pipeline to keep files manageable.
"""
import logging
import time
from typing import Any, Dict, Optional
import cv2
import numpy as np
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from cv_vocab_pipeline import detect_and_fix_orientation
from page_crop import detect_and_crop_page
from ocr_pipeline_session_store import (
get_session_db,
get_session_image,
update_session_db,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
# Reference to the shared cache from ocr_pipeline_api (set in main.py)
_cache: Dict[str, Dict[str, Any]] = {}
def set_cache_ref(cache: Dict[str, Dict[str, Any]]) -> None:
    """Point this module at an externally owned session cache.

    The module-level ``_cache`` name is rebound to the very dict object that
    is passed in (no copy is made), so this module reads and writes the same
    entries as the owner of that dict.

    Args:
        cache: Mapping of session id to the cached session entry.
    """
    global _cache
    _cache = cache
async def _ensure_cached(session_id: str) -> Dict[str, Any]:
    """Return the cache entry for a session, building it from the DB on a miss.

    On a cache miss the session row is fetched, merged into a new entry and
    the stored PNGs of every image stage are decoded into BGR arrays under
    ``<stage>_bgr``. Stages without stored image data stay ``None``.

    Args:
        session_id: Id of the pipeline session.

    Returns:
        The (possibly freshly created) cache entry for the session.

    Raises:
        HTTPException: 404 when the session is not found in the database.
    """
    try:
        return _cache[session_id]
    except KeyError:
        pass

    db_session = await get_session_db(session_id)
    if not db_session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    stages = ("original", "oriented", "cropped", "deskewed", "dewarped")

    # Session columns first, then the image slots, so the slots always win.
    entry: Dict[str, Any] = {"id": session_id, **db_session}
    for stage in stages:
        entry[f"{stage}_bgr"] = None

    # Decode whatever images are stored for this session.
    for stage in stages:
        png_bytes = await get_session_image(session_id, stage)
        if not png_bytes:
            continue
        raw = np.frombuffer(png_bytes, dtype=np.uint8)
        entry[f"{stage}_bgr"] = cv2.imdecode(raw, cv2.IMREAD_COLOR)

    _cache[session_id] = entry
    return entry
async def _append_pipeline_log(session_id: str, step: str, metrics: dict, duration_ms: int):
    """Record one completed step in the session's pipeline log.

    Loads the session, appends an entry to ``pipeline_log["steps"]`` (starting
    a new log when none is stored yet) and writes the log back. Does nothing
    when the session cannot be loaded.

    Args:
        session_id: Id of the pipeline session.
        step: Name of the step that finished.
        metrics: Step-specific metrics stored alongside the entry.
        duration_ms: Duration of the step in milliseconds.
    """
    from datetime import datetime

    session = await get_session_db(session_id)
    if session:
        log = session.get("pipeline_log") or {"steps": []}
        entry = {
            "step": step,
            "completed_at": datetime.utcnow().isoformat(),
            "success": True,
            "duration_ms": duration_ms,
            "metrics": metrics,
        }
        log["steps"].append(entry)
        await update_session_db(session_id, pipeline_log=log)
# ---------------------------------------------------------------------------
# Step 1: Orientation
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/orientation")
async def detect_orientation(session_id: str):
    """Run orientation detection on the original image of a session.

    The original image is copied and handed to ``detect_and_fix_orientation``;
    the returned image is cached as ``oriented_bgr``, stored as
    ``oriented_png`` and the session is advanced to step 2.

    Raises:
        HTTPException: 400 when the session has no original image.
    """
    cached = await _ensure_cached(session_id)

    source_bgr = cached.get("original_bgr")
    if source_bgr is None:
        raise HTTPException(status_code=400, detail="Original image not available")

    started = time.time()
    oriented_bgr, orientation_deg = detect_and_fix_orientation(source_bgr.copy())
    elapsed = time.time() - started

    was_corrected = orientation_deg != 0
    orientation_result = {
        "orientation_degrees": orientation_deg,
        "corrected": was_corrected,
        "duration_seconds": round(elapsed, 2),
    }

    # PNG-encode the result; an encoding failure is stored as empty bytes.
    ok, encoded = cv2.imencode(".png", oriented_bgr)
    if ok:
        oriented_png = encoded.tobytes()
    else:
        oriented_png = b""

    # Cache first, then persist.
    cached["oriented_bgr"] = oriented_bgr
    cached["orientation_result"] = orientation_result
    await update_session_db(
        session_id,
        oriented_png=oriented_png,
        orientation_result=orientation_result,
        current_step=2,
    )

    logger.info(
        "OCR Pipeline: orientation session %s: %d° (%s) in %.2fs",
        session_id, orientation_deg,
        "corrected" if orientation_deg else "no change",
        elapsed,
    )

    log_metrics = {
        "orientation_degrees": orientation_deg,
        "corrected": was_corrected,
    }
    await _append_pipeline_log(
        session_id, "orientation", log_metrics, duration_ms=int(elapsed * 1000)
    )

    height, width = oriented_bgr.shape[:2]
    response: Dict[str, Any] = {"session_id": session_id}
    response.update(orientation_result)
    response["image_width"] = width
    response["image_height"] = height
    response["oriented_image_url"] = f"/api/v1/ocr-pipeline/sessions/{session_id}/image/oriented"
    return response
# ---------------------------------------------------------------------------
# Step 2: Crop
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/crop")
async def auto_crop(session_id: str):
    """Auto-detect and crop scanner borders.

    Reads the oriented image (or the original if no oriented image exists),
    runs ``detect_and_crop_page`` on it, caches the result as ``cropped_bgr``,
    stores it as ``cropped_png`` and advances the session to step 3.

    Raises:
        HTTPException: 400 when neither an oriented nor an original image
            is available.
    """
    cached = await _ensure_cached(session_id)

    # Use the oriented image if available, else the original. The fallback
    # must be an explicit ``is None`` test: ``a or b`` would truth-test the
    # NumPy image and raise "truth value of an array ... is ambiguous".
    img_bgr = cached.get("oriented_bgr")
    if img_bgr is None:
        img_bgr = cached.get("original_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="No image available for cropping")

    t0 = time.time()
    cropped_bgr, crop_info = detect_and_crop_page(img_bgr)
    duration = time.time() - t0
    crop_info["duration_seconds"] = round(duration, 2)

    # Encode cropped image (an encoding failure is stored as empty bytes)
    success, png_buf = cv2.imencode(".png", cropped_bgr)
    cropped_png = png_buf.tobytes() if success else b""

    # Update cache
    cached["cropped_bgr"] = cropped_bgr
    cached["crop_result"] = crop_info

    # Persist to DB
    await update_session_db(
        session_id,
        cropped_png=cropped_png,
        crop_result=crop_info,
        current_step=3,
    )

    logger.info(
        "OCR Pipeline: crop session %s: applied=%s format=%s in %.2fs",
        session_id, crop_info["crop_applied"],
        crop_info.get("detected_format", "?"),
        duration,
    )

    await _append_pipeline_log(session_id, "crop", {
        "crop_applied": crop_info["crop_applied"],
        "detected_format": crop_info.get("detected_format"),
        "format_confidence": crop_info.get("format_confidence"),
    }, duration_ms=int(duration * 1000))

    h, w = cropped_bgr.shape[:2]
    return {
        "session_id": session_id,
        **crop_info,
        "image_width": w,
        "image_height": h,
        "cropped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/cropped",
    }
class ManualCropRequest(BaseModel):
    """Request body for a manual crop; every field is a percentage (0-100)."""

    # Left edge of the crop rectangle, percentage 0-100
    x: float
    # Top edge of the crop rectangle, percentage 0-100
    y: float
    # Width of the crop rectangle, percentage 0-100
    width: float
    # Height of the crop rectangle, percentage 0-100
    height: float
@router.post("/sessions/{session_id}/crop/manual")
async def manual_crop(session_id: str, req: ManualCropRequest):
    """Manually crop using percentage coordinates.

    The rectangle in ``req`` is given in percent of the oriented image (or of
    the original if no oriented image exists). It is converted to pixels,
    clamped to the image bounds, cut out, cached as ``cropped_bgr``, stored as
    ``cropped_png`` and the session is advanced to step 3.

    Raises:
        HTTPException: 400 when neither an oriented nor an original image
            is available.
    """
    cached = await _ensure_cached(session_id)

    # Explicit ``is None`` fallback: ``a or b`` would truth-test the NumPy
    # image and raise "truth value of an array ... is ambiguous".
    img_bgr = cached.get("oriented_bgr")
    if img_bgr is None:
        img_bgr = cached.get("original_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="No image available for cropping")

    h, w = img_bgr.shape[:2]

    # Convert percentages to pixels
    px_x = int(w * req.x / 100.0)
    px_y = int(h * req.y / 100.0)
    px_w = int(w * req.width / 100.0)
    px_h = int(h * req.height / 100.0)

    # Clamp: origin inside the image, size at least 1px and within bounds
    px_x = max(0, min(px_x, w - 1))
    px_y = max(0, min(px_y, h - 1))
    px_w = max(1, min(px_w, w - px_x))
    px_h = max(1, min(px_h, h - px_y))

    # Copy so the cropped array does not share memory with the source image
    cropped_bgr = img_bgr[px_y:px_y + px_h, px_x:px_x + px_w].copy()

    # An encoding failure is stored as empty bytes
    success, png_buf = cv2.imencode(".png", cropped_bgr)
    cropped_png = png_buf.tobytes() if success else b""

    crop_result = {
        "crop_applied": True,
        "crop_rect": {"x": px_x, "y": px_y, "width": px_w, "height": px_h},
        "crop_rect_pct": {"x": round(req.x, 2), "y": round(req.y, 2),
                          "width": round(req.width, 2), "height": round(req.height, 2)},
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": px_w, "height": px_h},
        "method": "manual",
    }

    cached["cropped_bgr"] = cropped_bgr
    cached["crop_result"] = crop_result

    await update_session_db(
        session_id,
        cropped_png=cropped_png,
        crop_result=crop_result,
        current_step=3,
    )

    ch, cw = cropped_bgr.shape[:2]
    return {
        "session_id": session_id,
        **crop_result,
        "image_width": cw,
        "image_height": ch,
        "cropped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/cropped",
    }
@router.post("/sessions/{session_id}/crop/skip")
async def skip_crop(session_id: str):
    """Skip cropping — use oriented (or original) image as-is.

    The uncropped image is stored as the session's cropped image so that
    later steps can read it from the same place, and ``current_step=3`` is
    recorded.

    Returns:
        Dict with the session id, the crop result fields (``crop_applied``
        False, ``skipped`` True), the image size and the URL of the cropped
        image.

    Raises:
        HTTPException: 400 when the cache holds neither an oriented nor an
            original image.
    """
    cached = await _ensure_cached(session_id)

    # Prefer the oriented image. This has to be an explicit ``is None`` test:
    # ``a or b`` evaluates bool(a), which raises ValueError for a NumPy array
    # with more than one element.
    img_bgr = cached.get("oriented_bgr")
    if img_bgr is None:
        img_bgr = cached.get("original_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="No image available")

    h, w = img_bgr.shape[:2]

    # Store the oriented image as cropped (identity crop); empty bytes are
    # stored if PNG encoding fails
    success, png_buf = cv2.imencode(".png", img_bgr)
    cropped_png = png_buf.tobytes() if success else b""

    crop_result = {
        "crop_applied": False,
        "skipped": True,
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": w, "height": h},
    }

    # The cache entry is the same array object as the source image, not a copy
    cached["cropped_bgr"] = img_bgr
    cached["crop_result"] = crop_result

    await update_session_db(
        session_id,
        cropped_png=cropped_png,
        crop_result=crop_result,
        current_step=3,
    )

    return {
        "session_id": session_id,
        **crop_result,
        "image_width": w,
        "image_height": h,
        "cropped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/cropped",
    }

View File

@@ -0,0 +1,187 @@
"""
Page Crop - Automatic scanner border removal and page format detection.
Detects the paper boundary in a scanned image and crops away scanner borders.
Also identifies the paper format (A4, Letter, etc.) from the aspect ratio.
License: Apache 2.0
"""
import logging
from typing import Dict, Any, Tuple
import cv2
import numpy as np
logger = logging.getLogger(__name__)
# Portrait aspect ratio (long edge / short edge) of every known paper format.
# Edge lengths are in millimetres for the ISO sizes and in inches for the US
# sizes. A4, A5 and A3 differ only in the third decimal place
# (1.4143 / 1.4189 / 1.4141); Letter is 1.2941 and Legal is 1.6471.
PAPER_FORMATS: Dict[str, float] = {
    name: long_edge / short_edge
    for name, long_edge, short_edge in (
        ("A4", 297.0, 210.0),
        ("A5", 210.0, 148.0),
        ("Letter", 11.0, 8.5),
        ("Legal", 14.0, 8.5),
        ("A3", 420.0, 297.0),
    )
}
def detect_and_crop_page(
    img_bgr: np.ndarray,
    min_border_fraction: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Detect page boundary and crop scanner borders.

    Algorithm:
        1. Grayscale + 21x21 GaussianBlur to smooth out text
        2. Otsu threshold (page=bright, scanner border=dark)
        3. Morphological close (50x50 rectangle) to fill gaps
        4. Find external contours; the one with the largest area is the page
        5. If that contour covers >95% of the image area -> no crop
        6. Get its bounding rect and compute the four border fractions
        7. If every border is below ``min_border_fraction`` -> no crop
        8. Add a safety margin of 0.5% of the image size, clipped to the image
        9. If the padded rect covers <50% of the image area -> no crop
        10. Crop and match the aspect ratio to known paper formats

    Args:
        img_bgr: Input BGR image
        min_border_fraction: Minimum border fraction to trigger crop (default 1%)

    Returns:
        Tuple of (image, result_dict). ``image`` is a copy of the cropped
        region when a crop is applied, otherwise the input array itself.
        ``result_dict`` always carries the keys ``crop_applied``,
        ``crop_rect``, ``crop_rect_pct``, ``original_size``, ``cropped_size``,
        ``detected_format``, ``format_confidence``, ``aspect_ratio`` and
        ``border_fractions``.
    """
    h, w = img_bgr.shape[:2]
    total_area = h * w

    result: Dict[str, Any] = {
        "crop_applied": False,
        "crop_rect": None,
        "crop_rect_pct": None,
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": w, "height": h},
        "detected_format": None,
        "format_confidence": 0.0,
        "aspect_ratio": round(max(h, w) / max(min(h, w), 1), 4),
        "border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
    }

    # 1. Grayscale + blur
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (21, 21), 0)

    # 2. Otsu threshold
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # 3. Morphological close to fill text gaps
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 50))
    closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    # 4. Find contours
    contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        # No format detection on this path: detected_format stays None
        logger.info("No contours found - returning original image")
        return img_bgr, result

    # Get the largest contour
    largest = max(contours, key=cv2.contourArea)
    contour_area = cv2.contourArea(largest)

    # 5. If contour covers >95% of image, no crop needed
    if contour_area > 0.95 * total_area:
        # Plain "%" here: logging applies no %-formatting when a call has no
        # args, so an escaped "%%" would be written to the log verbatim.
        logger.info("Page covers >95% of image - no crop needed")
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    # 6. Get bounding rect
    rx, ry, rw, rh = cv2.boundingRect(largest)

    # Calculate border fractions (share of the image outside the rect per side)
    border_top = ry / h
    border_bottom = (h - (ry + rh)) / h
    border_left = rx / w
    border_right = (w - (rx + rw)) / w

    result["border_fractions"] = {
        "top": round(border_top, 4),
        "bottom": round(border_bottom, 4),
        "left": round(border_left, 4),
        "right": round(border_right, 4),
    }

    # 7. Check if borders are significant enough to crop
    if all(f < min_border_fraction for f in [border_top, border_bottom, border_left, border_right]):
        logger.info("All borders < %.1f%% - no crop needed", min_border_fraction * 100)
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    # 8. Add safety margin (0.5% of image dimensions), clipped to the image
    margin_x = int(w * 0.005)
    margin_y = int(h * 0.005)

    crop_x = max(0, rx - margin_x)
    crop_y = max(0, ry - margin_y)
    crop_x2 = min(w, rx + rw + margin_x)
    crop_y2 = min(h, ry + rh + margin_y)

    crop_w = crop_x2 - crop_x
    crop_h = crop_y2 - crop_y

    # 9. Sanity check: cropped area should be at least 50% of original
    if crop_w * crop_h < 0.5 * total_area:
        logger.warning("Cropped area too small (%.0f%%) - skipping crop",
                       100.0 * crop_w * crop_h / total_area)
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    # 10. Crop (copy, so the result does not share memory with the input)
    cropped = img_bgr[crop_y:crop_y2, crop_x:crop_x2].copy()

    # Detect format from cropped dimensions
    detected_format, format_confidence = _detect_format(crop_w, crop_h)

    result["crop_applied"] = True
    result["crop_rect"] = {"x": crop_x, "y": crop_y, "width": crop_w, "height": crop_h}
    result["crop_rect_pct"] = {
        "x": round(100.0 * crop_x / w, 2),
        "y": round(100.0 * crop_y / h, 2),
        "width": round(100.0 * crop_w / w, 2),
        "height": round(100.0 * crop_h / h, 2),
    }
    result["cropped_size"] = {"width": crop_w, "height": crop_h}
    result["detected_format"] = detected_format
    result["format_confidence"] = format_confidence
    result["aspect_ratio"] = round(max(crop_w, crop_h) / max(min(crop_w, crop_h), 1), 4)

    logger.info("Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
                w, h, crop_w, crop_h, detected_format, format_confidence * 100,
                border_top * 100, border_bottom * 100, border_left * 100, border_right * 100)

    return cropped, result
def _detect_format(width: int, height: int) -> Tuple[str, float]:
    """Pick the known paper format whose aspect ratio is closest to the given size.

    Orientation does not matter: the ratio is always longer side / shorter side.

    Returns:
        (format_name, confidence) with confidence in 0.0-1.0. Non-positive
        dimensions and matches with confidence below 0.3 yield
        ("unknown", 0.0).
    """
    if height <= 0 or width <= 0:
        return "unknown", 0.0

    long_side, short_side = max(width, height), min(width, height)
    ratio = long_side / short_side

    # min() keeps the first entry on ties, so the earliest format in
    # PAPER_FORMATS wins; the default covers an empty table.
    closest, deviation = min(
        ((name, abs(ratio - expected)) for name, expected in PAPER_FORMATS.items()),
        key=lambda candidate: candidate[1],
        default=("unknown", float("inf")),
    )

    # Linear fall-off: 1.0 at an exact match, 0.5 at a deviation of 0.1,
    # clamped at 0.0; anything below 0.3 is reported as no match.
    confidence = max(0.0, 1.0 - deviation * 5.0)
    if confidence < 0.3:
        return "unknown", 0.0
    return closest, round(confidence, 3)