From 27636317117e29a7298283300ef7dd0baddb4cec Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 8 Mar 2026 23:55:23 +0100 Subject: [PATCH] feat: Orientierung + Zuschneiden als Schritte 1-2 in OCR-Pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zwei neue Wizard-Schritte vor Begradigung: - Step 1: Orientierungserkennung (0/90/180/270° via Tesseract OSD) - Step 2: Seitenrand-Erkennung und Zuschnitt (Scannerraender entfernen) Backend: - orientation_crop_api.py: POST /orientation, POST /crop, POST /crop/skip - page_crop.py: detect_and_crop_page() mit Format-Erkennung (A4/A5/Letter) - Session-Store: orientation_result, crop_result Felder - Pipeline nutzt zugeschnittenes Bild fuer Deskew/Dewarp Frontend: - StepOrientation.tsx: Upload + Auto-Orientierung + Vorher/Nachher - StepCrop.tsx: Auto-Crop + Format-Badge + Ueberspringen-Option - Pipeline-Stepper: 10 Schritte (war 8) Co-Authored-By: Claude Opus 4.6 --- .../app/(admin)/ai/ocr-pipeline/page.tsx | 40 ++- .../app/(admin)/ai/ocr-pipeline/types.ts | 25 +- .../ocr-pipeline/DeskewControls.tsx | 5 - .../components/ocr-pipeline/StepCrop.tsx | 185 ++++++++++ .../components/ocr-pipeline/StepDeskew.tsx | 220 ++++-------- .../ocr-pipeline/StepOrientation.tsx | 247 +++++++++++++ .../services/klausur-service/OCR-Pipeline.md | 113 +++++- klausur-service/backend/main.py | 5 +- klausur-service/backend/ocr_pipeline_api.py | 130 +++---- .../backend/ocr_pipeline_session_store.py | 19 +- .../backend/orientation_crop_api.py | 330 ++++++++++++++++++ klausur-service/backend/page_crop.py | 187 ++++++++++ 12 files changed, 1247 insertions(+), 259 deletions(-) create mode 100644 admin-lehrer/components/ocr-pipeline/StepCrop.tsx create mode 100644 admin-lehrer/components/ocr-pipeline/StepOrientation.tsx create mode 100644 klausur-service/backend/orientation_crop_api.py create mode 100644 klausur-service/backend/page_crop.py diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx 
b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx index e21fab7..7f851bf 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx @@ -3,6 +3,8 @@ import { useCallback, useEffect, useState } from 'react' import { PagePurpose } from '@/components/common/PagePurpose' import { PipelineStepper } from '@/components/ocr-pipeline/PipelineStepper' +import { StepOrientation } from '@/components/ocr-pipeline/StepOrientation' +import { StepCrop } from '@/components/ocr-pipeline/StepCrop' import { StepDeskew } from '@/components/ocr-pipeline/StepDeskew' import { StepDewarp } from '@/components/ocr-pipeline/StepDewarp' import { StepColumnDetection } from '@/components/ocr-pipeline/StepColumnDetection' @@ -196,7 +198,7 @@ export default function OcrPipelinePage() { setCurrentStep(nextStep) } - const handleDeskewComplete = (sid: string) => { + const handleOrientationComplete = (sid: string) => { setSessionId(sid) // Reload session list to show the new session loadSessions() @@ -270,14 +272,16 @@ export default function OcrPipelinePage() { } const stepNames: Record = { - 1: 'Begradigung', - 2: 'Entzerrung', - 3: 'Spalten', - 4: 'Zeilen', - 5: 'Woerter', - 6: 'Korrektur', - 7: 'Rekonstruktion', - 8: 'Validierung', + 1: 'Orientierung', + 2: 'Zuschneiden', + 3: 'Begradigung', + 4: 'Entzerrung', + 5: 'Spalten', + 6: 'Zeilen', + 7: 'Woerter', + 8: 'Korrektur', + 9: 'Rekonstruktion', + 10: 'Validierung', } const reprocessFromStep = useCallback(async (uiStep: number) => { @@ -306,20 +310,24 @@ export default function OcrPipelinePage() { const renderStep = () => { switch (currentStep) { case 0: - return + return case 1: - return + return case 2: - return + return case 3: - return + return case 4: - return + return case 5: - return + return case 6: - return + return case 7: + return + case 8: + return + case 9: return default: return null diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts 
b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts index d888cf1..138126e 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts @@ -57,6 +57,26 @@ export interface DocumentTypeResult { duration_seconds?: number } +export interface OrientationResult { + orientation_degrees: number + corrected: boolean + duration_seconds: number +} + +export interface CropResult { + crop_applied: boolean + crop_rect?: { x: number; y: number; width: number; height: number } + crop_rect_pct?: { x: number; y: number; width: number; height: number } + original_size: { width: number; height: number } + cropped_size: { width: number; height: number } + detected_format?: string + format_confidence?: number + aspect_ratio?: number + border_fractions?: { top: number; bottom: number; left: number; right: number } + skipped?: boolean + duration_seconds?: number +} + export interface SessionInfo { session_id: string filename: string @@ -67,6 +87,8 @@ export interface SessionInfo { current_step?: number document_category?: DocumentCategory doc_type?: string + orientation_result?: OrientationResult + crop_result?: CropResult deskew_result?: DeskewResult dewarp_result?: DewarpResult column_result?: ColumnResult @@ -85,7 +107,6 @@ export interface DeskewResult { angle_applied: number method_used: 'hough' | 'word_alignment' | 'manual' | 'iterative' | 'two_pass' | 'three_pass' | 'manual_combined' confidence: number - orientation_degrees?: number duration_seconds: number deskewed_image_url: string binarized_image_url: string @@ -288,6 +309,8 @@ export const IMAGE_STYLES: { value: ImageStyle; label: string }[] = [ ] export const PIPELINE_STEPS: PipelineStep[] = [ + { id: 'orientation', name: 'Orientierung', icon: 'πŸ”„', status: 'pending' }, + { id: 'crop', name: 'Zuschneiden', icon: 'βœ‚οΈ', status: 'pending' }, { id: 'deskew', name: 'Begradigung', icon: 'πŸ“', status: 'pending' }, { id: 'dewarp', name: 'Entzerrung', icon: 'πŸ”§', 
status: 'pending' }, { id: 'columns', name: 'Spalten', icon: 'πŸ“Š', status: 'pending' }, diff --git a/admin-lehrer/components/ocr-pipeline/DeskewControls.tsx b/admin-lehrer/components/ocr-pipeline/DeskewControls.tsx index c0f493c..c696ac5 100644 --- a/admin-lehrer/components/ocr-pipeline/DeskewControls.tsx +++ b/admin-lehrer/components/ocr-pipeline/DeskewControls.tsx @@ -59,11 +59,6 @@ export function DeskewControls({ {/* Results */} {deskewResult && (
- {deskewResult.orientation_degrees ? ( -
- Seite wurde um {deskewResult.orientation_degrees}Β° gedreht (Orientierungskorrektur) -
- ) : null}
Winkel:{' '} diff --git a/admin-lehrer/components/ocr-pipeline/StepCrop.tsx b/admin-lehrer/components/ocr-pipeline/StepCrop.tsx new file mode 100644 index 0000000..7d65322 --- /dev/null +++ b/admin-lehrer/components/ocr-pipeline/StepCrop.tsx @@ -0,0 +1,185 @@ +'use client' + +import { useEffect, useState } from 'react' +import type { CropResult } from '@/app/(admin)/ai/ocr-pipeline/types' +import { ImageCompareView } from './ImageCompareView' + +const KLAUSUR_API = '/klausur-api' + +interface StepCropProps { + sessionId: string | null + onNext: () => void +} + +export function StepCrop({ sessionId, onNext }: StepCropProps) { + const [cropResult, setCropResult] = useState(null) + const [cropping, setCropping] = useState(false) + const [error, setError] = useState(null) + const [hasRun, setHasRun] = useState(false) + + // Auto-trigger crop on mount + useEffect(() => { + if (!sessionId || hasRun) return + setHasRun(true) + + const runCrop = async () => { + setCropping(true) + setError(null) + + try { + // Check if session already has crop result + const sessionRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`) + if (sessionRes.ok) { + const sessionData = await sessionRes.json() + if (sessionData.crop_result) { + setCropResult(sessionData.crop_result) + setCropping(false) + return + } + } + + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/crop`, { + method: 'POST', + }) + + if (!res.ok) { + throw new Error('Zuschnitt fehlgeschlagen') + } + + const data = await res.json() + setCropResult(data) + } catch (e) { + setError(e instanceof Error ? 
e.message : 'Unbekannter Fehler') + } finally { + setCropping(false) + } + } + + runCrop() + }, [sessionId, hasRun]) + + const handleSkip = async () => { + if (!sessionId) return + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/crop/skip`, { + method: 'POST', + }) + if (res.ok) { + const data = await res.json() + setCropResult(data) + } + } catch (e) { + console.error('Skip crop failed:', e) + } + onNext() + } + + if (!sessionId) { + return
Keine Session ausgewaehlt.
+ } + + const orientedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented` + const croppedUrl = cropResult + ? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped` + : null + + return ( +
+ {/* Loading indicator */} + {cropping && ( +
+
+ Scannerraender werden erkannt... +
+ )} + + {/* Image comparison */} + + + {/* Crop result info */} + {cropResult && ( +
+
+ {cropResult.crop_applied ? ( + <> + + βœ‚οΈ Zugeschnitten + + {cropResult.detected_format && ( + <> +
+ + Format: {cropResult.detected_format} + {cropResult.format_confidence != null && ( + + ({Math.round(cropResult.format_confidence * 100)}%) + + )} + + + )} +
+ + {cropResult.original_size.width}x{cropResult.original_size.height} β†’ {cropResult.cropped_size.width}x{cropResult.cropped_size.height} + + {cropResult.border_fractions && ( + <> +
+ + Raender: O={pct(cropResult.border_fractions.top)} U={pct(cropResult.border_fractions.bottom)} L={pct(cropResult.border_fractions.left)} R={pct(cropResult.border_fractions.right)} + + + )} + + ) : ( + + βœ“ Kein Zuschnitt noetig + + )} + {cropResult.duration_seconds != null && ( + + {cropResult.duration_seconds}s + + )} +
+
+ )} + + {/* Action buttons */} + {cropResult && ( +
+ + +
+ )} + + {error && ( +
+ {error} +
+ )} +
+ ) +} + +function pct(v: number): string { + return `${(v * 100).toFixed(1)}%` +} diff --git a/admin-lehrer/components/ocr-pipeline/StepDeskew.tsx b/admin-lehrer/components/ocr-pipeline/StepDeskew.tsx index ed1845c..fe229b0 100644 --- a/admin-lehrer/components/ocr-pipeline/StepDeskew.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepDeskew.tsx @@ -8,29 +8,27 @@ import { ImageCompareView } from './ImageCompareView' const KLAUSUR_API = '/klausur-api' interface StepDeskewProps { - sessionId?: string | null - onNext: (sessionId: string) => void + sessionId: string | null + onNext: () => void } -export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewProps) { +export function StepDeskew({ sessionId, onNext }: StepDeskewProps) { const [session, setSession] = useState(null) const [deskewResult, setDeskewResult] = useState(null) - const [uploading, setUploading] = useState(false) const [deskewing, setDeskewing] = useState(false) const [applying, setApplying] = useState(false) const [showBinarized, setShowBinarized] = useState(false) const [showGrid, setShowGrid] = useState(true) const [error, setError] = useState(null) - const [dragOver, setDragOver] = useState(false) - const [sessionName, setSessionName] = useState('') + const [hasAutoRun, setHasAutoRun] = useState(false) - // Reload session data when navigating back from a later step + // Load session and auto-trigger deskew useEffect(() => { - if (!existingSessionId || session) return + if (!sessionId || session) return - const loadSession = async () => { + const loadAndDeskew = async () => { try { - const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${existingSessionId}`) + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`) if (!res.ok) return const data = await res.json() @@ -39,83 +37,56 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP filename: data.filename, image_width: data.image_width, image_height: 
data.image_height, - original_image_url: `${KLAUSUR_API}${data.original_image_url}`, + // Use cropped image as "before" view + original_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped`, } setSession(sessionInfo) - // Reconstruct deskew result from session data + // If deskew result already exists, use it if (data.deskew_result) { const dr: DeskewResult = { ...data.deskew_result, - deskewed_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${existingSessionId}/image/deskewed`, - binarized_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${existingSessionId}/image/binarized`, + deskewed_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/deskewed`, + binarized_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/binarized`, } setDeskewResult(dr) + return + } + + // Auto-trigger deskew if not already done + if (!hasAutoRun) { + setHasAutoRun(true) + setDeskewing(true) + const deskewRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/deskew`, { + method: 'POST', + }) + + if (!deskewRes.ok) { + throw new Error('Begradigung fehlgeschlagen') + } + + const deskewData: DeskewResult = await deskewRes.json() + deskewData.deskewed_image_url = `${KLAUSUR_API}${deskewData.deskewed_image_url}` + deskewData.binarized_image_url = `${KLAUSUR_API}${deskewData.binarized_image_url}` + setDeskewResult(deskewData) } } catch (e) { - console.error('Failed to reload session:', e) + setError(e instanceof Error ? 
e.message : 'Fehler beim Laden') + } finally { + setDeskewing(false) } } - loadSession() - }, [existingSessionId, session]) - - const handleUpload = useCallback(async (file: File) => { - setUploading(true) - setError(null) - setDeskewResult(null) - - try { - const formData = new FormData() - formData.append('file', file) - if (sessionName.trim()) { - formData.append('name', sessionName.trim()) - } - - const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, { - method: 'POST', - body: formData, - }) - - if (!res.ok) { - const err = await res.json().catch(() => ({ detail: res.statusText })) - throw new Error(err.detail || 'Upload fehlgeschlagen') - } - - const data: SessionInfo = await res.json() - // Prepend API prefix to relative URLs - data.original_image_url = `${KLAUSUR_API}${data.original_image_url}` - setSession(data) - - // Auto-trigger deskew - setDeskewing(true) - const deskewRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${data.session_id}/deskew`, { - method: 'POST', - }) - - if (!deskewRes.ok) { - throw new Error('Begradigung fehlgeschlagen') - } - - const deskewData: DeskewResult = await deskewRes.json() - deskewData.deskewed_image_url = `${KLAUSUR_API}${deskewData.deskewed_image_url}` - deskewData.binarized_image_url = `${KLAUSUR_API}${deskewData.binarized_image_url}` - setDeskewResult(deskewData) - } catch (e) { - setError(e instanceof Error ? 
e.message : 'Unbekannter Fehler') - } finally { - setUploading(false) - setDeskewing(false) - } - }, []) + loadAndDeskew() + }, [sessionId, session, hasAutoRun]) const handleManualDeskew = useCallback(async (angle: number) => { - if (!session) return + if (!sessionId) return setApplying(true) setError(null) try { - const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${session.session_id}/deskew/manual`, { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/deskew/manual`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ angle }), @@ -130,7 +101,6 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP ...prev, angle_applied: data.angle_applied, method_used: data.method_used, - // Force reload by appending timestamp deskewed_image_url: `${KLAUSUR_API}${data.deskewed_image_url}?t=${Date.now()}`, } : null, @@ -140,12 +110,12 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP } finally { setApplying(false) } - }, [session]) + }, [sessionId]) const handleGroundTruth = useCallback(async (gt: DeskewGroundTruth) => { - if (!session) return + if (!sessionId) return try { - await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${session.session_id}/ground-truth/deskew`, { + await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/ground-truth/deskew`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(gt), @@ -153,89 +123,21 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP } catch (e) { console.error('Ground truth save failed:', e) } - }, [session]) + }, [sessionId]) - const handleDrop = useCallback((e: React.DragEvent) => { - e.preventDefault() - setDragOver(false) - const file = e.dataTransfer.files[0] - if (file) handleUpload(file) - }, [handleUpload]) - - const handleFileInput = useCallback((e: React.ChangeEvent) => { - const file = 
e.target.files?.[0] - if (file) handleUpload(file) - }, [handleUpload]) - - // Upload area (no session yet) - if (!session) { - return ( -
- {/* Session name input */} -
- - setSessionName(e.target.value)} - placeholder="z.B. Unit 3 Seite 42" - className="w-full max-w-sm px-3 py-2 text-sm border rounded-lg dark:bg-gray-800 dark:border-gray-600 dark:text-gray-200 focus:outline-none focus:ring-2 focus:ring-teal-500" - /> -
- -
{ e.preventDefault(); setDragOver(true) }} - onDragLeave={() => setDragOver(false)} - onDrop={handleDrop} - className={`border-2 border-dashed rounded-xl p-12 text-center transition-colors ${ - dragOver - ? 'border-teal-400 bg-teal-50 dark:bg-teal-900/20' - : 'border-gray-300 dark:border-gray-600 hover:border-teal-400' - }`} - > - {uploading ? ( -
-
-

Wird hochgeladen...

-
- ) : ( - <> -
πŸ“„
-

- PDF oder Bild hierher ziehen -

-

oder

- - - )} -
- {error && ( -
- {error} -
- )} -
- ) + if (!sessionId) { + return
Keine Session ausgewaehlt.
} - // Session active: show comparison + controls return (
{/* Filename */} -
- Datei: {session.filename} - {' '}({session.image_width} x {session.image_height} px) -
+ {session && ( +
+ Datei: {session.filename} + {' '}({session.image_width} x {session.image_height} px) +
+ )} {/* Loading indicator */} {deskewing && ( @@ -246,13 +148,17 @@ export function StepDeskew({ sessionId: existingSessionId, onNext }: StepDeskewP )} {/* Image comparison */} - + {session && ( + + )} {/* Controls */} setShowGrid((v) => !v)} onManualDeskew={handleManualDeskew} onGroundTruth={handleGroundTruth} - onNext={() => session && onNext(session.session_id)} + onNext={onNext} isApplying={applying} /> diff --git a/admin-lehrer/components/ocr-pipeline/StepOrientation.tsx b/admin-lehrer/components/ocr-pipeline/StepOrientation.tsx new file mode 100644 index 0000000..89239c7 --- /dev/null +++ b/admin-lehrer/components/ocr-pipeline/StepOrientation.tsx @@ -0,0 +1,247 @@ +'use client' + +import { useCallback, useEffect, useState } from 'react' +import type { OrientationResult, SessionInfo } from '@/app/(admin)/ai/ocr-pipeline/types' +import { ImageCompareView } from './ImageCompareView' + +const KLAUSUR_API = '/klausur-api' + +interface StepOrientationProps { + sessionId?: string | null + onNext: (sessionId: string) => void +} + +export function StepOrientation({ sessionId: existingSessionId, onNext }: StepOrientationProps) { + const [session, setSession] = useState(null) + const [orientationResult, setOrientationResult] = useState(null) + const [uploading, setUploading] = useState(false) + const [detecting, setDetecting] = useState(false) + const [error, setError] = useState(null) + const [dragOver, setDragOver] = useState(false) + const [sessionName, setSessionName] = useState('') + + // Reload session data when navigating back + useEffect(() => { + if (!existingSessionId || session) return + + const loadSession = async () => { + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${existingSessionId}`) + if (!res.ok) return + const data = await res.json() + + const sessionInfo: SessionInfo = { + session_id: data.session_id, + filename: data.filename, + image_width: data.image_width, + image_height: data.image_height, + 
original_image_url: `${KLAUSUR_API}${data.original_image_url}`, + } + setSession(sessionInfo) + + if (data.orientation_result) { + setOrientationResult(data.orientation_result) + } + } catch (e) { + console.error('Failed to reload session:', e) + } + } + + loadSession() + }, [existingSessionId, session]) + + const handleUpload = useCallback(async (file: File) => { + setUploading(true) + setError(null) + setOrientationResult(null) + + try { + const formData = new FormData() + formData.append('file', file) + if (sessionName.trim()) { + formData.append('name', sessionName.trim()) + } + + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, { + method: 'POST', + body: formData, + }) + + if (!res.ok) { + const err = await res.json().catch(() => ({ detail: res.statusText })) + throw new Error(err.detail || 'Upload fehlgeschlagen') + } + + const data: SessionInfo = await res.json() + data.original_image_url = `${KLAUSUR_API}${data.original_image_url}` + setSession(data) + + // Auto-trigger orientation detection + setDetecting(true) + const orientRes = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${data.session_id}/orientation`, { + method: 'POST', + }) + + if (!orientRes.ok) { + throw new Error('Orientierungserkennung fehlgeschlagen') + } + + const orientData = await orientRes.json() + setOrientationResult({ + orientation_degrees: orientData.orientation_degrees, + corrected: orientData.corrected, + duration_seconds: orientData.duration_seconds, + }) + } catch (e) { + setError(e instanceof Error ? 
e.message : 'Unbekannter Fehler') + } finally { + setUploading(false) + setDetecting(false) + } + }, [sessionName]) + + const handleDrop = useCallback((e: React.DragEvent) => { + e.preventDefault() + setDragOver(false) + const file = e.dataTransfer.files[0] + if (file) handleUpload(file) + }, [handleUpload]) + + const handleFileInput = useCallback((e: React.ChangeEvent) => { + const file = e.target.files?.[0] + if (file) handleUpload(file) + }, [handleUpload]) + + // Upload area (no session yet) + if (!session) { + return ( +
+ {/* Session name input */} +
+ + setSessionName(e.target.value)} + placeholder="z.B. Unit 3 Seite 42" + className="w-full max-w-sm px-3 py-2 text-sm border rounded-lg dark:bg-gray-800 dark:border-gray-600 dark:text-gray-200 focus:outline-none focus:ring-2 focus:ring-teal-500" + /> +
+ +
{ e.preventDefault(); setDragOver(true) }} + onDragLeave={() => setDragOver(false)} + onDrop={handleDrop} + className={`border-2 border-dashed rounded-xl p-12 text-center transition-colors ${ + dragOver + ? 'border-teal-400 bg-teal-50 dark:bg-teal-900/20' + : 'border-gray-300 dark:border-gray-600 hover:border-teal-400' + }`} + > + {uploading ? ( +
+
+

Wird hochgeladen...

+
+ ) : ( + <> +
πŸ“„
+

+ PDF oder Bild hierher ziehen +

+

oder

+ + + )} +
+ {error && ( +
+ {error} +
+ )} +
+ ) + } + + // Session active: show orientation result + const orientedUrl = orientationResult + ? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${session.session_id}/image/oriented` + : null + + return ( +
+ {/* Filename */} +
+ Datei: {session.filename} + {' '}({session.image_width} x {session.image_height} px) +
+ + {/* Loading indicator */} + {detecting && ( +
+
+ Orientierung wird erkannt... +
+ )} + + {/* Image comparison */} + + + {/* Orientation result badge */} + {orientationResult && ( +
+
+ {orientationResult.corrected ? ( + + πŸ”„ {orientationResult.orientation_degrees}Β° korrigiert + + ) : ( + + βœ“ 0Β° (keine Drehung noetig) + + )} + + {orientationResult.duration_seconds}s + +
+
+ )} + + {/* Next button */} + {orientationResult && ( +
+ +
+ )} + + {error && ( +
+ {error} +
+ )} +
+ ) +} diff --git a/docs-src/services/klausur-service/OCR-Pipeline.md b/docs-src/services/klausur-service/OCR-Pipeline.md index b66fcb7..7b75253 100644 --- a/docs-src/services/klausur-service/OCR-Pipeline.md +++ b/docs-src/services/klausur-service/OCR-Pipeline.md @@ -1,12 +1,12 @@ # OCR Pipeline - Schrittweise Seitenrekonstruktion -**Version:** 3.0.0 -**Status:** Produktiv (Schritte 1–8 implementiert) +**Version:** 4.0.0 +**Status:** Produktiv (Schritte 1–10 implementiert) **URL:** https://macmini:3002/ai/ocr-pipeline ## Uebersicht -Die OCR Pipeline zerlegt den OCR-Prozess in **8 einzelne Schritte**, um eingescannte Seiten +Die OCR Pipeline zerlegt den OCR-Prozess in **10 einzelne Schritte**, um eingescannte Seiten aus mehrspaltig gedruckten Schulbuechern Wort fuer Wort zu rekonstruieren. Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten versehen werden. @@ -16,14 +16,16 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v | Schritt | Name | Beschreibung | Status | |---------|------|--------------|--------| -| 1 | Begradigung (Deskew) | Scan begradigen (Hough Lines + Word Alignment) | Implementiert | -| 2 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert | -| 3 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert | -| 4 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert | -| 5 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert | -| 6 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert | -| 7 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert | -| 8 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert | +| 1 | Orientierung | 90/180/270° Drehungen von Scannern korrigieren | Implementiert | +| 2 | Zuschneiden 
(Crop) | Scannerraender entfernen, Papierformat (A4) erkennen | Implementiert | +| 3 | Begradigung (Deskew) | Scan begradigen (Hough Lines + Word Alignment) | Implementiert | +| 4 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert | +| 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert | +| 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert | +| 7 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert | +| 8 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert | +| 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert | +| 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert | --- @@ -206,6 +208,7 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`. |---------|------|--------------| | `POST` | `/sessions/{id}/dewarp` | Automatische Entzerrung | | `POST` | `/sessions/{id}/dewarp/manual` | Manueller Scherbungswinkel | +| `POST` | `/sessions/{id}/adjust-combined` | Kombinierte Rotation + Shear Feinabstimmung | | `POST` | `/sessions/{id}/ground-truth/dewarp` | Ground Truth speichern | ### Schritt 3: Spalten @@ -274,16 +277,48 @@ Die Dewarp-Erkennung misst die **vertikale Spaltenkippung** (dx/dy) statt Textze | Ensemble Min-Confidence | 0.35 | Mindest-Konfidenz fuer Korrektur | | Quality-Gate Skip | < 0.5° | Kleine Korrekturen ueberspringen Quality-Gate | +### Feinabstimmung (Combined Adjust) + +Der Endpoint `POST /sessions/{id}/adjust-combined` erlaubt die kombinierte Feinabstimmung von +Rotation und Shear in einem Schritt. 
Im Frontend stehen **7 Schieberegler** zur Verfuegung: + +**Rotation (3 Paesse):** + +| Slider | Bereich | Beschreibung | +|--------|---------|--------------| +| P1 Iterative | ±5° | Erster Deskew-Pass (Hough Lines) | +| P2 Word-Alignment | ±3° | Zweiter Pass (Wort-Ausrichtung) | +| P3 Textline | ±3° | Dritter Pass (Textzeilen-Regression) | + +Die Summe aller drei ergibt den finalen Rotationswinkel. + +**Shear (4 Methoden, Radio-Auswahl):** + +| Slider | Bereich | Beschreibung | +|--------|---------|--------------| +| A: Textline Drift | ±5° | Textzeilen-Drift | +| B: Projection Profile | ±5° | 2-Pass Projektionsprofil | +| C: Vertical Edges | ±5° | Vertikalkanten-Analyse | +| D: Ensemble | ±5° | Gewichteter Ensemble-Wert | + +Nur der per Radio-Button ausgewaehlte Shear-Wert wird verwendet. + +``` +POST /sessions/{id}/adjust-combined +Body: {"rotation_degrees": 1.23, "shear_degrees": -0.45} +Response: {"method_used": "manual_combined", "shear_degrees": -0.45, "dewarped_image_url": "..."} +``` + --- ## Schritt 3: Spaltenerkennung (Detail) ### Algorithmus: `detect_column_geometry()` -Zweistufige Erkennung: vertikale Projektionsprofile finden Luecken, Wort-Bounding-Boxes validieren. +Mehrstufige Erkennung: Seite segmentieren, vertikale Projektionsprofile finden Luecken, Wort-Bounding-Boxes validieren. ``` -Bild → Binarisierung → Vertikalprofil → Lueckenerkennung → Wort-Validierung → ColumnGeometry +Bild → Binarisierung → Seiten-Segmentierung → Vertikalprofil → Lueckenerkennung → Wort-Validierung → ColumnGeometry ``` **Wichtige Implementierungsdetails:** @@ -293,6 +328,54 @@ Bild → Binarisierung → Vertikalprofil → Lueckenerkennung → Wort-Validier - **Phantom-Spalten-Filter (Step 9):** Spalten mit Breite < 3 % der Content-Breite UND < 3 Woerter werden als Artefakte entfernt; die angrenzenden Spalten schliessen die Luecke. 
- **Spaltenzuweisung:** Woerter werden anhand des groessten horizontalen Ueberlappungsbereichs einer Spalte zugeordnet. +### Seiten-Segmentierung an Sub-Headern + +Farbige Zwischenueberschriften (z.B. β€žUnit 4: Bonnie Scotland" mit blauem Hintergrund) +erzeugen nach Binarisierung Tinte ueber die gesamte Seitenbreite. Diese Baender fuellen +Spaltenluecken im vertikalen Projektionsprofil auf und fuehren zu fragmentierten Spalten +(z.B. 11 statt 5). + +**Loesung: Horizontale Gap-Segmentierung (Step 2b)** + +1. **Horizontales Projektionsprofil** berechnen: Zeilensummen ueber den Content-Bereich +2. **Leere Zeilen** erkennen: Zeilen mit < 2% Tinten-Dichte (`H_GAP_THRESH = 0.02`) +3. **Gaps sammeln**: Zusammenhaengende leere Zeilen zu Gaps buendeln (Mindestlaenge: `max(5, h/200)`) +4. **Grosse Gaps identifizieren**: Gaps > 1.8Γ— Median-Gap-Hoehe = Sub-Header-Trennungen +5. **Segmente bilden**: Seite an grossen Gaps aufteilen +6. **Groesstes Segment waehlen**: Das hoechste Segment wird fuer die vertikale Projektion verwendet + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Header / Titel β”‚ ─── grosser Gap ─── +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ EN β”‚ DE β”‚ Example β”‚ Page β”‚ ← Segment 1 (groesster) +β”‚ ... β”‚ ... β”‚ ... β”‚ ... β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ Unit 4: Bonnie Scotland β”‚ ─── grosser Gap ─── +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ EN β”‚ DE β”‚ Example β”‚ Page β”‚ ← Segment 2 +β”‚ ... β”‚ ... β”‚ ... β”‚ ... 
β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +**Segment-gefilterte Wort-Validierung:** + +Die Wort-Validierung (Step 5) nutzt nur Tesseract-Woerter **innerhalb des gewaehlten Segments**. +Woerter aus Sub-Header-Bereichen (die die volle Breite einnehmen) werden so ausgeschlossen +und koennen die Spaltenluecken-Validierung nicht verfaelschen. + +### Word-Coverage Gap Detection (Fallback) + +Wenn die pixel-basierte Projektion keine ausreichenden Spaltenluecken findet +(z.B. bei Seiten mit Illustrationen, die Spaltenluecken teilweise verdecken), +greift ein Fallback auf Basis der Tesseract-Wort-Bounding-Boxes: + +1. X-Achse in 2px-Bins aufteilen +2. Pro Bin zaehlen, wie viele Segment-Woerter ihn ueberdecken +3. Zusammenhaengende Bins mit 0 Woertern = Gap-Kandidaten +4. Nur Gaps im inneren 90%-Bereich beruecksichtigen (Raender ignorieren) +5. Gaps mit Mindestbreite (`max(8px, content_w * 0.5%)`) werden als Spaltenluecken akzeptiert + ### Sub-Spalten-Erkennung: `_detect_sub_columns()` Erkennt versteckte Sub-Spalten innerhalb breiter Spalten (z.B. Seitenzahl-Spalte links neben EN-Vokabeln). @@ -658,7 +741,7 @@ CREATE TABLE ocr_pipeline_sessions ( | Schraeg gedruckte Seiten | Deskew erkennt Text-Rotation, nicht Seiten-Rotation | Manueller Winkel | | Sehr kleine Schrift (< 8pt) | Tesseract PSM 7 braucht min. 
Zeichengroesse | Vorher zoomen | | Handgeschriebene Eintraege | Tesseract/RapidOCR sind fuer Druckschrift optimiert | TrOCR-Engine | -| Mehr als 4 Spalten | Projektionsprofil kann verschmelzen | Manuelle Spalten | +| Mehr als 5 Spalten | Projektionsprofil kann verschmelzen (Segmentierung hilft) | Manuelle Spalten | | Farbige Marker (rot/blau) | HSV-Erkennung erzeugt False Positives | Manuell im Rekonstruktions-Editor | | 15%-Schwelle nicht breit validiert | Nur an einem Arbeitsblatt-Typ getestet | Diverse Schulbuchseiten testen | @@ -699,6 +782,8 @@ ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/brea | Datum | Version | Aenderung | |-------|---------|----------| +| 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung | +| 2026-03-05 | 3.0.1 | Dewarp: Feinabstimmung mit 7 Schiebereglern (3 Rotation + 4 Shear), Combined-Adjust-Endpoint | | 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade | | 2026-03-04 | 2.2.0 | Dewarp: Vertikalkanten-Drift statt Textzeilen-Neigung, Schwellenwerte gesenkt | | 2026-03-04 | 2.1.0 | Sub-Column-Detection, expand_narrow_columns, Fabric.js Editor, PDF/DOCX-Export | diff --git a/klausur-service/backend/main.py b/klausur-service/backend/main.py index 4c1ef18..24c931c 100644 --- a/klausur-service/backend/main.py +++ b/klausur-service/backend/main.py @@ -42,7 +42,8 @@ try: except ImportError: trocr_router = None from vocab_worksheet_api import router as vocab_router, set_db_pool as set_vocab_db_pool, _init_vocab_table, _load_all_sessions, DATABASE_URL as VOCAB_DATABASE_URL -from ocr_pipeline_api import router as ocr_pipeline_router +from ocr_pipeline_api import router as ocr_pipeline_router, _cache as ocr_pipeline_cache +from orientation_crop_api import router as orientation_crop_router, set_cache_ref as set_orientation_crop_cache from ocr_pipeline_session_store import 
init_ocr_pipeline_tables try: from handwriting_htr_api import router as htr_router @@ -177,6 +178,8 @@ if trocr_router: app.include_router(trocr_router) # TrOCR Handwriting OCR app.include_router(vocab_router) # Vocabulary Worksheet Generator app.include_router(ocr_pipeline_router) # OCR Pipeline (step-by-step) +set_orientation_crop_cache(ocr_pipeline_cache) +app.include_router(orientation_crop_router) # OCR Pipeline: Orientation + Crop if htr_router: app.include_router(htr_router) # Handwriting HTR (Klausur) if dsfa_rag_router: diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index cc633f2..4fc55be 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1,15 +1,17 @@ """ OCR Pipeline API - Schrittweise Seitenrekonstruktion. -Zerlegt den OCR-Prozess in 8 einzelne Schritte: -1. Deskewing - Scan begradigen -2. Dewarping - Buchwoelbung entzerren -3. Spaltenerkennung - Unsichtbare Spalten finden -4. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen -5. Worterkennung - OCR mit Bounding Boxes -6. LLM-Korrektur - OCR-Fehler per LLM korrigieren -7. Seitenrekonstruktion - Seite nachbauen -8. Ground Truth Validierung - Gesamtpruefung +Zerlegt den OCR-Prozess in 10 einzelne Schritte: +1. Orientierung - 90/180/270Β° Drehungen korrigieren (orientation_crop_api.py) +2. Zuschneiden - Scannerraender entfernen (orientation_crop_api.py) +3. Deskewing - Scan begradigen +4. Dewarping - Buchwoelbung entzerren +5. Spaltenerkennung - Unsichtbare Spalten finden +6. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen +7. Worterkennung - OCR mit Bounding Boxes +8. LLM-Korrektur - OCR-Fehler per LLM korrigieren +9. Seitenrekonstruktion - Seite nachbauen +10. Ground Truth Validierung - Gesamtpruefung Lizenz: Apache 2.0 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. 
@@ -54,7 +56,6 @@ from cv_vocab_pipeline import ( deskew_image_by_word_alignment, deskew_image_iterative, deskew_two_pass, - detect_and_fix_orientation, detect_column_geometry, detect_document_type, detect_row_geometry, @@ -103,6 +104,8 @@ async def _load_session_to_cache(session_id: str) -> Dict[str, Any]: "id": session_id, **session, "original_bgr": None, + "oriented_bgr": None, + "cropped_bgr": None, "deskewed_bgr": None, "dewarped_bgr": None, } @@ -110,6 +113,8 @@ async def _load_session_to_cache(session_id: str) -> Dict[str, Any]: # Decode images from DB into BGR numpy arrays for img_type, bgr_key in [ ("original", "original_bgr"), + ("oriented", "oriented_bgr"), + ("cropped", "cropped_bgr"), ("deskewed", "deskewed_bgr"), ("dewarped", "dewarped_bgr"), ]: @@ -252,8 +257,12 @@ async def create_session( "filename": filename, "name": session_name, "original_bgr": img_bgr, + "oriented_bgr": None, + "cropped_bgr": None, "deskewed_bgr": None, "dewarped_bgr": None, + "orientation_result": None, + "crop_result": None, "deskew_result": None, "dewarp_result": None, "ground_truth": {}, @@ -301,6 +310,10 @@ async def get_session_info(session_id: str): "doc_type": session.get("doc_type"), } + if session.get("orientation_result"): + result["orientation_result"] = session["orientation_result"] + if session.get("crop_result"): + result["crop_result"] = session["crop_result"] if session.get("deskew_result"): result["deskew_result"] = session["deskew_result"] if session.get("dewarp_result"): @@ -427,7 +440,7 @@ async def _append_pipeline_log( @router.get("/sessions/{session_id}/image/{image_type}") async def get_image(session_id: str, image_type: str): """Serve session images: original, deskewed, dewarped, binarized, columns-overlay, or rows-overlay.""" - valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay", "words-overlay", "clean"} + valid_types = {"original", "oriented", "cropped", "deskewed", "dewarped", "binarized", 
"columns-overlay", "rows-overlay", "words-overlay", "clean"} if image_type not in valid_types: raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}") @@ -470,22 +483,13 @@ async def auto_deskew(session_id: str): await _load_session_to_cache(session_id) cached = _get_cached(session_id) - img_bgr = cached.get("original_bgr") + # Use cropped image as input (from step 2), fall back to oriented, then original + img_bgr = cached.get("cropped_bgr") or cached.get("oriented_bgr") or cached.get("original_bgr") if img_bgr is None: - raise HTTPException(status_code=400, detail="Original image not available") + raise HTTPException(status_code=400, detail="No image available for deskewing") t0 = time.time() - # Orientation detection (fix 90/180/270Β° rotations from scanners) - img_bgr, orientation_deg = detect_and_fix_orientation(img_bgr) - if orientation_deg: - # Update original in cache + DB so all subsequent steps use corrected image - cached["original_bgr"] = img_bgr - success_ori, ori_buf = cv2.imencode(".png", img_bgr) - if success_ori: - await update_session_db(session_id, original_png=ori_buf.tobytes()) - logger.info(f"OCR Pipeline: orientation corrected {orientation_deg}Β° for session {session_id}") - # Two-pass deskew: iterative (Β±5Β°) + word-alignment residual check deskewed_bgr, angle_applied, two_pass_debug = deskew_two_pass(img_bgr.copy()) @@ -534,7 +538,6 @@ async def auto_deskew(session_id: str): "angle_residual": round(angle_residual, 3), "angle_textline": round(angle_textline, 3), "angle_applied": round(angle_applied, 3), - "orientation_degrees": orientation_deg, "method_used": method_used, "confidence": round(confidence, 2), "duration_seconds": round(duration, 2), @@ -550,7 +553,7 @@ async def auto_deskew(session_id: str): db_update = { "deskewed_png": deskewed_png, "deskew_result": deskew_result, - "current_step": 2, + "current_step": 4, } if binarized_png: db_update["binarized_png"] = binarized_png @@ -563,7 +566,6 @@ async def 
auto_deskew(session_id: str): f"-> {method_used} total={angle_applied:.2f}") await _append_pipeline_log(session_id, "deskew", { - "orientation": orientation_deg, "angle_applied": round(angle_applied, 3), "angle_iterative": round(angle_iterative, 3), "angle_residual": round(angle_residual, 3), @@ -582,14 +584,14 @@ async def auto_deskew(session_id: str): @router.post("/sessions/{session_id}/deskew/manual") async def manual_deskew(session_id: str, req: ManualDeskewRequest): - """Apply a manual rotation angle to the original image.""" + """Apply a manual rotation angle to the cropped image.""" if session_id not in _cache: await _load_session_to_cache(session_id) cached = _get_cached(session_id) - img_bgr = cached.get("original_bgr") + img_bgr = cached.get("cropped_bgr") or cached.get("oriented_bgr") or cached.get("original_bgr") if img_bgr is None: - raise HTTPException(status_code=400, detail="Original image not available") + raise HTTPException(status_code=400, detail="No image available for deskewing") angle = max(-5.0, min(5.0, req.angle)) @@ -797,7 +799,7 @@ async def auto_dewarp( dewarped_png=dewarped_png, dewarp_result=dewarp_result, auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0), - current_step=3, + current_step=5, ) logger.info(f"OCR Pipeline: dewarp session {session_id}: " @@ -1109,7 +1111,7 @@ async def detect_columns(session_id: str): column_result=column_result, row_result=None, word_result=None, - current_step=3, + current_step=5, ) # Update cache @@ -1335,7 +1337,7 @@ async def detect_rows(session_id: str): session_id, row_result=row_result, word_result=None, - current_step=4, + current_step=6, ) cached["row_result"] = row_result @@ -1601,7 +1603,7 @@ async def detect_words( await update_session_db( session_id, word_result=word_result, - current_step=5, + current_step=7, ) cached["word_result"] = word_result @@ -1745,7 +1747,7 @@ async def _word_batch_stream_generator( word_result["summary"]["with_german"] = sum(1 for e in entries if 
e.get("german")) vocab_entries = entries - await update_session_db(session_id, word_result=word_result, current_step=5) + await update_session_db(session_id, word_result=word_result, current_step=7) cached["word_result"] = word_result logger.info(f"OCR Pipeline SSE batch: words session {session_id}: " @@ -1892,7 +1894,7 @@ async def _word_stream_generator( await update_session_db( session_id, word_result=word_result, - current_step=5, + current_step=7, ) cached["word_result"] = word_result @@ -2016,7 +2018,7 @@ async def run_llm_review(session_id: str, request: Request, stream: bool = False "duration_ms": result["duration_ms"], "entries_corrected": result["entries_corrected"], } - await update_session_db(session_id, word_result=word_result, current_step=6) + await update_session_db(session_id, word_result=word_result, current_step=8) if session_id in _cache: _cache[session_id]["word_result"] = word_result @@ -2065,7 +2067,7 @@ async def _llm_review_stream_generator( "duration_ms": event["duration_ms"], "entries_corrected": event["entries_corrected"], } - await update_session_db(session_id, word_result=word_result, current_step=6) + await update_session_db(session_id, word_result=word_result, current_step=8) if session_id in _cache: _cache[session_id]["word_result"] = word_result @@ -2153,7 +2155,7 @@ async def save_reconstruction(session_id: str, request: Request): cell_updates = body.get("cells", []) if not cell_updates: - await update_session_db(session_id, current_step=7) + await update_session_db(session_id, current_step=9) return {"session_id": session_id, "updated": 0} # Build update map: cell_id -> new text @@ -2189,7 +2191,7 @@ async def save_reconstruction(session_id: str, request: Request): if "entries" in word_result: word_result["entries"] = entries - await update_session_db(session_id, word_result=word_result, current_step=7) + await update_session_db(session_id, word_result=word_result, current_step=9) if session_id in _cache: 
_cache[session_id]["word_result"] = word_result @@ -2572,7 +2574,7 @@ async def save_validation(session_id: str, req: ValidationRequest): """Save final validation results for step 8. Stores notes, score, and preserves any detected/generated image regions. - Sets current_step = 8 to mark pipeline as complete. + Sets current_step = 10 to mark pipeline as complete. """ session = await get_session_db(session_id) if not session: @@ -2585,7 +2587,7 @@ async def save_validation(session_id: str, req: ValidationRequest): validation["score"] = req.score ground_truth["validation"] = validation - await update_session_db(session_id, ground_truth=ground_truth, current_step=8) + await update_session_db(session_id, ground_truth=ground_truth, current_step=10) if session_id in _cache: _cache[session_id]["ground_truth"] = ground_truth @@ -2619,12 +2621,14 @@ async def reprocess_session(session_id: str, request: Request): Body: {"from_step": 5} (1-indexed step number) Clears downstream results: - - from_step <= 1: deskew_result, dewarp_result, column_result, row_result, word_result - - from_step <= 2: dewarp_result, column_result, row_result, word_result - - from_step <= 3: column_result, row_result, word_result - - from_step <= 4: row_result, word_result - - from_step <= 5: word_result (cells, vocab_entries) - - from_step <= 6: word_result.llm_review only + - from_step <= 1: orientation_result, crop_result, deskew_result, dewarp_result, column_result, row_result, word_result + - from_step <= 2: crop_result, deskew_result, dewarp_result, column_result, row_result, word_result + - from_step <= 3: deskew_result, dewarp_result, column_result, row_result, word_result + - from_step <= 4: dewarp_result, column_result, row_result, word_result + - from_step <= 5: column_result, row_result, word_result + - from_step <= 6: row_result, word_result + - from_step <= 7: word_result (cells, vocab_entries) + - from_step <= 8: word_result.llm_review only """ session = await get_session_db(session_id) 
if not session: @@ -2632,15 +2636,15 @@ async def reprocess_session(session_id: str, request: Request): body = await request.json() from_step = body.get("from_step", 1) - if not isinstance(from_step, int) or from_step < 1 or from_step > 7: - raise HTTPException(status_code=400, detail="from_step must be between 1 and 7") + if not isinstance(from_step, int) or from_step < 1 or from_step > 9: + raise HTTPException(status_code=400, detail="from_step must be between 1 and 9") update_kwargs: Dict[str, Any] = {"current_step": from_step} # Clear downstream data based on from_step - if from_step <= 5: + if from_step <= 7: update_kwargs["word_result"] = None - elif from_step == 6: + elif from_step == 8: # Only clear LLM review from word_result word_result = session.get("word_result") if word_result: @@ -2648,14 +2652,18 @@ async def reprocess_session(session_id: str, request: Request): word_result.pop("llm_corrections", None) update_kwargs["word_result"] = word_result - if from_step <= 4: + if from_step <= 6: update_kwargs["row_result"] = None - if from_step <= 3: + if from_step <= 5: update_kwargs["column_result"] = None - if from_step <= 2: + if from_step <= 4: update_kwargs["dewarp_result"] = None - if from_step <= 1: + if from_step <= 3: update_kwargs["deskew_result"] = None + if from_step <= 2: + update_kwargs["crop_result"] = None + if from_step <= 1: + update_kwargs["orientation_result"] = None await update_session_db(session_id, **update_kwargs) @@ -3074,7 +3082,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request): deskewed_png=deskewed_png, deskew_result=deskew_result, auto_rotation_degrees=float(angle_applied), - current_step=2, + current_step=4, ) session = await get_session_db(session_id) @@ -3137,7 +3145,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request): dewarped_png=dewarped_png, dewarp_result=dewarp_result, auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0), - current_step=3, + current_step=5, ) 
session = await get_session_db(session_id) @@ -3196,7 +3204,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request): cached["column_result"] = column_result await update_session_db(session_id, column_result=column_result, - row_result=None, word_result=None, current_step=4) + row_result=None, word_result=None, current_step=6) session = await get_session_db(session_id) steps_run.append("columns") @@ -3273,7 +3281,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request): } cached["row_result"] = row_result - await update_session_db(session_id, row_result=row_result, current_step=5) + await update_session_db(session_id, row_result=row_result, current_step=7) session = await get_session_db(session_id) steps_run.append("rows") @@ -3381,7 +3389,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request): word_result_data["entry_count"] = len(entries) word_result_data["summary"]["total_entries"] = len(entries) - await update_session_db(session_id, word_result=word_result_data, current_step=6) + await update_session_db(session_id, word_result=word_result_data, current_step=8) cached["word_result"] = word_result_data session = await get_session_db(session_id) @@ -3426,7 +3434,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request): word_result_updated["llm_reviewed"] = True word_result_updated["llm_model"] = OLLAMA_REVIEW_MODEL - await update_session_db(session_id, word_result=word_result_updated, current_step=7) + await update_session_db(session_id, word_result=word_result_updated, current_step=9) cached["word_result"] = word_result_updated steps_run.append("llm_review") diff --git a/klausur-service/backend/ocr_pipeline_session_store.py b/klausur-service/backend/ocr_pipeline_session_store.py index 12f3c2a..a4adbaf 100644 --- a/klausur-service/backend/ocr_pipeline_session_store.py +++ b/klausur-service/backend/ocr_pipeline_session_store.py @@ -68,7 +68,11 @@ async def 
init_ocr_pipeline_tables(): ADD COLUMN IF NOT EXISTS doc_type VARCHAR(50), ADD COLUMN IF NOT EXISTS doc_type_result JSONB, ADD COLUMN IF NOT EXISTS document_category VARCHAR(50), - ADD COLUMN IF NOT EXISTS pipeline_log JSONB + ADD COLUMN IF NOT EXISTS pipeline_log JSONB, + ADD COLUMN IF NOT EXISTS oriented_png BYTEA, + ADD COLUMN IF NOT EXISTS cropped_png BYTEA, + ADD COLUMN IF NOT EXISTS orientation_result JSONB, + ADD COLUMN IF NOT EXISTS crop_result JSONB """) @@ -90,6 +94,7 @@ async def create_session_db( id, name, filename, original_png, status, current_step ) VALUES ($1, $2, $3, $4, 'active', 1) RETURNING id, name, filename, status, current_step, + orientation_result, crop_result, deskew_result, dewarp_result, column_result, row_result, word_result, ground_truth, auto_shear_degrees, doc_type, doc_type_result, @@ -106,6 +111,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]: async with pool.acquire() as conn: row = await conn.fetchrow(""" SELECT id, name, filename, status, current_step, + orientation_result, crop_result, deskew_result, dewarp_result, column_result, row_result, word_result, ground_truth, auto_shear_degrees, doc_type, doc_type_result, @@ -123,6 +129,8 @@ async def get_session_image(session_id: str, image_type: str) -> Optional[bytes] """Load a single image (BYTEA) from the session.""" column_map = { "original": "original_png", + "oriented": "oriented_png", + "cropped": "cropped_png", "deskewed": "deskewed_png", "binarized": "binarized_png", "dewarped": "dewarped_png", @@ -150,15 +158,17 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any allowed_fields = { 'name', 'filename', 'status', 'current_step', - 'original_png', 'deskewed_png', 'binarized_png', 'dewarped_png', + 'original_png', 'oriented_png', 'cropped_png', + 'deskewed_png', 'binarized_png', 'dewarped_png', 'clean_png', 'handwriting_removal_meta', + 'orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 
'column_result', 'row_result', 'word_result', 'ground_truth', 'auto_shear_degrees', 'doc_type', 'doc_type_result', 'document_category', 'pipeline_log', } - jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log'} + jsonb_fields = {'orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log'} for key, value in kwargs.items(): if key in allowed_fields: @@ -182,6 +192,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any SET {', '.join(fields)} WHERE id = ${param_idx} RETURNING id, name, filename, status, current_step, + orientation_result, crop_result, deskew_result, dewarp_result, column_result, row_result, word_result, ground_truth, auto_shear_degrees, doc_type, doc_type_result, @@ -254,7 +265,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]: result[key] = result[key].isoformat() # JSONB β†’ parsed (asyncpg returns str for JSONB) - for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log']: + for key in ['orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log']: if key in result and result[key] is not None: if isinstance(result[key], str): result[key] = json.loads(result[key]) diff --git a/klausur-service/backend/orientation_crop_api.py b/klausur-service/backend/orientation_crop_api.py new file mode 100644 index 0000000..6fee2ce --- /dev/null +++ b/klausur-service/backend/orientation_crop_api.py @@ -0,0 +1,330 @@ +""" +Orientation & Crop API - Steps 1-2 of the OCR Pipeline. 
"""
Orientation & Crop API - Steps 1-2 of the OCR Pipeline.

Step 1: Orientation detection (fix 90/180/270 degree rotations)
Step 2: Page cropping (remove scanner borders, detect paper format)

These endpoints were extracted from the main pipeline to keep files manageable.
"""

import logging
import time
from typing import Any, Dict, Optional

import cv2
import numpy as np
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from cv_vocab_pipeline import detect_and_fix_orientation
from page_crop import detect_and_crop_page
from ocr_pipeline_session_store import (
    get_session_db,
    get_session_image,
    update_session_db,
)

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])


# Reference to the shared cache from ocr_pipeline_api (set in main.py)
_cache: Dict[str, Dict[str, Any]] = {}


def set_cache_ref(cache: Dict[str, Dict[str, Any]]):
    """Point this module at the session cache owned by ocr_pipeline_api.

    The dict object itself is shared (not copied), so entries written by
    either module are visible to the other.
    """
    global _cache
    _cache = cache


def _first_available_image(cached: Dict[str, Any], *keys: str) -> Optional[np.ndarray]:
    """Return the first cache entry among ``keys`` that is not None.

    This replaces the ``cached.get(a) or cached.get(b)`` idiom: ``or`` calls
    ``bool()`` on its left operand, and ``bool()`` of a NumPy array with more
    than one element raises ``ValueError``. With the old code every crop
    endpoint failed as soon as an oriented image was present.
    """
    for key in keys:
        img = cached.get(key)
        if img is not None:
            return img
    return None


def _encode_png(img_bgr: np.ndarray, session_id: str, label: str) -> bytes:
    """PNG-encode ``img_bgr``; returns ``b""`` (and logs an error) on failure.

    The empty-bytes fallback is the behaviour callers had before; the log line
    makes the failure visible instead of silently persisting an empty image.
    """
    success, png_buf = cv2.imencode(".png", img_bgr)
    if not success:
        logger.error(
            "OCR Pipeline: PNG encoding of %s image failed for session %s",
            label, session_id,
        )
        return b""
    return png_buf.tobytes()


async def _ensure_cached(session_id: str) -> Dict[str, Any]:
    """Ensure session is in cache, loading from DB if needed.

    Raises:
        HTTPException(404): the session does not exist in the database.
    """
    if session_id in _cache:
        return _cache[session_id]

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    cache_entry: Dict[str, Any] = {
        "id": session_id,
        **session,
        "original_bgr": None,
        "oriented_bgr": None,
        "cropped_bgr": None,
        "deskewed_bgr": None,
        "dewarped_bgr": None,
    }

    # Decode every stored PNG into a BGR array; missing images stay None.
    for img_type, bgr_key in [
        ("original", "original_bgr"),
        ("oriented", "oriented_bgr"),
        ("cropped", "cropped_bgr"),
        ("deskewed", "deskewed_bgr"),
        ("dewarped", "dewarped_bgr"),
    ]:
        png_data = await get_session_image(session_id, img_type)
        if png_data:
            arr = np.frombuffer(png_data, dtype=np.uint8)
            bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
            cache_entry[bgr_key] = bgr

    _cache[session_id] = cache_entry
    return cache_entry


async def _append_pipeline_log(session_id: str, step: str, metrics: dict, duration_ms: int):
    """Append a step entry to the pipeline log (no-op if the session is gone)."""
    from datetime import datetime
    session = await get_session_db(session_id)
    if not session:
        return
    pipeline_log = session.get("pipeline_log") or {"steps": []}
    # setdefault: a stored log without a "steps" key must not raise KeyError.
    pipeline_log.setdefault("steps", []).append({
        "step": step,
        # NOTE(review): utcnow() is deprecated since Python 3.12; kept because a
        # timezone-aware timestamp would change the stored string format.
        "completed_at": datetime.utcnow().isoformat(),
        "success": True,
        "duration_ms": duration_ms,
        "metrics": metrics,
    })
    await update_session_db(session_id, pipeline_log=pipeline_log)


# ---------------------------------------------------------------------------
# Step 1: Orientation
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/orientation")
async def detect_orientation(session_id: str):
    """Detect and fix 90/180/270 degree rotations from scanners.

    Reads the original image, applies orientation correction,
    stores the result as oriented_png and advances the session to step 2.

    Raises:
        HTTPException(400): the session has no original image.
    """
    cached = await _ensure_cached(session_id)

    img_bgr = cached.get("original_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Original image not available")

    t0 = time.time()

    # Work on a copy so the cached original is never modified in place.
    oriented_bgr, orientation_deg = detect_and_fix_orientation(img_bgr.copy())

    duration = time.time() - t0

    orientation_result = {
        "orientation_degrees": orientation_deg,
        "corrected": orientation_deg != 0,
        "duration_seconds": round(duration, 2),
    }

    oriented_png = _encode_png(oriented_bgr, session_id, "oriented")

    # Update cache
    # NOTE(review): a previously stored cropped image is not invalidated here;
    # confirm whether re-running orientation should also clear the crop.
    cached["oriented_bgr"] = oriented_bgr
    cached["orientation_result"] = orientation_result

    # Persist to DB
    await update_session_db(
        session_id,
        oriented_png=oriented_png,
        orientation_result=orientation_result,
        current_step=2,
    )

    logger.info(
        "OCR Pipeline: orientation session %s: %d° (%s) in %.2fs",
        session_id, orientation_deg,
        "corrected" if orientation_deg else "no change",
        duration,
    )

    await _append_pipeline_log(session_id, "orientation", {
        "orientation_degrees": orientation_deg,
        "corrected": orientation_deg != 0,
    }, duration_ms=int(duration * 1000))

    h, w = oriented_bgr.shape[:2]
    return {
        "session_id": session_id,
        **orientation_result,
        "image_width": w,
        "image_height": h,
        "oriented_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/oriented",
    }


# ---------------------------------------------------------------------------
# Step 2: Crop
# ---------------------------------------------------------------------------

@router.post("/sessions/{session_id}/crop")
async def auto_crop(session_id: str):
    """Auto-detect and crop scanner borders.

    Reads the oriented image (or original if no orientation step),
    detects the page boundary, crops, and advances the session to step 3.

    Raises:
        HTTPException(400): neither an oriented nor an original image exists.
    """
    cached = await _ensure_cached(session_id)

    # Use oriented image if available, else original
    img_bgr = _first_available_image(cached, "oriented_bgr", "original_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="No image available for cropping")

    t0 = time.time()

    cropped_bgr, crop_info = detect_and_crop_page(img_bgr)

    duration = time.time() - t0
    crop_info["duration_seconds"] = round(duration, 2)

    cropped_png = _encode_png(cropped_bgr, session_id, "cropped")

    # Update cache
    cached["cropped_bgr"] = cropped_bgr
    cached["crop_result"] = crop_info

    # Persist to DB
    await update_session_db(
        session_id,
        cropped_png=cropped_png,
        crop_result=crop_info,
        current_step=3,
    )

    logger.info(
        "OCR Pipeline: crop session %s: applied=%s format=%s in %.2fs",
        session_id, crop_info["crop_applied"],
        crop_info.get("detected_format", "?"),
        duration,
    )

    await _append_pipeline_log(session_id, "crop", {
        "crop_applied": crop_info["crop_applied"],
        "detected_format": crop_info.get("detected_format"),
        "format_confidence": crop_info.get("format_confidence"),
    }, duration_ms=int(duration * 1000))

    h, w = cropped_bgr.shape[:2]
    return {
        "session_id": session_id,
        **crop_info,
        "image_width": w,
        "image_height": h,
        "cropped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/cropped",
    }


class ManualCropRequest(BaseModel):
    """Crop rectangle expressed as percentages of the source image size."""

    x: float  # percentage 0-100
    y: float  # percentage 0-100
    width: float  # percentage 0-100
    height: float  # percentage 0-100


@router.post("/sessions/{session_id}/crop/manual")
async def manual_crop(session_id: str, req: ManualCropRequest):
    """Manually crop using percentage coordinates.

    The rectangle is converted to pixels and clamped to the image, so
    out-of-range percentages yield the nearest valid (at least 1x1) crop.

    Raises:
        HTTPException(400): neither an oriented nor an original image exists.
    """
    cached = await _ensure_cached(session_id)

    img_bgr = _first_available_image(cached, "oriented_bgr", "original_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="No image available for cropping")

    h, w = img_bgr.shape[:2]

    # Convert percentages to pixels
    px_x = int(w * req.x / 100.0)
    px_y = int(h * req.y / 100.0)
    px_w = int(w * req.width / 100.0)
    px_h = int(h * req.height / 100.0)

    # Clamp origin into the image, then size into what remains (minimum 1 px)
    px_x = max(0, min(px_x, w - 1))
    px_y = max(0, min(px_y, h - 1))
    px_w = max(1, min(px_w, w - px_x))
    px_h = max(1, min(px_h, h - px_y))

    cropped_bgr = img_bgr[px_y:px_y + px_h, px_x:px_x + px_w].copy()

    cropped_png = _encode_png(cropped_bgr, session_id, "cropped")

    crop_result = {
        "crop_applied": True,
        "crop_rect": {"x": px_x, "y": px_y, "width": px_w, "height": px_h},
        "crop_rect_pct": {"x": round(req.x, 2), "y": round(req.y, 2),
                          "width": round(req.width, 2), "height": round(req.height, 2)},
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": px_w, "height": px_h},
        "method": "manual",
    }

    cached["cropped_bgr"] = cropped_bgr
    cached["crop_result"] = crop_result

    await update_session_db(
        session_id,
        cropped_png=cropped_png,
        crop_result=crop_result,
        current_step=3,
    )

    ch, cw = cropped_bgr.shape[:2]
    return {
        "session_id": session_id,
        **crop_result,
        "image_width": cw,
        "image_height": ch,
        "cropped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/cropped",
    }


@router.post("/sessions/{session_id}/crop/skip")
async def skip_crop(session_id: str):
    """Skip cropping — use oriented (or original) image as-is.

    Raises:
        HTTPException(400): neither an oriented nor an original image exists.
    """
    cached = await _ensure_cached(session_id)

    img_bgr = _first_available_image(cached, "oriented_bgr", "original_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="No image available")

    h, w = img_bgr.shape[:2]

    # Store the oriented image as cropped (identity crop)
    cropped_png = _encode_png(img_bgr, session_id, "cropped")

    crop_result = {
        "crop_applied": False,
        "skipped": True,
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": w, "height": h},
    }

    cached["cropped_bgr"] = img_bgr
    cached["crop_result"] = crop_result

    await update_session_db(
        session_id,
        cropped_png=cropped_png,
        crop_result=crop_result,
        current_step=3,
    )

    return {
        "session_id": session_id,
        **crop_result,
        "image_width": w,
        "image_height": h,
        "cropped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/cropped",
    }
"""
Page Crop - Automatic scanner border removal and page format detection.

Detects the paper boundary in a scanned image and crops away scanner borders.
Also identifies the paper format (A4, Letter, etc.) from the aspect ratio.

License: Apache 2.0
"""

import logging
from typing import Dict, Any, Tuple

import cv2
import numpy as np

logger = logging.getLogger(__name__)

# Known paper format aspect ratios (height / width, portrait orientation).
# NOTE: the ISO A-series entries differ only in the third decimal place, so an
# aspect ratio alone cannot reliably tell A3, A4 and A5 apart; which of them
# _detect_format reports depends on sub-percent differences in the crop.
PAPER_FORMATS = {
    "A4": 297.0 / 210.0,       # 1.4143
    "A5": 210.0 / 148.0,       # 1.4189
    "Letter": 11.0 / 8.5,      # 1.2941
    "Legal": 14.0 / 8.5,       # 1.6471
    "A3": 420.0 / 297.0,       # 1.4141
}


def detect_and_crop_page(
    img_bgr: np.ndarray,
    min_border_fraction: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Detect page boundary and crop scanner borders.

    Algorithm:
    1. Grayscale + GaussianBlur to smooth out text
    2. Otsu threshold (page=bright, scanner border=dark)
    3. Morphological close to fill gaps
    4. Find the largest external contour = page
    5. If the contour covers >95% of the image area -> no crop
    6. Take its bounding rect and compute the four border fractions
    7. If every border is below ``min_border_fraction`` -> no crop
    8. Grow the rect by a 0.5% safety margin; if the result is smaller than
       50% of the image the detection is treated as unreliable -> no crop
    9. Crop
    10. Match the cropped aspect ratio to known paper formats

    Args:
        img_bgr: Input BGR image (3-channel; it is passed to
            ``cv2.cvtColor(..., COLOR_BGR2GRAY)``).
        min_border_fraction: Minimum border fraction to trigger crop (default 1%)

    Returns:
        Tuple of (cropped_image, result_dict). When no crop is applied the
        input array itself is returned (not a copy) and
        ``result_dict["crop_applied"]`` is False.
    """
    h, w = img_bgr.shape[:2]
    total_area = h * w

    result: Dict[str, Any] = {
        "crop_applied": False,
        "crop_rect": None,
        "crop_rect_pct": None,
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": w, "height": h},
        "detected_format": None,
        "format_confidence": 0.0,
        "aspect_ratio": round(max(h, w) / max(min(h, w), 1), 4),
        "border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
    }

    # 1. Grayscale + blur
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (21, 21), 0)

    # 2. Otsu threshold
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # 3. Morphological close to fill text gaps
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 50))
    closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    # 4. Find contours
    contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        logger.info("No contours found - returning original image")
        return img_bgr, result

    # Get the largest contour
    largest = max(contours, key=cv2.contourArea)
    contour_area = cv2.contourArea(largest)

    # 5. If contour covers >95% of image, no crop needed
    if contour_area > 0.95 * total_area:
        # No format args are passed, so logging does not %-format the message:
        # a doubled "%%" would be emitted literally.
        logger.info("Page covers >95% of image - no crop needed")
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    # 6. Get bounding rect
    rx, ry, rw, rh = cv2.boundingRect(largest)

    # Calculate border fractions
    border_top = ry / h
    border_bottom = (h - (ry + rh)) / h
    border_left = rx / w
    border_right = (w - (rx + rw)) / w

    result["border_fractions"] = {
        "top": round(border_top, 4),
        "bottom": round(border_bottom, 4),
        "left": round(border_left, 4),
        "right": round(border_right, 4),
    }

    # 7. Check if borders are significant enough to crop
    if all(f < min_border_fraction for f in [border_top, border_bottom, border_left, border_right]):
        logger.info("All borders < %.1f%% - no crop needed", min_border_fraction * 100)
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    # 8. Add safety margin (0.5% of image dimensions)
    margin_x = int(w * 0.005)
    margin_y = int(h * 0.005)

    crop_x = max(0, rx - margin_x)
    crop_y = max(0, ry - margin_y)
    crop_x2 = min(w, rx + rw + margin_x)
    crop_y2 = min(h, ry + rh + margin_y)

    crop_w = crop_x2 - crop_x
    crop_h = crop_y2 - crop_y

    # Sanity check: cropped area should be at least 50% of original
    if crop_w * crop_h < 0.5 * total_area:
        logger.warning("Cropped area too small (%.0f%%) - skipping crop",
                       100.0 * crop_w * crop_h / total_area)
        result["detected_format"], result["format_confidence"] = _detect_format(w, h)
        return img_bgr, result

    # 9. Crop
    cropped = img_bgr[crop_y:crop_y2, crop_x:crop_x2].copy()

    # 10. Detect format from cropped dimensions
    detected_format, format_confidence = _detect_format(crop_w, crop_h)

    result["crop_applied"] = True
    result["crop_rect"] = {"x": crop_x, "y": crop_y, "width": crop_w, "height": crop_h}
    result["crop_rect_pct"] = {
        "x": round(100.0 * crop_x / w, 2),
        "y": round(100.0 * crop_y / h, 2),
        "width": round(100.0 * crop_w / w, 2),
        "height": round(100.0 * crop_h / h, 2),
    }
    result["cropped_size"] = {"width": crop_w, "height": crop_h}
    result["detected_format"] = detected_format
    result["format_confidence"] = format_confidence
    result["aspect_ratio"] = round(max(crop_w, crop_h) / max(min(crop_w, crop_h), 1), 4)

    logger.info("Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
                w, h, crop_w, crop_h, detected_format, format_confidence * 100,
                border_top * 100, border_bottom * 100, border_left * 100, border_right * 100)

    return cropped, result


def _detect_format(width: int, height: int) -> Tuple[str, float]:
    """Detect paper format from dimensions by comparing aspect ratios.

    The comparison is orientation-independent (longer side / shorter side).
    On an exact tie the entry listed first in PAPER_FORMATS wins.

    Returns:
        (format_name, confidence) where confidence is 0.0-1.0;
        ("unknown", 0.0) for non-positive dimensions or confidence < 0.3.
    """
    if width <= 0 or height <= 0:
        return "unknown", 0.0

    # Use portrait aspect ratio (taller / shorter)
    aspect = max(width, height) / min(width, height)

    best_format = "unknown"
    best_diff = float("inf")

    for fmt, expected_ratio in PAPER_FORMATS.items():
        diff = abs(aspect - expected_ratio)
        if diff < best_diff:
            best_diff = diff
            best_format = fmt

    # Confidence: 1.0 if exact match, decreasing linearly with deviation
    # (diff 0.1 -> 0.5, diff 0.2 -> 0.0)
    confidence = max(0.0, 1.0 - best_diff * 5.0)

    if confidence < 0.3:
        return "unknown", 0.0

    return best_format, round(confidence, 3)