From 90c1efd9b035ff5dcbe4b4ee814552fc82d34836 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 12 Mar 2026 16:41:55 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20Paddle=20Direct=20=E2=80=94=201-click?= =?UTF-8?q?=20OCR=20without=20deskew/dewarp/crop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New 2-step mode (Upload → PaddleOCR+Overlay) alongside the existing 7-step pipeline. Backend endpoint runs PaddleOCR on the original image and clusters words into rows/cells directly. Frontend adds a mode toggle and PaddleDirectStep component. Co-Authored-By: Claude Opus 4.6 --- .../app/(admin)/ai/ocr-overlay/page.tsx | 98 ++++++++-- .../app/(admin)/ai/ocr-overlay/types.ts | 9 + .../ocr-overlay/PaddleDirectStep.tsx | 129 ++++++++++++ klausur-service/backend/ocr_pipeline_api.py | 183 ++++++++++++++++++ 4 files changed, 403 insertions(+), 16 deletions(-) create mode 100644 admin-lehrer/components/ocr-overlay/PaddleDirectStep.tsx diff --git a/admin-lehrer/app/(admin)/ai/ocr-overlay/page.tsx b/admin-lehrer/app/(admin)/ai/ocr-overlay/page.tsx index 19e8011..5cc77fe 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-overlay/page.tsx +++ b/admin-lehrer/app/(admin)/ai/ocr-overlay/page.tsx @@ -10,11 +10,13 @@ import { StepCrop } from '@/components/ocr-pipeline/StepCrop' import { StepRowDetection } from '@/components/ocr-pipeline/StepRowDetection' import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecognition' import { OverlayReconstruction } from '@/components/ocr-overlay/OverlayReconstruction' -import { OVERLAY_PIPELINE_STEPS, DOCUMENT_CATEGORIES, dbStepToOverlayUi, type PipelineStep, type SessionListItem, type DocumentCategory } from './types' +import { PaddleDirectStep } from '@/components/ocr-overlay/PaddleDirectStep' +import { OVERLAY_PIPELINE_STEPS, PADDLE_DIRECT_STEPS, DOCUMENT_CATEGORIES, dbStepToOverlayUi, type PipelineStep, type SessionListItem, type DocumentCategory } from './types' const KLAUSUR_API = '/klausur-api' export default function OcrOverlayPage() { + const [mode, setMode] = useState<'pipeline' | 'paddle-direct'>('pipeline') const [currentStep, setCurrentStep] = useState(0) const [sessionId, setSessionId] = useState(null) const [sessionName, setSessionName] = useState('') @@ -61,17 +63,32 @@ export default function OcrOverlayPage() { setSessionName(data.name || data.filename || '') setActiveCategory(data.document_category || undefined) - // Map DB step to overlay UI step - const dbStep = data.current_step || 1 - const uiStep = dbStepToOverlayUi(dbStep) + // Check if this session was processed with paddle_direct + const isPaddleDirect = data.word_result?.ocr_engine === 'paddle_direct' - setSteps( - OVERLAY_PIPELINE_STEPS.map((s, i) => ({ - ...s, - status: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending', - })), - ) - setCurrentStep(uiStep) + if (isPaddleDirect) { + setMode('paddle-direct') + setSteps( + PADDLE_DIRECT_STEPS.map((s, i) => ({ + ...s, + status: i < 1 ? 'completed' : i === 1 ? 'active' : 'pending', + })), + ) + setCurrentStep(1) + } else { + setMode('pipeline') + // Map DB step to overlay UI step + const dbStep = data.current_step || 1 + const uiStep = dbStepToOverlayUi(dbStep) + + setSteps( + OVERLAY_PIPELINE_STEPS.map((s, i) => ({ + ...s, + status: i < uiStep ? 'completed' : i === uiStep ? 'active' : 'pending', + })), + ) + setCurrentStep(uiStep) + } } catch (e) { console.error('Failed to open session:', e) } @@ -84,12 +101,13 @@ export default function OcrOverlayPage() { if (sessionId === sid) { setSessionId(null) setCurrentStep(0) - setSteps(OVERLAY_PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' }))) + const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS + setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' }))) } } catch (e) { console.error('Failed to delete session:', e) } - }, [sessionId]) + }, [sessionId, mode]) const renameSession = useCallback(async (sid: string, newName: string) => { try { @@ -140,7 +158,8 @@ export default function OcrOverlayPage() { const handleNext = () => { if (currentStep >= steps.length - 1) { // Last step completed — return to session list - setSteps(OVERLAY_PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' }))) + const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS + setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' }))) setCurrentStep(0) setSessionId(null) loadSessions() @@ -168,7 +187,8 @@ export default function OcrOverlayPage() { setSessionId(null) setSessionName('') setCurrentStep(0) - setSteps(OVERLAY_PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' }))) + const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS + setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' }))) } const stepNames: Record = { @@ -206,6 +226,16 @@ export default function OcrOverlayPage() { }, [sessionId, goToStep]) const renderStep = () => { + if (mode === 'paddle-direct') { + switch (currentStep) { + case 0: + return + case 1: + return + default: + return null + } + } switch (currentStep) { case 0: return @@ -410,11 +440,47 @@ export default function OcrOverlayPage() { )} + {/* Mode Toggle */} +
+ + +
+
{renderStep()}
diff --git a/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts b/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts index c7ead0d..034b808 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts @@ -48,6 +48,15 @@ export const OVERLAY_UI_TO_DB: Record = { 6: 9, // reconstruction } +/** + * 2-step pipeline for Paddle Direct mode. + * Upload → PaddleOCR+Overlay (skips deskew/dewarp/crop/rows) + */ +export const PADDLE_DIRECT_STEPS: PipelineStep[] = [ + { id: 'orientation', name: 'Upload', icon: '📤', status: 'pending' }, + { id: 'paddle-direct', name: 'PaddleOCR + Overlay', icon: '⚡', status: 'pending' }, +] + /** Map from DB step to overlay UI step index */ export function dbStepToOverlayUi(dbStep: number): number { // DB: 1=start, 2=orient, 3=deskew, 4=dewarp, 5=crop, 6=columns, 7=rows, 8=words, 9=recon, 10=gt diff --git a/admin-lehrer/components/ocr-overlay/PaddleDirectStep.tsx b/admin-lehrer/components/ocr-overlay/PaddleDirectStep.tsx new file mode 100644 index 0000000..febc4b8 --- /dev/null +++ b/admin-lehrer/components/ocr-overlay/PaddleDirectStep.tsx @@ -0,0 +1,129 @@ +'use client' + +import { useCallback, useEffect, useState } from 'react' +import { OverlayReconstruction } from './OverlayReconstruction' + +const KLAUSUR_API = '/klausur-api' + +type Phase = 'idle' | 'running' | 'overlay' + +interface PaddleDirectStepProps { + sessionId: string | null + onNext: () => void +} + +export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) { + const [phase, setPhase] = useState('idle') + const [error, setError] = useState(null) + const [stats, setStats] = useState<{ cells: number; rows: number; duration: number } | null>(null) + + // Auto-detect: if session already has paddle_direct word_result → show overlay + useEffect(() => { + if (!sessionId) return + let cancelled = false + ;(async () => { + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`) + if (!res.ok || cancelled) return + const data = await res.json() + if (data.word_result?.ocr_engine === 'paddle_direct') { + setPhase('overlay') + } + } catch { + // ignore + } + })() + return () => { cancelled = true } + }, [sessionId]) + + const runPaddleDirect = useCallback(async () => { + if (!sessionId) return + setPhase('running') + setError(null) + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/paddle-direct`, { + method: 'POST', + }) + if (!res.ok) { + const data = await res.json().catch(() => ({})) + throw new Error(data.detail || `HTTP ${res.status}`) + } + const data = await res.json() + setStats({ + cells: data.summary?.total_cells || 0, + rows: data.grid_shape?.rows || 0, + duration: data.duration_seconds || 0, + }) + setPhase('overlay') + } catch (e: unknown) { + setError(e instanceof Error ? e.message : 'Unbekannter Fehler') + setPhase('idle') + } + }, [sessionId]) + + if (!sessionId) { + return ( +
+ Bitte zuerst ein Bild hochladen. +
+ ) + } + + if (phase === 'overlay') { + return ( +
+ {stats && ( +
+ {stats.cells} Woerter erkannt + {stats.rows} Zeilen + {stats.duration.toFixed(1)}s +
+ )} + +
+ ) + } + + return ( +
+ {phase === 'running' ? ( + <> +
+
+

+ PaddleOCR laeuft... +

+

+ Bild wird analysiert (ca. 5-30s) +

+
+ + ) : ( + <> +
+
+

+ Paddle Direct +

+

+ PaddleOCR erkennt alle Woerter direkt auf dem Originalbild — ohne Begradigung, Entzerrung oder Zuschnitt. +

+
+ + {error && ( +
+ {error} +
+ )} + + + + )} +
+ ) +} diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 612e83b..6b05a87 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -2509,6 +2509,189 @@ async def _word_stream_generator( yield f"data: {json.dumps(complete_event)}\n\n" +@router.post("/sessions/{session_id}/paddle-direct") +async def paddle_direct(session_id: str): + """Run PaddleOCR on the original image and build a word grid directly. + + Skips deskew/dewarp/crop/rows — just Upload → PaddleOCR → Overlay. + The original image is stored as cropped_png so OverlayReconstruction + can display it as the background. + """ + original_png = await get_session_image(session_id, "original") + if not original_png: + raise HTTPException(status_code=404, detail="No original image found for this session") + + img_arr = np.frombuffer(original_png, dtype=np.uint8) + img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR) + if img_bgr is None: + raise HTTPException(status_code=400, detail="Failed to decode original image") + + img_h, img_w = img_bgr.shape[:2] + + from cv_ocr_engines import ocr_region_paddle + + t0 = time.time() + word_dicts = await ocr_region_paddle(img_bgr, region=None) + if not word_dicts: + raise HTTPException(status_code=400, detail="PaddleOCR returned no words") + + cells, columns_meta = _paddle_words_to_grid_cells(word_dicts, img_w, img_h) + duration = time.time() - t0 + + n_rows = len(set(c["row_index"] for c in cells)) if cells else 0 + n_cols = len(columns_meta) + + word_result = { + "cells": cells, + "grid_shape": { + "rows": n_rows, + "cols": n_cols, + "total_cells": len(cells), + }, + "columns_used": columns_meta, + "layout": "generic", + "image_width": img_w, + "image_height": img_h, + "duration_seconds": round(duration, 2), + "ocr_engine": "paddle_direct", + "grid_method": "paddle_direct", + "summary": { + "total_cells": len(cells), + "non_empty_cells": sum(1 for c in cells if c.get("text")), + "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50), + }, + } + + # Store original image as cropped_png so OverlayReconstruction shows it + await update_session_db( + session_id, + word_result=word_result, + cropped_png=original_png, + current_step=8, + ) + + logger.info( + "paddle_direct session %s: %d cells (%d rows, %d cols) in %.2fs", + session_id, len(cells), n_rows, n_cols, duration, + ) + + await _append_pipeline_log(session_id, "paddle_direct", { + "total_cells": len(cells), + "non_empty_cells": word_result["summary"]["non_empty_cells"], + "ocr_engine": "paddle_direct", + }, duration_ms=int(duration * 1000)) + + return {"session_id": session_id, **word_result} + + +def _paddle_words_to_grid_cells( + word_dicts: List[Dict[str, Any]], + img_w: int, + img_h: int, +) -> tuple: + """Convert PaddleOCR word dicts into GridCell dicts + columns_meta. + + 1. Sort words by (top, left). + 2. Cluster into rows by Y-proximity (threshold = 50% of median word height). + 3. Within each row, sort left→right and assign col_index. + 4. Each word → 1 GridCell with word_boxes and bbox_pct. + + Returns (cells, columns_meta) in the same format as build_grid_from_words. + """ + if not word_dicts: + return [], [] + + # Sort by top then left + sorted_words = sorted(word_dicts, key=lambda w: (w["top"], w["left"])) + + # Compute median word height for row clustering threshold + heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0] + median_h = sorted(heights)[len(heights) // 2] if heights else 30 + row_threshold = max(median_h * 0.5, 8) + + # Cluster into rows + rows: List[List[Dict]] = [] + current_row: List[Dict] = [] + current_y = -9999.0 + + for w in sorted_words: + center_y = w["top"] + w["height"] / 2 + if current_row and abs(center_y - current_y) > row_threshold: + rows.append(current_row) + current_row = [] + current_row.append(w) + # Running average Y center for the row + current_y = sum(ww["top"] + ww["height"] / 2 for ww in current_row) / len(current_row) + + if current_row: + rows.append(current_row) + + # Sort each row left→right and build cells + cells: List[Dict[str, Any]] = [] + max_col = 0 + + for row_idx, row_words in enumerate(rows): + row_words.sort(key=lambda w: w["left"]) + for col_idx, w in enumerate(row_words): + left = w["left"] + top = w["top"] + width = w["width"] + height = w["height"] + conf = w.get("confidence", 0) + if isinstance(conf, float) and conf <= 1.0: + conf = conf * 100 # normalize to 0-100 + + cell = { + "cell_id": f"PD_R{row_idx:02d}_W{col_idx:02d}", + "x": left, + "y": top, + "width": width, + "height": height, + "text": w.get("text", ""), + "confidence": round(conf, 1), + "column_index": col_idx, + "row_index": row_idx, + "zone_index": 0, + "ocr_engine": "paddle_direct", + "word_boxes": [{ + "text": w.get("text", ""), + "left": left, + "top": top, + "width": width, + "height": height, + "confidence": round(conf, 1), + }], + "bbox_pct": { + "x": round(left / img_w * 100, 3), + "y": round(top / img_h * 100, 3), + "w": round(width / img_w * 100, 3), + "h": round(height / img_h * 100, 3), + }, + } + cells.append(cell) + if col_idx > max_col: + max_col = col_idx + + # Build columns_meta — one pseudo-column per column index + columns_meta = [] + for ci in range(max_col + 1): + col_cells = [c for c in cells if c["column_index"] == ci] + if col_cells: + min_x = min(c["x"] for c in col_cells) + max_right = max(c["x"] + c["width"] for c in col_cells) + columns_meta.append({ + "type": "column_text", + "x": min_x, + "y": 0, + "width": max_right - min_x, + "height": img_h, + "classification_confidence": 1.0, + "classification_method": "paddle_direct", + }) + + return cells, columns_meta + + class WordGroundTruthRequest(BaseModel): is_correct: bool corrected_entries: Optional[List[Dict[str, Any]]] = None