diff --git a/admin-lehrer/app/(admin)/ai/ocr-overlay/page.tsx b/admin-lehrer/app/(admin)/ai/ocr-overlay/page.tsx index 6078447..c9fd289 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-overlay/page.tsx +++ b/admin-lehrer/app/(admin)/ai/ocr-overlay/page.tsx @@ -11,12 +11,12 @@ import { StepRowDetection } from '@/components/ocr-pipeline/StepRowDetection' import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecognition' import { OverlayReconstruction } from '@/components/ocr-overlay/OverlayReconstruction' import { PaddleDirectStep } from '@/components/ocr-overlay/PaddleDirectStep' -import { OVERLAY_PIPELINE_STEPS, PADDLE_DIRECT_STEPS, DOCUMENT_CATEGORIES, dbStepToOverlayUi, type PipelineStep, type SessionListItem, type DocumentCategory } from './types' +import { OVERLAY_PIPELINE_STEPS, PADDLE_DIRECT_STEPS, KOMBI_STEPS, DOCUMENT_CATEGORIES, dbStepToOverlayUi, type PipelineStep, type SessionListItem, type DocumentCategory } from './types' const KLAUSUR_API = '/klausur-api' export default function OcrOverlayPage() { - const [mode, setMode] = useState<'pipeline' | 'paddle-direct'>('pipeline') + const [mode, setMode] = useState<'pipeline' | 'paddle-direct' | 'kombi'>('pipeline') const [currentStep, setCurrentStep] = useState(0) const [sessionId, setSessionId] = useState(null) const [sessionName, setSessionName] = useState('') @@ -63,13 +63,17 @@ export default function OcrOverlayPage() { setSessionName(data.name || data.filename || '') setActiveCategory(data.document_category || undefined) - // Check if this session was processed with paddle_direct - const isPaddleDirect = data.word_result?.ocr_engine === 'paddle_direct' + // Check if this session was processed with paddle_direct or kombi + const ocrEngine = data.word_result?.ocr_engine + const isPaddleDirect = ocrEngine === 'paddle_direct' + const isKombi = ocrEngine === 'kombi' - if (isPaddleDirect) { - setMode('paddle-direct') + if (isPaddleDirect || isKombi) { + const m = isKombi ? 'kombi' : 'paddle-direct' + const baseSteps = isKombi ? KOMBI_STEPS : PADDLE_DIRECT_STEPS + setMode(m) setSteps( - PADDLE_DIRECT_STEPS.map((s, i) => ({ + baseSteps.map((s, i) => ({ ...s, status: i < 4 ? 'completed' : i === 4 ? 'active' : 'pending', })), @@ -101,7 +105,7 @@ export default function OcrOverlayPage() { if (sessionId === sid) { setSessionId(null) setCurrentStep(0) - const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS + const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' }))) } } catch (e) { @@ -158,7 +162,7 @@ export default function OcrOverlayPage() { const handleNext = () => { if (currentStep >= steps.length - 1) { // Last step completed — return to session list - const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS + const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' }))) setCurrentStep(0) setSessionId(null) @@ -187,7 +191,7 @@ export default function OcrOverlayPage() { setSessionId(null) setSessionName('') setCurrentStep(0) - const baseSteps = mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS + const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' }))) } @@ -226,7 +230,7 @@ export default function OcrOverlayPage() { }, [sessionId, goToStep]) const renderStep = () => { - if (mode === 'paddle-direct') { + if (mode === 'paddle-direct' || mode === 'kombi') { switch (currentStep) { case 0: return @@ -237,7 +241,21 @@ export default function OcrOverlayPage() { case 3: return case 4: - return + return mode === 'kombi' ? ( + + ) : ( + + ) default: return null } @@ -480,13 +498,29 @@ export default function OcrOverlayPage() { > Paddle Direct (5 Schritte) +
{renderStep()}
diff --git a/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts b/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts index ba578a4..2f38407 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts @@ -60,6 +60,18 @@ export const PADDLE_DIRECT_STEPS: PipelineStep[] = [ { id: 'paddle-direct', name: 'PaddleOCR + Overlay', icon: '⚡', status: 'pending' }, ] +/** + * 5-step pipeline for Kombi mode (PaddleOCR + Tesseract). + * Same preprocessing, then both engines run and results are merged. + */ +export const KOMBI_STEPS: PipelineStep[] = [ + { id: 'orientation', name: 'Orientierung', icon: '🔄', status: 'pending' }, + { id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' }, + { id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' }, + { id: 'crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' }, + { id: 'kombi', name: 'Paddle + Tesseract', icon: '🔀', status: 'pending' }, +] + /** Map from DB step to overlay UI step index */ export function dbStepToOverlayUi(dbStep: number): number { // DB: 1=start, 2=orient, 3=deskew, 4=dewarp, 5=crop, 6=columns, 7=rows, 8=words, 9=recon, 10=gt diff --git a/admin-lehrer/components/ocr-overlay/PaddleDirectStep.tsx b/admin-lehrer/components/ocr-overlay/PaddleDirectStep.tsx index febc4b8..5ad802f 100644 --- a/admin-lehrer/components/ocr-overlay/PaddleDirectStep.tsx +++ b/admin-lehrer/components/ocr-overlay/PaddleDirectStep.tsx @@ -10,14 +10,38 @@ type Phase = 'idle' | 'running' | 'overlay' interface PaddleDirectStepProps { sessionId: string | null onNext: () => void + /** Backend endpoint suffix, default: 'paddle-direct' */ + endpoint?: string + /** Title shown in idle state */ + title?: string + /** Description shown in idle state */ + description?: string + /** Icon shown in idle state */ + icon?: string + /** Button label */ + buttonLabel?: string + /** Running label */ + runningLabel?: string + /** OCR engine key to check for auto-detect */ + engineKey?: string } -export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) { +export function PaddleDirectStep({ + sessionId, + onNext, + endpoint = 'paddle-direct', + title = 'Paddle Direct', + description = 'PaddleOCR erkennt alle Woerter direkt auf dem Originalbild — ohne Begradigung, Entzerrung oder Zuschnitt.', + icon = '⚡', + buttonLabel = 'PaddleOCR starten', + runningLabel = 'PaddleOCR laeuft...', + engineKey = 'paddle_direct', +}: PaddleDirectStepProps) { const [phase, setPhase] = useState('idle') const [error, setError] = useState(null) const [stats, setStats] = useState<{ cells: number; rows: number; duration: number } | null>(null) - // Auto-detect: if session already has paddle_direct word_result → show overlay + // Auto-detect: if session already has matching word_result → show overlay useEffect(() => { if (!sessionId) return let cancelled = false @@ -26,7 +50,7 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) { const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`) if (!res.ok || cancelled) return const data = await res.json() - if (data.word_result?.ocr_engine === 'paddle_direct') { + if (data.word_result?.ocr_engine === engineKey) { setPhase('overlay') } } catch { @@ -34,14 +58,14 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) { } })() return () => { cancelled = true } - }, [sessionId]) + }, [sessionId, engineKey]) - const runPaddleDirect = useCallback(async () => { + const runOcr = useCallback(async () => { if (!sessionId) return setPhase('running') setError(null) try { - const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/paddle-direct`, { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/${endpoint}`, { method: 'POST', }) if (!res.ok) { @@ -59,7 +83,7 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) { setError(e instanceof Error ? e.message : 'Unbekannter Fehler') setPhase('idle') } - }, [sessionId]) + }, [sessionId, endpoint]) if (!sessionId) { return ( @@ -91,7 +115,7 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) {

- PaddleOCR laeuft... + {runningLabel}

Bild wird analysiert (ca. 5-30s) @@ -101,12 +125,12 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) { ) : ( <>

-
+
{icon}

- Paddle Direct + {title}

- PaddleOCR erkennt alle Woerter direkt auf dem Originalbild — ohne Begradigung, Entzerrung oder Zuschnitt. + {description}

@@ -117,10 +141,10 @@ export function PaddleDirectStep({ sessionId, onNext }: PaddleDirectStepProps) { )} )} diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index f19bcf8..6a328f1 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -2599,6 +2599,189 @@ async def paddle_direct(session_id: str): return {"session_id": session_id, **word_result} +def _box_iou(a: dict, b: dict) -> float: + """Compute IoU between two word boxes (each has left, top, width, height).""" + ax1, ay1 = a["left"], a["top"] + ax2, ay2 = ax1 + a["width"], ay1 + a["height"] + bx1, by1 = b["left"], b["top"] + bx2, by2 = bx1 + b["width"], by1 + b["height"] + + ix1, iy1 = max(ax1, bx1), max(ay1, by1) + ix2, iy2 = min(ax2, bx2), min(ay2, by2) + inter = max(0, ix2 - ix1) * max(0, iy2 - iy1) + if inter == 0: + return 0.0 + area_a = (ax2 - ax1) * (ay2 - ay1) + area_b = (bx2 - bx1) * (by2 - by1) + return inter / (area_a + area_b - inter) if (area_a + area_b - inter) > 0 else 0.0 + + +def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list: + """Merge word boxes from PaddleOCR and Tesseract. + + Matching: IoU > 0.3 between bounding boxes. + Merging: Weighted average of coordinates by confidence. + """ + merged = [] + used_tess: set = set() + + for pw in paddle_words: + best_iou, best_ti = 0.0, -1 + for ti, tw in enumerate(tess_words): + if ti in used_tess: + continue + iou = _box_iou(pw, tw) + if iou > best_iou: + best_iou, best_ti = iou, ti + + if best_iou > 0.3 and best_ti >= 0: + tw = tess_words[best_ti] + used_tess.add(best_ti) + pc = pw.get("conf", 80) + tc = tw.get("conf", 50) + total = pc + tc + if total == 0: + total = 1 + merged.append({ + "text": pw["text"], # Paddle text usually better + "left": round((pw["left"] * pc + tw["left"] * tc) / total), + "top": round((pw["top"] * pc + tw["top"] * tc) / total), + "width": round((pw["width"] * pc + tw["width"] * tc) / total), + "height": round((pw["height"] * pc + tw["height"] * tc) / total), + "conf": max(pc, tc), + }) + else: + merged.append(pw) + + # Add unmatched Tesseract words (bullet points, symbols, etc.) + for ti, tw in enumerate(tess_words): + if ti not in used_tess and tw.get("conf", 0) >= 40: + merged.append(tw) + + return merged + + +@router.post("/sessions/{session_id}/paddle-kombi") +async def paddle_kombi(session_id: str): + """Run PaddleOCR + Tesseract on the preprocessed image and merge results. + + Both engines run on the same preprocessed (cropped/dewarped) image. + Word boxes are matched by IoU and coordinates are averaged weighted by + confidence. Unmatched Tesseract words (bullets, symbols) are added. + """ + img_png = await get_session_image(session_id, "cropped") + if not img_png: + img_png = await get_session_image(session_id, "dewarped") + if not img_png: + img_png = await get_session_image(session_id, "original") + if not img_png: + raise HTTPException(status_code=404, detail="No image found for this session") + + img_arr = np.frombuffer(img_png, dtype=np.uint8) + img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR) + if img_bgr is None: + raise HTTPException(status_code=400, detail="Failed to decode image") + + img_h, img_w = img_bgr.shape[:2] + + from cv_ocr_engines import ocr_region_paddle + + t0 = time.time() + + # --- PaddleOCR --- + paddle_words = await ocr_region_paddle(img_bgr, region=None) + if not paddle_words: + paddle_words = [] + + # --- Tesseract --- + from PIL import Image + import pytesseract + + pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)) + data = pytesseract.image_to_data( + pil_img, lang="eng+deu", + config="--psm 6 --oem 3", + output_type=pytesseract.Output.DICT, + ) + tess_words = [] + for i in range(len(data["text"])): + text = str(data["text"][i]).strip() + conf_raw = str(data["conf"][i]) + conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1 + if not text or conf < 20: + continue + tess_words.append({ + "text": text, + "left": data["left"][i], + "top": data["top"][i], + "width": data["width"][i], + "height": data["height"][i], + "conf": conf, + }) + + # --- Merge --- + if not paddle_words and not tess_words: + raise HTTPException(status_code=400, detail="Both OCR engines returned no words") + + merged_words = _merge_paddle_tesseract(paddle_words, tess_words) + + cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h) + duration = time.time() - t0 + + for cell in cells: + cell["ocr_engine"] = "kombi" + + n_rows = len(set(c["row_index"] for c in cells)) if cells else 0 + n_cols = len(columns_meta) + col_types = {c.get("type") for c in columns_meta} + is_vocab = bool(col_types & {"column_en", "column_de"}) + + word_result = { + "cells": cells, + "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)}, + "columns_used": columns_meta, + "layout": "vocab" if is_vocab else "generic", + "image_width": img_w, + "image_height": img_h, + "duration_seconds": round(duration, 2), + "ocr_engine": "kombi", + "grid_method": "kombi", + "summary": { + "total_cells": len(cells), + "non_empty_cells": sum(1 for c in cells if c.get("text")), + "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50), + "paddle_words": len(paddle_words), + "tesseract_words": len(tess_words), + "merged_words": len(merged_words), + }, + } + + await update_session_db( + session_id, + word_result=word_result, + cropped_png=img_png, + current_step=8, + ) + + logger.info( + "paddle_kombi session %s: %d cells (%d rows, %d cols) in %.2fs " + "[paddle=%d, tess=%d, merged=%d]", + session_id, len(cells), n_rows, n_cols, duration, + len(paddle_words), len(tess_words), len(merged_words), + ) + + await _append_pipeline_log(session_id, "paddle_kombi", { + "total_cells": len(cells), + "non_empty_cells": word_result["summary"]["non_empty_cells"], + "paddle_words": len(paddle_words), + "tesseract_words": len(tess_words), + "merged_words": len(merged_words), + "ocr_engine": "kombi", + }, duration_ms=int(duration * 1000)) + + return {"session_id": session_id, **word_result} + + class WordGroundTruthRequest(BaseModel): is_correct: bool corrected_entries: Optional[List[Dict[str, Any]]] = None