From 2f8270f77b2763401cea0f93d43eb168babd6b28 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 24 Apr 2026 00:24:22 +0200 Subject: [PATCH] Add Vision-LLM OCR Fusion (Step 4) for degraded scans New module vision_ocr_fusion.py: Sends scan image + OCR word coordinates + document type to Qwen2.5-VL 32B. The LLM reads the image visually while using OCR positions as structural hints. Key features: - Document-type-aware prompts (Vokabelseite, Woerterbuch, etc.) - OCR words grouped into lines with x/y coordinates in prompt - Low-confidence words marked with (?) for LLM attention - Continuation row merging instructions in prompt - JSON response parsing with markdown code block handling - Fallback to original OCR on any error Frontend (admin-lehrer Grid Review): - "Vision-LLM" checkbox toggle - "Typ" dropdown (Vokabelseite, Woerterbuch, etc.) - Steps 1-3 defaults set to inactive Activate: Check "Vision-LLM", select document type, click "OCR neu + Grid". Co-Authored-By: Claude Opus 4.6 (1M context) --- .../components/grid-editor/useGridEditor.ts | 20 +- .../ocr-pipeline/StepGridReview.tsx | 20 ++ klausur-service/backend/grid_editor_api.py | 24 ++ klausur-service/backend/vision_ocr_fusion.py | 261 ++++++++++++++++++ 4 files changed, 320 insertions(+), 5 deletions(-) create mode 100644 klausur-service/backend/vision_ocr_fusion.py diff --git a/admin-lehrer/components/grid-editor/useGridEditor.ts b/admin-lehrer/components/grid-editor/useGridEditor.ts index f3bd451..b04d02d 100644 --- a/admin-lehrer/components/grid-editor/useGridEditor.ts +++ b/admin-lehrer/components/grid-editor/useGridEditor.ts @@ -28,10 +28,14 @@ export function useGridEditor(sessionId: string | null) { const [ipaMode, setIpaMode] = useState('auto') const [syllableMode, setSyllableMode] = useState('auto') - // OCR Quality Steps (A/B testing toggles) - const [ocrEnhance, setOcrEnhance] = useState(true) - const [ocrMaxCols, setOcrMaxCols] = useState(0) // 0 = unlimited (admin pipeline default) - const [ocrMinConf, setOcrMinConf] = useState(0) // 0 = auto from quality score + // OCR Quality Steps (A/B testing toggles — defaults off for now) + const [ocrEnhance, setOcrEnhance] = useState(false) + const [ocrMaxCols, setOcrMaxCols] = useState(0) + const [ocrMinConf, setOcrMinConf] = useState(0) + + // Vision-LLM Fusion (Step 4) + const [visionFusion, setVisionFusion] = useState(false) + const [documentCategory, setDocumentCategory] = useState('vokabelseite') // Undo/redo stacks store serialized zone arrays const undoStack = useRef([]) @@ -92,6 +96,8 @@ export function useGridEditor(sessionId: string | null) { params.set('enhance', String(ocrEnhance)) if (ocrMaxCols > 0) params.set('max_cols', String(ocrMaxCols)) if (ocrMinConf > 0) params.set('min_conf', String(ocrMinConf)) + params.set('vision_fusion', String(visionFusion)) + if (documentCategory) params.set('doc_category', documentCategory) const res = await fetch( `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/rerun-ocr-and-build-grid?${params}`, { method: 'POST' }, @@ -110,7 +116,7 @@ export function useGridEditor(sessionId: string | null) { } finally { setLoading(false) } - }, [sessionId, ipaMode, syllableMode, ocrEnhance, ocrMaxCols, ocrMinConf]) + }, [sessionId, ipaMode, syllableMode, ocrEnhance, ocrMaxCols, ocrMinConf, visionFusion, documentCategory]) const loadGrid = useCallback(async () => { if (!sessionId) return @@ -1030,6 +1036,10 @@ export function useGridEditor(sessionId: string | null) { setOcrMaxCols, ocrMinConf, setOcrMinConf, + visionFusion, + setVisionFusion, + documentCategory, + setDocumentCategory, rerunOcr, } } diff --git a/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx b/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx index 22df350..8829961 100644 --- a/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx @@ -67,6 +67,10 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro setOcrMaxCols, ocrMinConf, setOcrMinConf, + visionFusion, + setVisionFusion, + documentCategory, + setDocumentCategory, rerunOcr, } = useGridEditor(sessionId) @@ -291,6 +295,22 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro + | + + +