From 0ee92e7210ee9c7986b57f76b6bc04adf6c2eab0 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 11 Mar 2026 19:39:49 +0100 Subject: [PATCH] feat: OCR word_boxes fuer pixelgenaue Overlay-Positionierung MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend: _ocr_cell_crop speichert jetzt word_boxes mit exakten Tesseract/RapidOCR Wort-Koordinaten (left, top, width, height) im Cell-Ergebnis. Absolute Bildkoordinaten, bereits zurueckgemappt. Frontend: Slide-Hook nutzt word_boxes direkt wenn vorhanden — jedes Wort wird exakt an seiner OCR-Position platziert. Kein Pixel-Scanning noetig. Fallback auf alten Slide wenn keine Boxes. Co-Authored-By: Claude Opus 4.6 --- .../app/(admin)/ai/ocr-overlay/types.ts | 1 + .../app/(admin)/ai/ocr-pipeline/types.ts | 10 ++++ .../ocr-overlay/useSlideWordPositions.ts | 56 +++++++++++++------ klausur-service/backend/cv_cell_grid.py | 31 ++++++++++ 4 files changed, 80 insertions(+), 18 deletions(-) diff --git a/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts b/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts index a26fc7c..c7ead0d 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-overlay/types.ts @@ -16,6 +16,7 @@ export type { RowItem, GridResult, GridCell, + OcrWordBox, WordBbox, ColumnMeta, } from '../ocr-pipeline/types' diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts index b3f56a5..da938c4 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts @@ -220,6 +220,15 @@ export interface WordBbox { h: number } +export interface OcrWordBox { + text: string + left: number // absolute image x in px + top: number // absolute image y in px + width: number // px + height: number // px + conf: number +} + export interface GridCell { cell_id: string // "R03_C1" row_index: number @@ -232,6 +241,7 @@ export interface GridCell { ocr_engine?: string is_bold?: boolean status?: 'pending' | 'confirmed' | 'edited' | 'skipped' + word_boxes?: OcrWordBox[] // per-word bounding boxes from OCR engine } export interface ColumnMeta { diff --git a/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts b/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts index ec3159b..c6695b9 100644 --- a/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts +++ b/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts @@ -9,16 +9,16 @@ export interface WordPosition { } /** - * "Slide from left" positioning algorithm. + * "Slide from left" positioning using OCR word bounding boxes. * - * Takes ALL recognised words per cell and slides them left-to-right across - * the row's dark-pixel projection until each word "locks" onto its ink. + * If the backend provides `word_boxes` (exact per-word coordinates from + * Tesseract/RapidOCR), we place each word directly at its OCR position. + * This gives pixel-accurate overlay without any heuristic pixel scanning. * - * Font size: fontRatio = 1.0 for all tokens (matches fallback rendering). - * Token widths: derived from canvas measureText scaled to the rendered font - * size (medianCh * 0.7), ensuring visually correct proportions. + * Fallback: if no word_boxes, slide tokens across dark-pixel projection + * (original slide algorithm). * - * Guarantees: no words dropped, no complex matching rules needed. + * Font size: fontRatio = 1.0 for all (matches fallback rendering). */ export function useSlideWordPositions( imageUrl: string, @@ -37,6 +37,37 @@ export function useSlideWordPositions( const imgW = img.naturalWidth const imgH = img.naturalHeight + // Check if we can use word_boxes (fast path — no canvas needed) + const hasWordBoxes = cells.some(c => c.word_boxes && c.word_boxes.length > 0) + + if (hasWordBoxes) { + // --- FAST PATH: use OCR word bounding boxes directly --- + const positions = new Map() + + for (const cell of cells) { + if (!cell.bbox_pct || !cell.text) continue + const boxes = cell.word_boxes + if (!boxes || boxes.length === 0) continue + + const wordPos: WordPosition[] = boxes + .filter(wb => wb.text.trim()) + .map(wb => ({ + xPct: (wb.left / imgW) * 100, + wPct: (wb.width / imgW) * 100, + text: wb.text, + fontRatio: 1.0, + })) + + if (wordPos.length > 0) { + positions.set(cell.cell_id, wordPos) + } + } + + setResult(positions) + return + } + + // --- SLOW PATH: pixel-projection slide (fallback if no word_boxes) --- const canvas = document.createElement('canvas') canvas.width = imgW canvas.height = imgH @@ -56,7 +87,6 @@ export function useSlideWordPositions( const fontFam = "'Liberation Sans', Arial, sans-serif" ctx.font = `${refFontSize}px ${fontFam}` - // --- Global font scale from median cell height --- const cellHeights = cells .filter(c => c.bbox_pct && c.bbox_pct.h > 0) .map(c => Math.round(c.bbox_pct.h / 100 * imgH)) @@ -65,7 +95,6 @@ export function useSlideWordPositions( ? cellHeights[Math.floor(cellHeights.length / 2)] : 30 - // measureScale maps measureText pixels → image pixels at rendered font const renderedFontImgPx = medianCh * 0.7 const measureScale = renderedFontImgPx / refFontSize const spaceWidthPx = Math.max(2, Math.round(ctx.measureText(' ').width * measureScale)) @@ -91,7 +120,6 @@ export function useSlideWordPositions( if (cy < 0) cy = 0 if (cx + cw > imgW || cy + ch > imgH) continue - // --- Dark-pixel projection --- const imageData = ctx.getImageData(cx, cy, cw, ch) const proj = new Float32Array(cw) for (let y = 0; y < ch; y++) { @@ -111,23 +139,18 @@ export function useSlideWordPositions( ink.reverse() } - // --- Split into individual tokens --- const tokens = cell.text.split(/\s+/).filter(Boolean) if (tokens.length === 0) continue - // Token widths in image pixels (at rendered font size) const tokenWidthsPx = tokens.map(t => Math.max(4, Math.round(ctx.measureText(t).width * measureScale)) ) - // --- Slide each token left-to-right --- const wordPos: WordPosition[] = [] let cursor = 0 for (let ti = 0; ti < tokens.length; ti++) { const tokenW = tokenWidthsPx[ti] - - // Find first x from cursor where ≥15% of span has ink const coverageNeeded = Math.max(1, Math.round(tokenW * 0.15)) let bestX = cursor @@ -143,14 +166,12 @@ export function useSlideWordPositions( bestX = x break } - // Safety: don't scan more than 30% of cell width past cursor if (x > cursor + cw * 0.3 && ti > 0) { bestX = cursor break } } - // Clamp to cell bounds if (bestX + tokenW > cw) { bestX = Math.max(0, cw - tokenW) } @@ -162,7 +183,6 @@ export function useSlideWordPositions( fontRatio: 1.0, }) - // Advance cursor: past this token + space cursor = bestX + tokenW + spaceWidthPx } diff --git a/klausur-service/backend/cv_cell_grid.py b/klausur-service/backend/cv_cell_grid.py index 748c746..81343ec 100644 --- a/klausur-service/backend/cv_cell_grid.py +++ b/klausur-service/backend/cv_cell_grid.py @@ -221,6 +221,20 @@ def _ocr_cell_crop( sum(w['conf'] for w in psm7_words) / len(psm7_words), 1 ) used_engine = 'cell_crop_v2_psm7' + # Remap PSM7 word positions back to original image coords + if up_w != cw or up_h != ch: + sx = cw / max(up_w, 1) + sy = ch / max(up_h, 1) + for w in psm7_words: + w['left'] = int(w['left'] * sx) + cx + w['top'] = int(w['top'] * sy) + cy + w['width'] = int(w['width'] * sx) + w['height'] = int(w['height'] * sy) + else: + for w in psm7_words: + w['left'] += cx + w['top'] += cy + words = psm7_words # --- Noise filter --- if text.strip(): @@ -235,6 +249,23 @@ def _ocr_cell_crop( result['text'] = text result['confidence'] = avg_conf result['ocr_engine'] = used_engine + + # Store individual word bounding boxes (absolute image coordinates) + # for pixel-accurate overlay positioning in the frontend. + if words and text.strip(): + result['word_boxes'] = [ + { + 'text': w.get('text', ''), + 'left': w['left'], + 'top': w['top'], + 'width': w['width'], + 'height': w['height'], + 'conf': w.get('conf', 0), + } + for w in words + if w.get('text', '').strip() + ] + return result