From 06d63d18f910bd2e1006faead414ab8ff99c9df4 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 12 Mar 2026 16:19:19 +0100 Subject: [PATCH] fix: generic fuzzy text matching for overlay word-box positioning Replace sequential 1:1 token-to-box mapping with fuzzy text matching. Each token from cell.text finds its best matching word_box by text similarity (normalized prefix match + substring bonus). Handles: - Reordered boxes (different sort between text and boxes) - IPA corrections changing token boundaries - Token/box count mismatches Unmatched tokens get interpolated positions from matched neighbors. Co-Authored-By: Claude Opus 4.6 --- .../ocr-overlay/useSlideWordPositions.ts | 94 ++++++++++++------- 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts b/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts index 8457b2a..6b2246d 100644 --- a/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts +++ b/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts @@ -47,22 +47,20 @@ export function useSlideWordPositions( if (hasWordBoxes) { // --- WORD-BOX PATH: use OCR positions with cell.text tokens --- + // Uses fuzzy text matching to pair each token with its best box, + // handling reordering, IPA corrections, and token count mismatches. const positions = new Map() for (const cell of cells) { if (!cell.bbox_pct || !cell.text) continue - // Tokens from the CLEANED cell text (reading order) const tokens = cell.text.split(/\s+/).filter(Boolean) if (tokens.length === 0) continue - // Word boxes sorted left-to-right const boxes = (cell.word_boxes || []) .filter(wb => wb.text.trim()) - .sort((a, b) => a.left - b.left) if (boxes.length === 0) { - // No boxes — place all tokens at cell start as fallback const fallbackW = cell.bbox_pct.w / tokens.length const wordPos = tokens.map((t, i) => ({ xPct: cell.bbox_pct.x + i * fallbackW, @@ -74,47 +72,79 @@ export function useSlideWordPositions( continue } + // Match each token to its best box by text similarity. + // Normalize: lowercase, strip brackets/punctuation for comparison. + const norm = (s: string) => s.toLowerCase().replace(/[^a-z0-9äöüß]/g, '') + + const used = new Set() + const tokenBoxIdx: (number | null)[] = [] + + for (const token of tokens) { + const tn = norm(token) + let bestIdx = -1 + let bestScore = 0 + + for (let bi = 0; bi < boxes.length; bi++) { + if (used.has(bi)) continue + const bn = norm(boxes[bi].text) + // Score: length of common prefix / max length + let common = 0 + const minLen = Math.min(tn.length, bn.length) + for (let k = 0; k < minLen; k++) { + if (tn[k] === bn[k]) common++ + else break + } + // Also check if token is a substring of box text or vice versa + const containsBonus = (bn.includes(tn) || tn.includes(bn)) ? 0.5 : 0 + const score = (minLen > 0 ? common / Math.max(tn.length, bn.length) : 0) + containsBonus + if (score > bestScore) { + bestScore = score + bestIdx = bi + } + } + + if (bestIdx >= 0 && bestScore > 0.2) { + used.add(bestIdx) + tokenBoxIdx.push(bestIdx) + } else { + tokenBoxIdx.push(null) // no match + } + } + + // Build positions: matched tokens get box positions, + // unmatched tokens get interpolated between neighbors. const wordPos: WordPosition[] = [] - if (tokens.length <= boxes.length) { - // More boxes than tokens: assign each token to a box in order. - // This handles the common case where box count matches or - // exceeds token count (e.g. OCR found extra fragments). - for (let ti = 0; ti < tokens.length; ti++) { - const box = boxes[ti] + for (let ti = 0; ti < tokens.length; ti++) { + const bi = tokenBoxIdx[ti] + if (bi !== null) { + const box = boxes[bi] wordPos.push({ xPct: (box.left / imgW) * 100, wPct: (box.width / imgW) * 100, text: tokens[ti], fontRatio: 1.0, }) - } - } else { - // More tokens than boxes: assign boxes to first N tokens, - // then spread remaining tokens after the last box. - for (let ti = 0; ti < boxes.length; ti++) { - const box = boxes[ti] + } else { + // Interpolate: find nearest matched neighbor before/after + let prevPx = cell.bbox_pct.x / 100 * imgW + let prevW = 0 + for (let p = ti - 1; p >= 0; p--) { + if (tokenBoxIdx[p] !== null) { + const pb = boxes[tokenBoxIdx[p]!] + prevPx = pb.left + pb.width + 5 + prevW = pb.width + break + } + } + const estW = prevW > 0 ? prevW : (cell.bbox_pct.w / 100 * imgW / tokens.length) wordPos.push({ - xPct: (box.left / imgW) * 100, - wPct: (box.width / imgW) * 100, + xPct: (prevPx / imgW) * 100, + wPct: (estW / imgW) * 100, text: tokens[ti], fontRatio: 1.0, }) } - // Remaining tokens: estimate position after last box - const lastBox = boxes[boxes.length - 1] - let cursorPx = lastBox.left + lastBox.width + 5 - for (let ti = boxes.length; ti < tokens.length; ti++) { - // Estimate width from average box width - const avgW = boxes.reduce((s, b) => s + b.width, 0) / boxes.length - wordPos.push({ - xPct: (cursorPx / imgW) * 100, - wPct: (avgW / imgW) * 100, - text: tokens[ti], - fontRatio: 1.0, - }) - cursorPx += avgW + 5 - } } if (wordPos.length > 0) {