From b81baa1d1671353016c475b898b6ef2566b6db3d Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 11 Mar 2026 16:51:55 +0100 Subject: [PATCH] fix: Slide-Modus globale Schriftgroesse statt per-Token Scale Schriftgroesse wird jetzt GLOBAL aus der medianen Zellhoehe berechnet (65% der Zellhoehe als Ziel-Font). Alle Tokens bekommen dieselbe konsistente Groesse. Die Slide-Logik bestimmt nur noch die x-Position. Vorher: Scale pro Zelle aus Ink-Span/Textbreite -> inkonsistente Groessen. Co-Authored-By: Claude Opus 4.6 --- .../ocr-overlay/useSlideWordPositions.ts | 162 ++++++++---------- 1 file changed, 67 insertions(+), 95 deletions(-) diff --git a/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts b/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts index d7f060a..f05e502 100644 --- a/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts +++ b/admin-lehrer/components/ocr-overlay/useSlideWordPositions.ts @@ -9,29 +9,25 @@ export interface WordPosition { } /** - * Alternative positioning algorithm: "slide from left". + * "Slide from left" positioning algorithm. * - * Instead of matching text groups to pixel clusters (which can lose words), - * this algorithm takes ALL recognised words and slides them left-to-right - * across the row's dark-pixel projection until each word "locks" onto its - * ink coverage. + * Takes ALL recognised words per cell and slides them left-to-right across + * the row's dark-pixel projection until each word "locks" onto its ink. + * + * Key design: font size is determined GLOBALLY (median cell height), + * NOT per-token. The slide only determines the x-position. Token width + * is derived from the global font size + canvas measureText, ensuring + * consistent sizing across all cells. * * Algorithm per cell: - * 1. Build horizontal dark-pixel projection (same as cluster approach). - * 2. Split the cell text into individual tokens (words/symbols). - * 3. Measure each token's expected pixel width (canvas measureText). - * 4. Slide a cursor from x=0 rightward. For each token, find the first - * x position where the projection has enough dark pixels under the - * token's width span (≥ coverageThreshold of the span is "inked"). - * 5. Lock the token at that x, advance cursor past it + a small gap. + * 1. Build horizontal dark-pixel projection. + * 2. Find dark-pixel clusters (contiguous inked regions). + * 3. Split cell text into tokens. + * 4. Compute a global scale: median cell height → reference font → pixel widths. + * 5. For each token, slide from cursor position until ink coverage is found. + * 6. Place token at that x with width from measureText * globalScale. * - * This guarantees: - * - ALL words appear (nothing is dropped) - * - Original spacing is roughly preserved (words land on their ink) - * - Box borders/lines are naturally covered by "|" / "l" tokens - * - No complex cluster-matching or artifact-merging rules needed - * - * Returns Map. + * Guarantees: no words dropped, no complex matching rules needed. */ export function useSlideWordPositions( imageUrl: string, @@ -69,12 +65,34 @@ export function useSlideWordPositions( const fontFam = "'Liberation Sans', Arial, sans-serif" ctx.font = `${refFontSize}px ${fontFam}` + // --- Compute a GLOBAL scale from median cell height --- + // This ensures all tokens across all cells get the same font size. + const cellHeights = cells + .filter(c => c.bbox_pct && c.bbox_pct.h > 0) + .map(c => Math.round(c.bbox_pct.h / 100 * imgH)) + .sort((a, b) => a - b) + const medianCh = cellHeights.length > 0 + ? cellHeights[Math.floor(cellHeights.length / 2)] + : 30 + + // Target font size in image pixels = fraction of median cell height. + // Typical printed text fills ~60-70% of the row height. + const targetFontPx = medianCh * 0.65 + // globalScale maps measureText pixels (at refFontSize) → image pixels + const globalScale = targetFontPx / refFontSize + // fontRatio for the renderer (medianCellHeightPx * fontRatio * fontScale = fontSize) + // We want autoFontPx = targetFontPx, renderer does medianCh * fontRatio * fontScale + // with fontScale=0.7 default → fontRatio = targetFontPx / (medianCh * 0.7) + // But we don't know fontScale here. So just set fontRatio = targetFontPx / medianCh + // and let the user's fontScale slider adjust. + const globalFontRatio = Math.min(targetFontPx / medianCh, 1.0) + const positions = new Map() for (const cell of cells) { if (!cell.bbox_pct || !cell.text) continue - // --- Get cell rectangle in image pixels --- + // --- Cell rectangle in image pixels --- let cx: number, cy: number const cw = Math.round(cell.bbox_pct.w / 100 * imgW) const ch = Math.round(cell.bbox_pct.h / 100 * imgH) @@ -91,7 +109,7 @@ export function useSlideWordPositions( if (cy < 0) cy = 0 if (cx + cw > imgW || cy + ch > imgH) continue - // --- Build dark-pixel projection --- + // --- Dark-pixel projection --- const imageData = ctx.getImageData(cx, cy, cw, ch) const proj = new Float32Array(cw) for (let y = 0; y < ch; y++) { @@ -102,98 +120,73 @@ export function useSlideWordPositions( } } - // Dark pixel threshold per column (minimum to count as "inked") const threshold = Math.max(1, ch * 0.03) - // Build binary ink mask: true if column has enough dark pixels + // Binary ink mask const ink = new Uint8Array(cw) for (let x = 0; x < cw; x++) { ink[x] = proj[x] >= threshold ? 1 : 0 } - // For 180° rotation, flip the ink mask if (rotation === 180) { ink.reverse() } - // --- Split text into tokens --- - // Use triple-space groups first (preserving OCR column separation), - // then split each group into individual words for fine positioning. + // --- Tokens --- const tokens = cell.text.split(/\s+/).filter(Boolean) if (tokens.length === 0) continue - // Measure each token's width in pixels (at reference font size) - const tokenWidths = tokens.map(t => ctx.measureText(t).width) + // Token widths in image pixels (using global scale) + const tokenWidthsPx = tokens.map(t => Math.round(ctx.measureText(t).width * globalScale)) + const spaceWidthPx = Math.round(ctx.measureText(' ').width * globalScale) - // Total measured width of all tokens + inter-word spaces - const spaceWidth = ctx.measureText(' ').width - const totalTextW = tokenWidths.reduce((a, b) => a + b, 0) + (tokens.length - 1) * spaceWidth - - // Scale factor: map measured text width → pixel width on image. - // Use the total INK SPAN (first dark pixel to last dark pixel), - // not the count of dark columns. Text characters have gaps between - // strokes, so counting only dark pixels gives a much-too-small scale. - let firstInk = -1, lastInk = -1 - for (let x = 0; x < cw; x++) { - if (ink[x]) { - if (firstInk < 0) firstInk = x - lastInk = x - } - } - - // If almost no ink, skip - if (firstInk < 0 || lastInk <= firstInk) continue - - const inkSpan = lastInk - firstInk + 1 - const scale = inkSpan / totalTextW - - // --- Slide each token from left to right --- + // --- Slide each token left-to-right --- const wordPos: WordPosition[] = [] - let cursor = 0 // current search position in cell pixels - const minGapPx = Math.max(2, Math.round(cw * 0.005)) // minimum gap between tokens + let cursor = 0 for (let ti = 0; ti < tokens.length; ti++) { - const tokenW = Math.round(tokenWidths[ti] * scale) - if (tokenW <= 0) continue + const tokenW = Math.max(1, tokenWidthsPx[ti]) - // Find first position from cursor where the token has enough ink coverage. - // "Enough" = at least 15% of the token's width has ink underneath. + // Find first x from cursor where ≥15% of span has ink const coverageNeeded = Math.max(1, Math.round(tokenW * 0.15)) let bestX = cursor - for (let x = cursor; x <= cw - tokenW; x++) { + // Don't search beyond cell width + const searchLimit = Math.min(cw - 1, cw - tokenW) + + for (let x = cursor; x <= searchLimit; x++) { let inkCount = 0 - for (let dx = 0; dx < tokenW; dx++) { + const end = Math.min(x + tokenW, cw) + for (let dx = 0; dx < end - x; dx++) { inkCount += ink[x + dx] } if (inkCount >= coverageNeeded) { bestX = x break } - // If we've scanned way past where ink should be, just use cursor - if (x > cursor + cw * 0.3 && ti > 0) { + // Safety: don't scan more than 40% of cell width past cursor + // to avoid tokens jumping far right when there's a large gap + if (x > cursor + cw * 0.4 && ti > 0) { bestX = cursor break } } - // Compute font size from token width vs measured width - const autoFontPx = refFontSize * (tokenW / tokenWidths[ti]) - const fontRatio = Math.min(autoFontPx / ch, 1.0) - - // Convert pixel position to percentage within cell, then to image % - const xInCellPct = bestX / cw - const wInCellPct = tokenW / cw + // Clamp to cell bounds + if (bestX + tokenW > cw) { + bestX = Math.max(0, cw - tokenW) + } + // Convert to percentage wordPos.push({ - xPct: cell.bbox_pct.x + xInCellPct * cell.bbox_pct.w, - wPct: wInCellPct * cell.bbox_pct.w, + xPct: cell.bbox_pct.x + (bestX / cw) * cell.bbox_pct.w, + wPct: (tokenW / cw) * cell.bbox_pct.w, text: tokens[ti], - fontRatio, + fontRatio: globalFontRatio, }) - // Advance cursor past this token + gap - cursor = bestX + tokenW + minGapPx + // Advance cursor: past this token + space + cursor = bestX + tokenW + spaceWidthPx } if (wordPos.length > 0) { @@ -201,27 +194,6 @@ export function useSlideWordPositions( } } - // Normalise font: use mode fontRatio for all words - const allRatios: number[] = [] - for (const wps of positions.values()) { - for (const wp of wps) allRatios.push(wp.fontRatio) - } - if (allRatios.length > 0) { - const buckets = new Map() - for (const r of allRatios) { - const key = Math.round(r * 50) / 50 - buckets.set(key, (buckets.get(key) || 0) + 1) - } - let modeRatio = allRatios[0] - let modeCount = 0 - for (const [ratio, count] of buckets) { - if (count > modeCount) { modeRatio = ratio; modeCount = count } - } - for (const wps of positions.values()) { - for (const wp of wps) wp.fontRatio = modeRatio - } - } - setResult(positions) } img.src = imageUrl