diff --git a/admin-lehrer/components/ocr-overlay/usePixelWordPositions.ts b/admin-lehrer/components/ocr-overlay/usePixelWordPositions.ts index b7b1b27..b4872b9 100644 --- a/admin-lehrer/components/ocr-overlay/usePixelWordPositions.ts +++ b/admin-lehrer/components/ocr-overlay/usePixelWordPositions.ts @@ -59,7 +59,28 @@ export function usePixelWordPositions( for (const cell of cells) { if (!cell.bbox_pct || !cell.text) continue - const groups = cell.text.split(/\s{3,}/).map(s => s.trim()).filter(Boolean) + const rawGroups = cell.text.split(/\s{3,}/).map(s => s.trim()).filter(Boolean) + + // Merge single-char symbol groups (OCR artifacts from box borders like "|", ">") + // with their neighbour to avoid polluting the cluster-to-group matching + const groups: string[] = [] + for (let gi = 0; gi < rawGroups.length; gi++) { + const g = rawGroups[gi] + const isArtifact = g.length <= 2 && !/[a-zA-Z0-9\u00C0-\u024F]/.test(g) + if (isArtifact) { + if (gi + 1 < rawGroups.length) { + // merge with next group + rawGroups[gi + 1] = g + ' ' + rawGroups[gi + 1] + } else if (groups.length > 0) { + // last group — merge with previous + groups[groups.length - 1] += ' ' + g + } else { + groups.push(g) + } + } else { + groups.push(g) + } + } let cx: number, cy: number const cw = Math.round(cell.bbox_pct.w / 100 * imgW)