fix: OCR-Artefakte (|, >) vor Cluster-Matching zusammenfuehren

Box-Rahmen werden vom OCR als einzelne Symbole wie "|" oder ">" erkannt und als eigene Text-Gruppen behandelt. Das verfaelscht die Cluster-Zuordnung, weil diese Artefakte entweder keinen eigenen Cluster erzeugen oder den falschen Cluster zugewiesen bekommen. Fix: Gruppen mit maximal 2 Zeichen ohne Buchstaben/Ziffern werden mit der benachbarten Gruppe zusammengefuehrt, bevor die Cluster-Zuordnung laeuft.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 15:03:37 +01:00
parent 87efc1b4ba
commit d182d87f26
1 changed files with 22 additions and 1 deletions
@@ -59,7 +59,28 @@ export function usePixelWordPositions(
      for (const cell of cells) {
        if (!cell.bbox_pct || !cell.text) continue

-        const groups = cell.text.split(/\s{3,}/).map(s => s.trim()).filter(Boolean)
+        const rawGroups = cell.text.split(/\s{3,}/).map(s => s.trim()).filter(Boolean)
+
+        // Merge short symbol-only groups (at most 2 chars, no letters/digits) — OCR artifacts
+        // from box borders like "|", ">" — with their neighbour to avoid polluting the cluster-to-group matching
+        const groups: string[] = []
+        for (let gi = 0; gi < rawGroups.length; gi++) {
+          const g = rawGroups[gi]
+          const isArtifact = g.length <= 2 && !/[a-zA-Z0-9\u00C0-\u024F]/.test(g)
+          if (isArtifact) {
+            if (gi + 1 < rawGroups.length) {
+              // merge with next group
+              rawGroups[gi + 1] = g + '  ' + rawGroups[gi + 1]
+            } else if (groups.length > 0) {
+              // last group — merge with previous
+              groups[groups.length - 1] += '  ' + g
+            } else {
+              groups.push(g)
+            }
+          } else {
+            groups.push(g)
+          }
+        }

        let cx: number, cy: number
        const cw = Math.round(cell.bbox_pct.w / 100 * imgW)