fix: deduplicate overlapping OCR words and use per-word Y positions in overlay

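Backend: Add a spatial overlap check to the Kombi merge: two words are treated as the same physical word when their horizontal overlap is >=50% of the narrower word's width (overlap / min width, not a true IoU), so words at the same position are deduplicated even when the OCR text differs. Frontend: Add yPct/hPct to WordPosition so each word renders at its own vertical position and height instead of every word in a cell sharing the cell's Y and height; code paths without per-word boxes still fill yPct/hPct from the cell bbox. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>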
2026-03-13 20:27:08 +01:00
parent 703e110bab
commit d6f51e4418
5 changed files with 70 additions and 4 deletions
@@ -513,9 +513,9 @@ export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructi
                        className="absolute leading-none pointer-events-none select-none"
                        style={{
                          left: `${wp.xPct}%`,
-                          top: `${bboxPct.y}%`,
+                          top: `${wp.yPct}%`,
                          width: `${wp.wPct}%`,
-                          height: `${bboxPct.h}%`,
+                          height: `${wp.hPct}%`,
                          fontSize: `${fs}px`,
                          fontWeight: globalBold ? 'bold' : 'normal',
                          fontFamily: "'Liberation Sans', Arial, sans-serif",
@@ -534,9 +534,9 @@ export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructi
                  return (
                    <div key={`${cell.cellId}_wp_${i}`} className="absolute group" style={{
                      left: `${wp.xPct}%`,
-                      top: `${bboxPct.y}%`,
+                      top: `${wp.yPct}%`,
                      width: `${wp.wPct}%`,
-                      height: `${bboxPct.h}%`,
+                      height: `${wp.hPct}%`,
                    }}>
                      <input
                        id={`cell-${cell.cellId}`}
@@ -4,6 +4,8 @@ import type { GridCell } from '@/app/(admin)/ai/ocr-overlay/types'
 /**
  * Placement and text of one word to be drawn over the page image.
  *
  * The four `*Pct` fields are expressed in the same percentage units as
  * `GridCell.bbox_pct`, from which this module derives them.
  */
 export interface WordPosition {
  /** Left edge of the word, as a percentage. */
  xPct: number
  /** Width of the word, as a percentage. */
  wPct: number
  /** Top edge of the word, as a percentage. */
  yPct: number
  /** Height of the word, as a percentage. */
  hPct: number
  /** Text rendered for this word. */
  text: string
  /** Font scale factor for this word. NOTE(review): derivation and consumer are not visible here — confirm. */
  fontRatio: number
 }
@@ -192,6 +194,8 @@ export function usePixelWordPositions(
            wordPos.push({
              xPct: cell.bbox_pct.x + (cl.start / cw) * cell.bbox_pct.w,
              wPct: ((cl.end - cl.start + 1) / cw) * cell.bbox_pct.w,
              yPct: cell.bbox_pct.y,
              hPct: cell.bbox_pct.h,
              text: groups[gi],
              fontRatio,
            })
@@ -209,6 +213,8 @@ export function usePixelWordPositions(
          wordPos.push({
            xPct: cell.bbox_pct.x + (widest.start / cw) * cell.bbox_pct.w,
            wPct: ((widest.end - widest.start + 1) / cw) * cell.bbox_pct.w,
            yPct: cell.bbox_pct.y,
            hPct: cell.bbox_pct.h,
            text: cell.text.trim(),
            fontRatio,
          })
@@ -4,6 +4,8 @@ import type { GridCell } from '@/app/(admin)/ai/ocr-overlay/types'
 /**
  * Placement and text of one word to be drawn over the page image.
  *
  * The four `*Pct` fields are percentages: this module fills them either from
  * a word box divided by the image size (times 100) or from `GridCell.bbox_pct`.
  */
 export interface WordPosition {
  /** Left edge of the word, as a percentage of the image width. */
  xPct: number
  /** Width of the word, as a percentage of the image width. */
  wPct: number
  /** Top edge of the word, as a percentage of the image height. */
  yPct: number
  /** Height of the word, as a percentage of the image height. */
  hPct: number
  /** Text rendered for this word. */
  text: string
  /** Font scale factor for this word. NOTE(review): only `1.0` is assigned in the visible code — confirm intent. */
  fontRatio: number
 }
@@ -66,6 +68,8 @@ export function useSlideWordPositions(
            const wordPos = tokens.map((t, i) => ({
              xPct: cell.bbox_pct.x + i * fallbackW,
              wPct: fallbackW,
              yPct: cell.bbox_pct.y,
              hPct: cell.bbox_pct.h,
              text: t,
              fontRatio: 1.0,
            }))
@@ -77,6 +81,8 @@ export function useSlideWordPositions(
          const wordPos: WordPosition[] = boxes.map(box => ({
            xPct: (box.left / imgW) * 100,
            wPct: (box.width / imgW) * 100,
            yPct: (box.top / imgH) * 100,
            hPct: (box.height / imgH) * 100,
            text: box.text,
            fontRatio: 1.0,
          }))
@@ -202,6 +208,8 @@ export function useSlideWordPositions(
          wordPos.push({
            xPct: cell.bbox_pct.x + (bestX / cw) * cell.bbox_pct.w,
            wPct: (tokenW / cw) * cell.bbox_pct.w,
            yPct: cell.bbox_pct.y,
            hPct: cell.bbox_pct.h,
            text: tokens[ti],
            fontRatio: 1.0,
          })
@@ -2704,6 +2704,19 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
        # Same text or one contains the other
        is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))
        # Spatial overlap check: if words overlap >= 50% horizontally,
        # they're the same physical word regardless of OCR text differences
        if not is_same:
            overlap_left = max(pw["left"], tw["left"])
            overlap_right = min(
                pw["left"] + pw.get("width", 0),
                tw["left"] + tw.get("width", 0),
            )
            overlap_w = max(0, overlap_right - overlap_left)
            min_w = min(pw.get("width", 1), tw.get("width", 1))
            if min_w > 0 and overlap_w / min_w >= 0.5:
                is_same = True
        if is_same:
            # Matched — average coordinates weighted by confidence
            pc = pw.get("conf", 80)
@@ -410,6 +410,45 @@ class TestMergeRealWorldRegression:
        assert abs(be_word["top"] - take_word["top"]) > 30, "Rows should stay separate"
 class TestSpatialOverlapDedup:
    """Merging must collapse two detections that occupy the same spot, even when their text differs.

    NOTE(review): ``_word`` is assumed to take ``(text, left, top, width, height, conf=...)``;
    the left/width positions follow from the overlap arithmetic below — confirm top/height.
    """

    def test_same_position_different_text_deduplicated(self):
        """Near-identical boxes with slightly different spellings yield a single merged word."""
        merged = _merge_row_sequences(
            paddle_row=[_word("hello", 100, 50, 80, 20, conf=90)],
            tess_row=[_word("helo", 102, 52, 76, 18, conf=70)],
        )
        assert len(merged) == 1, (
            f"Expected 1 word (deduped by overlap), got {len(merged)}: "
            f"{[w['text'] for w in merged]}"
        )
        # The higher-confidence Paddle reading is expected to win.
        (survivor,) = merged
        assert survivor["text"] == "hello"

    def test_same_position_single_char_deduplicated(self):
        """A one-character word and a noisy variant at the same spot are merged by overlap."""
        merged = _merge_row_sequences(
            paddle_row=[_word("a", 100, 50, 20, 20, conf=90)],
            tess_row=[_word("a!", 101, 51, 22, 19, conf=60)],
        )
        assert len(merged) == 1

    def test_no_overlap_different_words_kept(self):
        """Two unrelated words far apart on the row both survive the merge."""
        merged = _merge_row_sequences(
            paddle_row=[_word("cat", 100, 50, 50, 20, conf=90)],
            tess_row=[_word("dog", 300, 50, 50, 20, conf=70)],
        )
        assert len(merged) == 2

    def test_partial_overlap_below_threshold_kept(self):
        """Boxes sharing less than half of the narrower width stay separate words."""
        merged = _merge_row_sequences(
            paddle_row=[_word("take", 100, 50, 60, 20, conf=90)],
            tess_row=[_word("part", 145, 50, 60, 20, conf=70)],
        )
        # Paddle spans 100-160 and Tesseract starts at 145: 15px shared / 60px width = 25% < 50%.
        assert len(merged) == 2
 class TestSplitThenMerge:
    """Test the full pipeline: split multi-word Paddle boxes, then merge."""