fix: deduplicate overlapping OCR words and use per-word Y positions in overlay
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 33s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 24s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 33s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 24s
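Backend: Add a spatial overlap check to the Kombi merge (horizontal overlap >= 50% of the narrower word's width, i.e. an overlap coefficient rather than a true IoU) so words at the same position are deduplicated even when the OCR text differs. Frontend: Add yPct/hPct to WordPosition so the overlay positions each word from its own vertical offset and height instead of all words sharing the cell's vertical position; word positions built from OCR word boxes carry per-word values, while paths that only have cell-level geometry fill yPct/hPct from the cell's bbox. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>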
This commit is contained in:
@@ -513,9 +513,9 @@ export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructi
|
|||||||
className="absolute leading-none pointer-events-none select-none"
|
className="absolute leading-none pointer-events-none select-none"
|
||||||
style={{
|
style={{
|
||||||
left: `${wp.xPct}%`,
|
left: `${wp.xPct}%`,
|
||||||
top: `${bboxPct.y}%`,
|
top: `${wp.yPct}%`,
|
||||||
width: `${wp.wPct}%`,
|
width: `${wp.wPct}%`,
|
||||||
height: `${bboxPct.h}%`,
|
height: `${wp.hPct}%`,
|
||||||
fontSize: `${fs}px`,
|
fontSize: `${fs}px`,
|
||||||
fontWeight: globalBold ? 'bold' : 'normal',
|
fontWeight: globalBold ? 'bold' : 'normal',
|
||||||
fontFamily: "'Liberation Sans', Arial, sans-serif",
|
fontFamily: "'Liberation Sans', Arial, sans-serif",
|
||||||
@@ -534,9 +534,9 @@ export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructi
|
|||||||
return (
|
return (
|
||||||
<div key={`${cell.cellId}_wp_${i}`} className="absolute group" style={{
|
<div key={`${cell.cellId}_wp_${i}`} className="absolute group" style={{
|
||||||
left: `${wp.xPct}%`,
|
left: `${wp.xPct}%`,
|
||||||
top: `${bboxPct.y}%`,
|
top: `${wp.yPct}%`,
|
||||||
width: `${wp.wPct}%`,
|
width: `${wp.wPct}%`,
|
||||||
height: `${bboxPct.h}%`,
|
height: `${wp.hPct}%`,
|
||||||
}}>
|
}}>
|
||||||
<input
|
<input
|
||||||
id={`cell-${cell.cellId}`}
|
id={`cell-${cell.cellId}`}
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ import type { GridCell } from '@/app/(admin)/ai/ocr-overlay/types'
|
|||||||
export interface WordPosition {
|
export interface WordPosition {
|
||||||
xPct: number
|
xPct: number
|
||||||
wPct: number
|
wPct: number
|
||||||
|
yPct: number
|
||||||
|
hPct: number
|
||||||
text: string
|
text: string
|
||||||
fontRatio: number
|
fontRatio: number
|
||||||
}
|
}
|
||||||
@@ -192,6 +194,8 @@ export function usePixelWordPositions(
|
|||||||
wordPos.push({
|
wordPos.push({
|
||||||
xPct: cell.bbox_pct.x + (cl.start / cw) * cell.bbox_pct.w,
|
xPct: cell.bbox_pct.x + (cl.start / cw) * cell.bbox_pct.w,
|
||||||
wPct: ((cl.end - cl.start + 1) / cw) * cell.bbox_pct.w,
|
wPct: ((cl.end - cl.start + 1) / cw) * cell.bbox_pct.w,
|
||||||
|
yPct: cell.bbox_pct.y,
|
||||||
|
hPct: cell.bbox_pct.h,
|
||||||
text: groups[gi],
|
text: groups[gi],
|
||||||
fontRatio,
|
fontRatio,
|
||||||
})
|
})
|
||||||
@@ -209,6 +213,8 @@ export function usePixelWordPositions(
|
|||||||
wordPos.push({
|
wordPos.push({
|
||||||
xPct: cell.bbox_pct.x + (widest.start / cw) * cell.bbox_pct.w,
|
xPct: cell.bbox_pct.x + (widest.start / cw) * cell.bbox_pct.w,
|
||||||
wPct: ((widest.end - widest.start + 1) / cw) * cell.bbox_pct.w,
|
wPct: ((widest.end - widest.start + 1) / cw) * cell.bbox_pct.w,
|
||||||
|
yPct: cell.bbox_pct.y,
|
||||||
|
hPct: cell.bbox_pct.h,
|
||||||
text: cell.text.trim(),
|
text: cell.text.trim(),
|
||||||
fontRatio,
|
fontRatio,
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ import type { GridCell } from '@/app/(admin)/ai/ocr-overlay/types'
|
|||||||
export interface WordPosition {
|
export interface WordPosition {
|
||||||
xPct: number
|
xPct: number
|
||||||
wPct: number
|
wPct: number
|
||||||
|
yPct: number
|
||||||
|
hPct: number
|
||||||
text: string
|
text: string
|
||||||
fontRatio: number
|
fontRatio: number
|
||||||
}
|
}
|
||||||
@@ -66,6 +68,8 @@ export function useSlideWordPositions(
|
|||||||
const wordPos = tokens.map((t, i) => ({
|
const wordPos = tokens.map((t, i) => ({
|
||||||
xPct: cell.bbox_pct.x + i * fallbackW,
|
xPct: cell.bbox_pct.x + i * fallbackW,
|
||||||
wPct: fallbackW,
|
wPct: fallbackW,
|
||||||
|
yPct: cell.bbox_pct.y,
|
||||||
|
hPct: cell.bbox_pct.h,
|
||||||
text: t,
|
text: t,
|
||||||
fontRatio: 1.0,
|
fontRatio: 1.0,
|
||||||
}))
|
}))
|
||||||
@@ -77,6 +81,8 @@ export function useSlideWordPositions(
|
|||||||
const wordPos: WordPosition[] = boxes.map(box => ({
|
const wordPos: WordPosition[] = boxes.map(box => ({
|
||||||
xPct: (box.left / imgW) * 100,
|
xPct: (box.left / imgW) * 100,
|
||||||
wPct: (box.width / imgW) * 100,
|
wPct: (box.width / imgW) * 100,
|
||||||
|
yPct: (box.top / imgH) * 100,
|
||||||
|
hPct: (box.height / imgH) * 100,
|
||||||
text: box.text,
|
text: box.text,
|
||||||
fontRatio: 1.0,
|
fontRatio: 1.0,
|
||||||
}))
|
}))
|
||||||
@@ -202,6 +208,8 @@ export function useSlideWordPositions(
|
|||||||
wordPos.push({
|
wordPos.push({
|
||||||
xPct: cell.bbox_pct.x + (bestX / cw) * cell.bbox_pct.w,
|
xPct: cell.bbox_pct.x + (bestX / cw) * cell.bbox_pct.w,
|
||||||
wPct: (tokenW / cw) * cell.bbox_pct.w,
|
wPct: (tokenW / cw) * cell.bbox_pct.w,
|
||||||
|
yPct: cell.bbox_pct.y,
|
||||||
|
hPct: cell.bbox_pct.h,
|
||||||
text: tokens[ti],
|
text: tokens[ti],
|
||||||
fontRatio: 1.0,
|
fontRatio: 1.0,
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -2704,6 +2704,19 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
|
|||||||
# Same text or one contains the other
|
# Same text or one contains the other
|
||||||
is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))
|
is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))
|
||||||
|
|
||||||
|
# Spatial overlap check: if the horizontal overlap is >= 50% of the narrower word's width,
|
||||||
|
# they're the same physical word regardless of OCR text differences
|
||||||
|
if not is_same:
|
||||||
|
overlap_left = max(pw["left"], tw["left"])
|
||||||
|
overlap_right = min(
|
||||||
|
pw["left"] + pw.get("width", 0),
|
||||||
|
tw["left"] + tw.get("width", 0),
|
||||||
|
)
|
||||||
|
overlap_w = max(0, overlap_right - overlap_left)
|
||||||
|
min_w = min(pw.get("width", 1), tw.get("width", 1))
|
||||||
|
if min_w > 0 and overlap_w / min_w >= 0.5:
|
||||||
|
is_same = True
|
||||||
|
|
||||||
if is_same:
|
if is_same:
|
||||||
# Matched — average coordinates weighted by confidence
|
# Matched — average coordinates weighted by confidence
|
||||||
pc = pw.get("conf", 80)
|
pc = pw.get("conf", 80)
|
||||||
|
|||||||
@@ -410,6 +410,45 @@ class TestMergeRealWorldRegression:
|
|||||||
assert abs(be_word["top"] - take_word["top"]) > 30, "Rows should stay separate"
|
assert abs(be_word["top"] - take_word["top"]) > 30, "Rows should stay separate"
|
||||||
|
|
||||||
|
|
||||||
|
class TestSpatialOverlapDedup:
|
||||||
|
"""Test that words at the same position are deduplicated even if text differs."""
|
||||||
|
|
||||||
|
def test_same_position_different_text_deduplicated(self):
|
||||||
|
"""Both engines find same physical word but OCR text differs slightly.
|
||||||
|
Spatial overlap should catch this as a duplicate."""
|
||||||
|
paddle = [_word("hello", 100, 50, 80, 20, conf=90)]
|
||||||
|
tess = [_word("helo", 102, 52, 76, 18, conf=70)]
|
||||||
|
merged = _merge_row_sequences(paddle, tess)
|
||||||
|
assert len(merged) == 1, (
|
||||||
|
f"Expected 1 word (deduped by overlap), got {len(merged)}: "
|
||||||
|
f"{[w['text'] for w in merged]}"
|
||||||
|
)
|
||||||
|
# Paddle text preferred (higher confidence)
|
||||||
|
assert merged[0]["text"] == "hello"
|
||||||
|
|
||||||
|
def test_same_position_single_char_deduplicated(self):
|
||||||
|
"""Single-char words at same position should be deduplicated via overlap."""
|
||||||
|
paddle = [_word("a", 100, 50, 20, 20, conf=90)]
|
||||||
|
tess = [_word("a!", 101, 51, 22, 19, conf=60)]
|
||||||
|
merged = _merge_row_sequences(paddle, tess)
|
||||||
|
assert len(merged) == 1
|
||||||
|
|
||||||
|
def test_no_overlap_different_words_kept(self):
|
||||||
|
"""Different words at different positions should both be kept."""
|
||||||
|
paddle = [_word("cat", 100, 50, 50, 20, conf=90)]
|
||||||
|
tess = [_word("dog", 300, 50, 50, 20, conf=70)]
|
||||||
|
merged = _merge_row_sequences(paddle, tess)
|
||||||
|
assert len(merged) == 2
|
||||||
|
|
||||||
|
def test_partial_overlap_below_threshold_kept(self):
|
||||||
|
"""Words with < 50% overlap are different words and both kept."""
|
||||||
|
paddle = [_word("take", 100, 50, 60, 20, conf=90)]
|
||||||
|
tess = [_word("part", 145, 50, 60, 20, conf=70)]
|
||||||
|
merged = _merge_row_sequences(paddle, tess)
|
||||||
|
# 15px overlap / 60px min width = 25% < 50% → kept as separate
|
||||||
|
assert len(merged) == 2
|
||||||
|
|
||||||
|
|
||||||
class TestSplitThenMerge:
|
class TestSplitThenMerge:
|
||||||
"""Test the full pipeline: split multi-word Paddle boxes, then merge."""
|
"""Test the full pipeline: split multi-word Paddle boxes, then merge."""
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user