diff --git a/admin-lehrer/components/ocr-overlay/OverlayReconstruction.tsx b/admin-lehrer/components/ocr-overlay/OverlayReconstruction.tsx
index 7df5507..5790852 100644
--- a/admin-lehrer/components/ocr-overlay/OverlayReconstruction.tsx
+++ b/admin-lehrer/components/ocr-overlay/OverlayReconstruction.tsx
@@ -513,9 +513,9 @@ export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructi
className="absolute leading-none pointer-events-none select-none"
style={{
left: `${wp.xPct}%`,
- top: `${bboxPct.y}%`,
+ top: `${wp.yPct}%`,
width: `${wp.wPct}%`,
- height: `${bboxPct.h}%`,
+ height: `${wp.hPct}%`,
fontSize: `${fs}px`,
fontWeight: globalBold ? 'bold' : 'normal',
fontFamily: "'Liberation Sans', Arial, sans-serif",
@@ -534,9 +534,9 @@ export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructi
return (
({
xPct: cell.bbox_pct.x + i * fallbackW,
wPct: fallbackW,
+ yPct: cell.bbox_pct.y,
+ hPct: cell.bbox_pct.h,
text: t,
fontRatio: 1.0,
}))
@@ -77,6 +81,8 @@ export function useSlideWordPositions(
const wordPos: WordPosition[] = boxes.map(box => ({
xPct: (box.left / imgW) * 100,
wPct: (box.width / imgW) * 100,
+ yPct: (box.top / imgH) * 100,
+ hPct: (box.height / imgH) * 100,
text: box.text,
fontRatio: 1.0,
}))
@@ -202,6 +208,8 @@ export function useSlideWordPositions(
wordPos.push({
xPct: cell.bbox_pct.x + (bestX / cw) * cell.bbox_pct.w,
wPct: (tokenW / cw) * cell.bbox_pct.w,
+ yPct: cell.bbox_pct.y,
+ hPct: cell.bbox_pct.h,
text: tokens[ti],
fontRatio: 1.0,
})
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index ab76609..4914718 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -2704,6 +2704,19 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
# Same text or one contains the other
is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))
+ # Spatial overlap check: if the horizontal overlap covers >= 50% of the
+ # narrower word's width, it is the same physical word regardless of OCR text differences
+ if not is_same:
+ overlap_left = max(pw["left"], tw["left"])
+ overlap_right = min(
+ pw["left"] + pw.get("width", 0),
+ tw["left"] + tw.get("width", 0),
+ )
+ overlap_w = max(0, overlap_right - overlap_left)
+ min_w = min(pw.get("width", 1), tw.get("width", 1))
+ if min_w > 0 and overlap_w / min_w >= 0.5:
+ is_same = True
+
if is_same:
# Matched — average coordinates weighted by confidence
pc = pw.get("conf", 80)
diff --git a/klausur-service/backend/tests/test_paddle_kombi.py b/klausur-service/backend/tests/test_paddle_kombi.py
index 612032a..65371ce 100644
--- a/klausur-service/backend/tests/test_paddle_kombi.py
+++ b/klausur-service/backend/tests/test_paddle_kombi.py
@@ -410,6 +410,45 @@ class TestMergeRealWorldRegression:
assert abs(be_word["top"] - take_word["top"]) > 30, "Rows should stay separate"
+class TestSpatialOverlapDedup:
+ """Test that words at the same position are deduplicated even if text differs."""
+
+ def test_same_position_different_text_deduplicated(self):
+ """Both engines find same physical word but OCR text differs slightly.
+ Spatial overlap should catch this as a duplicate."""
+ paddle = [_word("hello", 100, 50, 80, 20, conf=90)]
+ tess = [_word("helo", 102, 52, 76, 18, conf=70)]
+ merged = _merge_row_sequences(paddle, tess)
+ assert len(merged) == 1, (
+ f"Expected 1 word (deduped by overlap), got {len(merged)}: "
+ f"{[w['text'] for w in merged]}"
+ )
+ # Paddle text preferred (higher confidence)
+ assert merged[0]["text"] == "hello"
+
+ def test_same_position_single_char_deduplicated(self):
+ """A single-char word and a differing OCR reading at the same position should be deduplicated via overlap."""
+ paddle = [_word("a", 100, 50, 20, 20, conf=90)]
+ tess = [_word("a!", 101, 51, 22, 19, conf=60)]
+ merged = _merge_row_sequences(paddle, tess)
+ assert len(merged) == 1
+
+ def test_no_overlap_different_words_kept(self):
+ """Different words at different positions should both be kept."""
+ paddle = [_word("cat", 100, 50, 50, 20, conf=90)]
+ tess = [_word("dog", 300, 50, 50, 20, conf=70)]
+ merged = _merge_row_sequences(paddle, tess)
+ assert len(merged) == 2
+
+ def test_partial_overlap_below_threshold_kept(self):
+ """Words with < 50% overlap are different words and both kept."""
+ paddle = [_word("take", 100, 50, 60, 20, conf=90)]
+ tess = [_word("part", 145, 50, 60, 20, conf=70)]
+ merged = _merge_row_sequences(paddle, tess)
+ # Spans 100-160 and 145-205: 15px overlap / 60px min width = 25% < 50% → kept as separate
+ assert len(merged) == 2
+
+
class TestSplitThenMerge:
"""Test the full pipeline: split multi-word Paddle boxes, then merge."""