diff --git a/admin-lehrer/components/ocr-overlay/OverlayReconstruction.tsx b/admin-lehrer/components/ocr-overlay/OverlayReconstruction.tsx index 7df5507..5790852 100644 --- a/admin-lehrer/components/ocr-overlay/OverlayReconstruction.tsx +++ b/admin-lehrer/components/ocr-overlay/OverlayReconstruction.tsx @@ -513,9 +513,9 @@ export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructi className="absolute leading-none pointer-events-none select-none" style={{ left: `${wp.xPct}%`, - top: `${bboxPct.y}%`, + top: `${wp.yPct}%`, width: `${wp.wPct}%`, - height: `${bboxPct.h}%`, + height: `${wp.hPct}%`, fontSize: `${fs}px`, fontWeight: globalBold ? 'bold' : 'normal', fontFamily: "'Liberation Sans', Arial, sans-serif", @@ -534,9 +534,9 @@ export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructi return (
({ xPct: cell.bbox_pct.x + i * fallbackW, wPct: fallbackW, + yPct: cell.bbox_pct.y, + hPct: cell.bbox_pct.h, text: t, fontRatio: 1.0, })) @@ -77,6 +81,8 @@ export function useSlideWordPositions( const wordPos: WordPosition[] = boxes.map(box => ({ xPct: (box.left / imgW) * 100, wPct: (box.width / imgW) * 100, + yPct: (box.top / imgH) * 100, + hPct: (box.height / imgH) * 100, text: box.text, fontRatio: 1.0, })) @@ -202,6 +208,8 @@ export function useSlideWordPositions( wordPos.push({ xPct: cell.bbox_pct.x + (bestX / cw) * cell.bbox_pct.w, wPct: (tokenW / cw) * cell.bbox_pct.w, + yPct: cell.bbox_pct.y, + hPct: cell.bbox_pct.h, text: tokens[ti], fontRatio: 1.0, }) diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index ab76609..4914718 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -2704,6 +2704,19 @@ def _merge_row_sequences(paddle_row: list, tess_row: list) -> list: # Same text or one contains the other is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt)) + # Spatial overlap check: if words overlap >= 50% horizontally, + # they're the same physical word regardless of OCR text differences + if not is_same: + overlap_left = max(pw["left"], tw["left"]) + overlap_right = min( + pw["left"] + pw.get("width", 0), + tw["left"] + tw.get("width", 0), + ) + overlap_w = max(0, overlap_right - overlap_left) + min_w = min(pw.get("width", 1), tw.get("width", 1)) + if min_w > 0 and overlap_w / min_w >= 0.5: + is_same = True + if is_same: # Matched — average coordinates weighted by confidence pc = pw.get("conf", 80) diff --git a/klausur-service/backend/tests/test_paddle_kombi.py b/klausur-service/backend/tests/test_paddle_kombi.py index 612032a..65371ce 100644 --- a/klausur-service/backend/tests/test_paddle_kombi.py +++ b/klausur-service/backend/tests/test_paddle_kombi.py @@ -410,6 +410,45 @@ class TestMergeRealWorldRegression: assert abs(be_word["top"] - take_word["top"]) > 30, "Rows should stay separate" +class TestSpatialOverlapDedup: + """Test that words at the same position are deduplicated even if text differs.""" + + def test_same_position_different_text_deduplicated(self): + """Both engines find same physical word but OCR text differs slightly. + Spatial overlap should catch this as a duplicate.""" + paddle = [_word("hello", 100, 50, 80, 20, conf=90)] + tess = [_word("helo", 102, 52, 76, 18, conf=70)] + merged = _merge_row_sequences(paddle, tess) + assert len(merged) == 1, ( + f"Expected 1 word (deduped by overlap), got {len(merged)}: " + f"{[w['text'] for w in merged]}" + ) + # Paddle text preferred (higher confidence) + assert merged[0]["text"] == "hello" + + def test_same_position_single_char_deduplicated(self): + """Single-char words at same position should be deduplicated via overlap.""" + paddle = [_word("a", 100, 50, 20, 20, conf=90)] + tess = [_word("a!", 101, 51, 22, 19, conf=60)] + merged = _merge_row_sequences(paddle, tess) + assert len(merged) == 1 + + def test_no_overlap_different_words_kept(self): + """Different words at different positions should both be kept.""" + paddle = [_word("cat", 100, 50, 50, 20, conf=90)] + tess = [_word("dog", 300, 50, 50, 20, conf=70)] + merged = _merge_row_sequences(paddle, tess) + assert len(merged) == 2 + + def test_partial_overlap_below_threshold_kept(self): + """Words with < 50% overlap are different words and both kept.""" + paddle = [_word("take", 100, 50, 60, 20, conf=90)] + tess = [_word("part", 145, 50, 60, 20, conf=70)] + merged = _merge_row_sequences(paddle, tess) + # 15px overlap / 60px min width = 25% < 50% → kept as separate + assert len(merged) == 2 + + class TestSplitThenMerge: """Test the full pipeline: split multi-word Paddle boxes, then merge."""