# breakpilot-lehrer/klausur-service/backend/tests/test_paddle_kombi.py

"""Tests for the Kombi-Modus merge algorithm.

Functions under test (ocr_pipeline_api.py):
- _box_iou: IoU between two word boxes
- _box_center_dist: Euclidean distance between box centers
- _text_similarity: Simple text similarity (0-1)
- _words_match: Multi-criteria match (IoU + center + text)
- _merge_paddle_tesseract: Merge PaddleOCR + Tesseract word lists
"""

import pytest
import sys
import os

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from ocr_pipeline_api import (
    _box_iou,
    _box_center_dist,
    _text_similarity,
    _words_match,
    _merge_paddle_tesseract,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80):
    """Build a fake OCR word record to feed into the functions under test.

    The returned dict holds the given values under the keys ``text``,
    ``left``, ``top``, ``width``, ``height`` and ``conf``, in that order.
    ``width``, ``height`` and ``conf`` default to 60, 20 and 80.
    """
    field_names = ("text", "left", "top", "width", "height", "conf")
    field_values = (text, left, top, width, height, conf)
    return dict(zip(field_names, field_values))


# ---------------------------------------------------------------------------
# _box_iou
# ---------------------------------------------------------------------------

class TestBoxIoU:
    """Checks ``_box_iou`` against hand-computed intersection-over-union values."""

    def test_identical_boxes(self):
        """A box compared with itself gives an IoU of 1."""
        box = _word("hello", 10, 10, 100, 20)
        iou = _box_iou(box, box)
        assert iou == pytest.approx(1.0)

    def test_no_overlap(self):
        """Two boxes that are far apart give exactly 0."""
        first = _word("a", 0, 0, 50, 20)
        second = _word("b", 200, 200, 50, 20)
        iou = _box_iou(first, second)
        assert iou == 0.0

    def test_partial_overlap(self):
        """Two 100x20 boxes shifted by 50 px share a 50x20 strip."""
        first = _word("a", 0, 0, 100, 20)
        second = _word("b", 50, 0, 100, 20)
        shared_area = 1000  # 50 * 20
        union_area = 3000  # 2000 + 2000 - 1000
        iou = _box_iou(first, second)
        assert iou == pytest.approx(shared_area / union_area, abs=0.01)

    def test_contained_box(self):
        """A 30x10 box fully inside a 200x40 box: IoU is inner area / outer area."""
        outer = _word("big", 0, 0, 200, 40)
        inner = _word("small", 50, 10, 30, 10)
        inner_area = 300  # 30 * 10
        outer_area = 8000  # 200 * 40
        iou = _box_iou(outer, inner)
        assert iou == pytest.approx(inner_area / outer_area, abs=0.01)

    def test_touching_edges(self):
        """Boxes that only share a vertical edge give exactly 0."""
        left_box = _word("a", 0, 0, 50, 20)
        right_box = _word("b", 50, 0, 50, 20)
        iou = _box_iou(left_box, right_box)
        assert iou == 0.0

    def test_zero_area_box(self):
        """A box with zero width and height gives exactly 0 against a real box."""
        degenerate = _word("a", 10, 10, 0, 0)
        regular = _word("b", 10, 10, 50, 20)
        iou = _box_iou(degenerate, regular)
        assert iou == 0.0


# ---------------------------------------------------------------------------
# _box_center_dist
# ---------------------------------------------------------------------------

class TestBoxCenterDist:
    """Checks ``_box_center_dist`` for zero, horizontal and diagonal offsets."""

    def test_same_center(self):
        """A box measured against itself is at distance 0."""
        box = _word("a", 100, 50, 60, 20)
        distance = _box_center_dist(box, box)
        assert distance == 0.0

    def test_horizontal_offset(self):
        """Equal-sized boxes shifted 10 px to the right are 10 apart."""
        origin = _word("a", 100, 50, 60, 20)
        shifted = _word("b", 110, 50, 60, 20)
        distance = _box_center_dist(origin, shifted)
        assert distance == pytest.approx(10.0)

    def test_diagonal(self):
        """Centers at (10, 10) and (30, 30) are 20 apart on each axis."""
        upper_left = _word("a", 0, 0, 20, 20)
        lower_right = _word("b", 20, 20, 20, 20)
        delta = 20
        hypotenuse = (delta**2 + delta**2) ** 0.5
        distance = _box_center_dist(upper_left, lower_right)
        assert distance == pytest.approx(hypotenuse, abs=0.1)


# ---------------------------------------------------------------------------
# _text_similarity
# ---------------------------------------------------------------------------

class TestTextSimilarity:
    """Behaviour checks for ``_text_similarity``.

    Each test pins one pair of input strings to the score expected from
    the function, which the module docstring describes as a simple 0-1
    text similarity.
    """

    def test_identical(self):
        """Equal strings score 1.0."""
        assert _text_similarity("hello", "hello") == 1.0

    def test_case_insensitive(self):
        """Strings differing only in letter case score 1.0."""
        assert _text_similarity("Hello", "hello") == 1.0

    def test_substring(self):
        """One is substring of other (e.g. '!Betonung' vs 'Betonung')."""
        # 0.8 has no exact binary representation, so compare with a
        # tolerance (as the IoU and center-distance tests in this file do)
        # instead of relying on exact float equality.
        assert _text_similarity("!Betonung", "Betonung") == pytest.approx(0.8)

    def test_completely_different(self):
        """Strings with no characters in common score 0.0."""
        assert _text_similarity("abc", "xyz") == 0.0

    def test_empty_strings(self):
        """An empty operand scores 0.0, even when both are empty."""
        assert _text_similarity("", "hello") == 0.0
        assert _text_similarity("", "") == 0.0

    def test_partial_overlap(self):
        """Some shared characters give a score strictly between 0 and 1."""
        sim = _text_similarity("apple", "ape")
        assert 0.0 < sim < 1.0


# ---------------------------------------------------------------------------
# _words_match
# ---------------------------------------------------------------------------

class TestWordsMatch:
    """Checks the boolean verdict of ``_words_match`` on pairs of word boxes."""

    def test_high_iou_matches(self):
        """Same-sized boxes shifted by only 5 px overlap heavily and match."""
        first = _word("hello", 100, 50, 80, 20)
        second = _word("hello", 105, 50, 80, 20)
        verdict = _words_match(first, second)
        assert verdict is True

    def test_same_text_same_row_matches(self):
        """Identical text on the same row matches despite a horizontal shift."""
        first = _word("Betonung", 100, 50, 80, 20)
        # Moved 30 px right and 2 px down, slightly smaller box.
        second = _word("Betonung", 130, 52, 70, 18)
        verdict = _words_match(first, second)
        assert verdict is True

    def test_close_centers_same_row_matches(self):
        """Different text still matches when the centers are close on one row."""
        first = _word("x", 100, 50, 40, 20)
        # Nearby box on the same row.
        second = _word("y", 110, 52, 50, 22)
        verdict = _words_match(first, second)
        assert verdict is True

    def test_different_rows_no_match(self):
        """Identical text 150 px further down the page is not a match."""
        upper = _word("hello", 100, 50, 80, 20)
        lower = _word("hello", 100, 200, 80, 20)
        verdict = _words_match(upper, lower)
        assert verdict is False

    def test_far_apart_same_row_different_text(self):
        """Different text far apart on the same row is not a match."""
        near = _word("cat", 10, 50, 40, 20)
        far = _word("dog", 400, 50, 40, 20)
        verdict = _words_match(near, far)
        assert verdict is False

    def test_no_overlap_no_proximity_no_text(self):
        """Unrelated words far apart in both directions are not a match."""
        first = _word("abc", 0, 0, 50, 20)
        second = _word("xyz", 500, 500, 50, 20)
        verdict = _words_match(first, second)
        assert verdict is False


# ---------------------------------------------------------------------------
# _merge_paddle_tesseract
# ---------------------------------------------------------------------------

class TestMergePaddleTesseract:
    """Checks ``_merge_paddle_tesseract`` on small hand-built word lists.

    In every test ``paddle`` is the first argument (PaddleOCR words) and
    ``tesseract`` the second (Tesseract words).
    """

    def test_perfect_match_averages_coords(self):
        """A matched pair becomes one word with a confidence-weighted ``left``."""
        paddle = [_word("hello", 100, 50, 80, 20, conf=90)]
        tesseract = [_word("hello", 110, 55, 70, 18, conf=60)]
        result = _merge_paddle_tesseract(paddle, tesseract)
        assert len(result) == 1
        word = result[0]
        assert word["text"] == "hello"
        # Weighted mean of the two left edges: 15600 / 150 == 104.
        expected_left = (100 * 90 + 110 * 60) // (90 + 60)
        assert word["left"] == expected_left
        assert word["conf"] == 90

    def test_same_word_slightly_offset_merges(self):
        """The same word with a small positional offset still merges into one."""
        paddle = [_word("Betonung", 100, 50, 90, 22, conf=85)]
        tesseract = [_word("Betonung", 115, 52, 80, 20, conf=60)]
        result = _merge_paddle_tesseract(paddle, tesseract)
        assert len(result) == 1
        assert result[0]["text"] == "Betonung"

    def test_truly_different_words_kept_separate(self):
        """Different, non-overlapping words from each engine are both kept."""
        paddle = [_word("hello", 10, 10)]
        tesseract = [_word("bullet", 500, 500, conf=50)]
        result = _merge_paddle_tesseract(paddle, tesseract)
        assert len(result) == 2
        found = set(word["text"] for word in result)
        assert found == {"hello", "bullet"}

    def test_low_conf_tesseract_dropped(self):
        """An unmatched Tesseract word with conf=20 does not reach the output."""
        paddle = [_word("hello", 10, 10)]
        tesseract = [_word("noise", 500, 500, conf=20)]
        result = _merge_paddle_tesseract(paddle, tesseract)
        assert len(result) == 1

    def test_empty_paddle(self):
        """Without Paddle words only the confident Tesseract word remains."""
        confident = _word("bullet", 10, 10, conf=80)
        weak = _word("noise", 200, 200, conf=10)
        result = _merge_paddle_tesseract([], [confident, weak])
        assert len(result) == 1
        assert result[0]["text"] == "bullet"

    def test_empty_tesseract(self):
        """Without Tesseract words both Paddle words are kept."""
        paddle = [_word("a", 10, 10), _word("b", 200, 10)]
        result = _merge_paddle_tesseract(paddle, [])
        assert len(result) == 2

    def test_both_empty(self):
        """Two empty inputs give an empty list."""
        result = _merge_paddle_tesseract([], [])
        assert result == []

    def test_one_to_one_matching(self):
        """One Tesseract word pairs with one Paddle word; the other stays."""
        cat_paddle = _word("cat", 10, 10, 60, 20, conf=80)
        dog_paddle = _word("dog", 200, 10, 60, 20, conf=80)
        cat_tesseract = _word("cat", 15, 12, 55, 18, conf=70)
        result = _merge_paddle_tesseract([cat_paddle, dog_paddle], [cat_tesseract])
        # Merged "cat" plus the unmatched Paddle "dog".
        assert len(result) == 2

    def test_far_apart_different_text_not_merged(self):
        """Different words far away from each other are not merged."""
        paddle = [_word("hello", 0, 0, 100, 20, conf=80)]
        tesseract = [_word("world", 500, 300, 100, 20, conf=70)]
        result = _merge_paddle_tesseract(paddle, tesseract)
        assert len(result) == 2

    def test_paddle_text_preferred(self):
        """When the engines disagree on a matched word, Paddle's text is used."""
        paddle = [_word("Betonung", 100, 50, 80, 20, conf=85)]
        tesseract = [_word("Betonung!", 100, 50, 80, 20, conf=60)]
        result = _merge_paddle_tesseract(paddle, tesseract)
        assert len(result) == 1
        assert result[0]["text"] == "Betonung"

    def test_confidence_weighted_positions(self):
        """With equal confidence the merged position is the plain average."""
        paddle = [_word("x", 100, 200, 60, 20, conf=50)]
        tesseract = [_word("x", 110, 200, 60, 20, conf=50)]
        result = _merge_paddle_tesseract(paddle, tesseract)
        assert len(result) == 1
        word = result[0]
        assert word["left"] == 105
        assert word["top"] == 200

    def test_zero_confidence_no_division_error(self):
        """Two conf=0 words at the same spot merge without an exception."""
        paddle = [_word("a", 100, 50, 80, 20, conf=0)]
        tesseract = [_word("a", 100, 50, 80, 20, conf=0)]
        result = _merge_paddle_tesseract(paddle, tesseract)
        assert len(result) == 1

    def test_duplicate_words_same_position_deduplicated(self):
        """Regression test for the core bug fix: no doubled words.

        Both engines report the same four words at nearly the same
        positions; the merged output must contain each word once.
        """
        # Rows are (text, left, top, width, height, conf).
        paddle_rows = (
            ("apple", 50, 10, 70, 20, 90),
            ("Apfel", 300, 10, 60, 20, 85),
            ("dog", 50, 50, 50, 20, 88),
            ("Hund", 300, 50, 60, 20, 82),
        )
        tesseract_rows = (
            ("apple", 52, 11, 68, 19, 75),
            ("Apfel", 298, 12, 62, 18, 70),
            ("dog", 48, 49, 52, 21, 72),
            ("Hund", 302, 51, 58, 19, 68),
        )
        paddle = [_word(*row) for row in paddle_rows]
        tesseract = [_word(*row) for row in tesseract_rows]
        result = _merge_paddle_tesseract(paddle, tesseract)
        # Four words in, four words out.
        assert len(result) == 4
        assert sorted(word["text"] for word in result) == ["Apfel", "Hund", "apple", "dog"]


class TestMergePaddleTesseractBulletPoints:
    """Symbols seen only by Tesseract (bullets, '!') must survive the merge."""

    def test_bullet_added_from_tesseract(self):
        """A Tesseract-only bullet is kept next to the shared word."""
        paddle = [_word("Betonung", 60, 10, 80, 20)]
        bullet = _word("•", 10, 10, 15, 15, conf=65)
        shared = _word("Betonung", 60, 10, 80, 20, conf=50)
        result = _merge_paddle_tesseract(paddle, [bullet, shared])
        found = [word["text"] for word in result]
        assert "•" in found
        assert "Betonung" in found
        assert len(result) == 2

    def test_exclamation_added_from_tesseract(self):
        """A Tesseract-only exclamation mark is kept next to the shared word."""
        paddle = [_word("important", 60, 10, 100, 20)]
        mark = _word("!", 40, 10, 12, 20, conf=70)
        shared = _word("important", 60, 10, 100, 20, conf=55)
        result = _merge_paddle_tesseract(paddle, [mark, shared])
        found = [word["text"] for word in result]
        assert "!" in found
        assert len(result) == 2

    def test_multiple_unique_tesseract_symbols(self):
        """Several Tesseract-only symbols are all kept."""
        paddle = [_word("word", 100, 10, 60, 20)]
        tesseract = [
            _word("!", 20, 10, 10, 20, conf=70),
            _word("•", 40, 10, 10, 15, conf=65),
            _word("word", 100, 10, 60, 20, conf=50),
        ]
        result = _merge_paddle_tesseract(paddle, tesseract)
        found = [word["text"] for word in result]
        for expected in ("!", "•", "word"):
            assert expected in found
        assert len(result) == 3