# breakpilot-lehrer/klausur-service/backend/tests/test_paddle_kombi.py

"""Tests for the Kombi-Modus row-based sequence merge algorithm.

Functions under test (ocr_pipeline_api.py):
- _split_paddle_multi_words: Split multi-word PaddleOCR boxes into individual words
- _group_words_into_rows: Cluster words by Y-position into rows
- _merge_row_sequences: Merge two word sequences within the same row
- _merge_paddle_tesseract: Full merge with row matching + sequence dedup
"""

import pytest
import sys
import os

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from ocr_pipeline_api import (
    _split_paddle_multi_words,
    _group_words_into_rows,
    _merge_row_sequences,
    _merge_paddle_tesseract,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80) -> dict:
    """Build a synthetic OCR word record for use as test input.

    Returns a plain dict with the keys ``text``, ``left``, ``top``,
    ``width``, ``height`` and ``conf`` (in that order), each holding the
    argument of the same name.
    """
    return dict(
        text=text,
        left=left,
        top=top,
        width=width,
        height=height,
        conf=conf,
    )


# ---------------------------------------------------------------------------
# _split_paddle_multi_words
# ---------------------------------------------------------------------------

class TestSplitPaddleMultiWords:
    """Checks for _split_paddle_multi_words: one output word per token."""

    def test_single_word_unchanged(self):
        """A box that holds a single token yields exactly that one word."""
        pieces = _split_paddle_multi_words([_word("hello", 100, 50, 80, 20)])
        assert [p["text"] for p in pieces] == ["hello"]

    def test_multi_word_split(self):
        """'More than 200' in one 300px-wide box becomes three words."""
        box_left, box_width, rounding_slack = 100, 300, 5
        pieces = _split_paddle_multi_words(
            [_word("More than 200", box_left, 50, box_width, 20)]
        )
        assert [p["text"] for p in pieces] == ["More", "than", "200"]
        # The outermost pieces must stay inside the horizontal extent of the
        # source box (a few pixels of rounding slack are allowed on the right).
        first, last = pieces[0], pieces[2]
        assert first["left"] >= box_left
        assert last["left"] + last["width"] <= box_left + box_width + rounding_slack

    def test_exclamation_split(self):
        """A leading '!' is separated: '!Betonung' -> '!' and 'Betonung'."""
        pieces = _split_paddle_multi_words([_word("!Betonung", 100, 50, 120, 20)])
        assert [p["text"] for p in pieces] == ["!", "Betonung"]

    def test_ipa_bracket_split(self):
        """A bracketed suffix is separated: 'badge[bxd3]' -> 'badge', '[bxd3]'."""
        pieces = _split_paddle_multi_words([_word("badge[bxd3]", 100, 50, 150, 20)])
        assert [p["text"] for p in pieces] == ["badge", "[bxd3]"]

    def test_long_phrase(self):
        """An eight-token phrase in one box yields eight words, in order."""
        phrase = "More than 200 singers took part in the"
        pieces = _split_paddle_multi_words([_word(phrase, 944, 287, 454, 29)])
        expected = ["More", "than", "200", "singers", "took", "part", "in", "the"]
        assert [p["text"] for p in pieces] == expected

    def test_empty_input(self):
        """No input words means an empty list comes back."""
        pieces = _split_paddle_multi_words([])
        assert pieces == []

    def test_preserves_top_and_height(self):
        """Each piece reports the vertical geometry of the source box."""
        pieces = _split_paddle_multi_words([_word("a b", 100, 50, 200, 25)])
        assert all(p["top"] == 50 and p["height"] == 25 for p in pieces)


# ---------------------------------------------------------------------------
# _group_words_into_rows
# ---------------------------------------------------------------------------

class TestGroupWordsIntoRows:
    """Checks for _group_words_into_rows: clustering words by Y position."""

    def test_single_row(self):
        """Three words with tops within a few pixels form one row."""
        rows = _group_words_into_rows(
            [_word("a", 10, 50), _word("b", 100, 52), _word("c", 200, 48)]
        )
        assert len(rows) == 1
        row = rows[0]
        assert len(row) == 3
        # The row is expected in left-to-right order: leftmost first, rightmost last.
        assert row[0]["text"] == "a"
        assert row[2]["text"] == "c"

    def test_two_rows(self):
        """Words around y=50 and y=100 end up in two rows, top row first."""
        upper = [_word("a", 10, 50), _word("b", 100, 52)]
        lower = [_word("c", 10, 100), _word("d", 100, 102)]
        rows = _group_words_into_rows(upper + lower)
        texts_per_row = [[w["text"] for w in row] for row in rows]
        assert texts_per_row == [["a", "b"], ["c", "d"]]

    def test_empty(self):
        """No words means no rows."""
        rows = _group_words_into_rows([])
        assert rows == []

    def test_different_heights_same_row(self):
        """Boxes of height 29 and 21 at similar Y are expected in one row."""
        tall_box = _word("take", 100, 287, 47, 29)   # vertical centre 301.5
        short_box = _word("take", 103, 289, 52, 21)  # vertical centre 299.5
        rows = _group_words_into_rows([tall_box, short_box])
        # One shared row, not one row per box.
        assert len(rows) == 1

    def test_close_rows_separated(self):
        """Words whose vertical centres are 35px apart are separate rows."""
        upper = _word("a", 10, 50, height=20)  # vertical centre 60
        lower = _word("b", 10, 85, height=20)  # vertical centre 95
        rows = _group_words_into_rows([upper, lower])
        assert len(rows) == 2


# ---------------------------------------------------------------------------
# _merge_row_sequences
# ---------------------------------------------------------------------------

class TestMergeRowSequences:
    """Checks for _merge_row_sequences: merging two word lists of one row."""

    def test_identical_sequences_deduplicated(self):
        """The same two words from both engines yield one copy of each."""
        paddle_row = [_word("apple", 50, 10), _word("Apfel", 200, 10)]
        tess_row = [_word("apple", 52, 12), _word("Apfel", 198, 11)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        assert [w["text"] for w in merged] == ["apple", "Apfel"]

    def test_tesseract_extra_symbol(self):
        """A '!' seen only in the Tesseract row is kept in the result."""
        paddle_row = [_word("Betonung", 60, 10)]
        tess_row = [_word("!", 20, 10, 12, 20, conf=70), _word("Betonung", 60, 10)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        found = [w["text"] for w in merged]
        assert "!" in found
        assert "Betonung" in found
        assert len(merged) == 2

    def test_paddle_extra_word(self):
        """A word seen only in the Paddle row is kept in the result."""
        paddle_row = [_word("!", 20, 10, 12, 20), _word("word", 60, 10)]
        tess_row = [_word("word", 62, 12)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        assert len(merged) == 2

    def test_coordinates_averaged(self):
        """A matched pair collapses to one word with a conf-weighted left edge."""
        merged = _merge_row_sequences(
            [_word("hello", 100, 50, 80, 20, conf=90)],
            [_word("hello", 110, 55, 70, 18, conf=60)],
        )
        assert len(merged) == 1
        only = merged[0]
        assert only["text"] == "hello"
        # Confidence-weighted mean of the left edges:
        # (100*90 + 110*60) / 150 = 104
        assert only["left"] == 104
        assert only["conf"] == 90

    def test_empty_paddle_row(self):
        """With no Paddle words, the single conf=80 Tesseract word survives."""
        merged = _merge_row_sequences([], [_word("a", 10, 10, conf=80)])
        assert len(merged) == 1

    def test_empty_tess_row(self):
        """With no Tesseract words, the single Paddle word survives."""
        merged = _merge_row_sequences([_word("a", 10, 10)], [])
        assert len(merged) == 1

    def test_both_empty(self):
        """Two empty rows merge to an empty list."""
        merged = _merge_row_sequences([], [])
        assert merged == []

    def test_substring_match(self):
        """Paddle 'part(in)' against Tesseract 'part' + '(in)' yields two words."""
        paddle_row = [_word("part(in)", 100, 10, 90, 20)]
        tess_row = [_word("part", 100, 12, 50, 18), _word("(in)", 155, 12, 40, 18)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        # Expected pairing: 'part(in)' matches 'part', and '(in)' is then an
        # extra word that only Tesseract contributed.
        assert len(merged) == 2

    def test_low_conf_tesseract_dropped(self):
        """An unmatched Tesseract word with conf=15 must not appear."""
        paddle_row = [_word("hello", 100, 10)]
        tess_row = [_word("noise", 10, 10, conf=15), _word("hello", 100, 12)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        assert "noise" not in [w["text"] for w in merged]
        assert len(merged) == 1

    def test_real_world_row(self):
        """Row from real data: shared words appear once, phonetic token stays."""
        paddle_row = [
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
        ]
        tess_row = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
        ]
        found = [w["text"] for w in _merge_row_sequences(paddle_row, tess_row)]
        # Words that both engines reported must not be doubled.
        for shared in ("take", "More", "than", "teilnehmen"):
            assert found.count(shared) == 1
        # The phonetic token exists only in the Tesseract row and must be kept.
        assert "[teık" in found


# ---------------------------------------------------------------------------
# _merge_paddle_tesseract (full pipeline)
# ---------------------------------------------------------------------------

class TestMergePaddleTesseract:
    """Checks for _merge_paddle_tesseract on whole word lists (multi-row)."""

    def test_same_words_deduplicated(self):
        """Identical words from both engines come back once each."""
        paddle_words = [
            _word("apple", 50, 10, 70, 20, conf=90),
            _word("Apfel", 300, 10, 60, 20, conf=85),
        ]
        tess_words = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
        ]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert sorted(w["text"] for w in merged) == ["Apfel", "apple"]

    def test_different_rows_not_cross_merged(self):
        """Words from different rows must not be averaged together."""
        paddle_words = [
            _word("row1word", 50, 50, 80, 20, conf=90),
            _word("row2word", 50, 100, 80, 20, conf=90),
        ]
        tess_words = [
            _word("row1word", 52, 52, 78, 18, conf=80),
            _word("row2word", 52, 102, 78, 18, conf=80),
        ]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 2
        upper = next(w for w in merged if w["text"] == "row1word")
        lower = next(w for w in merged if w["text"] == "row2word")
        # Each word has to stay near its own row (y=50 resp. y=100) instead of
        # drifting towards the other one.
        assert upper["top"] < 60
        assert lower["top"] > 90

    def test_tesseract_extra_symbols_added(self):
        """A symbol that only Tesseract reported is part of the result."""
        paddle_words = [_word("Betonung", 60, 10, 80, 20)]
        tess_words = [
            _word("!", 20, 10, 12, 20, conf=65),
            _word("Betonung", 60, 10, 80, 20, conf=50),
        ]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        found = [w["text"] for w in merged]
        assert "!" in found
        assert "Betonung" in found
        assert len(merged) == 2

    def test_paddle_extra_words_added(self):
        """A word that only Paddle reported is part of the result."""
        paddle_words = [_word("extra", 10, 10), _word("word", 100, 10)]
        tess_words = [_word("word", 102, 12)]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert len(merged) == 2

    def test_empty_paddle(self):
        """Without Paddle words, only one of the two Tesseract words remains."""
        tess_words = [_word("a", 10, 10, conf=80), _word("b", 200, 200, conf=10)]
        merged = _merge_paddle_tesseract([], tess_words)
        # One of the two words is expected to be filtered out; the original
        # note cites a "conf >= 40" cutoff, which would drop the conf=10 word.
        # NOTE(review): confirm the threshold against the implementation.
        assert len(merged) == 1

    def test_empty_tesseract(self):
        """Without Tesseract words, both Paddle words remain."""
        paddle_words = [_word("a", 10, 10), _word("b", 200, 10)]
        merged = _merge_paddle_tesseract(paddle_words, [])
        assert len(merged) == 2

    def test_both_empty(self):
        """Two empty inputs merge to an empty list."""
        merged = _merge_paddle_tesseract([], [])
        assert merged == []

    def test_multi_row_deduplication(self):
        """Two rows seen by both engines: four unique words in the result."""
        paddle_words = [
            _word("cat", 50, 50, conf=90),
            _word("Katze", 200, 50, conf=85),
            _word("dog", 50, 100, conf=88),
            _word("Hund", 200, 100, conf=82),
        ]
        tess_words = [
            _word("cat", 52, 52, conf=75),
            _word("Katze", 198, 51, conf=70),
            _word("dog", 48, 101, conf=72),
            _word("Hund", 202, 102, conf=68),
        ]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)
        assert sorted(w["text"] for w in merged) == ["Hund", "Katze", "cat", "dog"]


class TestMergeRealWorldRegression:
    """Regression check built from the data of the doubled-words bug."""

    def test_full_page_no_duplicates(self):
        """Two rows seen by both engines at slightly shifted positions.

        The merged list must contain no near-duplicate words, must keep the
        phonetic tokens that only Tesseract reported, and must keep the two
        rows vertically apart.
        """
        paddle_words = [
            _word("teilnehmen", 526, 282, 140, 35, conf=93),
            _word("take", 185, 287, 47, 29, conf=90),
            _word("part(in)", 238, 287, 94, 29, conf=90),
            _word("More", 944, 287, 50, 29, conf=96),
            _word("than", 1003, 287, 50, 29, conf=96),
            _word("200", 1063, 287, 38, 29, conf=96),
            _word("singers", 1110, 287, 88, 29, conf=96),
            _word("took", 1207, 287, 50, 29, conf=96),
            _word("part", 1266, 287, 50, 29, conf=96),
            _word("in", 1326, 287, 25, 29, conf=96),
            _word("the", 1360, 287, 38, 29, conf=96),
            # Second row
            _word("be", 185, 365, 30, 29, conf=90),
            _word("good", 216, 365, 50, 29, conf=90),
            _word("at", 275, 365, 25, 29, conf=90),
            _word("sth.", 306, 365, 45, 29, conf=90),
        ]
        tess_words = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
            # Second row
            _word("be", 189, 369, 28, 21, conf=96),
            _word("good", 225, 369, 50, 21, conf=96),
            _word("at", 292, 371, 22, 21, conf=96),
            _word("sth.", 324, 369, 42, 21, conf=96),
        ]
        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        # Near-duplicate = same text (case-insensitive) with box centres closer
        # than 30px horizontally AND closer than 15px vertically.
        for idx, first in enumerate(merged):
            for second in merged[idx + 1:]:
                if first["text"].lower() != second["text"].lower():
                    continue
                dx = abs(
                    (first["left"] + first.get("width", 0) / 2)
                    - (second["left"] + second.get("width", 0) / 2)
                )
                dy = abs(
                    (first["top"] + first.get("height", 0) / 2)
                    - (second["top"] + second.get("height", 0) / 2)
                )
                assert dx >= 30 or dy >= 15, (
                    f"Near-duplicate: '{first['text']}' at ({first['left']},{first['top']}) "
                    f"vs ({second['left']},{second['top']})"
                )

        # Phonetic tokens exist only in the Tesseract input and must survive.
        found = [w["text"] for w in merged]
        assert "[teık" in found
        assert "'pa:t]" in found

        # A row-1 word and a row-2 word must not end up at the same Y position.
        second_row_word = next(w for w in merged if w["text"] == "be")
        first_row_word = next(w for w in merged if w["text"] == "take")
        row_gap = abs(second_row_word["top"] - first_row_word["top"])
        assert row_gap > 30, "Rows should stay separate"


class TestSpatialOverlapDedup:
    """Words at the same position are deduplicated even if their text differs."""

    def test_same_position_different_text_deduplicated(self):
        """'hello' vs 'helo' in nearly the same box collapse to one word."""
        paddle_row = [_word("hello", 100, 50, 80, 20, conf=90)]
        tess_row = [_word("helo", 102, 52, 76, 18, conf=70)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        assert len(merged) == 1, (
            f"Expected 1 word (deduped by overlap), got {len(merged)}: "
            f"{[w['text'] for w in merged]}"
        )
        # The surviving text is the Paddle reading, which also carries the
        # higher confidence in this fixture (90 vs 70).
        assert merged[0]["text"] == "hello"

    def test_same_position_single_char_deduplicated(self):
        """Paddle 'a' and Tesseract 'a!' in nearly the same box give one word."""
        paddle_row = [_word("a", 100, 50, 20, 20, conf=90)]
        tess_row = [_word("a!", 101, 51, 22, 19, conf=60)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        assert len(merged) == 1

    def test_no_overlap_different_words_kept(self):
        """Different words 200px apart are both kept."""
        paddle_row = [_word("cat", 100, 50, 50, 20, conf=90)]
        tess_row = [_word("dog", 300, 50, 50, 20, conf=70)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        assert len(merged) == 2

    def test_partial_overlap_below_threshold_kept(self):
        """Boxes overlapping by only a quarter of their width are both kept."""
        paddle_row = [_word("take", 100, 50, 60, 20, conf=90)]
        tess_row = [_word("part", 145, 50, 60, 20, conf=70)]
        merged = _merge_row_sequences(paddle_row, tess_row)
        # Boxes span 100-160 and 145-205: 15px overlap / 60px width = 25%,
        # which the test expects to be too little to count as a duplicate.
        assert len(merged) == 2


class TestRapidOcrMergeCompatibility:
    """Test that _merge_paddle_tesseract works with RapidOCR word format.

    RapidOCR words include an extra 'region_type' key that PaddleOCR words
    don't have. The merge logic must tolerate this extra field.
    """

    def _rapid_word(self, text, left, top, width=60, height=20, conf=80, region_type="full_page"):
        """Create a word dict in RapidOCR format (has region_type).

        Built on top of ``_word`` so both fixture shapes stay in sync; the
        only difference is the trailing 'region_type' key.
        """
        return {**_word(text, left, top, width, height, conf), "region_type": region_type}

    def test_rapid_words_merge_with_tesseract(self):
        """RapidOCR words (with region_type) merge correctly with Tesseract words."""
        rapid = [
            self._rapid_word("apple", 50, 10, 70, 20, conf=90),
            self._rapid_word("Apfel", 300, 10, 60, 20, conf=85),
        ]
        tess = [
            _word("apple", 52, 11, 68, 19, conf=75),
            _word("Apfel", 298, 12, 62, 18, conf=70),
        ]
        merged = _merge_paddle_tesseract(rapid, tess)
        assert len(merged) == 2
        texts = sorted(w["text"] for w in merged)
        assert texts == ["Apfel", "apple"]

    def test_rapid_words_split_then_merge(self):
        """Split + merge works with RapidOCR multi-word boxes."""
        rapid_raw = [
            self._rapid_word("More than 200", 944, 287, 160, 29, conf=96),
        ]
        tess = [
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
        ]
        rapid_split = _split_paddle_multi_words(rapid_raw)
        assert len(rapid_split) == 3
        merged = _merge_paddle_tesseract(rapid_split, tess)
        texts = [w["text"] for w in merged]
        # Each token was seen by both engines and must appear exactly once.
        for token in ("More", "than", "200"):
            assert texts.count(token) == 1

    def test_region_type_preserved_in_unmatched(self):
        """Unmatched RapidOCR words keep their region_type field."""
        rapid = [self._rapid_word("unique", 500, 10, 80, 20, conf=90)]
        tess = []  # No Tesseract words
        merged = _merge_paddle_tesseract(rapid, tess)
        assert len(merged) == 1
        assert merged[0]["text"] == "unique"
        # The test name promises preservation of the extra key, so check it
        # explicitly instead of only checking the text.
        assert merged[0].get("region_type") == "full_page"


class TestSplitThenMerge:
    """End-to-end check: split multi-word Paddle boxes, then merge."""

    def test_multi_word_paddle_boxes_no_duplicates(self):
        """Phrase-level Paddle boxes, once split, merge without duplicates."""
        # Paddle input as whole phrases per box (taken from real-world output).
        paddle_phrases = [
            _word("take part(in) [teik'pa:t]", 185, 287, 281, 29, conf=90),
            _word("teilnehmen (an.mitmachen", 526, 282, 329, 35, conf=93),
            _word("More than 200 singers took part in the", 944, 287, 454, 29, conf=96),
        ]
        tess_words = [
            _word("take", 188, 289, 52, 21, conf=96),
            _word("part", 249, 292, 48, 24, conf=96),
            _word("(in)", 305, 290, 38, 24, conf=93),
            _word("[teık", 352, 292, 47, 21, conf=90),
            _word("'pa:t]", 407, 292, 55, 23, conf=89),
            _word("teilnehmen", 534, 290, 127, 21, conf=95),
            _word("(an),", 671, 291, 48, 23, conf=96),
            _word("mitmachen", 730, 290, 123, 22, conf=96),
            _word("More", 948, 292, 60, 20, conf=90),
            _word("than", 1017, 291, 49, 21, conf=96),
            _word("200", 1076, 292, 43, 20, conf=93),
            _word("singers", 1128, 293, 75, 26, conf=93),
            _word("took", 1212, 291, 55, 22, conf=96),
            _word("part", 1276, 294, 47, 25, conf=96),
            _word("in", 1332, 292, 20, 20, conf=95),
            _word("the", 1361, 292, 36, 21, conf=95),
        ]

        # Step 1: break the phrase boxes into per-word boxes.
        paddle_words = _split_paddle_multi_words(paddle_phrases)
        assert len(paddle_words) > len(paddle_phrases), "Should have more words after split"

        # Step 2: merge the split Paddle words with the Tesseract words.
        merged = _merge_paddle_tesseract(paddle_words, tess_words)

        # Near-duplicate = same text (case-insensitive) with box centres closer
        # than 30px horizontally AND closer than 15px vertically.
        for idx, first in enumerate(merged):
            for second in merged[idx + 1:]:
                if first["text"].lower() != second["text"].lower():
                    continue
                dx = abs(
                    (first["left"] + first.get("width", 0) / 2)
                    - (second["left"] + second.get("width", 0) / 2)
                )
                dy = abs(
                    (first["top"] + first.get("height", 0) / 2)
                    - (second["top"] + second.get("height", 0) / 2)
                )
                assert dx >= 30 or dy >= 15, (
                    f"Near-duplicate: '{first['text']}' at ({first['left']},{first['top']}) "
                    f"vs ({second['left']},{second['top']})"
                )