# breakpilot-compliance/backend-compliance/compliance/tests/test_similarity_detector.py

"""Tests for the Too-Close Similarity Detector."""

import pytest
from compliance.services.similarity_detector import (
    max_exact_run,
    token_overlap_jaccard,
    ngram_jaccard,
    lcs_ratio,
    check_similarity,
    _tokenize,
)


class TestTokenize:
    """Checks for the ``_tokenize`` helper."""

    def test_basic(self):
        """Mixed-case words and digits are expected back as lowercase tokens."""
        expected = ["hello", "world", "123"]
        assert _tokenize("Hello World 123") == expected

    def test_german_umlauts(self):
        """A three-word German phrase containing ß and ü yields three tokens."""
        result = _tokenize("Schutzmaßnahmen für Daten")
        assert len(result) == 3

    def test_empty(self):
        """The empty string is expected to produce an empty token list."""
        result = _tokenize("")
        assert result == []


class TestMaxExactRun:
    """Checks for ``max_exact_run`` on token lists."""

    def test_identical(self):
        """A sequence compared with itself gives a run as long as the sequence."""
        words = _tokenize("the quick brown fox jumps over the lazy dog")
        run = max_exact_run(words, words)
        assert run == len(words)

    def test_partial_match(self):
        """Two sentences sharing one two-token stretch give a run of 2."""
        left = _tokenize("the quick brown fox")
        right = _tokenize("a quick brown cat")
        # The only shared consecutive tokens are "quick brown".
        run = max_exact_run(left, right)
        assert run == 2

    def test_no_match(self):
        """Sequences with no token in common give a run of 0."""
        left, right = _tokenize("hello world"), _tokenize("foo bar")
        run = max_exact_run(left, right)
        assert run == 0

    def test_empty(self):
        """An empty list on either side is expected to give a run of 0."""
        for first, second in (([], []), (["a"], [])):
            assert max_exact_run(first, second) == 0


class TestTokenOverlapJaccard:
    """Checks for ``token_overlap_jaccard`` on token lists."""

    def test_identical(self):
        """A token list compared with itself is expected to score 1.0."""
        words = _tokenize("hello world")
        score = token_overlap_jaccard(words, words)
        assert score == 1.0

    def test_no_overlap(self):
        """Token lists with nothing in common are expected to score 0.0."""
        left, right = _tokenize("hello world"), _tokenize("foo bar")
        score = token_overlap_jaccard(left, right)
        assert score == 0.0

    def test_partial(self):
        """One shared token out of five distinct tokens scores about 0.2."""
        left = _tokenize("hello world foo")
        right = _tokenize("hello bar baz")
        # Shared: {hello}; combined: {hello, world, foo, bar, baz} -> 1/5.
        deviation = abs(token_overlap_jaccard(left, right) - 0.2)
        assert deviation < 0.01


class TestNgramJaccard:
    """Checks for ``ngram_jaccard`` on raw strings."""

    def test_identical(self):
        """A string compared with itself is expected to score 1.0."""
        score = ngram_jaccard("hello", "hello")
        assert score == 1.0

    def test_different(self):
        """Strings sharing no characters are expected to score 0.0."""
        score = ngram_jaccard("abc", "xyz")
        assert score == 0.0

    def test_short(self):
        """Two-character inputs are expected to score 0.0."""
        # Both inputs are too short for 3-grams.
        score = ngram_jaccard("ab", "cd")
        assert score == 0.0


class TestLcsRatio:
    """Checks for ``lcs_ratio`` on token lists."""

    def test_identical(self):
        """A token list compared with itself is expected to give 1.0."""
        words = _tokenize("multi factor authentication required")
        ratio = lcs_ratio(words, words)
        assert ratio == 1.0

    def test_partial(self):
        """One common token across two three-token inputs gives roughly 1/3."""
        left = _tokenize("multi factor authentication")
        right = _tokenize("single factor verification")
        # LCS is just "factor" (length 1); max(3, 3) = 3, so the ratio is 1/3.
        lower, upper = 0.3, 0.4
        ratio = lcs_ratio(left, right)
        assert lower < ratio < upper

    def test_empty(self):
        """Two empty token lists are expected to give 0.0."""
        ratio = lcs_ratio([], [])
        assert ratio == 0.0


class TestCheckSimilarity:
    """Async checks for the report returned by ``check_similarity``.

    NOTE(review): every call passes an embedding URL on port 99999, which is
    above the valid TCP port range -- presumably so that no real embedding
    service is contacted; confirm against ``check_similarity``.
    """

    @pytest.mark.asyncio
    async def test_identical_texts_fail(self):
        """A text compared with itself is expected to be reported as FAIL."""
        sample = "Multi-factor authentication must be enforced for all administrative accounts."
        result = await check_similarity(
            sample,
            sample,
            embedding_url="http://localhost:99999",
        )
        # A text compared against itself should reach the maximum overlap.
        assert result.token_overlap == 1.0
        assert result.status == "FAIL"

    @pytest.mark.asyncio
    async def test_different_texts_pass(self):
        """Unrelated German and English sentences are expected to PASS."""
        german_source = "Die Anwendung muss eine Zwei-Faktor-Authentisierung implementieren."
        english_candidate = "Network traffic should be encrypted using TLS 1.3 at minimum."
        result = await check_similarity(
            german_source,
            english_candidate,
            embedding_url="http://localhost:99999",
        )
        assert result.token_overlap < 0.1
        assert result.status == "PASS"

    @pytest.mark.asyncio
    async def test_report_fields(self):
        """The report exposes every expected attribute and a known status."""
        result = await check_similarity(
            "hello world",
            "foo bar",
            embedding_url="http://localhost:99999",
        )
        expected_attributes = (
            "max_exact_run",
            "token_overlap",
            "ngram_jaccard",
            "embedding_cosine",
            "lcs_ratio",
            "status",
            "details",
        )
        for attribute in expected_attributes:
            assert hasattr(result, attribute)
        assert result.status in ("PASS", "WARN", "FAIL")