"""Tests for the Too-Close Similarity Detector.""" import pytest from compliance.services.similarity_detector import ( max_exact_run, token_overlap_jaccard, ngram_jaccard, lcs_ratio, check_similarity, _tokenize, ) class TestTokenize: def test_basic(self): tokens = _tokenize("Hello World 123") assert tokens == ["hello", "world", "123"] def test_german_umlauts(self): tokens = _tokenize("Schutzmaßnahmen für Daten") assert len(tokens) == 3 def test_empty(self): assert _tokenize("") == [] class TestMaxExactRun: def test_identical(self): tokens = _tokenize("the quick brown fox jumps over the lazy dog") assert max_exact_run(tokens, tokens) == len(tokens) def test_partial_match(self): a = _tokenize("the quick brown fox") b = _tokenize("a quick brown cat") assert max_exact_run(a, b) == 2 # "quick brown" def test_no_match(self): a = _tokenize("hello world") b = _tokenize("foo bar") assert max_exact_run(a, b) == 0 def test_empty(self): assert max_exact_run([], []) == 0 assert max_exact_run(["a"], []) == 0 class TestTokenOverlapJaccard: def test_identical(self): tokens = _tokenize("hello world") assert token_overlap_jaccard(tokens, tokens) == 1.0 def test_no_overlap(self): a = _tokenize("hello world") b = _tokenize("foo bar") assert token_overlap_jaccard(a, b) == 0.0 def test_partial(self): a = _tokenize("hello world foo") b = _tokenize("hello bar baz") # intersection: {hello}, union: {hello, world, foo, bar, baz} assert abs(token_overlap_jaccard(a, b) - 0.2) < 0.01 class TestNgramJaccard: def test_identical(self): assert ngram_jaccard("hello", "hello") == 1.0 def test_different(self): assert ngram_jaccard("abc", "xyz") == 0.0 def test_short(self): assert ngram_jaccard("ab", "cd") == 0.0 # too short for 3-grams class TestLcsRatio: def test_identical(self): tokens = _tokenize("multi factor authentication required") assert lcs_ratio(tokens, tokens) == 1.0 def test_partial(self): a = _tokenize("multi factor authentication") b = _tokenize("single factor verification") # LCS: "factor" (length 1), max(3,3) = 3, ratio = 1/3 result = lcs_ratio(a, b) assert 0.3 < result < 0.4 def test_empty(self): assert lcs_ratio([], []) == 0.0 class TestCheckSimilarity: @pytest.mark.asyncio async def test_identical_texts_fail(self): text = "Multi-factor authentication must be enforced for all administrative accounts." report = await check_similarity(text, text, embedding_url="http://localhost:99999") # Identical texts should have max overlap assert report.token_overlap == 1.0 assert report.status == "FAIL" @pytest.mark.asyncio async def test_different_texts_pass(self): source = "Die Anwendung muss eine Zwei-Faktor-Authentisierung implementieren." candidate = "Network traffic should be encrypted using TLS 1.3 at minimum." report = await check_similarity(source, candidate, embedding_url="http://localhost:99999") assert report.token_overlap < 0.1 assert report.status == "PASS" @pytest.mark.asyncio async def test_report_fields(self): report = await check_similarity("hello world", "foo bar", embedding_url="http://localhost:99999") assert hasattr(report, "max_exact_run") assert hasattr(report, "token_overlap") assert hasattr(report, "ngram_jaccard") assert hasattr(report, "embedding_cosine") assert hasattr(report, "lcs_ratio") assert hasattr(report, "status") assert hasattr(report, "details") assert report.status in ("PASS", "WARN", "FAIL")