Files
breakpilot-compliance/backend-compliance/compliance/tests/test_similarity_detector.py
Benjamin Admin 050f353192
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 40s
CI/CD / test-python-backend-compliance (push) Successful in 41s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 18s
CI/CD / deploy-hetzner (push) Successful in 2m26s
feat(canonical-controls): Canonical Control Library — rechtssichere Security Controls
Eigenstaendig formulierte Security Controls mit unabhaengiger Taxonomie
und Open-Source-Verankerung (OWASP, NIST, ENISA). Keine BSI-Nomenklatur.

- Migration 044: 5 DB-Tabellen (frameworks, controls, sources, licenses, mappings)
- 10 Seed Controls mit 39 Open-Source-Referenzen
- License Gate: Quellen-Berechtigungspruefung (analysis/excerpt/embeddings/product)
- Too-Close-Detektor: 5 Metriken (exact-phrase, token-overlap, ngram, embedding, LCS)
- REST API: 8 Endpoints unter /v1/canonical/
- Go Loader mit Multi-Index (ID, domain, severity, framework)
- Frontend: Control Library Browser + Provenance Wiki
- CI/CD: validate-controls.py Job (schema, no-leak, open-anchors)
- 67 Tests (8 Go + 59 Python), alle PASS
- MkDocs Dokumentation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 19:55:06 +01:00

119 lines
3.8 KiB
Python

"""Tests for the Too-Close Similarity Detector."""
import pytest
from compliance.services.similarity_detector import (
max_exact_run,
token_overlap_jaccard,
ngram_jaccard,
lcs_ratio,
check_similarity,
_tokenize,
)
class TestTokenize:
    """Checks for the ``_tokenize`` helper."""

    def test_basic(self):
        """Mixed-case words and digits come back as lower-cased tokens."""
        expected = ["hello", "world", "123"]
        assert _tokenize("Hello World 123") == expected

    def test_german_umlauts(self):
        """A three-word German phrase containing ß and ü yields three tokens."""
        result = _tokenize("Schutzmaßnahmen für Daten")
        assert len(result) == 3

    def test_empty(self):
        """The empty string produces an empty token list."""
        result = _tokenize("")
        assert result == []
class TestMaxExactRun:
    """Checks for ``max_exact_run`` on token lists."""

    def test_identical(self):
        """Comparing a token list with itself gives the list's full length."""
        words = _tokenize("the quick brown fox jumps over the lazy dog")
        run_length = max_exact_run(words, words)
        assert run_length == len(words)

    def test_partial_match(self):
        """Two sentences sharing only "quick brown" give a run of two."""
        left = _tokenize("the quick brown fox")
        right = _tokenize("a quick brown cat")
        # The shared run is "quick brown".
        run_length = max_exact_run(left, right)
        assert run_length == 2

    def test_no_match(self):
        """Token lists with nothing in common give a run of zero."""
        left = _tokenize("hello world")
        right = _tokenize("foo bar")
        run_length = max_exact_run(left, right)
        assert run_length == 0

    def test_empty(self):
        """An empty list on either or both sides gives a run of zero."""
        empty_cases = (
            ([], []),
            (["a"], []),
        )
        for left, right in empty_cases:
            assert max_exact_run(left, right) == 0
class TestTokenOverlapJaccard:
    """Checks for ``token_overlap_jaccard`` on token lists."""

    def test_identical(self):
        """A token list compared with itself scores exactly 1.0."""
        words = _tokenize("hello world")
        score = token_overlap_jaccard(words, words)
        assert score == 1.0

    def test_no_overlap(self):
        """Token lists with no common token score exactly 0.0."""
        left = _tokenize("hello world")
        right = _tokenize("foo bar")
        score = token_overlap_jaccard(left, right)
        assert score == 0.0

    def test_partial(self):
        """One shared token out of five distinct tokens scores about 0.2."""
        left = _tokenize("hello world foo")
        right = _tokenize("hello bar baz")
        # Shared tokens: {hello}; all tokens: {hello, world, foo, bar, baz},
        # so the expected score is 1 / 5.
        deviation = abs(token_overlap_jaccard(left, right) - 0.2)
        assert deviation < 0.01
class TestNgramJaccard:
    """Checks for ``ngram_jaccard`` on raw strings."""

    def test_identical(self):
        """A string compared with itself scores exactly 1.0."""
        score = ngram_jaccard("hello", "hello")
        assert score == 1.0

    def test_different(self):
        """Strings with no characters in common score exactly 0.0."""
        score = ngram_jaccard("abc", "xyz")
        assert score == 0.0

    def test_short(self):
        """Two-character inputs score exactly 0.0."""
        # Both inputs are too short for 3-grams.
        score = ngram_jaccard("ab", "cd")
        assert score == 0.0
class TestLcsRatio:
    """Checks for ``lcs_ratio`` on token lists."""

    def test_identical(self):
        """A token list compared with itself has a ratio of exactly 1.0."""
        words = _tokenize("multi factor authentication required")
        ratio = lcs_ratio(words, words)
        assert ratio == 1.0

    def test_partial(self):
        """Three-token inputs sharing only "factor" land near one third."""
        left = _tokenize("multi factor authentication")
        right = _tokenize("single factor verification")
        # LCS is "factor" (length 1); max(3, 3) = 3, so the ratio is 1/3.
        ratio = lcs_ratio(left, right)
        assert 0.3 < ratio < 0.4

    def test_empty(self):
        """Two empty token lists give a ratio of exactly 0.0."""
        ratio = lcs_ratio([], [])
        assert ratio == 0.0
class TestCheckSimilarity:
    """Async checks for the ``check_similarity`` report.

    Every test passes ``embedding_url="http://localhost:99999"``. Port 99999
    lies outside the valid TCP port range, so no service can be listening
    there.
    NOTE(review): presumably this makes the tests exercise the path where no
    embedding backend is available -- confirm against ``check_similarity``.
    """

    @pytest.mark.asyncio
    async def test_identical_texts_fail(self):
        """Identical source and candidate give full overlap and FAIL."""
        sentence = "Multi-factor authentication must be enforced for all administrative accounts."
        result = await check_similarity(
            sentence,
            sentence,
            embedding_url="http://localhost:99999",
        )
        # Identical texts should have maximum token overlap.
        assert result.token_overlap == 1.0
        assert result.status == "FAIL"

    @pytest.mark.asyncio
    async def test_different_texts_pass(self):
        """Unrelated German and English sentences barely overlap and PASS."""
        german_source = "Die Anwendung muss eine Zwei-Faktor-Authentisierung implementieren."
        english_candidate = "Network traffic should be encrypted using TLS 1.3 at minimum."
        result = await check_similarity(
            german_source,
            english_candidate,
            embedding_url="http://localhost:99999",
        )
        assert result.token_overlap < 0.1
        assert result.status == "PASS"

    @pytest.mark.asyncio
    async def test_report_fields(self):
        """The report exposes every metric attribute and a known status."""
        result = await check_similarity(
            "hello world",
            "foo bar",
            embedding_url="http://localhost:99999",
        )
        required_attributes = (
            "max_exact_run",
            "token_overlap",
            "ngram_jaccard",
            "embedding_cosine",
            "lcs_ratio",
            "status",
            "details",
        )
        for attribute in required_attributes:
            assert hasattr(result, attribute)
        assert result.status in ("PASS", "WARN", "FAIL")