All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 40s
CI/CD / test-python-backend-compliance (push) Successful in 41s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 18s
CI/CD / deploy-hetzner (push) Successful in 2m26s
Eigenstaendig formulierte Security Controls mit unabhaengiger Taxonomie und Open-Source-Verankerung (OWASP, NIST, ENISA). Keine BSI-Nomenklatur. - Migration 044: 5 DB-Tabellen (frameworks, controls, sources, licenses, mappings) - 10 Seed Controls mit 39 Open-Source-Referenzen - License Gate: Quellen-Berechtigungspruefung (analysis/excerpt/embeddings/product) - Too-Close-Detektor: 5 Metriken (exact-phrase, token-overlap, ngram, embedding, LCS) - REST API: 8 Endpoints unter /v1/canonical/ - Go Loader mit Multi-Index (ID, domain, severity, framework) - Frontend: Control Library Browser + Provenance Wiki - CI/CD: validate-controls.py Job (schema, no-leak, open-anchors) - 67 Tests (8 Go + 59 Python), alle PASS - MkDocs Dokumentation Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
119 lines
3.8 KiB
Python
119 lines
3.8 KiB
Python
"""Tests for the Too-Close Similarity Detector."""
|
|
|
|
import pytest
|
|
from compliance.services.similarity_detector import (
|
|
max_exact_run,
|
|
token_overlap_jaccard,
|
|
ngram_jaccard,
|
|
lcs_ratio,
|
|
check_similarity,
|
|
_tokenize,
|
|
)
|
|
|
|
|
|
class TestTokenize:
    """Unit tests for the internal ``_tokenize`` helper."""

    def test_basic(self):
        # Mixed-case words and digits come back as lowercase tokens.
        assert _tokenize("Hello World 123") == ["hello", "world", "123"]

    def test_german_umlauts(self):
        # Words containing umlauts/ß must each survive as a single token.
        result = _tokenize("Schutzmaßnahmen für Daten")
        assert len(result) == 3

    def test_empty(self):
        # The empty string produces no tokens at all.
        assert _tokenize("") == []
|
|
|
|
class TestMaxExactRun:
    """Unit tests for the longest exact consecutive-token-run metric."""

    def test_identical(self):
        # A sequence compared against itself matches along its full length.
        words = _tokenize("the quick brown fox jumps over the lazy dog")
        assert max_exact_run(words, words) == len(words)

    def test_partial_match(self):
        left = _tokenize("the quick brown fox")
        right = _tokenize("a quick brown cat")
        # The longest shared run is the two-token span "quick brown".
        assert max_exact_run(left, right) == 2

    def test_no_match(self):
        # Disjoint vocabularies yield a zero-length run.
        assert max_exact_run(_tokenize("hello world"), _tokenize("foo bar")) == 0

    def test_empty(self):
        # An empty side (or both) always yields zero.
        assert max_exact_run([], []) == 0
        assert max_exact_run(["a"], []) == 0
|
|
|
|
class TestTokenOverlapJaccard:
    """Unit tests for Jaccard similarity over token sets."""

    def test_identical(self):
        # Identical token sets give a similarity of exactly 1.0.
        words = _tokenize("hello world")
        assert token_overlap_jaccard(words, words) == 1.0

    def test_no_overlap(self):
        # Completely disjoint token sets give exactly 0.0.
        left = _tokenize("hello world")
        right = _tokenize("foo bar")
        assert token_overlap_jaccard(left, right) == 0.0

    def test_partial(self):
        left = _tokenize("hello world foo")
        right = _tokenize("hello bar baz")
        # intersection: {hello}, union: {hello, world, foo, bar, baz} -> 1/5
        assert abs(token_overlap_jaccard(left, right) - 0.2) < 0.01
|
|
|
|
class TestNgramJaccard:
    """Unit tests for character n-gram Jaccard similarity."""

    def test_identical(self):
        # Equal strings share all n-grams -> similarity 1.0.
        assert ngram_jaccard("hello", "hello") == 1.0

    def test_different(self):
        # Strings with no characters in common share no n-grams.
        assert ngram_jaccard("abc", "xyz") == 0.0

    def test_short(self):
        # Inputs shorter than the 3-gram window produce no grams to compare.
        assert ngram_jaccard("ab", "cd") == 0.0  # too short for 3-grams
|
|
|
|
class TestLcsRatio:
    """Unit tests for the longest-common-subsequence ratio metric."""

    def test_identical(self):
        # A sequence against itself has an LCS ratio of exactly 1.0.
        words = _tokenize("multi factor authentication required")
        assert lcs_ratio(words, words) == 1.0

    def test_partial(self):
        left = _tokenize("multi factor authentication")
        right = _tokenize("single factor verification")
        # LCS: "factor" (length 1), max(3,3) = 3, ratio = 1/3
        value = lcs_ratio(left, right)
        assert 0.3 < value < 0.4

    def test_empty(self):
        # Two empty sequences compare as completely dissimilar.
        assert lcs_ratio([], []) == 0.0
|
|
|
|
class TestCheckSimilarity:
    """End-to-end tests for the combined ``check_similarity`` report.

    NOTE(review): every call points ``embedding_url`` at port 99999, an
    unreachable/invalid port — presumably so the embedding metric fails fast
    and the detector's non-embedding metrics are exercised on their own.
    Confirm against the detector's error handling.
    """

    @pytest.mark.asyncio
    async def test_identical_texts_fail(self):
        sentence = "Multi-factor authentication must be enforced for all administrative accounts."
        outcome = await check_similarity(sentence, sentence, embedding_url="http://localhost:99999")
        # Comparing a text with itself maximises token overlap and must FAIL.
        assert outcome.token_overlap == 1.0
        assert outcome.status == "FAIL"

    @pytest.mark.asyncio
    async def test_different_texts_pass(self):
        first = "Die Anwendung muss eine Zwei-Faktor-Authentisierung implementieren."
        second = "Network traffic should be encrypted using TLS 1.3 at minimum."
        outcome = await check_similarity(first, second, embedding_url="http://localhost:99999")
        # Unrelated texts (even across languages) share almost no tokens.
        assert outcome.token_overlap < 0.1
        assert outcome.status == "PASS"

    @pytest.mark.asyncio
    async def test_report_fields(self):
        outcome = await check_similarity("hello world", "foo bar", embedding_url="http://localhost:99999")
        # The report must expose all five metrics plus status and details.
        expected_fields = (
            "max_exact_run",
            "token_overlap",
            "ngram_jaccard",
            "embedding_cosine",
            "lcs_ratio",
            "status",
            "details",
        )
        for field_name in expected_fields:
            assert hasattr(outcome, field_name)
        assert outcome.status in ("PASS", "WARN", "FAIL")