feat(canonical-controls): Canonical Control Library — rechtssichere Security Controls
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 40s
CI/CD / test-python-backend-compliance (push) Successful in 41s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 18s
CI/CD / deploy-hetzner (push) Successful in 2m26s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 40s
CI/CD / test-python-backend-compliance (push) Successful in 41s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 18s
CI/CD / deploy-hetzner (push) Successful in 2m26s
Eigenstaendig formulierte Security Controls mit unabhaengiger Taxonomie und Open-Source-Verankerung (OWASP, NIST, ENISA). Keine BSI-Nomenklatur. - Migration 044: 5 DB-Tabellen (frameworks, controls, sources, licenses, mappings) - 10 Seed Controls mit 39 Open-Source-Referenzen - License Gate: Quellen-Berechtigungspruefung (analysis/excerpt/embeddings/product) - Too-Close-Detektor: 5 Metriken (exact-phrase, token-overlap, ngram, embedding, LCS) - REST API: 8 Endpoints unter /v1/canonical/ - Go Loader mit Multi-Index (ID, domain, severity, framework) - Frontend: Control Library Browser + Provenance Wiki - CI/CD: validate-controls.py Job (schema, no-leak, open-anchors) - 67 Tests (8 Go + 59 Python), alle PASS - MkDocs Dokumentation Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
118
backend-compliance/compliance/tests/test_similarity_detector.py
Normal file
118
backend-compliance/compliance/tests/test_similarity_detector.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Tests for the Too-Close Similarity Detector."""
|
||||
|
||||
import pytest
|
||||
from compliance.services.similarity_detector import (
|
||||
max_exact_run,
|
||||
token_overlap_jaccard,
|
||||
ngram_jaccard,
|
||||
lcs_ratio,
|
||||
check_similarity,
|
||||
_tokenize,
|
||||
)
|
||||
|
||||
|
||||
class TestTokenize:
    """Unit tests for the private ``_tokenize`` helper."""

    def test_basic(self):
        # Mixed-case words and digits are lowercased and split on whitespace.
        result = _tokenize("Hello World 123")
        assert result == ["hello", "world", "123"]

    def test_german_umlauts(self):
        # Umlaut characters must not introduce extra word boundaries.
        result = _tokenize("Schutzmaßnahmen für Daten")
        assert len(result) == 3

    def test_empty(self):
        # An empty string yields no tokens at all.
        assert _tokenize("") == []
|
||||
|
||||
|
||||
class TestMaxExactRun:
    """Unit tests for the longest shared consecutive-token run."""

    def test_identical(self):
        # A sequence compared with itself matches along its full length.
        sentence = _tokenize("the quick brown fox jumps over the lazy dog")
        assert max_exact_run(sentence, sentence) == len(sentence)

    def test_partial_match(self):
        # Only "quick brown" is shared as a consecutive run.
        left = _tokenize("the quick brown fox")
        right = _tokenize("a quick brown cat")
        assert max_exact_run(left, right) == 2

    def test_no_match(self):
        # Disjoint vocabularies yield a zero-length run.
        assert max_exact_run(_tokenize("hello world"), _tokenize("foo bar")) == 0

    def test_empty(self):
        # An empty side (or both) produces no run.
        assert max_exact_run([], []) == 0
        assert max_exact_run(["a"], []) == 0
|
||||
|
||||
|
||||
class TestTokenOverlapJaccard:
    """Unit tests for Jaccard similarity over token sets."""

    def test_identical(self):
        # A token set is maximally similar to itself.
        words = _tokenize("hello world")
        assert token_overlap_jaccard(words, words) == 1.0

    def test_no_overlap(self):
        # No shared tokens -> similarity of zero.
        left = _tokenize("hello world")
        right = _tokenize("foo bar")
        assert token_overlap_jaccard(left, right) == 0.0

    def test_partial(self):
        # intersection: {hello}; union: {hello, world, foo, bar, baz} -> 1/5
        left = _tokenize("hello world foo")
        right = _tokenize("hello bar baz")
        assert abs(token_overlap_jaccard(left, right) - 0.2) < 0.01
|
||||
|
||||
|
||||
class TestNgramJaccard:
    """Unit tests for character n-gram Jaccard similarity."""

    def test_identical(self):
        # Equal strings share every n-gram.
        assert ngram_jaccard("hello", "hello") == 1.0

    def test_different(self):
        # Strings without a single common trigram score zero.
        assert ngram_jaccard("abc", "xyz") == 0.0

    def test_short(self):
        # Inputs shorter than the n-gram size (3) cannot produce any n-grams.
        assert ngram_jaccard("ab", "cd") == 0.0
|
||||
|
||||
|
||||
class TestLcsRatio:
    """Unit tests for the longest-common-subsequence ratio."""

    def test_identical(self):
        # A sequence compared with itself yields a perfect ratio.
        phrase = _tokenize("multi factor authentication required")
        assert lcs_ratio(phrase, phrase) == 1.0

    def test_partial(self):
        # LCS is just "factor" (length 1); max(3, 3) = 3 -> ratio = 1/3.
        left = _tokenize("multi factor authentication")
        right = _tokenize("single factor verification")
        ratio = lcs_ratio(left, right)
        assert 0.3 < ratio < 0.4

    def test_empty(self):
        # Two empty sequences have no common subsequence.
        assert lcs_ratio([], []) == 0.0
|
||||
|
||||
|
||||
class TestCheckSimilarity:
    """Integration tests for the aggregated similarity check.

    The embedding service URL deliberately points at an unreachable
    port, presumably so the embedding metric is skipped or degrades
    gracefully while the lexical metrics drive the verdict — confirm
    against the detector implementation.
    """

    @pytest.mark.asyncio
    async def test_identical_texts_fail(self):
        # Comparing a text against itself maximises overlap and must FAIL.
        text = "Multi-factor authentication must be enforced for all administrative accounts."
        report = await check_similarity(text, text, embedding_url="http://localhost:99999")
        assert report.token_overlap == 1.0
        assert report.status == "FAIL"

    @pytest.mark.asyncio
    async def test_different_texts_pass(self):
        # Unrelated texts (German vs. English) share almost nothing and must PASS.
        source = "Die Anwendung muss eine Zwei-Faktor-Authentisierung implementieren."
        candidate = "Network traffic should be encrypted using TLS 1.3 at minimum."
        report = await check_similarity(source, candidate, embedding_url="http://localhost:99999")
        assert report.token_overlap < 0.1
        assert report.status == "PASS"

    @pytest.mark.asyncio
    async def test_report_fields(self):
        # Every report exposes the five metric fields plus status/details.
        report = await check_similarity("hello world", "foo bar", embedding_url="http://localhost:99999")
        expected_fields = (
            "max_exact_run",
            "token_overlap",
            "ngram_jaccard",
            "embedding_cosine",
            "lcs_ratio",
            "status",
            "details",
        )
        for field in expected_fields:
            assert hasattr(report, field)
        assert report.status in ("PASS", "WARN", "FAIL")
|
||||
Reference in New Issue
Block a user