feat(canonical-controls): Canonical Control Library — rechtssichere Security Controls
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 40s
CI/CD / test-python-backend-compliance (push) Successful in 41s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 18s
CI/CD / deploy-hetzner (push) Successful in 2m26s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 40s
CI/CD / test-python-backend-compliance (push) Successful in 41s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 18s
CI/CD / deploy-hetzner (push) Successful in 2m26s
Eigenstaendig formulierte Security Controls mit unabhaengiger Taxonomie und Open-Source-Verankerung (OWASP, NIST, ENISA). Keine BSI-Nomenklatur. - Migration 044: 5 DB-Tabellen (frameworks, controls, sources, licenses, mappings) - 10 Seed Controls mit 39 Open-Source-Referenzen - License Gate: Quellen-Berechtigungspruefung (analysis/excerpt/embeddings/product) - Too-Close-Detektor: 5 Metriken (exact-phrase, token-overlap, ngram, embedding, LCS) - REST API: 8 Endpoints unter /v1/canonical/ - Go Loader mit Multi-Index (ID, domain, severity, framework) - Frontend: Control Library Browser + Provenance Wiki - CI/CD: validate-controls.py Job (schema, no-leak, open-anchors) - 67 Tests (8 Go + 59 Python), alle PASS - MkDocs Dokumentation Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
118
backend-compliance/compliance/tests/test_similarity_detector.py
Normal file
118
backend-compliance/compliance/tests/test_similarity_detector.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Tests for the Too-Close Similarity Detector."""
|
||||
|
||||
import pytest
|
||||
from compliance.services.similarity_detector import (
|
||||
max_exact_run,
|
||||
token_overlap_jaccard,
|
||||
ngram_jaccard,
|
||||
lcs_ratio,
|
||||
check_similarity,
|
||||
_tokenize,
|
||||
)
|
||||
|
||||
|
||||
class TestTokenize:
    """Unit tests for the private ``_tokenize`` helper."""

    def test_basic(self):
        # Mixed-case words and digits are lowercased and split on whitespace.
        result = _tokenize("Hello World 123")
        assert result == ["hello", "world", "123"]

    def test_german_umlauts(self):
        # Umlaut characters must not introduce extra word boundaries.
        result = _tokenize("Schutzmaßnahmen für Daten")
        assert len(result) == 3

    def test_empty(self):
        # An empty string yields no tokens at all.
        assert _tokenize("") == []
|
||||
|
||||
|
||||
class TestMaxExactRun:
    """Unit tests for the longest shared consecutive-token run."""

    def test_identical(self):
        # A sequence compared with itself matches along its full length.
        sentence = _tokenize("the quick brown fox jumps over the lazy dog")
        assert max_exact_run(sentence, sentence) == len(sentence)

    def test_partial_match(self):
        # Only "quick brown" is shared as a consecutive run.
        left = _tokenize("the quick brown fox")
        right = _tokenize("a quick brown cat")
        assert max_exact_run(left, right) == 2

    def test_no_match(self):
        # Disjoint vocabularies yield a zero-length run.
        assert max_exact_run(_tokenize("hello world"), _tokenize("foo bar")) == 0

    def test_empty(self):
        # An empty side (or both) produces no run.
        assert max_exact_run([], []) == 0
        assert max_exact_run(["a"], []) == 0
|
||||
|
||||
|
||||
class TestTokenOverlapJaccard:
    """Unit tests for Jaccard similarity over token sets."""

    def test_identical(self):
        # A token set is maximally similar to itself.
        words = _tokenize("hello world")
        assert token_overlap_jaccard(words, words) == 1.0

    def test_no_overlap(self):
        # No shared tokens -> similarity of zero.
        left = _tokenize("hello world")
        right = _tokenize("foo bar")
        assert token_overlap_jaccard(left, right) == 0.0

    def test_partial(self):
        # intersection: {hello}; union: {hello, world, foo, bar, baz} -> 1/5
        left = _tokenize("hello world foo")
        right = _tokenize("hello bar baz")
        assert abs(token_overlap_jaccard(left, right) - 0.2) < 0.01
|
||||
|
||||
|
||||
class TestNgramJaccard:
    """Unit tests for character n-gram Jaccard similarity."""

    def test_identical(self):
        # Equal strings share every n-gram.
        assert ngram_jaccard("hello", "hello") == 1.0

    def test_different(self):
        # Strings without a single common trigram score zero.
        assert ngram_jaccard("abc", "xyz") == 0.0

    def test_short(self):
        # Inputs shorter than the n-gram size (3) cannot produce any n-grams.
        assert ngram_jaccard("ab", "cd") == 0.0
|
||||
|
||||
|
||||
class TestLcsRatio:
    """Unit tests for the longest-common-subsequence ratio."""

    def test_identical(self):
        # A sequence compared with itself yields a perfect ratio.
        phrase = _tokenize("multi factor authentication required")
        assert lcs_ratio(phrase, phrase) == 1.0

    def test_partial(self):
        # LCS is just "factor" (length 1); max(3, 3) = 3 -> ratio = 1/3.
        left = _tokenize("multi factor authentication")
        right = _tokenize("single factor verification")
        ratio = lcs_ratio(left, right)
        assert 0.3 < ratio < 0.4

    def test_empty(self):
        # Two empty sequences have no common subsequence.
        assert lcs_ratio([], []) == 0.0
|
||||
|
||||
|
||||
class TestCheckSimilarity:
    """Integration tests for the aggregated similarity check.

    The embedding service URL deliberately points at an unreachable
    port, presumably so the embedding metric is skipped or degrades
    gracefully while the lexical metrics drive the verdict — confirm
    against the detector implementation.
    """

    @pytest.mark.asyncio
    async def test_identical_texts_fail(self):
        # Comparing a text against itself maximises overlap and must FAIL.
        text = "Multi-factor authentication must be enforced for all administrative accounts."
        report = await check_similarity(text, text, embedding_url="http://localhost:99999")
        assert report.token_overlap == 1.0
        assert report.status == "FAIL"

    @pytest.mark.asyncio
    async def test_different_texts_pass(self):
        # Unrelated texts (German vs. English) share almost nothing and must PASS.
        source = "Die Anwendung muss eine Zwei-Faktor-Authentisierung implementieren."
        candidate = "Network traffic should be encrypted using TLS 1.3 at minimum."
        report = await check_similarity(source, candidate, embedding_url="http://localhost:99999")
        assert report.token_overlap < 0.1
        assert report.status == "PASS"

    @pytest.mark.asyncio
    async def test_report_fields(self):
        # Every report exposes the five metric fields plus status/details.
        report = await check_similarity("hello world", "foo bar", embedding_url="http://localhost:99999")
        expected_fields = (
            "max_exact_run",
            "token_overlap",
            "ngram_jaccard",
            "embedding_cosine",
            "lcs_ratio",
            "status",
            "details",
        )
        for field in expected_fields:
            assert hasattr(report, field)
        assert report.status in ("PASS", "WARN", "FAIL")
|
||||
Reference in New Issue
Block a user