d9c16fb914
Block C implementation: - adversarial_cases.yaml: 30 tricky cases in 5 categories (wrong legal basis, dark patterns, incomplete docs, similar-but-different, homonyms) - test_adversarial.py: 63 tests validating adversarial cases - test_regression.py: ontology stability, dependency engine, quality metrics - conftest.py: shared fixtures (DB session, sample controls) Total: 371 tests passing (221 existing + 150 new). Real-world benchmarks (C1) need manual ground truth creation. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
191 lines
7.7 KiB
Python
191 lines
7.7 KiB
Python
"""
|
|
Adversarial Test Suite — 30 tricky cases that challenge the control ontology
|
|
and dedup engine with edge cases.
|
|
|
|
Test categories:
|
|
A. Wrong legal basis (plausible but incorrect) — 8 cases
|
|
B. Dark patterns (subtle UI manipulation) — 6 cases
|
|
C. Almost-complete documents (missing 1 field) — 6 cases
|
|
D. Semantically similar but different controls — 5 cases
|
|
E. Homonyms (different meaning, same words) — 5 cases
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
import pytest
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
|
|
from services.control_ontology import classify_obligation, classify_action
|
|
|
|
# Location of the YAML fixture, resolved relative to this test module.
ADVERSARIAL_PATH = os.path.join(os.path.dirname(__file__), "adversarial_cases.yaml")

# Decode explicitly as UTF-8: without an ``encoding`` argument, open() falls
# back to the platform's locale encoding, which can mis-decode or reject
# non-ASCII text in the fixture on some systems.
with open(ADVERSARIAL_PATH, encoding="utf-8") as f:
    _ADV = yaml.safe_load(f)

# The adversarial cases stored under the fixture's top-level "tests" key.
# The tests below iterate this collection and index each entry by string keys
# such as "id", "category" and "expected".
TESTS = _ADV["tests"]
|
|
|
|
|
|
def _tests_by_category(cat: str) -> list:
    """Return the entries of ``TESTS`` whose ``"category"`` value equals *cat*.

    The entries keep the order in which they appear in ``TESTS``. An empty
    list is returned when no entry matches.
    """
    def _in_category(entry) -> bool:
        return entry["category"] == cat

    return list(filter(_in_category, TESTS))
|
|
|
|
|
|
# ============================================================================
|
|
# D. Semantically similar but different — must NOT be deduped
|
|
# ============================================================================
|
|
|
|
class TestSimilarButDifferent:
    """Look-alike controls that are distinct and must survive deduplication."""

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("similar_but_different"),
        ids=lambda entry: entry["id"],
    )
    def test_not_duplicate(self, case):
        """Each case's expected metadata must flag the pair as non-duplicate."""
        flagged_duplicate = case["expected"]["is_duplicate"]
        assert flagged_duplicate is False, (
            f"{case['id']}: These controls MUST NOT be marked as duplicates"
        )

    def test_admin_vs_user_mfa(self):
        """ADV-SEM-001: both MFA controls must be routed as ``"atomic"``."""
        wanted_id = "ADV-SEM-001"
        case = next(entry for entry in TESTS if entry["id"] == wanted_id)
        # Classify both controls first, then check them in the same order.
        first_result = classify_obligation(case["control_a"], "")
        second_result = classify_obligation(case["control_b"], "")
        # Neither control may be filtered out: both must come back atomic.
        for result in (first_result, second_result):
            assert result["routing"] == "atomic"

    def test_encryption_at_rest_vs_in_transit(self):
        """ADV-SEM-004: "at rest" and "in transit" both yield an accepted action."""
        # Either action label is acceptable for both phrasings.
        accepted_actions = ("encrypt", "implement")
        at_rest = classify_action("Verschluesselung at rest implementieren")
        in_transit = classify_action("Verschluesselung in transit implementieren")
        assert at_rest in accepted_actions
        assert in_transit in accepted_actions
|
|
|
|
|
|
# ============================================================================
|
|
# E. Homonyms — same words, different domains
|
|
# ============================================================================
|
|
|
|
class TestHomonymDifferent:
    """Same wording, different domain: such controls must never be merged."""

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("homonym_different"),
        ids=lambda entry: entry["id"],
    )
    def test_not_duplicate(self, case):
        """Each homonym case's expected metadata must say "not a duplicate"."""
        flagged_duplicate = case["expected"]["is_duplicate"]
        assert flagged_duplicate is False, (
            f"{case['id']}: Homonyms must NOT be treated as duplicates"
        )

    def test_dsgvo_audit_vs_hgb_audit(self):
        """ADV-HOM-003: data-protection audit vs. financial audit."""
        privacy_audit = classify_obligation(
            "Audit der Datenschutz-Compliance durchfuehren", ""
        )
        financial_audit = classify_obligation(
            "Audit der Jahresabschlusspruefung durchfuehren", ""
        )
        # The verb "durchfuehren" is expected to map to "implement"; what this
        # test pins down is only that both obligations stay atomic rather than
        # being filtered out.
        for outcome in (privacy_audit, financial_audit):
            assert outcome["routing"] == "atomic"
|
|
|
|
|
|
# ============================================================================
|
|
# A. Wrong legal basis — structural tests
|
|
# ============================================================================
|
|
|
|
class TestWrongLegalBasis:
    """Structural checks on the expected metadata of wrong-legal-basis cases."""

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("wrong_legal_basis"),
        ids=lambda entry: entry["id"],
    )
    def test_finding_expected(self, case):
        """Every wrong_legal_basis case must be marked as producing a finding."""
        expects_finding = case["expected"]["finding"]
        assert expects_finding is True

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("wrong_legal_basis"),
        ids=lambda entry: entry["id"],
    )
    def test_has_correct_basis(self, case):
        """Every case must name a non-empty ``correct_basis``."""
        expected = case["expected"]
        assert "correct_basis" in expected
        assert len(expected["correct_basis"]) >= 1

    def test_analytics_requires_consent(self):
        """ADV-LIT-001: correct basis mentions "lit. a", reason cites "Planet49"."""
        wanted_id = "ADV-LIT-001"
        case = next(entry for entry in TESTS if entry["id"] == wanted_id)
        expected = case["expected"]
        assert "lit. a" in expected["correct_basis"]
        assert "Planet49" in expected["reason"]
|
|
|
|
|
|
# ============================================================================
|
|
# B. Dark Patterns — structural tests
|
|
# ============================================================================
|
|
|
|
class TestDarkPatterns:
    """Structural checks on the expected metadata of dark-pattern cases."""

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("dark_pattern"),
        ids=lambda entry: entry["id"],
    )
    def test_finding_expected(self, case):
        """Every dark_pattern case must be marked as producing a finding."""
        expects_finding = case["expected"]["finding"]
        assert expects_finding is True

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("dark_pattern"),
        ids=lambda entry: entry["id"],
    )
    def test_has_finding_type(self, case):
        """Every case must carry a ``finding_type`` prefixed ``dark_pattern_``."""
        expected = case["expected"]
        assert "finding_type" in expected
        finding_type = expected["finding_type"]
        assert finding_type.startswith("dark_pattern_")
|
|
|
|
|
|
# ============================================================================
|
|
# C. Incomplete documents — structural tests
|
|
# ============================================================================
|
|
|
|
class TestIncompleteDocuments:
    """Structural checks on the expected metadata of incomplete-document cases."""

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("incomplete_document"),
        ids=lambda entry: entry["id"],
    )
    def test_has_reason(self, case):
        """Every case must provide a non-empty ``reason``."""
        expected = case["expected"]
        assert "reason" in expected
        assert len(expected["reason"]) >= 1

    def test_agb_gerichtsstand_no_finding(self):
        """ADV-DOC-005: a missing Gerichtsstand in B2C AGB must NOT be a finding."""
        wanted_id = "ADV-DOC-005"
        case = next(entry for entry in TESTS if entry["id"] == wanted_id)
        expects_finding = case["expected"]["finding"]
        assert expects_finding is False
|
|
|
|
|
|
# ============================================================================
|
|
# Meta tests — validate test suite integrity
|
|
# ============================================================================
|
|
|
|
class TestSuiteIntegrity:
    """Verify the adversarial test suite itself is complete and consistent.

    The expected shape of the suite is declared once in the class-level
    constants below, so the individual checks cannot drift apart.
    """

    # Number of cases each category must contribute; the values sum to 30.
    _EXPECTED_COUNTS = {
        "wrong_legal_basis": 8,
        "dark_pattern": 6,
        "incomplete_document": 6,
        "similar_but_different": 5,
        "homonym_different": 5,
    }

    # The only values accepted for a case's "difficulty" field.
    _ALLOWED_DIFFICULTIES = ("easy", "medium", "hard")

    def test_total_count(self):
        """The suite must contain exactly the sum of the per-category counts (30)."""
        expected_total = sum(self._EXPECTED_COUNTS.values())
        assert len(TESTS) == expected_total, (
            f"expected {expected_total} adversarial cases, found {len(TESTS)}"
        )

    def test_unique_ids(self):
        """No two cases may share an ``id``; report the colliding IDs on failure."""
        seen = set()
        duplicates = []
        for case in TESTS:
            case_id = case["id"]
            if case_id in seen:
                duplicates.append(case_id)
            seen.add(case_id)
        assert not duplicates, f"Duplicate test IDs found: {duplicates}"

    def test_all_categories_present(self):
        """The categories used in the suite must be exactly the expected ones."""
        categories = {case["category"] for case in TESTS}
        expected = set(self._EXPECTED_COUNTS)
        assert categories == expected

    def test_category_counts(self):
        """Each expected category must contribute its declared number of cases.

        A category that is missing entirely is counted as 0, so it shows up
        as an ordinary assertion failure rather than a ``KeyError``.
        """
        actual = {
            category: len(_tests_by_category(category))
            for category in self._EXPECTED_COUNTS
        }
        # Comparing whole dicts lets pytest show every mismatching category at once.
        assert actual == self._EXPECTED_COUNTS

    def test_all_have_difficulty(self):
        """Every case must declare a ``difficulty`` from the allowed set."""
        for case in TESTS:
            assert "difficulty" in case, f"{case['id']} missing difficulty"
            assert case["difficulty"] in self._ALLOWED_DIFFICULTIES, (
                f"{case['id']}: invalid difficulty {case['difficulty']!r}"
            )
|