Files
breakpilot-core/control-pipeline/tests/test_adversarial.py
T
Benjamin Admin d9c16fb914 feat(pipeline): add adversarial tests (30 cases) + regression harness
Block C implementation:
- adversarial_cases.yaml: 30 tricky cases in 5 categories
  (wrong legal basis, dark patterns, incomplete docs, similar-but-different, homonyms)
- test_adversarial.py: 63 tests validating adversarial cases
- test_regression.py: ontology stability, dependency engine, quality metrics
- conftest.py: shared fixtures (DB session, sample controls)

Total: 371 tests passing (221 existing + 150 new).
Real-world benchmarks (C1) need manual ground truth creation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-01 13:02:29 +02:00

191 lines
7.7 KiB
Python

"""
Adversarial Test Suite — 30 tricky cases that challenge the control ontology
and dedup engine with edge cases.
Test categories:
A. Wrong legal basis (plausible but incorrect) — 8 cases
B. Dark patterns (subtle UI manipulation) — 6 cases
C. Almost-complete documents (missing 1 field) — 6 cases
D. Semantically similar but different controls — 5 cases
E. Homonyms (different meaning, same words) — 5 cases
"""
import os
import sys
from collections import Counter

import pytest
import yaml
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from services.control_ontology import classify_obligation, classify_action
# Location of the YAML fixture holding the adversarial cases; it sits in the
# same directory as this test module.
ADVERSARIAL_PATH = os.path.join(os.path.dirname(__file__), "adversarial_cases.yaml")

# Decode explicitly as UTF-8 so that loading the fixture does not depend on
# the platform's locale encoding (the default for text-mode ``open``).
with open(ADVERSARIAL_PATH, encoding="utf-8") as f:
    _ADV = yaml.safe_load(f)

# Case entries, taken from the top-level "tests" key of the fixture.
TESTS = _ADV["tests"]
def _tests_by_category(cat: str) -> list:
    """Return the adversarial cases whose ``category`` equals *cat*.

    Cases keep the order in which they appear in ``TESTS``; a category that
    matches nothing yields an empty list.
    """
    matching = []
    for entry in TESTS:
        if entry["category"] == cat:
            matching.append(entry)
    return matching
# ============================================================================
# D. Semantically similar but different — must NOT be deduped
# ============================================================================
class TestSimilarButDifferent:
    """Pairs of controls that read alike yet must remain separate controls."""

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("similar_but_different"),
        ids=lambda entry: entry["id"],
    )
    def test_not_duplicate(self, case):
        """Every case in this category declares ``is_duplicate`` as False."""
        flagged = case["expected"]["is_duplicate"]
        assert flagged is False, (
            f"{case['id']}: These controls MUST NOT be marked as duplicates"
        )

    def test_admin_vs_user_mfa(self):
        """ADV-SEM-001: both controls of the pair are routed as atomic."""
        mfa_case = next(entry for entry in TESTS if entry["id"] == "ADV-SEM-001")
        # Classify both sides first, then check that neither was routed
        # anywhere other than "atomic".
        verdicts = [
            classify_obligation(mfa_case[side], "")
            for side in ("control_a", "control_b")
        ]
        for verdict in verdicts:
            assert verdict["routing"] == "atomic"

    def test_encryption_at_rest_vs_in_transit(self):
        """ADV-SEM-004: both encryption phrasings map to an accepted action."""
        accepted = ("encrypt", "implement")
        at_rest = classify_action("Verschluesselung at rest implementieren")
        in_transit = classify_action("Verschluesselung in transit implementieren")
        # Either label is acceptable for each of the two phrasings.
        assert at_rest in accepted
        assert in_transit in accepted
# ============================================================================
# E. Homonyms — same words, different domains
# ============================================================================
class TestHomonymDifferent:
    """Controls that share wording across domains and must not be merged."""

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("homonym_different"),
        ids=lambda entry: entry["id"],
    )
    def test_not_duplicate(self, case):
        """Every homonym case declares ``is_duplicate`` as False."""
        flagged = case["expected"]["is_duplicate"]
        assert flagged is False, (
            f"{case['id']}: Homonyms must NOT be treated as duplicates"
        )

    def test_dsgvo_audit_vs_hgb_audit(self):
        """ADV-HOM-003: data-protection audit and financial audit are atomic."""
        statements = (
            "Audit der Datenschutz-Compliance durchfuehren",
            "Audit der Jahresabschlusspruefung durchfuehren",
        )
        outcomes = [classify_obligation(text, "") for text in statements]
        # Only the routing is checked here. NOTE(review): the original author
        # states that "durchfuehren" maps to the "implement" action; this test
        # does not verify that mapping.
        for outcome in outcomes:
            assert outcome["routing"] == "atomic"
# ============================================================================
# A. Wrong legal basis — structural tests
# ============================================================================
class TestWrongLegalBasis:
    """Structural checks on the expected metadata of wrong-legal-basis cases."""

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("wrong_legal_basis"),
        ids=lambda entry: entry["id"],
    )
    def test_finding_expected(self, case):
        """Each wrong_legal_basis case sets ``expected.finding`` to True."""
        expected = case["expected"]
        assert expected["finding"] is True

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("wrong_legal_basis"),
        ids=lambda entry: entry["id"],
    )
    def test_has_correct_basis(self, case):
        """Each case names a non-empty ``correct_basis``."""
        expected = case["expected"]
        assert "correct_basis" in expected
        assert len(expected["correct_basis"]) > 0

    def test_analytics_requires_consent(self):
        """ADV-LIT-001: correct basis mentions "lit. a"; reason cites Planet49."""
        analytics_case = next(
            entry for entry in TESTS if entry["id"] == "ADV-LIT-001"
        )
        expected = analytics_case["expected"]
        assert "lit. a" in expected["correct_basis"]
        assert "Planet49" in expected["reason"]
# ============================================================================
# B. Dark Patterns — structural tests
# ============================================================================
class TestDarkPatterns:
    """Structural checks on the dark-pattern cases."""

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("dark_pattern"),
        ids=lambda entry: entry["id"],
    )
    def test_finding_expected(self, case):
        """Each dark_pattern case sets ``expected.finding`` to True."""
        expected = case["expected"]
        assert expected["finding"] is True

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("dark_pattern"),
        ids=lambda entry: entry["id"],
    )
    def test_has_finding_type(self, case):
        """Each case carries a ``finding_type`` prefixed with ``dark_pattern_``."""
        expected = case["expected"]
        assert "finding_type" in expected
        assert expected["finding_type"].startswith("dark_pattern_")
# ============================================================================
# C. Incomplete documents — structural tests
# ============================================================================
class TestIncompleteDocuments:
    """Structural checks on the incomplete-document cases."""

    @pytest.mark.parametrize(
        "case",
        _tests_by_category("incomplete_document"),
        ids=lambda entry: entry["id"],
    )
    def test_has_reason(self, case):
        """Each case states a non-empty ``reason`` in its expectation."""
        expected = case["expected"]
        assert "reason" in expected
        assert len(expected["reason"]) > 0

    def test_agb_gerichtsstand_no_finding(self):
        """ADV-DOC-005 (missing Gerichtsstand in B2C AGB) expects no finding."""
        agb_case = next(entry for entry in TESTS if entry["id"] == "ADV-DOC-005")
        expected = agb_case["expected"]
        assert expected["finding"] is False
# ============================================================================
# Meta tests — validate test suite integrity
# ============================================================================
class TestSuiteIntegrity:
    """Verify the adversarial test suite itself is complete and consistent."""

    # Number of cases each category must contribute. This is the single source
    # of truth for both the set of category names and the per-category counts
    # checked by the tests below.
    EXPECTED_CATEGORY_COUNTS = {
        "wrong_legal_basis": 8,
        "dark_pattern": 6,
        "incomplete_document": 6,
        "similar_but_different": 5,
        "homonym_different": 5,
    }

    def test_total_count(self):
        """The fixture holds exactly 30 cases."""
        assert len(TESTS) == 30, f"Expected 30 cases, found {len(TESTS)}"

    def test_unique_ids(self):
        """No two cases share an ``id``."""
        occurrences = Counter(t["id"] for t in TESTS)
        # Report the offending ids rather than only the fact of a collision.
        duplicates = [case_id for case_id, n in occurrences.items() if n > 1]
        assert not duplicates, f"Duplicate test IDs found: {duplicates}"

    def test_all_categories_present(self):
        """The categories used by the cases are exactly the expected five."""
        categories = {t["category"] for t in TESTS}
        assert categories == set(self.EXPECTED_CATEGORY_COUNTS)

    def test_category_counts(self):
        """Each category contributes the expected number of cases."""
        # Counter returns 0 for an absent category, so a missing category
        # shows up as an assertion failure instead of a KeyError.
        counts = Counter(t["category"] for t in TESTS)
        for category, expected in self.EXPECTED_CATEGORY_COUNTS.items():
            assert counts[category] == expected, (
                f"{category}: expected {expected} cases, found {counts[category]}"
            )

    def test_all_have_difficulty(self):
        """Every case declares a difficulty of easy, medium or hard."""
        for t in TESTS:
            assert "difficulty" in t, f"{t['id']} missing difficulty"
            assert t["difficulty"] in ("easy", "medium", "hard"), (
                f"{t['id']} has unknown difficulty {t['difficulty']!r}"
            )