breakpilot-compliance/backend-compliance/tests/test_vocabulary.py

"""Characterization tests for the Domain Vocabulary (data, not code).

Pins the IDENTITY-vs-REPRESENTATION contract: regulations have a stable id + canonical name + aliases
(so CRA and "Cyber Resilience Act" resolve to the SAME identity — the normalization that the KPIs
flagged). Journey classes cluster transition instances so we do not duplicate the same journey; they
are PROVISIONAL (no MJRN minting) and reference regulation ids that exist in the vocabulary.
"""

from __future__ import annotations

import os

import yaml

_VOCAB = os.path.join(os.path.dirname(__file__), "..", "knowledge", "vocabulary")


def _regs():
    """Parse ``regulations.yaml`` from the vocabulary directory.

    Returns the value stored under the document's top-level ``regulations`` key.
    """
    path = os.path.join(_VOCAB, "regulations.yaml")
    with open(path, encoding="utf-8") as stream:
        document = yaml.safe_load(stream)
    return document["regulations"]


def _classes():
    """Parse ``journey_classes.yaml`` from the vocabulary directory.

    Returns the whole parsed document (callers index it by ``status`` and ``classes``).
    """
    path = os.path.join(_VOCAB, "journey_classes.yaml")
    with open(path, encoding="utf-8") as stream:
        document = yaml.safe_load(stream)
    return document


def _norm(s):
    return "".join(c for c in str(s).lower() if c.isalnum())


def _alias_map():
    """Build a dict from normalized name to regulation id.

    Each regulation contributes its ``canonical`` name followed by its ``aliases``
    (treated as empty when the key is absent). On a key collision the entry seen
    last wins, because later assignments overwrite earlier ones.
    """
    return {
        _norm(spelling): reg["id"]
        for reg in _regs()
        for spelling in [reg["canonical"], *reg.get("aliases", [])]
    }


def test_every_regulation_has_id_canonical_aliases():
    """Every regulation entry has a truthy id, canonical name and alias list, and a lowercase id."""
    required_fields = ("id", "canonical", "aliases")
    for entry in _regs():
        # Lazy generator: fields are looked up in order and checking stops at the first falsy one.
        assert all(entry[field] for field in required_fields)
        # Ids serve as stable keys, so they must already be in lowercase form.
        reg_id = entry["id"]
        assert reg_id.lower() == reg_id


def test_cra_spellings_resolve_to_one_identity():
    """The abbreviation, the full name and the regulation number all resolve to id ``cra``."""
    lookup = _alias_map()
    # The normalization the KPIs needed: "CRA" and "Cyber Resilience Act" are one identity.
    cra_spellings = ("CRA", "Cyber Resilience Act", "Regulation (EU) 2024/2847")
    for spelling in cra_spellings:
        assert lookup[_norm(spelling)] == "cra"


def test_iso_and_management_system_aliases_resolve():
    """ISO standard names, management-system abbreviations and MaschinenVO spellings resolve."""
    lookup = _alias_map()
    expectations = (
        ("ISO9001", "iso9001"),
        ("QMS", "iso9001"),
        ("ISO/IEC 27001", "iso27001"),
        ("ISMS", "iso27001"),
        ("Maschinenverordnung", "maschinenvo"),
        ("MaschinenVO", "maschinenvo"),
    )
    for spelling, expected_id in expectations:
        assert lookup[_norm(spelling)] == expected_id


def test_aliases_are_unambiguous():
    """No normalized name (canonical or alias) is claimed by two different regulation ids."""
    owner_by_key = {}
    for reg in _regs():
        spellings = [reg["canonical"], *reg.get("aliases", [])]
        reg_id = reg["id"]
        for spelling in spellings:
            key = _norm(spelling)
            # A key may repeat within one regulation, but must never belong to another one.
            unclaimed_or_same = key not in owner_by_key or owner_by_key[key] == reg_id
            assert unclaimed_or_same, "ambiguous alias %r" % spelling
            owner_by_key[key] = reg_id


def test_journey_classes_are_provisional():
    """The journey-classes document declares its ``status`` as ``provisional``."""
    # Journey classes are a new abstraction and have to earn their own Rule of Three.
    status = _classes()["status"]
    assert status == "provisional"


def test_iso9001_maschinenvo_is_an_instance_not_a_new_kind():
    """iso9001->maschinenvo is listed as an instance of the ``qm-to-product-compliance`` class."""
    matching = [
        journey_class
        for journey_class in _classes()["classes"]
        if journey_class["id"] == "qm-to-product-compliance"
    ]
    qm_class = matching[0]

    transitions = set()
    for instance in qm_class["instances"]:
        transitions.add((instance["from"], instance["to"]))

    # Filed under the existing class (per the original note, alongside iso9001->cra and
    # iso13485->mdr) rather than as a new kind of journey.
    assert ("iso9001", "maschinenvo") in transitions
    # The class generalises across domains: the iso13485->mdr pair belongs to it as well.
    assert ("iso13485", "mdr") in transitions


def test_class_endpoints_reference_known_regulations():
    """Every ``from``/``to`` endpoint of every class instance is an id defined in the regulations."""
    known_ids = {reg["id"] for reg in _regs()}
    for journey_class in _classes()["classes"]:
        for instance in journey_class["instances"]:
            # Checked in from-then-to order; this keeps the vocabulary internally consistent.
            for endpoint in ("from", "to"):
                assert instance[endpoint] in known_ids