# breakpilot-compliance/scripts/validate-controls.py

#!/usr/bin/env python3
"""
Canonical Control Library — CI/CD Validator

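Checks:
  1. Schema Validation     — JSON against defined structure
  2. License Gate          — Every mapping reference fulfils allowed_usages
                             (listed here, but main() below runs no such check)
  3. No-Leak Scanner       — Regex against forbidden locator patterns (e.g. O.Auth_*, O.Netz_*)
  4. Provenance Integrity  — Every referenced source_id exists in seed data
                             (listed here, but main() below runs no such check)
  5. Open Anchor Check     — Every control has >= 1 open anchor
  6. Independent Taxonomy  — No control ID starts with the BSI-style "O." prefix
  7. Evidence Fields       — Every evidence item is an object with type and description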

Usage:
  python scripts/validate-controls.py
"""

import json
import re
import sys
from pathlib import Path

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------

# The script lives in <repo>/scripts/, so the repository root is two levels up
# from the resolved location of this file.
REPO_ROOT = Path(__file__).resolve().parent.parent

# JSON control library that main() loads and validates.
CONTROLS_FILE = REPO_ROOT.joinpath(
    "ai-compliance-sdk",
    "policies",
    "canonical_controls_v1.json",
)

# SQL migration belonging to the control library.
MIGRATION_FILE = REPO_ROOT.joinpath(
    "backend-compliance",
    "migrations",
    "044_canonical_control_library.sql",
)

# ---------------------------------------------------------------------------
# Forbidden patterns (BSI proprietary nomenclature — must NOT appear in controls)
# ---------------------------------------------------------------------------

# Compiled once at import time; every entry is searched (not anchored) against
# the text of each product-facing field.
FORBIDDEN_PATTERNS = list(
    map(
        re.compile,
        (
            r"O\.[A-Za-z]+_[0-9]+",        # BSI objective IDs: O.Auth_1, O.Netz_3
            r"TR-03161",                   # Direct TR reference in control text
            r"BSI-TR-",                    # Direct BSI-TR reference
            r"Anforderung\s+[A-Z]\.\d+",   # BSI requirement format
        ),
    )
)

# Fields that are product-facing and must not contain forbidden patterns
PRODUCT_FIELDS = [
    "objective",
    "rationale",
    "title",
    "requirements",
    "test_procedure",
]

# ---------------------------------------------------------------------------
# Known open sources (from migration seed)
# ---------------------------------------------------------------------------

# Source identifiers treated as open.
KNOWN_OPEN_SOURCES = {
    "OWASP_ASVS",
    "OWASP_MASVS",
    "OWASP_TOP10",
    "NIST_SP800_53",
    "NIST_SP800_63B",
    "ENISA_GOOD_PRACTICES",
    "CIS_CONTROLS",
}

# Every known source identifier: the open ones plus the three BSI TR-03161 parts.
KNOWN_ALL_SOURCES = {
    *KNOWN_OPEN_SOURCES,
    "BSI_TR03161_1",
    "BSI_TR03161_2",
    "BSI_TR03161_3",
}

# ---------------------------------------------------------------------------
# Validators
# ---------------------------------------------------------------------------

# Module-wide collectors: the validators append to them, main() prints them.
errors: list[str] = []    # findings that make main() return 1
warnings: list[str] = []  # findings that are printed but do not fail the run


def error(msg: str) -> None:
    """Append *msg* to the module-level ``errors`` list."""
    errors.append(msg)


def warn(msg: str) -> None:
    """Append *msg* to the module-level ``warnings`` list."""
    warnings.append(msg)


def check_schema(data: dict) -> None:
    """Validate the JSON structure of the control library.

    Records one ``[SCHEMA]`` message through ``error`` for every problem
    found: missing top-level keys, missing per-control fields, malformed
    control IDs, unknown severities, non-numeric or out-of-range
    ``risk_score`` values, and controls whose ``domain`` is not declared in
    the top-level ``domains`` list.

    Structurally malformed input (wrong container types, non-object entries,
    values of an unexpected type) is reported as an error as well, so a broken
    file yields a validation report rather than a traceback.

    Args:
        data: Parsed content of the controls JSON file.
    """
    if not isinstance(data, dict):
        error("[SCHEMA] Top-level JSON value must be an object")
        return

    required_top = ["version", "schema", "framework", "total_controls", "domains", "controls"]
    for key in required_top:
        if key not in data:
            error(f"[SCHEMA] Missing top-level key: {key}")

    required_control = [
        "control_id", "title", "domain", "severity", "objective",
        "rationale", "scope", "requirements", "test_procedure",
        "evidence", "open_anchors",
    ]
    control_id_pattern = re.compile(r"^[A-Z]{2,6}-[0-9]{3}$")
    valid_severities = {"low", "medium", "high", "critical"}

    raw_controls = data.get("controls", [])
    if not isinstance(raw_controls, list):
        error("[SCHEMA] 'controls' must be a list")
        raw_controls = []

    # Only JSON objects can be inspected field by field; report anything else.
    controls = []
    for i, ctrl in enumerate(raw_controls):
        if isinstance(ctrl, dict):
            controls.append(ctrl)
        else:
            error(f"[SCHEMA] controls[{i}] is not an object")

    for ctrl in controls:
        cid = ctrl.get("control_id", "???")
        for key in required_control:
            if key not in ctrl:
                error(f"[SCHEMA] Control {cid}: missing field '{key}'")

        # fullmatch() instead of match(): "$" alone also matches just before a
        # trailing newline, which would let "ABC-001\n" through.
        if not isinstance(cid, str) or not control_id_pattern.fullmatch(cid):
            error(f"[SCHEMA] Control {cid}: ID does not match ^[A-Z]{{2,6}}-[0-9]{{3}}$")

        sev = ctrl.get("severity", "")
        # The isinstance guard keeps unhashable values (list/object) from
        # raising inside the set lookup.
        if not isinstance(sev, str) or sev not in valid_severities:
            error(f"[SCHEMA] Control {cid}: invalid severity '{sev}'")

        rs = ctrl.get("risk_score")
        if rs is not None:
            # bool is a subclass of int; JSON true/false is not a score.
            if isinstance(rs, bool) or not isinstance(rs, (int, float)):
                error(f"[SCHEMA] Control {cid}: risk_score {rs!r} is not a number")
            elif not (0 <= rs <= 10):
                error(f"[SCHEMA] Control {cid}: risk_score {rs} out of range [0, 10]")

    domains = data.get("domains", [])
    if not isinstance(domains, list):
        error("[SCHEMA] 'domains' must be a list")
        domains = []

    # A list (compared with ==) rather than a set, so that ids or control
    # domains of an unhashable type cannot raise during the membership test.
    domain_ids = []
    for i, dom in enumerate(domains):
        if isinstance(dom, dict) and "id" in dom:
            domain_ids.append(dom["id"])
        else:
            error(f"[SCHEMA] domains[{i}]: missing 'id'")

    for ctrl in controls:
        cid = ctrl.get("control_id", "???")
        if ctrl.get("domain") not in domain_ids:
            error(f"[SCHEMA] Control {cid}: domain '{ctrl.get('domain')}' not in domains list")


def check_no_leak(data: dict) -> None:
    """Scan product-facing fields for forbidden BSI nomenclature.

    For every control, each field named in ``PRODUCT_FIELDS`` is searched
    with each regex in ``FORBIDDEN_PATTERNS``. A field may hold a single
    value or a list of values; entries that are not strings are ignored.
    The first hit of a pattern in a text is recorded as a ``[NO-LEAK]``
    error.

    Args:
        data: Parsed content of the controls JSON file.
    """
    for control in data.get("controls", []):
        control_id = control.get("control_id", "???")
        for field in PRODUCT_FIELDS:
            raw = control.get(field, "")
            # Normalise to a list so scalar and list fields share one code path.
            candidates = raw if isinstance(raw, list) else [raw]
            for text in (item for item in candidates if isinstance(item, str)):
                for regex in FORBIDDEN_PATTERNS:
                    hit = regex.search(text)
                    if hit is None:
                        continue
                    error(
                        f"[NO-LEAK] Control {control_id}.{field}: "
                        f"forbidden pattern '{hit.group()}' found"
                    )


def check_open_anchors(data: dict) -> None:
    """Check that every control carries at least one well-formed open anchor.

    Records an ``[ANCHOR]`` error when ``open_anchors`` is not a list, when
    it is missing or empty, when an entry is not an object, or when an entry
    lacks a non-empty ``framework``, ``ref`` or ``url``.

    Args:
        data: Parsed content of the controls JSON file.
    """
    for ctrl in data.get("controls", []):
        cid = ctrl.get("control_id", "???")
        anchors = ctrl.get("open_anchors", [])

        # JSON null or a scalar/object here would otherwise fail in len() or
        # be iterated as something other than a sequence of anchors.
        if not isinstance(anchors, list):
            error(f"[ANCHOR] Control {cid}: open_anchors is not a list")
            continue

        if not anchors:
            error(f"[ANCHOR] Control {cid}: no open anchors — every control needs >= 1")

        # Check anchor structure
        for i, anchor in enumerate(anchors):
            if not isinstance(anchor, dict):
                error(f"[ANCHOR] Control {cid}: open_anchor[{i}] is not an object")
                continue
            for key in ("framework", "ref", "url"):
                # A key that is absent and a key with an empty value are
                # both reported as missing.
                if not anchor.get(key):
                    error(f"[ANCHOR] Control {cid}: open_anchor[{i}] missing '{key}'")


def check_independent_taxonomy(data: dict) -> None:
    """Flag control IDs that follow the BSI naming scheme.

    A ``[TAXONOMY]`` error is recorded for each control whose ``control_id``
    starts with ``O.`` (matched case-insensitively).

    Args:
        data: Parsed content of the controls JSON file.
    """
    bsi_id_prefixes = (
        re.compile(r"^O\.", re.IGNORECASE),  # BSI objective prefix
    )
    for control in data.get("controls", []):
        control_id = control.get("control_id", "???")
        # One error per prefix pattern that matches the ID.
        for _ in filter(lambda regex: regex.match(control_id), bsi_id_prefixes):
            error(f"[TAXONOMY] Control {control_id}: uses BSI-style ID prefix")


def check_evidence_fields(data: dict) -> None:
    """Validate that evidence items have the required fields.

    Records an ``[EVIDENCE]`` error when a control's ``evidence`` is not a
    list, when an item is not an object, or when an item lacks a non-empty
    ``type`` or ``description``. A control without an ``evidence`` key, or
    with an empty list, produces no error here.

    Args:
        data: Parsed content of the controls JSON file.
    """
    for ctrl in data.get("controls", []):
        cid = ctrl.get("control_id", "???")
        evidence = ctrl.get("evidence", [])

        # JSON null would fail in enumerate(); an object would be iterated
        # key by key and produce misleading per-item errors.
        if not isinstance(evidence, list):
            error(f"[EVIDENCE] Control {cid}: evidence is not a list")
            continue

        for i, ev in enumerate(evidence):
            if not isinstance(ev, dict):
                error(f"[EVIDENCE] Control {cid}: evidence[{i}] is not an object")
                continue
            for key in ("type", "description"):
                # Absent and empty values are both reported as missing.
                if not ev.get(key):
                    error(f"[EVIDENCE] Control {cid}: evidence[{i}] missing '{key}'")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> int:
    """Load ``CONTROLS_FILE``, run every validator and print a report.

    Returns:
        0 when no validator recorded an error. 1 when the controls file is
        missing, cannot be read or decoded, is not valid JSON, does not hold
        a JSON object at the top level, or when at least one error was
        recorded.
    """
    print(f"Validating: {CONTROLS_FILE}")
    print()

    if not CONTROLS_FILE.exists():
        print(f"ERROR: File not found: {CONTROLS_FILE}")
        return 1

    # Read as UTF-8 explicitly: without an encoding argument open() falls back
    # to the locale encoding, which breaks non-ASCII text on some platforms.
    try:
        with open(CONTROLS_FILE, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        print(f"ERROR: Cannot read {CONTROLS_FILE}: {exc}")
        return 1

    # Every validator calls data.get(...), so anything but an object is fatal.
    if not isinstance(data, dict):
        print(f"ERROR: Top-level JSON value must be an object, got {type(data).__name__}")
        return 1

    check_schema(data)
    check_no_leak(data)
    check_open_anchors(data)
    check_independent_taxonomy(data)
    check_evidence_fields(data)

    # The summary counts only well-formed structures so that printing the
    # report cannot itself fail on malformed input.
    controls = data.get("controls")
    if not isinstance(controls, list):
        controls = []
    total_controls = len(controls)
    total_anchors = sum(
        len(c["open_anchors"])
        for c in controls
        if isinstance(c, dict) and isinstance(c.get("open_anchors"), list)
    )

    print(f"Controls:     {total_controls}")
    print(f"Open Anchors: {total_anchors}")
    print()

    if warnings:
        print(f"WARNINGS ({len(warnings)}):")
        for w in warnings:
            print(f"  ⚠ {w}")
        print()

    if errors:
        print(f"ERRORS ({len(errors)}):")
        for e in errors:
            print(f"  ✗ {e}")
        print()
        print("VALIDATION FAILED")
        return 1

    print("ALL CHECKS PASSED")
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)