#!/usr/bin/env python3
"""
Canonical Control Library — CI/CD Validator

Checks:
1. Schema Validation — JSON against defined structure
2. License Gate — Every mapping reference fulfils allowed_usages
3. No-Leak Scanner — Regex against forbidden locator patterns (e.g. O.Auth_*, O.Netz_*)
4. Provenance Integrity — Every referenced source_id exists in seed data
5. Open Anchor Check — Every control has >= 1 open anchor

NOTE(review): main() runs the schema, no-leak, open-anchor, taxonomy and
evidence validators defined below. No validator in this file implements
checks 2 and 4, and MIGRATION_FILE / KNOWN_*_SOURCES are not read by any of
them — confirm whether those checks live elsewhere.

Usage:
    python scripts/validate-controls.py
"""

import json
import re
import sys
from pathlib import Path

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------

REPO_ROOT = Path(__file__).resolve().parent.parent
CONTROLS_FILE = REPO_ROOT / "ai-compliance-sdk" / "policies" / "canonical_controls_v1.json"
MIGRATION_FILE = REPO_ROOT / "backend-compliance" / "migrations" / "044_canonical_control_library.sql"

# ---------------------------------------------------------------------------
# Forbidden patterns (BSI proprietary nomenclature — must NOT appear in controls)
# ---------------------------------------------------------------------------

FORBIDDEN_PATTERNS = [
    re.compile(r"O\.[A-Za-z]+_[0-9]+"),  # BSI objective IDs: O.Auth_1, O.Netz_3
    re.compile(r"TR-03161"),  # Direct TR reference in control text
    re.compile(r"BSI-TR-"),  # Direct BSI-TR reference
    re.compile(r"Anforderung\s+[A-Z]\.\d+"),  # BSI requirement format
]

# Fields that are product-facing and must not contain forbidden patterns
PRODUCT_FIELDS = ["objective", "rationale", "title", "requirements", "test_procedure"]

# ---------------------------------------------------------------------------
# Known open sources (from migration seed)
# ---------------------------------------------------------------------------

KNOWN_OPEN_SOURCES = {
    "OWASP_ASVS",
    "OWASP_MASVS",
    "OWASP_TOP10",
    "NIST_SP800_53",
    "NIST_SP800_63B",
    "ENISA_GOOD_PRACTICES",
    "CIS_CONTROLS",
}

KNOWN_ALL_SOURCES = KNOWN_OPEN_SOURCES | {
    "BSI_TR03161_1",
    "BSI_TR03161_2",
    "BSI_TR03161_3",
}

# ---------------------------------------------------------------------------
# Validators
# ---------------------------------------------------------------------------

# Findings collected by the validators; main() prints them and derives the
# exit code from ``errors``.
errors: list[str] = []
warnings: list[str] = []


def error(msg: str) -> None:
    """Record a finding that makes the validation fail."""
    errors.append(msg)


def warn(msg: str) -> None:
    """Record a finding that is printed but does not fail the validation."""
    warnings.append(msg)


def _as_list(value: object) -> list:
    """Return *value* when it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _control_objects(data: dict) -> list[dict]:
    """Return the entries of ``data["controls"]`` that are JSON objects.

    Entries of any other type are skipped here; check_schema() is the
    validator that reports them.
    """
    return [c for c in _as_list(data.get("controls")) if isinstance(c, dict)]


def check_schema(data: dict) -> None:
    """Validate JSON structure.

    Reports missing top-level keys, missing control fields, malformed control
    IDs, invalid severities, out-of-range risk scores and controls whose
    domain is not declared in ``data["domains"]``. Malformed input (wrong
    container types, non-object entries) is reported as an error instead of
    raising.
    """
    required_top = ["version", "schema", "framework", "total_controls", "domains", "controls"]
    for key in required_top:
        if key not in data:
            error(f"[SCHEMA] Missing top-level key: {key}")

    required_control = [
        "control_id", "title", "domain", "severity", "objective",
        "rationale", "scope", "requirements", "test_procedure",
        "evidence", "open_anchors",
    ]
    # Used with fullmatch(): a '$' anchor with match() would also accept an
    # ID followed by a trailing newline.
    control_id_pattern = re.compile(r"[A-Z]{2,6}-[0-9]{3}")
    valid_severities = {"low", "medium", "high", "critical"}

    raw_controls = data.get("controls", [])
    if not isinstance(raw_controls, list):
        error("[SCHEMA] 'controls' must be a list")
        raw_controls = []

    for idx, ctrl in enumerate(raw_controls):
        if not isinstance(ctrl, dict):
            error(f"[SCHEMA] controls[{idx}] is not an object")
            continue

        cid = ctrl.get("control_id", "???")
        for key in required_control:
            if key not in ctrl:
                error(f"[SCHEMA] Control {cid}: missing field '{key}'")

        if not isinstance(cid, str) or not control_id_pattern.fullmatch(cid):
            error(f"[SCHEMA] Control {cid}: ID does not match ^[A-Z]{{2,6}}-[0-9]{{3}}$")

        sev = ctrl.get("severity", "")
        # The isinstance guard keeps unhashable JSON values (list/object)
        # from raising inside the set membership test.
        if not isinstance(sev, str) or sev not in valid_severities:
            error(f"[SCHEMA] Control {cid}: invalid severity '{sev}'")

        rs = ctrl.get("risk_score")
        if rs is not None:
            # bool is a subclass of int; a JSON true/false is not a score.
            if isinstance(rs, bool) or not isinstance(rs, (int, float)):
                error(f"[SCHEMA] Control {cid}: risk_score {rs!r} is not a number")
            elif not (0 <= rs <= 10):
                error(f"[SCHEMA] Control {cid}: risk_score {rs} out of range [0, 10]")

    raw_domains = data.get("domains", [])
    if not isinstance(raw_domains, list):
        error("[SCHEMA] 'domains' must be a list")
        raw_domains = []

    domain_ids = set()
    for idx, dom in enumerate(raw_domains):
        if not isinstance(dom, dict) or "id" not in dom:
            error(f"[SCHEMA] domains[{idx}]: missing 'id'")
            continue
        dom_id = dom["id"]
        if isinstance(dom_id, (list, dict)):
            # Unhashable, so it can never be matched against a control.
            error(f"[SCHEMA] domains[{idx}]: 'id' must be a scalar value")
            continue
        domain_ids.add(dom_id)

    for ctrl in raw_controls:
        if not isinstance(ctrl, dict):
            continue  # already reported above
        cid = ctrl.get("control_id", "???")
        domain = ctrl.get("domain")
        if isinstance(domain, (list, dict)) or domain not in domain_ids:
            error(f"[SCHEMA] Control {cid}: domain '{domain}' not in domains list")


def check_no_leak(data: dict) -> None:
    """Ensure no BSI-proprietary nomenclature leaks into product-facing fields.

    Every field named in PRODUCT_FIELDS is scanned against
    FORBIDDEN_PATTERNS. A field may hold a single string or a list; only
    string items are scanned, anything else is skipped.
    """
    for ctrl in _control_objects(data):
        cid = ctrl.get("control_id", "???")
        for field_name in PRODUCT_FIELDS:
            values = ctrl.get(field_name, "")
            texts = values if isinstance(values, list) else [values]
            for text_val in texts:
                if not isinstance(text_val, str):
                    continue
                for pattern in FORBIDDEN_PATTERNS:
                    match = pattern.search(text_val)
                    if match:
                        error(
                            f"[NO-LEAK] Control {cid}.{field_name}: "
                            f"forbidden pattern '{match.group()}' found"
                        )


def check_open_anchors(data: dict) -> None:
    """Every control must have at least 1 open anchor.

    Each anchor must be an object with non-empty 'framework', 'ref' and
    'url' values.
    """
    for ctrl in _control_objects(data):
        cid = ctrl.get("control_id", "???")
        anchors = ctrl.get("open_anchors", [])
        if not isinstance(anchors, list):
            error(f"[ANCHOR] Control {cid}: 'open_anchors' must be a list")
            continue
        if not anchors:
            error(f"[ANCHOR] Control {cid}: no open anchors — every control needs >= 1")
        # Check anchor structure
        for i, anchor in enumerate(anchors):
            if not isinstance(anchor, dict):
                error(f"[ANCHOR] Control {cid}: open_anchor[{i}] is not an object")
                continue
            for key in ("framework", "ref", "url"):
                if not anchor.get(key):
                    error(f"[ANCHOR] Control {cid}: open_anchor[{i}] missing '{key}'")


def check_independent_taxonomy(data: dict) -> None:
    """Verify controls use independent taxonomy, not BSI structure.

    A control ID starting with ``O.`` (case-insensitive) is reported.
    Non-string IDs are skipped here.
    """
    bsi_domain_patterns = [
        re.compile(r"^O\.", re.IGNORECASE),  # BSI objective prefix
    ]
    for ctrl in _control_objects(data):
        cid = ctrl.get("control_id", "???")
        if not isinstance(cid, str):
            continue
        for pattern in bsi_domain_patterns:
            if pattern.match(cid):
                error(f"[TAXONOMY] Control {cid}: uses BSI-style ID prefix")


def check_evidence_fields(data: dict) -> None:
    """Validate evidence items have required fields.

    Each evidence item must be an object with non-empty 'type' and
    'description' values.
    """
    for ctrl in _control_objects(data):
        cid = ctrl.get("control_id", "???")
        evidence = ctrl.get("evidence", [])
        if not isinstance(evidence, list):
            error(f"[EVIDENCE] Control {cid}: 'evidence' must be a list")
            continue
        for i, ev in enumerate(evidence):
            if not isinstance(ev, dict):
                error(f"[EVIDENCE] Control {cid}: evidence[{i}] is not an object")
                continue
            for key in ("type", "description"):
                if not ev.get(key):
                    error(f"[EVIDENCE] Control {cid}: evidence[{i}] missing '{key}'")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> int:
    """Run all validators against CONTROLS_FILE and print a report.

    Returns:
        0 when no errors were recorded, 1 when the file is missing,
        unreadable, not a JSON object, or any validator recorded an error.
    """
    # Start from a clean slate so a second call in the same process does not
    # report findings from the previous run.
    errors.clear()
    warnings.clear()

    print(f"Validating: {CONTROLS_FILE}")
    print()

    if not CONTROLS_FILE.exists():
        print(f"ERROR: File not found: {CONTROLS_FILE}")
        return 1

    try:
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(CONTROLS_FILE, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"ERROR: Could not read {CONTROLS_FILE}: {exc}")
        return 1

    if not isinstance(data, dict):
        print("ERROR: Top-level JSON value must be an object")
        return 1

    check_schema(data)
    check_no_leak(data)
    check_open_anchors(data)
    check_independent_taxonomy(data)
    check_evidence_fields(data)

    controls = _as_list(data.get("controls"))
    total_controls = len(controls)
    total_anchors = sum(
        len(_as_list(c.get("open_anchors"))) for c in controls if isinstance(c, dict)
    )

    print(f"Controls: {total_controls}")
    print(f"Open Anchors: {total_anchors}")
    print()

    if warnings:
        print(f"WARNINGS ({len(warnings)}):")
        for w in warnings:
            print(f" ⚠ {w}")
        print()

    if errors:
        print(f"ERRORS ({len(errors)}):")
        for e in errors:
            print(f" ✗ {e}")
        print()
        print("VALIDATION FAILED")
        return 1

    print("ALL CHECKS PASSED")
    return 0


if __name__ == "__main__":
    sys.exit(main())