This commit is contained in:
@@ -0,0 +1,128 @@
|
||||
---
# Example lab inventory variables.
# Replace addresses, users and node topology for your environment.

# Cluster identity and the controller endpoint.
slurm_cluster_name: labcluster

slurm_control_machine: slurm-ctl01
slurm_control_addr: 10.10.10.11

# Configuration directory and the service / operator accounts.
slurm_config_dir: /etc/slurm
slurm_user: slurm
slurm_operator_user: slurmuser

# Ports for the slurmctld and slurmd daemons.
slurmctld_port: 6817
slurmd_port: 6818

slurm_job_comp_type: jobcomp/none

# Resource selection plugin and its parameters.
slurm_select_type: select/cons_tres
slurm_select_type_parameters: CR_Core_Memory

# NOTE(review): presumably rendered as ReturnToService / MpiDefault in
# slurm.conf - confirm against the role templates.
slurm_return_to_service: 2
# Quoted so the literal string "none" is never mistaken for a null value.
slurm_default_mpi_type: "none"

slurm_gres_types: gpu
|
||||
|
||||
# Compute node definitions, one mapping per node.
# Empty strings ("") are used where a node has no features, GRES or explicit
# topology; gpu01 is the only node that declares a GPU and a gres_file.
# NOTE(review): list nesting was reconstructed from a flattened view - confirm
# the sequence indentation matches the rest of the repository.
slurm_nodes:
  - name: slurm-c01
    managed_state: present
    addr: 10.10.10.12
    cpus: 2
    real_memory: 1800
    features: ""
    gres: ""
    topology: ""
  - name: slurm-c02
    managed_state: present
    addr: 10.10.10.13
    cpus: 2
    real_memory: 1800
    features: ""
    gres: ""
    topology: ""
  - name: gpu01
    managed_state: present
    addr: 10.10.10.14
    # 1 board x 1 socket x 6 cores x 2 threads = 12, matching the topology.
    cpus: 12
    real_memory: 60000
    features: "gpu"
    gres: "gpu:1"
    gres_file: /dev/nvidia0
    topology: "Boards=1 SocketsPerBoard=1 CoresPerSocket=6 ThreadsPerCore=2"
|
||||
|
||||
# Partition definitions, one mapping per partition.
# "YES", "NO", "INFINITE" and "UP" stay quoted: unquoted YES/NO would be
# loaded as booleans by YAML 1.1 parsers instead of the literal strings.
# `debug` is the only partition marked as default; `all` spans every node
# listed in the `debug` and `gpu` partitions.
slurm_partitions:
  - name: debug
    managed_state: present
    nodes: "slurm-c[01-02]"
    default: "YES"
    max_time: "INFINITE"
    state: "UP"
  - name: gpu
    managed_state: present
    nodes: "gpu01"
    default: "NO"
    max_time: "INFINITE"
    state: "UP"
  - name: all
    managed_state: present
    nodes: "slurm-c[01-02],gpu01"
    default: "NO"
    max_time: "INFINITE"
    state: "UP"
|
||||
|
||||
# Cgroup enforcement
# The task, process-tracking and job-accounting plugins are all set to their
# cgroup variants here; task/affinity is additionally listed as a task plugin.
slurm_enable_cgroup: true
slurm_task_plugin: task/cgroup,task/affinity
slurm_proctrack_type: proctrack/cgroup
slurm_job_acct_gather_type: jobacct_gather/cgroup
|
||||
|
||||
# Slurm accounting / SlurmDBD
# Accounting goes through slurmdbd on the controller host (slurm-ctl01:6819).
slurm_accounting_storage_type: accounting_storage/slurmdbd
slurm_accounting_storage_host: slurm-ctl01
slurm_accounting_storage_port: 6819
# Comma-separated lists, kept as single strings.
slurm_accounting_storage_enforce: associations,limits,qos
slurm_accounting_storage_tres: cpu,mem,energy,node,billing,fs/disk,pages,vmem,gres/gpu
|
||||
|
||||
# slurmdbd daemon endpoint.
slurmdbd_host: slurm-ctl01
slurmdbd_port: 6819

# Backing database for slurmdbd (MySQL on localhost:3306).
slurmdbd_storage_type: accounting_storage/mysql
slurmdbd_storage_host: localhost
slurmdbd_storage_port: 3306
slurmdbd_storage_loc: slurm_acct_db
slurmdbd_storage_user: slurm
# Use Ansible Vault in real environments.
# See inventories/lab/group_vars/vault.example.yml
# NOTE(review): the default() fallback means a missing or unloaded
# vault_slurmdbd_storage_pass silently yields the placeholder password
# instead of failing the play - confirm this is acceptable outside the lab.
slurmdbd_storage_pass: "{{ vault_slurmdbd_storage_pass | default('CHANGE_ME_USE_ANSIBLE_VAULT') }}"
|
||||
|
||||
# Accounting account used by the lab: name, description and organization.
slurm_account_name: lab
slurm_account_description: "AI/HPC Slurm lab account"
slurm_account_organization: "labcluster"
|
||||
|
||||
# SlurmDBD purge / retention policy for lab
# Step and suspend records are kept for 3 months, usage for 24 months and
# everything else for 12 months.
slurmdbd_commit_delay: 1
slurmdbd_purge_event_after: 12months
slurmdbd_purge_job_after: 12months
slurmdbd_purge_resv_after: 12months
slurmdbd_purge_step_after: 3months
slurmdbd_purge_suspend_after: 3months
slurmdbd_purge_txn_after: 12months
slurmdbd_purge_usage_after: 24months
|
||||
|
||||
# Archive is disabled for the lab; backup playbooks handle database dumps.
# Written as canonical `false`: YAML 1.1 loaders (PyYAML, which Ansible uses)
# already read a bare `no` as boolean false, so the loaded value is unchanged,
# while `false` is unambiguous for every parser and linter.
# NOTE(review): if the templates need the literal text "no", render it from
# the boolean in the template rather than relying on the YAML spelling.
slurmdbd_archive_events: false
slurmdbd_archive_jobs: false
slurmdbd_archive_steps: false
slurmdbd_archive_suspend: false
slurmdbd_archive_txn: false
slurmdbd_archive_usage: false
|
||||
|
||||
# Slurm priority / fairshare
slurm_priority_type: priority/multifactor
# Quoted so the value is kept as the exact string "7-0" and cannot be
# misread as arithmetic or a date by people or tooling.
slurm_priority_decay_half_life: "7-0"
slurm_priority_calc_period: 5
# Quoted so YAML keeps the string "NO" rather than a boolean.
slurm_priority_favor_small: "NO"
# Factor weights: fairshare and QOS are weighted 10x the other factors.
slurm_priority_weight_age: 1000
slurm_priority_weight_fairshare: 10000
slurm_priority_weight_job_size: 1000
slurm_priority_weight_partition: 1000
slurm_priority_weight_qos: 10000
slurm_priority_max_age: "1-0"
|
||||
@@ -0,0 +1,5 @@
|
||||
---
# Copy this file to vault.yml and encrypt it with ansible-vault:
#   ansible-vault encrypt inventories/lab/group_vars/vault.yml
#
# NOTE(review): Ansible auto-loads group_vars/<name>.yml only for a group
# called <name>. If no inventory group is named `vault`, confirm the playbooks
# load vault.yml explicitly (e.g. vars_files) or place it under
# group_vars/all/ so the variable below is actually picked up.

# Placeholder only - replace before encrypting. Keep the value quoted so a
# real password is never re-typed by the YAML parser.
vault_slurmdbd_storage_pass: "CHANGE_ME"
|
||||
@@ -0,0 +1,24 @@
|
||||
---
# Lab inventory: one controller, two CPU compute nodes and one GPU node.
# NOTE(review): the nesting below was reconstructed from a flattened view;
# slurm_controller, slurm_compute and slurm_gpu are assumed to be children of
# slurm_cluster - confirm against the intended group layout.
all:
  vars:
    # accept-new records unknown host keys on first contact but refuses to
    # connect if a known host's key later changes; the previous value `no`
    # disabled host-key verification entirely for every host.
    # Requires OpenSSH 7.6+ on the control node. After rebuilding a lab VM,
    # remove its old entry with `ssh-keygen -R <host>`.
    ansible_ssh_common_args: '-o StrictHostKeyChecking=accept-new'
  children:
    slurm_cluster:
      children:
        slurm_controller:
          hosts:
            slurm-ctl01:
              ansible_host: 10.10.10.11
              ansible_user: ansible
        slurm_compute:
          hosts:
            slurm-c01:
              ansible_host: 10.10.10.12
              ansible_user: ansible
            slurm-c02:
              ansible_host: 10.10.10.13
              ansible_user: ansible
        slurm_gpu:
          hosts:
            gpu01:
              ansible_host: 10.10.10.14
              ansible_user: ansible
|
||||
Reference in New Issue
Block a user