Add Slurm AI/HPC cluster platform project
lint / shell-yaml-ansible (push) Failing after 47s

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit d300d490f5
49 changed files with 4777 additions and 0 deletions
@@ -0,0 +1,128 @@
---
# Example inventory variables for the lab cluster.
# Adjust addresses, users and node topology to match your environment.

# Cluster identity and controller location.
slurm_cluster_name: "labcluster"
slurm_control_machine: "slurm-ctl01"
slurm_control_addr: "10.10.10.11"

# Configuration directory and account names.
slurm_config_dir: "/etc/slurm"
slurm_user: "slurm"
slurm_operator_user: "slurmuser"

# Daemon ports (kept as plain integers).
slurmctld_port: 6817
slurmd_port: 6818

# Plugin and scheduler selection.
slurm_job_comp_type: "jobcomp/none"
slurm_select_type: "select/cons_tres"
slurm_select_type_parameters: "CR_Core_Memory"
slurm_return_to_service: 2
# Quoted so the value is unambiguously the string "none".
slurm_default_mpi_type: "none"
slurm_gres_types: "gpu"
# Compute node inventory: a list with one mapping per node.
# Every key of a node must be indented under its "- name:" entry; keys at
# column 0 would terminate the list and break parsing.
# features, gres and topology are empty strings on the CPU-only nodes.
slurm_nodes:
  - name: slurm-c01
    managed_state: present
    addr: 10.10.10.12
    cpus: 2
    real_memory: 1800
    features: ""
    gres: ""
    topology: ""
  - name: slurm-c02
    managed_state: present
    addr: 10.10.10.13
    cpus: 2
    real_memory: 1800
    features: ""
    gres: ""
    topology: ""
  # GPU node: the only entry that sets gres_file.
  - name: gpu01
    managed_state: present
    addr: 10.10.10.14
    # 12 = 1 board x 1 socket x 6 cores x 2 threads, matching topology below.
    cpus: 12
    real_memory: 60000
    features: "gpu"
    gres: "gpu:1"
    gres_file: /dev/nvidia0
    topology: "Boards=1 SocketsPerBoard=1 CoresPerSocket=6 ThreadsPerCore=2"
# Partition definitions: a list with one mapping per partition.
# Every key of a partition must be indented under its "- name:" entry; keys at
# column 0 would terminate the list and break parsing.
# "YES" / "NO" stay quoted so they remain strings instead of being read as
# YAML 1.1 booleans.
slurm_partitions:
  # CPU-only nodes; the only partition with default "YES".
  - name: debug
    managed_state: present
    nodes: "slurm-c[01-02]"
    default: "YES"
    max_time: "INFINITE"
    state: "UP"
  # GPU node only.
  - name: gpu
    managed_state: present
    nodes: "gpu01"
    default: "NO"
    max_time: "INFINITE"
    state: "UP"
  # Union of the CPU nodes and the GPU node.
  - name: all
    managed_state: present
    nodes: "slurm-c[01-02],gpu01"
    default: "NO"
    max_time: "INFINITE"
    state: "UP"
# Cgroup enforcement: enable flag plus the cgroup-backed plugin names.
slurm_enable_cgroup: true
slurm_task_plugin: "task/cgroup,task/affinity"
slurm_proctrack_type: "proctrack/cgroup"
slurm_job_acct_gather_type: "jobacct_gather/cgroup"
# Slurm accounting / SlurmDBD

# Accounting storage settings.
# NOTE(review): host and port here carry the same values as slurmdbd_host /
# slurmdbd_port below; presumably they must stay in sync -- confirm.
slurm_accounting_storage_type: "accounting_storage/slurmdbd"
slurm_accounting_storage_host: "slurm-ctl01"
slurm_accounting_storage_port: 6819
slurm_accounting_storage_enforce: "associations,limits,qos"
slurm_accounting_storage_tres: "cpu,mem,energy,node,billing,fs/disk,pages,vmem,gres/gpu"

# SlurmDBD daemon endpoint.
slurmdbd_host: "slurm-ctl01"
slurmdbd_port: 6819

# Database backend used by SlurmDBD.
slurmdbd_storage_type: "accounting_storage/mysql"
slurmdbd_storage_host: "localhost"
slurmdbd_storage_port: 3306
slurmdbd_storage_loc: "slurm_acct_db"
slurmdbd_storage_user: "slurm"
# Use Ansible Vault in real environments.
# See inventories/lab/group_vars/vault.example.yml
# When vault_slurmdbd_storage_pass is undefined, the default() filter makes the
# placeholder string below the effective value.
slurmdbd_storage_pass: "{{ vault_slurmdbd_storage_pass | default('CHANGE_ME_USE_ANSIBLE_VAULT') }}"

# Account created for the lab.
slurm_account_name: "lab"
slurm_account_description: "AI/HPC Slurm lab account"
slurm_account_organization: "labcluster"
# SlurmDBD purge / retention policy for lab
slurmdbd_commit_delay: 1
# Retention periods, written as <count>months.
slurmdbd_purge_event_after: 12months
slurmdbd_purge_job_after: 12months
slurmdbd_purge_resv_after: 12months
slurmdbd_purge_step_after: 3months
slurmdbd_purge_suspend_after: 3months
slurmdbd_purge_txn_after: 12months
slurmdbd_purge_usage_after: 24months
# Archive is disabled for the lab; backup playbooks handle database dumps.
# Written as canonical `false` rather than plain `no`: a YAML 1.1 loader reads
# both as the same boolean, but `no` trips yamllint's truthy rule and becomes
# the string "no" under a YAML 1.2 loader.
slurmdbd_archive_events: false
slurmdbd_archive_jobs: false
slurmdbd_archive_steps: false
slurmdbd_archive_suspend: false
slurmdbd_archive_txn: false
slurmdbd_archive_usage: false
# Slurm priority / fairshare

# Priority plugin and timing parameters.
slurm_priority_type: "priority/multifactor"
# NOTE(review): "7-0" and "1-0" look like Slurm days-hours values (7 days,
# 1 day) -- confirm against the consuming template. Quoted to keep them strings.
slurm_priority_decay_half_life: "7-0"
slurm_priority_max_age: "1-0"
slurm_priority_calc_period: 5
# Quoted so the value stays the string "NO" rather than a YAML 1.1 boolean.
slurm_priority_favor_small: "NO"

# Per-factor weights: fairshare and QOS carry 10x the weight of the others.
slurm_priority_weight_age: 1000
slurm_priority_weight_fairshare: 10000
slurm_priority_weight_job_size: 1000
slurm_priority_weight_partition: 1000
slurm_priority_weight_qos: 10000
@@ -0,0 +1,5 @@
---
# Template for the vaulted secrets file: copy it to vault.yml, then encrypt
# the copy with ansible-vault:
#   ansible-vault encrypt inventories/lab/group_vars/vault.yml
vault_slurmdbd_storage_pass: "CHANGE_ME"