Add Slurm AI/HPC cluster platform project
This commit is contained in:
@@ -0,0 +1,169 @@
|
||||
---
# Configures Slurm accounting on the controller: verifies that sacctmgr works
# and that the gres/gpu TRES is tracked, creates four QOS levels (normal,
# debug-short, gpu-short, maintenance), sets their priorities and limits,
# attaches them to an account and to the `slurmuser` user, then prints the
# resulting state.
#
# Required variable:
#   slurm_account_name - name of the Slurm account that receives the QOS set.
- name: Configure Slurm QOS, limits and fairshare
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Read-only probe: the play stops here if sacctmgr exits non-zero.
    - name: Ensure sacctmgr is available
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      changed_when: false

    # Read-only check. The final awk exits non-zero (failing the task) unless
    # a TRES row with Type "gres" and Name "gpu" is listed. With pipefail the
    # grep stage also fails the task when none of the three settings is shown.
    - name: Validate accounting GPU TRES exists
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### configured AccountingStorageTRES"
        scontrol show config | grep -E "AccountingStorageTRES|AccountingStorageType|AccountingStorageEnforce"

        echo
        echo "### known TRES"
        sacctmgr show tres

        echo
        echo "### checking gres/gpu"
        sacctmgr -n show tres format=Type,Name | awk '$1=="gres" && $2=="gpu" {found=1} END {exit !found}'
      args:
        executable: /bin/bash
      register: gpu_tres_check
      changed_when: false

    # The four "Ensure ... QOS exists" tasks share one pattern:
    #   * changed only when sacctmgr prints "Adding QOS";
    #   * a non-zero exit is tolerated when the output says the QOS was
    #     already present; any other non-zero exit fails the task.
    # A single command with no pipes or redirects does not need a shell.
    - name: Ensure normal QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos normal Priority=100
      register: add_qos_normal
      changed_when: "'Adding QOS' in (add_qos_normal.stdout + add_qos_normal.stderr)"
      failed_when: >
        add_qos_normal.rc != 0 and
        'Nothing new added' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'already exists' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'Already existing' not in (add_qos_normal.stdout + add_qos_normal.stderr)

    - name: Ensure debug-short QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos debug-short Priority=500
      register: add_qos_debug
      changed_when: "'Adding QOS' in (add_qos_debug.stdout + add_qos_debug.stderr)"
      failed_when: >
        add_qos_debug.rc != 0 and
        'Nothing new added' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'already exists' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'Already existing' not in (add_qos_debug.stdout + add_qos_debug.stderr)

    - name: Ensure gpu-short QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos gpu-short Priority=1000
      register: add_qos_gpu
      changed_when: "'Adding QOS' in (add_qos_gpu.stdout + add_qos_gpu.stderr)"
      failed_when: >
        add_qos_gpu.rc != 0 and
        'Nothing new added' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'already exists' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'Already existing' not in (add_qos_gpu.stdout + add_qos_gpu.stderr)

    - name: Ensure maintenance QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos maintenance Priority=5000
      register: add_qos_maintenance
      changed_when: "'Adding QOS' in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)"
      failed_when: >
        add_qos_maintenance.rc != 0 and
        'Nothing new added' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'already exists' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'Already existing' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)

    # The "Normalize ..." tasks re-apply the desired settings on every run.
    # They always report "changed" because the command output is not inspected
    # to detect a no-op.
    - name: Normalize normal QOS settings
      ansible.builtin.command:
        cmd: sacctmgr -i modify qos normal set Priority=100
      changed_when: true

    - name: Normalize debug-short QOS settings
      ansible.builtin.command:
        cmd: sacctmgr -i modify qos debug-short set Priority=500 MaxWall=00:10:00 MaxTRESPU=cpu=2 MaxJobsPU=4
      changed_when: true

    - name: Normalize gpu-short QOS settings
      ansible.builtin.command:
        cmd: sacctmgr -i modify qos gpu-short set Priority=1000 MaxWall=01:00:00 MaxTRESPU=gres/gpu=1,cpu=12 MaxJobsPU=2
      changed_when: true

    - name: Normalize maintenance QOS settings
      ansible.builtin.command:
        cmd: sacctmgr -i modify qos maintenance set Priority=5000 MaxWall=02:00:00
      changed_when: true

    # The tasks below interpolate slurm_account_name. argv passes it as
    # exactly one argument, so whitespace or shell metacharacters in the value
    # cannot split or alter the command line.
    - name: Assign QOS set to lab account
      ansible.builtin.command:
        argv:
          - sacctmgr
          - '-i'
          - modify
          - account
          - "{{ slurm_account_name }}"
          - set
          - QOS=normal,debug-short,gpu-short,maintenance
          - DefaultQOS=normal
          - Fairshare=100
      changed_when: true

    - name: Assign default account to slurmuser
      ansible.builtin.command:
        argv:
          - sacctmgr
          - '-i'
          - modify
          - user
          - where
          - name=slurmuser
          - set
          - "DefaultAccount={{ slurm_account_name }}"
      changed_when: true

    - name: Assign QOS set to slurmuser association
      ansible.builtin.command:
        argv:
          - sacctmgr
          - '-i'
          - modify
          - user
          - where
          - name=slurmuser
          - "account={{ slurm_account_name }}"
          - set
          - QOS=normal,debug-short,gpu-short,maintenance
          - DefaultQOS=normal
          - Fairshare=100
      changed_when: true

    # Read-only report of the resulting state. `sshare` is best-effort
    # (`|| true`), so a failure there does not fail the task. The account name
    # is shell-quoted because this task still runs through bash.
    - name: Show configured QOS and associations
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### TRES"
        sacctmgr show tres

        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%40,MaxJobsPU

        echo
        echo "### Associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%60,DefaultQOS,Fairshare

        echo
        echo "### Fairshare"
        sshare -A {{ slurm_account_name | quote }} || true
      args:
        executable: /bin/bash
      register: qos_state
      changed_when: false

    - name: Print QOS state
      ansible.builtin.debug:
        var: qos_state.stdout_lines
|
||||
Reference in New Issue
Block a user