Files
2026-06-05 15:38:56 +00:00

170 lines
5.9 KiB
YAML

---
# Play: create four Slurm QOS entries (normal, debug-short, gpu-short,
# maintenance), set their priorities and limits, attach them to the account
# named by `slurm_account_name` and to the `slurmuser` association, then print
# the resulting state. Runs on the `slurm_controller` group with privilege
# escalation; fact gathering is disabled because no facts are referenced.
- name: Configure Slurm QOS, limits and fairshare
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Read-only probe; the task fails if the sacctmgr command exits non-zero.
    - name: Ensure sacctmgr is available
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      changed_when: false

    # Read-only check. The final awk exits non-zero unless a TRES row with
    # Type "gres" and Name "gpu" is listed, which fails the task. Because of
    # `pipefail`, the task also fails if the grep on the scontrol output
    # matches nothing.
    - name: Validate accounting GPU TRES exists
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### configured AccountingStorageTRES"
        scontrol show config | grep -E "AccountingStorageTRES|AccountingStorageType|AccountingStorageEnforce"
        echo
        echo "### known TRES"
        sacctmgr show tres
        echo
        echo "### checking gres/gpu"
        sacctmgr -n show tres format=Type,Name | awk '$1=="gres" && $2=="gpu" {found=1} END {exit !found}'
      args:
        executable: /bin/bash
      register: gpu_tres_check
      changed_when: false

    # The four "Ensure ... QOS exists" tasks share one pattern:
    #   - changed only when the combined stdout/stderr contains 'Adding QOS';
    #   - a non-zero exit is tolerated when the output contains one of the
    #     listed "nothing to add / already exists" phrases, so re-running the
    #     play against an existing QOS does not fail.
    - name: Ensure normal QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos normal Priority=100
      args:
        executable: /bin/bash
      register: add_qos_normal
      changed_when: "'Adding QOS' in (add_qos_normal.stdout + add_qos_normal.stderr)"
      failed_when: >
        add_qos_normal.rc != 0 and
        'Nothing new added' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'already exists' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'Already existing' not in (add_qos_normal.stdout + add_qos_normal.stderr)

    - name: Ensure debug-short QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos debug-short Priority=500
      args:
        executable: /bin/bash
      register: add_qos_debug
      changed_when: "'Adding QOS' in (add_qos_debug.stdout + add_qos_debug.stderr)"
      failed_when: >
        add_qos_debug.rc != 0 and
        'Nothing new added' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'already exists' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'Already existing' not in (add_qos_debug.stdout + add_qos_debug.stderr)

    - name: Ensure gpu-short QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos gpu-short Priority=1000
      args:
        executable: /bin/bash
      register: add_qos_gpu
      changed_when: "'Adding QOS' in (add_qos_gpu.stdout + add_qos_gpu.stderr)"
      failed_when: >
        add_qos_gpu.rc != 0 and
        'Nothing new added' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'already exists' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'Already existing' not in (add_qos_gpu.stdout + add_qos_gpu.stderr)

    - name: Ensure maintenance QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos maintenance Priority=5000
      args:
        executable: /bin/bash
      register: add_qos_maintenance
      changed_when: "'Adding QOS' in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)"
      failed_when: >
        add_qos_maintenance.rc != 0 and
        'Nothing new added' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'already exists' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'Already existing' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)

    # The "Normalize ..." and "Assign ..." tasks below re-apply their settings
    # on every run and are unconditionally reported as changed
    # (`changed_when: true`); no before/after comparison is made.
    - name: Normalize normal QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos normal set Priority=100
      args:
        executable: /bin/bash
      changed_when: true

    - name: Normalize debug-short QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos debug-short set Priority=500 MaxWall=00:10:00 MaxTRESPU=cpu=2 MaxJobsPU=4
      args:
        executable: /bin/bash
      changed_when: true

    - name: Normalize gpu-short QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos gpu-short set Priority=1000 MaxWall=01:00:00 MaxTRESPU=gres/gpu=1,cpu=12 MaxJobsPU=2
      args:
        executable: /bin/bash
      changed_when: true

    - name: Normalize maintenance QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos maintenance set Priority=5000 MaxWall=02:00:00
      args:
        executable: /bin/bash
      changed_when: true

    # `slurm_account_name` is not defined in this play; it must be supplied by
    # inventory, group/host vars, or extra vars.
    - name: Assign QOS set to lab account
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify account {{ slurm_account_name }} set QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100
      args:
        executable: /bin/bash
      changed_when: true

    - name: Assign default account to slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser set DefaultAccount={{ slurm_account_name }}
      args:
        executable: /bin/bash
      changed_when: true

    - name: Assign QOS set to slurmuser association
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser account={{ slurm_account_name }} set QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100
      args:
        executable: /bin/bash
      changed_when: true

    # Read-only report of the resulting configuration. The trailing
    # `|| true` makes the sshare call best-effort: its failure does not fail
    # the task.
    - name: Show configured QOS and associations
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### TRES"
        sacctmgr show tres
        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%40,MaxJobsPU
        echo
        echo "### Associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%60,DefaultQOS,Fairshare
        echo
        echo "### Fairshare"
        sshare -A {{ slurm_account_name }} || true
      args:
        executable: /bin/bash
      register: qos_state
      changed_when: false

    # Echo the captured report into the Ansible run output.
    - name: Print QOS state
      ansible.builtin.debug:
        var: qos_state.stdout_lines