170 lines
5.9 KiB
YAML
170 lines
5.9 KiB
YAML
|
|
---
# Configures Slurm accounting objects on the controller host(s):
#   1. checks that sacctmgr responds and that the gres/gpu TRES is known,
#   2. creates four QOS levels (normal, debug-short, gpu-short, maintenance),
#   3. sets priority / wall-time / per-user limits on each QOS,
#   4. attaches the QOS set, default QOS and fairshare to the account named by
#      `slurm_account_name` and to the `slurmuser` association,
#   5. prints the resulting TRES / QOS / association / fairshare state.
#
# Required variable (not defined in this file):
#   slurm_account_name - Slurm account that receives the QOS set.
- name: Configure Slurm QOS, limits and fairshare
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Read-only probe: the task fails (non-zero rc) if sacctmgr cannot be run
    # or cannot list clusters, stopping the play before anything is modified.
    - name: Ensure sacctmgr is available
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      changed_when: false

    # Prints the accounting-related config and the known TRES for the log,
    # then exits non-zero unless a TRES row with Type=gres and Name=gpu exists.
    # With `pipefail`, a grep that matches nothing also fails the task.
    - name: Validate accounting GPU TRES exists
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### configured AccountingStorageTRES"
        scontrol show config | grep -E "AccountingStorageTRES|AccountingStorageType|AccountingStorageEnforce"

        echo
        echo "### known TRES"
        sacctmgr show tres

        echo
        echo "### checking gres/gpu"
        sacctmgr -n show tres format=Type,Name | awk '$1=="gres" && $2=="gpu" {found=1} END {exit !found}'
      args:
        executable: /bin/bash
      register: gpu_tres_check
      changed_when: false

    # --- QOS creation -------------------------------------------------------
    # Each "Ensure ... QOS exists" task reports `changed` only when the
    # combined stdout+stderr contains 'Adding QOS', and tolerates a non-zero
    # rc when the output says the QOS is already present.

    - name: Ensure normal QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos normal Priority=100
      args:
        executable: /bin/bash
      register: add_qos_normal
      changed_when: "'Adding QOS' in (add_qos_normal.stdout + add_qos_normal.stderr)"
      failed_when: >
        add_qos_normal.rc != 0 and
        'Nothing new added' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'already exists' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'Already existing' not in (add_qos_normal.stdout + add_qos_normal.stderr)

    - name: Ensure debug-short QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos debug-short Priority=500
      args:
        executable: /bin/bash
      register: add_qos_debug
      changed_when: "'Adding QOS' in (add_qos_debug.stdout + add_qos_debug.stderr)"
      failed_when: >
        add_qos_debug.rc != 0 and
        'Nothing new added' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'already exists' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'Already existing' not in (add_qos_debug.stdout + add_qos_debug.stderr)

    - name: Ensure gpu-short QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos gpu-short Priority=1000
      args:
        executable: /bin/bash
      register: add_qos_gpu
      changed_when: "'Adding QOS' in (add_qos_gpu.stdout + add_qos_gpu.stderr)"
      failed_when: >
        add_qos_gpu.rc != 0 and
        'Nothing new added' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'already exists' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'Already existing' not in (add_qos_gpu.stdout + add_qos_gpu.stderr)

    - name: Ensure maintenance QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos maintenance Priority=5000
      args:
        executable: /bin/bash
      register: add_qos_maintenance
      changed_when: "'Adding QOS' in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)"
      failed_when: >
        add_qos_maintenance.rc != 0 and
        'Nothing new added' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'already exists' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'Already existing' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)

    # --- QOS limits ---------------------------------------------------------
    # The modify tasks re-apply the desired settings on every run. Their
    # output is not inspected, so they always report `changed`.

    - name: Normalize normal QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos normal set Priority=100
      args:
        executable: /bin/bash
      changed_when: true

    - name: Normalize debug-short QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos debug-short set Priority=500 MaxWall=00:10:00 MaxTRESPU=cpu=2 MaxJobsPU=4
      args:
        executable: /bin/bash
      changed_when: true

    - name: Normalize gpu-short QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos gpu-short set Priority=1000 MaxWall=01:00:00 MaxTRESPU=gres/gpu=1,cpu=12 MaxJobsPU=2
      args:
        executable: /bin/bash
      changed_when: true

    - name: Normalize maintenance QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos maintenance set Priority=5000 MaxWall=02:00:00
      args:
        executable: /bin/bash
      changed_when: true

    # --- Account / user associations ----------------------------------------
    # `slurm_account_name` is passed through the `quote` filter so that an
    # unusual value cannot be interpreted by the shell; ordinary account names
    # render exactly as before.

    - name: Assign QOS set to lab account
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify account {{ slurm_account_name | quote }} set QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100
      args:
        executable: /bin/bash
      changed_when: true

    - name: Assign default account to slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser set DefaultAccount={{ slurm_account_name | quote }}
      args:
        executable: /bin/bash
      changed_when: true

    - name: Assign QOS set to slurmuser association
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser account={{ slurm_account_name | quote }} set QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100
      args:
        executable: /bin/bash
      changed_when: true

    # --- Reporting ----------------------------------------------------------
    # Read-only summary of the resulting state. `sshare` is best-effort:
    # `|| true` keeps a failing sshare from failing the task.
    - name: Show configured QOS and associations
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### TRES"
        sacctmgr show tres

        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%40,MaxJobsPU

        echo
        echo "### Associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%60,DefaultQOS,Fairshare

        echo
        echo "### Fairshare"
        sshare -A {{ slurm_account_name | quote }} || true
      args:
        executable: /bin/bash
      register: qos_state
      changed_when: false

    - name: Print QOS state
      ansible.builtin.debug:
        var: qos_state.stdout_lines
|