Add Slurm AI/HPC cluster platform project
lint / shell-yaml-ansible (push) Failing after 47s

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit d300d490f5
49 changed files with 4777 additions and 0 deletions
@@ -0,0 +1,126 @@
---
# Play: seeds the Slurm accounting database through sacctmgr. The tasks that
# follow register a cluster, an account and a user association, and print the
# accounting state before and after the changes.
- name: Initialize Slurm accounting entities
# Host pattern the play targets.
hosts: slurm_controller
# Every task runs with privilege escalation.
become: true
# Fact gathering is skipped; none of the tasks visible here read ansible_facts.
gather_facts: false
tasks:
# Readiness gate: keep calling `sacctmgr -n list cluster` until it exits 0,
# pausing 2 seconds between tries and giving up once the 20 configured retries
# are exhausted. The command only lists data, so it is never reported as a
# change.
- name: Wait for sacctmgr connectivity
  ansible.builtin.command:
    argv:
      - sacctmgr
      - "-n"
      - list
      - cluster
  register: sacctmgr_cluster_list
  until: sacctmgr_cluster_list.rc == 0
  retries: 20
  delay: 2
  changed_when: false
# Snapshot of the accounting database taken before anything is modified.
# The script only issues `sacctmgr list` queries (clusters, accounts, users,
# associations), so the task is always reported as unchanged.
- name: Show current accounting state before changes
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: |
      set -euo pipefail
      echo "### clusters"
      sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
      echo
      echo "### accounts"
      sacctmgr list account format=Account,Descr,Org
      echo
      echo "### users"
      sacctmgr list user format=User,DefaultAccount,Admin
      echo
      echo "### associations"
      sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
  changed_when: false
  register: accounting_state_before

# Echo the captured snapshot line by line into the play output.
- name: Print current accounting state before changes
  ansible.builtin.debug:
    var: accounting_state_before.stdout_lines
# Registers the cluster named by `slurm_cluster_name` in the accounting
# database unless it is already listed there.
#
# Variables read: slurm_cluster_name.
# Registers: ensure_cluster (reported as changed only when sacctmgr printed
# its "Adding Cluster" banner).
#
# Implementation notes:
# - `-P` (parsable2) makes sacctmgr print full '|'-delimited values; the
#   default fixed-width columns truncate long names, which would make the
#   existence check miss them.
# - The name is compared as an exact string in awk (same approach as the
#   account/user tasks) instead of `grep -qx`, which treated the name as a
#   regular expression and, by exiting early, could trip `pipefail`.
# - The `quote` filter keeps the templated name a single, literal shell word.
- name: Ensure Slurm cluster exists in accounting DB
  ansible.builtin.shell: |
    set -euo pipefail
    # (cluster "") forces a string comparison even for numeric-looking names.
    if sacctmgr -n -P list cluster format=Cluster \
      | awk -F'|' -v cluster={{ slurm_cluster_name | quote }} \
          '$1 == (cluster "") {found=1} END {exit !found}'; then
      printf 'Cluster %s already exists\n' {{ slurm_cluster_name | quote }}
    else
      sacctmgr -i add cluster {{ slurm_cluster_name | quote }}
    fi
  args:
    executable: /bin/bash
  register: ensure_cluster
  changed_when: "'Adding Cluster' in ensure_cluster.stdout"
# Creates the account `slurm_account_name` on cluster `slurm_cluster_name`
# unless an account-level association (one with an empty User column) already
# exists for that cluster/account pair.
#
# Variables read: slurm_cluster_name, slurm_account_name,
# slurm_account_description, slurm_account_organization.
# Registers: ensure_account (changed only when sacctmgr printed
# "Adding Account").
#
# Implementation notes:
# - `-P` (parsable2) output is '|'-delimited and untruncated, so long cluster
#   or account names still compare equal and the empty User field stays a
#   real (empty) third column.
# - Names reach awk through `-v` rather than being pasted into the awk
#   program text, and every templated value goes through the `quote` filter,
#   so spaces, quotes or `$` in a description cannot break the command.
- name: Ensure default lab account exists for cluster
  ansible.builtin.shell: |
    set -euo pipefail
    # (var "") forces string comparison even for numeric-looking names.
    if sacctmgr -n -P list assoc format=Cluster,Account,User \
      | awk -F'|' \
          -v cluster={{ slurm_cluster_name | quote }} \
          -v account={{ slurm_account_name | quote }} \
          '$1 == (cluster "") && $2 == (account "") && $3 == "" {found=1} END {exit !found}'; then
      printf 'Account %s already associated with cluster %s\n' \
        {{ slurm_account_name | quote }} {{ slurm_cluster_name | quote }}
    else
      sacctmgr -i add account {{ slurm_account_name | quote }} \
        Cluster={{ slurm_cluster_name | quote }} \
        Description={{ slurm_account_description | quote }} \
        Organization={{ slurm_account_organization | quote }}
    fi
  args:
    executable: /bin/bash
  register: ensure_account
  changed_when: "'Adding Account' in ensure_account.stdout"
# Adds the user `slurmuser` to account `slurm_account_name` on cluster
# `slurm_cluster_name` (also making it the user's default account) unless
# that cluster/account/user association is already present.
#
# Variables read: slurm_cluster_name, slurm_account_name.
# Registers: ensure_user_assoc (changed only when sacctmgr printed
# "Adding User").
#
# Implementation notes:
# - `-P` (parsable2) output is '|'-delimited and untruncated; the default
#   fixed-width columns would cut long names short and defeat the check.
# - Names are handed to awk via `-v` and to the shell via the `quote` filter
#   so they are always treated as literal values.
- name: Ensure slurmuser exists with lab account association
  ansible.builtin.shell: |
    set -euo pipefail
    # (var "") forces string comparison even for numeric-looking names.
    if sacctmgr -n -P list assoc format=Cluster,Account,User \
      | awk -F'|' \
          -v cluster={{ slurm_cluster_name | quote }} \
          -v account={{ slurm_account_name | quote }} \
          '$1 == (cluster "") && $2 == (account "") && $3 == "slurmuser" {found=1} END {exit !found}'; then
      printf 'User slurmuser already associated with account %s on cluster %s\n' \
        {{ slurm_account_name | quote }} {{ slurm_cluster_name | quote }}
    else
      sacctmgr -i add user slurmuser \
        Cluster={{ slurm_cluster_name | quote }} \
        Account={{ slurm_account_name | quote }} \
        DefaultAccount={{ slurm_account_name | quote }}
    fi
  args:
    executable: /bin/bash
  register: ensure_user_assoc
  changed_when: "'Adding User' in ensure_user_assoc.stdout"
# Sets the default account of `slurmuser` to `slurm_account_name`.
#
# Variables read: slurm_account_name.
# Registers: set_default_account (changed when sacctmgr reports
# "Modified user" on stdout or stderr).
#
# This is a single command with no pipes, redirects or shell logic, so it
# runs through the command module with an argv list: no shell is spawned and
# the templated account name is passed as one literal argument.
- name: Ensure slurmuser has default account set
  ansible.builtin.command:
    argv:
      - sacctmgr
      - "-i"
      - modify
      - user
      - where
      - name=slurmuser
      - set
      - "DefaultAccount={{ slurm_account_name }}"
  register: set_default_account
  changed_when: "'Modified user' in (set_default_account.stdout + set_default_account.stderr)"
# Snapshot of the accounting database taken after the preceding tasks ran.
# The script only issues `sacctmgr list` queries (clusters, accounts, users,
# associations), so the task is always reported as unchanged.
- name: Show final accounting state
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: |
      set -euo pipefail
      echo "### clusters"
      sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
      echo
      echo "### accounts"
      sacctmgr list account format=Account,Descr,Org
      echo
      echo "### users"
      sacctmgr list user format=User,DefaultAccount,Admin
      echo
      echo "### associations"
      sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
  changed_when: false
  register: accounting_state_after

# Echo the captured snapshot line by line into the play output.
- name: Print final accounting state
  ansible.builtin.debug:
    var: accounting_state_after.stdout_lines