Add Slurm AI/HPC cluster platform project

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit cd6830334b
47 changed files with 4727 additions and 0 deletions
@@ -0,0 +1,90 @@
---
# Dump the SlurmDBD accounting database on the controller, sanity-check the
# compressed dump and fetch a copy to the Ansible control machine.
#
# Expects `slurmdbd_storage_loc` (database name) to be provided by inventory
# or group vars; it is not defined in this play.
- name: Backup SlurmDBD MariaDB database
  hosts: slurm_controller
  become: true
  gather_facts: true
  vars:
    slurmdbd_backup_dir: /var/backups/slurmdbd
    local_fetch_dir: "{{ playbook_dir }}/../../artifacts/backups/slurmdbd"
  tasks:
    - name: Create remote backup directory
      ansible.builtin.file:
        path: "{{ slurmdbd_backup_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0700"

    # This task runs unprivileged on the control machine (become: false), so
    # no owner/group is forced: a chown to root would fail for a non-root
    # operator. The directory ends up owned by whoever runs the playbook.
    - name: Create local fetch directory on Ansible controller
      ansible.builtin.file:
        path: "{{ local_fetch_dir }}"
        state: directory
        mode: "0700"
      delegate_to: localhost
      become: false

    # `systemctl is-active` exits non-zero when the unit is not active, which
    # fails the task and stops the backup early.
    - name: Validate MariaDB is running
      ansible.builtin.command:
        cmd: systemctl is-active mariadb
      changed_when: false

    - name: Validate SlurmDBD is running
      ansible.builtin.command:
        cmd: systemctl is-active slurmdbd
      changed_when: false

    # LIKE treats `_` as a wildcard, so the result is re-checked with an
    # exact whole-line grep.
    - name: Validate Slurm accounting database exists
      ansible.builtin.shell: |
        set -euo pipefail
        mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';" | grep -qx "{{ slurmdbd_storage_loc }}"
      args:
        executable: /bin/bash
      changed_when: false

    # The script prints the path of the dump as its only stdout line; the
    # following tasks read it from db_dump.stdout.
    - name: Dump Slurm accounting database
      ansible.builtin.shell: |
        set -euo pipefail
        ts="$(date +%F-%H%M%S)"
        out="{{ slurmdbd_backup_dir }}/{{ slurmdbd_storage_loc }}-${ts}.sql.gz"
        mysqldump \
          --single-transaction \
          --routines \
          --events \
          --triggers \
          {{ slurmdbd_storage_loc }} | gzip -9 > "$out"
        chmod 0600 "$out"
        echo "$out"
      args:
        executable: /bin/bash
      register: db_dump
      changed_when: true

    - name: Validate backup file is non-empty
      ansible.builtin.stat:
        path: "{{ db_dump.stdout }}"
      register: backup_file

    # Anything below 1 KiB is treated as an empty/failed dump.
    - name: Fail if backup file is empty
      ansible.builtin.fail:
        msg: "Backup file is empty: {{ db_dump.stdout }}"
      when: backup_file.stat.size | int < 1024

    # flat: true stores the file directly under local_fetch_dir instead of a
    # per-host directory tree.
    - name: Fetch DB backup to Ansible controller
      ansible.builtin.fetch:
        src: "{{ db_dump.stdout }}"
        dest: "{{ local_fetch_dir }}/"
        flat: true

    - name: Show DB backup result
      ansible.builtin.debug:
        msg:
          - "Remote backup: {{ db_dump.stdout }}"
          - "Backup size bytes: {{ backup_file.stat.size }}"
          - "Fetched to: {{ local_fetch_dir }}/"
@@ -0,0 +1,126 @@
---
# Create the cluster, the default account and the slurmuser association in
# the Slurm accounting database, printing the state before and after.
#
# Expects slurm_cluster_name, slurm_account_name, slurm_account_description
# and slurm_account_organization from inventory/group vars.
- name: Initialize Slurm accounting entities
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Retries for up to ~40 s (20 x 2 s) until sacctmgr can list clusters.
    - name: Wait for sacctmgr connectivity
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      register: sacctmgr_cluster_list
      retries: 20
      delay: 2
      until: sacctmgr_cluster_list.rc == 0
      changed_when: false

    - name: Show current accounting state before changes
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org
        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin
        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: accounting_state_before
      changed_when: false

    - name: Print current accounting state before changes
      ansible.builtin.debug:
        var: accounting_state_before.stdout_lines

    # The existence checks below use -P (parsable, '|'-delimited) output.
    # The default table output pads and truncates each column to a fixed
    # width, so longer names would never match and the "add" branch would
    # run on every play.
    - name: Ensure Slurm cluster exists in accounting DB
      ansible.builtin.shell: |
        set -euo pipefail
        if sacctmgr -n -P list cluster format=Cluster | grep -qxF "{{ slurm_cluster_name }}"; then
          echo "Cluster {{ slurm_cluster_name }} already exists"
        else
          sacctmgr -i add cluster {{ slurm_cluster_name }}
        fi
      args:
        executable: /bin/bash
      register: ensure_cluster
      changed_when: "'Adding Cluster' in ensure_cluster.stdout"

    # An association row with an empty User field is matched as the
    # account-level association for the cluster.
    - name: Ensure default lab account exists for cluster
      ansible.builtin.shell: |
        set -euo pipefail
        if sacctmgr -n -P list assoc format=Cluster,Account,User | awk -F'|' '$1=="{{ slurm_cluster_name }}" && $2=="{{ slurm_account_name }}" && $3=="" {found=1} END {exit !found}'; then
          echo "Account {{ slurm_account_name }} already associated with cluster {{ slurm_cluster_name }}"
        else
          sacctmgr -i add account {{ slurm_account_name }} \
            Cluster={{ slurm_cluster_name }} \
            Description="{{ slurm_account_description }}" \
            Organization="{{ slurm_account_organization }}"
        fi
      args:
        executable: /bin/bash
      register: ensure_account
      changed_when: "'Adding Account' in ensure_account.stdout"

    - name: Ensure slurmuser exists with lab account association
      ansible.builtin.shell: |
        set -euo pipefail
        if sacctmgr -n -P list assoc format=Cluster,Account,User | awk -F'|' '$1=="{{ slurm_cluster_name }}" && $2=="{{ slurm_account_name }}" && $3=="slurmuser" {found=1} END {exit !found}'; then
          echo "User slurmuser already associated with account {{ slurm_account_name }} on cluster {{ slurm_cluster_name }}"
        else
          sacctmgr -i add user slurmuser \
            Cluster={{ slurm_cluster_name }} \
            Account={{ slurm_account_name }} \
            DefaultAccount={{ slurm_account_name }}
        fi
      args:
        executable: /bin/bash
      register: ensure_user_assoc
      changed_when: "'Adding User' in ensure_user_assoc.stdout"

    # Change detection looks for sacctmgr's "Modified user..." message on
    # either output stream.
    - name: Ensure slurmuser has default account set
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser set DefaultAccount={{ slurm_account_name }}
      args:
        executable: /bin/bash
      register: set_default_account
      changed_when: "'Modified user' in (set_default_account.stdout + set_default_account.stderr)"

    - name: Show final accounting state
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org
        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin
        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: accounting_state_after
      changed_when: false

    - name: Print final accounting state
      ansible.builtin.debug:
        var: accounting_state_after.stdout_lines
@@ -0,0 +1,98 @@
---
# Restore drill: import the newest SlurmDBD dump into a scratch database
# (<slurmdbd_storage_loc>_restorecheck) and report what arrived. The scratch
# database is dropped and recreated on every run.
- name: Restore-check latest SlurmDBD backup into test database
  hosts: slurm_controller
  gather_facts: false
  become: true
  vars:
    slurmdbd_backup_dir: /var/backups/slurmdbd
    restore_check_db: "{{ slurmdbd_storage_loc }}_restorecheck"
  tasks:
    - name: Validate MariaDB is running
      ansible.builtin.command:
        cmd: systemctl is-active mariadb
      changed_when: false

    # Newest dump by modification time; the task fails if none match.
    - name: Find latest SlurmDBD backup
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          ls -1t {{ slurmdbd_backup_dir }}/{{ slurmdbd_storage_loc }}-*.sql.gz | head -n 1
        executable: /bin/bash
      changed_when: false
      register: latest_backup

    - name: Validate latest backup exists
      ansible.builtin.stat:
        path: "{{ latest_backup.stdout }}"
      register: latest_backup_stat

    # Dumps smaller than 1 KiB are treated as empty.
    - name: Fail if latest backup is missing or empty
      when: not latest_backup_stat.stat.exists or latest_backup_stat.stat.size | int < 1024
      ansible.builtin.fail:
        msg: "Latest SlurmDBD backup is missing or empty: {{ latest_backup.stdout }}"

    - name: Recreate restore-check database
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          mysql <<SQL
          DROP DATABASE IF EXISTS {{ restore_check_db }};
          CREATE DATABASE {{ restore_check_db }};
          SQL
        executable: /bin/bash
      changed_when: true

    - name: Import backup into restore-check database
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          zcat "{{ latest_backup.stdout }}" | mysql {{ restore_check_db }}
        executable: /bin/bash
      changed_when: true

    # The import counts as successful only if at least one table exists.
    - name: Validate restored table count
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ restore_check_db }}';"
        executable: /bin/bash
      register: restored_tables
      failed_when: restored_tables.stdout | int < 1
      changed_when: false

    - name: Validate restored row count sample
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          echo "### restored database"
          echo "{{ restore_check_db }}"
          echo
          echo "### table count"
          mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ restore_check_db }}';"
          echo
          echo "### largest tables"
          mysql -N -B -e "
          SELECT table_name, table_rows
          FROM information_schema.tables
          WHERE table_schema='{{ restore_check_db }}'
          ORDER BY table_rows DESC
          LIMIT 10;
          "
        executable: /bin/bash
      changed_when: false
      register: restore_check_summary

    - name: Show restore-check result
      ansible.builtin.debug:
        msg:
          - "Imported backup: {{ latest_backup.stdout }}"
          - "Restore-check DB: {{ restore_check_db }}"
          - "Restored tables: {{ restored_tables.stdout }}"
          - "Summary:"
          - "{{ restore_check_summary.stdout_lines }}"
@@ -0,0 +1,105 @@
---
# Install MariaDB and slurmdbd on the controller, create the accounting
# database and its DB user, deploy slurmdbd.conf and verify the daemon is
# active and listening.
#
# Expects slurmdbd_storage_loc, slurmdbd_storage_user, slurmdbd_storage_pass
# and slurmdbd_port from inventory/group vars.
- name: Install and configure MariaDB for SlurmDBD
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Install MariaDB and SlurmDBD packages
      ansible.builtin.apt:
        name:
          - mariadb-server
          - mariadb-client
          - slurmdbd
          - slurm-wlm-mysql-plugin
        state: present
        update_cache: true

    - name: Ensure MariaDB is enabled and running
      ansible.builtin.systemd:
        name: mariadb
        enabled: true
        state: started

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        path: /var/log/slurm
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"

    # The rendered script contains the database password, so the task result
    # is hidden from console output and logs with no_log.
    # All statements use IF NOT EXISTS / GRANT, so re-running is safe even
    # though the task always reports "changed".
    - name: Create Slurm accounting database and DB user
      ansible.builtin.shell: |
        set -euo pipefail
        mysql <<SQL
        CREATE DATABASE IF NOT EXISTS {{ slurmdbd_storage_loc }};
        CREATE USER IF NOT EXISTS '{{ slurmdbd_storage_user }}'@'localhost' IDENTIFIED BY '{{ slurmdbd_storage_pass }}';
        CREATE USER IF NOT EXISTS '{{ slurmdbd_storage_user }}'@'127.0.0.1' IDENTIFIED BY '{{ slurmdbd_storage_pass }}';
        GRANT ALL PRIVILEGES ON {{ slurmdbd_storage_loc }}.* TO '{{ slurmdbd_storage_user }}'@'localhost';
        GRANT ALL PRIVILEGES ON {{ slurmdbd_storage_loc }}.* TO '{{ slurmdbd_storage_user }}'@'127.0.0.1';
        FLUSH PRIVILEGES;
        SQL
      args:
        executable: /bin/bash
      changed_when: true
      no_log: true

    - name: Ensure /etc/slurm exists
      ansible.builtin.file:
        path: /etc/slurm
        state: directory
        owner: root
        group: root
        mode: "0755"

    # Owned by slurm with mode 0600; the file is rendered from a template
    # that is not visible in this playbook.
    - name: Deploy slurmdbd.conf
      ansible.builtin.template:
        src: ../../templates/slurmdbd.conf.j2
        dest: /etc/slurm/slurmdbd.conf
        owner: slurm
        group: slurm
        mode: "0600"
      notify:
        - Restart slurmdbd

    - name: Ensure slurmdbd is enabled and running
      ansible.builtin.systemd:
        name: slurmdbd
        enabled: true
        state: started

    # Run the pending restart now so the checks below see the new config.
    - name: Flush handlers before validation
      ansible.builtin.meta: flush_handlers

    - name: Validate slurmdbd service is active
      ansible.builtin.command:
        cmd: systemctl is-active slurmdbd
      register: slurmdbd_active
      retries: 10
      delay: 2
      until: slurmdbd_active.stdout == "active"
      changed_when: false

    - name: Validate slurmdbd is listening on port
      ansible.builtin.shell: |
        set -euo pipefail
        ss -lntp | grep ':{{ slurmdbd_port }} '
      args:
        executable: /bin/bash
      register: slurmdbd_port_check
      retries: 10
      delay: 2
      until: slurmdbd_port_check.rc == 0
      changed_when: false

    - name: Show slurmdbd service validation
      ansible.builtin.debug:
        msg:
          - "slurmdbd is active"
          - "{{ slurmdbd_port_check.stdout_lines }}"

  handlers:
    - name: Restart slurmdbd
      ansible.builtin.systemd:
        name: slurmdbd
        state: restarted
@@ -0,0 +1,178 @@
---
# End-to-end accounting check on the controller: services, runtime config,
# sacctmgr entities, a real test job submitted as slurmuser, sacct/sreport
# queries and a database summary, all printed in one final debug task.
#
# Reads slurmdbd_storage_loc from inventory/group vars; slurmdbd_port is
# optional and falls back to 6819 when unset.
- name: Validate Slurm accounting production-like setup
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # The listener check uses the same slurmdbd_port variable as the
    # slurmdbd install playbook instead of a hard-coded port number.
    - name: Validate accounting services
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### services"
        systemctl is-active mariadb
        systemctl is-active slurmdbd
        systemctl is-active slurmctld
        echo
        echo "### slurmdbd listener"
        ss -lntp | grep ':{{ slurmdbd_port | default(6819) }} '
      args:
        executable: /bin/bash
      register: service_check
      changed_when: false

    - name: Validate Slurm accounting runtime config
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### accounting config"
        scontrol show config | grep -E "AccountingStorage|JobAcctGather|ClusterName"
        echo
        echo "### priority / select / cgroup config"
        scontrol show config | grep -E "SelectType|TaskPlugin|ProctrackType"
      args:
        executable: /bin/bash
      register: config_check
      changed_when: false

    - name: Validate sacctmgr entities
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org
        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin
        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: entity_check
      changed_when: false

    # Submits a short batch job as slurmuser, polls squeue for up to 90 s
    # until the job leaves the queue, then prints its sacct record and its
    # output file from /shared. The heredoc body and terminator must stay
    # unindented relative to the script: the job script has to start with
    # '#!' and '<<' does not strip leading whitespace.
    - name: Submit accounting validation job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=acct-prodlike-test
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/acct-prodlike-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"
        echo "JOB_ID=$job_id"
        for _ in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        cat "/shared/acct-prodlike-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: acct_job
      changed_when: true

    - name: Validate sacct can read recent jobs
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### recent jobs"
        sacct -S today --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30
      args:
        executable: /bin/bash
      register: sacct_recent
      changed_when: false

    # Best-effort: each sreport call is allowed to fail (|| true) so a
    # reporting error does not abort the validation run.
    - name: Validate sreport commands
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### cluster utilization"
        sreport cluster utilization start=today || true
        echo
        echo "### account utilization by user"
        sreport cluster AccountUtilizationByUser start=today || true
        echo
        echo "### user top"
        sreport user top start=today || true
      args:
        executable: /bin/bash
      register: sreport_check
      changed_when: false

    - name: Validate MariaDB table health summary
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### database exists"
        mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';"
        echo
        echo "### table count"
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ slurmdbd_storage_loc }}';"
        echo
        echo "### largest tables"
        mysql -N -B -e "
        SELECT table_name, table_rows
        FROM information_schema.tables
        WHERE table_schema='{{ slurmdbd_storage_loc }}'
        ORDER BY table_rows DESC
        LIMIT 10;
        "
      args:
        executable: /bin/bash
      register: db_health
      changed_when: false

    - name: Print accounting validation
      ansible.builtin.debug:
        msg:
          - "### services"
          - "{{ service_check.stdout_lines }}"
          - "### runtime config"
          - "{{ config_check.stdout_lines }}"
          - "### accounting entities"
          - "{{ entity_check.stdout_lines }}"
          - "### accounting validation job"
          - "{{ acct_job.stdout_lines }}"
          - "### recent sacct data"
          - "{{ sacct_recent.stdout_lines }}"
          - "### sreport"
          - "{{ sreport_check.stdout_lines }}"
          - "### database health"
          - "{{ db_health.stdout_lines }}"
@@ -0,0 +1,83 @@
---
# Per-node snapshot: copies the Slurm/Munge config, spool and log
# directories that exist on the node into a timestamped directory under
# backup_base_dir, together with service status, journal excerpts and
# scontrol/sinfo output.
- name: Backup Slurm and Munge state on all cluster nodes
  hosts: slurm_cluster
  gather_facts: true
  become: true
  vars:
    backup_base_dir: /var/backups/slurm
  tasks:
    - name: Create backup base directory
      ansible.builtin.file:
        state: directory
        path: "{{ backup_base_dir }}"
        mode: "0700"
        owner: root
        group: root

    # Prints the new directory path on stdout for the set_fact below.
    - name: Create timestamped backup directory
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          ts="$(date +%F-%H%M%S)"
          dir="{{ backup_base_dir }}/$ts"
          mkdir -p "$dir"
          echo "$dir"
        executable: /bin/bash
      changed_when: true
      register: backup_dir_result

    - name: Store backup directory fact
      ansible.builtin.set_fact:
        node_backup_dir: "{{ backup_dir_result.stdout }}"

    # Paths that do not exist on a node are skipped; every diagnostic
    # command is best-effort (|| true) so a missing service does not fail
    # the backup.
    - name: Backup Slurm and Munge config/state if present
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          backup_dir="{{ node_backup_dir }}"
          for p in \
            /etc/slurm \
            /etc/slurm-llnl \
            /etc/munge \
            /var/spool/slurmctld \
            /var/spool/slurmd \
            /var/log/slurm \
            /var/log/slurm-llnl
          do
            if [ -e "$p" ]; then
              cp -a "$p" "$backup_dir/"
            fi
          done
          systemctl status munge --no-pager > "$backup_dir/systemctl-munge.txt" 2>&1 || true
          systemctl status slurmctld --no-pager > "$backup_dir/systemctl-slurmctld.txt" 2>&1 || true
          systemctl status slurmd --no-pager > "$backup_dir/systemctl-slurmd.txt" 2>&1 || true
          journalctl -u munge -n 200 --no-pager > "$backup_dir/journal-munge.txt" 2>&1 || true
          journalctl -u slurmctld -n 200 --no-pager > "$backup_dir/journal-slurmctld.txt" 2>&1 || true
          journalctl -u slurmd -n 200 --no-pager > "$backup_dir/journal-slurmd.txt" 2>&1 || true
          if command -v sinfo >/dev/null 2>&1; then
            sinfo > "$backup_dir/sinfo.txt" 2>&1 || true
          fi
          if command -v scontrol >/dev/null 2>&1; then
            scontrol show config > "$backup_dir/scontrol-show-config.txt" 2>&1 || true
            scontrol show nodes > "$backup_dir/scontrol-show-nodes.txt" 2>&1 || true
            scontrol show partitions > "$backup_dir/scontrol-show-partitions.txt" 2>&1 || true
          fi
          find "$backup_dir" -maxdepth 2 -type f -o -type d
        executable: /bin/bash
      changed_when: true
      register: backup_content

    - name: Show backup location on node
      ansible.builtin.debug:
        msg:
          - "Host: {{ inventory_hostname }}"
          - "Backup directory: {{ node_backup_dir }}"
@@ -0,0 +1,46 @@
---
# For every cluster node: archive the newest directory under
# remote_backup_base, fetch the archive into a per-host directory on the
# control machine, then delete the temporary archive on the node.
- name: Fetch latest Slurm backups from nodes to pvef
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    remote_backup_base: /var/backups/slurm
    local_backup_base: "{{ playbook_dir }}/../../artifacts/backups"
  tasks:
    # Newest entry by modification time; fails when the directory is empty.
    - name: Find latest remote backup directory
      ansible.builtin.shell: |
        set -euo pipefail
        ls -1dt {{ remote_backup_base }}/* | head -n 1
      args:
        executable: /bin/bash
      register: latest_backup_dir
      changed_when: false

    - name: Create local backup directory on pvef
      ansible.builtin.file:
        path: "{{ local_backup_base }}/{{ inventory_hostname }}"
        state: directory
        mode: "0700"
      delegate_to: localhost
      become: false

    # The node backup playbook copies /etc/munge into these directories, so
    # the archive is created owner-only (0600) instead of inheriting the
    # umask default while it sits in /tmp.
    - name: Archive latest backup directory on remote node
      ansible.builtin.archive:
        path: "{{ latest_backup_dir.stdout }}"
        dest: "/tmp/{{ inventory_hostname }}-slurm-backup.tgz"
        format: gz
        force_archive: true
        mode: "0600"
      changed_when: true

    - name: Fetch archive to pvef
      ansible.builtin.fetch:
        src: "/tmp/{{ inventory_hostname }}-slurm-backup.tgz"
        dest: "{{ local_backup_base }}/{{ inventory_hostname }}/"
        flat: true

    - name: Remove temporary remote archive
      ansible.builtin.file:
        path: "/tmp/{{ inventory_hostname }}-slurm-backup.tgz"
        state: absent
@@ -0,0 +1,58 @@
---
# First-contact bootstrap: make sure Python, sudo and sshd are present on
# every node, install the control machine's ed25519 public key for the
# login user and grant that user passwordless sudo.
- name: Bootstrap Ansible SSH access from pvef to Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    # Public key is read from $HOME/.ssh/id_ed25519.pub on the control machine.
    ansible_controller_pubkey: "{{ lookup('file', lookup('env', 'HOME') + '/.ssh/id_ed25519.pub') }}"
  pre_tasks:
    - name: Wait for SSH
      ansible.builtin.wait_for_connection:
        timeout: 30

    # raw is used because regular modules need Python on the target.
    - name: Install Python if missing - Debian/Ubuntu
      changed_when: false
      ansible.builtin.raw: |
        test -e /usr/bin/python3 || (apt-get update && apt-get install -y python3)

  tasks:
    - name: Ensure sudo is installed
      ansible.builtin.apt:
        update_cache: true
        state: present
        name:
          - sudo
          - openssh-server

    - name: Ensure SSH server is enabled and running
      ansible.builtin.service:
        name: ssh
        enabled: true
        state: started

    # NOTE(review): the path assumes the login user's home is
    # /home/<ansible_user> and that a group of the same name exists.
    - name: Ensure .ssh directory exists for login user
      ansible.builtin.file:
        state: directory
        path: "/home/{{ ansible_user }}/.ssh"
        mode: "0700"
        owner: "{{ ansible_user }}"
        group: "{{ ansible_user }}"

    - name: Add pvef root public key to login user's authorized_keys
      ansible.builtin.authorized_key:
        state: present
        user: "{{ ansible_user }}"
        key: "{{ ansible_controller_pubkey }}"
        manage_dir: true

    # The drop-in is syntax-checked with visudo before it is installed.
    - name: Allow bootstrap login user passwordless sudo
      ansible.builtin.copy:
        dest: "/etc/sudoers.d/90-ansible-{{ ansible_user }}"
        content: |
          {{ ansible_user }} ALL=(ALL) NOPASSWD:ALL
        mode: "0440"
        owner: root
        group: root
        validate: "visudo -cf %s"
@@ -0,0 +1,16 @@
---
# Maintains a marked block in /etc/hosts with the controller address and
# one line per entry of slurm_nodes whose managed_state is 'present'
# (entries without managed_state count as present).
- name: Configure /etc/hosts for Slurm cluster
  hosts: slurm_cluster
  gather_facts: false
  become: true
  tasks:
    - name: Add Slurm cluster hosts to /etc/hosts
      ansible.builtin.blockinfile:
        path: /etc/hosts
        block: |
          {{ slurm_control_addr }} {{ slurm_control_machine }}
          {% for node in slurm_nodes if node.managed_state | default('present') == 'present' %}
          {{ node.addr }} {{ node.name }}
          {% endfor %}
        marker: "# {mark} ANSIBLE MANAGED SLURM CLUSTER HOSTS"
@@ -0,0 +1,218 @@
---
# Creates the operator account on every node, generates an ed25519 key pair
# for it if none exists and stores each node's public key as the host fact
# `slurmuser_pubkey` for use by later plays in the same run.
- name: Create slurmuser and generate SSH keys on every Slurm node
  hosts: slurm_cluster
  gather_facts: true
  become: true
  vars:
    slurm_operator_shell: /bin/bash
    slurm_operator_user: slurmuser
  tasks:
    - name: Ensure useful packages are installed
      ansible.builtin.apt:
        update_cache: true
        state: present
        name:
          - sudo
          - openssh-client
          - openssh-server
          - acl

    - name: Ensure slurmuser exists
      ansible.builtin.user:
        state: present
        name: "{{ slurm_operator_user }}"
        create_home: true
        shell: "{{ slurm_operator_shell }}"

    - name: Ensure .ssh directory exists for slurmuser
      ansible.builtin.file:
        state: directory
        path: "/home/{{ slurm_operator_user }}/.ssh"
        mode: "0700"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"

    # force: false keeps an existing key pair instead of regenerating it.
    - name: Generate SSH key for slurmuser if missing
      ansible.builtin.openssh_keypair:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519"
        type: ed25519
        comment: "{{ slurm_operator_user }}@{{ inventory_hostname }}"
        force: false
        mode: "0600"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"

    # slurp returns the file base64-encoded; it is decoded in the next task.
    - name: Read public key from each node
      ansible.builtin.slurp:
        src: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub"
      register: slurmuser_pubkey_raw

    - name: Store decoded public key as host fact
      ansible.builtin.set_fact:
        slurmuser_pubkey: "{{ slurmuser_pubkey_raw.content | b64decode | trim }}"
# Installs every node's slurmuser public key on every node, collects SSH
# host keys for all cluster members into slurmuser's known_hosts and
# normalises ownership/permissions of the key files.
#
# Relies on the `slurmuser_pubkey` host fact having been set for every host
# in slurm_cluster earlier in the same run.
- name: Exchange slurmuser SSH keys across all Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    slurm_operator_user: slurmuser
  tasks:
    - name: Install all slurmuser public keys into authorized_keys on every node
      ansible.builtin.authorized_key:
        user: "{{ slurm_operator_user }}"
        key: "{{ hostvars[item].slurmuser_pubkey }}"
        state: present
        manage_dir: true
      loop: "{{ groups['slurm_cluster'] }}"

    # Host keys are stored unhashed on purpose: `ssh-keyscan -H` salts every
    # hashed entry randomly, so `sort -u` could never deduplicate them and
    # known_hosts would grow on each run.
    # ansible_host falls back to the inventory name when the inventory does
    # not define it, instead of aborting with an undefined-variable error.
    # A failed scan of a single host is tolerated (|| true).
    - name: Build SSH known_hosts entries for all cluster nodes
      ansible.builtin.shell: |
        set -e
        mkdir -p /home/{{ slurm_operator_user }}/.ssh
        touch /home/{{ slurm_operator_user }}/.ssh/known_hosts
        {% for host in groups['slurm_cluster'] %}
        ssh-keyscan {{ host }} {{ hostvars[host].ansible_host | default(host) }} 2>/dev/null >> /home/{{ slurm_operator_user }}/.ssh/known_hosts || true
        {% endfor %}
        sort -u /home/{{ slurm_operator_user }}/.ssh/known_hosts -o /home/{{ slurm_operator_user }}/.ssh/known_hosts
        chown {{ slurm_operator_user }}:{{ slurm_operator_user }} /home/{{ slurm_operator_user }}/.ssh/known_hosts
        chmod 0644 /home/{{ slurm_operator_user }}/.ssh/known_hosts
      args:
        executable: /bin/bash
      changed_when: true

    # The shell task above runs as root and may have created .ssh itself,
    # so ownership and modes are enforced again afterwards.
    - name: Ensure SSH permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    - name: Ensure private key permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"

    - name: Ensure public key permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0644"
# Installs a sudoers drop-in for the operator account. Controller hosts get
# rules for slurmctld and slurmd plus sacctmgr; all other hosts get the
# slurmd-only variant. Each file is validated with visudo before install.
- name: Configure sudo permissions for slurmuser
  hosts: slurm_cluster
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser
  tasks:
    - name: Configure sudoers for slurmuser on Slurm controller
      when: inventory_hostname in groups['slurm_controller']
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-controller
        mode: "0440"
        owner: root
        group: root
        validate: "visudo -cf %s"
        content: |
          # Managed by Ansible
          # Operator access for Slurm controller node.
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
              /bin/systemctl status slurmctld, \
              /bin/systemctl restart slurmctld, \
              /bin/systemctl reload slurmctld, \
              /bin/systemctl stop slurmctld, \
              /bin/systemctl start slurmctld, \
              /bin/systemctl status slurmd, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl stop slurmd, \
              /bin/systemctl start slurmd, \
              /bin/journalctl -u slurmctld, \
              /bin/journalctl -u slurmd, \
              /usr/bin/scontrol, \
              /usr/bin/sinfo, \
              /usr/bin/squeue, \
              /usr/bin/scancel, \
              /usr/bin/sacct, \
              /usr/bin/sacctmgr, \
              /usr/bin/sbatch, \
              /usr/bin/srun, \
              /usr/bin/salloc

    - name: Configure sudoers for slurmuser on Slurm compute and GPU nodes
      when: inventory_hostname not in groups['slurm_controller']
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-compute
        mode: "0440"
        owner: root
        group: root
        validate: "visudo -cf %s"
        content: |
          # Managed by Ansible
          # Operator access for Slurm worker/GPU nodes.
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
              /bin/systemctl status slurmd, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl stop slurmd, \
              /bin/systemctl start slurmd, \
              /bin/journalctl -u slurmd, \
              /usr/bin/scontrol, \
              /usr/bin/sinfo, \
              /usr/bin/squeue, \
              /usr/bin/scancel, \
              /usr/bin/sacct, \
              /usr/bin/sbatch, \
              /usr/bin/srun, \
              /usr/bin/salloc
# Smoke test for the operator account: run sinfo as slurmuser on every
# node, then SSH from every node to every cluster member (itself included)
# in batch mode and print the hostnames that answered.
- name: Validate slurmuser SSH mesh and Slurm access
  hosts: slurm_cluster
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser
  tasks:
    - name: Test local Slurm commands as slurmuser
      ansible.builtin.command:
        cmd: "sudo -iu {{ slurm_operator_user }} sinfo"
      register: sinfo_test
      failed_when: sinfo_test.rc != 0
      changed_when: false

    - name: Show sinfo result
      ansible.builtin.debug:
        var: sinfo_test.stdout_lines

    # BatchMode=yes makes ssh fail instead of prompting, and `set -e` stops
    # at the first unreachable host.
    - name: Test SSH from each node to every other node as slurmuser
      become_user: "{{ slurm_operator_user }}"
      ansible.builtin.shell:
        cmd: |
          set -e
          {% for host in groups['slurm_cluster'] %}
          ssh -o BatchMode=yes -o ConnectTimeout=5 {{ host }} 'hostname'
          {% endfor %}
        executable: /bin/bash
      register: ssh_mesh_test
      changed_when: false

    - name: Show SSH mesh test result
      ansible.builtin.debug:
        var: ssh_mesh_test.stdout_lines
@@ -0,0 +1,112 @@
---
# Rewrites the slurmuser sudoers drop-ins using Cmnd_Alias groups that list
# both /bin and /usr/bin paths and allow trailing arguments. Controller
# hosts get the slurmctld + slurmd set, every other host the slurmd set.
#
# NOTE(review): both files define an alias named SLURM_COMMANDS. Only one
# file is written per host by this play; confirm a host can never end up
# with both files (e.g. after a role change), since visudo only validates
# the single file being installed.
# NOTE(review): the rules allow srun/sbatch/salloc with arbitrary arguments
# as root - confirm this is intended.
- name: Fix sudo permissions for slurmuser Slurm operations
  hosts: slurm_cluster
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser
  tasks:
    - name: Configure sudoers for slurmuser on controller
      when: inventory_hostname in groups['slurm_controller']
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-controller
        mode: "0440"
        owner: root
        group: root
        validate: "visudo -cf %s"
        content: |
          # Managed by Ansible
          Cmnd_Alias SLURM_SYSTEMCTL_CONTROLLER = \
              /bin/systemctl status slurmctld, \
              /bin/systemctl status slurmctld *, \
              /bin/systemctl restart slurmctld, \
              /bin/systemctl reload slurmctld, \
              /bin/systemctl start slurmctld, \
              /bin/systemctl stop slurmctld, \
              /bin/systemctl status slurmd, \
              /bin/systemctl status slurmd *, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl start slurmd, \
              /bin/systemctl stop slurmd, \
              /usr/bin/systemctl status slurmctld, \
              /usr/bin/systemctl status slurmctld *, \
              /usr/bin/systemctl restart slurmctld, \
              /usr/bin/systemctl reload slurmctld, \
              /usr/bin/systemctl start slurmctld, \
              /usr/bin/systemctl stop slurmctld, \
              /usr/bin/systemctl status slurmd, \
              /usr/bin/systemctl status slurmd *, \
              /usr/bin/systemctl restart slurmd, \
              /usr/bin/systemctl reload slurmd, \
              /usr/bin/systemctl start slurmd, \
              /usr/bin/systemctl stop slurmd
          Cmnd_Alias SLURM_JOURNAL_CONTROLLER = \
              /bin/journalctl -u slurmctld, \
              /bin/journalctl -u slurmctld *, \
              /bin/journalctl -u slurmd, \
              /bin/journalctl -u slurmd *, \
              /usr/bin/journalctl -u slurmctld, \
              /usr/bin/journalctl -u slurmctld *, \
              /usr/bin/journalctl -u slurmd, \
              /usr/bin/journalctl -u slurmd *
          Cmnd_Alias SLURM_COMMANDS = \
              /usr/bin/scontrol, /usr/bin/scontrol *, \
              /usr/bin/sinfo, /usr/bin/sinfo *, \
              /usr/bin/squeue, /usr/bin/squeue *, \
              /usr/bin/scancel, /usr/bin/scancel *, \
              /usr/bin/sacct, /usr/bin/sacct *, \
              /usr/bin/sacctmgr, /usr/bin/sacctmgr *, \
              /usr/bin/sbatch, /usr/bin/sbatch *, \
              /usr/bin/srun, /usr/bin/srun *, \
              /usr/bin/salloc, /usr/bin/salloc *
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: SLURM_SYSTEMCTL_CONTROLLER, SLURM_JOURNAL_CONTROLLER, SLURM_COMMANDS

    - name: Configure sudoers for slurmuser on compute and GPU nodes
      when: inventory_hostname not in groups['slurm_controller']
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-compute
        mode: "0440"
        owner: root
        group: root
        validate: "visudo -cf %s"
        content: |
          # Managed by Ansible
          Cmnd_Alias SLURM_SYSTEMCTL_COMPUTE = \
              /bin/systemctl status slurmd, \
              /bin/systemctl status slurmd *, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl start slurmd, \
              /bin/systemctl stop slurmd, \
              /usr/bin/systemctl status slurmd, \
              /usr/bin/systemctl status slurmd *, \
              /usr/bin/systemctl restart slurmd, \
              /usr/bin/systemctl reload slurmd, \
              /usr/bin/systemctl start slurmd, \
              /usr/bin/systemctl stop slurmd
          Cmnd_Alias SLURM_JOURNAL_COMPUTE = \
              /bin/journalctl -u slurmd, \
              /bin/journalctl -u slurmd *, \
              /usr/bin/journalctl -u slurmd, \
              /usr/bin/journalctl -u slurmd *
          Cmnd_Alias SLURM_COMMANDS = \
              /usr/bin/scontrol, /usr/bin/scontrol *, \
              /usr/bin/sinfo, /usr/bin/sinfo *, \
              /usr/bin/squeue, /usr/bin/squeue *, \
              /usr/bin/scancel, /usr/bin/scancel *, \
              /usr/bin/sacct, /usr/bin/sacct *, \
              /usr/bin/sbatch, /usr/bin/sbatch *, \
              /usr/bin/srun, /usr/bin/srun *, \
              /usr/bin/salloc, /usr/bin/salloc *
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: SLURM_SYSTEMCTL_COMPUTE, SLURM_JOURNAL_COMPUTE, SLURM_COMMANDS
@@ -0,0 +1,133 @@
---
# Reads /etc/munge/munge.key on the controller and keeps it, still
# base64-encoded, in the host fact `cluster_munge_key_b64` for later plays.
# Tasks that carry the key material run with no_log so the secret is not
# printed in console output or logs.
- name: Read Munge key from Slurm controller
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Check controller munge.key exists
      ansible.builtin.stat:
        path: /etc/munge/munge.key
      register: controller_munge_key

    - name: Fail if controller munge.key is missing
      ansible.builtin.fail:
        msg: "/etc/munge/munge.key is missing on controller. Do not continue."
      when: not controller_munge_key.stat.exists

    # slurp returns the file content base64-encoded in `.content`.
    - name: Read controller munge.key
      ansible.builtin.slurp:
        src: /etc/munge/munge.key
      register: controller_munge_key_raw
      no_log: true

    - name: Store controller Munge key as fact
      ansible.builtin.set_fact:
        cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}"
      no_log: true
# Installs munge on every node, writes the controller's key to
# /etc/munge/munge.key, prepares munge's directories and makes sure the
# service is enabled and running. A changed key triggers a munge restart.
#
# Relies on the host fact `cluster_munge_key_b64` set on the first host of
# the slurm_controller group earlier in the same run.
- name: Deploy controller Munge key to all Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    controller_host: "{{ groups['slurm_controller'][0] }}"
  tasks:
    - name: Ensure munge package is installed
      ansible.builtin.apt:
        name:
          - munge
          - libmunge2
        state: present
        update_cache: true

    - name: Ensure munge group exists
      ansible.builtin.group:
        name: munge
        system: true
        state: present

    - name: Ensure munge user exists
      ansible.builtin.user:
        name: munge
        group: munge
        system: true
        shell: /usr/sbin/nologin
        home: /nonexistent
        create_home: false
        state: present

    - name: Ensure /etc/munge exists
      ansible.builtin.file:
        path: /etc/munge
        state: directory
        owner: munge
        group: munge
        mode: "0700"

    # no_log keeps the decoded key out of task output and --diff listings.
    # NOTE(review): b64decode yields text; if the controller key contains
    # arbitrary binary bytes, confirm the deployed file is byte-identical
    # to the source (e.g. compare checksums across nodes).
    - name: Deploy shared munge.key from controller
      ansible.builtin.copy:
        dest: /etc/munge/munge.key
        content: "{{ hostvars[controller_host].cluster_munge_key_b64 | b64decode }}"
        owner: munge
        group: munge
        mode: "0400"
      no_log: true
      notify:
        - Restart munge

    - name: Ensure /var/log/munge exists
      ansible.builtin.file:
        path: /var/log/munge
        state: directory
        owner: munge
        group: munge
        mode: "0755"

    - name: Ensure /var/lib/munge exists
      ansible.builtin.file:
        path: /var/lib/munge
        state: directory
        owner: munge
        group: munge
        mode: "0711"

    - name: Ensure /run/munge exists
      ansible.builtin.file:
        path: /run/munge
        state: directory
        owner: munge
        group: munge
        mode: "0755"

    - name: Ensure munge is enabled and running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: started

  handlers:
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted
# Local round-trip check: encode a credential with munge and decode it with
# unmunge on the same node, then print the decoder's output.
- name: Validate Munge locally on all nodes
  hosts: slurm_cluster
  gather_facts: false
  become: true
  tasks:
    - name: Test local munge encode/decode
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          munge -n | unmunge
        executable: /bin/bash
      changed_when: false
      register: munge_local_test

    - name: Show local Munge validation
      ansible.builtin.debug:
        var: munge_local_test.stdout_lines
@@ -0,0 +1,132 @@
---
# Play: create the config, log and spool directories used by Slurm.
# Spool directories are role-specific: slurmctld on the controller group,
# slurmd on the compute and GPU groups.
- name: Prepare Slurm config directories and logs
  hosts: slurm_cluster
  become: true
  gather_facts: false
  tasks:
    - name: Ensure Slurm config directory exists
      ansible.builtin.file:
        path: "{{ slurm_config_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        path: /var/log/slurm
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"

    - name: Ensure slurmctld spool directory exists on controller
      ansible.builtin.file:
        path: /var/spool/slurmctld
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"
      when: inventory_hostname in groups['slurm_controller']

    # groups.get(..., []) keeps the condition valid when an inventory has no
    # slurm_gpu (or slurm_compute) group at all.
    - name: Ensure slurmd spool directory exists on workers
      ansible.builtin.file:
        path: /var/spool/slurmd
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"
      when: >-
        inventory_hostname in groups.get('slurm_compute', [])
        or inventory_hostname in groups.get('slurm_gpu', [])
# Play: render slurm.conf (and optionally cgroup.conf / gres.conf) onto every
# cluster node. A one-time copy of any pre-existing slurm.conf is kept as
# slurm.conf.pre-ansible-managed. Changes notify a controller reconfigure and
# a slurmd restart on workers.
- name: Deploy Slurm config files
  hosts: slurm_cluster
  become: true
  gather_facts: false
  tasks:
    # A fresh node has no slurm.conf yet; copying a missing remote source
    # would fail the play, so only back up when the file is there.
    - name: Check for an existing slurm.conf
      ansible.builtin.stat:
        path: "{{ slurm_config_dir }}/slurm.conf"
      register: existing_slurm_conf

    - name: Backup current slurm.conf before managed deployment
      ansible.builtin.copy:
        src: "{{ slurm_config_dir }}/slurm.conf"
        dest: "{{ slurm_config_dir }}/slurm.conf.pre-ansible-managed"
        remote_src: true
        owner: root
        group: root
        mode: "0644"
        # force: false -> never overwrite an earlier backup.
        force: false
      when: existing_slurm_conf.stat.exists

    - name: Deploy managed slurm.conf
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

    - name: Deploy managed cgroup.conf
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

    - name: Deploy managed gres.conf only on GPU nodes
      ansible.builtin.template:
        src: ../../templates/gres.conf.j2
        dest: "{{ slurm_config_dir }}/gres.conf"
        owner: root
        group: root
        mode: "0644"
      # groups.get tolerates inventories without a slurm_gpu group.
      when: inventory_hostname in groups.get('slurm_gpu', [])
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

  handlers:
    - name: Reconfigure slurmctld
      ansible.builtin.command:
        cmd: scontrol reconfigure
      when: inventory_hostname in groups['slurm_controller']
      changed_when: true

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
      when: >-
        inventory_hostname in groups.get('slurm_compute', [])
        or inventory_hostname in groups.get('slurm_gpu', [])
# Play: on the controller, issue a reconfigure and then print ping, sinfo and
# per-node details so the operator can verify the new configuration.
- name: Validate Slurm after config deployment
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Reconfigure controller
      ansible.builtin.command:
        cmd: scontrol reconfigure
      changed_when: true

    # Any failing command aborts the script (set -e), failing the task.
    - name: Validate cluster state
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol ping
        sinfo
        scontrol show nodes
      args:
        executable: /bin/bash
      changed_when: false
      register: slurm_config_validation

    - name: Show validation output
      ansible.builtin.debug:
        var: slurm_config_validation.stdout_lines
@@ -0,0 +1,103 @@
---
# Play: restart munge and slurmctld on the controller, then poll
# `scontrol ping` (up to 15 tries, 2 s apart) until it exits 0.
- name: Restart Slurm controller safely
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Restart munge on controller
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmctld on controller
      ansible.builtin.systemd:
        name: slurmctld
        enabled: true
        state: restarted

    - name: Wait for slurmctld to answer
      ansible.builtin.command:
        cmd: scontrol ping
      register: scontrol_ping
      until: scontrol_ping.rc == 0
      retries: 15
      delay: 2
      changed_when: false

    - name: Show controller ping
      ansible.builtin.debug:
        var: scontrol_ping.stdout_lines
# Play: rolling restart of munge and slurmd on compute and GPU hosts.
# serial: 1 processes one host at a time; each host must report slurmd active
# and be resolvable via `scontrol show node` on the controller before the
# play moves on.
- name: Restart Slurm workers safely one by one
  hosts: slurm_compute:slurm_gpu
  serial: 1
  gather_facts: false
  become: true
  tasks:
    - name: Restart munge on worker
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmd on worker
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    - name: Wait for slurmd to be active
      ansible.builtin.command:
        cmd: systemctl is-active slurmd
      register: slurmd_active
      until: slurmd_active.stdout == "active"
      retries: 15
      delay: 2
      changed_when: false

    # Runs on the first controller host, querying this worker by its
    # inventory name.
    - name: Wait until this node is visible in Slurm
      ansible.builtin.command:
        cmd: scontrol show node {{ inventory_hostname }}
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: node_visible
      until: node_visible.rc == 0
      retries: 15
      delay: 2
      changed_when: false
# Play: print a sectioned report (ping, sinfo, nodes, partitions) from the
# controller after the restart sequence. Read-only.
- name: Validate Slurm after restart
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Validate Slurm cluster state
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### scontrol ping"
        scontrol ping
        echo
        echo "### sinfo"
        sinfo
        echo
        echo "### nodes"
        scontrol show nodes
        echo
        echo "### partitions"
        scontrol show partitions
      args:
        executable: /bin/bash
      changed_when: false
      register: slurm_validation

    - name: Show Slurm validation
      ansible.builtin.debug:
        var: slurm_validation.stdout_lines
@@ -0,0 +1,40 @@
---
# Play: print CPU, memory, topology and GPU details of every cluster node as
# KEY=VALUE lines. Read-only.
- name: Discover node resources for Slurm config
  hosts: slurm_cluster
  become: true
  gather_facts: true
  tasks:
    # The script lives in a literal block scalar, so backslashes and quotes
    # reach the shell verbatim: awk needs `\(` (literal paren) and a plain
    # `""`. The previous doubled escapes made awk fail, leaving the SOCKETS,
    # CORES_PER_SOCKET and THREADS_PER_CORE values empty.
    - name: Discover CPU and memory
      ansible.builtin.shell: |
        set -euo pipefail
        echo "HOST={{ inventory_hostname }}"
        echo "CPUS=$(nproc)"
        echo "REAL_MEMORY_MB=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo)"
        echo "SOCKETS=$(lscpu | awk -F: '/Socket\(s\)/ {gsub(/ /,"",$2); print $2}')"
        echo "CORES_PER_SOCKET=$(lscpu | awk -F: '/Core\(s\) per socket/ {gsub(/ /,"",$2); print $2}')"
        echo "THREADS_PER_CORE=$(lscpu | awk -F: '/Thread\(s\) per core/ {gsub(/ /,"",$2); print $2}')"
      args:
        executable: /bin/bash
      register: cpu_mem
      changed_when: false

    # Prints the NO_NVIDIA_SMI marker when nvidia-smi is not on PATH.
    - name: Discover NVIDIA GPU if present
      ansible.builtin.shell: |
        set -euo pipefail
        if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader
        else
          echo "NO_NVIDIA_SMI"
        fi
      args:
        executable: /bin/bash
      register: gpu_info
      changed_when: false

    - name: Show discovered resources
      ansible.builtin.debug:
        msg:
          - "{{ cpu_mem.stdout_lines }}"
          - "GPU:"
          - "{{ gpu_info.stdout_lines }}"
@@ -0,0 +1,89 @@
---
# Play: collect a read-only inventory of each node (host identity, installed
# slurm/munge packages, config file listings, service states, CLI
# availability) and print it as one sectioned report per host.
# Most probes end in `|| true` so a missing tool does not fail the play.
- name: Inspect current Slurm and Munge state
  hosts: slurm_cluster
  gather_facts: true
  become: true
  tasks:
    - name: Basic host info
      ansible.builtin.shell: |
        set -e
        echo "HOST=$(hostname -f 2>/dev/null || hostname)"
        echo "SHORT_HOST=$(hostname -s)"
        echo "IP_ADDRESSES=$(hostname -I)"
        echo "OS=$(lsb_release -ds 2>/dev/null || cat /etc/os-release | grep PRETTY_NAME || true)"
        echo "KERNEL=$(uname -r)"
      args:
        executable: /bin/bash
      changed_when: false
      register: host_info

    - name: Slurm package info
      ansible.builtin.shell: |
        dpkg -l | grep -Ei 'slurm|munge' || true
      args:
        executable: /bin/bash
      changed_when: false
      register: package_info

    # Lists mode, owner, group and path of files up to two levels deep.
    - name: Slurm config paths
      ansible.builtin.shell: |
        set -e
        for p in /etc/slurm /etc/slurm-llnl /etc/munge; do
          echo "### $p"
          if [ -e "$p" ]; then
            find "$p" -maxdepth 2 -type f -printf "%m %u %g %p\n" | sort
          else
            echo "MISSING"
          fi
        done
      args:
        executable: /bin/bash
      changed_when: false
      register: config_paths

    - name: Service state
      ansible.builtin.shell: |
        for s in munge slurmctld slurmd; do
          echo "### $s"
          systemctl is-enabled "$s" 2>/dev/null || true
          systemctl is-active "$s" 2>/dev/null || true
        done
      args:
        executable: /bin/bash
      changed_when: false
      register: service_state

    - name: Slurm commands
      ansible.builtin.shell: |
        echo "### which"
        command -v sinfo || true
        command -v scontrol || true
        command -v sbatch || true
        command -v srun || true
        command -v munge || true
        command -v unmunge || true
        echo "### sinfo"
        sinfo 2>&1 || true
        echo "### scontrol ping"
        scontrol ping 2>&1 || true
      args:
        executable: /bin/bash
      changed_when: false
      register: slurm_commands

    - name: Show inspection report
      ansible.builtin.debug:
        msg:
          - "===== {{ inventory_hostname }} :: host_info ====="
          - "{{ host_info.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: packages ====="
          - "{{ package_info.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: config_paths ====="
          - "{{ config_paths.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: services ====="
          - "{{ service_state.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: slurm_commands ====="
          - "{{ slurm_commands.stdout_lines }}"
@@ -0,0 +1,216 @@
---
# Play: ask sinfo for every node's state and keep the names whose state
# matches down/drain/fail/unknown/not_responding/idle*. The de-duplicated
# list is stored on the controller as the fact `bad_nodes`.
- name: Detect problematic Slurm nodes
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Detect nodes needing remediation
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -h -o "%N %T" | awk '
          tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
        ' | sort -u
      args:
        executable: /bin/bash
      changed_when: false
      register: bad_nodes_raw

    - name: Store bad node list
      ansible.builtin.set_fact:
        bad_nodes: "{{ bad_nodes_raw.stdout_lines }}"

    - name: Show detected problematic nodes
      ansible.builtin.debug:
        var: bad_nodes
# Play: one worker at a time, restart munge and slurmd on hosts whose
# inventory name appears in the controller's `bad_nodes` fact, then print a
# local diagnostic report. Hosts not in that list leave the play immediately.
- name: Attempt auto-remediation on problematic nodes
  hosts: slurm_compute:slurm_gpu
  serial: 1
  gather_facts: false
  become: true
  vars:
    bad_nodes_from_controller: "{{ hostvars[groups['slurm_controller'][0]].bad_nodes | default([]) }}"
  tasks:
    - name: Skip healthy nodes
      ansible.builtin.meta: end_host
      when: inventory_hostname not in bad_nodes_from_controller

    - name: Restart Munge
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    # Service, munge and controller-ping checks are fatal (set -e); the
    # listener and journal sections are best-effort (`|| true`).
    - name: Validate local services after remediation attempt
      ansible.builtin.shell: |
        set -euo pipefail
        echo "HOST=$(hostname)"
        echo
        echo "### services"
        systemctl is-active munge
        systemctl is-active slurmd
        echo
        echo "### munge"
        munge -n | unmunge >/dev/null
        echo "munge OK"
        echo
        echo "### controller ping"
        scontrol ping
        echo
        echo "### slurmd listener"
        ss -lntp | grep ':6818 ' || true
        echo
        echo "### recent slurmd logs"
        journalctl -u slurmd -n 30 --no-pager || true
      args:
        executable: /bin/bash
      changed_when: false
      register: local_repair_check

    - name: Print local remediation result
      ansible.builtin.debug:
        var: local_repair_check.stdout_lines
# Play: after the worker-side restarts, restart slurmctld, try to clear the
# state of every node in `bad_nodes`, re-run the detection, drain whatever is
# still unhealthy and print a summary.
- name: Refresh controller and validate remediated nodes
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Restart slurmctld to refresh node states
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # Each scontrol update is best-effort: errors are discarded so that one
    # rejected transition does not stop the remaining attempts.
    - name: Clear maintenance state on previously bad nodes
      ansible.builtin.shell: |
        set -euo pipefail
        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
        if [ -z "$bad_nodes" ]; then
          echo "No bad nodes detected. Nothing to clear."
          sinfo -N
          exit 0
        fi
        for node in $bad_nodes; do
          echo "### clearing state on $node"
          scontrol update NodeName="$node" State=RESUME 2>/dev/null || true
          scontrol update NodeName="$node" State=UNDRAIN 2>/dev/null || true
          scontrol update NodeName="$node" State=IDLE 2>/dev/null || true
        done
        sleep 5
        sinfo -N
      args:
        executable: /bin/bash
      register: clear_result
      # Only report a change when there was something to clear, matching the
      # drain task below.
      changed_when: bad_nodes | default([]) | length > 0

    - name: Print clear-state result
      ansible.builtin.debug:
        var: clear_result.stdout_lines

    - name: Detect nodes still unhealthy after remediation
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -h -o "%N %T" | awk '
          tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
        ' | sort -u
      args:
        executable: /bin/bash
      register: still_bad_nodes_raw
      changed_when: false

    - name: Store still bad nodes
      ansible.builtin.set_fact:
        still_bad_nodes: "{{ still_bad_nodes_raw.stdout_lines }}"

    - name: Drain nodes that remain unhealthy
      ansible.builtin.shell: |
        set -euo pipefail
        unresolved_nodes="{{ still_bad_nodes | join(' ') }}"
        if [ -z "$unresolved_nodes" ]; then
          echo "No unresolved unhealthy nodes."
          sinfo -N
          exit 0
        fi
        for node in $unresolved_nodes; do
          echo "### draining unresolved node $node"
          scontrol update NodeName="$node" State=DRAIN Reason="auto-remediation failed"
        done
        sinfo -N
      args:
        executable: /bin/bash
      register: drain_unresolved
      changed_when: still_bad_nodes | length > 0

    - name: Show remediation summary
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### initial bad nodes"
        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
        if [ -z "$bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $bad_nodes
        fi
        echo
        echo "### still bad nodes"
        still_bad_nodes="{{ (still_bad_nodes | default([])) | join(' ') }}"
        if [ -z "$still_bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $still_bad_nodes
        fi
        echo
        echo "### final sinfo"
        sinfo -N
        echo
        echo "### queue"
        squeue
      args:
        executable: /bin/bash
      register: remediation_summary
      changed_when: false

    - name: Print remediation summary
      ansible.builtin.debug:
        var: remediation_summary.stdout_lines
@@ -0,0 +1,149 @@
---
# Play: print a read-only health report from the controller: service states,
# ping, nodes, partitions, queue, problematic nodes, accounting clusters and
# today's failed jobs. munge, slurmctld and the ping are fatal checks; the
# slurmdbd, mariadb and accounting sections are best-effort.
- name: Check Slurm controller health
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # "problematic nodes" also lists any state carrying the `*` suffix
    # (e.g. idle*), which would otherwise pass the idle|alloc|mix filter.
    - name: Check controller services and cluster state
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### controller services"
        systemctl is-active munge
        systemctl is-active slurmctld
        systemctl is-active slurmdbd || true
        systemctl is-active mariadb || true
        echo
        echo "### slurm ping"
        scontrol ping
        echo
        echo "### nodes"
        sinfo -N
        echo
        echo "### partitions"
        sinfo
        echo
        echo "### queue"
        squeue
        echo
        echo "### problematic nodes"
        sinfo -N -h -o "%N %T %E" | awk '$2 ~ /\*/ || $2 !~ /idle|alloc|mix/ {print}' || true
        echo
        echo "### accounting"
        sacctmgr -n list cluster || true
        echo
        echo "### recent failed jobs"
        sacct -S today --state=FAILED,CANCELLED,TIMEOUT,NODE_FAIL,OUT_OF_MEMORY \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,NodeList | tail -30 || true
      args:
        executable: /bin/bash
      register: controller_health
      changed_when: false

    - name: Print controller health
      ansible.builtin.debug:
        var: controller_health.stdout_lines
# Play: print a health report from every compute and GPU node: services,
# munge round-trip, controller connectivity, slurmd listener, config
# checksums, a write test on /shared, cgroup mounts and GPU status.
# Service, munge, controller-ping and /shared checks are fatal (set -e).
- name: Check Slurm worker health
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: true
  tasks:
    # Checksums follow slurm_config_dir (the directory the deployment plays
    # write to), falling back to /etc/slurm when the variable is not set.
    - name: Check worker services, config and connectivity
      ansible.builtin.shell: |
        set -euo pipefail
        echo "HOST=$(hostname)"
        echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
        echo "KERNEL=$(uname -r)"
        echo "UPTIME=$(uptime -p)"
        echo
        echo "### services"
        systemctl is-active munge
        systemctl is-active slurmd
        echo
        echo "### munge local test"
        munge -n | unmunge >/dev/null
        echo "munge OK"
        echo
        echo "### controller connectivity"
        getent hosts slurm-ctl01 || true
        scontrol ping
        echo
        echo "### slurmd listener"
        ss -lntp | grep ':6818 ' || true
        echo
        echo "### config checksums"
        sha256sum {{ slurm_config_dir | default('/etc/slurm') }}/slurm.conf {{ slurm_config_dir | default('/etc/slurm') }}/cgroup.conf 2>/dev/null || true
        echo
        echo "### shared filesystem"
        test -d /shared
        touch /shared/.slurm-health-$(hostname)
        ls -l /shared/.slurm-health-$(hostname)
        rm -f /shared/.slurm-health-$(hostname)
        echo
        echo "### cgroup"
        mount | grep cgroup || true
        echo
        echo "### gpu check"
        if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi --query-gpu=index,name,driver_version,memory.total,temperature.gpu,utilization.gpu --format=csv,noheader || true
        else
          echo "NO_NVIDIA_SMI"
        fi
      args:
        executable: /bin/bash
      register: worker_health
      changed_when: false

    - name: Print worker health
      ansible.builtin.debug:
        var: worker_health.stdout_lines
# Play: print a one-line-per-node summary, then the full `scontrol show node`
# record for each node whose state looks problematic. Read-only.
- name: Check Slurm-reported node state consistency
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # In a literal block scalar the backslash reaches awk as written, so the
    # pattern must be `idle\*` (idle followed by a literal asterisk). The
    # doubled form `idle\\*` matched every plain "idle" node.
    - name: Build Slurm node health summary
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### node summary"
        sinfo -N -o "%N %P %T %C %m %G %E"
        echo
        echo "### full problematic node details"
        for node in $(sinfo -N -h -o "%N %T" | awk '$2 ~ /down|drain|fail|unk|not_responding|idle\*/ {print $1}' | sort -u); do
          echo
          echo "### $node"
          scontrol show node "$node"
        done
      args:
        executable: /bin/bash
      register: slurm_node_summary
      changed_when: false

    - name: Print Slurm node summary
      ansible.builtin.debug:
        var: slurm_node_summary.stdout_lines
@@ -0,0 +1,217 @@
---
# Play: pre-flight check run on localhost. Aborts with a usage hint when
# target_node was not passed, and with an error when it is not an inventory
# host.
- name: Validate target node
  hosts: localhost
  gather_facts: false
  tasks:
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook repair-slurm-node.yml -e target_node=<hostname>"

    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
# Play: record how Slurm currently sees target_node (sinfo line, full node
# record, jobs on the node) before anything is changed. Every query is
# best-effort.
- name: Capture node state before repair
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Show target node state before repair
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### sinfo"
        sinfo -N -n {{ target_node }} || true
        echo
        echo "### scontrol"
        scontrol show node {{ target_node }} || true
        echo
        echo "### jobs"
        squeue -w {{ target_node }} || true
      args:
        executable: /bin/bash
      changed_when: false
      register: node_state_before

    - name: Print target node state before repair
      ansible.builtin.debug:
        var: node_state_before.stdout_lines
# Play: on target_node itself, restart munge, restart slurmd (only when the
# host is in the compute or GPU group) and print a local diagnostic report.
- name: Repair local services on target node
  hosts: "{{ target_node }}"
  gather_facts: false
  become: true
  tasks:
    - name: Restart Munge
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted
      when:
        - inventory_hostname in groups.get('slurm_compute', []) or inventory_hostname in groups.get('slurm_gpu', [])

    # Service, munge and controller-ping checks are fatal (set -e); the
    # listener and journal sections are best-effort.
    - name: Validate local repair
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### services"
        systemctl is-active munge
        systemctl is-active slurmd
        echo
        echo "### munge"
        munge -n | unmunge >/dev/null
        echo "munge OK"
        echo
        echo "### controller ping"
        scontrol ping
        echo
        echo "### slurmd listener"
        ss -lntp | grep ':6818 ' || true
        echo
        echo "### recent slurmd logs"
        journalctl -u slurmd -n 40 --no-pager || true
      args:
        executable: /bin/bash
      changed_when: false
      register: local_repair_state

    - name: Print local repair state
      ansible.builtin.debug:
        var: local_repair_state.stdout_lines
# Play: restart slurmctld, try RESUME/UNDRAIN/IDLE on target_node (each
# best-effort), then poll the node's sinfo and scontrol output (30 tries,
# 5 s apart) until none of the unhealthy markers appear in it.
- name: Clear Slurm maintenance/down state after repair
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Restart controller to refresh node state
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      until: slurmctld_ping.rc == 0
      retries: 15
      delay: 2
      changed_when: false

    - name: Clear target node state
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol update NodeName={{ target_node }} State=RESUME 2>/dev/null || true
        scontrol update NodeName={{ target_node }} State=UNDRAIN 2>/dev/null || true
        scontrol update NodeName={{ target_node }} State=IDLE 2>/dev/null || true
        sleep 5
        sinfo -N -n {{ target_node }}
        scontrol show node {{ target_node }}
      args:
        executable: /bin/bash
      changed_when: true
      register: clear_state

    # The health test is a case-insensitive substring search over the whole
    # combined output, not a parse of the State field.
    - name: Wait until node is healthy
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }}
        scontrol show node {{ target_node }}
      args:
        executable: /bin/bash
      register: node_health_after
      until:
        - node_health_after.rc == 0
        - "'not_responding' not in node_health_after.stdout.lower()"
        - "'down' not in node_health_after.stdout.lower()"
        - "'drain' not in node_health_after.stdout.lower()"
        - "'idle*' not in node_health_after.stdout.lower()"
      retries: 30
      delay: 5
      changed_when: false

    - name: Print node state after repair
      ansible.builtin.debug:
        var: node_health_after.stdout_lines
# Play: submit a small batch job pinned to target_node as user `slurmuser`,
# watch the queue for up to 90 one-second polls, then print the sacct record
# and the job's output file from /shared.
- name: Submit repair validation job
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # The heredoc delimiter is unquoted, so `\$` keeps expansions literal in
    # the submitted script while {{ target_node }} is filled in by Ansible.
    - name: Submit validation job to repaired node
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<SBATCH
        #!/bin/bash
        #SBATCH --job-name=repair-node-test
        #SBATCH --partition=all
        #SBATCH --nodelist={{ target_node }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --account=lab
        #SBATCH --qos=normal
        #SBATCH --output=/shared/repair-node-test-%j.out
        echo "HOST=\$(hostname)"
        echo "USER=\$(whoami)"
        echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"
        echo "JOB_ID=$job_id"
        for i in $(seq 1 90); do
        if squeue -h -j "$job_id" | grep -q .; then
        squeue -j "$job_id"
        sleep 1
        else
        break
        fi
        done
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList
        echo "### output"
        cat "/shared/repair-node-test-${job_id}.out"
      args:
        executable: /bin/bash
      changed_when: true
      register: repair_validation_job

    - name: Print repair validation job
      ansible.builtin.debug:
        var: repair_validation_job.stdout_lines
@@ -0,0 +1,126 @@
---
# Play: pre-flight check for the decommission playbook, run on localhost.
# Fails with a usage hint when target_node is missing, and with an error when
# it is not an inventory host.
- name: Validate target_node variable
  hosts: localhost
  gather_facts: false
  tasks:
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook decommission-slurm-node.yml -e target_node=<hostname> [-e decom_reason='reason']"

    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
# Play: set target_node to DRAIN with a reason, then poll squeue until it
# lists no jobs for the node. Reason, retry count and delay can be overridden
# with decom_reason, decom_wait_retries (default 120) and decom_wait_delay
# (default 10).
- name: Drain target node and wait for jobs to leave
  hosts: slurm_controller
  gather_facts: false
  become: true
  vars:
    decom_reason_effective: "{{ decom_reason | default('decommission by Ansible') }}"
    decom_wait_retries_effective: "{{ decom_wait_retries | default(120) }}"
    decom_wait_delay_effective: "{{ decom_wait_delay | default(10) }}"
  tasks:
    - name: Show current target node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }} || true
        scontrol show node {{ target_node }} || true
      args:
        executable: /bin/bash
      changed_when: false
      register: node_state_before

    - name: Print current target node state
      ansible.builtin.debug:
        var: node_state_before.stdout_lines

    - name: Drain target node
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=DRAIN Reason="{{ decom_reason_effective }}"
      changed_when: true

    # An empty squeue listing (after trimming) ends the wait.
    - name: Wait until no jobs are running on target node
      ansible.builtin.shell: |
        set -euo pipefail
        squeue -h -w {{ target_node }} || true
      args:
        executable: /bin/bash
      register: jobs_on_node
      until: jobs_on_node.stdout | trim == ""
      retries: "{{ decom_wait_retries_effective | int }}"
      delay: "{{ decom_wait_delay_effective | int }}"
      changed_when: false

    - name: Show drained node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }} || true
        scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
      args:
        executable: /bin/bash
      changed_when: false
      register: node_state_drained

    - name: Print drained node state
      ansible.builtin.debug:
        var: node_state_drained.stdout_lines
# Play: on target_node, stop and disable slurmd (only when the host is in the
# compute or GPU group) and print the resulting enabled/active state.
- name: Stop Slurm worker service on target node
  hosts: "{{ target_node }}"
  gather_facts: false
  become: true
  tasks:
    - name: Stop slurmd
      ansible.builtin.systemd:
        name: slurmd
        enabled: false
        state: stopped
      when:
        - inventory_hostname in groups.get('slurm_compute', []) or inventory_hostname in groups.get('slurm_gpu', [])

    # Both probes are best-effort, so this task never fails.
    - name: Show slurmd state
      ansible.builtin.shell: |
        systemctl is-enabled slurmd 2>/dev/null || true
        systemctl is-active slurmd 2>/dev/null || true
      args:
        executable: /bin/bash
      changed_when: false
      register: slurmd_state_after

    - name: Print slurmd state
      ansible.builtin.debug:
        var: slurmd_state_after.stdout_lines
# Play: final decommission step on the controller. Sets target_node to DOWN
# with reason "decommissioned" and prints the node's resulting state lines.
- name: Mark node down in Slurm controller
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Mark target node DOWN after service stop
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=DOWN Reason="decommissioned"
      changed_when: true

    - name: Show final node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }} || true
        scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
      args:
        executable: /bin/bash
      changed_when: false
      register: final_node_state

    - name: Print final node state
      ansible.builtin.debug:
        var: final_node_state.stdout_lines
@@ -0,0 +1,246 @@
---
# Play: pre-flight check for the provisioning playbook, run on localhost.
# Fails with a usage hint when target_node is missing, and with an error when
# it is not an inventory host.
- name: Validate target_node variable
  hosts: localhost
  gather_facts: false
  tasks:
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook provision-slurm-node.yml -e target_node=<hostname>"

    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
# Play: on target_node, refuse hosts outside the compute/GPU groups, install
# the Munge and Slurm worker packages and create the Slurm and Munge
# directories with their expected owners and modes.
- name: Prepare OS, packages and Slurm directories on target node
  hosts: "{{ target_node }}"
  gather_facts: true
  become: true
  tasks:
    # Both list items must hold (implicit AND): the host is in neither group.
    - name: Ensure target is a Slurm worker or GPU node
      ansible.builtin.fail:
        msg: "{{ inventory_hostname }} must be in slurm_compute or slurm_gpu group"
      when:
        - inventory_hostname not in groups.get('slurm_compute', [])
        - inventory_hostname not in groups.get('slurm_gpu', [])

    - name: Install Slurm worker packages
      ansible.builtin.apt:
        update_cache: true
        state: present
        name:
          - munge
          - libmunge2
          - slurm-client
          - slurmd
          - slurm-wlm-basic-plugins
          - slurm-wlm-plugins
          - slurm-wlm-mysql-plugin

    - name: Ensure Slurm config directory exists
      ansible.builtin.file:
        state: directory
        path: "{{ slurm_config_dir }}"
        owner: root
        group: root
        mode: "0755"

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        state: directory
        path: /var/log/slurm
        owner: slurm
        group: slurm
        mode: "0755"

    - name: Ensure slurmd spool directory exists
      ansible.builtin.file:
        state: directory
        path: /var/spool/slurmd
        owner: slurm
        group: slurm
        mode: "0755"

    - name: Ensure munge dirs exist
      ansible.builtin.file:
        state: directory
        path: "{{ item.path }}"
        owner: munge
        group: munge
        mode: "{{ item.mode }}"
      loop:
        - path: /etc/munge
          mode: "0700"
        - path: /var/log/munge
          mode: "0755"
        - path: /var/lib/munge
          mode: "0711"
        - path: /run/munge
          mode: "0755"
# Play: read /etc/munge/munge.key on the controller and keep its
# base64-encoded content as the host fact `cluster_munge_key_b64`.
# Both tasks handle the cluster secret, so their results are never logged.
- name: Deploy Munge key from controller to target node
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Read controller munge.key
      ansible.builtin.slurp:
        src: /etc/munge/munge.key
      register: controller_munge_key_raw
      no_log: true

    # slurp returns the file base64-encoded in `.content`.
    - name: Store controller Munge key as fact
      ansible.builtin.set_fact:
        cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}"
      no_log: true
# Play: on target_node, write the controller's Munge key, render the managed
# Slurm config files and make sure munge and slurmd are enabled and started.
# Reads the fact `cluster_munge_key_b64` from the first slurm_controller host.
- name: Configure target node with Munge and Slurm files
  hosts: "{{ target_node }}"
  become: true
  gather_facts: false
  vars:
    controller_host: "{{ groups['slurm_controller'][0] }}"
  tasks:
    # NOTE(review): b64decode yields text; if the key holds arbitrary binary
    # bytes, confirm the written file is byte-identical to the controller's.
    - name: Deploy shared munge.key
      ansible.builtin.copy:
        dest: /etc/munge/munge.key
        content: "{{ hostvars[controller_host].cluster_munge_key_b64 | b64decode }}"
        owner: munge
        group: munge
        mode: "0400"
      # The key is a cluster-wide secret: keep it out of logs and diff output.
      no_log: true
      notify:
        - Restart munge

    - name: Deploy managed slurm.conf
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"
      notify:
        - Restart slurmd

    - name: Deploy managed cgroup.conf
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool
      notify:
        - Restart slurmd

    - name: Deploy managed gres.conf on GPU nodes
      ansible.builtin.template:
        src: ../../templates/gres.conf.j2
        dest: "{{ slurm_config_dir }}/gres.conf"
        owner: root
        group: root
        mode: "0644"
      when: inventory_hostname in groups.get('slurm_gpu', [])
      notify:
        - Restart slurmd

    - name: Ensure munge is enabled and running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: started

    - name: Ensure slurmd is enabled and running
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: started

  handlers:
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
# Play: render slurm.conf (and cgroup.conf when slurm_enable_cgroup is true)
# on every cluster node. No handlers are notified by this play.
- name: Deploy updated Slurm config to whole cluster and reconfigure controller
  hosts: slurm_cluster
  gather_facts: false
  become: true
  tasks:
    - name: Deploy managed slurm.conf to all nodes
      ansible.builtin.template:
        dest: "{{ slurm_config_dir }}/slurm.conf"
        src: ../../templates/slurm.conf.j2
        owner: root
        group: root
        mode: "0644"

    - name: Deploy managed cgroup.conf to all nodes
      when: slurm_enable_cgroup | default(false) | bool
      ansible.builtin.template:
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        src: ../../templates/cgroup.conf.j2
        owner: root
        group: root
        mode: "0644"
# Play: make the controller pick up the new configuration (reconfigure, then
# restart and wait for ping), resume target_node and poll until its sinfo and
# scontrol output shows none of the unhealthy markers.
- name: Reconfigure Slurm and validate target node
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Reconfigure Slurm controller
      ansible.builtin.command:
        cmd: scontrol reconfigure
      changed_when: true

    - name: Restart Slurm controller after node reprovision
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    - name: Wait for Slurm controller after restart
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping_after_restart
      retries: 15
      delay: 2
      until: slurmctld_ping_after_restart.rc == 0
      changed_when: false

    # A rejected RESUME ("Invalid node state") is treated as "nothing to
    # resume" rather than a failure, so re-provisioning a node that is
    # already healthy does not abort here. Any other error still fails.
    # NOTE(review): confirm the exact message scontrol prints on this cluster.
    - name: Resume target node in Slurm
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=RESUME
      register: resume_target_node
      changed_when: resume_target_node.rc == 0
      failed_when:
        - resume_target_node.rc != 0
        - "'Invalid node state' not in (resume_target_node.stdout + resume_target_node.stderr)"

    # The health test is a case-insensitive substring search over the whole
    # combined output, not a parse of the State field.
    - name: Wait until target node is visible and not down
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol show node {{ target_node }}
        sinfo -N -n {{ target_node }}
      args:
        executable: /bin/bash
      register: target_node_state
      retries: 20
      delay: 3
      until:
        - target_node_state.rc == 0
        - "'down' not in target_node_state.stdout.lower()"
        - "'not_responding' not in target_node_state.stdout.lower()"
        - "'idle*' not in target_node_state.stdout.lower()"
      changed_when: false

    - name: Show target node state
      ansible.builtin.debug:
        var: target_node_state.stdout_lines
@@ -0,0 +1,33 @@
---
# Play: read-only lookup of one node on the controller. Prints the sinfo
# line, the full scontrol record and the jobs on target_node; each query is
# best-effort.
- name: Show Slurm node state
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook show-slurm-node.yml -e target_node=<hostname>"

    - name: Show node state
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### sinfo"
        sinfo -N -n {{ target_node }} || true
        echo
        echo "### scontrol"
        scontrol show node {{ target_node }} || true
        echo
        echo "### jobs on node"
        squeue -w {{ target_node }} || true
      args:
        executable: /bin/bash
      changed_when: false
      register: node_lifecycle_state

    - name: Print node lifecycle state
      ansible.builtin.debug:
        var: node_lifecycle_state.stdout_lines
@@ -0,0 +1,169 @@
---
# Play: create the normal / debug-short / gpu-short / maintenance QOS entries,
# set their limits, and attach them to the {{ slurm_account_name }} account and
# the slurmuser association. Every write goes through `sacctmgr -i`.
- name: Configure Slurm QOS, limits and fairshare
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Read-only sacctmgr query; a non-zero exit stops the play here.
    # (Task name fixed: it previously read "avgpu01le" instead of "available".)
    - name: Ensure sacctmgr is available
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      changed_when: false

    # Prints the accounting config and known TRES, then exits non-zero (via awk)
    # unless a TRES with Type "gres" and Name "gpu" is listed.
    - name: Validate accounting GPU TRES exists
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### configured AccountingStorageTRES"
        scontrol show config | grep -E "AccountingStorageTRES|AccountingStorageType|AccountingStorageEnforce"
        echo
        echo "### known TRES"
        sacctmgr show tres
        echo
        echo "### checking gres/gpu"
        sacctmgr -n show tres format=Type,Name | awk '$1=="gres" && $2=="gpu" {found=1} END {exit !found}'
      args:
        executable: /bin/bash
      register: gpu_tres_check
      changed_when: false

    # The four "add qos" tasks share one pattern: report "changed" only when
    # sacctmgr prints "Adding QOS", and tolerate a non-zero exit when the output
    # says the QOS is already present.
    - name: Ensure normal QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos normal Priority=100
      args:
        executable: /bin/bash
      register: add_qos_normal
      changed_when: "'Adding QOS' in (add_qos_normal.stdout + add_qos_normal.stderr)"
      failed_when: >
        add_qos_normal.rc != 0 and
        'Nothing new added' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'already exists' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'Already existing' not in (add_qos_normal.stdout + add_qos_normal.stderr)

    - name: Ensure debug-short QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos debug-short Priority=500
      args:
        executable: /bin/bash
      register: add_qos_debug
      changed_when: "'Adding QOS' in (add_qos_debug.stdout + add_qos_debug.stderr)"
      failed_when: >
        add_qos_debug.rc != 0 and
        'Nothing new added' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'already exists' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'Already existing' not in (add_qos_debug.stdout + add_qos_debug.stderr)

    - name: Ensure gpu-short QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos gpu-short Priority=1000
      args:
        executable: /bin/bash
      register: add_qos_gpu
      changed_when: "'Adding QOS' in (add_qos_gpu.stdout + add_qos_gpu.stderr)"
      failed_when: >
        add_qos_gpu.rc != 0 and
        'Nothing new added' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'already exists' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'Already existing' not in (add_qos_gpu.stdout + add_qos_gpu.stderr)

    - name: Ensure maintenance QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos maintenance Priority=5000
      args:
        executable: /bin/bash
      register: add_qos_maintenance
      changed_when: "'Adding QOS' in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)"
      failed_when: >
        add_qos_maintenance.rc != 0 and
        'Nothing new added' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'already exists' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'Already existing' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)

    # The "modify" tasks below always report "changed"; they re-apply the
    # desired settings on every run.
    - name: Normalize normal QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos normal set Priority=100
      args:
        executable: /bin/bash
      changed_when: true

    - name: Normalize debug-short QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos debug-short set Priority=500 MaxWall=00:10:00 MaxTRESPU=cpu=2 MaxJobsPU=4
      args:
        executable: /bin/bash
      changed_when: true

    - name: Normalize gpu-short QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos gpu-short set Priority=1000 MaxWall=01:00:00 MaxTRESPU=gres/gpu=1,cpu=12 MaxJobsPU=2
      args:
        executable: /bin/bash
      changed_when: true

    - name: Normalize maintenance QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos maintenance set Priority=5000 MaxWall=02:00:00
      args:
        executable: /bin/bash
      changed_when: true

    - name: Assign QOS set to lab account
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify account {{ slurm_account_name }} set QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100
      args:
        executable: /bin/bash
      changed_when: true

    - name: Assign default account to slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser set DefaultAccount={{ slurm_account_name }}
      args:
        executable: /bin/bash
      changed_when: true

    - name: Assign QOS set to slurmuser association
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser account={{ slurm_account_name }} set QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100
      args:
        executable: /bin/bash
      changed_when: true

    # Read-only summary; the sshare call is best-effort (`|| true`).
    - name: Show configured QOS and associations
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### TRES"
        sacctmgr show tres
        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%40,MaxJobsPU
        echo
        echo "### Associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%60,DefaultQOS,Fairshare
        echo
        echo "### Fairshare"
        sshare -A {{ slurm_account_name }} || true
      args:
        executable: /bin/bash
      register: qos_state
      changed_when: false

    - name: Print QOS state
      ansible.builtin.debug:
        var: qos_state.stdout_lines
@@ -0,0 +1,235 @@
---
# Play: dump the priority/accounting configuration, run one job per QOS tier,
# check that an over-long debug-short job is rejected or held, and print a
# priority/fairshare snapshot.
#
# The test jobs are submitted against {{ slurm_account_name }} (previously a
# hard-coded "lab") so they exercise the same account that the sshare calls in
# this play already query. Ansible renders the Jinja expression before bash
# runs the script, so the quoted heredoc delimiter does not block it.
- name: Validate Slurm QOS, fairshare and priority
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Read-only dump; with pipefail, a grep that matches nothing fails the task.
    - name: Validate priority runtime config
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### priority config"
        scontrol show config | grep -E "PriorityType|PriorityWeight|PriorityDecay|PriorityCalc|PriorityMaxAge|PriorityFavor"
        echo
        echo "### accounting enforcement"
        scontrol show config | grep -E "AccountingStorageType|AccountingStorageEnforce|AccountingStorageTRES"
        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%50,MaxJobsPU
        echo
        echo "### associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%80,DefaultQOS,Fairshare
        echo
        echo "### fairshare"
        sshare -A {{ slurm_account_name }} || true
      args:
        executable: /bin/bash
      register: priority_state
      changed_when: false

    # Submits as slurmuser, polls squeue once per second (at most 90 times)
    # until the job leaves the queue, then prints sacct data and the job output.
    - name: Submit debug-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-debug-test
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/qos-debug-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"
        echo "JOB_ID=$job_id"
        for i in $(seq 1 90); do
        if squeue -h -j "$job_id" | grep -q .; then
        squeue -j "$job_id"
        sleep 1
        else
        break
        fi
        done
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        cat "/shared/qos-debug-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: debug_qos_job
      changed_when: true

    # Same flow as the debug-short job, but requests one GPU and polls up to
    # 120 times.
    - name: Submit gpu-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-gpu-test
        #SBATCH --partition=gpu
        #SBATCH --qos=gpu-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/qos-gpu-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo
        nvidia-smi
        SBATCH
        )"
        echo "JOB_ID=$job_id"
        for i in $(seq 1 120); do
        if squeue -h -j "$job_id" | grep -q .; then
        squeue -j "$job_id"
        sleep 1
        else
        break
        fi
        done
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        cat "/shared/qos-gpu-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: gpu_qos_job
      changed_when: true

    # Requests 30 minutes under debug-short. The test passes when sbatch rejects
    # the submission, or when the job is left PENDING/CONFIGURING with a reason
    # matching qos|limit|time|max|assoc; otherwise the job is cancelled and the
    # task exits 1. `set +e` is scoped to the submission so its exit code can be
    # captured.
    - name: Validate debug-short walltime limit behavior
      ansible.builtin.shell: |
        set -euo pipefail
        set +e
        output="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH' 2>&1
        #!/bin/bash
        #SBATCH --job-name=qos-limit-fail
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:30:00
        #SBATCH --output=/shared/qos-limit-fail-%j.out
        sleep 10
        SBATCH
        )"
        rc=$?
        set -e
        echo "RC=$rc"
        echo "$output"
        if [ "$rc" -ne 0 ]; then
        echo "Limit rejection test passed at submit time"
        exit 0
        fi
        job_id="$output"
        echo "Submitted job despite expected limit check: $job_id"
        sleep 3
        echo "### squeue"
        squeue -j "$job_id" -o "%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" || true
        echo
        echo "### job detail"
        scontrol show job "$job_id" || true
        state="$(squeue -h -j "$job_id" -o "%T" || true)"
        reason="$(squeue -h -j "$job_id" -o "%R" || true)"
        echo "STATE=$state"
        echo "REASON=$reason"
        if echo "$state" | grep -qE "PENDING|CONFIGURING"; then
        if echo "$reason" | grep -qiE "qos|limit|time|max|assoc"; then
        echo "Limit enforcement test passed via pending reason"
        scancel "$job_id" || true
        exit 0
        fi
        fi
        echo "Job was accepted without an obvious QOS/limit pending reason"
        scancel "$job_id" || true
        exit 1
      args:
        executable: /bin/bash
      register: limit_rejection
      changed_when: false

    # Best-effort snapshot: squeue, sprio and sshare may fail without failing
    # the task; the sacct pipeline is not guarded.
    - name: Show priority and fairshare snapshot
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### queue"
        squeue || true
        echo
        echo "### sprio"
        sprio || true
        echo
        echo "### sshare"
        sshare -A {{ slurm_account_name }} || true
        echo
        echo "### recent sacct"
        sacct -S today --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -40
      args:
        executable: /bin/bash
      register: priority_snapshot
      changed_when: false

    - name: Print validation result
      ansible.builtin.debug:
        msg:
          - "### priority state"
          - "{{ priority_state.stdout_lines }}"
          - "### debug QOS job"
          - "{{ debug_qos_job.stdout_lines }}"
          - "### GPU QOS job"
          - "{{ gpu_qos_job.stdout_lines }}"
          - "### limit rejection"
          - "{{ limit_rejection.stdout_lines }}"
          - "### priority snapshot"
          - "{{ priority_snapshot.stdout_lines }}"
@@ -0,0 +1,59 @@
---
# Play: run a 2-CPU job pinned to gpu01 that prints its allowed CPUs and
# cgroup membership, then show the job's output file.
- name: Test CPU cgroup enforcement on gpu01
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # Submits as slurmuser, polls squeue once per second (at most 60 times)
    # until the job leaves the queue, then cats the output from /shared.
    - name: Submit cgroup CPU test to gpu01
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=cgroup-cpu-test
          #SBATCH --partition=all
          #SBATCH --nodelist=gpu01
          #SBATCH --cpus-per-task=2
          #SBATCH --mem=1G
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/cgroup-cpu-test-%j.out
          echo "HOST=$(hostname)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo "MEM_ALLOWED=$(grep Mems_allowed_list /proc/self/status || true)"
          echo
          echo "### cgroup"
          cat /proc/self/cgroup
          echo
          echo "### mounted cgroups"
          mount | grep cgroup || true
          sleep 5
          SBATCH
          )"
          echo "JOB_ID=$job_id"
          for i in $(seq 1 60); do
          if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then
          sudo -iu slurmuser squeue -j "$job_id"
          sleep 1
          else
          break
          fi
          done
          echo "### output"
          cat "/shared/cgroup-cpu-test-${job_id}.out"
      changed_when: true
      register: cgroup_cpu_result

    - name: Show cgroup CPU result
      ansible.builtin.debug:
        var: cgroup_cpu_result.stdout_lines
@@ -0,0 +1,60 @@
---
# Play: run a one-CPU smoke-test job in the debug partition and print its
# accounting record and output.
- name: Submit CPU test job
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # Polls squeue for up to 60 seconds. sacct is best-effort, but a missing
    # output file lists recent cpu-test outputs and fails the task (exit 1).
    - name: Submit test job to debug partition
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=cpu-test
          #SBATCH --partition=debug
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=512M
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/cpu-test-%j.out
          echo "HOST=$(hostname)"
          echo "USER=$(whoami)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          date
          SBATCH
          )"
          echo "JOB_ID=$job_id"
          for i in $(seq 1 60); do
          if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then
          sudo -iu slurmuser squeue -j "$job_id"
          sleep 1
          else
          break
          fi
          done
          echo "### sacct"
          sudo -iu slurmuser sacct -j "$job_id" --format=JobID,JobName,Partition,State,ExitCode 2>/dev/null || true
          echo "### output"
          if [ -f "/shared/cpu-test-${job_id}.out" ]; then
          cat "/shared/cpu-test-${job_id}.out"
          else
          echo "Output file not found: /shared/cpu-test-${job_id}.out"
          find /shared -maxdepth 1 -name "cpu-test-*.out" -ls | tail -5 || true
          exit 1
          fi
      changed_when: true
      register: cpu_job_result

    - name: Show CPU job result
      ansible.builtin.debug:
        var: cpu_job_result.stdout_lines
@@ -0,0 +1,58 @@
---
# Play: run a job on gpu01 that does NOT request a GPU and record what it can
# see of the NVIDIA devices.
- name: Test GPU access without GRES allocation
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # The in-job `ls` and `nvidia-smi` calls end in `|| true`, so the job only
    # reports what it sees; this task does not itself assert that access is denied.
    - name: Submit job to gpu01 without requesting GPU
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=gpu-deny-test
          #SBATCH --partition=all
          #SBATCH --nodelist=gpu01
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=1G
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/gpu-deny-test-%j.out
          echo "HOST=$(hostname)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
          echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo
          echo "### ls nvidia devices"
          ls -l /dev/nvidia* 2>&1 || true
          echo
          echo "### nvidia-smi without GRES"
          nvidia-smi 2>&1 || true
          SBATCH
          )"
          echo "JOB_ID=$job_id"
          for i in $(seq 1 60); do
          if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then
          sudo -iu slurmuser squeue -j "$job_id"
          sleep 1
          else
          break
          fi
          done
          echo "### output"
          cat "/shared/gpu-deny-test-${job_id}.out"
      changed_when: true
      register: gpu_deny_result

    - name: Show GPU deny test result
      ansible.builtin.debug:
        var: gpu_deny_result.stdout_lines
@@ -0,0 +1,70 @@
---
# Play: run a one-GPU job in the gpu partition and print nvidia-smi output
# from inside the allocation.
- name: Submit GPU test job
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # Polls squeue for up to 90 seconds. sacct is best-effort, but a missing
    # output file lists recent gpu-test outputs and fails the task (exit 1).
    - name: Submit test job to gpu partition
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=gpu-test
          #SBATCH --partition=gpu
          #SBATCH --gres=gpu:1
          #SBATCH --cpus-per-task=2
          #SBATCH --mem=2G
          #SBATCH --time=00:03:00
          #SBATCH --output=/shared/gpu-test-%j.out
          echo "HOST=$(hostname)"
          echo "USER=$(whoami)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
          echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo
          echo "### nvidia-smi"
          nvidia-smi
          echo
          echo "### GPU process table"
          nvidia-smi pmon -c 1 || true
          SBATCH
          )"
          echo "JOB_ID=$job_id"
          for i in $(seq 1 90); do
          if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then
          sudo -iu slurmuser squeue -j "$job_id"
          sleep 1
          else
          break
          fi
          done
          echo "### sacct"
          sudo -iu slurmuser sacct -j "$job_id" --format=JobID,JobName,Partition,State,ExitCode 2>/dev/null || true
          echo "### output"
          if [ -f "/shared/gpu-test-${job_id}.out" ]; then
          cat "/shared/gpu-test-${job_id}.out"
          else
          echo "Output file not found: /shared/gpu-test-${job_id}.out"
          find /shared -maxdepth 1 -name "gpu-test-*.out" -ls | tail -5 || true
          exit 1
          fi
      changed_when: true
      register: gpu_job_result

    - name: Show GPU job result
      ansible.builtin.debug:
        var: gpu_job_result.stdout_lines
@@ -0,0 +1,95 @@
---
# Play: pin a short job to one node and fail unless sacct reports COMPLETED.
- name: Submit job to specific Slurm node
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # Stop with a usage hint when target_node was not supplied.
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook test-specific-node.yml -e target_node=<hostname>"

    # The heredoc delimiter is unquoted, so every `$` meant for the job script
    # is escaped as `\$`. After submission the script waits in three stages:
    # job leaves the queue (<=120 s), output file is non-empty (<=30 s), and
    # sacct shows a terminal state (<=30 s).
    - name: Submit test job to target node
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<SBATCH
          #!/bin/bash
          #SBATCH --job-name=node-test
          #SBATCH --partition=debug
          #SBATCH --nodelist={{ target_node }}
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=256M
          #SBATCH --time=00:02:00
          #SBATCH --account=lab
          #SBATCH --qos=normal
          #SBATCH --output=/shared/node-test-%j.out
          echo "HOST=\$(hostname)"
          echo "USER=\$(whoami)"
          echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
          date
          SBATCH
          )"
          echo "JOB_ID=$job_id"
          echo "### waiting for job to leave queue"
          for i in $(seq 1 120); do
          if squeue -h -j "$job_id" | grep -q .; then
          squeue -j "$job_id"
          sleep 1
          else
          break
          fi
          done
          echo "### waiting for output file"
          for i in $(seq 1 30); do
          if [ -s "/shared/node-test-${job_id}.out" ]; then
          break
          fi
          sleep 1
          done
          echo "### waiting for sacct final state"
          final_state=""
          for i in $(seq 1 30); do
          final_state="$(
          sacct -n -P -j "$job_id" --format=State 2>/dev/null \
          | head -n 1 \
          | cut -d'|' -f1 \
          | awk '{print $1}'
          )"
          if echo "$final_state" | grep -qE "COMPLETED|FAILED|CANCELLED|TIMEOUT|NODE_FAIL|OUT_OF_MEMORY"; then
          break
          fi
          sleep 1
          done
          echo "FINAL_STATE=${final_state:-UNKNOWN}"
          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
          echo "### output"
          cat "/shared/node-test-${job_id}.out"
          if [ "${final_state:-UNKNOWN}" != "COMPLETED" ]; then
          echo "Job did not reach COMPLETED state according to sacct"
          exit 1
          fi
      changed_when: true
      register: node_test

    - name: Show node test result
      ansible.builtin.debug:
        var: node_test.stdout_lines
@@ -0,0 +1,60 @@
---
# Play: run a job that spins two busy loops for 90 seconds so accounting has
# CPU time to report.
- name: Generate measurable Slurm usage for sreport
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # Polls squeue every 2 seconds, at most 150 times, before printing sacct
    # data and the job output.
    - name: Submit CPU usage job
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=sreport-usage
          #SBATCH --partition=debug
          #SBATCH --cpus-per-task=2
          #SBATCH --mem=512M
          #SBATCH --time=00:03:00
          #SBATCH --output=/shared/sreport-usage-%j.out
          echo "HOST=$(hostname)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo "Burning CPU for 90 seconds"
          timeout 90 bash -c 'while true; do :; done' &
          timeout 90 bash -c 'while true; do :; done' &
          wait
          echo "Done"
          date
          SBATCH
          )"
          echo "JOB_ID=$job_id"
          for i in $(seq 1 150); do
          if squeue -h -j "$job_id" | grep -q .; then
          squeue -j "$job_id"
          sleep 2
          else
          break
          fi
          done
          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
          echo "### output"
          cat "/shared/sreport-usage-${job_id}.out"
      changed_when: true
      register: sreport_usage_job

    - name: Show usage job result
      ansible.builtin.debug:
        var: sreport_usage_job.stdout_lines
@@ -0,0 +1,140 @@
---
# Play: on every cluster host, confirm the operator account exists, can run
# sinfo/squeue, and can SSH (non-interactively) to every host in slurm_cluster.
- name: Validate Slurm operator user and SSH mesh
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    # Resolved under a separate name on purpose. The previous definition,
    #   slurm_operator_user: "{{ slurm_operator_user | default('slurmuser') }}"
    # referred to itself, which Ansible rejects with "recursive loop detected
    # in template string" unless the value is supplied as an extra var.
    slurm_operator_user_effective: "{{ slurm_operator_user | default('slurmuser') }}"
    slurm_hosts: "{{ groups['slurm_cluster'] }}"
  tasks:
    - name: Validate slurmuser exists
      ansible.builtin.command:
        cmd: id {{ slurm_operator_user_effective }}
      changed_when: false

    - name: Validate sinfo as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user_effective }} sinfo
      changed_when: false

    - name: Validate squeue as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user_effective }} squeue
      changed_when: false

    # Runs as the operator user; BatchMode makes ssh fail instead of prompting,
    # and `set -e` aborts on the first host that cannot be reached.
    - name: Validate SSH mesh as slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        for h in {{ slurm_hosts | join(' ') }}; do
        echo "=== $h ==="
        ssh -o BatchMode=yes -o ConnectTimeout=5 "$h" hostname
        done
      args:
        executable: /bin/bash
      become_user: "{{ slurm_operator_user_effective }}"
      changed_when: false
# Play: confirm the operator user can query slurmctld status through
# passwordless sudo and run the basic Slurm client commands on the controller.
- name: Validate Slurm controller commands
  hosts: slurm_controller
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # `sudo -n` fails rather than prompting for a password.
    - name: Validate slurmctld status through sudo
      changed_when: false
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sudo -n systemctl status slurmctld --no-pager

    - name: Validate controller Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sudo -iu {{ slurm_operator_user }} sinfo
          sudo -iu {{ slurm_operator_user }} squeue
          sudo -iu {{ slurm_operator_user }} scontrol show nodes
# Play: the worker-side counterpart of the controller check, run against the
# slurm_compute and slurm_gpu groups and targeting slurmd.
- name: Validate Slurm worker commands
  hosts: slurm_compute:slurm_gpu
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # `sudo -n` fails rather than prompting for a password.
    - name: Validate slurmd status through sudo
      changed_when: false
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sudo -n systemctl status slurmd --no-pager

    - name: Validate worker Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sudo -iu {{ slurm_operator_user }} sinfo
          sudo -iu {{ slurm_operator_user }} squeue
          sudo -iu {{ slurm_operator_user }} scontrol show nodes
# Play: submit a trivial job as the operator user and print whatever
# accounting data and output can be found afterwards.
- name: Validate basic job submission
  hosts: slurm_controller
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # Polls the job state for up to 20 seconds. sacct is best-effort, and the
    # output file is printed only if it is present on this host.
    - name: Submit simple Slurm test job as slurmuser
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu {{ slurm_operator_user }} sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=ansible-validate
          #SBATCH --partition=debug
          #SBATCH --time=00:01:00
          #SBATCH --output=/tmp/ansible-validate-%j.out
          hostname
          whoami
          date
          SBATCH
          )"
          echo "$job_id"
          for i in $(seq 1 20); do
          state="$(sudo -iu {{ slurm_operator_user }} squeue -h -j "$job_id" -o "%T" || true)"
          if [ -z "$state" ]; then
          break
          fi
          echo "job_state=$state"
          sleep 1
          done
          sudo -iu {{ slurm_operator_user }} sacct -j "$job_id" --format=JobID,JobName,State,ExitCode 2>/dev/null || true
          if ls /tmp/ansible-validate-"$job_id".out >/dev/null 2>&1; then
          cat /tmp/ansible-validate-"$job_id".out
          fi
      changed_when: true
      register: slurm_job_test

    - name: Show basic job submission result
      ansible.builtin.debug:
        var: slurm_job_test.stdout_lines
@@ -0,0 +1,236 @@
---
# Play: sanity-check the chosen canary host before anything is drained.
# canary_node falls back to 'slurm-c02' when not supplied.
- name: Validate canary node variable
  hosts: localhost
  gather_facts: false
  vars:
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"
  tasks:
    - name: Ensure canary node is in inventory
      when: canary_node_effective not in groups['all']
      ansible.builtin.fail:
        msg: "canary_node={{ canary_node_effective }} is not in inventory"

    - name: Ensure canary node is not the controller
      when: canary_node_effective in groups['slurm_controller']
      ansible.builtin.fail:
        msg: "Do not use controller as canary for worker rolling upgrade"
# Play: record the canary's state, mark it DRAIN, and wait for squeue to show
# no jobs on it.
- name: Drain canary node
  hosts: slurm_controller
  gather_facts: false
  become: true
  vars:
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"
  tasks:
    # Informational only: every query is guarded with `|| true`.
    - name: Show canary state before drain
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ canary_node_effective }} || true
          scontrol show node {{ canary_node_effective }} || true
          squeue -w {{ canary_node_effective }} || true
      changed_when: false
      register: canary_before

    - name: Print canary state before drain
      ansible.builtin.debug:
        var: canary_before.stdout_lines

    - name: Drain canary node
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ canary_node_effective }} State=DRAIN Reason="canary OS upgrade"
      changed_when: true

    # Re-checks every 10 seconds, up to 120 times, until `squeue -h -w` prints
    # nothing for the canary.
    - name: Wait until canary has no running jobs
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          squeue -h -w {{ canary_node_effective }} || true
      register: canary_jobs
      until: canary_jobs.stdout | trim == ""
      delay: 10
      retries: 120
      changed_when: false
# Play: full apt upgrade on the canary host, reboot when the reboot-required
# marker exists, then restart and verify munge and slurmd.
- name: Upgrade canary node OS packages
  hosts: "{{ canary_node | default('slurm-c02') }}"
  gather_facts: true
  become: true
  tasks:
    # The cache is refreshed only when it is older than 1800 seconds.
    - name: Ensure apt cache is updated
      ansible.builtin.apt:
        cache_valid_time: 1800
        update_cache: true

    - name: Full upgrade packages
      ansible.builtin.apt:
        upgrade: full
        autoclean: true
        autoremove: true
      register: apt_upgrade_result

    - name: Check if reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: reboot_required

    - name: Show upgrade summary
      ansible.builtin.debug:
        msg:
          - "Host: {{ inventory_hostname }}"
          - "Apt changed: {{ apt_upgrade_result.changed }}"
          - "Reboot required: {{ reboot_required.stat.exists }}"

    - name: Reboot canary if required
      when: reboot_required.stat.exists
      ansible.builtin.reboot:
        msg: "Reboot after canary OS upgrade"
        pre_reboot_delay: 5
        post_reboot_delay: 20
        connect_timeout: 20
        reboot_timeout: 900

    # Both services are restarted unconditionally (state: restarted) and enabled.
    - name: Ensure munge is running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Ensure slurmd is running
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    # Fails on the first check that returns non-zero: service state, a
    # munge/unmunge round trip, then `scontrol ping`.
    - name: Validate local services
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          systemctl is-active munge
          systemctl is-active slurmd
          munge -n | unmunge >/dev/null
          scontrol ping
      changed_when: false
# Play: bring the upgraded canary back into service and prove it can run a job.
- name: Resume canary node and run canary job
  hosts: slurm_controller
  gather_facts: false
  become: true
  vars:
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"
  tasks:
    - name: Reconfigure controller
      ansible.builtin.command:
        argv:
          - scontrol
          - reconfigure
      changed_when: true

    - name: Restart controller to refresh node state
      ansible.builtin.systemd:
        state: restarted
        name: slurmctld

    # Retry `scontrol ping` up to 15 times, 2 seconds apart, until it exits 0.
    - name: Wait for controller
      ansible.builtin.command:
        argv:
          - scontrol
          - ping
      register: slurmctld_ping
      until: slurmctld_ping.rc == 0
      delay: 2
      retries: 15
      changed_when: false

    # The three state updates are best-effort (errors silenced, `|| true`);
    # only the final sinfo/scontrol queries can fail the task.
    - name: Clear canary node maintenance state
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          scontrol update NodeName={{ canary_node_effective }} State=RESUME 2>/dev/null || true
          scontrol update NodeName={{ canary_node_effective }} State=UNDRAIN 2>/dev/null || true
          scontrol update NodeName={{ canary_node_effective }} State=IDLE 2>/dev/null || true
          sleep 3
          sinfo -N -n {{ canary_node_effective }}
          scontrol show node {{ canary_node_effective }}
      changed_when: true
      register: resume_canary

    # Retried every 5 seconds, up to 30 times, until the lower-cased output
    # contains none of: not_responding, down, drain, idle*.
    - name: Wait until canary is IDLE and responding
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ canary_node_effective }}
          scontrol show node {{ canary_node_effective }}
      register: canary_state
      until:
        - canary_state.rc == 0
        - "'not_responding' not in canary_state.stdout.lower()"
        - "'down' not in canary_state.stdout.lower()"
        - "'drain' not in canary_state.stdout.lower()"
        - "'idle*' not in canary_state.stdout.lower()"
      delay: 5
      retries: 30
      changed_when: false

    # Unquoted heredoc: `$` intended for the job script is escaped as `\$`.
    - name: Submit canary test job to upgraded node
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<SBATCH
          #!/bin/bash
          #SBATCH --job-name=canary-upgrade-test
          #SBATCH --partition=all
          #SBATCH --nodelist={{ canary_node_effective }}
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=256M
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/canary-upgrade-test-%j.out
          echo "HOST=\$(hostname)"
          echo "USER=\$(whoami)"
          echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
          echo "KERNEL=\$(uname -r)"
          date
          SBATCH
          )"
          echo "JOB_ID=$job_id"
          for i in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
          squeue -j "$job_id"
          sleep 1
          else
          break
          fi
          done
          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
          echo "### output"
          cat "/shared/canary-upgrade-test-${job_id}.out"
      changed_when: true
      register: canary_job

    - name: Show canary test result
      ansible.builtin.debug:
        var: canary_job.stdout_lines
@@ -0,0 +1,197 @@
---
- name: Rolling upgrade Slurm worker nodes
hosts: slurm_compute:slurm_gpu
become: true
gather_facts: true
serial: 1
vars:
skip_canary_node: "{{ canary_node | default('slurm-c02') }}"
do_skip_canary: "{{ skip_canary | default(true) | bool }}"
pre_tasks:
- name: Skip canary node if requested
ansible.builtin.meta: end_host
when:
- do_skip_canary
- inventory_hostname == skip_canary_node
- name: Drain node before OS upgrade
ansible.builtin.command:
cmd: scontrol update NodeName={{ inventory_hostname }} State=DRAIN Reason="rolling OS upgrade"
delegate_to: "{{ groups['slurm_controller'][0] }}"
changed_when: true
- name: Wait until no jobs are running on this node
ansible.builtin.shell: |
set -euo pipefail
squeue -h -w {{ inventory_hostname }} || true
args:
executable: /bin/bash
delegate_to: "{{ groups['slurm_controller'][0] }}"
register: jobs_on_node
retries: 120
delay: 10
until: jobs_on_node.stdout | trim == ""
changed_when: false
tasks:
- name: Update apt cache
ansible.builtin.apt:
update_cache: true
cache_valid_time: 1800
- name: Full upgrade packages
ansible.builtin.apt:
upgrade: full
autoremove: true
autoclean: true
register: apt_upgrade_result
- name: Check if reboot is required
ansible.builtin.stat:
path: /var/run/reboot-required
register: reboot_required
- name: Show upgrade status
ansible.builtin.debug:
msg:
- "Node: {{ inventory_hostname }}"
- "Apt changed: {{ apt_upgrade_result.changed }}"
- "Reboot required: {{ reboot_required.stat.exists }}"
- name: Reboot node if required
ansible.builtin.reboot:
msg: "Reboot after rolling OS upgrade"
reboot_timeout: 900
connect_timeout: 20
pre_reboot_delay: 5
post_reboot_delay: 20
when: reboot_required.stat.exists
- name: Restart munge
ansible.builtin.systemd:
name: munge
state: restarted
enabled: true
- name: Restart slurmd
ansible.builtin.systemd:
name: slurmd
state: restarted
enabled: true
- name: Validate local slurm services
ansible.builtin.shell: |
set -euo pipefail
systemctl is-active munge
systemctl is-active slurmd
munge -n | unmunge >/dev/null
scontrol ping
args:
executable: /bin/bash
changed_when: false
post_tasks:
- name: Restart controller to refresh state after node upgrade
ansible.builtin.systemd:
name: slurmctld
state: restarted
delegate_to: "{{ groups['slurm_controller'][0] }}"
run_once: false
- name: Wait for controller after restart
ansible.builtin.command:
cmd: scontrol ping
delegate_to: "{{ groups['slurm_controller'][0] }}"
register: slurmctld_ping
retries: 15
delay: 2
until: slurmctld_ping.rc == 0
changed_when: false
- name: Clear upgraded node maintenance state
ansible.builtin.shell: |
set -euo pipefail
scontrol update NodeName={{ inventory_hostname }} State=RESUME 2>/dev/null || true
scontrol update NodeName={{ inventory_hostname }} State=UNDRAIN 2>/dev/null || true
scontrol update NodeName={{ inventory_hostname }} State=IDLE 2>/dev/null || true
sleep 3
sinfo -N -n {{ inventory_hostname }}
scontrol show node {{ inventory_hostname }}
args:
executable: /bin/bash
delegate_to: "{{ groups['slurm_controller'][0] }}"
register: resume_node
changed_when: true
- name: Wait until node is healthy
ansible.builtin.shell: |
set -euo pipefail
sinfo -N -n {{ inventory_hostname }}
scontrol show node {{ inventory_hostname }}
args:
executable: /bin/bash
delegate_to: "{{ groups['slurm_controller'][0] }}"
register: upgraded_node_state
retries: 30
delay: 5
until:
- upgraded_node_state.rc == 0
- "'not_responding' not in upgraded_node_state.stdout.lower()"
- "'down' not in upgraded_node_state.stdout.lower()"
- "'drain' not in upgraded_node_state.stdout.lower()"
- "'idle*' not in upgraded_node_state.stdout.lower()"
changed_when: false
# Submit a short batch job pinned to the upgraded node (as slurmuser, from the
# controller), wait up to 90 polls of one second for it to leave the queue,
# print its accounting record and output file, and fail the task unless Slurm
# reports the job as COMPLETED. Previously a FAILED/TIMEOUT/still-running job
# passed as long as its output file existed.
- name: Submit node-local post-upgrade test job
  ansible.builtin.shell: |
    set -euo pipefail
    # Quoted heredoc delimiter: the job script reaches sbatch verbatim. The
    # node name is substituted by Ansible templating before bash ever runs.
    job_id="$(
    sudo -iu slurmuser sbatch --parsable <<'SBATCH'
    #!/bin/bash
    #SBATCH --job-name=rolling-upgrade-test
    #SBATCH --partition=all
    #SBATCH --nodelist={{ inventory_hostname }}
    #SBATCH --cpus-per-task=1
    #SBATCH --mem=256M
    #SBATCH --time=00:02:00
    #SBATCH --output=/shared/rolling-upgrade-test-%j.out
    echo "HOST=$(hostname)"
    echo "SLURM_JOB_ID=$SLURM_JOB_ID"
    echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
    echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
    echo "KERNEL=$(uname -r)"
    date
    SBATCH
    )"
    echo "JOB_ID=$job_id"
    # Poll once per second while squeue still lists the job.
    for i in $(seq 1 90); do
      if squeue -h -j "$job_id" | grep -q .; then
        squeue -j "$job_id"
        sleep 1
      else
        break
      fi
    done
    echo "### sacct"
    sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
    # Final state: ask slurmctld first, fall back to accounting if the job
    # record is no longer held by the controller. Empty when both lookups fail.
    job_state="$(scontrol show job "$job_id" 2>/dev/null | grep -oE 'JobState=[A-Z_]+' | head -n 1 | cut -d= -f2 || true)"
    if [[ -z "$job_state" ]]; then
      job_state="$(sacct -j "$job_id" -X -n -P --format=State 2>/dev/null | head -n 1 | cut -d' ' -f1 || true)"
    fi
    echo "JOB_STATE=${job_state:-UNKNOWN}"
    echo "### output"
    out_file="/shared/rolling-upgrade-test-${job_id}.out"
    if [[ "$job_state" != "COMPLETED" ]]; then
      # Show whatever output exists to aid debugging, then fail the task.
      cat "$out_file" 2>/dev/null || true
      echo "Test job ${job_id} did not complete successfully (state: ${job_state:-UNKNOWN})" >&2
      exit 1
    fi
    cat "$out_file"
  args:
    executable: /bin/bash
  delegate_to: "{{ groups['slurm_controller'][0] }}"
  register: node_test_job
  changed_when: true
# Print the stdout captured from the node test job, split into lines.
- name: Show node post-upgrade test result
  ansible.builtin.debug:
    var: node_test_job.stdout_lines
@@ -0,0 +1,94 @@
---
# Controller OS upgrade: snapshot the cluster state, run an apt full-upgrade,
# reboot when /var/run/reboot-required exists, restart the service stack, then
# wait for slurmctld and print a post-upgrade validation report.
- name: Upgrade Slurm controller OS safely
  hosts: slurm_controller
  gather_facts: true
  become: true
  tasks:
    # Pre-upgrade snapshot. scontrol/sinfo/squeue and the munge and slurmctld
    # checks must succeed; an inactive slurmdbd or mariadb is tolerated.
    - name: Show cluster state before controller upgrade
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          scontrol ping
          sinfo
          squeue
          systemctl is-active munge
          systemctl is-active slurmctld
          systemctl is-active slurmdbd || true
          systemctl is-active mariadb || true
      register: before_state
      changed_when: false

    - name: Print cluster state before controller upgrade
      ansible.builtin.debug:
        var: before_state.stdout_lines

    # Refresh the package index unless it is newer than 1800 seconds.
    - name: Update apt cache
      ansible.builtin.apt:
        cache_valid_time: 1800
        update_cache: true

    # Full upgrade, followed by removal of unneeded packages and cache cleanup.
    - name: Full upgrade controller packages
      ansible.builtin.apt:
        autoclean: true
        autoremove: true
        upgrade: full
      register: controller_upgrade

    # The presence of this marker file drives the conditional reboot below.
    - name: Check if reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: controller_reboot_required

    - name: Show controller upgrade status
      ansible.builtin.debug:
        msg:
          - "Apt changed: {{ controller_upgrade.changed }}"
          - "Reboot required: {{ controller_reboot_required.stat.exists }}"

    - name: Reboot controller if required
      when: controller_reboot_required.stat.exists
      ansible.builtin.reboot:
        msg: "Reboot after controller OS upgrade"
        pre_reboot_delay: 5
        post_reboot_delay: 30
        connect_timeout: 20
        reboot_timeout: 900

    # Services are restarted (and enabled) one at a time in the listed order.
    - name: Restart controller services
      ansible.builtin.systemd:
        name: "{{ item }}"
        enabled: true
        state: restarted
      loop:
        - munge
        - mariadb
        - slurmdbd
        - slurmctld

    # Poll until `scontrol ping` exits 0 (retries: 20, delay: 3 s).
    - name: Wait for slurmctld
      ansible.builtin.command: scontrol ping
      register: slurmctld_ping
      until: slurmctld_ping.rc == 0
      retries: 20
      delay: 3
      changed_when: false

    # Read-only report; with pipefail, a failing pipeline stage fails the task.
    - name: Validate controller after upgrade
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          scontrol ping
          sinfo
          squeue
          scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType"
          sacct -S today --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -20
      register: controller_after
      changed_when: false

    - name: Print controller validation after upgrade
      ansible.builtin.debug:
        var: controller_after.stdout_lines
@@ -0,0 +1,207 @@
---
# Read-only report gathered on the controller: slurmctld ping, node and
# partition listings, the queue, selected configuration keys, and the tail of
# the accounting records since today.
- name: Validate cluster after OS rolling upgrade
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # Any failing command (or pipeline stage, via pipefail) fails the task.
    - name: Validate Slurm controller and cluster state
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          echo "### slurmctld ping"
          scontrol ping
          echo
          echo "### nodes"
          sinfo -N
          echo
          echo "### partitions"
          sinfo
          echo
          echo "### queue"
          squeue
          echo
          echo "### important config"
          scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType|SelectType|ClusterName"
          echo
          echo "### accounting recent jobs"
          sacct -S today --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30
      register: cluster_state
      changed_when: false

    - name: Print cluster state
      ansible.builtin.debug:
        var: cluster_state.stdout_lines
# Per-worker check run on every host in slurm_compute and slurm_gpu: host
# identity, munge and slurmd service state, a local munge round-trip, a
# controller ping, config checksums, and an optional nvidia-smi query.
- name: Validate worker services after OS rolling upgrade
  hosts: slurm_compute:slurm_gpu
  gather_facts: true
  become: true
  tasks:
    # The service checks, the munge round-trip and the controller ping are
    # fatal on failure; the checksum and nvidia-smi steps are best-effort.
    - name: Validate local worker services and Slurm connectivity
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          echo "HOST=$(hostname)"
          echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
          echo "KERNEL=$(uname -r)"
          echo "UPTIME=$(uptime -p)"
          echo
          echo "### services"
          systemctl is-active munge
          systemctl is-active slurmd
          echo
          echo "### munge local test"
          munge -n | unmunge >/dev/null
          echo "munge OK"
          echo
          echo "### controller ping"
          scontrol ping
          echo
          echo "### local slurm.conf checksum"
          sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true
          echo
          echo "### gpu check if present"
          if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi --query-gpu=index,name,driver_version,memory.total --format=csv,noheader || true
          else
          echo "NO_NVIDIA_SMI"
          fi
      register: worker_state
      changed_when: false

    - name: Print worker state
      ansible.builtin.debug:
        var: worker_state.stdout_lines
# Submit a one-CPU batch job to the debug partition as slurmuser, wait up to
# 90 polls of one second for it to leave the queue, print its accounting
# record and output file, and fail the play unless Slurm reports the job as
# COMPLETED. Previously a FAILED/TIMEOUT/still-running job passed validation
# as long as its output file existed.
- name: Submit post-upgrade CPU validation job
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Submit CPU validation job to debug partition
      ansible.builtin.shell: |
        set -euo pipefail
        # Quoted heredoc delimiter: the job script reaches sbatch verbatim.
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=os-upgrade-cpu-test
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/os-upgrade-cpu-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo "KERNEL=$(uname -r)"
        date
        SBATCH
        )"
        echo "JOB_ID=$job_id"
        # Poll once per second while squeue still lists the job.
        for i in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        # Final state: ask slurmctld first, fall back to accounting if the job
        # record is no longer held by the controller. Empty when both fail.
        job_state="$(scontrol show job "$job_id" 2>/dev/null | grep -oE 'JobState=[A-Z_]+' | head -n 1 | cut -d= -f2 || true)"
        if [[ -z "$job_state" ]]; then
          job_state="$(sacct -j "$job_id" -X -n -P --format=State 2>/dev/null | head -n 1 | cut -d' ' -f1 || true)"
        fi
        echo "JOB_STATE=${job_state:-UNKNOWN}"
        echo "### output"
        out_file="/shared/os-upgrade-cpu-test-${job_id}.out"
        if [[ "$job_state" != "COMPLETED" ]]; then
          # Show whatever output exists to aid debugging, then fail the task.
          cat "$out_file" 2>/dev/null || true
          echo "CPU validation job ${job_id} did not complete successfully (state: ${job_state:-UNKNOWN})" >&2
          exit 1
        fi
        cat "$out_file"
      args:
        executable: /bin/bash
      register: cpu_validation_job
      changed_when: true

    - name: Print CPU validation job
      ansible.builtin.debug:
        var: cpu_validation_job.stdout_lines
# Submit a single-GPU batch job to the gpu partition as slurmuser, wait up to
# 120 polls of one second for it to leave the queue, print its accounting
# record and output file, and fail the play unless Slurm reports the job as
# COMPLETED. Previously a job whose nvidia-smi call failed (job state FAILED)
# still passed validation because only the output file's existence mattered.
- name: Submit post-upgrade GPU validation job
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Submit GPU validation job to gpu partition
      ansible.builtin.shell: |
        set -euo pipefail
        # Quoted heredoc delimiter: the job script reaches sbatch verbatim.
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=os-upgrade-gpu-test
        #SBATCH --partition=gpu
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/os-upgrade-gpu-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo "KERNEL=$(uname -r)"
        echo
        nvidia-smi
        SBATCH
        )"
        echo "JOB_ID=$job_id"
        # Poll once per second while squeue still lists the job.
        for i in $(seq 1 120); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        # Final state: ask slurmctld first, fall back to accounting if the job
        # record is no longer held by the controller. Empty when both fail.
        job_state="$(scontrol show job "$job_id" 2>/dev/null | grep -oE 'JobState=[A-Z_]+' | head -n 1 | cut -d= -f2 || true)"
        if [[ -z "$job_state" ]]; then
          job_state="$(sacct -j "$job_id" -X -n -P --format=State 2>/dev/null | head -n 1 | cut -d' ' -f1 || true)"
        fi
        echo "JOB_STATE=${job_state:-UNKNOWN}"
        echo "### output"
        out_file="/shared/os-upgrade-gpu-test-${job_id}.out"
        if [[ "$job_state" != "COMPLETED" ]]; then
          # Show whatever output exists to aid debugging, then fail the task.
          cat "$out_file" 2>/dev/null || true
          echo "GPU validation job ${job_id} did not complete successfully (state: ${job_state:-UNKNOWN})" >&2
          exit 1
        fi
        cat "$out_file"
      args:
        executable: /bin/bash
      register: gpu_validation_job
      changed_when: true

    - name: Print GPU validation job
      ansible.builtin.debug:
        var: gpu_validation_job.stdout_lines