Add Slurm AI/HPC cluster platform project

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit cd6830334b
47 changed files with 4727 additions and 0 deletions
@@ -0,0 +1,197 @@
---
# Rolling OS upgrade of Slurm worker nodes, one node per batch.
#
# Per node: drain it and wait for its jobs to finish, run an apt full-upgrade
# (rebooting when the OS asks for it), restart munge and slurmd, restart
# slurmctld, put the node back into service and run a smoke-test job pinned
# to that node.
#
# Optional extra-vars (defaults keep the previous hard-coded behaviour):
#   canary_node         node skipped by this play            (default: slurm-c02)
#   skip_canary         whether the canary node is skipped   (default: true)
#   test_job_user       account that submits the test job    (default: slurmuser)
#   test_job_partition  partition the test job is sent to    (default: all)
- name: Rolling upgrade Slurm worker nodes
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: true
  # One node per batch, so only a single worker is out of service at a time.
  serial: 1
  vars:
    skip_canary_node: "{{ canary_node | default('slurm-c02') }}"
    do_skip_canary: "{{ skip_canary | default(true) | bool }}"
    post_upgrade_test_user: "{{ test_job_user | default('slurmuser') }}"
    post_upgrade_test_partition: "{{ test_job_partition | default('all') }}"

  pre_tasks:
    - name: Skip canary node if requested
      ansible.builtin.meta: end_host
      when:
        - do_skip_canary | bool
        - inventory_hostname == skip_canary_node

    # All scontrol/squeue/sinfo calls against the cluster are delegated to the
    # first host of the slurm_controller group.
    - name: Drain node before OS upgrade
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ inventory_hostname }}"
          - State=DRAIN
          - "Reason=rolling OS upgrade"
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      changed_when: true

    # A squeue failure must not be mistaken for "no jobs left": the listing
    # only counts as empty when squeue itself exited successfully. Otherwise
    # the task keeps retrying and finally fails instead of upgrading a node
    # that may still be running jobs.
    - name: Wait until no jobs are running on this node
      ansible.builtin.command:
        cmd: squeue -h -w {{ inventory_hostname }}
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: jobs_on_node
      retries: 120
      delay: 10
      until:
        - jobs_on_node.rc == 0
        - jobs_on_node.stdout | trim | length == 0
      changed_when: false

  tasks:
    - name: Update apt cache
      ansible.builtin.apt:
        update_cache: true
        cache_valid_time: 1800

    - name: Full upgrade packages
      ansible.builtin.apt:
        upgrade: full
        autoremove: true
        autoclean: true
      register: apt_upgrade_result

    - name: Check if reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: reboot_required

    - name: Show upgrade status
      ansible.builtin.debug:
        msg:
          - "Node: {{ inventory_hostname }}"
          - "Apt changed: {{ apt_upgrade_result.changed }}"
          - "Reboot required: {{ reboot_required.stat.exists }}"

    - name: Reboot node if required
      ansible.builtin.reboot:
        msg: "Reboot after rolling OS upgrade"
        reboot_timeout: 900
        connect_timeout: 20
        pre_reboot_delay: 5
        post_reboot_delay: 20
      when: reboot_required.stat.exists

    # munge is restarted before slurmd.
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted
        enabled: true

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
        enabled: true

    # Runs on the upgraded node itself: both services active, a munge
    # credential round-trips locally, and the controller answers a ping.
    - name: Validate local slurm services
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          systemctl is-active munge
          systemctl is-active slurmd
          munge -n | unmunge >/dev/null
          scontrol ping
        executable: /bin/bash
      changed_when: false

  post_tasks:
    # NOTE(review): this restarts slurmctld once per upgraded node. Kept as in
    # the original; confirm the controller restart is really needed here.
    - name: Restart controller to refresh state after node upgrade
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted
      delegate_to: "{{ groups['slurm_controller'][0] }}"

    - name: Wait for controller after restart
      ansible.builtin.command:
        cmd: scontrol ping
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # The three state updates are deliberately best-effort (errors discarded):
    # the health check in the next task is what decides whether the node
    # really came back. sinfo/scontrol output is captured for the run log.
    - name: Clear upgraded node maintenance state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          for target_state in RESUME UNDRAIN IDLE; do
            scontrol update NodeName={{ inventory_hostname }} State="$target_state" 2>/dev/null || true
          done
          sleep 3
          sinfo -N -n {{ inventory_hostname }}
          scontrol show node {{ inventory_hostname }}
        executable: /bin/bash
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: resume_node
      changed_when: true

    # Only the State= field of the node record is inspected, so a hostname,
    # partition or feature that happens to contain "down" or "drain" cannot
    # block the play. The one-line (-o) format keeps the record on a single
    # space-separated line, which is what the [^ ]* pattern relies on.
    # Healthy means: State= is present and carries none of DOWN, DRAIN,
    # NOT_RESPONDING or the "*" not-responding suffix.
    - name: Wait until node is healthy
      ansible.builtin.command:
        argv:
          - scontrol
          - -o
          - show
          - node
          - "{{ inventory_hostname }}"
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: upgraded_node_state
      retries: 30
      delay: 5
      until:
        - upgraded_node_state.rc == 0
        - upgraded_node_state.stdout is search('State=')
        - >-
          upgraded_node_state.stdout is not
          search('State=[^ ]*(DOWN|DRAIN|NOT_RESPONDING|[*])', ignorecase=true)
      changed_when: false

    # Submits a tiny batch job pinned to the upgraded node and fails unless it
    # ran to the end:
    #   * the job is polled for up to 90 rounds (one second apart); if it is
    #     still queued or running afterwards it is cancelled and the task fails;
    #   * the job prints a completion marker as its last line, and that marker
    #     must be present in the job's output file.
    # The here-document is unquoted on purpose: "\$" defers expansion to the
    # job itself, while the Jinja expressions are filled in by Ansible.
    - name: Submit node-local post-upgrade test job
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu {{ post_upgrade_test_user }} sbatch --parsable <<SBATCH
          #!/bin/bash
          #SBATCH --job-name=rolling-upgrade-test
          #SBATCH --partition={{ post_upgrade_test_partition }}
          #SBATCH --nodelist={{ inventory_hostname }}
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=256M
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/rolling-upgrade-test-%j.out
          echo "HOST=\$(hostname)"
          echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
          echo "KERNEL=\$(uname -r)"
          date
          echo "ROLLING_UPGRADE_TEST_OK"
          SBATCH
          )"
          out_file="/shared/rolling-upgrade-test-${job_id}.out"
          echo "JOB_ID=$job_id"

          # Poll until the job has left the queue.
          for _ in $(seq 1 90); do
            squeue -h -j "$job_id" 2>/dev/null | grep -q . || break
            sleep 1
          done

          # Still listed after the polling window: give up and clean up.
          if squeue -h -j "$job_id" 2>/dev/null | grep -q .; then
            echo "Test job $job_id still queued or running after 90 polls; cancelling" >&2
            scancel "$job_id" || true
            exit 1
          fi

          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
          echo "### output"
          cat "$out_file"

          if ! grep -qx 'ROLLING_UPGRADE_TEST_OK' "$out_file"; then
            echo "Test job $job_id did not run to completion" >&2
            exit 1
          fi
        executable: /bin/bash
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: node_test_job
      changed_when: true

    - name: Show node post-upgrade test result
      ansible.builtin.debug:
        var: node_test_job.stdout_lines