---
# Pre-flight checks for the canary host name. Targets localhost and gathers
# no facts; both tasks only inspect inventory group membership.
- name: Validate canary node variable
  gather_facts: false
  hosts: localhost

  vars:
    # Falls back to slurm-c02 when canary_node is not defined.
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"

  tasks:
    # Stop when the chosen host is missing from the inventory's "all" group.
    - name: Ensure canary node is in inventory
      when: canary_node_effective not in groups['all']
      ansible.builtin.fail:
        msg: >-
          canary_node={{ canary_node_effective }} is not in inventory

    # Stop when the chosen host is a member of the slurm_controller group.
    - name: Ensure canary node is not the controller
      when: canary_node_effective in groups['slurm_controller']
      ansible.builtin.fail:
        msg: Do not use controller as canary for worker rolling upgrade
# Runs on the slurm_controller group: records the canary's current state,
# sets it to DRAIN, then blocks until squeue reports no jobs on it.
- name: Drain canary node
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    # Falls back to slurm-c02 when canary_node is not defined.
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"

  tasks:
    # Diagnostic snapshot only: every command is followed by "|| true", so a
    # failing query never aborts the play at this point.
    - name: Show canary state before drain
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ canary_node_effective }} || true
        scontrol show node {{ canary_node_effective }} || true
        squeue -w {{ canary_node_effective }} || true
      args:
        executable: /bin/bash
      register: canary_before
      changed_when: false

    - name: Print canary state before drain
      ansible.builtin.debug:
        var: canary_before.stdout_lines

    # Always reported as changed because scontrol gives no change indication.
    - name: Drain canary node
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ canary_node_effective }} State=DRAIN Reason="canary OS upgrade"
      changed_when: true

    # Polls every 10 s, up to 120 attempts. A failed squeue call must NOT be
    # mistaken for "no jobs": empty stdout only counts when squeue itself
    # exited 0. Any non-zero exit is retried and, if it persists, fails the
    # play instead of letting the upgrade start on a node that may be busy.
    - name: Wait until canary has no running jobs
      ansible.builtin.command:
        cmd: squeue -h -w {{ canary_node_effective }}
      register: canary_jobs
      retries: 120
      delay: 10
      until:
        - canary_jobs.rc == 0
        - '(canary_jobs.stdout | trim) == ""'
      changed_when: false
# Runs on the canary host itself: apt full-upgrade, optional reboot, then
# restart and verify the munge and slurmd services.
- name: Upgrade canary node OS packages
  hosts: "{{ canary_node | default('slurm-c02') }}"
  gather_facts: true
  become: true

  tasks:
    # The cache is refreshed only when it is older than cache_valid_time.
    - name: Ensure apt cache is updated
      ansible.builtin.apt:
        cache_valid_time: 1800
        update_cache: true

    - name: Full upgrade packages
      register: apt_upgrade_result
      ansible.builtin.apt:
        autoclean: true
        autoremove: true
        upgrade: full

    # The existence of this file is what later triggers the reboot task.
    - name: Check if reboot is required
      register: reboot_required
      ansible.builtin.stat:
        path: /var/run/reboot-required

    - name: Show upgrade summary
      ansible.builtin.debug:
        msg:
          - "Host: {{ inventory_hostname }}"
          - "Apt changed: {{ apt_upgrade_result.changed }}"
          - "Reboot required: {{ reboot_required.stat.exists }}"

    - name: Reboot canary if required
      when: reboot_required.stat.exists
      ansible.builtin.reboot:
        msg: Reboot after canary OS upgrade
        pre_reboot_delay: 5
        post_reboot_delay: 20
        connect_timeout: 20
        reboot_timeout: 900

    # state=restarted restarts the unit on every run, rebooted or not.
    - name: Ensure munge is running
      ansible.builtin.systemd:
        enabled: true
        name: munge
        state: restarted

    - name: Ensure slurmd is running
      ansible.builtin.systemd:
        enabled: true
        name: slurmd
        state: restarted

    # Any failing command aborts the script (set -e), which fails the task.
    - name: Validate local services
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          systemctl is-active munge
          systemctl is-active slurmd
          munge -n | unmunge >/dev/null
          scontrol ping
# Runs on the slurm_controller group: refreshes the controller, returns the
# canary to service, submits a small test job pinned to it, and fails the
# play unless that job ends in state COMPLETED.
- name: Resume canary node and run canary job
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    # Falls back to slurm-c02 when canary_node is not defined.
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"

  tasks:
    - name: Reconfigure controller
      ansible.builtin.command:
        cmd: scontrol reconfigure
      changed_when: true

    - name: Restart controller to refresh node state
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Up to 15 attempts, 2 s apart, until scontrol ping exits 0.
    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # The three state updates are best-effort (errors hidden and ignored);
    # the sinfo/scontrol queries after the pause must succeed.
    - name: Clear canary node maintenance state
      ansible.builtin.shell: |
        set -euo pipefail

        scontrol update NodeName={{ canary_node_effective }} State=RESUME 2>/dev/null || true
        scontrol update NodeName={{ canary_node_effective }} State=UNDRAIN 2>/dev/null || true
        scontrol update NodeName={{ canary_node_effective }} State=IDLE 2>/dev/null || true

        sleep 3
        sinfo -N -n {{ canary_node_effective }}
        scontrol show node {{ canary_node_effective }}
      args:
        executable: /bin/bash
      register: resume_canary
      changed_when: true

    # Up to 30 attempts, 5 s apart. The conditions are substring checks on
    # the lower-cased combined output.
    # NOTE(review): they reject not_responding/down/drain/idle* but do not
    # positively require the idle state - confirm this is intended.
    - name: Wait until canary is IDLE and responding
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ canary_node_effective }}
        scontrol show node {{ canary_node_effective }}
      args:
        executable: /bin/bash
      register: canary_state
      retries: 30
      delay: 5
      until:
        - canary_state.rc == 0
        - "'not_responding' not in canary_state.stdout.lower()"
        - "'down' not in canary_state.stdout.lower()"
        - "'drain' not in canary_state.stdout.lower()"
        - "'idle*' not in canary_state.stdout.lower()"
      changed_when: false

    # Submits the batch script through an unquoted heredoc, so "\$" keeps the
    # expansion for the job itself. The heredoc body and its terminator must
    # stay at the base indentation of this block scalar.
    - name: Submit canary test job to upgraded node
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<SBATCH
        #!/bin/bash
        #SBATCH --job-name=canary-upgrade-test
        #SBATCH --partition=all
        #SBATCH --nodelist={{ canary_node_effective }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/canary-upgrade-test-%j.out

        echo "HOST=\$(hostname)"
        echo "USER=\$(whoami)"
        echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
        echo "KERNEL=\$(uname -r)"
        date
        SBATCH
        )"

        # --parsable may print "jobid;cluster"; keep only the job id.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"

        # Wait up to 90 s while squeue still lists the job.
        for i in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        # Accounting can trail the controller briefly, so poll for a few
        # seconds until sacct stops reporting an in-flight state.
        job_state=""
        for i in $(seq 1 10); do
          job_state="$(sacct -X -n -P -j "$job_id" --format=State)"
          case "$job_state" in
            ""|PENDING|RUNNING|COMPLETING) sleep 1 ;;
            *) break ;;
          esac
        done

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        # Marker line consumed by the assert task that follows the debug.
        echo "JOB_STATE=$job_state"

        echo "### output"
        cat "/shared/canary-upgrade-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: canary_job
      changed_when: true

    - name: Show canary test result
      ansible.builtin.debug:
        var: canary_job.stdout_lines

    # Without this check a FAILED, TIMEOUT or still-queued job would leave
    # the play green. It runs after the debug task so the diagnostics are
    # printed before the play stops.
    - name: Ensure canary test job completed successfully
      ansible.builtin.assert:
        that:
          - "'JOB_STATE=COMPLETED' in canary_job.stdout_lines"
        fail_msg: >-
          Canary test job on {{ canary_node_effective }} did not reach
          COMPLETED; see the sacct summary and job output above.
        success_msg: "Canary test job completed on {{ canary_node_effective }}"
|