Add Slurm AI/HPC cluster platform project
This commit is contained in:
@@ -0,0 +1,94 @@
|
||||
---
# Upgrade the operating-system packages on the Slurm controller host(s).
#
# Flow: snapshot cluster/service state -> refresh apt cache -> full upgrade ->
# clean the apt cache -> reboot only when the reboot flag file exists ->
# restart the controller services -> wait for slurmctld to answer ->
# print a post-upgrade validation report.
- name: Upgrade Slurm controller OS safely
  hosts: slurm_controller
  become: true
  gather_facts: true

  tasks:
    # Read-only snapshot taken before anything is modified. Because of
    # `set -e`, the task fails (and the play stops) if scontrol/sinfo/squeue
    # fail or if munge or slurmctld are not active. slurmdbd and mariadb are
    # only reported: `|| true` keeps an inactive unit from aborting the play.
    - name: Show cluster state before controller upgrade
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol ping
        sinfo
        squeue
        systemctl is-active munge
        systemctl is-active slurmctld
        systemctl is-active slurmdbd || true
        systemctl is-active mariadb || true
      args:
        # `set -o pipefail` is a bash feature, so bash is requested explicitly.
        executable: /bin/bash
      register: before_state
      changed_when: false

    - name: Print cluster state before controller upgrade
      ansible.builtin.debug:
        var: before_state.stdout_lines

    # The package index is refreshed only when the cached copy is older than
    # cache_valid_time (seconds).
    - name: Update apt cache
      ansible.builtin.apt:
        update_cache: true
        cache_valid_time: 1800

    # `autoremove` is kept here because the module applies it as part of the
    # upgrade command itself. `autoclean` lives in its own task below.
    - name: Full upgrade controller packages
      ansible.builtin.apt:
        upgrade: full
        autoremove: true
      register: controller_upgrade

    # NOTE(review): in the apt module the upgrade path appears to return as
    # soon as the upgrade finishes, so an `autoclean` passed together with
    # `upgrade` is not acted on. Running it as a separate task makes the
    # cleanup happen regardless of module version -- confirm against the
    # ansible-core release in use.
    - name: Remove obsolete package files from apt cache
      ansible.builtin.apt:
        autoclean: true

    # The presence of this flag file is what gates the reboot task below.
    - name: Check if reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: controller_reboot_required

    - name: Show controller upgrade status
      ansible.builtin.debug:
        msg:
          - "Apt changed: {{ controller_upgrade.changed }}"
          - "Reboot required: {{ controller_reboot_required.stat.exists }}"

    # Skipped entirely when the flag file is absent. All values are seconds.
    - name: Reboot controller if required
      ansible.builtin.reboot:
        msg: "Reboot after controller OS upgrade"
        reboot_timeout: 900
        connect_timeout: 20
        pre_reboot_delay: 5
        post_reboot_delay: 30
      when: controller_reboot_required.stat.exists

    # Services are restarted one at a time in list order (munge, mariadb,
    # slurmdbd, slurmctld) and are also enabled at boot.
    - name: Restart controller services
      ansible.builtin.systemd:
        name: "{{ item }}"
        state: restarted
        enabled: true
      loop:
        - munge
        - mariadb
        - slurmdbd
        - slurmctld

    # Poll up to 20 times, 3 seconds apart. Besides the exit status, the
    # output must report a controller as UP.
    # NOTE(review): some Slurm releases may exit 0 from `scontrol ping` even
    # when the controller is reported DOWN -- confirm for the deployed
    # version; the stdout check is harmless when the exit code is reliable.
    - name: Wait for slurmctld
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 20
      delay: 3
      until:
        - slurmctld_ping.rc == 0
        - "'UP' in slurmctld_ping.stdout"
      changed_when: false

    # Read-only post-upgrade report. With `set -euo pipefail`, a failure of
    # any command (including grep finding no matching config line, or sacct
    # failing) fails the task.
    - name: Validate controller after upgrade
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol ping
        sinfo
        squeue
        scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType"
        sacct -S today --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -20
      args:
        executable: /bin/bash
      register: controller_after
      changed_when: false

    - name: Print controller validation after upgrade
      ansible.builtin.debug:
        var: controller_after.stdout_lines
|
||||
Reference in New Issue
Block a user