This commit is contained in:
@@ -0,0 +1,149 @@
|
||||
---
|
||||
# Play 1: read-only health snapshot taken on the Slurm controller.
# Runs one bash script and prints its stdout; the task never reports "changed".
- name: Check Slurm controller health
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Hard requirements (script aborts via `set -e` if they fail):
    #   munge active, slurmctld active, `scontrol ping`, sinfo, squeue.
    # Best-effort (`|| true`): slurmdbd/mariadb status, the problematic-node
    #   filter, and the sacctmgr/sacct accounting queries.
    - name: Check controller services and cluster state
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### controller services"
        systemctl is-active munge
        systemctl is-active slurmctld
        systemctl is-active slurmdbd || true
        systemctl is-active mariadb || true

        echo
        echo "### slurm ping"
        scontrol ping

        echo
        echo "### nodes"
        sinfo -N

        echo
        echo "### partitions"
        sinfo

        echo
        echo "### queue"
        squeue

        echo
        echo "### problematic nodes"
        # Column 2 is the node state (%T). A node is listed when its state is
        # not idle/alloc*/mix*, OR when the state carries the "*" flag, which
        # sinfo appends to nodes that are not responding (e.g. "idle*").
        # Without the second test, "idle*" would match /idle/ and be hidden.
        sinfo -N -h -o "%N %T %E" | awk '$2 !~ /idle|alloc|mix/ || $2 ~ /[*]/ {print}' || true

        echo
        echo "### accounting"
        sacctmgr -n list cluster || true

        echo
        echo "### recent failed jobs"
        # Only the last 30 lines of today's unsuccessful jobs are shown.
        sacct -S today --state=FAILED,CANCELLED,TIMEOUT,NODE_FAIL,OUT_OF_MEMORY \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,NodeList | tail -30 || true
      args:
        executable: /bin/bash  # pipefail is a bash option, so bash is requested explicitly
      register: controller_health
      changed_when: false

    - name: Print controller health
      ansible.builtin.debug:
        var: controller_health.stdout_lines
|
||||
|
||||
|
||||
# Play 2: per-node health check on every compute and GPU worker.
# Optional variable:
#   slurm_controller_host - name resolved in the connectivity section
#                           (defaults to "slurm-ctl01", the previous
#                           hard-coded value).
- name: Check Slurm worker health
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: true

  tasks:
    # Hard requirements (script aborts via `set -e` if they fail):
    #   munge + slurmd active, local munge round-trip, `scontrol ping`,
    #   /shared exists and is writable.
    # Best-effort (`|| true`): name resolution of the controller, slurmd
    #   listener lookup, config checksums, cgroup mounts, nvidia-smi query.
    - name: Check worker services, config and connectivity
      ansible.builtin.shell: |
        set -euo pipefail

        echo "HOST=$(hostname)"
        echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
        echo "KERNEL=$(uname -r)"
        echo "UPTIME=$(uptime -p)"

        echo
        echo "### services"
        systemctl is-active munge
        systemctl is-active slurmd

        echo
        echo "### munge local test"
        munge -n | unmunge >/dev/null
        echo "munge OK"

        echo
        echo "### controller connectivity"
        getent hosts "{{ slurm_controller_host | default('slurm-ctl01') }}" || true
        scontrol ping

        echo
        echo "### slurmd listener"
        ss -lntp | grep ':6818 ' || true

        echo
        echo "### config checksums"
        sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true

        echo
        echo "### shared filesystem"
        test -d /shared
        # Per-host probe file: proves /shared is writable from this node.
        # The EXIT trap removes it even if a later command aborts the script,
        # so a failed run cannot leave stray files on the shared filesystem.
        probe="/shared/.slurm-health-$(hostname)"
        trap 'rm -f "$probe"' EXIT
        touch "$probe"
        ls -l "$probe"
        rm -f "$probe"

        echo
        echo "### cgroup"
        mount | grep cgroup || true

        echo
        echo "### gpu check"
        # Nodes without the NVIDIA tooling just report a marker line.
        if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi --query-gpu=index,name,driver_version,memory.total,temperature.gpu,utilization.gpu --format=csv,noheader || true
        else
          echo "NO_NVIDIA_SMI"
        fi
      args:
        executable: /bin/bash  # pipefail is a bash option, so bash is requested explicitly
      register: worker_health
      changed_when: false

    - name: Print worker health
      ansible.builtin.debug:
        var: worker_health.stdout_lines
|
||||
|
||||
|
||||
# Play 3: controller-side view of node states, with a full
# `scontrol show node` dump for every node that looks unhealthy.
- name: Check Slurm-reported node state consistency
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Build Slurm node health summary
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### node summary"
        sinfo -N -o "%N %P %T %C %m %G %E"

        echo
        echo "### full problematic node details"
        # Select nodes whose state (%T) mentions down/drain/fail/unk/
        # not_responding, or carries the "*" flag that sinfo appends to
        # non-responding nodes. "[*]" matches a literal asterisk; the former
        # "idle\\*" meant "idle followed by zero or more backslashes" and
        # therefore matched every healthy idle node.
        # Assigning the list first lets `set -e`/pipefail abort on a sinfo
        # failure, which a command substitution in a `for` word list would
        # silently ignore. `sort -u` collapses nodes listed once per partition.
        problem_nodes="$(sinfo -N -h -o "%N %T" | awk '$2 ~ /down|drain|fail|unk|not_responding|[*]/ {print $1}' | sort -u)"
        for node in $problem_nodes; do
          echo
          echo "### $node"
          scontrol show node "$node"
        done
      args:
        executable: /bin/bash  # pipefail is a bash option, so bash is requested explicitly
      register: slurm_node_summary
      changed_when: false

    - name: Print Slurm node summary
      ansible.builtin.debug:
        var: slurm_node_summary.stdout_lines
|
||||
Reference in New Issue
Block a user