Files
portfolio/platform-projects/hpc-slurm-ai-cluster/playbooks/health/check-slurm-health.yml
T
Mateusz Suski d300d490f5
lint / shell-yaml-ansible (push) Failing after 47s
Add Slurm AI/HPC cluster platform project
2026-06-04 19:42:45 +00:00

150 lines
3.8 KiB
YAML

---
# Play: report the state of the Slurm control plane from the controller host.
# The script runs under `set -euo pipefail`, so any command that is not
# explicitly guarded with `|| true` aborts the script and fails the task.
- name: Check Slurm controller health
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Read-only report; changed_when is forced to false so the play never
    # shows up as "changed".
    - name: Check controller services and cluster state
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### controller services"
        # munge and slurmctld are mandatory: an inactive unit fails the task.
        systemctl is-active munge
        systemctl is-active slurmctld
        # slurmdbd and mariadb are reported but tolerated when inactive.
        systemctl is-active slurmdbd || true
        systemctl is-active mariadb || true
        echo
        echo "### slurm ping"
        scontrol ping
        echo
        echo "### nodes"
        sinfo -N
        echo
        echo "### partitions"
        sinfo
        echo
        echo "### queue"
        squeue
        echo
        echo "### problematic nodes"
        # Column 2 is the extended node state (%T). A trailing "*" marks a
        # node that is not responding, so "idle*", "allocated*" and "mixed*"
        # must be listed even though they contain a healthy-looking state name.
        sinfo -N -h -o "%N %T %E" \
          | awk '$2 !~ /idle|alloc|mix/ || $2 ~ /\*$/ {print}' || true
        echo
        echo "### accounting"
        sacctmgr -n list cluster || true
        echo
        echo "### recent failed jobs"
        sacct -S today --state=FAILED,CANCELLED,TIMEOUT,NODE_FAIL,OUT_OF_MEMORY \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,NodeList | tail -30 || true
      args:
        # pipefail is a bash feature, so the script must not run under /bin/sh.
        executable: /bin/bash
      register: controller_health
      changed_when: false

    - name: Print controller health
      ansible.builtin.debug:
        var: controller_health.stdout_lines
# Play: per-node health report for compute and GPU workers.
# The script runs under `set -euo pipefail`; commands without `|| true`
# are hard requirements and fail the task for that host when they fail.
# NOTE(review): facts are gathered here although no visible task reads them;
# confirm whether gather_facts is needed.
- name: Check Slurm worker health
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: true
  tasks:
    - name: Check worker services, config and connectivity
      ansible.builtin.shell: |
        set -euo pipefail
        echo "HOST=$(hostname)"
        echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
        echo "KERNEL=$(uname -r)"
        echo "UPTIME=$(uptime -p)"
        echo
        echo "### services"
        systemctl is-active munge
        systemctl is-active slurmd
        echo
        echo "### munge local test"
        # Round-trip a credential locally; a decode failure aborts the script.
        munge -n | unmunge >/dev/null
        echo "munge OK"
        echo
        echo "### controller connectivity"
        # NOTE(review): the controller host name is hard-coded here.
        getent hosts slurm-ctl01 || true
        scontrol ping
        echo
        echo "### slurmd listener"
        ss -lntp | grep ':6818 ' || true
        echo
        echo "### config checksums"
        sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true
        echo
        echo "### shared filesystem"
        test -d /shared
        # Write probe: create, list and remove a per-host marker file.
        # The EXIT trap removes the marker even when a later command fails
        # under `set -e`, so aborted runs do not leave files on /shared.
        marker="/shared/.slurm-health-$(hostname)"
        trap 'rm -f "$marker"' EXIT
        touch "$marker"
        ls -l "$marker"
        rm -f "$marker"
        echo
        echo "### cgroup"
        mount | grep cgroup || true
        echo
        echo "### gpu check"
        # Nodes without nvidia-smi print a marker line instead of failing.
        if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi \
            --query-gpu=index,name,driver_version,memory.total,temperature.gpu,utilization.gpu \
            --format=csv,noheader || true
        else
          echo "NO_NVIDIA_SMI"
        fi
      args:
        # pipefail and trap ... EXIT rely on bash semantics.
        executable: /bin/bash
      register: worker_health
      # The marker file is created and removed within the same script, so the
      # task is reported as unchanged.
      changed_when: false

    - name: Print worker health
      ansible.builtin.debug:
        var: worker_health.stdout_lines
# Play: summarise node state as reported by Slurm on the controller and dump
# full `scontrol show node` details for every node in a problem state.
- name: Check Slurm-reported node state consistency
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Build Slurm node health summary
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### node summary"
        sinfo -N -o "%N %P %T %C %m %G %E"
        echo
        echo "### full problematic node details"
        # Column 2 is the extended node state (%T). "\*$" matches a literal
        # trailing asterisk, the suffix sinfo uses for non-responding nodes.
        # The earlier pattern "idle\\*" reached awk as "idle" followed by zero
        # or more backslashes, which matched every healthy idle node.
        # Assigning the list first lets a failing pipeline abort the script
        # (set -e + pipefail) instead of silently producing an empty list.
        problem_nodes=$(sinfo -N -h -o "%N %T" \
          | awk '$2 ~ /down|drain|fail|unk|not_responding|\*$/ {print $1}' \
          | sort -u)
        # sinfo -N prints one line per node/partition pair; sort -u above
        # collapses duplicates so each node is shown once.
        for node in $problem_nodes; do
          echo
          echo "### $node"
          scontrol show node "$node"
        done
      args:
        # pipefail is a bash feature, so the script must not run under /bin/sh.
        executable: /bin/bash
      register: slurm_node_summary
      changed_when: false

    - name: Print Slurm node summary
      ansible.builtin.debug:
        var: slurm_node_summary.stdout_lines