---
# Slurm cluster health-check playbook.
#
# Three read-only plays:
#   1. Controller: service status, scontrol ping, node/partition/queue
#      listings, accounting and today's failed jobs.
#   2. Workers (slurm_compute and slurm_gpu groups): local services, munge
#      round trip, controller reachability, config checksums, a write probe
#      on /shared, cgroup mounts and an optional GPU query.
#   3. Controller: per-node detail for every node whose reported state looks
#      unhealthy.
#
# Every script runs under `set -euo pipefail`, so any command that is not
# explicitly suffixed with `|| true` aborts the task on failure; commands
# with `|| true` are best-effort and only contribute output.
# All tasks set `changed_when: false` because they only report state.

- name: Check Slurm controller health
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # munge and slurmctld must be active (a failure aborts the task);
    # slurmdbd and mariadb are reported but tolerated when inactive.
    #
    # "problematic nodes" prints every node whose state is not an
    # idle/alloc/mix variant, plus any node whose state carries a `*`
    # (sinfo's not-responding marker, e.g. `idle*`), which the plain
    # idle|alloc|mix match would otherwise hide. `[*]` is used instead of an
    # escaped asterisk so the pattern does not depend on backslash handling.
    - name: Check controller services and cluster state
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### controller services"
        systemctl is-active munge
        systemctl is-active slurmctld
        systemctl is-active slurmdbd || true
        systemctl is-active mariadb || true
        echo
        echo "### slurm ping"
        scontrol ping
        echo
        echo "### nodes"
        sinfo -N
        echo
        echo "### partitions"
        sinfo
        echo
        echo "### queue"
        squeue
        echo
        echo "### problematic nodes"
        sinfo -N -h -o "%N %T %E" | awk '$2 !~ /idle|alloc|mix/ || $2 ~ /[*]/ {print}' || true
        echo
        echo "### accounting"
        sacctmgr -n list cluster || true
        echo
        echo "### recent failed jobs"
        sacct -S today --state=FAILED,CANCELLED,TIMEOUT,NODE_FAIL,OUT_OF_MEMORY \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,NodeList | tail -30 || true
      args:
        executable: /bin/bash
      register: controller_health
      changed_when: false

    - name: Print controller health
      ansible.builtin.debug:
        var: controller_health.stdout_lines

- name: Check Slurm worker health
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: true
  tasks:
    # Hard requirements (task fails if any of them fails): munge and slurmd
    # active, a local munge encode/decode round trip, `scontrol ping`, and
    # being able to create, list and remove a file under /shared.
    # Everything suffixed with `|| true` is informational only.
    #
    # The /shared probe file name is computed once and quoted so the same
    # path is used for touch, ls and rm.
    - name: Check worker services, config and connectivity
      ansible.builtin.shell: |
        set -euo pipefail
        echo "HOST=$(hostname)"
        echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
        echo "KERNEL=$(uname -r)"
        echo "UPTIME=$(uptime -p)"
        echo
        echo "### services"
        systemctl is-active munge
        systemctl is-active slurmd
        echo
        echo "### munge local test"
        munge -n | unmunge >/dev/null
        echo "munge OK"
        echo
        echo "### controller connectivity"
        getent hosts slurm-ctl01 || true
        scontrol ping
        echo
        echo "### slurmd listener"
        ss -lntp | grep ':6818 ' || true
        echo
        echo "### config checksums"
        sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true
        echo
        echo "### shared filesystem"
        test -d /shared
        probe_file="/shared/.slurm-health-$(hostname)"
        touch "$probe_file"
        ls -l "$probe_file"
        rm -f "$probe_file"
        echo
        echo "### cgroup"
        mount | grep cgroup || true
        echo
        echo "### gpu check"
        if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi \
            --query-gpu=index,name,driver_version,memory.total,temperature.gpu,utilization.gpu \
            --format=csv,noheader || true
        else
          echo "NO_NVIDIA_SMI"
        fi
      args:
        executable: /bin/bash
      register: worker_health
      changed_when: false

    - name: Print worker health
      ansible.builtin.debug:
        var: worker_health.stdout_lines

- name: Check Slurm-reported node state consistency
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Lists all nodes, then runs `scontrol show node` for each node whose
    # state matches down/drain/fail/unk/not_responding or carries the `*`
    # not-responding suffix.
    #
    # The `*` is matched with `[*]`: inside this literal block scalar an
    # escaped form such as `idle\\*` reaches awk as "idle followed by zero or
    # more backslashes", which matches every healthy idle node.
    #
    # The node list is assigned to a variable first so that a failing sinfo
    # pipeline aborts the task instead of silently yielding an empty loop.
    - name: Build Slurm node health summary
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### node summary"
        sinfo -N -o "%N %P %T %C %m %G %E"
        echo
        echo "### full problematic node details"
        problem_nodes=$(sinfo -N -h -o "%N %T" \
          | awk '$2 ~ /down|drain|fail|unk|not_responding|[*]/ {print $1}' \
          | sort -u)
        for node in $problem_nodes; do
          echo
          echo "### $node"
          scontrol show node "$node"
        done
      args:
        executable: /bin/bash
      register: slurm_node_summary
      changed_when: false

    - name: Print Slurm node summary
      ansible.builtin.debug:
        var: slurm_node_summary.stdout_lines