This commit is contained in:
+40
@@ -0,0 +1,40 @@
|
||||
---
# Read-only discovery play: on every host in the `slurm_cluster` group it
# collects CPU topology, total memory and (when available) the NVIDIA GPU
# inventory, then prints the results so they can be used when writing node
# definitions for the Slurm configuration. No task changes the target hosts.
- name: Discover node resources for Slurm config
  hosts: slurm_cluster
  become: true
  gather_facts: true

  tasks:
    # Prints one KEY=VALUE line per resource.
    #
    # The script is a YAML literal block (`|`), so backslashes reach bash
    # unchanged, and the awk programs are wrapped in shell single quotes, so
    # bash passes them to awk unchanged as well. They therefore need exactly
    # one level of escaping: `\(` / `\)` for the literal parentheses in the
    # lscpu labels, and a plain `""` for the empty replacement string.
    # Doubling the backslashes (`\\(s\\)`, `\"\"`) makes awk look for literal
    # backslashes and turns `\"` into a syntax error, which leaves the three
    # topology values empty while the task still reports success.
    - name: Discover CPU and memory
      ansible.builtin.shell: |
        set -euo pipefail
        echo "HOST={{ inventory_hostname }}"
        echo "CPUS=$(nproc)"
        echo "REAL_MEMORY_MB=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo)"
        echo "SOCKETS=$(lscpu | awk -F: '/Socket\(s\)/ {gsub(/ /,"",$2); print $2}')"
        echo "CORES_PER_SOCKET=$(lscpu | awk -F: '/Core\(s\) per socket/ {gsub(/ /,"",$2); print $2}')"
        echo "THREADS_PER_CORE=$(lscpu | awk -F: '/Thread\(s\) per core/ {gsub(/ /,"",$2); print $2}')"
      args:
        # bash is required for `set -o pipefail`.
        executable: /bin/bash
      register: cpu_mem
      # Pure query: never report the host as changed.
      changed_when: false

    # Prints one CSV line (index, name, total memory) per GPU reported by
    # nvidia-smi, or the marker NO_NVIDIA_SMI when the tool is not on PATH.
    # Because of `set -e`, a present-but-failing nvidia-smi fails the task.
    - name: Discover NVIDIA GPU if present
      ansible.builtin.shell: |
        set -euo pipefail
        if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader
        else
          echo "NO_NVIDIA_SMI"
        fi
      args:
        executable: /bin/bash
      register: gpu_info
      changed_when: false

    # Prints the raw output lines of both discovery tasks for each host.
    - name: Show discovered resources
      ansible.builtin.debug:
        msg:
          - "{{ cpu_mem.stdout_lines }}"
          - "GPU:"
          - "{{ gpu_info.stdout_lines }}"
|
||||
@@ -0,0 +1,89 @@
|
||||
---
# Read-only inspection play for the `slurm_cluster` group. Each task runs a
# small bash script, stores its output in a registered variable, and always
# reports "unchanged"; the final task prints everything as one report per
# host. Most probes end in `|| true` so a missing tool or service does not
# fail the play.
- name: Inspect current Slurm and Munge state
  hosts: slurm_cluster
  become: true
  gather_facts: true

  tasks:
    # Host identity: FQDN (falling back to the plain hostname), short name,
    # IP addresses, OS description and kernel release.
    - name: Basic host info
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -e
          echo "HOST=$(hostname -f 2>/dev/null || hostname)"
          echo "SHORT_HOST=$(hostname -s)"
          echo "IP_ADDRESSES=$(hostname -I)"
          echo "OS=$(lsb_release -ds 2>/dev/null || cat /etc/os-release | grep PRETTY_NAME || true)"
          echo "KERNEL=$(uname -r)"
      changed_when: false
      register: host_info

    # Lists dpkg package entries whose line mentions slurm or munge
    # (case-insensitive); produces no output, and no error, when none match.
    - name: Slurm package info
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          dpkg -l | grep -Ei 'slurm|munge' || true
      changed_when: false
      register: package_info

    # For each candidate config directory, prints a "### <path>" header
    # followed by either MISSING or a sorted listing of the regular files up
    # to two levels deep, as "<mode> <user> <group> <path>".
    - name: Slurm config paths
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -e
          for p in /etc/slurm /etc/slurm-llnl /etc/munge; do
            echo "### $p"
            if [ -e "$p" ]; then
              find "$p" -maxdepth 2 -type f -printf "%m %u %g %p\n" | sort
            else
              echo "MISSING"
            fi
          done
      changed_when: false
      register: config_paths

    # For munge, slurmctld and slurmd, prints a "### <unit>" header and then
    # whatever `systemctl is-enabled` / `is-active` write to stdout; their
    # error output is discarded and their exit codes are ignored.
    - name: Service state
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          for s in munge slurmctld slurmd; do
            echo "### $s"
            systemctl is-enabled "$s" 2>/dev/null || true
            systemctl is-active "$s" 2>/dev/null || true
          done
      changed_when: false
      register: service_state

    # Shows where the Slurm/Munge client tools resolve on PATH, then the
    # combined stdout/stderr of `sinfo` and `scontrol ping`; failures of any
    # of these commands are tolerated.
    - name: Slurm commands
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          echo "### which"
          command -v sinfo || true
          command -v scontrol || true
          command -v sbatch || true
          command -v srun || true
          command -v munge || true
          command -v unmunge || true

          echo "### sinfo"
          sinfo 2>&1 || true

          echo "### scontrol ping"
          scontrol ping 2>&1 || true
      changed_when: false
      register: slurm_commands

    # Per-host report: a banner line for each section followed by the output
    # lines captured by the matching task above.
    - name: Show inspection report
      ansible.builtin.debug:
        msg:
          - "===== {{ inventory_hostname }} :: host_info ====="
          - "{{ host_info.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: packages ====="
          - "{{ package_info.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: config_paths ====="
          - "{{ config_paths.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: services ====="
          - "{{ service_state.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: slurm_commands ====="
          - "{{ slurm_commands.stdout_lines }}"
|
||||
Reference in New Issue
Block a user