Add Slurm AI/HPC cluster platform project
lint / shell-yaml-ansible (push) Failing after 47s

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit d300d490f5
49 changed files with 4777 additions and 0 deletions
@@ -0,0 +1,40 @@
---
# Read-only discovery play: prints CPU, memory and NVIDIA GPU details for
# every host in the slurm_cluster group. Nothing on the hosts is modified
# (both shell tasks are marked changed_when: false).
- name: Discover node resources for Slurm config
  hosts: slurm_cluster
  become: true
  gather_facts: true
  tasks:
    # Emits KEY=VALUE lines (HOST, CPUS, REAL_MEMORY_MB, SOCKETS,
    # CORES_PER_SOCKET, THREADS_PER_CORE) on stdout.
    - name: Discover CPU and memory
      ansible.builtin.shell: |
        set -euo pipefail
        # Print the value of the lscpu row whose label contains "$1".
        # A plain substring match (index) is used instead of a regex so the
        # parentheses in labels such as "Socket(s)" need no escaping, and
        # rows that lscpu indents are still found.
        # LC_ALL=C keeps the labels in English regardless of host locale.
        lscpu_field() {
          LC_ALL=C lscpu |
            awk -F: -v label="$1" \
              'index($1, label) { gsub(/ /, "", $2); print $2 }'
        }
        echo "HOST={{ inventory_hostname }}"
        echo "CPUS=$(nproc)"
        echo "REAL_MEMORY_MB=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo)"
        echo "SOCKETS=$(lscpu_field 'Socket(s)')"
        echo "CORES_PER_SOCKET=$(lscpu_field 'Core(s) per socket')"
        echo "THREADS_PER_CORE=$(lscpu_field 'Thread(s) per core')"
      args:
        executable: /bin/bash
      register: cpu_mem
      changed_when: false

    # Prints one CSV row (index, name, memory.total) per GPU, or the marker
    # NO_NVIDIA_SMI when the nvidia-smi binary is not on PATH.
    - name: Discover NVIDIA GPU if present
      ansible.builtin.shell: |
        set -euo pipefail
        if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader
        else
          echo "NO_NVIDIA_SMI"
        fi
      args:
        executable: /bin/bash
      register: gpu_info
      changed_when: false

    # Dumps the captured output of both discovery tasks per host.
    - name: Show discovered resources
      ansible.builtin.debug:
        msg:
          - "{{ cpu_mem.stdout_lines }}"
          - "GPU:"
          - "{{ gpu_info.stdout_lines }}"
@@ -0,0 +1,89 @@
---
# Read-only inspection play: collects host, package, config-file, service
# and CLI information related to Slurm and Munge from every host in the
# slurm_cluster group and prints it as a per-host report. Every shell task
# is marked changed_when: false.
- name: Inspect current Slurm and Munge state
  hosts: slurm_cluster
  become: true
  gather_facts: true
  tasks:
    # Emits KEY=VALUE lines describing the host.
    - name: Basic host info
      ansible.builtin.shell: |
        set -e
        echo "HOST=$(hostname -f 2>/dev/null || hostname)"
        echo "SHORT_HOST=$(hostname -s)"
        echo "IP_ADDRESSES=$(hostname -I)"
        # Fall back to the PRETTY_NAME line of os-release when lsb_release
        # is unavailable; grep reads the file directly (no cat | grep pipe).
        echo "OS=$(lsb_release -ds 2>/dev/null || grep PRETTY_NAME /etc/os-release || true)"
        echo "KERNEL=$(uname -r)"
      args:
        executable: /bin/bash
      register: host_info
      changed_when: false

    # Lists installed packages whose dpkg entry mentions slurm or munge.
    # "|| true" keeps the task green when nothing matches; pipefail is set
    # so the pipeline is explicit about which failures are being ignored.
    - name: Slurm package info
      ansible.builtin.shell: |
        set -o pipefail
        dpkg -l | grep -Ei 'slurm|munge' || true
      args:
        executable: /bin/bash
      register: package_info
      changed_when: false

    # For each candidate config directory prints "mode user group path" for
    # files up to two levels deep, or MISSING when the path does not exist.
    - name: Slurm config paths
      ansible.builtin.shell: |
        set -euo pipefail
        for p in /etc/slurm /etc/slurm-llnl /etc/munge; do
          echo "### $p"
          if [ -e "$p" ]; then
            # With pipefail a find error is no longer hidden by sort.
            find "$p" -maxdepth 2 -type f -printf "%m %u %g %p\n" | sort
          else
            echo "MISSING"
          fi
        done
      args:
        executable: /bin/bash
      register: config_paths
      changed_when: false

    # Prints the is-enabled and is-active answers for each unit; errors from
    # systemctl (e.g. unknown unit) are deliberately ignored.
    - name: Service state
      ansible.builtin.shell: |
        for s in munge slurmctld slurmd; do
          echo "### $s"
          systemctl is-enabled "$s" 2>/dev/null || true
          systemctl is-active "$s" 2>/dev/null || true
        done
      args:
        executable: /bin/bash
      register: service_state
      changed_when: false

    # Shows where the Slurm/Munge client binaries resolve on PATH, then the
    # output of sinfo and scontrol ping. All failures are tolerated so the
    # report is still produced on hosts without a working Slurm install.
    - name: Slurm commands
      ansible.builtin.shell: |
        echo "### which"
        for c in sinfo scontrol sbatch srun munge unmunge; do
          command -v "$c" || true
        done
        echo "### sinfo"
        sinfo 2>&1 || true
        echo "### scontrol ping"
        scontrol ping 2>&1 || true
      args:
        executable: /bin/bash
      register: slurm_commands
      changed_when: false

    # Prints every captured section under a header naming the host.
    - name: Show inspection report
      ansible.builtin.debug:
        msg:
          - "===== {{ inventory_hostname }} :: host_info ====="
          - "{{ host_info.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: packages ====="
          - "{{ package_info.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: config_paths ====="
          - "{{ config_paths.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: services ====="
          - "{{ service_state.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: slurm_commands ====="
          - "{{ slurm_commands.stdout_lines }}"