Add Slurm AI/HPC cluster platform project
This commit is contained in:
+178
@@ -0,0 +1,178 @@
|
||||
---
# Play that checks the Slurm accounting stack on the controller host group.
- name: Validate Slurm accounting production-like setup
  hosts: slurm_controller
  # Fact gathering is switched off for this play.
  gather_facts: false
  # Every task runs with privilege escalation.
  become: true

  tasks:
|
||||
- name: Validate accounting services
  # Fails when any of the three units is not active, or when no listening
  # TCP socket line reported by ss contains ":6819 ".
  ansible.builtin.shell: |
    set -euo pipefail

    printf '%s\n' '### services'
    # errexit stops at the first unit that is not active.
    for unit in mariadb slurmdbd slurmctld; do
      systemctl is-active "$unit"
    done

    printf '\n%s\n' '### slurmdbd listener'
    ss -lntp | grep ':6819 '
  args:
    executable: /bin/bash
  changed_when: false
  register: service_check
|
||||
|
||||
- name: Validate Slurm accounting runtime config
  # Prints selected settings from the running controller configuration.
  ansible.builtin.shell: |
    set -euo pipefail

    # Print the "scontrol show config" lines matching the extended regex in $1.
    # A scontrol error or an empty match fails the task (errexit + pipefail).
    show_settings() {
      scontrol show config | grep -E "$1"
    }

    printf '%s\n' '### accounting config'
    show_settings 'AccountingStorage|JobAcctGather|ClusterName'

    printf '\n%s\n' '### priority / select / cgroup config'
    show_settings 'SelectType|TaskPlugin|ProctrackType'
  args:
    executable: /bin/bash
  changed_when: false
  register: config_check
|
||||
|
||||
- name: Validate sacctmgr entities
  # Lists clusters, accounts, users and associations known to accounting.
  # Any sacctmgr call that exits non-zero fails the task.
  ansible.builtin.shell: |
    set -euo pipefail

    printf '%s\n' '### clusters'
    sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC

    printf '\n%s\n' '### accounts'
    sacctmgr list account format=Account,Descr,Org

    printf '\n%s\n' '### users'
    sacctmgr list user format=User,DefaultAccount,Admin

    printf '\n%s\n' '### associations'
    sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
  args:
    executable: /bin/bash
  changed_when: false
  register: entity_check
|
||||
|
||||
- name: Submit accounting validation job
  # Submits a short batch job as slurmuser, waits for it to finish, then
  # requires that accounting reports it as COMPLETED and that its output file
  # can be read. Reported as changed because a job is always submitted.
  ansible.builtin.shell: |
    set -euo pipefail

    # The here-doc delimiter is quoted, so the job script reaches sbatch
    # verbatim; its $(...) and $VAR expansions are evaluated when the job runs.
    job_id="$(
    sudo -iu slurmuser sbatch --parsable <<'SBATCH'
    #!/bin/bash
    #SBATCH --job-name=acct-prodlike-test
    #SBATCH --partition=debug
    #SBATCH --cpus-per-task=1
    #SBATCH --mem=256M
    #SBATCH --time=00:02:00
    #SBATCH --output=/shared/acct-prodlike-test-%j.out

    echo "HOST=$(hostname)"
    echo "USER=$(whoami)"
    echo "SLURM_JOB_ID=$SLURM_JOB_ID"
    echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
    echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
    date
    SBATCH
    )"

    # --parsable prints "jobid" or "jobid;cluster"; keep only the job id so it
    # is usable with -j and in the output file name.
    job_id="${job_id%%;*}"
    echo "JOB_ID=$job_id"

    # Poll once per second, at most 90 times, until the job is no longer
    # active. Only the state column is inspected, so both a finished job that
    # squeue still lists and one that is no longer known (empty output or a
    # squeue error) end the wait.
    finished=false
    state=""
    for ((attempt = 1; attempt <= 90; attempt++)); do
      state="$(squeue -h -j "$job_id" -o '%T' 2>/dev/null || true)"
      case "$state" in
        '' | COMPLETED | FAILED | CANCELLED | TIMEOUT | NODE_FAIL | OUT_OF_MEMORY | BOOT_FAIL | DEADLINE | PREEMPTED)
          finished=true
          break
          ;;
      esac
      echo "waiting for job $job_id (state=$state, poll $attempt/90)"
      sleep 1
    done

    # A timeout is an explicit failure instead of silently falling through.
    if [[ "$finished" != true ]]; then
      echo "job $job_id still active after 90 polls (last state: $state)" >&2
      exit 1
    fi

    # NOTE(review): the accounting record may lag behind the controller for a
    # moment, so retry briefly while sacct shows no final state yet.
    acct_state=""
    for ((attempt = 1; attempt <= 15; attempt++)); do
      # -X: allocation only, -n: no header, -P: unpadded parsable output.
      acct_state="$(sacct -j "$job_id" -X -n -P --format=State)"
      case "$acct_state" in
        '' | PENDING | RUNNING | COMPLETING) sleep 1 ;;
        *) break ;;
      esac
    done

    # Print the accounting table before judging it, so it is part of the
    # task output even when the check below fails.
    echo "### sacct"
    sacct -j "$job_id" --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

    if [[ "$acct_state" != COMPLETED ]]; then
      echo "job $job_id has accounting state '${acct_state:-<none>}', expected COMPLETED" >&2
      exit 1
    fi

    # NOTE(review): /shared is presumably a shared filesystem; allow a few
    # seconds for the file written by the job to become visible on this host.
    out_file="/shared/acct-prodlike-test-${job_id}.out"
    for ((attempt = 1; attempt <= 10; attempt++)); do
      if [[ -r "$out_file" ]]; then
        break
      fi
      sleep 1
    done

    echo "### output"
    cat "$out_file"
  args:
    executable: /bin/bash
  register: acct_job
  changed_when: true
|
||||
|
||||
- name: Validate sacct can read recent jobs
  # Confirms that sacct can query accounting; fails if sacct exits non-zero.
  ansible.builtin.shell: |
    set -euo pipefail

    # Columns requested for every job record.
    fields=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

    printf '%s\n' '### recent jobs'
    # Query with start time "today" and keep only the last 30 output lines.
    sacct -S today --format="$fields" | tail -n 30
  args:
    executable: /bin/bash
  changed_when: false
  register: sacct_recent
|
||||
|
||||
- name: Validate sreport commands
  # Runs three sreport queries and collects whatever they print.
  ansible.builtin.shell: |
    set -euo pipefail

    # Run one sreport query with start=today. A non-zero exit is swallowed
    # (|| true), so an sreport error never fails this task.
    try_report() {
      sreport "$@" start=today || true
    }

    printf '%s\n' '### cluster utilization'
    try_report cluster utilization

    printf '\n%s\n' '### account utilization by user'
    try_report cluster AccountUtilizationByUser

    printf '\n%s\n' '### user top'
    try_report user top
  args:
    executable: /bin/bash
  changed_when: false
  register: sreport_check
|
||||
|
||||
- name: Validate MariaDB table health summary
  # Queries the accounting schema named by the slurmdbd_storage_loc variable.
  ansible.builtin.shell: |
    set -euo pipefail

    # Run one SQL statement in batch mode without column headers.
    query() {
      mysql --skip-column-names --batch --execute="$1"
    }

    printf '%s\n' '### database exists'
    # NOTE(review): an absent database presumably yields empty output with
    # exit status 0, so this line alone does not fail the task - confirm.
    query "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';"

    printf '\n%s\n' '### table count'
    query "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ slurmdbd_storage_loc }}';"

    printf '\n%s\n' '### largest tables'
    # Ten tables with the highest table_rows value reported by
    # information_schema.
    query "
    SELECT table_name, table_rows
    FROM information_schema.tables
    WHERE table_schema='{{ slurmdbd_storage_loc }}'
    ORDER BY table_rows DESC
    LIMIT 10;
    "
  args:
    executable: /bin/bash
  changed_when: false
  register: db_health
|
||||
|
||||
- name: Print accounting validation
  # Shows the captured stdout of every registered check, each under its own
  # heading, in the order the checks ran.
  ansible.builtin.debug:
    msg:
      - '### services'
      - '{{ service_check.stdout_lines }}'
      - '### runtime config'
      - '{{ config_check.stdout_lines }}'
      - '### accounting entities'
      - '{{ entity_check.stdout_lines }}'
      - '### accounting validation job'
      - '{{ acct_job.stdout_lines }}'
      - '### recent sacct data'
      - '{{ sacct_recent.stdout_lines }}'
      - '### sreport'
      - '{{ sreport_check.stdout_lines }}'
      - '### database health'
      - '{{ db_health.stdout_lines }}'
|
||||
Reference in New Issue
Block a user