Add Slurm AI/HPC cluster platform project
This commit is contained in:
@@ -0,0 +1,90 @@
|
||||
---
# Dumps the SlurmDBD accounting database on the controller with mysqldump,
# sanity-checks the compressed dump, and fetches a copy to the Ansible
# controller.
#
# Expects `slurmdbd_storage_loc` (the database name) from inventory/vars.
- name: Backup SlurmDBD MariaDB database
  hosts: slurm_controller
  become: true
  gather_facts: true

  vars:
    slurmdbd_backup_dir: /var/backups/slurmdbd
    local_fetch_dir: "{{ playbook_dir }}/../../artifacts/backups/slurmdbd"

  tasks:
    - name: Create remote backup directory
      ansible.builtin.file:
        path: "{{ slurmdbd_backup_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0700"

    # Runs on localhost with become disabled, so ownership is left to the
    # invoking user: forcing owner/group root would fail for any non-root
    # operator.
    - name: Create local fetch directory on Ansible controller
      ansible.builtin.file:
        path: "{{ local_fetch_dir }}"
        state: directory
        mode: "0700"
      delegate_to: localhost
      become: false

    # `systemctl is-active` exits non-zero when the unit is not active, which
    # fails the task and stops the backup early.
    - name: Validate MariaDB is running
      ansible.builtin.command:
        cmd: systemctl is-active mariadb
      changed_when: false

    - name: Validate SlurmDBD is running
      ansible.builtin.command:
        cmd: systemctl is-active slurmdbd
      changed_when: false

    - name: Validate Slurm accounting database exists
      ansible.builtin.shell: |
        set -euo pipefail
        mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';" | grep -qx "{{ slurmdbd_storage_loc }}"
      args:
        executable: /bin/bash
      changed_when: false

    # The dump is written to "<name>.partial" and renamed only after
    # mysqldump and gzip both succeeded, so a failed run never leaves a
    # truncated "*.sql.gz" behind. The final path is printed on stdout and
    # consumed by the following tasks through `db_dump.stdout`.
    - name: Dump Slurm accounting database
      ansible.builtin.shell: |
        set -euo pipefail
        umask 077

        ts="$(date +%F-%H%M%S)"
        out="{{ slurmdbd_backup_dir }}/{{ slurmdbd_storage_loc }}-${ts}.sql.gz"
        tmp="${out}.partial"
        trap 'rm -f "$tmp"' EXIT

        mysqldump \
          --single-transaction \
          --routines \
          --events \
          --triggers \
          {{ slurmdbd_storage_loc | quote }} | gzip -9 > "$tmp"

        mv "$tmp" "$out"
        chmod 0600 "$out"
        echo "$out"
      args:
        executable: /bin/bash
      register: db_dump
      changed_when: true

    - name: Validate backup file is non-empty
      ansible.builtin.stat:
        path: "{{ db_dump.stdout }}"
      register: backup_file

    # Anything under 1 KiB is treated as an empty/failed dump. The `exists`
    # guard keeps the condition from touching `stat.size` on a missing file.
    - name: Fail if backup file is empty
      ansible.builtin.fail:
        msg: "Backup file is missing or empty: {{ db_dump.stdout }}"
      when: not backup_file.stat.exists or backup_file.stat.size | int < 1024

    - name: Fetch DB backup to Ansible controller
      ansible.builtin.fetch:
        src: "{{ db_dump.stdout }}"
        dest: "{{ local_fetch_dir }}/"
        flat: true

    - name: Show DB backup result
      ansible.builtin.debug:
        msg:
          - "Remote backup: {{ db_dump.stdout }}"
          - "Backup size bytes: {{ backup_file.stat.size }}"
          - "Fetched to: {{ local_fetch_dir }}/"
|
||||
+126
@@ -0,0 +1,126 @@
|
||||
---
# Creates the cluster, the default account and the `slurmuser` association
# in the Slurm accounting database via sacctmgr, printing the accounting
# state before and after.
#
# Expects from inventory/vars: slurm_cluster_name, slurm_account_name,
# slurm_account_description, slurm_account_organization.
- name: Initialize Slurm accounting entities
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Retries for up to ~40 s (20 x 2 s) until sacctmgr can list clusters.
    - name: Wait for sacctmgr connectivity
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      register: sacctmgr_cluster_list
      retries: 20
      delay: 2
      until: sacctmgr_cluster_list.rc == 0
      changed_when: false

    - name: Show current accounting state before changes
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC

        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org

        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin

        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: accounting_state_before
      changed_when: false

    - name: Print current accounting state before changes
      ansible.builtin.debug:
        var: accounting_state_before.stdout_lines

    # The existence checks below use `-P` (parsable, "|"-delimited) output.
    # The default table output pads and truncates each column to a fixed
    # width, so a long cluster/account name would never compare equal and
    # the "add" branch would run on every play.
    - name: Ensure Slurm cluster exists in accounting DB
      ansible.builtin.shell: |
        set -euo pipefail

        clusters="$(sacctmgr -nP list cluster format=Cluster)"
        if grep -Fqx -- {{ slurm_cluster_name | quote }} <<<"$clusters"; then
          echo "Cluster {{ slurm_cluster_name }} already exists"
        else
          sacctmgr -i add cluster {{ slurm_cluster_name | quote }}
        fi
      args:
        executable: /bin/bash
      register: ensure_cluster
      changed_when: "'Adding Cluster' in ensure_cluster.stdout"

    # An association row with an empty User field is the account-level
    # association for that cluster.
    - name: Ensure default lab account exists for cluster
      ansible.builtin.shell: |
        set -euo pipefail

        if sacctmgr -nP list assoc format=Cluster,Account,User \
          | awk -F'|' '$1=="{{ slurm_cluster_name }}" && $2=="{{ slurm_account_name }}" && $3=="" {found=1} END {exit !found}'; then
          echo "Account {{ slurm_account_name }} already associated with cluster {{ slurm_cluster_name }}"
        else
          sacctmgr -i add account {{ slurm_account_name | quote }} \
            Cluster={{ slurm_cluster_name | quote }} \
            Description={{ slurm_account_description | quote }} \
            Organization={{ slurm_account_organization | quote }}
        fi
      args:
        executable: /bin/bash
      register: ensure_account
      changed_when: "'Adding Account' in ensure_account.stdout"

    - name: Ensure slurmuser exists with lab account association
      ansible.builtin.shell: |
        set -euo pipefail

        if sacctmgr -nP list assoc format=Cluster,Account,User \
          | awk -F'|' '$1=="{{ slurm_cluster_name }}" && $2=="{{ slurm_account_name }}" && $3=="slurmuser" {found=1} END {exit !found}'; then
          echo "User slurmuser already associated with account {{ slurm_account_name }} on cluster {{ slurm_cluster_name }}"
        else
          sacctmgr -i add user slurmuser \
            Cluster={{ slurm_cluster_name | quote }} \
            Account={{ slurm_account_name | quote }} \
            DefaultAccount={{ slurm_account_name | quote }}
        fi
      args:
        executable: /bin/bash
      register: ensure_user_assoc
      changed_when: "'Adding User' in ensure_user_assoc.stdout"

    # NOTE(review): this runs unconditionally; confirm sacctmgr's exit code
    # when the default account is already set, since `set -e` would turn a
    # non-zero "nothing modified" result into a task failure.
    - name: Ensure slurmuser has default account set
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser set DefaultAccount={{ slurm_account_name | quote }}
      args:
        executable: /bin/bash
      register: set_default_account
      changed_when: "'Modified user' in (set_default_account.stdout + set_default_account.stderr)"

    - name: Show final accounting state
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC

        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org

        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin

        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: accounting_state_after
      changed_when: false

    - name: Print final accounting state
      ansible.builtin.debug:
        var: accounting_state_after.stdout_lines
|
||||
+98
@@ -0,0 +1,98 @@
|
||||
---
# Imports the newest SlurmDBD dump into a scratch database
# ("<slurmdbd_storage_loc>_restorecheck") and reports table/row counts, to
# prove the backup is actually restorable.
#
# Expects `slurmdbd_storage_loc` (the database name) from inventory/vars.
- name: Restore-check latest SlurmDBD backup into test database
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    restore_check_db: "{{ slurmdbd_storage_loc }}_restorecheck"
    slurmdbd_backup_dir: /var/backups/slurmdbd

  tasks:
    - name: Validate MariaDB is running
      ansible.builtin.command:
        cmd: systemctl is-active mariadb
      changed_when: false

    # Prints the newest matching dump (by mtime), or nothing at all when no
    # dump exists. With nullglob an unmatched pattern expands to an empty
    # list instead of making `ls` fail, so the explicit failure task below
    # can report the problem clearly.
    - name: Find latest SlurmDBD backup
      ansible.builtin.shell: |
        set -euo pipefail
        shopt -s nullglob
        candidates=( {{ slurmdbd_backup_dir | quote }}/{{ slurmdbd_storage_loc | quote }}-*.sql.gz )
        if (( ${#candidates[@]} == 0 )); then
          exit 0
        fi
        ls -1t -- "${candidates[@]}" | head -n 1
      args:
        executable: /bin/bash
      register: latest_backup
      changed_when: false

    - name: Fail if no SlurmDBD backup was found
      ansible.builtin.fail:
        msg: "No SlurmDBD backup matching {{ slurmdbd_storage_loc }}-*.sql.gz found in {{ slurmdbd_backup_dir }}"
      when: latest_backup.stdout | length == 0

    - name: Validate latest backup exists
      ansible.builtin.stat:
        path: "{{ latest_backup.stdout }}"
      register: latest_backup_stat

    # Anything under 1 KiB is treated as an empty/failed dump.
    - name: Fail if latest backup is missing or empty
      ansible.builtin.fail:
        msg: "Latest SlurmDBD backup is missing or empty: {{ latest_backup.stdout }}"
      when:
        - not latest_backup_stat.stat.exists or latest_backup_stat.stat.size | int < 1024

    # Quoted heredoc: bash passes the SQL through verbatim, which is what
    # allows the backtick-quoted identifiers (Jinja has already rendered the
    # database name before bash sees the script).
    - name: Recreate restore-check database
      ansible.builtin.shell: |
        set -euo pipefail
        mysql <<'SQL'
        DROP DATABASE IF EXISTS `{{ restore_check_db }}`;
        CREATE DATABASE `{{ restore_check_db }}`;
        SQL
      args:
        executable: /bin/bash
      changed_when: true

    # pipefail makes a corrupt archive (zcat failure) fail the task even if
    # mysql accepted the partial input.
    - name: Import backup into restore-check database
      ansible.builtin.shell: |
        set -euo pipefail
        zcat {{ latest_backup.stdout | quote }} | mysql {{ restore_check_db | quote }}
      args:
        executable: /bin/bash
      changed_when: true

    - name: Validate restored table count
      ansible.builtin.shell: |
        set -euo pipefail
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ restore_check_db }}';"
      args:
        executable: /bin/bash
      register: restored_tables
      changed_when: false
      failed_when: restored_tables.stdout | int < 1

    - name: Validate restored row count sample
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### restored database"
        echo "{{ restore_check_db }}"

        echo
        echo "### table count"
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ restore_check_db }}';"

        echo
        echo "### largest tables"
        mysql -N -B -e "
        SELECT table_name, table_rows
        FROM information_schema.tables
        WHERE table_schema='{{ restore_check_db }}'
        ORDER BY table_rows DESC
        LIMIT 10;
        "
      args:
        executable: /bin/bash
      register: restore_check_summary
      changed_when: false

    - name: Show restore-check result
      ansible.builtin.debug:
        msg:
          - "Imported backup: {{ latest_backup.stdout }}"
          - "Restore-check DB: {{ restore_check_db }}"
          - "Restored tables: {{ restored_tables.stdout }}"
          - "Summary:"
          - "{{ restore_check_summary.stdout_lines }}"
|
||||
@@ -0,0 +1,105 @@
|
||||
---
# Installs MariaDB and slurmdbd on the controller, creates the accounting
# database and its DB user, deploys slurmdbd.conf, and verifies that
# slurmdbd is active and listening.
#
# Expects from inventory/vars: slurmdbd_storage_loc, slurmdbd_storage_user,
# slurmdbd_storage_pass, slurmdbd_port.
- name: Install and configure MariaDB for SlurmDBD
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Install MariaDB and SlurmDBD packages
      ansible.builtin.apt:
        name:
          - mariadb-server
          - mariadb-client
          - slurmdbd
          - slurm-wlm-mysql-plugin
        state: present
        update_cache: true

    - name: Ensure MariaDB is enabled and running
      ansible.builtin.systemd:
        name: mariadb
        enabled: true
        state: started

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        path: /var/log/slurm
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"

    # The rendered script contains the DB password, so task output is
    # suppressed with no_log. The heredoc delimiter is quoted so bash does
    # not expand `$`, backticks or backslashes that may occur in the
    # password.
    # NOTE(review): a single quote or backslash in the password would still
    # need SQL-level escaping — confirm passwords avoid those characters.
    - name: Create Slurm accounting database and DB user
      ansible.builtin.shell: |
        set -euo pipefail
        mysql <<'SQL'
        CREATE DATABASE IF NOT EXISTS {{ slurmdbd_storage_loc }};
        CREATE USER IF NOT EXISTS '{{ slurmdbd_storage_user }}'@'localhost' IDENTIFIED BY '{{ slurmdbd_storage_pass }}';
        CREATE USER IF NOT EXISTS '{{ slurmdbd_storage_user }}'@'127.0.0.1' IDENTIFIED BY '{{ slurmdbd_storage_pass }}';
        GRANT ALL PRIVILEGES ON {{ slurmdbd_storage_loc }}.* TO '{{ slurmdbd_storage_user }}'@'localhost';
        GRANT ALL PRIVILEGES ON {{ slurmdbd_storage_loc }}.* TO '{{ slurmdbd_storage_user }}'@'127.0.0.1';
        FLUSH PRIVILEGES;
        SQL
      args:
        executable: /bin/bash
      changed_when: true
      no_log: true

    - name: Ensure /etc/slurm exists
      ansible.builtin.file:
        path: /etc/slurm
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: Deploy slurmdbd.conf
      ansible.builtin.template:
        src: ../../templates/slurmdbd.conf.j2
        dest: /etc/slurm/slurmdbd.conf
        owner: slurm
        group: slurm
        mode: "0600"
      notify:
        - Restart slurmdbd

    - name: Ensure slurmdbd is enabled and running
      ansible.builtin.systemd:
        name: slurmdbd
        enabled: true
        state: started

    # Apply a pending "Restart slurmdbd" now so the checks below run against
    # the freshly deployed configuration.
    - name: Flush handlers before validation
      ansible.builtin.meta: flush_handlers

    # Polls for up to ~20 s (10 x 2 s) until the unit reports "active".
    - name: Validate slurmdbd service is active
      ansible.builtin.command:
        cmd: systemctl is-active slurmdbd
      register: slurmdbd_active
      retries: 10
      delay: 2
      until: slurmdbd_active.stdout == "active"
      changed_when: false

    # Polls for up to ~20 s until a TCP listener on slurmdbd_port appears.
    - name: Validate slurmdbd is listening on port
      ansible.builtin.shell: |
        set -euo pipefail
        ss -lntp | grep ':{{ slurmdbd_port }} '
      args:
        executable: /bin/bash
      register: slurmdbd_port_check
      retries: 10
      delay: 2
      until: slurmdbd_port_check.rc == 0
      changed_when: false

    - name: Show slurmdbd service validation
      ansible.builtin.debug:
        msg:
          - "slurmdbd is active"
          - "{{ slurmdbd_port_check.stdout_lines }}"

  handlers:
    - name: Restart slurmdbd
      ansible.builtin.systemd:
        name: slurmdbd
        state: restarted
|
||||
+178
@@ -0,0 +1,178 @@
|
||||
---
# End-to-end validation of the accounting stack on the controller:
# services, runtime config, sacctmgr entities, a real test job submitted as
# slurmuser, sacct/sreport queries, and a MariaDB table summary.
#
# Expects `slurmdbd_storage_loc` from inventory/vars; `slurmdbd_port` is
# optional and falls back to 6819.
- name: Validate Slurm accounting production-like setup
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Validate accounting services
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### services"
        systemctl is-active mariadb
        systemctl is-active slurmdbd
        systemctl is-active slurmctld

        echo
        echo "### slurmdbd listener"
        ss -lntp | grep ':{{ slurmdbd_port | default(6819) }} '
      args:
        executable: /bin/bash
      register: service_check
      changed_when: false

    # Under pipefail each grep must match at least one line, otherwise the
    # task fails.
    - name: Validate Slurm accounting runtime config
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### accounting config"
        scontrol show config | grep -E "AccountingStorage|JobAcctGather|ClusterName"

        echo
        echo "### priority / select / cgroup config"
        scontrol show config | grep -E "SelectType|TaskPlugin|ProctrackType"
      args:
        executable: /bin/bash
      register: config_check
      changed_when: false

    - name: Validate sacctmgr entities
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC

        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org

        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin

        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: entity_check
      changed_when: false

    # Submits a short batch job as slurmuser, waits up to 90 s for it to
    # leave the queue, then requires that accounting recorded it as
    # COMPLETED before printing the sacct row and the job output.
    # The heredoc body and its terminator must stay at the script's base
    # indentation: sbatch needs "#!" on the first line and bash needs the
    # terminator in column 0.
    - name: Submit accounting validation job
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=acct-prodlike-test
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/acct-prodlike-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"

        # --parsable may print "jobid;cluster"; keep only the job id.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"

        # Wait while the job is still listed by squeue; give up after 90 s
        # instead of silently carrying on with an unfinished job.
        waited=0
        while squeue -h -j "$job_id" | grep -q .; do
          if (( waited >= 90 )); then
            echo "Job $job_id still queued or running after ${waited}s" >&2
            exit 1
          fi
          squeue -j "$job_id"
          sleep 1
          waited=$(( waited + 1 ))
        done

        # Give accounting up to 15 s to report a final state for the job.
        acct_state=""
        for _ in $(seq 1 15); do
          acct_state="$(sacct -j "$job_id" -X -n -P --format=State)"
          case "$acct_state" in
            ""|PENDING|RUNNING|COMPLETING) sleep 1 ;;
            *) break ;;
          esac
        done

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        if [[ "$acct_state" != "COMPLETED" ]]; then
          echo "Job $job_id is not recorded as COMPLETED in accounting (state: ${acct_state:-none})" >&2
          exit 1
        fi

        echo "### output"
        cat "/shared/acct-prodlike-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: acct_job
      changed_when: true

    - name: Validate sacct can read recent jobs
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### recent jobs"
        sacct -S today --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30
      args:
        executable: /bin/bash
      register: sacct_recent
      changed_when: false

    # Best effort by design: every sreport call is followed by `|| true`, so
    # a failing report is shown in the output but never fails the play.
    - name: Validate sreport commands
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### cluster utilization"
        sreport cluster utilization start=today || true

        echo
        echo "### account utilization by user"
        sreport cluster AccountUtilizationByUser start=today || true

        echo
        echo "### user top"
        sreport user top start=today || true
      args:
        executable: /bin/bash
      register: sreport_check
      changed_when: false

    - name: Validate MariaDB table health summary
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### database exists"
        mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';"

        echo
        echo "### table count"
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ slurmdbd_storage_loc }}';"

        echo
        echo "### largest tables"
        mysql -N -B -e "
        SELECT table_name, table_rows
        FROM information_schema.tables
        WHERE table_schema='{{ slurmdbd_storage_loc }}'
        ORDER BY table_rows DESC
        LIMIT 10;
        "
      args:
        executable: /bin/bash
      register: db_health
      changed_when: false

    - name: Print accounting validation
      ansible.builtin.debug:
        msg:
          - "### services"
          - "{{ service_check.stdout_lines }}"
          - "### runtime config"
          - "{{ config_check.stdout_lines }}"
          - "### accounting entities"
          - "{{ entity_check.stdout_lines }}"
          - "### accounting validation job"
          - "{{ acct_job.stdout_lines }}"
          - "### recent sacct data"
          - "{{ sacct_recent.stdout_lines }}"
          - "### sreport"
          - "{{ sreport_check.stdout_lines }}"
          - "### database health"
          - "{{ db_health.stdout_lines }}"
|
||||
@@ -0,0 +1,83 @@
|
||||
---
# Snapshots Slurm/Munge configuration, spool and log directories plus a few
# diagnostic command outputs into a fresh timestamped directory under
# backup_base_dir on every node of the cluster.
- name: Backup Slurm and Munge state on all cluster nodes
  hosts: slurm_cluster
  gather_facts: true
  become: true

  vars:
    backup_base_dir: /var/backups/slurm

  tasks:
    - name: Create backup base directory
      ansible.builtin.file:
        state: directory
        path: "{{ backup_base_dir }}"
        mode: "0700"
        owner: root
        group: root

    # The timestamp comes from the node's own clock; the created path is
    # echoed so it can be captured from stdout.
    - name: Create timestamped backup directory
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          ts="$(date +%F-%H%M%S)"
          dir="{{ backup_base_dir }}/$ts"
          mkdir -p "$dir"
          echo "$dir"
      changed_when: true
      register: backup_dir_result

    - name: Store backup directory fact
      ansible.builtin.set_fact:
        node_backup_dir: "{{ backup_dir_result.stdout }}"

    # Copies each path that exists on this node, then records service
    # status, recent journal entries and scheduler state. Every diagnostic
    # command is followed by `|| true`, so only a failing `cp` (or the final
    # `find`) can fail the task.
    - name: Backup Slurm and Munge config/state if present
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          backup_dir="{{ node_backup_dir }}"

          for p in \
            /etc/slurm \
            /etc/slurm-llnl \
            /etc/munge \
            /var/spool/slurmctld \
            /var/spool/slurmd \
            /var/log/slurm \
            /var/log/slurm-llnl
          do
            if [ -e "$p" ]; then
              cp -a "$p" "$backup_dir/"
            fi
          done

          systemctl status munge --no-pager > "$backup_dir/systemctl-munge.txt" 2>&1 || true
          systemctl status slurmctld --no-pager > "$backup_dir/systemctl-slurmctld.txt" 2>&1 || true
          systemctl status slurmd --no-pager > "$backup_dir/systemctl-slurmd.txt" 2>&1 || true

          journalctl -u munge -n 200 --no-pager > "$backup_dir/journal-munge.txt" 2>&1 || true
          journalctl -u slurmctld -n 200 --no-pager > "$backup_dir/journal-slurmctld.txt" 2>&1 || true
          journalctl -u slurmd -n 200 --no-pager > "$backup_dir/journal-slurmd.txt" 2>&1 || true

          if command -v sinfo >/dev/null 2>&1; then
            sinfo > "$backup_dir/sinfo.txt" 2>&1 || true
          fi

          if command -v scontrol >/dev/null 2>&1; then
            scontrol show config > "$backup_dir/scontrol-show-config.txt" 2>&1 || true
            scontrol show nodes > "$backup_dir/scontrol-show-nodes.txt" 2>&1 || true
            scontrol show partitions > "$backup_dir/scontrol-show-partitions.txt" 2>&1 || true
          fi

          find "$backup_dir" -maxdepth 2 -type f -o -type d
      changed_when: true
      register: backup_content

    - name: Show backup location on node
      ansible.builtin.debug:
        msg:
          - "Host: {{ inventory_hostname }}"
          - "Backup directory: {{ node_backup_dir }}"
|
||||
@@ -0,0 +1,46 @@
|
||||
---
# Packs the newest backup directory of every node into a tarball, fetches
# it into "<local_backup_base>/<inventory_hostname>/" on the Ansible
# controller, and removes the temporary tarball from the node.
- name: Fetch latest Slurm backups from nodes to pvef
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    remote_backup_base: /var/backups/slurm
    local_backup_base: "{{ playbook_dir }}/../../artifacts/backups"

  tasks:
    # Newest entry by mtime; `ls` fails (and pipefail fails the task) when
    # the base directory has no entries.
    - name: Find latest remote backup directory
      ansible.builtin.shell: |
        set -euo pipefail
        ls -1dt {{ remote_backup_base }}/* | head -n 1
      args:
        executable: /bin/bash
      register: latest_backup_dir
      changed_when: false

    - name: Create local backup directory on pvef
      ansible.builtin.file:
        path: "{{ local_backup_base }}/{{ inventory_hostname }}"
        state: directory
        mode: "0700"
      delegate_to: localhost
      become: false

    # The backup directories are filled by the node-backup play, whose path
    # list includes /etc/munge, so the tarball is created root-only (0600)
    # rather than with default permissions in world-readable /tmp. The
    # `always` section removes it even when archiving or fetching fails.
    - name: Archive, fetch and clean up latest backup
      block:
        - name: Archive latest backup directory on remote node
          community.general.archive:
            path: "{{ latest_backup_dir.stdout }}"
            dest: "/tmp/{{ inventory_hostname }}-slurm-backup.tgz"
            format: gz
            force_archive: true
            owner: root
            group: root
            mode: "0600"
          changed_when: true

        - name: Fetch archive to pvef
          ansible.builtin.fetch:
            src: "/tmp/{{ inventory_hostname }}-slurm-backup.tgz"
            dest: "{{ local_backup_base }}/{{ inventory_hostname }}/"
            flat: true

      always:
        - name: Remove temporary remote archive
          ansible.builtin.file:
            path: "/tmp/{{ inventory_hostname }}-slurm-backup.tgz"
            state: absent
|
||||
@@ -0,0 +1,58 @@
|
||||
---
# First-contact bootstrap: makes sure Python, sudo and sshd are present on
# each node, installs the controller's ed25519 public key for the login
# user, and grants that user passwordless sudo.
#
# Requires `ansible_user` to be defined for every host.
- name: Bootstrap Ansible SSH access from pvef to Slurm nodes
  hosts: slurm_cluster
  gather_facts: false
  become: true

  vars:
    # Public key of the user running Ansible, read from $HOME/.ssh on the
    # controller.
    ansible_controller_pubkey: "{{ lookup('file', lookup('env', 'HOME') + '/.ssh/id_ed25519.pub') }}"

  pre_tasks:
    - name: Wait for SSH
      ansible.builtin.wait_for_connection:
        timeout: 30

    # `raw` works without Python on the target; every later module needs it.
    - name: Install Python if missing - Debian/Ubuntu
      ansible.builtin.raw: |
        test -e /usr/bin/python3 || (apt-get update && apt-get install -y python3)
      changed_when: false

  tasks:
    - name: Ensure sudo is installed
      ansible.builtin.apt:
        name:
          - sudo
          - openssh-server
        state: present
        update_cache: true

    - name: Ensure SSH server is enabled and running
      ansible.builtin.service:
        name: ssh
        state: started
        enabled: true

    # manage_dir creates the user's ~/.ssh with the right owner and mode in
    # the user's actual home directory, so no separate directory task with a
    # hard-coded /home/<user> path (wrong for root or custom homes) is
    # needed.
    - name: Add pvef root public key to login user's authorized_keys
      ansible.posix.authorized_key:
        user: "{{ ansible_user }}"
        key: "{{ ansible_controller_pubkey }}"
        state: present
        manage_dir: true

    # visudo validates the drop-in before it is installed, so a malformed
    # file cannot lock sudo.
    - name: Allow bootstrap login user passwordless sudo
      ansible.builtin.copy:
        dest: "/etc/sudoers.d/90-ansible-{{ ansible_user }}"
        owner: root
        group: root
        mode: "0440"
        content: |
          {{ ansible_user }} ALL=(ALL) NOPASSWD:ALL
        validate: "visudo -cf %s"
|
||||
@@ -0,0 +1,16 @@
|
||||
---
# Maintains one marker-delimited block in /etc/hosts on every node that
# maps the controller and all managed Slurm nodes to their addresses.
#
# Expects from inventory/vars: slurm_control_addr, slurm_control_machine,
# and slurm_nodes (a list of items with `addr`, `name` and an optional
# `managed_state`).
- name: Configure /etc/hosts for Slurm cluster
  hosts: slurm_cluster
  gather_facts: false
  become: true

  tasks:
    # Nodes whose managed_state is anything other than 'present' are left
    # out; a missing managed_state counts as 'present'.
    - name: Add Slurm cluster hosts to /etc/hosts
      ansible.builtin.blockinfile:
        marker: "# {mark} ANSIBLE MANAGED SLURM CLUSTER HOSTS"
        path: /etc/hosts
        block: |
          {{ slurm_control_addr }} {{ slurm_control_machine }}
          {% for node in slurm_nodes if node.managed_state | default('present') == 'present' %}
          {{ node.addr }} {{ node.name }}
          {% endfor %}
|
||||
@@ -0,0 +1,218 @@
|
||||
---
# Creates the operator account on every node, generates an ed25519 key pair
# for it if none exists, and stores each node's public key as the host fact
# `slurmuser_pubkey` for use by later plays in the same run.
- name: Create slurmuser and generate SSH keys on every Slurm node
  hosts: slurm_cluster
  become: true
  gather_facts: true

  vars:
    slurm_operator_user: slurmuser
    slurm_operator_shell: /bin/bash

  tasks:
    - name: Ensure useful packages are installed
      ansible.builtin.apt:
        name:
          - sudo
          - openssh-client
          - openssh-server
          - acl
        state: present
        update_cache: true

    - name: Ensure slurmuser exists
      ansible.builtin.user:
        name: "{{ slurm_operator_user }}"
        shell: "{{ slurm_operator_shell }}"
        create_home: true
        state: present

    - name: Ensure .ssh directory exists for slurmuser
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    # Canonical module name: openssh_keypair lives in community.crypto.
    # force: false keeps an existing key pair untouched.
    - name: Generate SSH key for slurmuser if missing
      community.crypto.openssh_keypair:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519"
        type: ed25519
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"
        comment: "{{ slurm_operator_user }}@{{ inventory_hostname }}"
        force: false

    # slurp returns the file content base64-encoded.
    - name: Read public key from each node
      ansible.builtin.slurp:
        src: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub"
      register: slurmuser_pubkey_raw

    - name: Store decoded public key as host fact
      ansible.builtin.set_fact:
        slurmuser_pubkey: "{{ slurmuser_pubkey_raw.content | b64decode | trim }}"
|
||||
|
||||
|
||||
# Installs every node's slurmuser public key on every node and collects
# SSH host keys of all cluster members into slurmuser's known_hosts.
# Reads the `slurmuser_pubkey` host fact, so each host in slurm_cluster must
# have that fact set earlier in the same run.
- name: Exchange slurmuser SSH keys across all Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    slurm_operator_user: slurmuser

  tasks:
    - name: Install all slurmuser public keys into authorized_keys on every node
      ansible.posix.authorized_key:
        user: "{{ slurm_operator_user }}"
        key: "{{ hostvars[item].slurmuser_pubkey }}"
        state: present
        manage_dir: true
      loop: "{{ groups['slurm_cluster'] }}"

    # Host keys are stored unhashed on purpose: `ssh-keyscan -H` salts every
    # entry randomly, so repeated runs produce lines that `sort -u` cannot
    # deduplicate and the file grows on each play. A host without
    # ansible_host is scanned by its inventory name only.
    # Scan failures are ignored here (`|| true`).
    - name: Build SSH known_hosts entries for all cluster nodes
      ansible.builtin.shell: |
        set -e
        mkdir -p /home/{{ slurm_operator_user }}/.ssh
        touch /home/{{ slurm_operator_user }}/.ssh/known_hosts

        {% for host in groups['slurm_cluster'] %}
        ssh-keyscan {{ host }} {{ hostvars[host].ansible_host | default(host) }} 2>/dev/null >> /home/{{ slurm_operator_user }}/.ssh/known_hosts || true
        {% endfor %}

        sort -u /home/{{ slurm_operator_user }}/.ssh/known_hosts -o /home/{{ slurm_operator_user }}/.ssh/known_hosts
        chown {{ slurm_operator_user }}:{{ slurm_operator_user }} /home/{{ slurm_operator_user }}/.ssh/known_hosts
        chmod 0644 /home/{{ slurm_operator_user }}/.ssh/known_hosts
      args:
        executable: /bin/bash
      changed_when: true

    # The script above runs as root and may have created ~/.ssh itself, so
    # ownership and modes are re-asserted afterwards.
    - name: Ensure SSH permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    - name: Ensure private key permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"

    - name: Ensure public key permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0644"
|
||||
|
||||
|
||||
# Installs a sudoers drop-in for the operator account: one variant on hosts
# in the slurm_controller group, another on all remaining cluster nodes.
# Both files are checked with visudo before being put in place.
#
# NOTE(review): the rules below allow root execution of journalctl (which
# may start a pager) and of sbatch/srun/salloc without a password — confirm
# this level of access is intended for the operator account.
# NOTE(review): only /bin/... paths are listed for systemctl/journalctl;
# confirm they match the paths sudo resolves on the target distribution.
- name: Configure sudo permissions for slurmuser
  hosts: slurm_cluster
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    - name: Configure sudoers for slurmuser on Slurm controller
      when: inventory_hostname in groups['slurm_controller']
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-controller
        mode: "0440"
        owner: root
        group: root
        validate: "visudo -cf %s"
        content: |
          # Managed by Ansible
          # Operator access for Slurm controller node.
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
              /bin/systemctl status slurmctld, \
              /bin/systemctl restart slurmctld, \
              /bin/systemctl reload slurmctld, \
              /bin/systemctl stop slurmctld, \
              /bin/systemctl start slurmctld, \
              /bin/systemctl status slurmd, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl stop slurmd, \
              /bin/systemctl start slurmd, \
              /bin/journalctl -u slurmctld, \
              /bin/journalctl -u slurmd, \
              /usr/bin/scontrol, \
              /usr/bin/sinfo, \
              /usr/bin/squeue, \
              /usr/bin/scancel, \
              /usr/bin/sacct, \
              /usr/bin/sacctmgr, \
              /usr/bin/sbatch, \
              /usr/bin/srun, \
              /usr/bin/salloc

    # Same idea as above, minus the slurmctld and sacctmgr entries.
    - name: Configure sudoers for slurmuser on Slurm compute and GPU nodes
      when: inventory_hostname not in groups['slurm_controller']
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-compute
        mode: "0440"
        owner: root
        group: root
        validate: "visudo -cf %s"
        content: |
          # Managed by Ansible
          # Operator access for Slurm worker/GPU nodes.
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
              /bin/systemctl status slurmd, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl stop slurmd, \
              /bin/systemctl start slurmd, \
              /bin/journalctl -u slurmd, \
              /usr/bin/scontrol, \
              /usr/bin/sinfo, \
              /usr/bin/squeue, \
              /usr/bin/scancel, \
              /usr/bin/sacct, \
              /usr/bin/sbatch, \
              /usr/bin/srun, \
              /usr/bin/salloc
|
||||
|
||||
|
||||
# Smoke test for the operator account: runs sinfo in a slurmuser login
# shell, then opens a non-interactive SSH session from each node to every
# member of slurm_cluster (itself included) as slurmuser.
- name: Validate slurmuser SSH mesh and Slurm access
  hosts: slurm_cluster
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    - name: Test local Slurm commands as slurmuser
      ansible.builtin.command:
        cmd: "sudo -iu {{ slurm_operator_user }} sinfo"
      changed_when: false
      register: sinfo_test
      failed_when: sinfo_test.rc != 0

    - name: Show sinfo result
      ansible.builtin.debug:
        var: sinfo_test.stdout_lines

    # BatchMode=yes makes ssh fail instead of prompting, and `set -e` aborts
    # on the first unreachable host.
    - name: Test SSH from each node to every other node as slurmuser
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -e
          {% for host in groups['slurm_cluster'] %}
          ssh -o BatchMode=yes -o ConnectTimeout=5 {{ host }} 'hostname'
          {% endfor %}
      become_user: "{{ slurm_operator_user }}"
      changed_when: false
      register: ssh_mesh_test

    - name: Show SSH mesh test result
      ansible.builtin.debug:
        var: ssh_mesh_test.stdout_lines
|
||||
@@ -0,0 +1,112 @@
|
||||
---
|
||||
# Writes one sudoers drop-in per host role for the operator account.
# Controllers get rules for slurmctld and slurmd; every other host gets rules
# for slurmd only. Each systemctl/journalctl rule is listed for both /bin and
# /usr/bin, and each rendered file is checked with visudo before install.
- name: Fix sudo permissions for slurmuser Slurm operations
  hosts: slurm_cluster
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    - name: Configure sudoers for slurmuser on controller
      when: inventory_hostname in groups['slurm_controller']
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-controller
        content: |
          # Managed by Ansible

          Cmnd_Alias SLURM_SYSTEMCTL_CONTROLLER = \
            /bin/systemctl status slurmctld, \
            /bin/systemctl status slurmctld *, \
            /bin/systemctl restart slurmctld, \
            /bin/systemctl reload slurmctld, \
            /bin/systemctl start slurmctld, \
            /bin/systemctl stop slurmctld, \
            /bin/systemctl status slurmd, \
            /bin/systemctl status slurmd *, \
            /bin/systemctl restart slurmd, \
            /bin/systemctl reload slurmd, \
            /bin/systemctl start slurmd, \
            /bin/systemctl stop slurmd, \
            /usr/bin/systemctl status slurmctld, \
            /usr/bin/systemctl status slurmctld *, \
            /usr/bin/systemctl restart slurmctld, \
            /usr/bin/systemctl reload slurmctld, \
            /usr/bin/systemctl start slurmctld, \
            /usr/bin/systemctl stop slurmctld, \
            /usr/bin/systemctl status slurmd, \
            /usr/bin/systemctl status slurmd *, \
            /usr/bin/systemctl restart slurmd, \
            /usr/bin/systemctl reload slurmd, \
            /usr/bin/systemctl start slurmd, \
            /usr/bin/systemctl stop slurmd

          Cmnd_Alias SLURM_JOURNAL_CONTROLLER = \
            /bin/journalctl -u slurmctld, \
            /bin/journalctl -u slurmctld *, \
            /bin/journalctl -u slurmd, \
            /bin/journalctl -u slurmd *, \
            /usr/bin/journalctl -u slurmctld, \
            /usr/bin/journalctl -u slurmctld *, \
            /usr/bin/journalctl -u slurmd, \
            /usr/bin/journalctl -u slurmd *

          Cmnd_Alias SLURM_COMMANDS = \
            /usr/bin/scontrol, /usr/bin/scontrol *, \
            /usr/bin/sinfo, /usr/bin/sinfo *, \
            /usr/bin/squeue, /usr/bin/squeue *, \
            /usr/bin/scancel, /usr/bin/scancel *, \
            /usr/bin/sacct, /usr/bin/sacct *, \
            /usr/bin/sacctmgr, /usr/bin/sacctmgr *, \
            /usr/bin/sbatch, /usr/bin/sbatch *, \
            /usr/bin/srun, /usr/bin/srun *, \
            /usr/bin/salloc, /usr/bin/salloc *

          {{ slurm_operator_user }} ALL=(root) NOPASSWD: SLURM_SYSTEMCTL_CONTROLLER, SLURM_JOURNAL_CONTROLLER, SLURM_COMMANDS
        mode: "0440"
        owner: root
        group: root
        validate: "visudo -cf %s"

    # Same layout as the controller file, minus the slurmctld rules and
    # without sacctmgr in the command alias.
    - name: Configure sudoers for slurmuser on compute and GPU nodes
      when: inventory_hostname not in groups['slurm_controller']
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-compute
        content: |
          # Managed by Ansible

          Cmnd_Alias SLURM_SYSTEMCTL_COMPUTE = \
            /bin/systemctl status slurmd, \
            /bin/systemctl status slurmd *, \
            /bin/systemctl restart slurmd, \
            /bin/systemctl reload slurmd, \
            /bin/systemctl start slurmd, \
            /bin/systemctl stop slurmd, \
            /usr/bin/systemctl status slurmd, \
            /usr/bin/systemctl status slurmd *, \
            /usr/bin/systemctl restart slurmd, \
            /usr/bin/systemctl reload slurmd, \
            /usr/bin/systemctl start slurmd, \
            /usr/bin/systemctl stop slurmd

          Cmnd_Alias SLURM_JOURNAL_COMPUTE = \
            /bin/journalctl -u slurmd, \
            /bin/journalctl -u slurmd *, \
            /usr/bin/journalctl -u slurmd, \
            /usr/bin/journalctl -u slurmd *

          Cmnd_Alias SLURM_COMMANDS = \
            /usr/bin/scontrol, /usr/bin/scontrol *, \
            /usr/bin/sinfo, /usr/bin/sinfo *, \
            /usr/bin/squeue, /usr/bin/squeue *, \
            /usr/bin/scancel, /usr/bin/scancel *, \
            /usr/bin/sacct, /usr/bin/sacct *, \
            /usr/bin/sbatch, /usr/bin/sbatch *, \
            /usr/bin/srun, /usr/bin/srun *, \
            /usr/bin/salloc, /usr/bin/salloc *

          {{ slurm_operator_user }} ALL=(root) NOPASSWD: SLURM_SYSTEMCTL_COMPUTE, SLURM_JOURNAL_COMPUTE, SLURM_COMMANDS
        mode: "0440"
        owner: root
        group: root
        validate: "visudo -cf %s"
|
||||
@@ -0,0 +1,133 @@
|
||||
---
|
||||
# Reads /etc/munge/munge.key on the controller and keeps it, still
# base64-encoded as returned by slurp, in the host fact cluster_munge_key_b64.
# The play aborts if the key file does not exist.
- name: Read Munge key from Slurm controller
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Check controller munge.key exists
      ansible.builtin.stat:
        path: /etc/munge/munge.key
      register: controller_munge_key

    - name: Fail if controller munge.key is missing
      ansible.builtin.fail:
        msg: "/etc/munge/munge.key is missing on controller. Do not continue."
      when: not controller_munge_key.stat.exists

    # The key is a shared secret: keep it out of task output and logs.
    - name: Read controller munge.key
      ansible.builtin.slurp:
        src: /etc/munge/munge.key
      register: controller_munge_key_raw
      no_log: true

    - name: Store controller Munge key as fact
      ansible.builtin.set_fact:
        cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}"
      no_log: true
|
||||
|
||||
|
||||
# Installs munge on every cluster host, prepares its account and directories,
# and installs the key stored in the controller's cluster_munge_key_b64 fact.
# munge is restarted through the handler only when the key file was replaced.
- name: Deploy controller Munge key to all Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    controller_host: "{{ groups['slurm_controller'][0] }}"

  tasks:
    - name: Ensure munge package is installed
      ansible.builtin.apt:
        name:
          - munge
          - libmunge2
        state: present
        update_cache: true

    - name: Ensure munge group exists
      ansible.builtin.group:
        name: munge
        system: true
        state: present

    - name: Ensure munge user exists
      ansible.builtin.user:
        name: munge
        group: munge
        system: true
        shell: /usr/sbin/nologin
        home: /nonexistent
        create_home: false
        state: present

    - name: Ensure /etc/munge exists
      ansible.builtin.file:
        path: /etc/munge
        state: directory
        owner: munge
        group: munge
        mode: "0700"

    # The key is arbitrary binary data. Passing it through the b64decode
    # filter into copy's "content" turns it into text and can alter bytes, so
    # the base64 text is fed to "base64 -d" on stdin and decoded on the target.
    # The decoded key goes to a private temp file in the same directory and
    # replaces munge.key only when the bytes differ.
    - name: Deploy shared munge.key from controller
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          umask 077
          tmp_key="$(mktemp /etc/munge/.munge.key.XXXXXX)"
          trap 'rm -f "$tmp_key"' EXIT
          base64 -d > "$tmp_key"
          if cmp -s "$tmp_key" /etc/munge/munge.key; then
            echo "KEY_UNCHANGED"
          else
            chown munge:munge "$tmp_key"
            chmod 0400 "$tmp_key"
            mv -f "$tmp_key" /etc/munge/munge.key
            echo "KEY_UPDATED"
          fi
        executable: /bin/bash
        stdin: "{{ hostvars[controller_host].cluster_munge_key_b64 }}"
      register: munge_key_deploy
      changed_when: "'KEY_UPDATED' in (munge_key_deploy.stdout | default(''))"
      # Shared secret: never print the task arguments.
      no_log: true
      notify:
        - Restart munge

    # Covers the case where the key bytes already matched but the ownership
    # or mode of the file had drifted.
    - name: Enforce ownership and mode on munge.key
      ansible.builtin.file:
        path: /etc/munge/munge.key
        owner: munge
        group: munge
        mode: "0400"

    - name: Ensure /var/log/munge exists
      ansible.builtin.file:
        path: /var/log/munge
        state: directory
        owner: munge
        group: munge
        mode: "0755"

    - name: Ensure /var/lib/munge exists
      ansible.builtin.file:
        path: /var/lib/munge
        state: directory
        owner: munge
        group: munge
        mode: "0711"

    - name: Ensure /run/munge exists
      ansible.builtin.file:
        path: /run/munge
        state: directory
        owner: munge
        group: munge
        mode: "0755"

    - name: Ensure munge is enabled and running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: started

  handlers:
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted
|
||||
|
||||
|
||||
# Round-trips a credential through munge and unmunge on each host; with
# pipefail either side failing fails the task. The decoded output is printed.
- name: Validate Munge locally on all nodes
  hosts: slurm_cluster
  gather_facts: false
  become: true

  tasks:
    - name: Test local munge encode/decode
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          munge -n | unmunge
        executable: /bin/bash
      changed_when: false
      register: munge_local_test

    - name: Show local Munge validation
      ansible.builtin.debug:
        var: munge_local_test.stdout_lines
|
||||
@@ -0,0 +1,132 @@
|
||||
---
|
||||
# Creates the directories Slurm needs before any config file is deployed:
# the config directory and log directory everywhere, the slurmctld spool on
# controllers and the slurmd spool on compute/GPU hosts.
- name: Prepare Slurm config directories and logs
  hosts: slurm_cluster
  become: true
  gather_facts: false

  tasks:
    - name: Ensure Slurm config directory exists
      ansible.builtin.file:
        path: "{{ slurm_config_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        path: /var/log/slurm
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"

    - name: Ensure slurmctld spool directory exists on controller
      ansible.builtin.file:
        path: /var/spool/slurmctld
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"
      when: inventory_hostname in groups['slurm_controller']

    # groups.get(..., []) keeps the condition valid for inventories that
    # define only one of the two worker groups.
    - name: Ensure slurmd spool directory exists on workers
      ansible.builtin.file:
        path: /var/spool/slurmd
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"
      when: >-
        inventory_hostname in groups.get('slurm_compute', [])
        or inventory_hostname in groups.get('slurm_gpu', [])
|
||||
|
||||
|
||||
# Renders slurm.conf (always), cgroup.conf (when slurm_enable_cgroup is true)
# and gres.conf (GPU hosts only) from templates. A one-time copy of the
# previous slurm.conf is kept next to it. Any change notifies a controller
# reconfigure and a slurmd restart on workers.
- name: Deploy Slurm config files
  hosts: slurm_cluster
  become: true
  gather_facts: false

  tasks:
    # A host that has never had a slurm.conf has nothing to back up; without
    # this check the remote copy below would fail on a missing source.
    - name: Check whether a slurm.conf already exists
      ansible.builtin.stat:
        path: "{{ slurm_config_dir }}/slurm.conf"
      register: existing_slurm_conf

    # force: false means the backup is written once and never overwritten.
    - name: Backup current slurm.conf before managed deployment
      ansible.builtin.copy:
        src: "{{ slurm_config_dir }}/slurm.conf"
        dest: "{{ slurm_config_dir }}/slurm.conf.pre-ansible-managed"
        remote_src: true
        owner: root
        group: root
        mode: "0644"
        force: false
      when: existing_slurm_conf.stat.exists

    - name: Deploy managed slurm.conf
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

    - name: Deploy managed cgroup.conf
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

    # groups.get(..., []) keeps the condition valid when the inventory has no
    # slurm_gpu group.
    - name: Deploy managed gres.conf only on GPU nodes
      ansible.builtin.template:
        src: ../../templates/gres.conf.j2
        dest: "{{ slurm_config_dir }}/gres.conf"
        owner: root
        group: root
        mode: "0644"
      when: inventory_hostname in groups.get('slurm_gpu', [])
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

  handlers:
    - name: Reconfigure slurmctld
      ansible.builtin.command:
        cmd: scontrol reconfigure
      when: inventory_hostname in groups['slurm_controller']
      changed_when: true

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
      when: >-
        inventory_hostname in groups.get('slurm_compute', [])
        or inventory_hostname in groups.get('slurm_gpu', [])
|
||||
|
||||
|
||||
# Runs on the controller: asks slurmctld to reconfigure, then runs
# scontrol ping, sinfo and scontrol show nodes in one script and prints the
# combined output. The script stops at the first failing command.
- name: Validate Slurm after config deployment
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Reconfigure controller
      ansible.builtin.command: scontrol reconfigure
      changed_when: true

    - name: Validate cluster state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          scontrol ping
          sinfo
          scontrol show nodes
        executable: /bin/bash
      changed_when: false
      register: slurm_config_validation

    - name: Show validation output
      ansible.builtin.debug:
        var: slurm_config_validation.stdout_lines
|
||||
@@ -0,0 +1,103 @@
|
||||
---
|
||||
# Restarts munge and then slurmctld on the controller (enabling both), and
# polls "scontrol ping" up to 15 times, 2 seconds apart, until it exits 0.
- name: Restart Slurm controller safely
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Restart munge on controller
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmctld on controller
      ansible.builtin.systemd:
        name: slurmctld
        enabled: true
        state: restarted

    - name: Wait for slurmctld to answer
      ansible.builtin.command: scontrol ping
      register: scontrol_ping
      until: scontrol_ping.rc == 0
      retries: 15
      delay: 2
      changed_when: false

    - name: Show controller ping
      ansible.builtin.debug:
        var: scontrol_ping.stdout_lines
|
||||
|
||||
|
||||
# Handles one worker at a time (serial: 1): restart munge and slurmd, wait
# for systemd to report slurmd as active, then wait until the controller can
# answer "scontrol show node" for this host.
- name: Restart Slurm workers safely one by one
  hosts: slurm_compute:slurm_gpu
  serial: 1
  gather_facts: false
  become: true

  tasks:
    - name: Restart munge on worker
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmd on worker
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    - name: Wait for slurmd to be active
      ansible.builtin.command: systemctl is-active slurmd
      register: slurmd_active
      until: slurmd_active.stdout == "active"
      retries: 15
      delay: 2
      changed_when: false

    # Runs on the first controller, not on the worker itself.
    - name: Wait until this node is visible in Slurm
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      ansible.builtin.command: "scontrol show node {{ inventory_hostname }}"
      register: node_visible
      until: node_visible.rc == 0
      retries: 15
      delay: 2
      changed_when: false
|
||||
|
||||
|
||||
# Prints a labelled report from the controller: scontrol ping, sinfo, all
# node records and all partition records. Any failing command fails the task.
- name: Validate Slurm after restart
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Validate Slurm cluster state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          echo "### scontrol ping"
          scontrol ping

          echo
          echo "### sinfo"
          sinfo

          echo
          echo "### nodes"
          scontrol show nodes

          echo
          echo "### partitions"
          scontrol show partitions
        executable: /bin/bash
      changed_when: false
      register: slurm_validation

    - name: Show Slurm validation
      ansible.builtin.debug:
        var: slurm_validation.stdout_lines
|
||||
+40
@@ -0,0 +1,40 @@
|
||||
---
|
||||
# Collects the hardware figures needed to write node definitions: CPU count,
# memory in MB, socket/core/thread topology from lscpu, and the GPU list from
# nvidia-smi when that tool is installed. Nothing is changed on the hosts.
- name: Discover node resources for Slurm config
  hosts: slurm_cluster
  become: true
  gather_facts: true

  tasks:
    # The script is a literal block scalar, so backslashes and quotes reach
    # the shell exactly as written here. The awk programs therefore use a
    # single backslash to escape "(" / ")" and plain "" for the empty string.
    - name: Discover CPU and memory
      ansible.builtin.shell: |
        set -euo pipefail
        echo "HOST={{ inventory_hostname }}"
        echo "CPUS=$(nproc)"
        echo "REAL_MEMORY_MB=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo)"
        echo "SOCKETS=$(lscpu | awk -F: '/Socket\(s\)/ {gsub(/ /,"",$2); print $2}')"
        echo "CORES_PER_SOCKET=$(lscpu | awk -F: '/Core\(s\) per socket/ {gsub(/ /,"",$2); print $2}')"
        echo "THREADS_PER_CORE=$(lscpu | awk -F: '/Thread\(s\) per core/ {gsub(/ /,"",$2); print $2}')"
      args:
        executable: /bin/bash
      register: cpu_mem
      changed_when: false

    # Hosts without nvidia-smi report the marker NO_NVIDIA_SMI instead of
    # failing.
    - name: Discover NVIDIA GPU if present
      ansible.builtin.shell: |
        set -euo pipefail
        if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader
        else
          echo "NO_NVIDIA_SMI"
        fi
      args:
        executable: /bin/bash
      register: gpu_info
      changed_when: false

    - name: Show discovered resources
      ansible.builtin.debug:
        msg:
          - "{{ cpu_mem.stdout_lines }}"
          - "GPU:"
          - "{{ gpu_info.stdout_lines }}"
|
||||
@@ -0,0 +1,89 @@
|
||||
---
|
||||
# Read-only inventory of each cluster host: identity and OS, installed
# slurm/munge packages, files under the usual config directories, service
# enablement/activity, and the output of basic Slurm client commands. Most
# probes end in "|| true" so that a missing component shows up in the report
# instead of aborting it.
- name: Inspect current Slurm and Munge state
  hosts: slurm_cluster
  gather_facts: true
  become: true

  tasks:
    - name: Basic host info
      ansible.builtin.shell:
        cmd: |
          set -e
          echo "HOST=$(hostname -f 2>/dev/null || hostname)"
          echo "SHORT_HOST=$(hostname -s)"
          echo "IP_ADDRESSES=$(hostname -I)"
          echo "OS=$(lsb_release -ds 2>/dev/null || cat /etc/os-release | grep PRETTY_NAME || true)"
          echo "KERNEL=$(uname -r)"
        executable: /bin/bash
      changed_when: false
      register: host_info

    - name: Slurm package info
      ansible.builtin.shell:
        cmd: |
          dpkg -l | grep -Ei 'slurm|munge' || true
        executable: /bin/bash
      changed_when: false
      register: package_info

    # Lists mode, owner, group and path of every file up to two levels deep.
    - name: Slurm config paths
      ansible.builtin.shell:
        cmd: |
          set -e
          for p in /etc/slurm /etc/slurm-llnl /etc/munge; do
            echo "### $p"
            if [ -e "$p" ]; then
              find "$p" -maxdepth 2 -type f -printf "%m %u %g %p\n" | sort
            else
              echo "MISSING"
            fi
          done
        executable: /bin/bash
      changed_when: false
      register: config_paths

    - name: Service state
      ansible.builtin.shell:
        cmd: |
          for s in munge slurmctld slurmd; do
            echo "### $s"
            systemctl is-enabled "$s" 2>/dev/null || true
            systemctl is-active "$s" 2>/dev/null || true
          done
        executable: /bin/bash
      changed_when: false
      register: service_state

    - name: Slurm commands
      ansible.builtin.shell:
        cmd: |
          echo "### which"
          command -v sinfo || true
          command -v scontrol || true
          command -v sbatch || true
          command -v srun || true
          command -v munge || true
          command -v unmunge || true

          echo "### sinfo"
          sinfo 2>&1 || true

          echo "### scontrol ping"
          scontrol ping 2>&1 || true
        executable: /bin/bash
      changed_when: false
      register: slurm_commands

    - name: Show inspection report
      ansible.builtin.debug:
        msg:
          - "===== {{ inventory_hostname }} :: host_info ====="
          - "{{ host_info.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: packages ====="
          - "{{ package_info.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: config_paths ====="
          - "{{ config_paths.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: services ====="
          - "{{ service_state.stdout_lines }}"
          - "===== {{ inventory_hostname }} :: slurm_commands ====="
          - "{{ slurm_commands.stdout_lines }}"
|
||||
+216
@@ -0,0 +1,216 @@
|
||||
---
|
||||
# Asks sinfo for every node and its state, keeps the names whose lower-cased
# state contains down, drain, fail, unknown, not_responding or "idle*", and
# stores the de-duplicated list in the controller fact bad_nodes.
- name: Detect problematic Slurm nodes
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Detect nodes needing remediation
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail

          sinfo -N -h -o "%N %T" | awk '
            tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
          ' | sort -u
        executable: /bin/bash
      changed_when: false
      register: bad_nodes_raw

    - name: Store bad node list
      ansible.builtin.set_fact:
        bad_nodes: "{{ bad_nodes_raw.stdout_lines }}"

    - name: Show detected problematic nodes
      ansible.builtin.debug:
        var: bad_nodes
|
||||
|
||||
|
||||
# Visits workers one at a time. Hosts whose inventory name is not in the
# controller's bad_nodes fact leave the play immediately; the rest get munge
# and slurmd restarted and a local check of services, munge, controller
# reachability, the slurmd listener and recent slurmd logs.
- name: Attempt auto-remediation on problematic nodes
  hosts: slurm_compute:slurm_gpu
  serial: 1
  gather_facts: false
  become: true

  vars:
    bad_nodes_from_controller: "{{ hostvars[groups['slurm_controller'][0]].bad_nodes | default([]) }}"

  tasks:
    - name: Skip healthy nodes
      when: inventory_hostname not in bad_nodes_from_controller
      ansible.builtin.meta: end_host

    - name: Restart Munge
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    # The service, munge and ping checks are fatal; the listener and log
    # sections are informational and end in "|| true".
    - name: Validate local services after remediation attempt
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail

          echo "HOST=$(hostname)"

          echo
          echo "### services"
          systemctl is-active munge
          systemctl is-active slurmd

          echo
          echo "### munge"
          munge -n | unmunge >/dev/null
          echo "munge OK"

          echo
          echo "### controller ping"
          scontrol ping

          echo
          echo "### slurmd listener"
          ss -lntp | grep ':6818 ' || true

          echo
          echo "### recent slurmd logs"
          journalctl -u slurmd -n 30 --no-pager || true
        executable: /bin/bash
      changed_when: false
      register: local_repair_check

    - name: Print local remediation result
      ansible.builtin.debug:
        var: local_repair_check.stdout_lines
|
||||
|
||||
|
||||
# Controller-side follow-up to the remediation: restart slurmctld, try to
# clear the state of every node in bad_nodes, re-detect unhealthy nodes,
# drain those that are still unhealthy, and print a summary.
- name: Refresh controller and validate remediated nodes
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Restart slurmctld to refresh node states
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # The three state updates are best-effort: their errors are discarded.
    # The task only reports "changed" when there was at least one node to
    # act on, matching how the drain task below reports.
    - name: Clear maintenance state on previously bad nodes
      ansible.builtin.shell: |
        set -euo pipefail

        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"

        if [ -z "$bad_nodes" ]; then
          echo "No bad nodes detected. Nothing to clear."
          sinfo -N
          exit 0
        fi

        for node in $bad_nodes; do
          echo "### clearing state on $node"
          scontrol update NodeName="$node" State=RESUME 2>/dev/null || true
          scontrol update NodeName="$node" State=UNDRAIN 2>/dev/null || true
          scontrol update NodeName="$node" State=IDLE 2>/dev/null || true
        done

        sleep 5
        sinfo -N
      args:
        executable: /bin/bash
      register: clear_result
      changed_when: bad_nodes | default([]) | length > 0

    - name: Print clear-state result
      ansible.builtin.debug:
        var: clear_result.stdout_lines

    - name: Detect nodes still unhealthy after remediation
      ansible.builtin.shell: |
        set -euo pipefail

        sinfo -N -h -o "%N %T" | awk '
          tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
        ' | sort -u
      args:
        executable: /bin/bash
      register: still_bad_nodes_raw
      changed_when: false

    - name: Store still bad nodes
      ansible.builtin.set_fact:
        still_bad_nodes: "{{ still_bad_nodes_raw.stdout_lines }}"

    # Unlike the clear step, a failing drain command fails the task.
    - name: Drain nodes that remain unhealthy
      ansible.builtin.shell: |
        set -euo pipefail

        unresolved_nodes="{{ still_bad_nodes | join(' ') }}"

        if [ -z "$unresolved_nodes" ]; then
          echo "No unresolved unhealthy nodes."
          sinfo -N
          exit 0
        fi

        for node in $unresolved_nodes; do
          echo "### draining unresolved node $node"
          scontrol update NodeName="$node" State=DRAIN Reason="auto-remediation failed"
        done

        sinfo -N
      args:
        executable: /bin/bash
      register: drain_unresolved
      changed_when: still_bad_nodes | length > 0

    - name: Show remediation summary
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### initial bad nodes"
        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
        if [ -z "$bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $bad_nodes
        fi

        echo
        echo "### still bad nodes"
        still_bad_nodes="{{ (still_bad_nodes | default([])) | join(' ') }}"
        if [ -z "$still_bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $still_bad_nodes
        fi

        echo
        echo "### final sinfo"
        sinfo -N

        echo
        echo "### queue"
        squeue
      args:
        executable: /bin/bash
      register: remediation_summary
      changed_when: false

    - name: Print remediation summary
      ansible.builtin.debug:
        var: remediation_summary.stdout_lines
|
||||
@@ -0,0 +1,149 @@
|
||||
---
|
||||
# Prints a labelled health report from the controller. munge, slurmctld and
# "scontrol ping" must succeed; slurmdbd, mariadb, accounting and the failed
# job list are informational and never fail the task.
- name: Check Slurm controller health
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Check controller services and cluster state
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### controller services"
        systemctl is-active munge
        systemctl is-active slurmctld
        systemctl is-active slurmdbd || true
        systemctl is-active mariadb || true

        echo
        echo "### slurm ping"
        scontrol ping

        echo
        echo "### nodes"
        sinfo -N

        echo
        echo "### partitions"
        sinfo

        echo
        echo "### queue"
        squeue

        echo
        echo "### problematic nodes"
        # A state ending in "*" (for example "idle*") marks a node that is not
        # responding, so it is listed even though its base state looks healthy.
        sinfo -N -h -o "%N %T %E" | awk '$2 !~ /idle|alloc|mix/ || $2 ~ /\*$/ {print}' || true

        echo
        echo "### accounting"
        sacctmgr -n list cluster || true

        echo
        echo "### recent failed jobs"
        sacct -S today --state=FAILED,CANCELLED,TIMEOUT,NODE_FAIL,OUT_OF_MEMORY \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,NodeList | tail -30 || true
      args:
        executable: /bin/bash
      register: controller_health
      changed_when: false

    - name: Print controller health
      ansible.builtin.debug:
        var: controller_health.stdout_lines
|
||||
|
||||
|
||||
# Prints a labelled health report from each worker: services, a local munge
# round trip, controller reachability, the slurmd listener, config checksums,
# a write test on /shared, cgroup mounts and GPU status.
- name: Check Slurm worker health
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: true

  vars:
    # Name to resolve in the connectivity section; taken from the inventory
    # so the check follows the actual controller instead of a fixed hostname.
    health_controller_host: "{{ groups['slurm_controller'][0] }}"

  tasks:
    - name: Check worker services, config and connectivity
      ansible.builtin.shell: |
        set -euo pipefail

        echo "HOST=$(hostname)"
        echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
        echo "KERNEL=$(uname -r)"
        echo "UPTIME=$(uptime -p)"

        echo
        echo "### services"
        systemctl is-active munge
        systemctl is-active slurmd

        echo
        echo "### munge local test"
        munge -n | unmunge >/dev/null
        echo "munge OK"

        echo
        echo "### controller connectivity"
        getent hosts {{ health_controller_host }} || true
        scontrol ping

        echo
        echo "### slurmd listener"
        ss -lntp | grep ':6818 ' || true

        echo
        echo "### config checksums"
        sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true

        echo
        echo "### shared filesystem"
        # Create, list and remove a per-host marker file to prove /shared is
        # mounted and writable; any failure here fails the task.
        test -d /shared
        touch "/shared/.slurm-health-$(hostname)"
        ls -l "/shared/.slurm-health-$(hostname)"
        rm -f "/shared/.slurm-health-$(hostname)"

        echo
        echo "### cgroup"
        mount | grep cgroup || true

        echo
        echo "### gpu check"
        if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi --query-gpu=index,name,driver_version,memory.total,temperature.gpu,utilization.gpu --format=csv,noheader || true
        else
          echo "NO_NVIDIA_SMI"
        fi
      args:
        executable: /bin/bash
      register: worker_health
      changed_when: false

    - name: Print worker health
      ansible.builtin.debug:
        var: worker_health.stdout_lines
|
||||
|
||||
|
||||
# Prints a one-line-per-node summary from sinfo, followed by the full
# "scontrol show node" record of every node whose state looks unhealthy.
- name: Check Slurm-reported node state consistency
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Build Slurm node health summary
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### node summary"
        sinfo -N -o "%N %P %T %C %m %G %E"

        echo
        echo "### full problematic node details"
        # "idle\*" matches only the literal state "idle*". The script is a
        # literal block scalar, so a single backslash is what awk receives; a
        # doubled one would make the pattern match every plain "idle" node.
        for node in $(sinfo -N -h -o "%N %T" | awk '$2 ~ /down|drain|fail|unk|not_responding|idle\*/ {print $1}' | sort -u); do
          echo
          echo "### $node"
          scontrol show node "$node"
        done
      args:
        executable: /bin/bash
      register: slurm_node_summary
      changed_when: false

    - name: Print Slurm node summary
      ansible.builtin.debug:
        var: slurm_node_summary.stdout_lines
|
||||
@@ -0,0 +1,217 @@
|
||||
---
|
||||
# Pre-flight check on the control machine: stop unless target_node was passed
# on the command line and names a host known to the inventory.
- name: Validate target node
  hosts: localhost
  gather_facts: false

  tasks:
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook repair-slurm-node.yml -e target_node=<hostname>"

    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
|
||||
|
||||
|
||||
# Records what the controller reports about target_node before anything is
# touched: its sinfo line, its scontrol record and the jobs running on it.
# Every query ends in "|| true", so the snapshot itself never fails.
- name: Capture node state before repair
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Show target node state before repair
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail

          echo "### sinfo"
          sinfo -N -n {{ target_node }} || true

          echo
          echo "### scontrol"
          scontrol show node {{ target_node }} || true

          echo
          echo "### jobs"
          squeue -w {{ target_node }} || true
        executable: /bin/bash
      changed_when: false
      register: node_state_before

    - name: Print target node state before repair
      ansible.builtin.debug:
        var: node_state_before.stdout_lines
|
||||
|
||||
|
||||
# Runs on target_node itself: restart munge, restart slurmd when the host is
# a compute or GPU member, then verify services, munge, controller
# reachability, the slurmd listener and recent slurmd logs.
# NOTE(review): the validation script checks slurmd unconditionally, while
# the restart is limited to worker groups - confirm non-worker targets are
# never passed in.
- name: Repair local services on target node
  hosts: "{{ target_node }}"
  gather_facts: false
  become: true

  tasks:
    - name: Restart Munge
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmd
      when:
        - inventory_hostname in groups.get('slurm_compute', []) or inventory_hostname in groups.get('slurm_gpu', [])
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    - name: Validate local repair
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail

          echo "### services"
          systemctl is-active munge
          systemctl is-active slurmd

          echo
          echo "### munge"
          munge -n | unmunge >/dev/null
          echo "munge OK"

          echo
          echo "### controller ping"
          scontrol ping

          echo
          echo "### slurmd listener"
          ss -lntp | grep ':6818 ' || true

          echo
          echo "### recent slurmd logs"
          journalctl -u slurmd -n 40 --no-pager || true
        executable: /bin/bash
      changed_when: false
      register: local_repair_state

    - name: Print local repair state
      ansible.builtin.debug:
        var: local_repair_state.stdout_lines
|
||||
|
||||
|
||||
# Controller-side half of the repair: restart slurmctld, wait for it to
# answer, try RESUME / UNDRAIN / IDLE on target_node (errors discarded), then
# poll until the node's sinfo and scontrol output no longer mention
# not_responding, down, drain or "idle*".
- name: Clear Slurm maintenance/down state after repair
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Restart controller to refresh node state
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    - name: Wait for controller
      ansible.builtin.command: scontrol ping
      register: slurmctld_ping
      until: slurmctld_ping.rc == 0
      retries: 15
      delay: 2
      changed_when: false

    - name: Clear target node state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail

          scontrol update NodeName={{ target_node }} State=RESUME 2>/dev/null || true
          scontrol update NodeName={{ target_node }} State=UNDRAIN 2>/dev/null || true
          scontrol update NodeName={{ target_node }} State=IDLE 2>/dev/null || true

          sleep 5

          sinfo -N -n {{ target_node }}
          scontrol show node {{ target_node }}
        executable: /bin/bash
      changed_when: true
      register: clear_state

    # Up to 30 attempts, 5 seconds apart. The health test is a plain substring
    # search over the lower-cased combined output of both commands.
    - name: Wait until node is healthy
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }}
          scontrol show node {{ target_node }}
        executable: /bin/bash
      register: node_health_after
      until:
        - node_health_after.rc == 0
        - "'not_responding' not in node_health_after.stdout.lower()"
        - "'down' not in node_health_after.stdout.lower()"
        - "'drain' not in node_health_after.stdout.lower()"
        - "'idle*' not in node_health_after.stdout.lower()"
      retries: 30
      delay: 5
      changed_when: false

    - name: Print node state after repair
      ansible.builtin.debug:
        var: node_health_after.stdout_lines
|
||||
|
||||
|
||||
# End-to-end proof that the repaired node accepts work: submit a small batch
# job pinned to target_node as slurmuser, wait up to about 90 seconds for it
# to leave the queue, then print its accounting record and its output file.
- name: Submit repair validation job
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    # The job script is passed to sbatch on stdin through an unquoted
    # here-document, so "\$" keeps those expansions for job run time, and the
    # here-document body and its terminator must stay at the script's left
    # margin.
    - name: Submit validation job to repaired node
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail

          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<SBATCH
          #!/bin/bash
          #SBATCH --job-name=repair-node-test
          #SBATCH --partition=all
          #SBATCH --nodelist={{ target_node }}
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=256M
          #SBATCH --time=00:02:00
          #SBATCH --account=lab
          #SBATCH --qos=normal
          #SBATCH --output=/shared/repair-node-test-%j.out

          echo "HOST=\$(hostname)"
          echo "USER=\$(whoami)"
          echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
          date
          SBATCH
          )"

          echo "JOB_ID=$job_id"

          for i in $(seq 1 90); do
            if squeue -h -j "$job_id" | grep -q .; then
              squeue -j "$job_id"
              sleep 1
            else
              break
            fi
          done

          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList

          echo "### output"
          cat "/shared/repair-node-test-${job_id}.out"
        executable: /bin/bash
      changed_when: true
      register: repair_validation_job

    - name: Print repair validation job
      ansible.builtin.debug:
        var: repair_validation_job.stdout_lines
|
||||
+126
@@ -0,0 +1,126 @@
|
||||
---
|
||||
# Play: pre-flight checks on localhost. Aborts when target_node is missing
# or is not a host known to the inventory.
- name: Validate target_node variable
  hosts: localhost
  gather_facts: false

  tasks:
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook decommission-slurm-node.yml -e target_node=<hostname> [-e decom_reason='reason']"

    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
|
||||
|
||||
|
||||
# Play: put the target node into DRAIN and block until squeue reports no
# jobs on it.
# Optional overrides (extra vars):
#   decom_reason        drain reason text (default 'decommission by Ansible')
#   decom_wait_retries  number of polls (default 120)
#   decom_wait_delay    seconds between polls (default 10)
- name: Drain target node and wait for jobs to leave
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    decom_reason_effective: "{{ decom_reason | default('decommission by Ansible') }}"
    decom_wait_retries_effective: "{{ decom_wait_retries | default(120) }}"
    decom_wait_delay_effective: "{{ decom_wait_delay | default(10) }}"

  tasks:
    # Informational only: failures of either command are ignored.
    - name: Show current target node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }} || true
        scontrol show node {{ target_node }} || true
      args:
        executable: /bin/bash
      register: node_state_before
      changed_when: false

    - name: Print current target node state
      ansible.builtin.debug:
        var: node_state_before.stdout_lines

    - name: Drain target node
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=DRAIN Reason="{{ decom_reason_effective }}"
      changed_when: true

    # The squeue exit status is deliberately NOT masked. A failed query
    # yields empty stdout, which would otherwise be indistinguishable from
    # "no jobs left" and let the decommission continue under running jobs.
    # A failing attempt is simply retried; the task fails once retries run out.
    - name: Wait until no jobs are running on target node
      ansible.builtin.shell: |
        set -euo pipefail
        squeue -h -w {{ target_node }}
      args:
        executable: /bin/bash
      register: jobs_on_node
      retries: "{{ decom_wait_retries_effective | int }}"
      delay: "{{ decom_wait_delay_effective | int }}"
      until:
        - jobs_on_node.rc == 0
        - jobs_on_node.stdout | trim == ""
      changed_when: false

    # Informational only: failures of either command are ignored.
    - name: Show drained node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }} || true
        scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
      args:
        executable: /bin/bash
      register: node_state_drained
      changed_when: false

    - name: Print drained node state
      ansible.builtin.debug:
        var: node_state_drained.stdout_lines
|
||||
|
||||
|
||||
# Play: runs on the target node itself. Stops and disables slurmd, then
# reports what systemctl says about the unit.
- name: Stop Slurm worker service on target node
  hosts: "{{ target_node }}"
  become: true
  gather_facts: false

  tasks:
    # Only acts on hosts that belong to slurm_compute or slurm_gpu.
    - name: Stop slurmd
      when: >-
        inventory_hostname in groups.get('slurm_compute', [])
        or inventory_hostname in groups.get('slurm_gpu', [])
      ansible.builtin.systemd:
        name: slurmd
        enabled: false
        state: stopped

    # Both probes are best-effort; a non-zero exit from systemctl is ignored.
    - name: Show slurmd state
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          systemctl is-enabled slurmd 2>/dev/null || true
          systemctl is-active slurmd 2>/dev/null || true
      changed_when: false
      register: slurmd_state_after

    - name: Print slurmd state
      ansible.builtin.debug:
        var: slurmd_state_after.stdout_lines
|
||||
|
||||
|
||||
# Play: record the node as DOWN (reason "decommissioned") on the controller
# and print the resulting state.
- name: Mark node down in Slurm controller
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Mark target node DOWN after service stop
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ target_node }}"
          - State=DOWN
          - Reason=decommissioned
      changed_when: true

    # Informational only: failures of either command are ignored.
    - name: Show final node state
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }} || true
          scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
      changed_when: false
      register: final_node_state

    - name: Print final node state
      ansible.builtin.debug:
        var: final_node_state.stdout_lines
|
||||
@@ -0,0 +1,246 @@
|
||||
---
|
||||
# Play: pre-flight checks on localhost. Aborts when target_node is missing
# or is not a host known to the inventory.
- name: Validate target_node variable
  hosts: localhost
  gather_facts: false

  tasks:
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook provision-slurm-node.yml -e target_node=<hostname>"

    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
|
||||
|
||||
|
||||
# Play: runs on the target node. Installs the worker packages via apt and
# creates the Slurm and Munge directories with fixed owners and modes.
- name: Prepare OS, packages and Slurm directories on target node
  hosts: "{{ target_node }}"
  become: true
  gather_facts: true

  tasks:
    # Guard: the host must be a member of slurm_compute or slurm_gpu.
    - name: Ensure target is a Slurm worker or GPU node
      when: >-
        inventory_hostname not in groups.get('slurm_compute', [])
        and inventory_hostname not in groups.get('slurm_gpu', [])
      ansible.builtin.fail:
        msg: "{{ inventory_hostname }} must be in slurm_compute or slurm_gpu group"

    - name: Install Slurm worker packages
      ansible.builtin.apt:
        update_cache: true
        state: present
        name:
          - munge
          - libmunge2
          - slurm-client
          - slurmd
          - slurm-wlm-basic-plugins
          - slurm-wlm-plugins
          - slurm-wlm-mysql-plugin

    - name: Ensure Slurm config directory exists
      ansible.builtin.file:
        state: directory
        path: "{{ slurm_config_dir }}"
        mode: "0755"
        owner: root
        group: root

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        state: directory
        path: /var/log/slurm
        mode: "0755"
        owner: slurm
        group: slurm

    - name: Ensure slurmd spool directory exists
      ansible.builtin.file:
        state: directory
        path: /var/spool/slurmd
        mode: "0755"
        owner: slurm
        group: slurm

    # One directory per loop item; only path and mode vary.
    - name: Ensure munge dirs exist
      ansible.builtin.file:
        state: directory
        path: "{{ item.path }}"
        mode: "{{ item.mode }}"
        owner: munge
        group: munge
      loop:
        - path: /etc/munge
          mode: "0700"
        - path: /var/log/munge
          mode: "0755"
        - path: /var/lib/munge
          mode: "0711"
        - path: /run/munge
          mode: "0755"
|
||||
|
||||
|
||||
# Play: read /etc/munge/munge.key on the controller and keep its base64
# form as the host fact `cluster_munge_key_b64` for a later play to consume.
- name: Deploy Munge key from controller to target node
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # no_log: the registered result carries the cluster authentication key;
    # keep it out of verbose output, callbacks and log files.
    - name: Read controller munge.key
      ansible.builtin.slurp:
        src: /etc/munge/munge.key
      register: controller_munge_key_raw
      no_log: true

    # slurp returns the file content base64-encoded; it is stored as-is.
    - name: Store controller Munge key as fact
      ansible.builtin.set_fact:
        cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}"
      no_log: true
|
||||
|
||||
|
||||
# Play: runs on the target node. Installs the shared Munge key and the
# templated Slurm configuration files, then makes sure munge and slurmd are
# enabled and started. Handlers restart a service when its files changed.
- name: Configure target node with Munge and Slurm files
  hosts: "{{ target_node }}"
  become: true
  gather_facts: false

  vars:
    # First host of the slurm_controller group; its facts hold the key.
    controller_host: "{{ groups['slurm_controller'][0] }}"

  tasks:
    # no_log: `content:` would otherwise expose the key in --diff and
    # verbose output.
    # NOTE(review): the key passes through the b64decode filter as text;
    # confirm a binary munge.key survives this round trip byte-for-byte.
    - name: Deploy shared munge.key
      ansible.builtin.copy:
        dest: /etc/munge/munge.key
        content: "{{ hostvars[controller_host].cluster_munge_key_b64 | b64decode }}"
        owner: munge
        group: munge
        mode: "0400"
      no_log: true
      notify:
        - Restart munge

    - name: Deploy managed slurm.conf
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"
      notify:
        - Restart slurmd

    # Skipped unless slurm_enable_cgroup is truthy (defaults to false).
    - name: Deploy managed cgroup.conf
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool
      notify:
        - Restart slurmd

    # Only for hosts in the slurm_gpu group.
    - name: Deploy managed gres.conf on GPU nodes
      ansible.builtin.template:
        src: ../../templates/gres.conf.j2
        dest: "{{ slurm_config_dir }}/gres.conf"
        owner: root
        group: root
        mode: "0644"
      when: inventory_hostname in groups.get('slurm_gpu', [])
      notify:
        - Restart slurmd

    - name: Ensure munge is enabled and running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: started

    - name: Ensure slurmd is enabled and running
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: started

  handlers:
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
|
||||
|
||||
|
||||
# Play: render the managed Slurm config templates onto every host in the
# slurm_cluster group. No handlers are notified from this play.
- name: Deploy updated Slurm config to whole cluster and reconfigure controller
  hosts: slurm_cluster
  become: true
  gather_facts: false

  tasks:
    - name: Deploy managed slurm.conf to all nodes
      ansible.builtin.template:
        dest: "{{ slurm_config_dir }}/slurm.conf"
        src: ../../templates/slurm.conf.j2
        mode: "0644"
        owner: root
        group: root

    # Skipped unless slurm_enable_cgroup is truthy (defaults to false).
    - name: Deploy managed cgroup.conf to all nodes
      when: slurm_enable_cgroup | default(false) | bool
      ansible.builtin.template:
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        src: ../../templates/cgroup.conf.j2
        mode: "0644"
        owner: root
        group: root
|
||||
|
||||
|
||||
# Play: on the controller, reload and restart slurmctld, return the target
# node to service and wait until it reports a usable state.
- name: Reconfigure Slurm and validate target node
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Reconfigure Slurm controller
      ansible.builtin.command:
        cmd: scontrol reconfigure
      changed_when: true

    - name: Restart Slurm controller after node reprovision
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Poll `scontrol ping` until it exits 0: 15 attempts, 2 s apart.
    - name: Wait for Slurm controller after restart
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping_after_restart
      retries: 15
      delay: 2
      until: slurmctld_ping_after_restart.rc == 0
      changed_when: false

    # scontrol rejects State=RESUME with "Invalid node state specified" when
    # the node is not currently DOWN/DRAINED, i.e. when nothing needs to be
    # resumed. That answer is treated as "no change" instead of a failure so
    # provisioning an already-healthy node does not abort. Any other error
    # still fails the task.
    - name: Resume target node in Slurm
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=RESUME
      register: resume_target_node
      changed_when: resume_target_node.rc == 0
      failed_when:
        - resume_target_node.rc != 0
        - "'Invalid node state' not in (resume_target_node.stdout + resume_target_node.stderr)"

    # Substring match over the lower-cased scontrol + sinfo text;
    # 20 attempts, 3 s apart.
    - name: Wait until target node is visible and not down
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol show node {{ target_node }}
        sinfo -N -n {{ target_node }}
      args:
        executable: /bin/bash
      register: target_node_state
      retries: 20
      delay: 3
      until:
        - target_node_state.rc == 0
        - "'down' not in target_node_state.stdout.lower()"
        - "'not_responding' not in target_node_state.stdout.lower()"
        - "'idle*' not in target_node_state.stdout.lower()"
      changed_when: false

    - name: Show target node state
      ansible.builtin.debug:
        var: target_node_state.stdout_lines
|
||||
@@ -0,0 +1,33 @@
|
||||
---
|
||||
# Play: read-only report for one node (sinfo, scontrol and the jobs squeue
# lists on it). Requires -e target_node=<hostname>.
- name: Show Slurm node state
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook show-slurm-node.yml -e target_node=<hostname>"

    # Each query is best-effort; a failing command does not fail the task.
    - name: Show node state
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          echo "### sinfo"
          sinfo -N -n {{ target_node }} || true

          echo
          echo "### scontrol"
          scontrol show node {{ target_node }} || true

          echo
          echo "### jobs on node"
          squeue -w {{ target_node }} || true
      changed_when: false
      register: node_lifecycle_state

    - name: Print node lifecycle state
      ansible.builtin.debug:
        var: node_lifecycle_state.stdout_lines
|
||||
@@ -0,0 +1,169 @@
|
||||
---
|
||||
# Play: create the four QOS levels (normal, debug-short, gpu-short,
# maintenance), set their limits, attach them to the
# {{ slurm_account_name }} account and to slurmuser, then print the result.
- name: Configure Slurm QOS, limits and fairshare
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Fails early if sacctmgr cannot list clusters.
    - name: Ensure sacctmgr is available
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      changed_when: false

    # The final awk exits non-zero unless a gres/gpu TRES row is present,
    # which fails the task.
    - name: Validate accounting GPU TRES exists
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### configured AccountingStorageTRES"
        scontrol show config | grep -E "AccountingStorageTRES|AccountingStorageType|AccountingStorageEnforce"

        echo
        echo "### known TRES"
        sacctmgr show tres

        echo
        echo "### checking gres/gpu"
        sacctmgr -n show tres format=Type,Name | awk '$1=="gres" && $2=="gpu" {found=1} END {exit !found}'
      args:
        executable: /bin/bash
      register: gpu_tres_check
      changed_when: false

    # One sacctmgr call per QOS. A non-zero exit is accepted when the output
    # says the QOS already exists; "changed" is reported only when sacctmgr
    # prints 'Adding QOS'. Inside a loop, changed_when/failed_when see the
    # result of the current item.
    - name: Ensure QOS exists
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i add qos {{ item.name }} Priority={{ item.priority }}
      args:
        executable: /bin/bash
      loop:
        - name: normal
          priority: 100
        - name: debug-short
          priority: 500
        - name: gpu-short
          priority: 1000
        - name: maintenance
          priority: 5000
      loop_control:
        label: "{{ item.name }}"
      register: add_qos
      changed_when: "'Adding QOS' in (add_qos.stdout + add_qos.stderr)"
      failed_when: >
        add_qos.rc != 0 and
        'Nothing new added' not in (add_qos.stdout + add_qos.stderr) and
        'already exists' not in (add_qos.stdout + add_qos.stderr) and
        'Already existing' not in (add_qos.stdout + add_qos.stderr)

    # Re-apply priority and limits on every run; always reported as changed.
    - name: Normalize QOS settings
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify qos {{ item.name }} set {{ item.settings }}
      args:
        executable: /bin/bash
      loop:
        - name: normal
          settings: "Priority=100"
        - name: debug-short
          settings: "Priority=500 MaxWall=00:10:00 MaxTRESPU=cpu=2 MaxJobsPU=4"
        - name: gpu-short
          settings: "Priority=1000 MaxWall=01:00:00 MaxTRESPU=gres/gpu=1,cpu=12 MaxJobsPU=2"
        - name: maintenance
          settings: "Priority=5000 MaxWall=02:00:00"
      loop_control:
        label: "{{ item.name }}"
      changed_when: true

    - name: Assign QOS set to lab account
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify account {{ slurm_account_name }} set QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100
      args:
        executable: /bin/bash
      changed_when: true

    - name: Assign default account to slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser set DefaultAccount={{ slurm_account_name }}
      args:
        executable: /bin/bash
      changed_when: true

    - name: Assign QOS set to slurmuser association
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser account={{ slurm_account_name }} set QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100
      args:
        executable: /bin/bash
      changed_when: true

    # Read-only summary; only the sshare call is allowed to fail.
    - name: Show configured QOS and associations
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### TRES"
        sacctmgr show tres

        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%40,MaxJobsPU

        echo
        echo "### Associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%60,DefaultQOS,Fairshare

        echo
        echo "### Fairshare"
        sshare -A {{ slurm_account_name }} || true
      args:
        executable: /bin/bash
      register: qos_state
      changed_when: false

    - name: Print QOS state
      ansible.builtin.debug:
        var: qos_state.stdout_lines
|
||||
@@ -0,0 +1,235 @@
|
||||
---
|
||||
# Play: end-to-end check of the QOS setup. Dumps the priority/accounting
# configuration, runs one job under debug-short and one under gpu-short,
# verifies that a job exceeding the debug-short wall time is rejected or
# held, and prints a priority/fairshare snapshot.
# Test jobs are submitted under {{ slurm_account_name }}, the same account
# the sshare reports in this play query.
- name: Validate Slurm QOS, fairshare and priority
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # With pipefail set, a grep that matches nothing fails the task.
    - name: Validate priority runtime config
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### priority config"
        scontrol show config | grep -E "PriorityType|PriorityWeight|PriorityDecay|PriorityCalc|PriorityMaxAge|PriorityFavor"

        echo
        echo "### accounting enforcement"
        scontrol show config | grep -E "AccountingStorageType|AccountingStorageEnforce|AccountingStorageTRES"

        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%50,MaxJobsPU

        echo
        echo "### associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%80,DefaultQOS,Fairshare

        echo
        echo "### fairshare"
        sshare -A {{ slurm_account_name }} || true
      args:
        executable: /bin/bash
      register: priority_state
      changed_when: false

    # The heredoc is quoted, so the shell passes the job script verbatim;
    # the account name is substituted earlier by Ansible templating.
    - name: Submit debug-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-debug-test
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/qos-debug-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        # Poll once per second, up to 90 times, while squeue lists the job.
        for i in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        cat "/shared/qos-debug-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: debug_qos_job
      changed_when: true

    - name: Submit gpu-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-gpu-test
        #SBATCH --partition=gpu
        #SBATCH --qos=gpu-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/qos-gpu-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo
        nvidia-smi
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        # Poll once per second, up to 120 times, while squeue lists the job.
        for i in $(seq 1 120); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        cat "/shared/qos-gpu-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: gpu_qos_job
      changed_when: true

    # Requests 30 minutes under debug-short. Passes when sbatch refuses the
    # job, or when the job is accepted but sits PENDING/CONFIGURING with a
    # reason mentioning qos/limit/time/max/assoc. Anything else fails the
    # task. An accepted job is cancelled before the script exits.
    - name: Validate debug-short walltime limit behavior
      ansible.builtin.shell: |
        set -euo pipefail

        # errexit is suspended so a rejected submission can be inspected.
        set +e
        output="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH' 2>&1
        #!/bin/bash
        #SBATCH --job-name=qos-limit-fail
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:30:00
        #SBATCH --output=/shared/qos-limit-fail-%j.out

        sleep 10
        SBATCH
        )"
        rc=$?
        set -e

        echo "RC=$rc"
        echo "$output"

        if [ "$rc" -ne 0 ]; then
          echo "Limit rejection test passed at submit time"
          exit 0
        fi

        job_id="$output"
        echo "Submitted job despite expected limit check: $job_id"

        sleep 3

        echo "### squeue"
        squeue -j "$job_id" -o "%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" || true

        echo
        echo "### job detail"
        scontrol show job "$job_id" || true

        state="$(squeue -h -j "$job_id" -o "%T" || true)"
        reason="$(squeue -h -j "$job_id" -o "%R" || true)"

        echo "STATE=$state"
        echo "REASON=$reason"

        if echo "$state" | grep -qE "PENDING|CONFIGURING"; then
          if echo "$reason" | grep -qiE "qos|limit|time|max|assoc"; then
            echo "Limit enforcement test passed via pending reason"
            scancel "$job_id" || true
            exit 0
          fi
        fi

        echo "Job was accepted without an obvious QOS/limit pending reason"
        scancel "$job_id" || true
        exit 1
      args:
        executable: /bin/bash
      register: limit_rejection
      changed_when: false

    - name: Show priority and fairshare snapshot
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### queue"
        squeue || true

        echo
        echo "### sprio"
        sprio || true

        echo
        echo "### sshare"
        sshare -A {{ slurm_account_name }} || true

        echo
        echo "### recent sacct"
        sacct -S today --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -40
      args:
        executable: /bin/bash
      register: priority_snapshot
      changed_when: false

    - name: Print validation result
      ansible.builtin.debug:
        msg:
          - "### priority state"
          - "{{ priority_state.stdout_lines }}"
          - "### debug QOS job"
          - "{{ debug_qos_job.stdout_lines }}"
          - "### GPU QOS job"
          - "{{ gpu_qos_job.stdout_lines }}"
          - "### limit rejection"
          - "{{ limit_rejection.stdout_lines }}"
          - "### priority snapshot"
          - "{{ priority_snapshot.stdout_lines }}"
|
||||
@@ -0,0 +1,59 @@
|
||||
---
|
||||
# Play: submit a 2-CPU job pinned to gpu01 as slurmuser; the job prints its
# allowed CPU/memory lists and cgroup membership. The play then shows the
# job's output file.
- name: Test CPU cgroup enforcement on gpu01
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Submit cgroup CPU test to gpu01
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=cgroup-cpu-test
          #SBATCH --partition=all
          #SBATCH --nodelist=gpu01
          #SBATCH --cpus-per-task=2
          #SBATCH --mem=1G
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/cgroup-cpu-test-%j.out

          echo "HOST=$(hostname)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo "MEM_ALLOWED=$(grep Mems_allowed_list /proc/self/status || true)"
          echo
          echo "### cgroup"
          cat /proc/self/cgroup
          echo
          echo "### mounted cgroups"
          mount | grep cgroup || true
          sleep 5
          SBATCH
          )"

          echo "JOB_ID=$job_id"

          # Up to 60 one-second polls; stop as soon as squeue no longer
          # lists the job.
          for attempt in $(seq 1 60); do
            sudo -iu slurmuser squeue -h -j "$job_id" | grep -q . || break
            sudo -iu slurmuser squeue -j "$job_id"
            sleep 1
          done

          echo "### output"
          cat "/shared/cgroup-cpu-test-${job_id}.out"
      changed_when: true
      register: cgroup_cpu_result

    - name: Show cgroup CPU result
      ansible.builtin.debug:
        var: cgroup_cpu_result.stdout_lines
|
||||
@@ -0,0 +1,60 @@
|
||||
---
|
||||
# Play: submit a 1-CPU job to the debug partition as slurmuser, wait for it
# to leave the queue, then print its sacct row and output file. The task
# fails when the output file is missing.
- name: Submit CPU test job
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Submit test job to debug partition
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=cpu-test
          #SBATCH --partition=debug
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=512M
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/cpu-test-%j.out

          echo "HOST=$(hostname)"
          echo "USER=$(whoami)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          date
          SBATCH
          )"

          echo "JOB_ID=$job_id"

          # Up to 60 one-second polls; stop as soon as squeue no longer
          # lists the job.
          for attempt in $(seq 1 60); do
            sudo -iu slurmuser squeue -h -j "$job_id" | grep -q . || break
            sudo -iu slurmuser squeue -j "$job_id"
            sleep 1
          done

          echo "### sacct"
          sudo -iu slurmuser sacct -j "$job_id" --format=JobID,JobName,Partition,State,ExitCode 2>/dev/null || true

          echo "### output"
          # Guard clause: report the missing file, list recent candidates,
          # and fail.
          if [ ! -f "/shared/cpu-test-${job_id}.out" ]; then
            echo "Output file not found: /shared/cpu-test-${job_id}.out"
            find /shared -maxdepth 1 -name "cpu-test-*.out" -ls | tail -5 || true
            exit 1
          fi
          cat "/shared/cpu-test-${job_id}.out"
      changed_when: true
      register: cpu_job_result

    - name: Show CPU job result
      ansible.builtin.debug:
        var: cpu_job_result.stdout_lines
|
||||
@@ -0,0 +1,58 @@
|
||||
---
|
||||
# Play: submit a job to gpu01 that does NOT request a GPU. The job lists
# /dev/nvidia* and runs nvidia-smi (failures of both are tolerated), and
# the play prints the job's output file for inspection.
- name: Test GPU access without GRES allocation
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Submit job to gpu01 without requesting GPU
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=gpu-deny-test
          #SBATCH --partition=all
          #SBATCH --nodelist=gpu01
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=1G
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/gpu-deny-test-%j.out

          echo "HOST=$(hostname)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
          echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo
          echo "### ls nvidia devices"
          ls -l /dev/nvidia* 2>&1 || true
          echo
          echo "### nvidia-smi without GRES"
          nvidia-smi 2>&1 || true
          SBATCH
          )"

          echo "JOB_ID=$job_id"

          # Up to 60 one-second polls; stop as soon as squeue no longer
          # lists the job.
          for attempt in $(seq 1 60); do
            sudo -iu slurmuser squeue -h -j "$job_id" | grep -q . || break
            sudo -iu slurmuser squeue -j "$job_id"
            sleep 1
          done

          echo "### output"
          cat "/shared/gpu-deny-test-${job_id}.out"
      changed_when: true
      register: gpu_deny_result

    - name: Show GPU deny test result
      ansible.builtin.debug:
        var: gpu_deny_result.stdout_lines
|
||||
@@ -0,0 +1,70 @@
|
||||
---
|
||||
# Submit a one-GPU job to the gpu partition as slurmuser, wait for it to leave
# the queue, and print its accounting record and output file.
- name: Submit GPU test job
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Submit test job to gpu partition
      register: gpu_job_result
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          # Quoted delimiter: the script body is passed to sbatch verbatim.
          job_id="$(sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=gpu-test
          #SBATCH --partition=gpu
          #SBATCH --gres=gpu:1
          #SBATCH --cpus-per-task=2
          #SBATCH --mem=2G
          #SBATCH --time=00:03:00
          #SBATCH --output=/shared/gpu-test-%j.out

          echo "HOST=$(hostname)"
          echo "USER=$(whoami)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
          echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo

          echo "### nvidia-smi"
          nvidia-smi

          echo
          echo "### GPU process table"
          nvidia-smi pmon -c 1 || true
          SBATCH
          )"

          echo "JOB_ID=$job_id"

          # Poll for up to 90 rounds while squeue still lists the job.
          for _round in $(seq 1 90); do
            sudo -iu slurmuser squeue -h -j "$job_id" | grep -q . || break
            sudo -iu slurmuser squeue -j "$job_id"
            sleep 1
          done

          echo "### sacct"
          # Accounting output is informational only; failures are ignored.
          sudo -iu slurmuser sacct -j "$job_id" --format=JobID,JobName,Partition,State,ExitCode 2>/dev/null || true

          echo "### output"
          out_file="/shared/gpu-test-${job_id}.out"
          if [ ! -f "$out_file" ]; then
            echo "Output file not found: $out_file"
            find /shared -maxdepth 1 -name "gpu-test-*.out" -ls | tail -5 || true
            exit 1
          fi
          cat "$out_file"

    - name: Show GPU job result
      ansible.builtin.debug:
        var: gpu_job_result.stdout_lines
|
||||
@@ -0,0 +1,95 @@
|
||||
---
|
||||
# Run a small test job on one explicitly chosen node (-e target_node=<hostname>)
# and fail unless sacct reports the job as COMPLETED.
- name: Submit job to specific Slurm node
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # An empty value would render "--nodelist=" in the batch script, so it is
    # rejected together with the undefined case.
    - name: Require target_node
      ansible.builtin.fail:
        msg: "Use: ansible-playbook test-specific-node.yml -e target_node=<hostname>"
      when: target_node is not defined or (target_node | string | trim) == ''

    # Steps: submit as slurmuser -> wait for the job to leave squeue (<=120
    # checks) -> wait for a non-empty output file (<=30 checks) -> wait for a
    # terminal sacct state (<=30 checks) -> print sacct + output -> exit 1
    # unless the state is COMPLETED.
    - name: Submit test job to target node
      ansible.builtin.shell: |
        set -euo pipefail

        # Unquoted heredoc delimiter: the \$ escapes defer expansion to the
        # job itself, while the Jinja-rendered node name is embedded as-is.
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<SBATCH
        #!/bin/bash
        #SBATCH --job-name=node-test
        #SBATCH --partition=debug
        #SBATCH --nodelist={{ target_node }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --account=lab
        #SBATCH --qos=normal
        #SBATCH --output=/shared/node-test-%j.out

        echo "HOST=\$(hostname)"
        echo "USER=\$(whoami)"
        echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        echo "### waiting for job to leave queue"
        for i in $(seq 1 120); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        echo "### waiting for output file"
        for i in $(seq 1 30); do
          if [ -s "/shared/node-test-${job_id}.out" ]; then
            break
          fi
          sleep 1
        done

        echo "### waiting for sacct final state"
        final_state=""
        for i in $(seq 1 30); do
          # Best-effort read: with errexit + pipefail a transient sacct error
          # (or SIGPIPE from `head`) would otherwise abort the whole script on
          # the first attempt instead of letting this loop retry.
          final_state="$(
            sacct -n -P -j "$job_id" --format=State 2>/dev/null \
              | head -n 1 \
              | cut -d'|' -f1 \
              | awk '{print $1}'
          )" || true

          if echo "$final_state" | grep -qE "COMPLETED|FAILED|CANCELLED|TIMEOUT|NODE_FAIL|OUT_OF_MEMORY"; then
            break
          fi

          sleep 1
        done

        echo "FINAL_STATE=${final_state:-UNKNOWN}"

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        cat "/shared/node-test-${job_id}.out"

        if [ "${final_state:-UNKNOWN}" != "COMPLETED" ]; then
          echo "Job did not reach COMPLETED state according to sacct"
          exit 1
        fi
      args:
        executable: /bin/bash
      register: node_test
      changed_when: true

    - name: Show node test result
      ansible.builtin.debug:
        var: node_test.stdout_lines
|
||||
@@ -0,0 +1,60 @@
|
||||
---
|
||||
# Run a job that keeps two CPUs busy for 90 seconds so that accounting has
# non-trivial usage to report, then print the job's sacct record and output.
- name: Generate measurable Slurm usage for sreport
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Submit CPU usage job
      register: sreport_usage_job
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          # Quoted delimiter: the script body reaches sbatch unexpanded.
          job_id="$(sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=sreport-usage
          #SBATCH --partition=debug
          #SBATCH --cpus-per-task=2
          #SBATCH --mem=512M
          #SBATCH --time=00:03:00
          #SBATCH --output=/shared/sreport-usage-%j.out

          echo "HOST=$(hostname)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo "Burning CPU for 90 seconds"

          timeout 90 bash -c 'while true; do :; done' &
          timeout 90 bash -c 'while true; do :; done' &
          wait

          echo "Done"
          date
          SBATCH
          )"

          echo "JOB_ID=$job_id"

          # Up to 150 polls, two seconds apart, while the job is still queued
          # or running.
          for _round in $(seq 1 150); do
            squeue -h -j "$job_id" | grep -q . || break
            squeue -j "$job_id"
            sleep 2
          done

          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

          echo "### output"
          cat "/shared/sreport-usage-${job_id}.out"

    - name: Show usage job result
      ansible.builtin.debug:
        var: sreport_usage_job.stdout_lines
|
||||
@@ -0,0 +1,140 @@
|
||||
---
|
||||
# On every host of slurm_cluster: check that the operator account exists, that
# it can run sinfo and squeue, and that it can SSH non-interactively to every
# host in the cluster group.
- name: Validate Slurm operator user and SSH mesh
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    # A play var must not be defined in terms of itself: templating
    # "{{ slurm_operator_user | default('slurmuser') }}" under the same name
    # makes Ansible abort with a recursive-template error. Use the literal
    # default, as the other plays in this file do; it can still be overridden
    # with `-e slurm_operator_user=<name>`.
    slurm_operator_user: slurmuser
    slurm_hosts: "{{ groups['slurm_cluster'] }}"

  tasks:
    - name: Validate slurmuser exists
      ansible.builtin.command:
        cmd: id {{ slurm_operator_user }}
      changed_when: false

    - name: Validate sinfo as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sinfo
      changed_when: false

    - name: Validate squeue as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} squeue
      changed_when: false

    # Runs as the operator user. BatchMode=yes makes ssh fail instead of
    # prompting, and errexit stops at the first unreachable host.
    - name: Validate SSH mesh as slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        for h in {{ slurm_hosts | join(' ') }}; do
          echo "=== $h ==="
          ssh -o BatchMode=yes -o ConnectTimeout=5 "$h" hostname
        done
      args:
        executable: /bin/bash
      become_user: "{{ slurm_operator_user }}"
      changed_when: false
|
||||
|
||||
|
||||
# Controller-side checks executed through the operator account: passwordless
# `sudo systemctl status slurmctld`, plus the basic Slurm client commands.
- name: Validate Slurm controller commands
  hosts: slurm_controller
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    # `sudo -n` never prompts, so the task fails if a password would be needed.
    - name: Validate slurmctld status through sudo
      changed_when: false
      ansible.builtin.command:
        argv:
          - sudo
          - "-iu"
          - "{{ slurm_operator_user }}"
          - sudo
          - "-n"
          - systemctl
          - status
          - slurmctld
          - "--no-pager"

    - name: Validate controller Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          # Intentionally unquoted so "scontrol show nodes" splits into words.
          for slurm_cmd in sinfo squeue "scontrol show nodes"; do
            sudo -iu {{ slurm_operator_user }} $slurm_cmd
          done
|
||||
|
||||
|
||||
# Worker-side checks executed through the operator account: passwordless
# `sudo systemctl status slurmd`, plus the basic Slurm client commands.
- name: Validate Slurm worker commands
  hosts: slurm_compute:slurm_gpu
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    # `sudo -n` never prompts, so the task fails if a password would be needed.
    - name: Validate slurmd status through sudo
      changed_when: false
      ansible.builtin.command:
        argv:
          - sudo
          - "-iu"
          - "{{ slurm_operator_user }}"
          - sudo
          - "-n"
          - systemctl
          - status
          - slurmd
          - "--no-pager"

    - name: Validate worker Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          # Intentionally unquoted so "scontrol show nodes" splits into words.
          for slurm_cmd in sinfo squeue "scontrol show nodes"; do
            sudo -iu {{ slurm_operator_user }} $slurm_cmd
          done
|
||||
|
||||
|
||||
# Submit a trivial job as the operator user and report its state, accounting
# record and (if readable from the controller) its output file.
- name: Validate basic job submission
  hosts: slurm_controller
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    - name: Submit simple Slurm test job as slurmuser
      register: slurm_job_test
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          job_id="$(sudo -iu {{ slurm_operator_user }} sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=ansible-validate
          #SBATCH --partition=debug
          #SBATCH --time=00:01:00
          #SBATCH --output=/tmp/ansible-validate-%j.out

          hostname
          whoami
          date
          SBATCH
          )"

          echo "$job_id"

          # Poll up to 20 times; an empty squeue state means the job is no
          # longer listed.
          for _round in $(seq 1 20); do
            state="$(sudo -iu {{ slurm_operator_user }} squeue -h -j "$job_id" -o "%T" || true)"
            [ -n "$state" ] || break
            echo "job_state=$state"
            sleep 1
          done

          sudo -iu {{ slurm_operator_user }} sacct -j "$job_id" --format=JobID,JobName,State,ExitCode 2>/dev/null || true

          # The output path is under /tmp, so it is only printed when it is
          # visible on this host. A missing file is not an error.
          # NOTE(review): /tmp is presumably node-local, so the file may live
          # on the compute node only - confirm.
          if ls /tmp/ansible-validate-"$job_id".out >/dev/null 2>&1; then
            cat /tmp/ansible-validate-"$job_id".out
          fi

    - name: Show basic job submission result
      ansible.builtin.debug:
        var: slurm_job_test.stdout_lines
|
||||
+236
@@ -0,0 +1,236 @@
|
||||
---
|
||||
# Sanity-check the canary selection before anything is drained: the node must
# be a known inventory host and must not be a member of slurm_controller.
- name: Validate canary node variable
  hosts: localhost
  gather_facts: false

  vars:
    # Falls back to slurm-c02 when canary_node is not supplied.
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"

  tasks:
    - name: Ensure canary node is in inventory
      when: canary_node_effective not in groups['all']
      ansible.builtin.fail:
        msg: "canary_node={{ canary_node_effective }} is not in inventory"

    - name: Ensure canary node is not the controller
      when: canary_node_effective in groups['slurm_controller']
      ansible.builtin.fail:
        msg: "Do not use controller as canary for worker rolling upgrade"
|
||||
|
||||
|
||||
# From the controller: record the canary's current state, put it into DRAIN,
# and wait until squeue lists no jobs on it.
- name: Drain canary node
  hosts: slurm_controller
  gather_facts: false
  become: true

  vars:
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"

  tasks:
    # Purely informational: every command tolerates failure.
    - name: Show canary state before drain
      register: canary_before
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          node={{ canary_node_effective }}
          sinfo -N -n $node || true
          scontrol show node $node || true
          squeue -w $node || true

    - name: Print canary state before drain
      ansible.builtin.debug:
        var: canary_before.stdout_lines

    - name: Drain canary node
      changed_when: true
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ canary_node_effective }}"
          - State=DRAIN
          - "Reason=canary OS upgrade"

    # Re-checked every 10 s, up to 120 times, until squeue prints nothing.
    - name: Wait until canary has no running jobs
      register: canary_jobs
      until: canary_jobs.stdout | trim == ""
      retries: 120
      delay: 10
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          squeue -h -w {{ canary_node_effective }} || true
|
||||
|
||||
|
||||
# On the canary host itself: full apt upgrade, reboot when the OS asks for it,
# restart munge and slurmd, then verify the local services and controller
# reachability.
- name: Upgrade canary node OS packages
  hosts: "{{ canary_node | default('slurm-c02') }}"
  gather_facts: true
  become: true

  tasks:
    # The cache is refreshed only when it is older than 1800 seconds.
    - name: Ensure apt cache is updated
      ansible.builtin.apt:
        cache_valid_time: 1800
        update_cache: true

    - name: Full upgrade packages
      register: apt_upgrade_result
      ansible.builtin.apt:
        upgrade: full
        autoclean: true
        autoremove: true

    # Presence of this marker file decides whether the reboot task runs.
    - name: Check if reboot is required
      register: reboot_required
      ansible.builtin.stat:
        path: /var/run/reboot-required

    - name: Show upgrade summary
      ansible.builtin.debug:
        msg:
          - "Host: {{ inventory_hostname }}"
          - "Apt changed: {{ apt_upgrade_result.changed }}"
          - "Reboot required: {{ reboot_required.stat.exists }}"

    - name: Reboot canary if required
      when: reboot_required.stat.exists
      ansible.builtin.reboot:
        msg: "Reboot after canary OS upgrade"
        pre_reboot_delay: 5
        post_reboot_delay: 20
        connect_timeout: 20
        reboot_timeout: 900

    # Both services are restarted unconditionally, whether or not a reboot
    # happened.
    - name: Ensure munge is running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Ensure slurmd is running
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    # Any failing command aborts the script (errexit) and fails the task.
    - name: Validate local services
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          for unit in munge slurmd; do
            systemctl is-active "$unit"
          done
          munge -n | unmunge >/dev/null
          scontrol ping
|
||||
|
||||
|
||||
# From the controller: refresh slurmctld, return the canary to service, wait
# for it to look healthy, then run a test job pinned to it and print the
# result.
- name: Resume canary node and run canary job
  hosts: slurm_controller
  gather_facts: false
  become: true

  vars:
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"

  tasks:
    - name: Reconfigure controller
      changed_when: true
      ansible.builtin.command:
        argv: [scontrol, reconfigure]

    - name: Restart controller to refresh node state
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Up to 15 attempts, 2 s apart, until `scontrol ping` exits 0.
    - name: Wait for controller
      register: slurmctld_ping
      until: slurmctld_ping.rc == 0
      retries: 15
      delay: 2
      changed_when: false
      ansible.builtin.command:
        argv: [scontrol, ping]

    # The three state updates are best-effort (errors discarded); the final
    # sinfo/scontrol calls must succeed.
    - name: Clear canary node maintenance state
      register: resume_canary
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          node={{ canary_node_effective }}
          for target_state in RESUME UNDRAIN IDLE; do
            scontrol update NodeName=$node State=$target_state 2>/dev/null || true
          done

          sleep 3
          sinfo -N -n $node
          scontrol show node $node

    # Healthy means: the commands succeed and their combined output contains
    # none of "not_responding", "down", "drain" or "idle*" (case-insensitive).
    - name: Wait until canary is IDLE and responding
      register: canary_state
      until:
        - canary_state.rc == 0
        - "'not_responding' not in canary_state.stdout.lower()"
        - "'down' not in canary_state.stdout.lower()"
        - "'drain' not in canary_state.stdout.lower()"
        - "'idle*' not in canary_state.stdout.lower()"
      retries: 30
      delay: 5
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ canary_node_effective }}
          scontrol show node {{ canary_node_effective }}

    - name: Submit canary test job to upgraded node
      register: canary_job
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          # Unquoted delimiter: \$ defers expansion to the job; the node name
          # is rendered by Jinja before the shell runs.
          job_id="$(sudo -iu slurmuser sbatch --parsable <<SBATCH
          #!/bin/bash
          #SBATCH --job-name=canary-upgrade-test
          #SBATCH --partition=all
          #SBATCH --nodelist={{ canary_node_effective }}
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=256M
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/canary-upgrade-test-%j.out

          echo "HOST=\$(hostname)"
          echo "USER=\$(whoami)"
          echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
          echo "KERNEL=\$(uname -r)"
          date
          SBATCH
          )"

          echo "JOB_ID=$job_id"

          # Up to 90 polls while squeue still lists the job.
          for _round in $(seq 1 90); do
            squeue -h -j "$job_id" | grep -q . || break
            squeue -j "$job_id"
            sleep 1
          done

          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

          echo "### output"
          cat "/shared/canary-upgrade-test-${job_id}.out"

    - name: Show canary test result
      ansible.builtin.debug:
        var: canary_job.stdout_lines
|
||||
+197
@@ -0,0 +1,197 @@
|
||||
---
|
||||
# Upgrade worker nodes one at a time (serial: 1). For each node: drain it and
# wait for its jobs to finish, apt full-upgrade, reboot if requested, restart
# munge/slurmd, then - via the first slurm_controller host - restart
# slurmctld, resume the node, wait for it to look healthy and run a test job
# pinned to it.
- name: Rolling upgrade Slurm worker nodes
  hosts: slurm_compute:slurm_gpu
  gather_facts: true
  become: true
  serial: 1

  vars:
    skip_canary_node: "{{ canary_node | default('slurm-c02') }}"
    # Defaults to true: the canary node is left out of the rolling run.
    do_skip_canary: "{{ skip_canary | default(true) | bool }}"

  pre_tasks:
    - name: Skip canary node if requested
      when:
        - do_skip_canary
        - inventory_hostname == skip_canary_node
      ansible.builtin.meta: end_host

    # The inventory hostname is used as the Slurm NodeName throughout.
    - name: Drain node before OS upgrade
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      changed_when: true
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ inventory_hostname }}"
          - State=DRAIN
          - "Reason=rolling OS upgrade"

    # Re-checked every 10 s, up to 120 times, until squeue prints nothing.
    - name: Wait until no jobs are running on this node
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: jobs_on_node
      until: jobs_on_node.stdout | trim == ""
      retries: 120
      delay: 10
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          squeue -h -w {{ inventory_hostname }} || true

  tasks:
    # The cache is refreshed only when it is older than 1800 seconds.
    - name: Update apt cache
      ansible.builtin.apt:
        cache_valid_time: 1800
        update_cache: true

    - name: Full upgrade packages
      register: apt_upgrade_result
      ansible.builtin.apt:
        upgrade: full
        autoclean: true
        autoremove: true

    - name: Check if reboot is required
      register: reboot_required
      ansible.builtin.stat:
        path: /var/run/reboot-required

    - name: Show upgrade status
      ansible.builtin.debug:
        msg:
          - "Node: {{ inventory_hostname }}"
          - "Apt changed: {{ apt_upgrade_result.changed }}"
          - "Reboot required: {{ reboot_required.stat.exists }}"

    - name: Reboot node if required
      when: reboot_required.stat.exists
      ansible.builtin.reboot:
        msg: "Reboot after rolling OS upgrade"
        pre_reboot_delay: 5
        post_reboot_delay: 20
        connect_timeout: 20
        reboot_timeout: 900

    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    # Any failing command aborts the script (errexit) and fails the task.
    - name: Validate local slurm services
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          for unit in munge slurmd; do
            systemctl is-active "$unit"
          done
          munge -n | unmunge >/dev/null
          scontrol ping

  post_tasks:
    # Runs once per upgraded node (run_once is explicitly false).
    - name: Restart controller to refresh state after node upgrade
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      run_once: false
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Up to 15 attempts, 2 s apart, until `scontrol ping` exits 0.
    - name: Wait for controller after restart
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: slurmctld_ping
      until: slurmctld_ping.rc == 0
      retries: 15
      delay: 2
      changed_when: false
      ansible.builtin.command:
        argv: [scontrol, ping]

    # The three state updates are best-effort (errors discarded); the final
    # sinfo/scontrol calls must succeed.
    - name: Clear upgraded node maintenance state
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: resume_node
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          node={{ inventory_hostname }}
          for target_state in RESUME UNDRAIN IDLE; do
            scontrol update NodeName=$node State=$target_state 2>/dev/null || true
          done

          sleep 3
          sinfo -N -n $node
          scontrol show node $node

    # Healthy means: the commands succeed and their combined output contains
    # none of "not_responding", "down", "drain" or "idle*" (case-insensitive).
    - name: Wait until node is healthy
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: upgraded_node_state
      until:
        - upgraded_node_state.rc == 0
        - "'not_responding' not in upgraded_node_state.stdout.lower()"
        - "'down' not in upgraded_node_state.stdout.lower()"
        - "'drain' not in upgraded_node_state.stdout.lower()"
        - "'idle*' not in upgraded_node_state.stdout.lower()"
      retries: 30
      delay: 5
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ inventory_hostname }}
          scontrol show node {{ inventory_hostname }}

    - name: Submit node-local post-upgrade test job
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: node_test_job
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          # Unquoted delimiter: \$ defers expansion to the job; the node name
          # is rendered by Jinja before the shell runs.
          job_id="$(sudo -iu slurmuser sbatch --parsable <<SBATCH
          #!/bin/bash
          #SBATCH --job-name=rolling-upgrade-test
          #SBATCH --partition=all
          #SBATCH --nodelist={{ inventory_hostname }}
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=256M
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/rolling-upgrade-test-%j.out

          echo "HOST=\$(hostname)"
          echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
          echo "KERNEL=\$(uname -r)"
          date
          SBATCH
          )"

          echo "JOB_ID=$job_id"

          # Up to 90 polls while squeue still lists the job.
          for _round in $(seq 1 90); do
            squeue -h -j "$job_id" | grep -q . || break
            squeue -j "$job_id"
            sleep 1
          done

          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

          echo "### output"
          cat "/shared/rolling-upgrade-test-${job_id}.out"

    - name: Show node post-upgrade test result
      ansible.builtin.debug:
        var: node_test_job.stdout_lines
|
||||
@@ -0,0 +1,94 @@
|
||||
---
|
||||
# Upgrade the controller host: snapshot cluster state, apt full-upgrade,
# reboot if requested, restart munge/mariadb/slurmdbd/slurmctld in that order,
# wait for slurmctld to answer, then print a post-upgrade validation report.
- name: Upgrade Slurm controller OS safely
  hosts: slurm_controller
  gather_facts: true
  become: true

  tasks:
    # munge and slurmctld must be active; slurmdbd and mariadb are reported
    # but tolerated when inactive.
    - name: Show cluster state before controller upgrade
      register: before_state
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          scontrol ping
          sinfo
          squeue
          for unit in munge slurmctld; do
            systemctl is-active "$unit"
          done
          for unit in slurmdbd mariadb; do
            systemctl is-active "$unit" || true
          done

    - name: Print cluster state before controller upgrade
      ansible.builtin.debug:
        var: before_state.stdout_lines

    # The cache is refreshed only when it is older than 1800 seconds.
    - name: Update apt cache
      ansible.builtin.apt:
        cache_valid_time: 1800
        update_cache: true

    - name: Full upgrade controller packages
      register: controller_upgrade
      ansible.builtin.apt:
        upgrade: full
        autoclean: true
        autoremove: true

    - name: Check if reboot is required
      register: controller_reboot_required
      ansible.builtin.stat:
        path: /var/run/reboot-required

    - name: Show controller upgrade status
      ansible.builtin.debug:
        msg:
          - "Apt changed: {{ controller_upgrade.changed }}"
          - "Reboot required: {{ controller_reboot_required.stat.exists }}"

    - name: Reboot controller if required
      when: controller_reboot_required.stat.exists
      ansible.builtin.reboot:
        msg: "Reboot after controller OS upgrade"
        pre_reboot_delay: 5
        post_reboot_delay: 30
        connect_timeout: 20
        reboot_timeout: 900

    # Services are restarted one by one in the listed order.
    - name: Restart controller services
      loop:
        - munge
        - mariadb
        - slurmdbd
        - slurmctld
      ansible.builtin.systemd:
        name: "{{ item }}"
        enabled: true
        state: restarted

    # Up to 20 attempts, 3 s apart, until `scontrol ping` exits 0.
    - name: Wait for slurmctld
      register: slurmctld_ping
      until: slurmctld_ping.rc == 0
      retries: 20
      delay: 3
      changed_when: false
      ansible.builtin.command:
        argv: [scontrol, ping]

    - name: Validate controller after upgrade
      register: controller_after
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          scontrol ping
          sinfo
          squeue
          scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType"
          sacct -S today --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -20

    - name: Print controller validation after upgrade
      ansible.builtin.debug:
        var: controller_after.stdout_lines
|
||||
+207
@@ -0,0 +1,207 @@
|
||||
---
|
||||
# Print a controller-side report: slurmctld ping, node and partition state,
# queue, selected configuration keys and today's accounting records.
- name: Validate cluster after OS rolling upgrade
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Validate Slurm controller and cluster state
      register: cluster_state
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          # Prints a blank separator line followed by a "### <title>" header.
          section() {
            echo
            echo "### $1"
          }

          echo "### slurmctld ping"
          scontrol ping

          section "nodes"
          sinfo -N

          section "partitions"
          sinfo

          section "queue"
          squeue

          section "important config"
          scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType|SelectType|ClusterName"

          section "accounting recent jobs"
          sacct -S today --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30

    - name: Print cluster state
      ansible.builtin.debug:
        var: cluster_state.stdout_lines
|
||||
|
||||
|
||||
# On every worker: report host identity, require munge and slurmd to be
# active, round-trip a munge credential locally, ping the controller, and
# print config checksums and GPU details when available.
- name: Validate worker services after OS rolling upgrade
  hosts: slurm_compute:slurm_gpu
  gather_facts: true
  become: true

  tasks:
    - name: Validate local worker services and Slurm connectivity
      register: worker_state
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          # Prints a blank separator line followed by a "### <title>" header.
          section() {
            echo
            echo "### $1"
          }

          echo "HOST=$(hostname)"
          echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
          echo "KERNEL=$(uname -r)"
          echo "UPTIME=$(uptime -p)"

          section "services"
          for unit in munge slurmd; do
            systemctl is-active "$unit"
          done

          section "munge local test"
          munge -n | unmunge >/dev/null
          echo "munge OK"

          section "controller ping"
          scontrol ping

          section "local slurm.conf checksum"
          # Missing files are tolerated.
          sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true

          section "gpu check if present"
          if ! command -v nvidia-smi >/dev/null 2>&1; then
            echo "NO_NVIDIA_SMI"
          else
            nvidia-smi --query-gpu=index,name,driver_version,memory.total --format=csv,noheader || true
          fi

    - name: Print worker state
      ansible.builtin.debug:
        var: worker_state.stdout_lines
|
||||
|
||||
|
||||
# Submit a one-CPU job to the debug partition as slurmuser, wait for it to
# leave the queue, and print its accounting record and output file.
- name: Submit post-upgrade CPU validation job
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Submit CPU validation job to debug partition
      register: cpu_validation_job
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          # Quoted delimiter: the script body reaches sbatch unexpanded.
          job_id="$(sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=os-upgrade-cpu-test
          #SBATCH --partition=debug
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=256M
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/os-upgrade-cpu-test-%j.out

          echo "HOST=$(hostname)"
          echo "USER=$(whoami)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo "KERNEL=$(uname -r)"
          date
          SBATCH
          )"

          echo "JOB_ID=$job_id"

          # Up to 90 polls while squeue still lists the job.
          for _round in $(seq 1 90); do
            squeue -h -j "$job_id" | grep -q . || break
            squeue -j "$job_id"
            sleep 1
          done

          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

          echo "### output"
          cat "/shared/os-upgrade-cpu-test-${job_id}.out"

    - name: Print CPU validation job
      ansible.builtin.debug:
        var: cpu_validation_job.stdout_lines
|
||||
|
||||
|
||||
# Submit a one-GPU job to the gpu partition as slurmuser, wait for it to leave
# the queue, and print its accounting record and output file.
- name: Submit post-upgrade GPU validation job
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Submit GPU validation job to gpu partition
      register: gpu_validation_job
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          # Quoted delimiter: the script body reaches sbatch unexpanded.
          job_id="$(sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=os-upgrade-gpu-test
          #SBATCH --partition=gpu
          #SBATCH --gres=gpu:1
          #SBATCH --cpus-per-task=2
          #SBATCH --mem=1G
          #SBATCH --time=00:03:00
          #SBATCH --output=/shared/os-upgrade-gpu-test-%j.out

          echo "HOST=$(hostname)"
          echo "USER=$(whoami)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
          echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo "KERNEL=$(uname -r)"
          echo
          nvidia-smi
          SBATCH
          )"

          echo "JOB_ID=$job_id"

          # Up to 120 polls while squeue still lists the job.
          for _round in $(seq 1 120); do
            squeue -h -j "$job_id" | grep -q . || break
            squeue -j "$job_id"
            sleep 1
          done

          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

          echo "### output"
          cat "/shared/os-upgrade-gpu-test-${job_id}.out"

    - name: Print GPU validation job
      ansible.builtin.debug:
        var: gpu_validation_job.stdout_lines
|
||||
Reference in New Issue
Block a user