Add Slurm AI/HPC cluster platform project
This commit is contained in:
+236
@@ -0,0 +1,236 @@
|
||||
---
|
||||
# Pre-flight play: runs only on the control machine and aborts the run early
# when the selected canary host cannot be used.
- name: Validate canary node variable
  hosts: localhost
  gather_facts: false

  vars:
    # Falls back to 'slurm-c02' when canary_node is not supplied.
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"

  tasks:
    - name: Ensure canary node is in inventory
      when: canary_node_effective not in groups['all']
      ansible.builtin.fail:
        msg: "canary_node={{ canary_node_effective }} is not in inventory"

    - name: Ensure canary node is not the controller
      when: canary_node_effective in groups['slurm_controller']
      ansible.builtin.fail:
        msg: "Do not use controller as canary for worker rolling upgrade"
|
||||
|
||||
|
||||
# Runs on the Slurm controller: prints the canary's current state, puts the
# node into DRAIN, then blocks until squeue reports no job on it.
- name: Drain canary node
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    # Falls back to 'slurm-c02' when canary_node is not supplied.
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"

  tasks:
    # Informational only: every query is best-effort (`|| true`) so a failing
    # one never blocks the drain that follows.
    - name: Show canary state before drain
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ canary_node_effective }} || true
        scontrol show node {{ canary_node_effective }} || true
        squeue -w {{ canary_node_effective }} || true
      args:
        executable: /bin/bash
      register: canary_before
      changed_when: false

    - name: Print canary state before drain
      ansible.builtin.debug:
        var: canary_before.stdout_lines

    - name: Drain canary node
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ canary_node_effective }} State=DRAIN Reason="canary OS upgrade"
      changed_when: true

    # squeue must succeed for its empty output to mean "no jobs". The query
    # is therefore NOT masked with `|| true`: a failed squeue (non-zero rc)
    # is retried like a busy node instead of being read as an empty queue,
    # and the task fails once the retries are exhausted.
    - name: Wait until canary has no running jobs
      ansible.builtin.shell: |
        set -euo pipefail
        squeue -h -w {{ canary_node_effective }}
      args:
        executable: /bin/bash
      register: canary_jobs
      retries: 120
      delay: 10
      until:
        - canary_jobs.rc == 0
        - canary_jobs.stdout | trim == ""
      changed_when: false
|
||||
|
||||
|
||||
# Upgrades the canary host itself: apt full-upgrade, reboot when
# /var/run/reboot-required exists, then restart and verify munge and slurmd.
- name: Upgrade canary node OS packages
  hosts: "{{ canary_node | default('slurm-c02') }}"
  gather_facts: true
  become: true

  tasks:
    - name: Ensure apt cache is updated
      ansible.builtin.apt:
        cache_valid_time: 1800
        update_cache: true

    - name: Full upgrade packages
      register: apt_upgrade_result
      ansible.builtin.apt:
        autoclean: true
        autoremove: true
        upgrade: full

    - name: Check if reboot is required
      register: reboot_required
      ansible.builtin.stat:
        path: /var/run/reboot-required

    - name: Show upgrade summary
      ansible.builtin.debug:
        msg:
          - "Host: {{ inventory_hostname }}"
          - "Apt changed: {{ apt_upgrade_result.changed }}"
          - "Reboot required: {{ reboot_required.stat.exists }}"

    - name: Reboot canary if required
      when: reboot_required.stat.exists
      ansible.builtin.reboot:
        msg: "Reboot after canary OS upgrade"
        pre_reboot_delay: 5
        post_reboot_delay: 20
        connect_timeout: 20
        reboot_timeout: 900

    # Both daemons are restarted unconditionally, whether or not a reboot ran.
    - name: Ensure munge is running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Ensure slurmd is running
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    # Any failing command aborts the script (set -e), which fails the task.
    - name: Validate local services
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          systemctl is-active munge
          systemctl is-active slurmd
          munge -n | unmunge >/dev/null
          scontrol ping
|
||||
|
||||
|
||||
# Runs on the Slurm controller after the canary was upgraded: refreshes the
# controller, returns the canary to service, waits for a healthy node state
# and proves the node works by running a small batch job pinned to it.
- name: Resume canary node and run canary job
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    # Falls back to 'slurm-c02' when canary_node is not supplied.
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"

  tasks:
    - name: Reconfigure controller
      ansible.builtin.command:
        cmd: scontrol reconfigure
      changed_when: true

    - name: Restart controller to refresh node state
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # The three state updates are deliberately best-effort: each one is
    # allowed to fail silently, and the following task decides whether the
    # node actually came back.
    - name: Clear canary node maintenance state
      ansible.builtin.shell: |
        set -euo pipefail

        scontrol update NodeName={{ canary_node_effective }} State=RESUME 2>/dev/null || true
        scontrol update NodeName={{ canary_node_effective }} State=UNDRAIN 2>/dev/null || true
        scontrol update NodeName={{ canary_node_effective }} State=IDLE 2>/dev/null || true

        sleep 3
        sinfo -N -n {{ canary_node_effective }}
        scontrol show node {{ canary_node_effective }}
      args:
        executable: /bin/bash
      register: resume_canary
      changed_when: true

    # Text match over the combined sinfo + scontrol output: retried until none
    # of the unhealthy markers appears anywhere in it.
    - name: Wait until canary is IDLE and responding
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ canary_node_effective }}
        scontrol show node {{ canary_node_effective }}
      args:
        executable: /bin/bash
      register: canary_state
      retries: 30
      delay: 5
      until:
        - canary_state.rc == 0
        - "'not_responding' not in canary_state.stdout.lower()"
        - "'down' not in canary_state.stdout.lower()"
        - "'drain' not in canary_state.stdout.lower()"
        - "'idle*' not in canary_state.stdout.lower()"
      changed_when: false

    # The heredoc is unquoted, so `\$` keeps expansions for the job itself
    # while the Jinja expression is rendered before the shell runs. The
    # heredoc body and its terminator must stay at the block's base indent.
    #
    # The task fails when the job is still queued/running after the wait loop
    # (it is cancelled first) or when its final JobState is not COMPLETED, so
    # a broken canary cannot pass just because an output file exists.
    # NOTE(review): the final state is read with `scontrol show job`, which
    # assumes the finished job is still known to slurmctld - confirm MinJobAge.
    - name: Submit canary test job to upgraded node
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<SBATCH
        #!/bin/bash
        #SBATCH --job-name=canary-upgrade-test
        #SBATCH --partition=all
        #SBATCH --nodelist={{ canary_node_effective }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/canary-upgrade-test-%j.out

        echo "HOST=\$(hostname)"
        echo "USER=\$(whoami)"
        echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
        echo "KERNEL=\$(uname -r)"
        date
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        for i in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        # Still listed after the wait loop: treat as a failed canary.
        if squeue -h -j "$job_id" 2>/dev/null | grep -q .; then
          echo "ERROR: job $job_id did not finish within the wait window; cancelling it" >&2
          scancel "$job_id" || true
          exit 1
        fi

        job_state="$(scontrol -o show job "$job_id" | grep -o 'JobState=[A-Z_]*' || true)"

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### state"
        echo "${job_state:-JobState=UNKNOWN}"

        echo "### output"
        cat "/shared/canary-upgrade-test-${job_id}.out"

        if [[ "$job_state" != "JobState=COMPLETED" ]]; then
          echo "ERROR: job $job_id ended as ${job_state:-JobState=UNKNOWN}" >&2
          exit 1
        fi
      args:
        executable: /bin/bash
      register: canary_job
      changed_when: true

    - name: Show canary test result
      ansible.builtin.debug:
        var: canary_job.stdout_lines
|
||||
+197
@@ -0,0 +1,197 @@
|
||||
---
|
||||
# Upgrades worker nodes one at a time (serial: 1). For each host the node is
# drained on the controller, upgraded and rebooted if needed, returned to
# service, and verified with a batch job pinned to that node. Controller-side
# steps are delegated to the first host of the slurm_controller group.
- name: Rolling upgrade Slurm worker nodes
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: true
  serial: 1

  vars:
    # Falls back to 'slurm-c02' when canary_node is not supplied.
    skip_canary_node: "{{ canary_node | default('slurm-c02') }}"
    # The canary host is skipped unless skip_canary is set to a false value.
    do_skip_canary: "{{ skip_canary | default(true) | bool }}"

  pre_tasks:
    - name: Skip canary node if requested
      ansible.builtin.meta: end_host
      when:
        - do_skip_canary
        - inventory_hostname == skip_canary_node

    - name: Drain node before OS upgrade
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ inventory_hostname }} State=DRAIN Reason="rolling OS upgrade"
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      changed_when: true

    # squeue must succeed for its empty output to mean "no jobs". The query
    # is therefore NOT masked with `|| true`: a failed squeue (non-zero rc)
    # is retried like a busy node instead of being read as an empty queue,
    # and the task fails once the retries are exhausted.
    - name: Wait until no jobs are running on this node
      ansible.builtin.shell: |
        set -euo pipefail
        squeue -h -w {{ inventory_hostname }}
      args:
        executable: /bin/bash
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: jobs_on_node
      retries: 120
      delay: 10
      until:
        - jobs_on_node.rc == 0
        - jobs_on_node.stdout | trim == ""
      changed_when: false

  tasks:
    - name: Update apt cache
      ansible.builtin.apt:
        update_cache: true
        cache_valid_time: 1800

    - name: Full upgrade packages
      ansible.builtin.apt:
        upgrade: full
        autoremove: true
        autoclean: true
      register: apt_upgrade_result

    - name: Check if reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: reboot_required

    - name: Show upgrade status
      ansible.builtin.debug:
        msg:
          - "Node: {{ inventory_hostname }}"
          - "Apt changed: {{ apt_upgrade_result.changed }}"
          - "Reboot required: {{ reboot_required.stat.exists }}"

    - name: Reboot node if required
      ansible.builtin.reboot:
        msg: "Reboot after rolling OS upgrade"
        reboot_timeout: 900
        connect_timeout: 20
        pre_reboot_delay: 5
        post_reboot_delay: 20
      when: reboot_required.stat.exists

    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted
        enabled: true

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
        enabled: true

    # Any failing command aborts the script (set -e), which fails the task.
    - name: Validate local slurm services
      ansible.builtin.shell: |
        set -euo pipefail
        systemctl is-active munge
        systemctl is-active slurmd
        munge -n | unmunge >/dev/null
        scontrol ping
      args:
        executable: /bin/bash
      changed_when: false

  post_tasks:
    # run_once is explicitly false: the controller is restarted once for
    # every upgraded host, not once per play.
    - name: Restart controller to refresh state after node upgrade
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      run_once: false

    - name: Wait for controller after restart
      ansible.builtin.command:
        cmd: scontrol ping
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # The three state updates are deliberately best-effort: each one is
    # allowed to fail silently, and the following task decides whether the
    # node actually came back.
    - name: Clear upgraded node maintenance state
      ansible.builtin.shell: |
        set -euo pipefail

        scontrol update NodeName={{ inventory_hostname }} State=RESUME 2>/dev/null || true
        scontrol update NodeName={{ inventory_hostname }} State=UNDRAIN 2>/dev/null || true
        scontrol update NodeName={{ inventory_hostname }} State=IDLE 2>/dev/null || true

        sleep 3
        sinfo -N -n {{ inventory_hostname }}
        scontrol show node {{ inventory_hostname }}
      args:
        executable: /bin/bash
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: resume_node
      changed_when: true

    # Text match over the combined sinfo + scontrol output: retried until none
    # of the unhealthy markers appears anywhere in it.
    - name: Wait until node is healthy
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ inventory_hostname }}
        scontrol show node {{ inventory_hostname }}
      args:
        executable: /bin/bash
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: upgraded_node_state
      retries: 30
      delay: 5
      until:
        - upgraded_node_state.rc == 0
        - "'not_responding' not in upgraded_node_state.stdout.lower()"
        - "'down' not in upgraded_node_state.stdout.lower()"
        - "'drain' not in upgraded_node_state.stdout.lower()"
        - "'idle*' not in upgraded_node_state.stdout.lower()"
      changed_when: false

    # The heredoc is unquoted, so `\$` keeps expansions for the job itself
    # while the Jinja expression is rendered before the shell runs. The
    # heredoc body and its terminator must stay at the block's base indent.
    #
    # The task fails when the job is still queued/running after the wait loop
    # (it is cancelled first) or when its final JobState is not COMPLETED, so
    # the rollout stops on a node that cannot run work.
    # NOTE(review): the final state is read with `scontrol show job`, which
    # assumes the finished job is still known to slurmctld - confirm MinJobAge.
    - name: Submit node-local post-upgrade test job
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<SBATCH
        #!/bin/bash
        #SBATCH --job-name=rolling-upgrade-test
        #SBATCH --partition=all
        #SBATCH --nodelist={{ inventory_hostname }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/rolling-upgrade-test-%j.out

        echo "HOST=\$(hostname)"
        echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
        echo "KERNEL=\$(uname -r)"
        date
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        for i in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        # Still listed after the wait loop: treat as a failed node test.
        if squeue -h -j "$job_id" 2>/dev/null | grep -q .; then
          echo "ERROR: job $job_id did not finish within the wait window; cancelling it" >&2
          scancel "$job_id" || true
          exit 1
        fi

        job_state="$(scontrol -o show job "$job_id" | grep -o 'JobState=[A-Z_]*' || true)"

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### state"
        echo "${job_state:-JobState=UNKNOWN}"

        echo "### output"
        cat "/shared/rolling-upgrade-test-${job_id}.out"

        if [[ "$job_state" != "JobState=COMPLETED" ]]; then
          echo "ERROR: job $job_id ended as ${job_state:-JobState=UNKNOWN}" >&2
          exit 1
        fi
      args:
        executable: /bin/bash
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: node_test_job
      changed_when: true

    - name: Show node post-upgrade test result
      ansible.builtin.debug:
        var: node_test_job.stdout_lines
|
||||
@@ -0,0 +1,94 @@
|
||||
---
|
||||
# Upgrades the controller host: snapshots cluster state, runs an apt
# full-upgrade, reboots when /var/run/reboot-required exists, restarts the
# controller-side services and prints a post-upgrade validation report.
- name: Upgrade Slurm controller OS safely
  hosts: slurm_controller
  gather_facts: true
  become: true

  tasks:
    # slurmdbd and mariadb status checks are tolerated to fail (`|| true`);
    # every other command aborts the task on error.
    - name: Show cluster state before controller upgrade
      register: before_state
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          scontrol ping
          sinfo
          squeue
          systemctl is-active munge
          systemctl is-active slurmctld
          systemctl is-active slurmdbd || true
          systemctl is-active mariadb || true

    - name: Print cluster state before controller upgrade
      ansible.builtin.debug:
        var: before_state.stdout_lines

    - name: Update apt cache
      ansible.builtin.apt:
        cache_valid_time: 1800
        update_cache: true

    - name: Full upgrade controller packages
      register: controller_upgrade
      ansible.builtin.apt:
        autoclean: true
        autoremove: true
        upgrade: full

    - name: Check if reboot is required
      register: controller_reboot_required
      ansible.builtin.stat:
        path: /var/run/reboot-required

    - name: Show controller upgrade status
      ansible.builtin.debug:
        msg:
          - "Apt changed: {{ controller_upgrade.changed }}"
          - "Reboot required: {{ controller_reboot_required.stat.exists }}"

    - name: Reboot controller if required
      when: controller_reboot_required.stat.exists
      ansible.builtin.reboot:
        msg: "Reboot after controller OS upgrade"
        pre_reboot_delay: 5
        post_reboot_delay: 30
        connect_timeout: 20
        reboot_timeout: 900

    # Services are restarted one after another in the listed order.
    - name: Restart controller services
      loop:
        - munge
        - mariadb
        - slurmdbd
        - slurmctld
      ansible.builtin.systemd:
        name: "{{ item }}"
        enabled: true
        state: restarted

    - name: Wait for slurmctld
      register: slurmctld_ping
      until: slurmctld_ping.rc == 0
      retries: 20
      delay: 3
      changed_when: false
      ansible.builtin.command:
        cmd: scontrol ping

    # With pipefail, a grep that matches nothing fails the task.
    - name: Validate controller after upgrade
      register: controller_after
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          scontrol ping
          sinfo
          squeue
          scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType"
          sacct -S today --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -20

    - name: Print controller validation after upgrade
      ansible.builtin.debug:
        var: controller_after.stdout_lines
|
||||
+207
@@ -0,0 +1,207 @@
|
||||
---
|
||||
# Read-only report produced on the controller: controller ping, node and
# partition listings, the job queue, selected config keys and today's
# accounting records.
- name: Validate cluster after OS rolling upgrade
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    # Any failing command (including a grep with no match, because of
    # pipefail) aborts the script and fails the task.
    - name: Validate Slurm controller and cluster state
      register: cluster_state
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          echo "### slurmctld ping"
          scontrol ping

          echo
          echo "### nodes"
          sinfo -N

          echo
          echo "### partitions"
          sinfo

          echo
          echo "### queue"
          squeue

          echo
          echo "### important config"
          scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType|SelectType|ClusterName"

          echo
          echo "### accounting recent jobs"
          sacct -S today --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30

    - name: Print cluster state
      ansible.builtin.debug:
        var: cluster_state.stdout_lines
|
||||
|
||||
|
||||
# Per-worker health report: host identity, munge and slurmd status, a local
# munge round-trip, a controller ping, config checksums and an optional GPU
# listing.
- name: Validate worker services after OS rolling upgrade
  hosts: slurm_compute:slurm_gpu
  gather_facts: true
  become: true

  tasks:
    # Service, munge and controller checks are fatal; the checksum and
    # nvidia-smi steps are best-effort (`|| true`).
    - name: Validate local worker services and Slurm connectivity
      register: worker_state
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          echo "HOST=$(hostname)"
          echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
          echo "KERNEL=$(uname -r)"
          echo "UPTIME=$(uptime -p)"

          echo
          echo "### services"
          systemctl is-active munge
          systemctl is-active slurmd

          echo
          echo "### munge local test"
          munge -n | unmunge >/dev/null
          echo "munge OK"

          echo
          echo "### controller ping"
          scontrol ping

          echo
          echo "### local slurm.conf checksum"
          sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true

          echo
          echo "### gpu check if present"
          if command -v nvidia-smi >/dev/null 2>&1; then
            nvidia-smi --query-gpu=index,name,driver_version,memory.total --format=csv,noheader || true
          else
            echo "NO_NVIDIA_SMI"
          fi

    - name: Print worker state
      ansible.builtin.debug:
        var: worker_state.stdout_lines
|
||||
|
||||
|
||||
# Submits a small CPU batch job to the debug partition from the controller
# and fails the play unless that job finishes as COMPLETED.
- name: Submit post-upgrade CPU validation job
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # The heredoc is quoted ('SBATCH'), so the job script is passed to sbatch
    # verbatim. Its body and terminator must stay at the block's base indent.
    #
    # The task fails when the job is still queued/running after the wait loop
    # (it is cancelled first) or when its final JobState is not COMPLETED, so
    # a failed validation job cannot pass just because an output file exists.
    # NOTE(review): the final state is read with `scontrol show job`, which
    # assumes the finished job is still known to slurmctld - confirm MinJobAge.
    - name: Submit CPU validation job to debug partition
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=os-upgrade-cpu-test
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/os-upgrade-cpu-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo "KERNEL=$(uname -r)"
        date
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        for i in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        # Still listed after the wait loop: treat as a failed validation.
        if squeue -h -j "$job_id" 2>/dev/null | grep -q .; then
          echo "ERROR: job $job_id did not finish within the wait window; cancelling it" >&2
          scancel "$job_id" || true
          exit 1
        fi

        job_state="$(scontrol -o show job "$job_id" | grep -o 'JobState=[A-Z_]*' || true)"

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### state"
        echo "${job_state:-JobState=UNKNOWN}"

        echo "### output"
        cat "/shared/os-upgrade-cpu-test-${job_id}.out"

        if [[ "$job_state" != "JobState=COMPLETED" ]]; then
          echo "ERROR: job $job_id ended as ${job_state:-JobState=UNKNOWN}" >&2
          exit 1
        fi
      args:
        executable: /bin/bash
      register: cpu_validation_job
      changed_when: true

    - name: Print CPU validation job
      ansible.builtin.debug:
        var: cpu_validation_job.stdout_lines
|
||||
|
||||
|
||||
# Submits a one-GPU batch job to the gpu partition from the controller and
# fails the play unless that job finishes as COMPLETED.
- name: Submit post-upgrade GPU validation job
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # The heredoc is quoted ('SBATCH'), so the job script is passed to sbatch
    # verbatim. Its body and terminator must stay at the block's base indent.
    #
    # The task fails when the job is still queued/running after the wait loop
    # (it is cancelled first) or when its final JobState is not COMPLETED, so
    # a job whose last command (nvidia-smi) fails no longer passes silently.
    # NOTE(review): the final state is read with `scontrol show job`, which
    # assumes the finished job is still known to slurmctld - confirm MinJobAge.
    - name: Submit GPU validation job to gpu partition
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=os-upgrade-gpu-test
        #SBATCH --partition=gpu
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/os-upgrade-gpu-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo "KERNEL=$(uname -r)"
        echo
        nvidia-smi
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        for i in $(seq 1 120); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        # Still listed after the wait loop: treat as a failed validation.
        if squeue -h -j "$job_id" 2>/dev/null | grep -q .; then
          echo "ERROR: job $job_id did not finish within the wait window; cancelling it" >&2
          scancel "$job_id" || true
          exit 1
        fi

        job_state="$(scontrol -o show job "$job_id" | grep -o 'JobState=[A-Z_]*' || true)"

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### state"
        echo "${job_state:-JobState=UNKNOWN}"

        echo "### output"
        cat "/shared/os-upgrade-gpu-test-${job_id}.out"

        if [[ "$job_state" != "JobState=COMPLETED" ]]; then
          echo "ERROR: job $job_id ended as ${job_state:-JobState=UNKNOWN}" >&2
          exit 1
        fi
      args:
        executable: /bin/bash
      register: gpu_validation_job
      changed_when: true

    - name: Print GPU validation job
      ansible.builtin.debug:
        var: gpu_validation_job.stdout_lines
|
||||
Reference in New Issue
Block a user