Add Slurm AI/HPC cluster platform project

2026-06-04 19:41:05 +00:00
parent e2624a7533
commit d300d490f5
49 changed files with 4777 additions and 0 deletions
@@ -0,0 +1,216 @@
+---
+# Play 1: ask the controller which nodes Slurm currently reports in a state
+# matching down/drain/fail/unknown/not_responding/idle*.
+- name: Detect problematic Slurm nodes
+  hosts: slurm_controller
+  gather_facts: false
+  become: true
+
+  tasks:
+    # Read-only query; prints one de-duplicated node name per line.
+    - name: Detect nodes needing remediation
+      ansible.builtin.shell:
+        executable: /bin/bash
+        cmd: |
+          set -euo pipefail
+
+          sinfo -N -h -o "%N %T" | awk '
+            tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
+          ' | sort -u
+      changed_when: false
+      register: bad_nodes_raw
+
+    # Kept as a host fact so later plays can read it through hostvars.
+    - name: Store bad node list
+      ansible.builtin.set_fact:
+        bad_nodes: "{{ bad_nodes_raw.stdout_lines }}"
+
+    - name: Show detected problematic nodes
+      ansible.builtin.debug:
+        var: bad_nodes
+
+
+# Play 2: restart Munge and slurmd on every node flagged by the controller,
+# then run a local sanity check.
+#
+# Hosts are processed one at a time (serial: 1). With single-host batches, a
+# node that fails or is unreachable fails its entire batch, which makes Ansible
+# abort the playbook before the controller play that drains unresolved nodes
+# can run. Unreachable hosts are therefore ignored and task failures are
+# rescued, so a node that cannot be repaired is only reported here.
+- name: Attempt auto-remediation on problematic nodes
+  hosts: slurm_compute:slurm_gpu
+  become: true
+  gather_facts: false
+  serial: 1
+  ignore_unreachable: true
+
+  vars:
+    # Node list stored as a fact on the first controller host; empty when unset.
+    bad_nodes_from_controller: "{{ hostvars[groups['slurm_controller'][0]].bad_nodes | default([]) }}"
+
+  tasks:
+    - name: Skip healthy nodes
+      ansible.builtin.meta: end_host
+      when: inventory_hostname not in bad_nodes_from_controller
+
+    - name: Remediate and validate local services
+      block:
+        - name: Restart Munge
+          ansible.builtin.systemd:
+            name: munge
+            state: restarted
+            enabled: true
+
+        - name: Restart slurmd
+          ansible.builtin.systemd:
+            name: slurmd
+            state: restarted
+            enabled: true
+
+        # Fails if munge or slurmd is inactive, the munge round trip fails, or
+        # the controller does not answer; listener and log output are
+        # informational only (`|| true`).
+        - name: Validate local services after remediation attempt
+          ansible.builtin.shell: |
+            set -euo pipefail
+
+            echo "HOST=$(hostname)"
+
+            echo
+            echo "### services"
+            systemctl is-active munge
+            systemctl is-active slurmd
+
+            echo
+            echo "### munge"
+            munge -n | unmunge >/dev/null
+            echo "munge OK"
+
+            echo
+            echo "### controller ping"
+            scontrol ping
+
+            echo
+            echo "### slurmd listener"
+            ss -lntp | grep ':6818 ' || true
+
+            echo
+            echo "### recent slurmd logs"
+            journalctl -u slurmd -n 30 --no-pager || true
+          args:
+            executable: /bin/bash
+          register: local_repair_check
+          changed_when: false
+
+        - name: Print local remediation result
+          ansible.builtin.debug:
+            var: local_repair_check.stdout_lines
+
+      rescue:
+        # Keep the host out of the failed set so the remaining plays still run.
+        - name: Report failed local remediation
+          ansible.builtin.debug:
+            msg: >-
+              Local remediation failed on {{ inventory_hostname }}
+              (task: {{ ansible_failed_task.name | default('unknown') }}).
+              The node is left for the controller play to re-check.
+
+
+# Play 3: restart slurmctld, then clear, re-check and (if needed) drain the
+# nodes that were flagged earlier.
+- name: Refresh controller and validate remediated nodes
+  hosts: slurm_controller
+  gather_facts: false
+  become: true
+
+  tasks:
+    - name: Restart slurmctld to refresh node states
+      ansible.builtin.systemd:
+        state: restarted
+        name: slurmctld
+
+    # Poll up to 15 times, 2 seconds apart, until `scontrol ping` exits 0.
+    - name: Wait for controller
+      ansible.builtin.command: scontrol ping
+      register: slurmctld_ping
+      until: slurmctld_ping.rc == 0
+      retries: 15
+      delay: 2
+      changed_when: false
+
+    # Best-effort reset of every node detected at the start of the run. Each
+    # `scontrol update` may legitimately be rejected depending on the node's
+    # current state, so individual failures are deliberately ignored.
+    - name: Clear maintenance state on previously bad nodes
+      ansible.builtin.shell: |
+        set -euo pipefail
+
+        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
+
+        if [ -z "$bad_nodes" ]; then
+          echo "No bad nodes detected. Nothing to clear."
+          sinfo -N
+          exit 0
+        fi
+
+        for node in $bad_nodes; do
+          echo "### clearing state on $node"
+          scontrol update NodeName="$node" State=RESUME 2>/dev/null || true
+          scontrol update NodeName="$node" State=UNDRAIN 2>/dev/null || true
+          scontrol update NodeName="$node" State=IDLE 2>/dev/null || true
+        done
+
+        sleep 5
+        sinfo -N
+      args:
+        executable: /bin/bash
+      register: clear_result
+      # The script exits early without touching anything when the list is
+      # empty, so only report a change when there were nodes to update.
+      changed_when: bad_nodes | default([]) | length > 0
+
+    - name: Print clear-state result
+      ansible.builtin.debug:
+        var: clear_result.stdout_lines
+
+    # Same state filter as the initial detection, run again after the reset.
+    - name: Detect nodes still unhealthy after remediation
+      ansible.builtin.shell:
+        executable: /bin/bash
+        cmd: |
+          set -euo pipefail
+
+          sinfo -N -h -o "%N %T" | awk '
+            tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
+          ' | sort -u
+      changed_when: false
+      register: still_bad_nodes_raw
+
+    - name: Store still bad nodes
+      ansible.builtin.set_fact:
+        still_bad_nodes: "{{ still_bad_nodes_raw.stdout_lines }}"
+
+    # Nodes that still match the filter are drained with an explicit reason;
+    # a failing `scontrol update` here aborts the task (no `|| true`).
+    - name: Drain nodes that remain unhealthy
+      ansible.builtin.shell:
+        executable: /bin/bash
+        cmd: |
+          set -euo pipefail
+
+          unresolved_nodes="{{ still_bad_nodes | join(' ') }}"
+
+          if [ -z "$unresolved_nodes" ]; then
+            echo "No unresolved unhealthy nodes."
+            sinfo -N
+            exit 0
+          fi
+
+          for node in $unresolved_nodes; do
+            echo "### draining unresolved node $node"
+            scontrol update NodeName="$node" State=DRAIN Reason="auto-remediation failed"
+          done
+
+          sinfo -N
+      changed_when: still_bad_nodes | length > 0
+      register: drain_unresolved
+
+    # Read-only report: initial list, remaining list, final sinfo and queue.
+    - name: Show remediation summary
+      ansible.builtin.shell:
+        executable: /bin/bash
+        cmd: |
+          set -euo pipefail
+
+          echo "### initial bad nodes"
+          bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
+          if [ -z "$bad_nodes" ]; then
+            echo "none"
+          else
+            printf '%s\n' $bad_nodes
+          fi
+
+          echo
+          echo "### still bad nodes"
+          still_bad_nodes="{{ (still_bad_nodes | default([])) | join(' ') }}"
+          if [ -z "$still_bad_nodes" ]; then
+            echo "none"
+          else
+            printf '%s\n' $still_bad_nodes
+          fi
+
+          echo
+          echo "### final sinfo"
+          sinfo -N
+
+          echo
+          echo "### queue"
+          squeue
+      changed_when: false
+      register: remediation_summary
+
+    - name: Print remediation summary
+      ansible.builtin.debug:
+        var: remediation_summary.stdout_lines
@@ -0,0 +1,149 @@
+---
+# Read-only health report for the controller: service status, controller
+# ping, node/partition/queue listings, accounting and today's failed jobs.
+# munge, slurmctld and `scontrol ping` are hard requirements; slurmdbd,
+# mariadb, sacctmgr and sacct are reported on a best-effort basis (`|| true`).
+- name: Check Slurm controller health
+  hosts: slurm_controller
+  become: true
+  gather_facts: false
+
+  tasks:
+    - name: Check controller services and cluster state
+      ansible.builtin.shell: |
+        set -euo pipefail
+
+        echo "### controller services"
+        systemctl is-active munge
+        systemctl is-active slurmctld
+        systemctl is-active slurmdbd || true
+        systemctl is-active mariadb || true
+
+        echo
+        echo "### slurm ping"
+        scontrol ping
+
+        echo
+        echo "### nodes"
+        sinfo -N
+
+        echo
+        echo "### partitions"
+        sinfo
+
+        echo
+        echo "### queue"
+        squeue
+
+        echo
+        echo "### problematic nodes"
+        sinfo -N -h -o "%N %T %E" | awk '$2 ~ /\*$/ || $2 !~ /idle|alloc|mix/ {print}' || true
+
+        echo
+        echo "### accounting"
+        sacctmgr -n list cluster || true
+
+        echo
+        echo "### recent failed jobs"
+        sacct -S today --state=FAILED,CANCELLED,TIMEOUT,NODE_FAIL,OUT_OF_MEMORY \
+          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,NodeList | tail -30 || true
+      args:
+        executable: /bin/bash
+      register: controller_health
+      changed_when: false
+      # The "problematic nodes" filter lists any state that is not
+      # idle/alloc/mix, plus any state ending in "*", so that idle*, mixed*
+      # and allocated* nodes are no longer hidden by the substring match.
+
+    - name: Print controller health
+      ansible.builtin.debug:
+        var: controller_health.stdout_lines
+
+
+# Read-only health report run on every compute and GPU node.
+- name: Check Slurm worker health
+  hosts: slurm_compute:slurm_gpu
+  gather_facts: true
+  become: true
+
+  tasks:
+    # Hard failures: inactive munge/slurmd, failed munge round trip, failed
+    # controller ping, or an unusable /shared directory. Everything guarded by
+    # `|| true` is informational.
+    - name: Check worker services, config and connectivity
+      ansible.builtin.shell:
+        executable: /bin/bash
+        cmd: |
+          set -euo pipefail
+
+          echo "HOST=$(hostname)"
+          echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
+          echo "KERNEL=$(uname -r)"
+          echo "UPTIME=$(uptime -p)"
+
+          echo
+          echo "### services"
+          systemctl is-active munge
+          systemctl is-active slurmd
+
+          echo
+          echo "### munge local test"
+          munge -n | unmunge >/dev/null
+          echo "munge OK"
+
+          echo
+          echo "### controller connectivity"
+          getent hosts slurm-ctl01 || true
+          scontrol ping
+
+          echo
+          echo "### slurmd listener"
+          ss -lntp | grep ':6818 ' || true
+
+          echo
+          echo "### config checksums"
+          sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true
+
+          echo
+          echo "### shared filesystem"
+          test -d /shared
+          touch /shared/.slurm-health-$(hostname)
+          ls -l /shared/.slurm-health-$(hostname)
+          rm -f /shared/.slurm-health-$(hostname)
+
+          echo
+          echo "### cgroup"
+          mount | grep cgroup || true
+
+          echo
+          echo "### gpu check"
+          if command -v nvidia-smi >/dev/null 2>&1; then
+            nvidia-smi --query-gpu=index,name,driver_version,memory.total,temperature.gpu,utilization.gpu --format=csv,noheader || true
+          else
+            echo "NO_NVIDIA_SMI"
+          fi
+      changed_when: false
+      register: worker_health
+
+    - name: Print worker health
+      ansible.builtin.debug:
+        var: worker_health.stdout_lines
+
+
+# Read-only report: a one-line summary for every node, followed by the full
+# `scontrol show node` record of each node whose state looks unhealthy.
+- name: Check Slurm-reported node state consistency
+  hosts: slurm_controller
+  become: true
+  gather_facts: false
+
+  tasks:
+    # The state filter must use `idle\*` (a literal asterisk). The previous
+    # `idle\\*` meant "idle followed by zero or more backslashes" to awk, which
+    # matched every healthy idle node and dumped its full record as well.
+    - name: Build Slurm node health summary
+      ansible.builtin.shell: |
+        set -euo pipefail
+
+        echo "### node summary"
+        sinfo -N -o "%N %P %T %C %m %G %E"
+
+        echo
+        echo "### full problematic node details"
+        for node in $(sinfo -N -h -o "%N %T" | awk 'tolower($2) ~ /down|drain|fail|unk|not_responding|idle\*/ {print $1}' | sort -u); do
+          echo
+          echo "### $node"
+          scontrol show node "$node"
+        done
+      args:
+        executable: /bin/bash
+      register: slurm_node_summary
+      changed_when: false
+
+    - name: Print Slurm node summary
+      ansible.builtin.debug:
+        var: slurm_node_summary.stdout_lines
@@ -0,0 +1,217 @@
+---
+# Pre-flight play: abort early unless `target_node` was passed and names a
+# host known to the inventory.
+- name: Validate target node
+  hosts: localhost
+  gather_facts: false
+
+  tasks:
+    - name: Require target_node
+      when: target_node is undefined
+      ansible.builtin.fail:
+        msg: "Use: ansible-playbook repair-slurm-node.yml -e target_node=<hostname>"
+
+    - name: Ensure target_node is in inventory
+      when: target_node not in groups['all']
+      ansible.builtin.fail:
+        msg: "target_node={{ target_node }} is not in Ansible inventory"
+
+
+# Snapshot of how the controller sees the target node before anything is
+# touched. Every query is best-effort (`|| true`), so this play only reports.
+- name: Capture node state before repair
+  hosts: slurm_controller
+  gather_facts: false
+  become: true
+
+  tasks:
+    - name: Show target node state before repair
+      ansible.builtin.shell:
+        executable: /bin/bash
+        cmd: |
+          set -euo pipefail
+
+          echo "### sinfo"
+          sinfo -N -n {{ target_node }} || true
+
+          echo
+          echo "### scontrol"
+          scontrol show node {{ target_node }} || true
+
+          echo
+          echo "### jobs"
+          squeue -w {{ target_node }} || true
+      changed_when: false
+      register: node_state_before
+
+    - name: Print target node state before repair
+      ansible.builtin.debug:
+        var: node_state_before.stdout_lines
+
+
+# Restart the local Slurm-related services on the target host and verify them.
+#
+# slurmd is only restarted on members of slurm_compute / slurm_gpu, so the
+# validation step applies the same condition before requiring slurmd to be
+# active. Otherwise a target outside those groups would fail validation
+# whenever slurmd is not running there.
+- name: Repair local services on target node
+  hosts: "{{ target_node }}"
+  become: true
+  gather_facts: false
+
+  vars:
+    # True when the target belongs to one of the worker groups.
+    target_runs_slurmd: "{{ inventory_hostname in groups.get('slurm_compute', []) or inventory_hostname in groups.get('slurm_gpu', []) }}"
+
+  tasks:
+    - name: Restart Munge
+      ansible.builtin.systemd:
+        name: munge
+        state: restarted
+        enabled: true
+
+    - name: Restart slurmd
+      ansible.builtin.systemd:
+        name: slurmd
+        state: restarted
+        enabled: true
+      when:
+        - target_runs_slurmd | bool
+
+    # Hard failures: inactive munge, inactive slurmd (worker nodes only),
+    # failed munge round trip, failed controller ping. Listener and log output
+    # are informational (`|| true`).
+    - name: Validate local repair
+      ansible.builtin.shell: |
+        set -euo pipefail
+
+        check_slurmd="{{ target_runs_slurmd | bool | ternary('yes', 'no') }}"
+
+        echo "### services"
+        systemctl is-active munge
+        if [ "$check_slurmd" = "yes" ]; then
+          systemctl is-active slurmd
+        else
+          echo "slurmd check skipped: host is not in slurm_compute or slurm_gpu"
+        fi
+
+        echo
+        echo "### munge"
+        munge -n | unmunge >/dev/null
+        echo "munge OK"
+
+        echo
+        echo "### controller ping"
+        scontrol ping
+
+        echo
+        echo "### slurmd listener"
+        ss -lntp | grep ':6818 ' || true
+
+        echo
+        echo "### recent slurmd logs"
+        journalctl -u slurmd -n 40 --no-pager || true
+      args:
+        executable: /bin/bash
+      register: local_repair_state
+      changed_when: false
+
+    - name: Print local repair state
+      ansible.builtin.debug:
+        var: local_repair_state.stdout_lines
+
+
+# Controller-side follow-up: restart slurmctld, reset the target node's state
+# and wait for it to report healthy.
+- name: Clear Slurm maintenance/down state after repair
+  hosts: slurm_controller
+  gather_facts: false
+  become: true
+
+  tasks:
+    - name: Restart controller to refresh node state
+      ansible.builtin.systemd:
+        state: restarted
+        name: slurmctld
+
+    # Poll up to 15 times, 2 seconds apart, until `scontrol ping` exits 0.
+    - name: Wait for controller
+      ansible.builtin.command: scontrol ping
+      register: slurmctld_ping
+      until: slurmctld_ping.rc == 0
+      retries: 15
+      delay: 2
+      changed_when: false
+
+    # Best-effort reset: each `scontrol update` may be rejected depending on
+    # the node's current state, so their failures are ignored. The two status
+    # commands at the end are not guarded and must succeed.
+    - name: Clear target node state
+      ansible.builtin.shell:
+        executable: /bin/bash
+        cmd: |
+          set -euo pipefail
+
+          scontrol update NodeName={{ target_node }} State=RESUME 2>/dev/null || true
+          scontrol update NodeName={{ target_node }} State=UNDRAIN 2>/dev/null || true
+          scontrol update NodeName={{ target_node }} State=IDLE 2>/dev/null || true
+
+          sleep 5
+
+          sinfo -N -n {{ target_node }}
+          scontrol show node {{ target_node }}
+      changed_when: true
+      register: clear_state
+
+    # Retry up to 30 times, 5 seconds apart, until the combined sinfo/scontrol
+    # output (lower-cased) contains none of the unhealthy markers listed below.
+    - name: Wait until node is healthy
+      ansible.builtin.shell:
+        executable: /bin/bash
+        cmd: |
+          set -euo pipefail
+          sinfo -N -n {{ target_node }}
+          scontrol show node {{ target_node }}
+      register: node_health_after
+      until:
+        - node_health_after.rc == 0
+        - "'not_responding' not in node_health_after.stdout.lower()"
+        - "'down' not in node_health_after.stdout.lower()"
+        - "'drain' not in node_health_after.stdout.lower()"
+        - "'idle*' not in node_health_after.stdout.lower()"
+      retries: 30
+      delay: 5
+      changed_when: false
+
+    - name: Print node state after repair
+      ansible.builtin.debug:
+        var: node_health_after.stdout_lines
+
+
+# Final play: run a tiny batch job pinned to the repaired node and print its
+# accounting record and output file.
+- name: Submit repair validation job
+  hosts: slurm_controller
+  become: true
+  gather_facts: false
+
+  tasks:
+    # The script submits the job as `slurmuser` through a heredoc, pinning it
+    # to target_node with --nodelist. `\$` escapes inside the heredoc defer
+    # expansion to the job itself rather than the submitting shell.
+    # It then polls squeue once per second for at most 90 iterations and
+    # finally prints `sacct` for the job and the job's output file under
+    # /shared. The job's final State is printed but not evaluated: the task
+    # fails only when one of the unguarded commands fails (for example sbatch,
+    # sacct, or `cat` on a missing output file), because of `set -euo pipefail`.
+    # NOTE(review): partition "all", account "lab", QOS "normal" and user
+    # "slurmuser" are hard-coded; confirm they exist on the target cluster.
+    - name: Submit validation job to repaired node
+      ansible.builtin.shell: |
+        set -euo pipefail
+
+        job_id="$(
+          sudo -iu slurmuser sbatch --parsable <<SBATCH
+        #!/bin/bash
+        #SBATCH --job-name=repair-node-test
+        #SBATCH --partition=all
+        #SBATCH --nodelist={{ target_node }}
+        #SBATCH --cpus-per-task=1
+        #SBATCH --mem=256M
+        #SBATCH --time=00:02:00
+        #SBATCH --account=lab
+        #SBATCH --qos=normal
+        #SBATCH --output=/shared/repair-node-test-%j.out
+
+        echo "HOST=\$(hostname)"
+        echo "USER=\$(whoami)"
+        echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
+        echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
+        echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
+        date
+        SBATCH
+        )"
+
+        echo "JOB_ID=$job_id"
+
+        for i in $(seq 1 90); do
+          if squeue -h -j "$job_id" | grep -q .; then
+            squeue -j "$job_id"
+            sleep 1
+          else
+            break
+          fi
+        done
+
+        echo "### sacct"
+        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList
+
+        echo "### output"
+        cat "/shared/repair-node-test-${job_id}.out"
+      args:
+        executable: /bin/bash
+      register: repair_validation_job
+      # Submitting a job always alters cluster state, hence always "changed".
+      changed_when: true
+
+    - name: Print repair validation job
+      ansible.builtin.debug:
+        var: repair_validation_job.stdout_lines