Add Slurm AI/HPC cluster platform project
This commit is contained in:
@@ -0,0 +1,58 @@
|
||||
---
|
||||
# Bootstrap play: prepares the Slurm nodes so the Ansible controller can
# manage them over key-based SSH with passwordless sudo.
# Fact gathering is disabled because Python may not be installed yet.
- name: Bootstrap Ansible SSH access from pvef to Slurm nodes
  hosts: slurm_cluster
  gather_facts: false
  become: true

  vars:
    # Public ed25519 key read on the controller from $HOME/.ssh of the user
    # running the playbook.
    ansible_controller_pubkey: "{{ lookup('file', lookup('env', 'HOME') + '/.ssh/id_ed25519.pub') }}"

  pre_tasks:
    # wait_for_connection verifies the transport by running a Python-based
    # module on the target, so it cannot succeed before Python is installed.
    # Probe the SSH TCP port from the controller first instead.
    # NOTE(review): assumes the controller reaches the nodes directly (no
    # jump host) - confirm against the inventory.
    - name: Wait for SSH
      ansible.builtin.wait_for:
        host: "{{ ansible_host | default(inventory_hostname) }}"
        port: "{{ ansible_port | default(22) }}"
        state: started
        timeout: 30
      delegate_to: localhost
      become: false

    # raw needs no Python on the target; the command is a no-op when
    # /usr/bin/python3 already exists.
    - name: Install Python if missing - Debian/Ubuntu
      ansible.builtin.raw: |
        test -e /usr/bin/python3 || (apt-get update && apt-get install -y python3)
      changed_when: false

    # With Python in place the end-to-end connection check can now pass.
    - name: Wait for a usable Ansible connection
      ansible.builtin.wait_for_connection:
        timeout: 30

  tasks:
    - name: Ensure sudo and OpenSSH server are installed
      ansible.builtin.apt:
        name:
          - sudo
          - openssh-server
        state: present
        update_cache: true

    - name: Ensure SSH server is enabled and running
      ansible.builtin.service:
        name: ssh
        state: started
        enabled: true

    # manage_dir creates ~/.ssh in the user's real home directory with the
    # right owner and mode, so no separate task with a hard-coded
    # /home/<user> path is needed (that path is wrong for root, for example).
    - name: Add pvef root public key to login user's authorized_keys
      ansible.posix.authorized_key:
        user: "{{ ansible_user }}"
        key: "{{ ansible_controller_pubkey }}"
        state: present
        manage_dir: true

    # The drop-in is syntax-checked with visudo before it is installed.
    - name: Allow bootstrap login user passwordless sudo
      ansible.builtin.copy:
        dest: "/etc/sudoers.d/90-ansible-{{ ansible_user }}"
        owner: root
        group: root
        mode: "0440"
        content: |
          {{ ansible_user }} ALL=(ALL) NOPASSWD:ALL
        validate: "visudo -cf %s"
|
||||
@@ -0,0 +1,16 @@
|
||||
---
|
||||
# Writes a managed block into /etc/hosts on every host in slurm_cluster,
# listing the controller and the Slurm nodes.
- name: Configure /etc/hosts for Slurm cluster
  hosts: slurm_cluster
  gather_facts: false
  become: true

  tasks:
    - name: Add Slurm cluster hosts to /etc/hosts
      ansible.builtin.blockinfile:
        # The marker lines delimit the region this task owns in the file.
        marker: "# {mark} ANSIBLE MANAGED SLURM CLUSTER HOSTS"
        path: /etc/hosts
        # One line for the controller, then one line per entry of slurm_nodes
        # whose managed_state is 'present' (the default when it is unset).
        block: |
          {{ slurm_control_addr }} {{ slurm_control_machine }}
          {% for node in slurm_nodes if node.managed_state | default('present') == 'present' %}
          {{ node.addr }} {{ node.name }}
          {% endfor %}
|
||||
@@ -0,0 +1,218 @@
|
||||
---
|
||||
# Creates the operator account on every node, gives it an ed25519 key pair
# and stores each node's public key as the host fact `slurmuser_pubkey`.
- name: Create slurmuser and generate SSH keys on every Slurm node
  hosts: slurm_cluster
  become: true
  gather_facts: true

  vars:
    slurm_operator_user: slurmuser
    slurm_operator_shell: /bin/bash

  tasks:
    - name: Ensure useful packages are installed
      ansible.builtin.apt:
        name:
          - sudo
          - openssh-client
          - openssh-server
          - acl
        state: present
        update_cache: true

    - name: Ensure slurmuser exists
      ansible.builtin.user:
        name: "{{ slurm_operator_user }}"
        shell: "{{ slurm_operator_shell }}"
        create_home: true
        state: present

    - name: Ensure .ssh directory exists for slurmuser
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    # openssh_keypair is provided by the community.crypto collection; use its
    # canonical name instead of relying on the ansible.builtin redirect.
    # force: false leaves an existing key pair untouched.
    - name: Generate SSH key for slurmuser if missing
      community.crypto.openssh_keypair:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519"
        type: ed25519
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"
        comment: "{{ slurm_operator_user }}@{{ inventory_hostname }}"
        force: false

    # slurp returns the file content base64-encoded.
    - name: Read public key from each node
      ansible.builtin.slurp:
        src: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub"
      register: slurmuser_pubkey_raw

    - name: Store decoded public key as host fact
      ansible.builtin.set_fact:
        slurmuser_pubkey: "{{ slurmuser_pubkey_raw.content | b64decode | trim }}"
|
||||
|
||||
|
||||
# Builds the SSH mesh for the operator account: installs the slurmuser
# public keys of the cluster hosts on each node, records the host keys in
# known_hosts, and re-asserts ownership and modes of the SSH files.
- name: Exchange slurmuser SSH keys across all Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    slurm_operator_user: slurmuser

  tasks:
    # Reads the `slurmuser_pubkey` host fact of each host in the group.
    # Hosts without that fact (for example excluded by --limit) are skipped
    # instead of aborting the loop with an undefined-variable error.
    - name: Install all slurmuser public keys into authorized_keys on every node
      ansible.posix.authorized_key:
        user: "{{ slurm_operator_user }}"
        key: "{{ hostvars[item].slurmuser_pubkey }}"
        state: present
        manage_dir: true
      loop: "{{ groups['slurm_cluster'] }}"
      when: hostvars[item].slurmuser_pubkey is defined

    # ssh-keyscan -H salts every hashed entry randomly, so two scans of the
    # same host never produce identical lines and `sort -u` alone cannot
    # de-duplicate them. Remove a host's previous entries with ssh-keygen -R
    # before appending the fresh scan, and only when the scan returned
    # something, so an unreachable host keeps its existing entries.
    # Hosts without ansible_host fall back to their inventory name.
    - name: Build SSH known_hosts entries for all cluster nodes
      ansible.builtin.shell: |
        set -e
        ssh_dir=/home/{{ slurm_operator_user }}/.ssh
        known_hosts="$ssh_dir/known_hosts"
        mkdir -p "$ssh_dir"
        touch "$known_hosts"

        {% for host in groups['slurm_cluster'] %}
        {% set addr = hostvars[host].ansible_host | default(host) %}
        scanned="$(ssh-keyscan -H {{ host }} {{ addr }} 2>/dev/null || true)"
        if [ -n "$scanned" ]; then
          ssh-keygen -R {{ host }} -f "$known_hosts" >/dev/null 2>&1 || true
          ssh-keygen -R {{ addr }} -f "$known_hosts" >/dev/null 2>&1 || true
          printf '%s\n' "$scanned" >> "$known_hosts"
        fi
        {% endfor %}

        sort -u "$known_hosts" -o "$known_hosts"
        # ssh-keygen -R leaves a backup copy behind; it is not needed.
        rm -f "$known_hosts.old"
        chown {{ slurm_operator_user }}:{{ slurm_operator_user }} "$known_hosts"
        chmod 0644 "$known_hosts"
      args:
        executable: /bin/bash
      # The file is rewritten on every run, so the task always reports a change.
      changed_when: true

    - name: Ensure SSH permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    - name: Ensure private key permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"

    - name: Ensure public key permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0644"
|
||||
|
||||
|
||||
# Installs a sudoers drop-in for the operator account: one variant on hosts
# in the slurm_controller group, another on every other cluster host.
#
# The rule set matches the one in the "Fix sudo permissions for slurmuser
# Slurm operations" playbook, which manages the same two files; keeping the
# two in step means rerunning this play does not revert that fix.
# Both /bin and /usr/bin paths are listed, and the `status` / `journalctl -u`
# rules also have a trailing-wildcard form, because sudoers matches a command
# that is listed with arguments only when the arguments match.
#
# NOTE(review): sbatch/srun/salloc under sudo as root can presumably launch
# arbitrary commands as root - confirm this is intended.
- name: Configure sudo permissions for slurmuser
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    slurm_operator_user: slurmuser

  tasks:
    - name: Configure sudoers for slurmuser on Slurm controller
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-controller
        owner: root
        group: root
        mode: "0440"
        content: |
          # Managed by Ansible
          # Operator access for Slurm controller node.

          Cmnd_Alias SLURM_SYSTEMCTL_CONTROLLER = \
              /bin/systemctl status slurmctld, \
              /bin/systemctl status slurmctld *, \
              /bin/systemctl restart slurmctld, \
              /bin/systemctl reload slurmctld, \
              /bin/systemctl start slurmctld, \
              /bin/systemctl stop slurmctld, \
              /bin/systemctl status slurmd, \
              /bin/systemctl status slurmd *, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl start slurmd, \
              /bin/systemctl stop slurmd, \
              /usr/bin/systemctl status slurmctld, \
              /usr/bin/systemctl status slurmctld *, \
              /usr/bin/systemctl restart slurmctld, \
              /usr/bin/systemctl reload slurmctld, \
              /usr/bin/systemctl start slurmctld, \
              /usr/bin/systemctl stop slurmctld, \
              /usr/bin/systemctl status slurmd, \
              /usr/bin/systemctl status slurmd *, \
              /usr/bin/systemctl restart slurmd, \
              /usr/bin/systemctl reload slurmd, \
              /usr/bin/systemctl start slurmd, \
              /usr/bin/systemctl stop slurmd

          Cmnd_Alias SLURM_JOURNAL_CONTROLLER = \
              /bin/journalctl -u slurmctld, \
              /bin/journalctl -u slurmctld *, \
              /bin/journalctl -u slurmd, \
              /bin/journalctl -u slurmd *, \
              /usr/bin/journalctl -u slurmctld, \
              /usr/bin/journalctl -u slurmctld *, \
              /usr/bin/journalctl -u slurmd, \
              /usr/bin/journalctl -u slurmd *

          Cmnd_Alias SLURM_COMMANDS_CONTROLLER = \
              /usr/bin/scontrol, /usr/bin/scontrol *, \
              /usr/bin/sinfo, /usr/bin/sinfo *, \
              /usr/bin/squeue, /usr/bin/squeue *, \
              /usr/bin/scancel, /usr/bin/scancel *, \
              /usr/bin/sacct, /usr/bin/sacct *, \
              /usr/bin/sacctmgr, /usr/bin/sacctmgr *, \
              /usr/bin/sbatch, /usr/bin/sbatch *, \
              /usr/bin/srun, /usr/bin/srun *, \
              /usr/bin/salloc, /usr/bin/salloc *

          {{ slurm_operator_user }} ALL=(root) NOPASSWD: SLURM_SYSTEMCTL_CONTROLLER, SLURM_JOURNAL_CONTROLLER, SLURM_COMMANDS_CONTROLLER
        validate: "visudo -cf %s"
      when: inventory_hostname in groups['slurm_controller']

    - name: Configure sudoers for slurmuser on Slurm compute and GPU nodes
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-compute
        owner: root
        group: root
        mode: "0440"
        content: |
          # Managed by Ansible
          # Operator access for Slurm worker/GPU nodes.

          Cmnd_Alias SLURM_SYSTEMCTL_COMPUTE = \
              /bin/systemctl status slurmd, \
              /bin/systemctl status slurmd *, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl start slurmd, \
              /bin/systemctl stop slurmd, \
              /usr/bin/systemctl status slurmd, \
              /usr/bin/systemctl status slurmd *, \
              /usr/bin/systemctl restart slurmd, \
              /usr/bin/systemctl reload slurmd, \
              /usr/bin/systemctl start slurmd, \
              /usr/bin/systemctl stop slurmd

          Cmnd_Alias SLURM_JOURNAL_COMPUTE = \
              /bin/journalctl -u slurmd, \
              /bin/journalctl -u slurmd *, \
              /usr/bin/journalctl -u slurmd, \
              /usr/bin/journalctl -u slurmd *

          Cmnd_Alias SLURM_COMMANDS_COMPUTE = \
              /usr/bin/scontrol, /usr/bin/scontrol *, \
              /usr/bin/sinfo, /usr/bin/sinfo *, \
              /usr/bin/squeue, /usr/bin/squeue *, \
              /usr/bin/scancel, /usr/bin/scancel *, \
              /usr/bin/sacct, /usr/bin/sacct *, \
              /usr/bin/sbatch, /usr/bin/sbatch *, \
              /usr/bin/srun, /usr/bin/srun *, \
              /usr/bin/salloc, /usr/bin/salloc *

          {{ slurm_operator_user }} ALL=(root) NOPASSWD: SLURM_SYSTEMCTL_COMPUTE, SLURM_JOURNAL_COMPUTE, SLURM_COMMANDS_COMPUTE
        validate: "visudo -cf %s"
      when: inventory_hostname not in groups['slurm_controller']
|
||||
|
||||
|
||||
# Smoke test for the operator account: runs sinfo as slurmuser on each host,
# then has slurmuser open a non-interactive SSH session to every host in the
# group (its own included) and prints what came back.
- name: Validate slurmuser SSH mesh and Slurm access
  hosts: slurm_cluster
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    # Read-only check; the task fails when sinfo exits non-zero.
    - name: Test local Slurm commands as slurmuser
      ansible.builtin.command:
        cmd: "sudo -iu {{ slurm_operator_user }} sinfo"
      register: sinfo_test
      failed_when: sinfo_test.rc != 0
      changed_when: false

    - name: Show sinfo result
      ansible.builtin.debug:
        var: sinfo_test.stdout_lines

    # BatchMode=yes makes ssh fail instead of prompting, and `set -e` stops
    # the script at the first host that cannot be reached.
    - name: Test SSH from each node to every other node as slurmuser
      become_user: "{{ slurm_operator_user }}"
      ansible.builtin.shell:
        cmd: |
          set -e
          {% for host in groups['slurm_cluster'] %}
          ssh -o BatchMode=yes -o ConnectTimeout=5 {{ host }} 'hostname'
          {% endfor %}
        executable: /bin/bash
      register: ssh_mesh_test
      changed_when: false

    - name: Show SSH mesh test result
      ansible.builtin.debug:
        var: ssh_mesh_test.stdout_lines
|
||||
@@ -0,0 +1,112 @@
|
||||
---
|
||||
# Rewrites the slurmuser sudoers drop-ins: one variant on hosts in the
# slurm_controller group, another on every other cluster host.
# Both /bin and /usr/bin paths are listed, and the `status` / `journalctl -u`
# rules also have a trailing-wildcard form, because sudoers matches a command
# that is listed with arguments only when the arguments match.
#
# The Cmnd_Alias names are unique per file. `visudo -cf %s` validates each
# file on its own, so a name shared by both drop-ins would not be caught here.
# NOTE(review): if both files ever exist on one host (e.g. after a role
# change), a shared alias is presumably reported by sudo as a redefinition -
# confirm against the sudo version in use.
#
# NOTE(review): sbatch/srun/salloc under sudo as root can presumably launch
# arbitrary commands as root - confirm this is intended.
- name: Fix sudo permissions for slurmuser Slurm operations
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    slurm_operator_user: slurmuser

  tasks:
    - name: Configure sudoers for slurmuser on controller
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-controller
        owner: root
        group: root
        mode: "0440"
        content: |
          # Managed by Ansible

          Cmnd_Alias SLURM_SYSTEMCTL_CONTROLLER = \
              /bin/systemctl status slurmctld, \
              /bin/systemctl status slurmctld *, \
              /bin/systemctl restart slurmctld, \
              /bin/systemctl reload slurmctld, \
              /bin/systemctl start slurmctld, \
              /bin/systemctl stop slurmctld, \
              /bin/systemctl status slurmd, \
              /bin/systemctl status slurmd *, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl start slurmd, \
              /bin/systemctl stop slurmd, \
              /usr/bin/systemctl status slurmctld, \
              /usr/bin/systemctl status slurmctld *, \
              /usr/bin/systemctl restart slurmctld, \
              /usr/bin/systemctl reload slurmctld, \
              /usr/bin/systemctl start slurmctld, \
              /usr/bin/systemctl stop slurmctld, \
              /usr/bin/systemctl status slurmd, \
              /usr/bin/systemctl status slurmd *, \
              /usr/bin/systemctl restart slurmd, \
              /usr/bin/systemctl reload slurmd, \
              /usr/bin/systemctl start slurmd, \
              /usr/bin/systemctl stop slurmd

          Cmnd_Alias SLURM_JOURNAL_CONTROLLER = \
              /bin/journalctl -u slurmctld, \
              /bin/journalctl -u slurmctld *, \
              /bin/journalctl -u slurmd, \
              /bin/journalctl -u slurmd *, \
              /usr/bin/journalctl -u slurmctld, \
              /usr/bin/journalctl -u slurmctld *, \
              /usr/bin/journalctl -u slurmd, \
              /usr/bin/journalctl -u slurmd *

          Cmnd_Alias SLURM_COMMANDS_CONTROLLER = \
              /usr/bin/scontrol, /usr/bin/scontrol *, \
              /usr/bin/sinfo, /usr/bin/sinfo *, \
              /usr/bin/squeue, /usr/bin/squeue *, \
              /usr/bin/scancel, /usr/bin/scancel *, \
              /usr/bin/sacct, /usr/bin/sacct *, \
              /usr/bin/sacctmgr, /usr/bin/sacctmgr *, \
              /usr/bin/sbatch, /usr/bin/sbatch *, \
              /usr/bin/srun, /usr/bin/srun *, \
              /usr/bin/salloc, /usr/bin/salloc *

          {{ slurm_operator_user }} ALL=(root) NOPASSWD: SLURM_SYSTEMCTL_CONTROLLER, SLURM_JOURNAL_CONTROLLER, SLURM_COMMANDS_CONTROLLER
        validate: "visudo -cf %s"
      when: inventory_hostname in groups['slurm_controller']

    - name: Configure sudoers for slurmuser on compute and GPU nodes
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-compute
        owner: root
        group: root
        mode: "0440"
        content: |
          # Managed by Ansible

          Cmnd_Alias SLURM_SYSTEMCTL_COMPUTE = \
              /bin/systemctl status slurmd, \
              /bin/systemctl status slurmd *, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl start slurmd, \
              /bin/systemctl stop slurmd, \
              /usr/bin/systemctl status slurmd, \
              /usr/bin/systemctl status slurmd *, \
              /usr/bin/systemctl restart slurmd, \
              /usr/bin/systemctl reload slurmd, \
              /usr/bin/systemctl start slurmd, \
              /usr/bin/systemctl stop slurmd

          Cmnd_Alias SLURM_JOURNAL_COMPUTE = \
              /bin/journalctl -u slurmd, \
              /bin/journalctl -u slurmd *, \
              /usr/bin/journalctl -u slurmd, \
              /usr/bin/journalctl -u slurmd *

          Cmnd_Alias SLURM_COMMANDS_COMPUTE = \
              /usr/bin/scontrol, /usr/bin/scontrol *, \
              /usr/bin/sinfo, /usr/bin/sinfo *, \
              /usr/bin/squeue, /usr/bin/squeue *, \
              /usr/bin/scancel, /usr/bin/scancel *, \
              /usr/bin/sacct, /usr/bin/sacct *, \
              /usr/bin/sbatch, /usr/bin/sbatch *, \
              /usr/bin/srun, /usr/bin/srun *, \
              /usr/bin/salloc, /usr/bin/salloc *

          {{ slurm_operator_user }} ALL=(root) NOPASSWD: SLURM_SYSTEMCTL_COMPUTE, SLURM_JOURNAL_COMPUTE, SLURM_COMMANDS_COMPUTE
        validate: "visudo -cf %s"
      when: inventory_hostname not in groups['slurm_controller']
|
||||
Reference in New Issue
Block a user