---
# Play: provision the operator account on every node of `slurm_cluster`,
# generate one ed25519 key pair per node, and publish each node's public key as
# the host fact `slurmuser_pubkey` so that later plays in the same run can read
# it through `hostvars`.
#
# Requires the `community.crypto` collection (for `openssh_keypair`).
- name: Create slurmuser and generate SSH keys on every Slurm node
  hosts: slurm_cluster
  become: true
  gather_facts: true

  vars:
    slurm_operator_user: slurmuser
    slurm_operator_shell: /bin/bash
    # Single definition of the key directory used by the tasks below.
    slurm_operator_ssh_dir: "/home/{{ slurm_operator_user }}/.ssh"

  tasks:
    - name: Ensure useful packages are installed
      ansible.builtin.apt:
        name:
          - sudo
          - openssh-client
          - openssh-server
          - acl
        state: present
        update_cache: true
        # Skip the index refresh when the cache is younger than one hour so
        # repeated runs do not hit the mirrors every time.
        cache_valid_time: 3600

    - name: Ensure slurmuser exists
      ansible.builtin.user:
        name: "{{ slurm_operator_user }}"
        shell: "{{ slurm_operator_shell }}"
        create_home: true
        state: present

    - name: Ensure .ssh directory exists for slurmuser
      ansible.builtin.file:
        path: "{{ slurm_operator_ssh_dir }}"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    # `openssh_keypair` is shipped in the community.crypto collection; there is
    # no module of that name under ansible.builtin.
    - name: Generate SSH key for slurmuser if missing
      community.crypto.openssh_keypair:
        path: "{{ slurm_operator_ssh_dir }}/id_ed25519"
        type: ed25519
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"
        comment: "{{ slurm_operator_user }}@{{ inventory_hostname }}"
        # Never regenerate an existing key pair.
        force: false

    # slurp returns the file content base64-encoded in `.content`.
    - name: Read public key from each node
      ansible.builtin.slurp:
        src: "{{ slurm_operator_ssh_dir }}/id_ed25519.pub"
      register: slurmuser_pubkey_raw

    - name: Store decoded public key as host fact
      ansible.builtin.set_fact:
        slurmuser_pubkey: "{{ slurmuser_pubkey_raw.content | b64decode | trim }}"
# Play: build the SSH trust mesh. Every node authorizes the public key of every
# cluster node (read from the `slurmuser_pubkey` host fact, which must have been
# set earlier in the same run) and records the cluster's SSH host keys in the
# operator's known_hosts file.
#
# Requires the `ansible.posix` collection (for `authorized_key`).
- name: Exchange slurmuser SSH keys across all Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    slurm_operator_user: slurmuser
    # Single definition of the key directory used by the tasks below.
    slurm_operator_ssh_dir: "/home/{{ slurm_operator_user }}/.ssh"

  tasks:
    # `authorized_key` lives in ansible.posix, not ansible.builtin.
    - name: Install all slurmuser public keys into authorized_keys on every node
      ansible.posix.authorized_key:
        user: "{{ slurm_operator_user }}"
        key: "{{ hostvars[item].slurmuser_pubkey }}"
        state: present
        manage_dir: true
      loop: "{{ groups['slurm_cluster'] }}"
      # Hosts that did not publish a key in this run (e.g. excluded by --limit
      # or failed earlier) are skipped instead of aborting on an undefined fact.
      when: hostvars[item].slurmuser_pubkey is defined

    # Scans each node by inventory name and by `ansible_host` (falling back to
    # the inventory name when `ansible_host` is not set), merges the results
    # into known_hosts and de-duplicates them.
    #
    # Entries are written unhashed: hashed entries (`ssh-keyscan -H`) carry a
    # fresh random salt on every scan, so `sort -u` can never collapse them and
    # the file would grow on each run.
    #
    # Scan failures stay best-effort (`|| true`); the task reports "changed"
    # only when the file content actually differs afterwards.
    - name: Build SSH known_hosts entries for all cluster nodes
      ansible.builtin.shell: |
        set -e
        known_hosts={{ (slurm_operator_ssh_dir ~ '/known_hosts') | quote }}
        mkdir -p {{ slurm_operator_ssh_dir | quote }}
        touch "$known_hosts"
        before="$(cksum < "$known_hosts")"

        {% for host in groups['slurm_cluster'] %}
        ssh-keyscan {{ host | quote }} {{ hostvars[host].ansible_host | default(host) | quote }} 2>/dev/null >> "$known_hosts" || true
        {% endfor %}

        sort -u "$known_hosts" -o "$known_hosts"
        chown {{ slurm_operator_user }}:{{ slurm_operator_user }} "$known_hosts"
        chmod 0644 "$known_hosts"

        if [ "$before" != "$(cksum < "$known_hosts")" ]; then
          echo KNOWN_HOSTS_CHANGED
        fi
      args:
        executable: /bin/bash
      register: known_hosts_build
      changed_when: "'KNOWN_HOSTS_CHANGED' in known_hosts_build.stdout"

    # The shell task above runs as root and may have created the directory, so
    # ownership and mode are re-asserted here.
    - name: Ensure SSH permissions are correct
      ansible.builtin.file:
        path: "{{ slurm_operator_ssh_dir }}"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    - name: Ensure private key permissions are correct
      ansible.builtin.file:
        path: "{{ slurm_operator_ssh_dir }}/id_ed25519"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"

    - name: Ensure public key permissions are correct
      ansible.builtin.file:
        path: "{{ slurm_operator_ssh_dir }}/id_ed25519.pub"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0644"
# Play: install a sudoers drop-in for the operator account. Hosts that are
# members of the `slurm_controller` group get the controller rule set
# (slurmctld + slurmd + Slurm CLI); every other host in `slurm_cluster` gets the
# compute rule set (slurmd + Slurm CLI). Each file is checked with
# `visudo -cf` before it is put in place.
#
# NOTE(review): all rules are NOPASSWD and run as root. srun/sbatch/salloc
# launch user-supplied commands, and journalctl / `systemctl status` may spawn
# a pager - confirm that this breadth of root access is intended.
- name: Configure sudo permissions for slurmuser
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    slurm_operator_user: slurmuser

  tasks:
    - name: Configure sudoers for slurmuser on Slurm controller
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-controller
        owner: root
        group: root
        mode: "0440"
        content: |
          # Managed by Ansible
          # Operator access for Slurm controller node.
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
            /bin/systemctl status slurmctld, \
            /bin/systemctl restart slurmctld, \
            /bin/systemctl reload slurmctld, \
            /bin/systemctl stop slurmctld, \
            /bin/systemctl start slurmctld, \
            /bin/systemctl status slurmd, \
            /bin/systemctl restart slurmd, \
            /bin/systemctl reload slurmd, \
            /bin/systemctl stop slurmd, \
            /bin/systemctl start slurmd, \
            /bin/journalctl -u slurmctld, \
            /bin/journalctl -u slurmd, \
            /usr/bin/scontrol, \
            /usr/bin/sinfo, \
            /usr/bin/squeue, \
            /usr/bin/scancel, \
            /usr/bin/sacct, \
            /usr/bin/sacctmgr, \
            /usr/bin/sbatch, \
            /usr/bin/srun, \
            /usr/bin/salloc
        validate: "visudo -cf %s"
      # `group_names` lists the groups of the current host; unlike indexing
      # `groups['slurm_controller']` it does not fail when the inventory
      # defines no such group.
      when: "'slurm_controller' in group_names"

    - name: Configure sudoers for slurmuser on Slurm compute and GPU nodes
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-compute
        owner: root
        group: root
        mode: "0440"
        content: |
          # Managed by Ansible
          # Operator access for Slurm worker/GPU nodes.
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
            /bin/systemctl status slurmd, \
            /bin/systemctl restart slurmd, \
            /bin/systemctl reload slurmd, \
            /bin/systemctl stop slurmd, \
            /bin/systemctl start slurmd, \
            /bin/journalctl -u slurmd, \
            /usr/bin/scontrol, \
            /usr/bin/sinfo, \
            /usr/bin/squeue, \
            /usr/bin/scancel, \
            /usr/bin/sacct, \
            /usr/bin/sbatch, \
            /usr/bin/srun, \
            /usr/bin/salloc
        validate: "visudo -cf %s"
      when: "'slurm_controller' not in group_names"
# Play: smoke-test the result on every node of `slurm_cluster`. As the operator
# account, run `sinfo` locally and then open a non-interactive SSH session to
# every cluster node (including the node itself) and print its hostname.
- name: Validate slurmuser SSH mesh and Slurm access
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    slurm_operator_user: slurmuser

  tasks:
    # Runs through Ansible's become mechanism (same as the SSH mesh test below)
    # instead of calling `sudo` inside the command. A non-zero exit code fails
    # the task by default.
    # NOTE(review): unlike `sudo -i`, this does not start a login shell, so
    # `sinfo` must be on the default PATH - confirm for your Slurm install.
    - name: Test local Slurm commands as slurmuser
      ansible.builtin.command:
        cmd: sinfo
      become_user: "{{ slurm_operator_user }}"
      register: sinfo_test
      changed_when: false

    - name: Show sinfo result
      ansible.builtin.debug:
        var: sinfo_test.stdout_lines

    # BatchMode=yes forbids password and host-key prompts, so the test only
    # passes when key-based login and known_hosts are both in place.
    # Every peer is tried even after a failure; each failing peer is named on
    # stderr and the task fails at the end if any connection failed.
    - name: Test SSH from each node to every other node as slurmuser
      ansible.builtin.shell: |
        failed=0
        {% for host in groups['slurm_cluster'] %}
        if ! ssh -o BatchMode=yes -o ConnectTimeout=5 {{ host | quote }} 'hostname'; then
          echo "SSH from {{ inventory_hostname }} to {{ host }} failed" >&2
          failed=1
        fi
        {% endfor %}
        exit "$failed"
      args:
        executable: /bin/bash
      become_user: "{{ slurm_operator_user }}"
      register: ssh_mesh_test
      changed_when: false

    - name: Show SSH mesh test result
      ansible.builtin.debug:
        var: ssh_mesh_test.stdout_lines