Add Slurm AI/HPC cluster platform project

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit cd6830334b
47 changed files with 4727 additions and 0 deletions
@@ -0,0 +1,16 @@
# Managed by Ansible
# Slurm cgroup configuration
# Descriptions below follow the cgroup.conf man page.
# Let Slurm detect which cgroup plugin version to use on this host.
CgroupPlugin=autodetect
# Confine jobs to the CPU cores they were allocated.
ConstrainCores=yes
# Enforce a RAM limit derived from the job's allocated memory.
ConstrainRAMSpace=yes
# Swap usage is not constrained.
ConstrainSwapSpace=no
# Restrict device access to the GRES devices allocated to the job.
ConstrainDevices=yes
# RAM / swap limits expressed as a percentage of the job's allocated memory.
AllowedRAMSpace=100
AllowedSwapSpace=0
# Upper bounds expressed as a percentage of the node's total RAM.
MaxRAMPercent=100
MaxSwapPercent=0
# Lower bound (in MB) applied to the computed memory limits.
MinRAMSpace=30
@@ -0,0 +1,4 @@
# Managed by Ansible
# Slurm generic resource (GRES) configuration
# One line is written per present node that declares a non-empty gres string.
# Optional per-node overrides in slurm_nodes:
#   gres_name - GRES name to declare (default: gpu)
#   gres_file - device file or bracketed range such as /dev/nvidia[0-3]
#               (default: /dev/nvidia0, i.e. a single device)
# Nodes that advertise more than one device in their gres string must set
# gres_file to a matching range, otherwise the counts will disagree.
{% for node in slurm_nodes if node.managed_state | default('present') == 'present' and node.gres | default('') | length > 0 %}
NodeName={{ node.name }} Name={{ node.gres_name | default('gpu') }} File={{ node.gres_file | default('/dev/nvidia0') }}
{% endfor %}
@@ -0,0 +1,67 @@
# Managed by Ansible
# Main Slurm configuration, rendered from Ansible variables.
# Cluster identity, controller host and authentication
ClusterName={{ slurm_cluster_name }}
SlurmctldHost={{ slurm_control_machine }}({{ slurm_control_addr }})
SlurmUser={{ slurm_user }}
AuthType=auth/munge
# State and spool directories
StateSaveLocation=/var/spool/slurmctld
SlurmdSpoolDir=/var/spool/slurmd
# Interconnect, MPI and process tracking plugins
SwitchType=switch/none
MpiDefault={{ slurm_default_mpi_type }}
ProctrackType={{ slurm_proctrack_type }}
ReturnToService={{ slurm_return_to_service }}
# GresTypes is written only when slurm_gres_types is defined and non-empty
{% if slurm_gres_types is defined and slurm_gres_types | length > 0 %}
GresTypes={{ slurm_gres_types }}
{% endif %}
# Daemon PID files and listening ports
SlurmctldPidFile=/run/slurmctld.pid
SlurmdPidFile=/run/slurmd.pid
SlurmctldPort={{ slurmctld_port }}
SlurmdPort={{ slurmd_port }}
# Task launch, resource selection and scheduling
TaskPlugin={{ slurm_task_plugin }}
SelectType={{ slurm_select_type }}
SelectTypeParameters={{ slurm_select_type_parameters }}
SchedulerType=sched/backfill
# Priority / fairshare
# Each value can be overridden by the matching slurm_priority_* variable.
PriorityType={{ slurm_priority_type | default('priority/multifactor') }}
PriorityDecayHalfLife={{ slurm_priority_decay_half_life | default('7-0') }}
PriorityCalcPeriod={{ slurm_priority_calc_period | default(5) }}
PriorityFavorSmall={{ slurm_priority_favor_small | default('NO') }}
PriorityWeightAge={{ slurm_priority_weight_age | default(1000) }}
PriorityWeightFairshare={{ slurm_priority_weight_fairshare | default(10000) }}
PriorityWeightJobSize={{ slurm_priority_weight_job_size | default(1000) }}
PriorityWeightPartition={{ slurm_priority_weight_partition | default(1000) }}
PriorityWeightQOS={{ slurm_priority_weight_qos | default(10000) }}
PriorityMaxAge={{ slurm_priority_max_age | default('1-0') }}
# Timeouts and job termination behaviour
SlurmctldTimeout=120
SlurmdTimeout=300
InactiveLimit=0
KillWait=30
Waittime=0
# Accounting
# Host, port, enforcement and TRES settings are written only for the slurmdbd storage type.
AccountingStorageType={{ slurm_accounting_storage_type }}
{% if slurm_accounting_storage_type == "accounting_storage/slurmdbd" %}
AccountingStorageHost={{ slurm_accounting_storage_host }}
AccountingStoragePort={{ slurm_accounting_storage_port }}
AccountingStorageEnforce={{ slurm_accounting_storage_enforce | default('associations,limits,qos') }}
AccountingStorageTRES={{ slurm_accounting_storage_tres | default('cpu,mem,energy,node,billing,fs/disk,pages,vmem,gres/gpu') }}
{% endif %}
JobAcctGatherType={{ slurm_job_acct_gather_type | default('jobacct_gather/none') }}
JobCompType={{ slurm_job_comp_type }}
# Logging
SlurmctldDebug=info
SlurmdDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdLogFile=/var/log/slurm/slurmd.log
# Node definitions
# Only slurm_nodes entries whose managed_state is 'present' (the default) are written.
# The topology, gres and features fields are optional and omitted when empty.
{% for node in slurm_nodes if node.managed_state | default('present') == 'present' %}
NodeName={{ node.name }} NodeAddr={{ node.addr }} CPUs={{ node.cpus }}{% if node.topology | default('') | length > 0 %} {{ node.topology }}{% endif %} RealMemory={{ node.real_memory }}{% if node.gres | default('') | length > 0 %} Gres={{ node.gres }}{% endif %}{% if node.features | default('') | length > 0 %} Feature={{ node.features }}{% endif %} State=UNKNOWN
{% endfor %}
# Partition definitions
# Every slurm_partitions entry must provide name, nodes, default, max_time and state.
{% for partition in slurm_partitions %}
PartitionName={{ partition.name }} Nodes={{ partition.nodes }} Default={{ partition.default }} MaxTime={{ partition.max_time }} State={{ partition.state }}
{% endfor %}
@@ -0,0 +1,38 @@
# Managed by Ansible
# Slurm database daemon configuration
# Daemon identity, authentication and logging
AuthType=auth/munge
DbdHost={{ slurmdbd_host }}
DbdPort={{ slurmdbd_port }}
SlurmUser={{ slurm_user }}
DebugLevel=info
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/run/slurmdbd.pid
CommitDelay={{ slurmdbd_commit_delay | default(1) }}
# Backing database connection
# NOTE(review): StoragePass is rendered in plain text; confirm the deploying
# task restricts this file's ownership and permissions.
StorageType={{ slurmdbd_storage_type }}
StorageHost={{ slurmdbd_storage_host }}
StoragePort={{ slurmdbd_storage_port }}
StorageLoc={{ slurmdbd_storage_loc }}
StorageUser={{ slurmdbd_storage_user }}
StoragePass={{ slurmdbd_storage_pass }}
# Retention / purge policy
# Each period can be overridden by the matching slurmdbd_purge_* variable.
PurgeEventAfter={{ slurmdbd_purge_event_after | default('12months') }}
PurgeJobAfter={{ slurmdbd_purge_job_after | default('12months') }}
PurgeResvAfter={{ slurmdbd_purge_resv_after | default('12months') }}
PurgeStepAfter={{ slurmdbd_purge_step_after | default('3months') }}
PurgeSuspendAfter={{ slurmdbd_purge_suspend_after | default('3months') }}
PurgeTXNAfter={{ slurmdbd_purge_txn_after | default('12months') }}
PurgeUsageAfter={{ slurmdbd_purge_usage_after | default('24months') }}
# Archiving defaults to 'no' for every record type; override with slurmdbd_archive_* variables.
ArchiveEvents={{ slurmdbd_archive_events | default('no') }}
ArchiveJobs={{ slurmdbd_archive_jobs | default('no') }}
ArchiveSteps={{ slurmdbd_archive_steps | default('no') }}
ArchiveSuspend={{ slurmdbd_archive_suspend | default('no') }}
ArchiveTXN={{ slurmdbd_archive_txn | default('no') }}
ArchiveUsage={{ slurmdbd_archive_usage | default('no') }}