1、準備3臺 機器 並設置 hosts
# Register all three cluster nodes in /etc/hosts (run on every machine).
echo "192.168.108.138 m1" >> /etc/hosts
echo "192.168.108.139 s2" >> /etc/hosts
echo "192.168.108.140 s3" >> /etc/hosts
# NOTE: $hostname is NOT set by the shell — assign this machine's name first,
# e.g. hostname=m1 (use s2 / s3 on the client nodes).
echo "$hostname" > /etc/hostname
hostnamectl set-hostname "$hostname"
2、創建slurm 用戶(id 一定要是 412)
# Create the slurm service account. The UID/GID must be 412 on every node
# so that SLURM/munge identities match across the whole cluster.
export SLURMUSER=412
groupadd -g "$SLURMUSER" slurm
useradd -m -c "SLURM workload manager" -d /var/lib/slurm \
  -u "$SLURMUSER" -g slurm -s /bin/bash slurm
# Verify the account was created with the expected UID/GID.
id slurm
3、 關閉防火牆 SElinux
# Stop the firewall now and keep it off across reboots.
systemctl stop firewalld
systemctl disable firewalld
# Permanently disable SELinux (takes effect after the next reboot).
sed -i 's/^SELINUX=.*/SELINUX=disabled/' /etc/selinux/config
# Switch SELinux to permissive for the current session (no reboot needed).
setenforce 0
4、安裝ohpc 源
yum install http://build.openhpc.community/OpenHPC:/1.3/CentOS_7/x86_64/ohpc-release-1.3-1.el7.x86_64.rpm
5、安裝依賴
yum install openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad -y
6、安裝server 端(m1 機器)
# Server side (m1): install the SLURM server meta-package.
# (The original line fused two commands: "...ohpc-slurm-servervim...".)
yum -y install ohpc-slurm-server
# Edit the cluster configuration with the settings below:
vim /etc/slurm/slurm.conf
ControlMachine=m1
### CPUs = Sockets * CoresPerSocket * ThreadsPerCore
NodeName=m1,s[2-3] CPUs=1 RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 Procs=1 State=IDLE
PartitionName=clients Nodes=s[2-3] Default=YES MaxTime=INFINITE State=UP
7. 安裝 client 端 (s2,s3)
# Client side (s2, s3): install the SLURM client meta-package.
yum -y install ohpc-slurm-client
# Copy the config and the munge key from the server ($m1IP = m1's address).
# The key must be identical on all nodes for munge authentication to work.
# (The original lines garbled/duplicated these two scp commands.)
scp -pr "$m1IP":/etc/slurm/slurm.conf /etc/slurm/
scp -pr "$m1IP":/etc/munge/munge.key /etc/munge/munge.key
8 啓動服務
server 端 (m1)
# Start authentication (munge) before the SLURM controller daemon.
systemctl start munge
systemctl start slurmctld
client 端(s2,s3)
# Start munge first, then the compute-node daemon.
systemctl start munge
systemctl start slurmd
9、 sinfo 查看狀態
# sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
normal* up infinite 2 drain s[2-3]
### drain 狀態修復
scontrol update NodeName=s[2-3] State=RESUME
# sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
normal* up infinite 2 idle s[2-3]
10、運行作業
# srun hostname
s2
# srun -N 2 -l hostname
0: s2
1: s3
11、 其他命令
squeue -a # list jobs in the queue
scancel <job_id> # cancel a job
# scontrol show config
# scontrol show partition
# scontrol show node
# scontrol show jobs
12 、對比 PBS(參考 https://blog.csdn.net/weixin_39497034/article/details/79100799)
Command PBS Pro SLURM
Submit batch job qsub [job script] sbatch [job script]
Request interactive shell qsub -I /bin/bash srun --pty /bin/bash
Delete job qdel [job id] scancel [job id]
Queue status qstat -q sinfo
Job status qstat -f [job id] scontrol show job [job id]
Node status pbsnodes [node name] scontrol show node [node id]