Linux并行集群的搭建


软件版本:PBS
torque-3.0.6.tar.gz
maui-3.3.1.tar.gz
openmpi-1.8.1.tar.bz2
并行软件:
apoa1.tar.gz
NAMD_2.9_Linux-x86_64-multicore.tar.gz
 
一:环境配置
1.修改hosts文件,添加内容如下
192.168.78.11  admin
192.168.78.12  node1
192.168.78.13  node2


2.设置无密码访问
ssh-keygen一直按enter键即可,进入.ssh目录生成认证密码,并设置权限
[root@admin ~]# cd .ssh/
[root@admin .ssh]# ls
id_rsa id_rsa.pub
[root@admin .ssh]# cp id_rsa.pub authorized_keys
[root@admin .ssh]# chmod 600 authorized_keys
[root@admin .ssh]# ll
total 12
-rw------- 1 root root  394 Aug 23 03:52 authorized_keys
-rw------- 1 root root 1675 Aug 23 03:50 id_rsa
-rw-r--r-- 1 root root  394 Aug 23 03:50 id_rsa.pub


3.然后复制.ssh目录到所有计算节点
 [root@admin~]# for i in 1 2 ; do scp -r /root/.ssh node$i:/root/ ; done
第一次要输入两台计算节点的root密码,以后都是无密码访问了


4.复制hosts文件到所有计算节点
[root@admin ~]#for i in 1 2 ; do scp /etc/hosts node$i:/etc/ ; done


5.配置nfs服务
把管理节点上的/export作为共享目录
[root@admin~]#mkdir -p /export/{apps,home,scripts,source}                  //其中apps为软件共享目录,home为共享家目录
[root@admin ~]#cat /etc/exports
/export  192.168.78.0/255.255.255.0(rw,sync)


6.启动nfs服务并检查启动是否成功
[root@admin ~]# chkconfig portmap on ; /etc/init.d/portmap start
Starting portmap:                                        [ OK  ]
[root@admin ~]# chkconfig nfs on ; /etc/init.d/nfs start
[root@admin ~]# showmount -e localhost
Export list for localhost:
/export 192.168.78.0/255.255.255.0
[root@admin ~]#


7.配置autofs
[root@admin ~]#cat /etc/auto.master
/home   /etc/auto.home   --timeout=1200
/share  /etc/auto.share  --timeout=1200
[root@admin ~]#cat /etc/auto.share
*                                            admin:/export/&
[root@admin ~]#cat /etc/auto.home
*              -nfsvers=3            admin:/export/home/&
[root@admin ~]#


8.启动autofs服务
[root@admin~]#chkconfig autofs on ; /etc/init.d/autofs start


9.复制auto.master auto.share auto.home到所有计算节点
[root@admin ~]#for i in 1 2; do scp /etc/auto.master node$i:/etc/; done
[root@admin ~]#for i in 1 2; do scp /etc/auto.share node$i:/etc/; done
[root@admin ~]#for i in 1 2; do scp /etc/auto.home node$i:/etc/; done


10.启动autofs服务
[root@admin ~]#for i in 1 2; do ssh node$i /etc/init.d/autofs start; done
[root@admin ~]#for i in 1 2; do ssh node$i chkconfig autofs on; done


12.启动NIS服务
[root@admin ~]# /etc/init.d/ypserv start ; chkconfig ypserv on
Starting YP server services:                             [ OK  ]
[root@admin ~]# /etc/init.d/yppasswdd start ; chkconfig yppasswdd on
Starting YP passwd service:                              [ OK  ]
[root@admin ~]#


13.修改/etc/default/useradd文件
把HOME=/home更改为HOME=/export/home


14.在/etc/skel目录下创建.ssh目录并在.ssh目录下建立一个名为config的文件,设置如下
[root@admin~]#mkdir /etc/skel/.ssh
[root@admin~]#touch /etc/skel/.ssh/config
[root@admin ~]#cat /etc/skel/.ssh/config
StrictHostKeyChecking    no
UserKnownHostsFile        /dev/null
[root@admin~]#chmod 600 /etc/skel/.ssh/config


15.创建用于同步用户的命令
◆在/usr/local/sbin目录下创建了一个名为sync_users的脚本,内容如下:
#!/bin/bash
# sync_users: for every account whose home directory is under /export,
# set its home to /home/<user>, then run "ypinit -m" to rebuild the NIS
# master maps. Intended to be run as root on the admin node after new
# users have been created.

YPINIT=/usr/lib64/yp/ypinit

# Print the login name of every account whose home directory (field 6 of
# the passwd file) starts with /export/. Matching on the home field only
# avoids picking up accounts that merely contain "export" in another field.
# Arguments: $1 - passwd-format file to scan (default: /etc/passwd)
# Outputs:   one login name per line on stdout
export_users() {
  awk -F: '$6 ~ /^\/export\// { print $1 }' "${1:-/etc/passwd}"
}

# The loop variable is "account", not USER: USER is the shell's own
# login-name variable and should not be overwritten.
while IFS= read -r account; do
  # Report a failure but keep going, so one bad account does not block
  # the remaining ones.
  usermod -d "/home/${account}" "${account}" \
    || printf 'sync_users: usermod failed for %s\n' "${account}" >&2
done < <(export_users)

# Rebuild the NIS maps. As the last command, its exit status becomes the
# exit status of the script.
"$YPINIT" -m
◆赋予可执行权限
chmod 755 /usr/local/sbin/sync_users
◆以后执行sync_users命令就可以同步新创建的用户


注:以后每添加一个新用户,都需要执行sync_users命令


2.添加环境变量,在/share/scripts目录下建立了一个Path.sh,以后也方便计算节点添加环境变量
[root@admin scripts]# pwd
/share/scripts
[root@admin scripts]# cat Path.sh
#!/bin/bash
# Path.sh: put the shared OpenMPI install on PATH and LD_LIBRARY_PATH by
# appending two export lines to /etc/bashrc. If /etc/bashrc already
# mentions "openmpi", grep prints the matching lines and nothing is added.
bashrc=/etc/bashrc
if ! grep openmpi "$bashrc"; then
  # \$ keeps the variables literal in the file; they expand when it is sourced.
  cat >>"$bashrc" <<EOF
export PATH=/share/apps/openmpi/bin:\$PATH
export LD_LIBRARY_PATH=/share/apps/openmpi/lib:\$LD_LIBRARY_PATH
EOF
fi
[root@admin scripts]#
[root@admin scripts]# sh Path.sh
[root@admin scripts]# source /etc/bashrc


3.测试openmpi是否安装成功
[root@admin scripts]# which mpirun
/share/apps/openmpi/bin/mpirun
[root@admin scripts]# which mpiexec
/share/apps/openmpi/bin/mpiexec


4.安装torque
[root@admin parallel]# tar xzvf torque-3.0.6.tar.gz -C /share/source/
[root@admin parallel]# cd /share/source/torque-3.0.6/
[root@admin torque-3.0.6]# ./configure  --enable-syslog --enable-nvidia-gpus --enable-cpuset --disable-gui --with-rcp=scp --with-sendmail
[root@admin torque-3.0.6]# make
[root@admin torque-3.0.6]# make install
[root@admin torque-3.0.6]# pwd
/share/source/torque-3.0.6
[root@admin torque-3.0.6]# cat install.sh
cd /share/source/torque-3.0.6
make install
[root@admin torque-3.0.6]#


5.初始化torque创建默认队列
[root@admin torque-3.0.6]# ./torque.setup root
initializing TORQUE (admin: root@admin)
PBS_Server admin: Create mode and server database exists,
do you wish to continue y/(n)?y
root    26351    1  0 06:44 ?        00:00:00 pbs_server -t create
Max open servers: 10239
Max open servers: 10239
[root@admin torque-3.0.6]#


6.查看创建的默认队列batch
[root@admin torque-3.0.6]# qmgr -c "p s"
#
# Create queues and set their attributes.
#
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.nodes = 1
set queue batch resources_default.walltime= 01:00:00
set queue batch enabled = True
set queue batch started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = admin
set server admins= root@admin
set server operators = root@admin
set server default_queue = batch
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server mom_job_sync = True
set server keep_completed = 300
[root@admin torque-3.0.6]#


7.更改队列batch部分属性,以满足实际需求
[root@admin torque-3.0.6]# qmgr -c "s q batch resources_default.walltime=24:00:00"
[root@admin torque-3.0.6]# qmgr -c "s s query_other_jobs=true"


8.建立mom配置文件,用于复制到所有计算节点
[root@admin mom_priv]# pwd
/var/spool/torque/mom_priv
[root@admin mom_priv]# cat config
$pbsserver    admin
$logevent      225


9.创建节点信息文件
[root@admin server_priv]# pwd
/var/spool/torque/server_priv
[root@admin server_priv]# cat nodes
node1
node2
[root@admin server_priv]#


10.查看目前节点信息均为down状态
[root@admin server_priv]# pbsnodes -a
node1
    state = down
    np = 1
    ntype = cluster
    mom_service_port = 15002
    mom_admin_port = 15003
    gpus = 0
 
node2
    state = down
    np = 1
    ntype = cluster
    mom_service_port = 15002
    mom_admin_port = 15003
    gpus = 0
[root@admin server_priv]#


11.复制pbs_server启动脚本,并设置开机自动启动
[root@admin torque-3.0.6]# pwd
/share/source/torque-3.0.6
[root@admin torque-3.0.6]# cp contrib/init.d/pbs_server /etc/init.d/
[root@admin torque-3.0.6]# chmod 755 /etc/init.d/pbs_server
[root@admin torque-3.0.6]# chkconfig pbs_server on


12.复制pbs_mom脚本,方便复制到计算节点
[root@admin torque-3.0.6]# cp contrib/init.d/pbs_mom /etc/init.d/
 
13.安装maui
[root@admin parallel]# tar xzvf maui-3.3.1.tar.gz -C /usr/local/src/
[root@admin ~]# cd /usr/local/src/maui-3.3.1/
[root@admin maui-3.3.1]# ./configure --prefix=/usr/local/maui --with-pbs=/usr/local
[root@admin maui-3.3.1]# make
[root@admin maui-3.3.1]# make install


14.复制maui启动脚本,设置正确路径,并设置为开机启动
[root@admin maui-3.3.1]# cp etc/maui.d /etc/init.d/mauid
[root@admin maui-3.3.1]# vi /etc/init.d/mauid
更改MAUI_PREFIX=/opt/maui为MAUI_PREFIX=/usr/local/maui
[root@admin maui-3.3.1]# chmod 755 /etc/init.d/mauid
[root@admin maui-3.3.1]# chkconfig mauid on


15.启动maui调度服务
[root@admin maui-3.3.1]# /etc/init.d/mauid start
Starting MAUI Scheduler:                                 [ OK  ]
[root@admin maui-3.3.1]#


16.添加maui命令环境变量
[root@admin maui-3.3.1]# vi /etc/bashrc
export PATH=/share/apps/openmpi/bin:/usr/local/maui/bin:$PATH
[root@admin maui-3.3.1]# source /etc/bashrc


17.安装并行软件到共享目录
[root@admin namd]# tar xzvf NAMD_2.9_Linux-x86_64-multicore.tar.gz -C /share/apps/
[root@admin namd]# tar xzvf apoa1.tar.gz -C /share/apps/
[root@admin apps]# pwd
/share/apps
[root@admin apps]# mv NAMD_2.9_Linux-x86_64-multicore/ namd


18.添加namd命令环境变量,同时也添加到Path.sh方便计算节点添加环境变量
[root@admin maui-3.3.1]# vi /etc/bashrc
export PATH=/share/apps/openmpi/bin:/usr/local/maui/bin:/share/apps/namd:$PATH
[root@admin maui-3.3.1]# source /etc/bashrc
[root@admin scripts]# which namd2
/share/apps/namd/namd2
[root@admin scripts]# cat Path.sh
#!/bin/bash
# Path.sh: add the shared OpenMPI and NAMD locations to /etc/bashrc.
# Meant to be run as root on each node (e.g. via ssh from the admin node).
# Safe to re-run: a line is appended only if that exact line is missing.

BASHRC=/etc/bashrc

# Append a line to a file unless the file already contains exactly that line.
# Arguments: $1 - line to add
#            $2 - target file (default: $BASHRC)
append_once() {
  local line=$1 file=${2:-$BASHRC}
  # -x whole-line match, -F literal text (the line contains "$"), -q no output.
  grep -qxF -- "$line" "$file" || printf '%s\n' "$line" >>"$file"
}

# Single quotes keep $PATH / $LD_LIBRARY_PATH literal, so they are expanded
# when /etc/bashrc is sourced rather than when this script runs.
append_once 'export PATH=/share/apps/openmpi/bin:/share/apps/namd:$PATH'
append_once 'export LD_LIBRARY_PATH=/share/apps/openmpi/lib:$LD_LIBRARY_PATH'
[root@admin scripts]#
至此管理端配置完成
 
三:计算节点配置torque
1.计算节点安装torque
[root@admin ~]#for i in 1 2; do ssh node$i sh /share/source/torque-3.0.6/install.sh; done


2.复制mom配置文件到计算节点

[root@admin ~]# for i in 1 2; do scp /var/spool/torque/mom_priv/config node$i:/var/spool/torque/mom_priv/; done


3.复制mom启动脚本到计算节点,启动pbs_mom服务,并设置开机启动
[root@admin ~]#for i in 1 2; do scp /etc/init.d/pbs_mom node$i:/etc/init.d/; done
[root@admin ~]#for i in 1 2; do ssh node$i /etc/init.d/pbs_mom start; done
Starting TORQUE Mom: [  OK  ]
Starting TORQUE Mom: [  OK  ]
[root@admin ~]#for i in 1 2; do ssh node$i chkconfig pbs_mom on; done


4.设置环境变量
[root@admin ~]#for i in 1 2; do ssh node$i sh /share/scripts/Path.sh; done


5.测试环境变量设置是否正确
[root@admin ~]#for i in 1 2; do ssh node$i which mpirun; done
/share/apps/openmpi/bin/mpirun
/share/apps/openmpi/bin/mpirun
[root@admin ~]#for i in 1 2; do ssh node$i which namd2; done
/share/apps/namd/namd2
/share/apps/namd/namd2
[root@admin ~]#

猜你喜欢

转载自www.linuxidc.com/Linux/2016-06/132375.htm