User Tools

Site Tools


cluster:154

This is an old revision of the document!


Table of Contents


Back

OpenHPC

  • install vanilla CentOS 7.2 on master
  • find Install_guide-CentOS7.2-SLURM-1.2.1-x86_64.pdf recipe guide on http://openhpc.community
  • turn selinux off
  • next switch to iptables
[root@ohpc0-test ~]# systemctl stop firewalld                
[root@ohpc0-test ~]# systemctl disable firewalld         
     
[root@ohpc0-test ~]#  yum install iptables-services -y                          
[root@ohpc0-test ~]# systemctl enable iptables
[root@ohpc0-test ~]# systemctl enable ip6tables

[root@ohpc0-test ~]# vi /etc/sysconfig/iptables

# lock up port 22: note "eth1"
-A INPUT -i enp8s0 -p tcp -m state --state NEW -m tcp -s 129.133.0.0/16 --dport 22 -j ACCEPT

# local allow: note "eth0"
-A INPUT -i enp4s0 -d 192.168.0.0/16 -p tcp --dport 0:65535 -j ACCEPT
-A INPUT -i enp4s0 -d 192.168.0.0/16 -p udp --dport 0:65535 -j ACCEPT

[root@ohpc0-test ~]# vi /etc/sysconfig/ip6tables

# comment out port 22

[root@ohpc0-test ~]# systemctl restart iptables
[root@ohpc0-test ~]# systemctl restart ip6tables
[root@ohpc0-test ~]# iptables -L
Chain INPUT (policy ACCEPT)
target     prot opt source               destination
ACCEPT     all  --  anywhere             anywhere             state RELATED,ESTABLISHED
ACCEPT     icmp --  anywhere             anywhere
ACCEPT     all  --  anywhere             anywhere
ACCEPT     tcp  --  129.133.0.0/16       anywhere             state NEW tcp dpt:ssh
ACCEPT     tcp  --  anywhere             192.168.0.0/16       tcp
ACCEPT     udp  --  anywhere             192.168.0.0/16       udp
REJECT     all  --  anywhere             anywhere             reject-with icmp-host-prohibited

Chain FORWARD (policy ACCEPT)
target     prot opt source               destination
REJECT     all  --  anywhere             anywhere             reject-with icmp-host-prohibited

Chain OUTPUT (policy ACCEPT)
target     prot opt source               destination

[root@ohpc0-test ~]# reboot
  • next add OpenHPC component, install the RPM package which also enables repo EPEL
[root@ohpc0-test ~]# yum install http://build.openhpc.community/OpenHPC:/1.2/CentOS_7.2/x86_64/ohpc-release-1.2-1.x86_64.rpm

Installed:
  ohpc-release.x86_64 0:1.2-1
Dependency Installed:
  epel-release.noarch 0:7-9

[root@ohpc0-test ~]# yum repolist
repo id                       repo name
OpenHPC                       OpenHPC-1.2 - Base
OpenHPC-updates               OpenHPC-1.2 - Updates
base/7/x86_64                 CentOS-7 - Base
*epel/x86_64                  Extra Packages for Enterprise Linux 7 - x86_64
extras/7/x86_64               CentOS-7 - Extras
updates/7/x86_64              CentOS-7 - Updates
  • Next provisioning, pull down a suite of packages
 yum -y groupinstall ohpc-base
 yum -y groupinstall ohpc-warewulf
# for openlava if we decide not slurm
 yum install tcl-devel

 systemctl enable ntpd.service
 systemctl start ntpd
 systemctl status ntpd

 yum -y groupinstall ohpc-slurm-server
  • Configure ib0 and IPoIB if needed, consult Infiniband, also consult the PDF file.

Configure

  • Warewulf (do yourself a favor and check changes in file to avoid typos)
 
perl -pi -e "s/device = eth1/device = enp4s0/" /etc/warewulf/provision.conf

perl -pi -e "s/^\s+disable\s+= yes/ disable = no /" /etc/xinetd.d/tftp
   
perl -pi -e "s/cgi-bin>\$/cgi-bin>\n Require all granted/" /etc/httpd/conf.d/warewulf-httpd.conf
perl -pi -e "s/Allow from all/Require all granted/" /etc/httpd/conf.d/warewulf-httpd.conf

  
# -ni not -pi
perl -ni -e "print unless /^\s+Order allow,deny/" /etc/httpd/conf.d/warewulf-httpd.conf

# the recipe does not set a mysql root password but we will
[root@ohpc0-test]# vi /etc/warewulf/database-root.conf 

mysql> set password for 'root'@'localhost' = PASSWORD('some_string');
Query OK, 0 rows affected (0.00 sec)                                

[root@ohpc0-test]# chmod o-r /etc/warewulf/database-root.conf 

# restart/enable services
systemctl restart xinetd
systemctl enable mariadb.service
systemctl restart mariadb
systemctl enable httpd.service
systemctl restart httpd

  • Now let's get ready to provision a node. First we need to build a CHROOT environment.
# defined repo
less /usr/libexec/warewulf/wwmkchroot/centos-7.tmpl

# admin area
ls -R /opt/ohpc/admin/

# use another disk for images
mkdir /data
mkfs.xfs -f /dev/sdb1
mount /data
mkdir -p /data/ohpc/images/centos7.2

# this yields an error as 7.2.1511 is already deprecated
wwmkchroot centos-7 /data/ohpc/images/centos7.2
"This directory (and version of CentOS) is deprecated.  For normal users,
you should use /7/ and not /7.2.1511/ in your path."
# so we follow their advice and edit the repo destination s/7.2.1511/7/
vi /usr/libexec/warewulf/wwmkchroot/centos-7.tmpl

# try again
wwmkchroot centos-7 /data/ohpc/images/centos7.2
# ls /data/ohpc/images/centos7.2/
bin  boot  dev  etc  fastboot  home  lib  lib64  media  mnt  opt  
proc  root  run  sbin  srv  sys  tmp  usr  var
# du -hs /data/ohpc/images/centos7.2/
490M    /data/ohpc/images/centos7.2/
  • Next customize the CHROOT environment
cp -p /etc/resolv.conf /data/ohpc/images/centos7.2/etc/
yum -y --installroot=/data/ohpc/images/centos7.2 groupinstall ohpc-slurm-client
yum -y --installroot=/data/ohpc/images/centos7.2 install kernel
yum -y --installroot=/data/ohpc/images/centos7.2 install ntp
yum -y --installroot=/data/ohpc/images/centos7.2 install lmod-ohpc
# pass on infiniband

# if it does not exist on **master** issue command ''wwinit ssh''
cat ~/.ssh/cluster.pub
cat ~/.ssh/cluster.pub >> /data/ohpc/images/centos7.2/root/.ssh/authorized_keys

echo "192.168.1.249:/home /home nfs nfsvers=3,rsize=1024,wsize=1024,cto 0 0 " \
      >> /data/ohpc/images/centos7.2/etc/fstab
echo "192.168.1.249:/opt/ohpc/pub /opt/ohpc/pub nfs nfsvers=3 0 0 " \
      >> /data/ohpc/images/centos7.2/etc/fstab

chroot /data/ohpc/images/centos7.2 systemctl enable ntpd
echo "server 192.168.1.249" >> /data/ohpc/images/centos7.2/etc/ntp.conf

# finally on **master** issue
perl -pi -e "s/ControlMachine=\S+/ControlMachine=ohpc0-test/" /etc/slurm/slurm.conf
# this turned out to be wrong; first I changed ControlMachine= to the hostname ''ohpc0-slurm'',
# added a line in /etc/hosts pointing it to 192.168.1.249, then I defined in ''/etc/slurm/slurm.conf''

NodeName=ohpc0-slurm NodeAddr=192.168.1.249 CPUs=2 \
        RealMemory=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN
NodeName=n29 NodeAddr=192.168.102.38 CPUs=2 \
        RealMemory=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN
NodeName=n31 NodeAddr=192.168.102.40 CPUs=2 \
        RealMemory=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN
PartitionName=test Nodes=n29,n31 Default=YES MaxTime=INFINITE State=UP





echo "/home *(rw,no_subtree_check,fsid=10,no_root_squash)" >> /etc/exports
echo "/opt/ohpc/pub *(ro,no_subtree_check,fsid=11)" >> /etc/exports
exportfs -ra
systemctl restart nfs
systemctl enable nfs-server

Deploy

  • Next we PXE boot the compute node for imaging (after building the bootstrap image and VNFS)
# Both are loaded in database; back up if production
echo "drivers += updates/kernel" >> /etc/warewulf/bootstrap.conf
wwbootstrap `uname -r`
# Bootstrap image '3.10.0-327.el7.x86_64' is ready
wwvnfs -y --chroot /data/ohpc/images/centos7.2
# VNFS 'centos7.2' has been imported
# Wrote a new configuration file at: /etc/warewulf/vnfs/centos7.2.conf

wwsh -y file import /tmp/network.12501 --name network
wwsh -y file set network --path /etc/sysconfig/network --mode=0644 --uid=0 
wwsh -y node new ohpc0 --ipaddr=192.168.1.248 --hwaddr=00:15:C5:EF:08:5F -D enp4s0

wwsh -y file import /etc/passwd
wwsh -y file import /etc/group
wwsh -y file import /etc/shadow
wwsh -y file import /etc/slurm/slurm.conf
wwsh -y file import /etc/munge/munge.key
wwsh -y provision set ohpc0 --vnfs=centos7.2 --bootstrap=`uname -r` \
     --files=dynamic_hosts,passwd,group,shadow,slurm.conf,munge.key,network

wwsh pxe update
wwsh dhcp update
systemctl restart dhcpd
systemctl restart httpd
  • Note: the next part is optional but I recommend it. Warewulf by default deploys stateless (in memory), but I'd rather deploy stateful, in which the VNFS is written to disk. The advantage is that if the node crashes, it will reboot without the help of the master, and Linux typically survives crashes. Actually I like the golden image even better: customize a node, then create the image. More details at Warewulf Golden Image
yum -y --installroot=/data/ohpc/images/centos7.2 install grub2
wwvnfs -y --chroot /data/ohpc/images/centos7.2

wwsh -y object modify -s bootloader=sda -t node ohpc0
wwsh -y object modify -s diskpartition=sda -t node ohpc0
wwsh -y object modify -s diskformat=sda1,sda2,sda3 -t node ohpc0
wwsh -y object modify -s filesystems="mountpoint=/boot:dev=sda1:type=ext3:size=500,dev=sda2:type=swap:size=32768,mountpoint=/:dev=sda3:type=ext3:size=fill" -t node ohpc0
wwsh -y object modify -s bootlocal=UNDEF -t node ohpc0 

wwsh pxe update
wwsh dhcp update
systemctl restart dhcpd
systemctl restart httpd


[root@ohpc0-test ~]# wwsh -y object print ohpc0 -p :all
#### node ohpc0 ###############################################################
       4: NAME       = ohpc0
       4: BOOTLOADER = sda
       4: BOOTLOCAL = UNDEF  
       4: BOOTSTRAPID = 1
       4: DISKFORMAT = sda1,sda2,sda3
       4: DISKPARTITION = sda
       4: FILEIDS    = 10,3,5,6,7,8,9
       4: FILESYSTEMS = dev=sda2:type=swap:size=32768,
                        mountpoint=/:dev=sda3:type=ext3:size=fill,
                        mountpoint=/boot:dev=sda1:type=ext3:size=500
       4: NETDEVS    = ObjectSet
            NETDEVS.enp4s0.NAME       = enp8s0
            NETDEVS.enp4s0.HWADDR     = 00:15:c5:ef:08:5F
            NETDEVS.enp4s0.IPADDR     = 192.168.1.248
       4: NODENAME   = ohpc0
       4: VNFSID     = 2
       
# Strange that netmask is not listed, but when I recreated the object I used
wwsh -y node new ohpc0 --ipaddr=192.168.1.248 --hwaddr=00:15:c5:ef:0c:bf \
     --netdev=enp4s0 --netmask=255.255.0.0  --network=255.255.0.0

[root@ohpc0-test ~]# wwsh -y bootstrap list
BOOTSTRAP NAME            SIZE (M)
3.10.0-327.el7.x86_64     26.4

[root@ohpc0-test ~]# wwsh -y vnfs list
VNFS NAME            SIZE (M) CHROOT LOCATION
centos7.2            278.2    /data/ohpc/images/centos7.2

Final notes. I now have a 3 node OpenHPC cluster up using CentOS 7.3.1611 … because of the edit mentioned above to the provision template URL, the CHROOT is at the latest version of CentOS. Thus I updated my SMS master too, so that the construct bootstrap=`uname -r` builds an image compatible between SMS and CHROOT.

On towards testing the tools.

Made a little script to recreate nodes, as we'll do this often. I also ran into a weird situation where eth0/eth1 change NIC location during PXE boot, so I work around it.

  • deploy.sh
#!/bin/bash
# deploy.sh - (re)create the Warewulf node objects for the test nodes and
# refresh the PXE/DHCP configuration so they can be reimaged.
#
# Every node name below is registered with its own MAC address; the entries
# with an "e" suffix share the IP address of the un-suffixed entry.
# NOTE(review): presumably each pair is the two NICs of one machine, listed
# twice because eth0/eth1 were seen to swap during PXE boot - confirm.
#
# Operator notes on the boot procedure (kept from the original script):
#   enable both NIC to boot from 501/500
#   provision black on bottom, red on top (handler switches to this)
#   set bootlocal to EXIT reboot, handler exits
#   switch black to top reboot (no media, fails to hdd)
#   insane
#
# Exit status: 1 if wwsh is not installed, otherwise the status of the last
# command run by main. Individual wwsh failures are NOT fatal (no `set -e`),
# matching the original behavior: `wwsh object delete` may fail when the node
# has not been created yet and the run must continue past that.

readonly VNFS_NAME=centos7.2
readonly NET_DEVICE=eth0
readonly NETMASK=255.255.0.0
# NOTE(review): the network value repeats the netmask exactly as in the
# original script; a network address may have been intended - confirm.
readonly NETWORK=255.255.0.0
readonly PROVISION_FILES=dynamic_hosts,passwd,group,shadow,slurm.conf,munge.key,network
readonly DISK=sda
readonly DISK_FORMAT=sda1,sda2,sda3
readonly FILESYSTEMS='mountpoint=/boot:dev=sda1:type=ext3:size=1024,dev=sda2:type=swap:size=6144,mountpoint=/:dev=sda3:type=ext3:size=+'

# One entry per node object: "<name> <ipaddr> <hwaddr>".
readonly -a NODES=(
  'n31  192.168.102.40 1c:c1:de:19:40:6f'
  'n31e 192.168.102.40 1c:c1:de:19:40:6e'
  'n29  192.168.102.38 1c:c1:de:1c:88:c3'
  'n29e 192.168.102.38 1c:c1:de:1c:88:c2'
)

#######################################
# Delete and recreate one Warewulf node object, then configure it for
# provisioning to local disk.
# Globals:   VNFS_NAME, NET_DEVICE, NETMASK, NETWORK, PROVISION_FILES,
#            DISK, DISK_FORMAT, FILESYSTEMS (all read)
# Arguments: $1 - node name
#            $2 - IP address
#            $3 - MAC address
# Returns:   status of the final wwsh call
#######################################
provision_node() {
  local node=$1 ipaddr=$2 hwaddr=$3
  local kernel
  kernel=$(uname -r)

  # Start from a clean slate; a failure here (e.g. node not defined yet) is
  # intentionally ignored.
  wwsh object delete "$node" -y
  wwsh node new "$node" --netdev="$NET_DEVICE" --hwaddr="$hwaddr" \
    --ipaddr="$ipaddr" --netmask="$NETMASK" --network="$NETWORK" -y
  # Bootstrap image is named after the kernel running on this host.
  wwsh -y provision set "$node" --vnfs="$VNFS_NAME" --bootstrap="$kernel" \
    --files="$PROVISION_FILES"
  wwsh object modify -s bootloader="$DISK" "$node" -y
  wwsh object modify -s diskpartition="$DISK" "$node" -y
  wwsh object modify -s diskformat="$DISK_FORMAT" "$node" -y
  wwsh object modify -s filesystems="$FILESYSTEMS" "$node" -y
  wwsh provision set --bootlocal=UNDEF "$node" -y
}

#######################################
# Provision every entry in NODES, push the PXE/DHCP configuration and restart
# the services that serve it, then print the follow-up command per node.
# Globals:   NODES (read)
# Returns:   1 if wwsh is missing, otherwise status of the last command
#######################################
main() {
  if ! command -v wwsh >/dev/null 2>&1; then
    printf 'deploy.sh: wwsh not found in PATH; nothing done\n' >&2
    return 1
  fi

  local entry name ipaddr hwaddr
  for entry in "${NODES[@]}"; do
    read -r name ipaddr hwaddr <<<"$entry"
    provision_node "$name" "$ipaddr" "$hwaddr"
  done

  wwsh pxe update
  wwsh dhcp update
  systemctl restart dhcpd
  systemctl restart httpd

  # Reminder for the manual step after imaging, one line per node object.
  for entry in "${NODES[@]}"; do
    read -r name _ <<<"$entry"
    printf 'after first boot: wwsh provision set --bootlocal=EXIT %s\n' "$name"
  done
}

main "$@"


Back

cluster/154.1490641632.txt.gz · Last modified: 2017/03/27 15:07 by hmeij07