User Tools

Site Tools


cluster:224

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
cluster:224 [2023/10/13 15:52]
hmeij07
cluster:224 [2024/01/12 14:36] (current)
hmeij07
Line 7: Line 7:
  
 <code> <code>
 +
 +# first step
 +yum update -y  # get to the latest
 +reboot
  
 # IP ranges # IP ranges
Line 12: Line 16:
 10.10.102.48 n38-eth1 10.10.102.48 n38-eth1
 10.11.103.48 n38-ib0 10.11.103.48 n38-ib0
 +DEVROUTE=yes # others no
 +GATEWAY=192.168.102.251 # greentail52
  
 cd /etc/sysconfig/network-scripts/ cd /etc/sysconfig/network-scripts/
 vi ifcfg-eth0 # 192.168.102.x vi ifcfg-eth0 # 192.168.102.x
 vi ifcfg-eth1 # 10.10.102.x   # 'uuidgen eth1' to get uuid vi ifcfg-eth1 # 10.10.102.x   # 'uuidgen eth1' to get uuid
 +
 +# or via  rc.local? see n102
 +vi ifcfg-ib0
 +DEVICE=ib0
 +ONBOOT=yes
 +MTU=65520
 +CONNECTED_MODE=yes
 +BOOTPROTO=none
 +IPADDR=10.11.103.48
 +PREFIX=16
 +# check with ibstat
 + Port 1:
 + State: Active
 + Physical state: LinkUp
 +# check with ethtool ib0
 + Speed: 40000Mb/s
  
 # root: sync cottontail's master and known_hosts (tails+stores) # root: sync cottontail's master and known_hosts (tails+stores)
 ssh-keygen -t rsa ssh-keygen -t rsa
-scp 10.10.102.253:/root/.ssh/authorized_keys /root/.ssh/+scp 10.10.102.250:/root/.ssh/authorized_keys /root/.ssh/ #ctt2
 /etc/ssh/sshd_config (PermitRootLogin) /etc/ssh/sshd_config (PermitRootLogin)
 vi /etc/selinux/config # disabled, do not mistype, kernel will not boot! vi /etc/selinux/config # disabled, do not mistype, kernel will not boot!
Line 75: Line 97:
 rpm -qa | egrep  "libibverbs|libibverbs-devel" rpm -qa | egrep  "libibverbs|libibverbs-devel"
 # no # yum groupinstall "Infiniband Support" # ib already working # no # yum groupinstall "Infiniband Support" # ib already working
-yum install libibvers-devel ibutils infiniband-diags perftest qperf +yum install libibverbs-devel ibutils infiniband-diags perftest qperf -y
  
 # amber20 cmake readline error fix needs # amber20 cmake readline error fix needs
Line 107: Line 129:
 # compute nodes old level 3 # compute nodes old level 3
 systemctl set-default multi-user.target systemctl set-default multi-user.target
 +
 +### centos7 so not an OpenHPC environment
  
 # other configs # other configs
Line 128: Line 152:
 ### REST AT HOME ### REST AT HOME
  
-# or via  rc.local? see n102 
-vi ifcfg-ib0 
-DEVICE=ib0 
-ONBOOT=yes 
-MTU=65520 
-CONNECTED_MODE=yes 
-BOOTPROTO=none 
-IPADDR=10.11.103.48 
-PREFIX=16 
-# check with ibstat 
- Port 1: 
- State: Active 
- Physical state: LinkUp 
-# check with ethtool ib0 
- Speed: 40000Mb/s 
  
 # /etc/fstab # /etc/fstab
Line 178: Line 187:
 rocommunity public rocommunity public
 dontLogTCPWrappersConnects yes dontLogTCPWrappersConnects yes
-enable, start, add to zenoss +# add to zenoss 
 +systemctl enable snmpd 
 +systemctl start snmpd 
  
  
Line 190: Line 202:
 ln -s /usr/local/slurm-22.05.2 /usr/local/slurm ln -s /usr/local/slurm-22.05.2 /usr/local/slurm
  
-Put the warewulf cluster key in authorized_keys + 
-cd /root/.ssh +backup and update passwd, shadow, group and hosts files 
-scp -rp 10.10.102.89:/root/.ssh/authorized_keys .+# scp from n79 or n45 
 + 
 +# slurm config 
 +echo 'SLURMD_OPTIONS="--conf-server 192.168.102.250"' > /etc/sysconfig/slurmd
 +  mkdir /var/log/slurm  
 +  chown slurm:munge /var/log/slurm  
 +  mkdir /var/spool/slurm  
 +  chown slurm:munge /var/spool/slurm  
 +# check 
 +chown -R munge:munge /etc/munge /var/log/munge /var/lib/munge /var/run/munge 
 +chown -R slurm:munge /var/log/slurm /var/spool/slurm 
 +systemctl enable munge 
 +systemctl start munge 
 +# test unmunge 
 +/usr/local/slurm/sbin/slurmd 
 +# check log 
 + 
 +# /etc/bashrc (login node) 
 +export PATH=/usr/local/slurm/bin:$PATH 
 +export LD_LIBRARY_PATH=/usr/local/slurm/lib:$LD_LIBRARY_PATH 
 + 
 +# crontab 
 + 
 +# ionice gaussian 
 +0,15,30,45 * * * * /share/apps/scripts/ionice_lexes.sh  > /dev/null 2>&1 
 + 
 +# cpu temps 
 +40 * * * * /share/apps/scripts/lm_sensors.sh > /dev/null 2>&1
 + 
 +on compute node /etc/security/limits.conf 
 +*                      memlock         270039400 
 + 
 + 
 +/etc/rc.local 
 +#timing issue with munge 
 +#sleep 15 
 +#/usr/local/slurm/sbin/slurmd 
 +chmod +x /etc/rc.d/rc.local 
 + 
 +# important!! put private back in place 
 +systemctl disable iptables 
 +systemctl stop iptables 
 +reboot 
 + 
 +# file date_ctt2.sh 
 + 
 +# ctt /etc/pdsh 
 + 
 +# ctt:/root/scripts 
 + 
 +# ctt2:/usr/local/bin/rslurm2022.sh
  
 # Put eth0 fingerprints in cottontail/greentail52 known hosts # Put eth0 fingerprints in cottontail/greentail52 known hosts
-# add to relevant known_hosts_servername file+
 # test slurm unmunge and update slurm.conf file # test slurm unmunge and update slurm.conf file
  
cluster/224.1697212341.txt.gz · Last modified: 2023/10/13 15:52 by hmeij07