User Tools

Site Tools


cluster:229


Back

Recipe for RTX4070ti nodes

# image using usb stick rocky 8.10
# enter bios set date, note MAC address
vi /etc/selinux/config
vi /etc/ssh/sshd_config

# SKIP NO WAREWULF
vi /etc/default/grub
# add net.ifnames=0 to CMD LINE (GRUB_CMDLINE_LINUX)
grub2-mkconfig -o /boot/grub2/grub.cfg
reboot
# add 10 to nodename for ip
cd /etc/sysconfig/network-scripts/
vi ifcfg-eno1
mv ifcfg-eno1 ifcfg-eth0
vi ifcfg-eno2
mv ifcfg-eno2 ifcfg-eth1
systemctl restart NetworkManager
ifconfig
# SKIP NO WAREWULF

# IPTABLES
yum install NetworkManager-initscripts-updown -y
vi ifcfg-eth0
# add 192.168 and 10.10 IPs
# 129.133.52.223/255.255.252.0/129.133.52.1
systemctl restart NetworkManager
ifconfig
dig google.com

yum install iptables-services
vi /etc/sysconfig/iptables
# ssh line add -s 129.133.22.66
systemctl enable iptables
systemctl start iptables
systemctl stop firewalld
systemctl disable firewalld
reboot
iptables -L
# DO NOT FORGET TO DISABLE iptables (and remove the public ip) once internet access is no longer needed

date
cat /etc/fstab
lsblk
ssh-keygen -t rsa
# add public key to n108
scp 10.10.102.118:/root/.ssh/authorized_keys /root/.ssh/

cd /etc
scp passwd passwd-orig
scp shadow shadow-orig
scp group group-orig
scp hosts hosts-orig
cd

scp 10.10.102.118:/etc/passwd /etc/passwd
scp 10.10.102.118:/etc/shadow /etc/shadow
scp 10.10.102.118:/etc/group  /etc/group
scp 10.10.102.118:/etc/hosts  /etc/hosts


dnf config-manager --set-enabled powertools -y
dnf install epel-release -y
dnf install netcdf netcdf-devel -y
dnf install yum-utils -y # yumdownloader
yum install epel-release -y
yum install flex bison -y
yum install tcl tcl-devel dmtcp dmtcp-devel -y
yum install net-snmp net-snmp-libs net-tools net-snmp-utils -y
yum install freeglut-devel libXi-devel libXmu-devel -y
yum install blas blas-devel lapack lapack-devel boost boost-devel -y
yum install lm_sensors lm_sensors-libs -y
yum install zlib-devel bzip2-devel -y
yum install openmpi openmpi-devel perl-ExtUtils-MakeMaker -y
yum install cmake -y
yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y
yum install libibverbs libibverbs-devel -y
yum install ncurses-c++-libs ncurses-devel readline-devel -y
yum install tcsh make gcc gcc-gfortran gcc-c++  which flex bison \
patch bc libXt-devel libXext-devel perl perl-ExtUtils-MakeMaker util-linux \
wget bzip2 bzip2-devel zlib-devel tar -y

dnf install postfix mailx -y
echo "relayhost = 192.168.102.251" >> /etc/postfix/main.cf
systemctl enable postfix
systemctl start postfix

mv  /etc/snmp/snmpd.conf /etc/snmp/snmpd.conf-orig
scp 10.10.102.118:/etc/snmp/snmpd.conf  /etc/snmp/snmpd.conf
# add 2 lines
systemctl enable snmpd
systemctl start snmpd

yum install R R-devel -y
yum install java-1.8.0-openjdk java-1.8.0-openjdk-devel \
java-1.8.0-openjdk-headless javapackages-filesystem -y
yum install python39 python39-devel -y
ln -s /usr/bin/python3.9 /usr/bin/python
yum install fftw fftw-devel -y
yum install gsl gsl-devel -y
yum install ruby ruby-devel -y
yum install openbabel openbabel-devel -y

umount /opt/ohpc/pub /opt/intel # make sure
scp 10.10.102.118:/usr/local/src/ohpc-release-2-1.el8.x86_64.rpm  /usr/local/src/
rpm -ivh /usr/local/src/ohpc-release-2-1.el8.x86_64.rpm
yum install singularity-ohpc -y
yum  install ohpc-base-compute --nobest -y
yum  install ohpc-slurm-client -y
yum  install --allowerasing lmod-ohpc -y
 mv /etc/yum.repos.d/OpenHPC.repo /etc/yum.repos.d/OpenHPC.repo-disabled
 mv /opt/ohpc /opt/ohpc-orig

yum install opencl-filesystem opencl-headers -y
yum install munge munge-devel -y

# n108 version
yum install kernel-4.18.0-553.30.1.el8_10.x86_64 \
kernel-modules-4.18.0-553.30.1.el8_10.x86_64 \
kernel-devel-4.18.0-553.30.1.el8_10.x86_64 \
kernel-tools-4.18.0-553.30.1.el8_10.x86_64 \
kernel-headers-4.18.0-553.30.1.el8_10.x86_64 \
kernel-core-4.18.0-553.30.1.el8_10.x86_64 \
kernel-tools-libs-4.18.0-553.30.1.el8_10.x86_64 \
kernel-debug-devel-4.18.0-553.30.1.el8_10.x86_64 -y
uname -a
# REMOVE public ip and shut down iptables
reboot

# NO INTERNET BELOW

mv /etc/issue.d/cockpit.issue /root/etc_issue.d_cockpit.issue
mv /etc/motd.d/cockpit /root/etc_motd.d_cockpit

vi /etc/chrony.conf
#pool 2.rocky.pool.ntp.org iburst
Server 192.168.102.250
Server 192.168.102.251
systemctl restart chronyd

echo SLURMD_OPTIONS="--conf-server 192.168.102.250" > /etc/sysconfig/slurmd
crontab -e # lm_sensors

#01 5,20 * * * /usr/bin/systemctl restart gmond

# ionice gaussian
#0,15,30,45 * * * * /share/apps/scripts/ionice_lexes.sh  > /dev/null 2>&1

# cpu temps
#40 * * * * /share/apps/scripts/lm_sensors.sh > /dev/null 2>&1

scp 10.10.102.118:/etc/resolv.conf  /etc/resolv.conf
scp 10.10.102.118:/etc/security/limits.conf /etc/security/limits.conf

#scp -rp cottontail2.wesleyan.edu:/etc/munge /etc/
#chown -R munge:munge /etc/munge/
scp -p 10.10.102.118:/etc/munge/munge.key /etc/munge/munge.key
ls -ld /etc/munge # check
ls -l /etc/munge/munge.key
systemctl enable munge
systemctl start munge

mkdir /usr/local/home;mv /home/hmeij07 /usr/local/home/
cd /home
mkdir localscratch
chmod ugo+rwx localscratch/
chmod o+t localscratch/
ln -s /zfshomes/apps
ln -s /zfshomes/tmp
ln -s /zfshomes/csmith06
cd /
ln -s /home/localscratch
ln -s /home /share
mkdir sanscratch
chmod ugo+rwx sanscratch/
chmod o+t sanscratch/
ls -l /home
ls -l /

cd /home
mkdir -p /zfshomes /home66 /home33 /mindstore /astrostore /vajedianlab
mkdir -p /smithlab/home;cd /smithlab;ln -s /smithlab/home/opt/rhel08 opt; ls -l
mkdir -p /opt/ohpc/pub /opt/intel /brunsonstore
cd
systemctl set-default multi-user.target
vi /etc/fstab
# add OHPC down section
mount -a
systemctl daemon-reload


scp 10.10.102.118:/etc/profile.d/lmod.sh /etc/profile.d/lmod.sh
scp -rp 10.10.102.118:/opt/ohpc/admin /opt/ohpc/ # not sure why missing
su - hmeij
 module avail; module list; gcc --version # test lmod

mkdir /var/log/slurm
chown slurm:munge /var/log/slurm
mkdir /var/spool/slurm  /var/spool/slurmd
chown -R munge:munge /etc/munge /var/log/munge /var/lib/munge /var/run/munge
chown -R slurm:munge /var/log/slurm /var/spool/slurm /var/spool/slurmd

scp -p 10.10.102.118:/etc/bashrc  /etc/bashrc
scp -p 10.10.102.118:/etc/rc.d/rc.local /etc/rc.d/rc.local
#chmod +x /etc/rc.d/rc.local


cd /usr/local/src/
scp -p 10.10.102.118:/usr/local/src/NVIDIA-Linux-x86_64-550.67.run .
sh ./NVIDIA-Linux-x86_64-550.67.run
# settings at https://dokuwiki.wesleyan.edu/doku.php?id=cluster:225
reboot # make sure iptables and firewalld are disabled
nvidia-smi

cd /usr/local/src
ls /usr/src/kernels/
uname -a
scp -p 10.10.102.118:/usr/local/src/cuda_12.6.3_560.35.05_linux.run  .
sh ./cuda_12.6.3_560.35.05_linux.run

#vi gpu-info
#vi gpu-process
cd /usr/local; ls -l
cd /usr/local/bin/
scp -p 10.10.102.118:/usr/local/bin/gpu-info .
scp -p 10.10.102.118:/usr/local/bin/gpu-process .
./gpu-info
./gpu-process


# ganglia and zabbix
mkdir -p /usr/local/src/tmp; cd /usr/local/src/tmp

scp -rp 10.10.102.118:/usr/local/src/ganglia-rpms ../
rpm -ivh ../ganglia-rpms/*.rpm;scp ../ganglia-rpms/gmond.conf.nodes /etc/ganglia/gmond.conf;\
systemctl enable gmond;systemctl start gmond;crontab -e

scp -rp 10.10.102.118:/usr/local/src/zabbix ../
rpm -ivh ../zabbix/zabbix-agent-7.0.4-release1.el8.x86_64.rpm
vi ../zabbix/zabbix_agentd.conf
scp ../zabbix/zabbix_agentd.conf /etc/zabbix/
unzip ../zabbix/zabbix-nvidia-smi-multi-gpu-master.zip
cd zabbix-nvidia-smi-multi-gpu-master/
mkdir /etc/zabbix/scripts
cp get_gpus_info.sh /etc/zabbix/scripts/
chmod +x /etc/zabbix/scripts/get_gpus_info.sh
cp userparameter_nvidia-smi.conf.linux /etc/zabbix/zabbix_agentd.d/userparameter_nvidia-smi.conf
systemctl enable zabbix-agent
systemctl start zabbix-agent


FINISH slurm-24.05.4


cd /usr/local/
scp -rp 10.10.102.118:/usr/local/amber16  .
scp -rp 10.10.102.118:/usr/local/amber20  .
scp -rp 10.10.102.118:/usr/local/lammps-22Aug18 .
scp -rp 10.10.102.118:/usr/local/slurm-24.05.4 .
ln -s slurm-24.05.4 slurm
ls -l

# check date, on cottontail2
NOW=`/bin/date +%m%d%H%M%Y.%S`; ssh n111 date $NOW
# test both ways
munge -n -t 10 | ssh n111 unmunge
munge -n -t 10 | ssh cottontail2 unmunge

# 192.168 ip up
# iptables <---
systemctl disable iptables
systemctl stop iptables
echo $HOSTNAME |  mail -s NEWONE hmeij@wesleyan.edu
rm -f /var/log/slurm/*


reboot
# check on slurm node state


# server fingerprints
# ctt2_date.sh
# various /etc/hosts.pdsh files





Back

cluster/229.txt · Last modified: 2025/02/20 18:54 by hmeij07