Back
Recipe for RTX4070ti nodes
# image using usb stick rocky 8.10
# enter bios set date, note MAC address
vi /etc/selinux/config
vi /etc/ssh/sshd_config
# SKIP NO WAREWULF
vi /etc/default/grub
# add net.ifnames=0 to GRUB_CMDLINE_LINUX
grub2-mkconfig -o /boot/grub2/grub.cfg
reboot
# add 10 to nodename for ip
cd /etc/sysconfig/network-scripts/
vi ifcfg-eno1
mv ifcfg-eno1 ifcfg-eth0
vi ifcfg-eno2
mv ifcfg-eno2 ifcfg-eth1
systemctl restart NetworkManager
ifconfig
# SKIP NO WAREWULF
# IPTABLES
yum install NetworkManager-initscripts-updown -y
vi ifcfg-eth0
# add 192.168 and 10.10 IPs
# 129.133.52.223/255.255.252.0/129.133.52.1
systemctl restart NetworkManager
ifconfig
dig google.com
yum install iptables-services
vi /etc/sysconfig/iptables
# ssh line add -s 129.133.22.66
systemctl enable iptables
systemctl start iptables
systemctl stop firewalld
systemctl disable firewalld
reboot
iptables -L
# DO NOT FORGET TO DISABLE iptables and remove the public IP once the internet steps are done
date
cat /etc/fstab
lsblk
ssh-keygen -t rsa
# add public key to n108
scp 10.10.102.118:/root/.ssh/authorized_keys /root/.ssh/
cd /etc
scp passwd passwd-orig
scp shadow shadow-orig
scp group group-orig
scp hosts hosts-orig
cd
scp 10.10.102.118:/etc/passwd /etc/passwd
scp 10.10.102.118:/etc/shadow /etc/shadow
scp 10.10.102.118:/etc/group /etc/group
scp 10.10.102.118:/etc/hosts /etc/hosts
dnf config-manager --set-enabled powertools -y
dnf install epel-release -y
dnf install netcdf netcdf-devel -y
dnf install yum-utils -y # yumdownloader
yum install epel-release -y
yum install flex bison -y
yum install tcl tcl-devel dmtcp dmtcp-devel -y
yum install net-snmp net-snmp-libs net-tools net-snmp-utils -y
yum install freeglut-devel libXi-devel libXmu-devel -y
yum install blas blas-devel lapack lapack-devel boost boost-devel -y
yum install lm_sensors lm_sensors-libs -y
yum install zlib-devel bzip2-devel -y
yum install openmpi openmpi-devel perl-ExtUtils-MakeMaker -y
yum install cmake -y
yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y
yum install libibverbs libibverbs-devel -y
yum install ncurses-c++-libs ncurses-devel readline-devel -y
yum install tcsh make gcc gcc-gfortran gcc-c++ which flex bison \
patch bc libXt-devel libXext-devel perl perl-ExtUtils-MakeMaker util-linux \
wget bzip2 bzip2-devel zlib-devel tar -y
dnf install postfix mailx -y
echo "relayhost = 192.168.102.251" >> /etc/postfix/main.cf
systemctl enable postfix
systemctl start postfix
mv /etc/snmp/snmpd.conf /etc/snmp/snmpd.conf-orig
scp 10.10.102.118:/etc/snmp/snmpd.conf /etc/snmp/snmpd.conf
# add 2 lines
systemctl enable snmpd
systemctl start snmpd
yum install R R-devel -y
yum install java-1.8.0-openjdk java-1.8.0-openjdk-devel \
java-1.8.0-openjdk-headless javapackages-filesystem -y
yum install python39 python39-devel -y
ln -s /usr/bin/python3.9 /usr/bin/python
yum install fftw fftw-devel -y
yum install gsl gsl-devel -y
yum install ruby ruby-devel -y
yum install openbabel openbabel-devel -y
umount /opt/ohpc/pub /opt/intel # make sure
scp 10.10.102.118:/usr/local/src/ohpc-release-2-1.el8.x86_64.rpm /usr/local/src/
rpm -ivh /usr/local/src/ohpc-release-2-1.el8.x86_64.rpm
yum install singularity-ohpc -y
yum install ohpc-base-compute --nobest -y
yum install ohpc-slurm-client -y
yum install --allowerasing lmod-ohpc -y
mv /etc/yum.repos.d/OpenHPC.repo /etc/yum.repos.d/OpenHPC.repo-disabled
mv /opt/ohpc /opt/ohpc-orig
yum install opencl-filesystem opencl-headers -y
yum install munge munge-devel -y
# n108 version
yum install kernel-4.18.0-553.30.1.el8_10.x86_64 \
kernel-modules-4.18.0-553.30.1.el8_10.x86_64 \
kernel-devel-4.18.0-553.30.1.el8_10.x86_64 \
kernel-tools-4.18.0-553.30.1.el8_10.x86_64 \
kernel-headers-4.18.0-553.30.1.el8_10.x86_64 \
kernel-core-4.18.0-553.30.1.el8_10.x86_64 \
kernel-tools-libs-4.18.0-553.30.1.el8_10.x86_64 \
kernel-debug-devel-4.18.0-553.30.1.el8_10.x86_64 -y
uname -a
# REMOVE public ip and shut down iptables
reboot
# NO INTERNET BELOW
mv /etc/issue.d/cockpit.issue /root/etc_issue.d_cockpit.issue
mv /etc/motd.d/cockpit /root/etc_motd.d_cockpit
vi /etc/chrony.conf
#pool 2.rocky.pool.ntp.org iburst
Server 192.168.102.250
Server 192.168.102.251
systemctl restart chronyd
echo SLURMD_OPTIONS="--conf-server 192.168.102.250" > /etc/sysconfig/slurmd
crontab -e # lm_sensors
#01 5,20 * * * /usr/bin/systemctl restart gmond
# ionice gaussian
#0,15,30,45 * * * * /share/apps/scripts/ionice_lexes.sh > /dev/null 2>&1
# cpu temps
#40 * * * * /share/apps/scripts/lm_sensors.sh > /dev/null 2>&1
scp 10.10.102.118:/etc/resolv.conf /etc/resolv.conf
scp 10.10.102.118:/etc/security/limits.conf /etc/security/limits.conf
#scp -rp cottontail2.wesleyan.edu:/etc/munge /etc/
#chown -R munge:munge /etc/munge/
scp -p 10.10.102.118:/etc/munge/munge.key /etc/munge/munge.key
ls -ld /etc/munge # check
ls -l /etc/munge/munge.key
systemctl enable munge
systemctl start munge
mkdir /usr/local/home;mv /home/hmeij07 /usr/local/home/
cd /home
mkdir localscratch
chmod ugo+rwx localscratch/
chmod o+t localscratch/
ln -s /zfshomes/apps
ln -s /zfshomes/tmp
ln -s /zfshomes/csmith06
cd /
ln -s /home/localscratch
ln -s /home /share
mkdir sanscratch
chmod ugo+rwx sanscratch/
chmod o+t sanscratch/
ls -l /home
ls -l /
cd /home
mkdir -p /zfshomes /home66 /home33 /mindstore /astrostore /vajedianlab
mkdir -p /smithlab/home;cd /smithlab;ln -s /smithlab/home/opt/rhel08 opt; ls -l
mkdir -p /opt/ohpc/pub /opt/intel /brunsonstore
cd
systemctl set-default multi-user.target
vi /etc/fstab
# add OHPC down section
mount -a
systemctl daemon-reload
scp 10.10.102.118:/etc/profile.d/lmod.sh /etc/profile.d/lmod.sh
scp -rp 10.10.102.118:/opt/ohpc/admin /opt/ohpc/ # not sure why missing
su - hmeij
module avail; module list; gcc --version # test lmod
mkdir /var/log/slurm
chown slurm:munge /var/log/slurm
mkdir /var/spool/slurm /var/spool/slurmd
chown -R munge:munge /etc/munge /var/log/munge /var/lib/munge /var/run/munge
chown -R slurm:munge /var/log/slurm /var/spool/slurm /var/spool/slurmd
scp -p 10.10.102.118:/etc/bashrc /etc/bashrc
scp -p 10.10.102.118:/etc/rc.d/rc.local /etc/rc.d/rc.local
#chmod +x /etc/rc.d/rc.local
cd /usr/local/src/
scp -p 10.10.102.118:/usr/local/src/NVIDIA-Linux-x86_64-550.67.run .
sh ./NVIDIA-Linux-x86_64-550.67.run
# settings at https://dokuwiki.wesleyan.edu/doku.php?id=cluster:225
reboot # make sure iptables and firewalld are disabled
nvidia-smi
cd /usr/local/src
ls /usr/src/kernels/
uname -a
scp -p 10.10.102.118:/usr/local/src/cuda_12.6.3_560.35.05_linux.run .
sh ./cuda_12.6.3_560.35.05_linux.run
#vi gpu-info
#vi gpu-process
cd /usr/local; ls -l
cd /usr/local/bin/
scp -p 10.10.102.118:/usr/local/bin/gpu-info .
scp -p 10.10.102.118:/usr/local/bin/gpu-process .
./gpu-info
./gpu-process
# ganglia and zabbix
mkdir -p /usr/local/src/tmp; cd /usr/local/src/tmp
scp -rp 10.10.102.118:/usr/local/src/ganglia-rpms ../
rpm -ivh ../ganglia-rpms/*.rpm;scp ../ganglia-rpms/gmond.conf.nodes /etc/ganglia/gmond.conf;\
systemctl enable gmond;systemctl start gmond;crontab -e
scp -rp 10.10.102.118:/usr/local/src/zabbix ../
rpm -ivh ../zabbix/zabbix-agent-7.0.4-release1.el8.x86_64.rpm
vi ../zabbix/zabbix_agentd.conf
scp ../zabbix/zabbix_agentd.conf /etc/zabbix/
unzip ../zabbix/zabbix-nvidia-smi-multi-gpu-master.zip
cd zabbix-nvidia-smi-multi-gpu-master/
mkdir /etc/zabbix/scripts
cp get_gpus_info.sh /etc/zabbix/scripts/
chmod +x /etc/zabbix/scripts/get_gpus_info.sh
cp userparameter_nvidia-smi.conf.linux /etc/zabbix/zabbix_agentd.d/userparameter_nvidia-smi.conf
systemctl enable zabbix-agent
systemctl start zabbix-agent
FINISH slurm-24.05.4
cd /usr/local/
scp -rp 10.10.102.118:/usr/local/amber16 .
scp -rp 10.10.102.118:/usr/local/amber20 .
scp -rp 10.10.102.118:/usr/local/lammps-22Aug18 .
scp -rp 10.10.102.118:/usr/local/slurm-24.05.4 .
ln -s slurm-24.05.4 slurm
ls -l
# check date, on cottontail2
NOW=`/bin/date +%m%d%H%M%Y.%S`; ssh n111 date $NOW
# test both ways
munge -n -t 10 | ssh n111 unmunge
munge -n -t 10 | ssh cottontail2 unmunge
# 192.168 ip up
# iptables <---
systemctl disable iptables
systemctl stop iptables
echo $HOSTNAME | mail -s NEWONE hmeij@wesleyan.edu
rm -f /var/log/slurm/*
reboot
# check on slurm node state
# server fingerprints
# ctt2_date.sh
# various /etc/hosts.pdsh files
Back