\\ **[[cluster:0|Back]]** ===== Recipe for RTX4070ti nodes ===== # image using usb stick rocky 8.10 # enter bios set date, note MAC address vi /etc/selinux/config vi /etc/ssh/sshd_config # SKIP NO WAREWULF vi /etc/default/grub # add net.ifnames=0 to CMD LINE grub2-mkconfig -o /boot/grub2/grub.cfg reboot # add 10 to nodename for ip cd /etc/sysconfig/network-scripts/ vi ifcfg-eno1 mv ifcfg-eno1 ifcfg-eth0 vi ifcfg-eno2 mv ifcfg-eno2 ifcfg-eth1 systemctl restart NetworkManager ifconfig # SKIP NO WAREWULF # IPTABLES yum install NetworkManager-initscripts-updown -y vi ifcfg-eth0 # add 192.168 and 10.10 IPs # 129.133.52.223/255.255.252.0/129.133.52.1 systemctl restart NetworkManager ifconfig dig google.com yum install iptables-services vi /etc/sysconfig/iptables # ssh line add -s 129.133.22.66 systemctl enable iptables systemctl start iptables systemctl stop firewalld systemctl disable firewalld reboot iptables -L # DO NOT FORGET TO DISABLE after internet date cat /etc/fstab lsblk ssh-keygen -t rsa # add public key to n108 scp 10.10.102.118:/root/.ssh/authorized_keys /root/.ssh/ cd /etc scp passwd passwd-orig scp shadow shadow-orig scp group group-orig scp hosts hosts-orig cd scp 10.10.102.118:/etc/passwd /etc/passwd scp 10.10.102.118:/etc/shadow /etc/shadow scp 10.10.102.118:/etc/group /etc/group scp 10.10.102.118:/etc/hosts /etc/hosts dnf config-manager --set-enabled powertools -y dnf install epel-release -y dnf install netcdf netcdf-devel -y dnf install yum-utils -y # yumdownloader yum install epel-release -y yum install flex bison -y yum install tcl tcl-devel dmtcp dmtcp-devel -y yum install net-snmp net-snmp-libs net-tools net-snmp-utils -y yum install freeglut-devel libXi-devel libXmu-devel -y yum install blas blas-devel lapack lapack-devel boost boost-devel -y yum install lm_sensors lm_sensors-libs -y yum install zlib-devel bzip2-devel -y yum install openmpi openmpi-devel perl-ExtUtils-MakeMaker -y yum install cmake -y yum install libjpeg libjpeg-devel 
libjpeg-turbo-devel -y yum install libibverbs libibverbs-devel -y yum install ncurses-c++-libs ncurses-devel readline-devel -y yum install tcsh make gcc gcc-gfortran gcc-c++ which flex bison \ patch bc libXt-devel libXext-devel perl perl-ExtUtils-MakeMaker util-linux \ wget bzip2 bzip2-devel zlib-devel tar -y dnf install postfix mailx -y echo "relayhost = 192.168.102.251" >> /etc/postfix/main.cf systemctl enable postfix systemctl start postfix mv /etc/snmp/snmpd.conf /etc/snmp/snmpd.conf-orig scp 10.10.102.118:/etc/snmp/snmpd.conf /etc/snmp/snmpd.conf # add 2 lines systemctl enable snmpd systemctl start snmpd yum install R R-devel -y yum install java-1.8.0-openjdk java-1.8.0-openjdk-devel \ java-1.8.0-openjdk-headless javapackages-filesystem -y yum install python39 python39-devel -y ln -s /usr/bin/python3.9 /usr/bin/python yum install fftw fftw-devel -y yum install gsl gsl-devel -y yum install ruby ruby-devel -y yum install openbabel openbabel-devel -y umount /opt/ohpc/pub /opt/intel # make sure scp 10.10.102.118:/usr/local/src/ohpc-release-2-1.el8.x86_64.rpm /usr/local/src/ rpm -ivh /usr/local/src/ohpc-release-2-1.el8.x86_64.rpm yum install singularity-ohpc -y yum install ohpc-base-compute --nobest -y yum install ohpc-slurm-client -y yum install --allowerasing lmod-ohpc -y mv /etc/yum.repos.d/OpenHPC.repo /etc/yum.repos.d/OpenHPC.repo-disabled mv /opt/ohpc /opt/ohpc-orig yum install opencl-filesystem opencl-headers -y yum install munge munge-devel -y # n108 version yum install kernel-4.18.0-553.30.1.el8_10.x86_64 \ kernel-modules-4.18.0-553.30.1.el8_10.x86_64 \ kernel-devel-4.18.0-553.30.1.el8_10.x86_64 \ kernel-tools-4.18.0-553.30.1.el8_10.x86_64 \ kernel-headers-4.18.0-553.30.1.el8_10.x86_64 \ kernel-core-4.18.0-553.30.1.el8_10.x86_64 \ kernel-tools-libs-4.18.0-553.30.1.el8_10.x86_64 \ kernel-debug-devel-4.18.0-553.30.1.el8_10.x86_64 -y uname -a # REMOVE public ip and shut down iptables reboot # NO INTERNET BELOW mv /etc/issue.d/cockpit.issue 
/root/etc_issue.d_cockpit.issue mv /etc/motd.d/cockpit /root/etc_motd.d_cockpit vi /etc/chrony.conf #pool 2.rocky.pool.ntp.org iburst Server 192.168.102.250 Server 192.168.102.251 systemctl restart chronyd echo SLURMD_OPTIONS="--conf-server 192.168.102.250" > /etc/sysconfig/slurmd crontab -e # lm_sensors #01 5,20 * * * /usr/bin/systemctl restart gmond # ionice gaussian #0,15,30,45 * * * * /share/apps/scripts/ionice_lexes.sh > /dev/null 2>&1 # cpu temps #40 * * * * /share/apps/scripts/lm_sensors.sh > /dev/null 2>&1 scp 10.10.102.118:/etc/resolv.conf /etc/resolv.conf scp 10.10.102.118:/etc/security/limits.conf /etc/security/limits.conf #scp -rp cottontail2.wesleyan.edu:/etc/munge /etc/ #chown -R munge:munge /etc/munge/ scp -p 10.10.102.118:/etc/munge/munge.key /etc/munge/munge.key ls -ld /etc/munge # check ls -l /etc/munge/munge.key systemctl enable munge systemctl start munge mkdir /usr/local/home;mv /home/hmeij07 /usr/local/home/ cd /home mkdir localscratch chmod ugo+rwx localscratch/ chmod o+t localscratch/ ln -s /zfshomes/apps ln -s /zfshomes/tmp ln -s /zfshomes/csmith06 cd / ln -s /home/localscratch ln -s /home /share mkdir sanscratch chmod ugo+rwx sanscratch/ chmod o+t sanscratch/ ls -l /home ls -l / cd /home mkdir -p /zfshomes /home66 /home33 /mindstore /astrostore /vajedianlab mkdir -p /smithlab/home;cd /smithlab;ln -s /smithlab/home/opt/rhel08 opt; ls -l mkdir -p /opt/ohpc/pub /opt/intel /brunsonstore cd systemctl set-default multi-user.target vi /etc/fstab # add OHPC down section mount -a systemctl daemon-reload scp 10.10.102.118:/etc/profile.d/lmod.sh /etc/profile.d/lmod.sh scp -rp 10.10.102.118:/opt/ohpc/admin /opt/ohpc/ # not sure why missing su - hmeij module avail; module list; gcc --version # test lmod mkdir /var/log/slurm chown slurm:munge /var/log/slurm mkdir /var/spool/slurm /var/spool/slurmd chown -R munge:munge /etc/munge /var/log/munge /var/lib/munge /var/run/munge chown -R slurm:munge /var/log/slurm /var/spool/slurm /var/spool/slurmd scp -p 
10.10.102.118:/etc/bashrc /etc/bashrc scp -p 10.10.102.118:/etc/rc.d/rc.local /etc/rc.d/rc.local #chmod +x /etc/rc.d/rc.local cd /usr/local/src/ scp -p 10.10.102.118:/usr/local/src/NVIDIA-Linux-x86_64-550.67.run . sh ./NVIDIA-Linux-x86_64-550.67.run # settings at https://dokuwiki.wesleyan.edu/doku.php?id=cluster:225 reboot # make sure iptables and firewalld are disabled nvidia-smi cd /usr/local/src ls /usr/src/kernels/ uname -a scp -p 10.10.102.118:/usr/local/src/cuda_12.6.3_560.35.05_linux.run . sh ./cuda_12.6.3_560.35.05_linux.run #vi gpu-info #vi gpu-process cd /usr/local; ls -l cd /usr/local/bin/ scp -p 10.10.102.118:/usr/local/bin/gpu-info . scp -p 10.10.102.118:/usr/local/bin/gpu-process . ./gpu-info ./gpu-process # ganglia and zabbix mkdir -p /usr/local/src/tmp; cd /usr/local/src/tmp scp -rp 10.10.102.118:/usr/local/src/ganglia-rpms ../ rpm -ivh ../ganglia-rpms/*.rpm;scp ../ganglia-rpms/gmond.conf.nodes /etc/ganglia/gmond.conf;\ systemctl enable gmond;systemctl start gmond;crontab -e scp -rp 10.10.102.118:/usr/local/src/zabbix ../ rpm -ivh ../zabbix/zabbix-agent-7.0.4-release1.el8.x86_64.rpm vi ../zabbix/zabbix_agentd.conf scp ../zabbix/zabbix_agentd.conf /etc/zabbix/ unzip ../zabbix/zabbix-nvidia-smi-multi-gpu-master.zip cd zabbix-nvidia-smi-multi-gpu-master/ mkdir /etc/zabbix/scripts cp get_gpus_info.sh /etc/zabbix/scripts/ chmod +x /etc/zabbix/scripts/get_gpus_info.sh cp userparameter_nvidia-smi.conf.linux /etc/zabbix/zabbix_agentd.d/userparameter_nvidia-smi.conf systemctl enable zabbix-agent systemctl start zabbix-agent FINISH slurm-24.05.4 cd /usr/local/ scp -rp 10.10.102.118:/usr/local/amber16 . scp -rp 10.10.102.118:/usr/local/amber20 . scp -rp 10.10.102.118:/usr/local/lammps-22Aug18 . scp -rp 10.10.102.118:/usr/local/slurm-24.05.4 . 
ln -s slurm-24.05.4 slurm ls -l # check date, on cottontail2 NOW=`/bin/date +%m%d%H%M%Y.%S`; ssh n111 date $NOW # test both ways munge -n -t 10 | ssh n111 unmunge munge -n -t 10 | ssh cottontail2 unmunge # 192.168 ip up # iptables <--- systemctl disable iptables systemctl stop iptables echo $HOSTNAME | mail -s NEWONE hmeij@wesleyan.edu rm -f /var/log/slurm/* reboot # check on slurm node state # server fingerprints # ctt2_date.sh # various /etc/hosts.pdsh files \\ **[[cluster:0|Back]]**