This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
cluster:192 [2020/02/27 15:46] hmeij07 [Miscellaneous] |
cluster:192 [2022/03/08 18:29] (current) hmeij07 [Recipe] |
||
---|---|---|---|
Line 10: | Line 10: | ||
The Usage section below is for HPCC users wanting to use queue '' | The Usage section below is for HPCC users wanting to use queue '' | ||
+ | Debug for node n89, which turns itself off...grrhhh. Create a bootable USB stick with https:// | ||
+ | < | ||
+ | |||
+ | [root@n89 ~]# ipmitool sel elist | ||
+ | 1 | 02/29/2020 | 16:57:33 | Memory #0xd1 | Uncorrectable ECC | Asserted | ||
+ | 2 | 03/02/2020 | 03:02:42 | Processor CPU_CATERR | IERR | Asserted | ||
+ | 3 | 03/11/2020 | 19:27:35 | Processor CPU_CATERR | IERR | Asserted | ||
+ | ...[snip]... | ||
+ | |||
+ | [root@n89 ~]# ipmitool sdr elist | ||
+ | CPU1 Temperature | 31h | ok | 3.0 | 43 degrees C | ||
+ | CPU2 Temperature | 32h | ok | 0.0 | 40 degrees C | ||
+ | PSU1 Over Temp | 92h | ok | 0.0 | Transition to OK | ||
+ | PSU2 Over Temp | 9Ah | ok | 0.0 | Transition to OK | ||
+ | ...[snip]... | ||
+ | DIMMM1_Temp | ||
+ | CPU1_ECC1 | ||
+ | CPU2_ECC1 | ||
+ | ...[snip]... | ||
+ | PMBPower1 | ||
+ | PMBPower2 | ||
+ | ...[snip]... | ||
+ | FRNT_FAN1 | ||
+ | ...[snip]... | ||
+ | PSU1 Slow FAN1 | 95h | ok | 0.0 | Transition to OK | ||
+ | PSU2 Slow FAN1 | 9Dh | ok | 0.0 | Transition to OK | ||
+ | ...[snip]... | ||
+ | |||
+ | |||
+ | [root@n89 ~]# | ||
+ | # dmidecode 3.2 | ||
+ | Getting SMBIOS data from sysfs. | ||
+ | SMBIOS 3.2 present. | ||
+ | |||
+ | Handle 0x0000, DMI type 0, 26 bytes | ||
+ | BIOS Information | ||
+ | Vendor: American Megatrends Inc. | ||
+ | Version: 5102 | ||
+ | Release Date: 02/11/2019 | ||
+ | Address: 0xF0000 | ||
+ | Runtime Size: 64 kB | ||
+ | ROM Size: 32 MB | ||
+ | Characteristics: | ||
+ | ...[snip]... | ||
+ | UEFI is supported | ||
+ | BIOS Revision: 5.14 | ||
+ | |||
+ | |||
+ | [root@n89 ~]# edac-util -s -v | ||
+ | edac-util: EDAC drivers are loaded. 4 MCs detected: | ||
+ | mc0:Skylake Socket#0 IMC#0 | ||
+ | mc1:Skylake Socket#0 IMC#1 | ||
+ | mc2:Skylake Socket#1 IMC#0 | ||
+ | mc3:Skylake Socket#1 IMC#1 | ||
+ | [root@n89 ~]# edac-util | ||
+ | edac-util: No errors to report. | ||
+ | |||
+ | syslog | ||
+ | |||
+ | </ | ||
==== Usage ==== | ==== Usage ==== | ||
Line 81: | Line 141: | ||
#/ | #/ | ||
- | # for amber16 -pm=ENABLED -c=EXCLUSIVE_PROCESS | + | # for amber16 -pm=1/ENABLED -c=1/EXCLUSIVE_PROCESS |
#nvidia-smi --persistence-mode=1 | #nvidia-smi --persistence-mode=1 | ||
#nvidia-smi --compute-mode=1 | #nvidia-smi --compute-mode=1 | ||
- | # for mwgpu/exx96 -pm=ENABLED -c=DEFAULT | + | # for mwgpu/exx96 -pm=1/ENABLED -c=0/DEFAULT |
# note: turned this off, running with defaults | # note: turned this off, running with defaults | ||
+ | # seems stable, maybe persistence later on | ||
+ | # let's see how docker interacts first... | ||
#nvidia-smi --persistence-mode=1 | #nvidia-smi --persistence-mode=1 | ||
#nvidia-smi --compute-mode=0 | #nvidia-smi --compute-mode=0 | ||
Line 134: | Line 196: | ||
systemctl restart network | systemctl restart network | ||
dig google.com | dig google.com | ||
+ | #centos7 | ||
yum install -y iptables-services | yum install -y iptables-services | ||
vi / | vi / | ||
Line 155: | Line 218: | ||
# add packages and update | # add packages and update | ||
yum install epel-release -y | yum install epel-release -y | ||
+ | yum install flex flex-devel bison bison-devel -y | ||
yum install tcl tcl-devel dmtcp -y | yum install tcl tcl-devel dmtcp -y | ||
+ | yum install net-snmp net-snmp-libs net-agent-libs net-tools net-snmp-utils -y | ||
yum install freeglut-devel libXi-devel libXmu-devel \ make mesa-libGLU-devel -y | yum install freeglut-devel libXi-devel libXmu-devel \ make mesa-libGLU-devel -y | ||
yum install blas blas-devel lapack lapack-devel boost boost-devel -y | yum install blas blas-devel lapack lapack-devel boost boost-devel -y | ||
Line 163: | Line 228: | ||
yum install cmake cmake-devel -y | yum install cmake cmake-devel -y | ||
yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y | yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y | ||
+ | # amber | ||
+ | yum -y install tcsh make \ | ||
+ | gcc gcc-gfortran gcc-c++ \ | ||
+ | which flex bison patch bc \ | ||
+ | | ||
+ | perl perl-ExtUtils-MakeMaker util-linux wget \ | ||
+ | bzip2 bzip2-devel zlib-devel tar | ||
yum update -y | yum update -y | ||
yum clean all | yum clean all | ||
Line 234: | Line 306: | ||
nvcr.io/ | nvcr.io/ | ||
- | # free -g | + | free -m |
total used free shared | total used free shared | ||
- | Mem: 92 | + | Mem: |
+ | Swap: | ||
# nvidia-smi | # nvidia-smi |