cluster:228
Differences
This shows you the differences between two versions of the page.
| Next revision | Previous revision | ||
| cluster:228 [2025/01/21 20:09] – created hmeij07 | cluster:228 [2025/02/19 20:47] (current) – hmeij07 | ||
|---|---|---|---|
| Line 66: | Line 66: | ||
| </ | </ | ||
| + | ===== MFT fix ===== | ||
| + | |||
| + | We got a new RAID card. The **ib0** device was not available | ||
| + | |||
| + | After stepping through all memory configs from 2 dimms to 16 dimms | ||
| + | |||
| + | Then we added the card. | ||
| + | |||
| + | The card shows up but is still down/ | ||
| + | |||
| + | |||
| + | < | ||
| + | |||
| + | # modules loaded | ||
| + | |||
| + | [root@n103 ~]# lsmod | grep mlx5 | ||
| + | mlx5_ib | ||
| + | ib_uverbs | ||
| + | ib_core | ||
| + | mlx5_core | ||
| + | mlxfw 32768 1 mlx5_core | ||
| + | pci_hyperv_intf | ||
| + | tls | ||
| + | psample | ||
| + | |||
| + | # cable and power | ||
| + | |||
| + | [root@n103 ~]# dmesg | grep -i mlx5 | ||
| + | [ 3.097033] mlx5_core 0000: | ||
| + | [ 3.097071] mlx5_core 0000: | ||
| + | [ 3.505586] mlx5_core 0000: | ||
| + | [ 3.505893] mlx5_core 0000: | ||
| + | [ 3.510968] mlx5_core 0000: | ||
| + | [ 3.511247] mlx5_core 0000: | ||
| + | [ 3.550721] mlx5_core 0000: | ||
| + | [ 3.788710] mlx5_core 0000: | ||
| + | [ | ||
| + | |||
| + | # to enable infiniband, read on for the solution... | ||
| + | |||
| + | # download mft, stored on astrostore:/ | ||
| + | https:// | ||
| + | |||
| + | # copy from n102:/ | ||
| + | # collected from a previous installation | ||
| + | | ||
| + | cd mft-4.23-rpms/ | ||
| + | [root@n103 mft-4.23-rpms]# | ||
| + | annobin-10.67-3.el8.x86_64.rpm | ||
| + | dwz-0.12-10.el8.x86_64.rpm | ||
| + | efi-srpm-macros-3-3.el8.noarch.rpm | ||
| + | elfutils-0.187-4.el8.x86_64.rpm | ||
| + | elfutils-libelf-devel-0.187-4.el8.x86_64.rpm | ||
| + | gc-7.6.4-3.el8.x86_64.rpm | ||
| + | gcc-plugin-annobin-8.5.0-16.el8_7.x86_64.rpm | ||
| + | gdb-headless-8.2-19.el8.x86_64.rpm | ||
| + | ghc-srpm-macros-1.4.2-7.el8.noarch.rpm | ||
| + | go-srpm-macros-2-17.el8.noarch.rpm | ||
| + | guile-2.0.14-7.el8.x86_64.rpm | ||
| + | kernel-devel-4.18.0-425.3.1.el8.x86_64.rpm | ||
| + | libatomic_ops-7.6.2-3.el8.x86_64.rpm | ||
| + | libbabeltrace-1.5.4-4.el8.x86_64.rpm | ||
| + | libipt-1.6.1-8.el8.x86_64.rpm | ||
| + | |||
| + | # you need these rpms | ||
| + | rpm -iv rpm-build-4.14.3-24.el8_7.x86_64.rpm elfutils-0.187-4.el8.x86_64.rpm \ | ||
| + | | ||
| + | |||
| + | # and | ||
| + | rpm -ivh rpm-build kernel-devel # for running kernel version | ||
| + | |||
| + | # gunzip, untar mft tarball | ||
| + | cd / | ||
| + | cd mft-4.23.0-104-x86_64-rpm/ | ||
| + | ./ | ||
| + | |||
| + | -I- Removing any old MFT file if exists... | ||
| + | -I- Building the MFT kernel binary RPM... | ||
| + | -I- Installing the MFT RPMs... | ||
| + | Verifying... | ||
| + | Preparing... | ||
| + | Updating / installing... | ||
| + | | ||
| + | Verifying... | ||
| + | Preparing... | ||
| + | Updating / installing... | ||
| + | | ||
| + | -I- In order to start mst, please run "mst start" | ||
| + | |||
| + | mst start | ||
| + | |||
| + | Starting MST (Mellanox Software Tools) driver set | ||
| + | Loading MST PCI module - Success | ||
| + | Loading MST PCI configuration module - Success | ||
| + | Create devices | ||
| + | Unloading MST PCI module (unused) - Success | ||
| + | |||
| + | |||
| + | # you will need | ||
| + | git clone https:// | ||
| + | cd ibswinfo/ | ||
| + | scp -p ibswinfo.sh /usr/bin/ | ||
| + | cd /usr/bin | ||
| + | ln -s ibswinfo.sh | ||
| + | |||
| + | [root@n103 src]# ibstat | ||
| + | CA ' | ||
| + | CA type: MT4119 | ||
| + | Number of ports: 1 | ||
| + | Firmware version: 16.34.1002 | ||
| + | Hardware version: 0 | ||
| + | Node GUID: 0x98039b03007045f2 | ||
| + | System image GUID: 0x98039b03007045f2 | ||
| + | Port 1: | ||
| + | State: Down | ||
| + | Physical state: Disabled | ||
| + | Rate: 40 | ||
| + | Base lid: 0 | ||
| + | LMC: 0 | ||
| + | SM lid: 0 | ||
| + | Capability mask: 0x00010000 | ||
| + | Port GUID: 0x9a039bfffe7045f2 | ||
| + | Link layer: Ethernet | ||
| + | |||
| + | # read this post, all the way to the bottom | ||
| + | https:// | ||
| + | |||
| + | # useful articles | ||
| + | https:// | ||
| + | https:// | ||
| + | |||
| + | |||
| + | rpm -qf / | ||
| + | mft-4.23.0-104.x86_64 | ||
| + | |||
| + | # now configure the port on the new Mellanox card; first, a query | ||
| + | |||
| + | mlxconfig -d / | ||
| + | |||
| + | Device #1: | ||
| + | ---------- | ||
| + | |||
| + | Device type: ConnectX5 | ||
| + | Name: | ||
| + | Description: | ||
| + | Device: | ||
| + | |||
| + | Configurations: | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | |||
| + | # switch the port from ethernet to infiniband | ||
| + | mlxconfig -d / | ||
| + | |||
| + | Device #1: | ||
| + | ---------- | ||
| + | |||
| + | Device type: ConnectX5 | ||
| + | Name: | ||
| + | Description: | ||
| + | Device: | ||
| + | |||
| + | Configurations: | ||
| + | | ||
| + | |||
| + | Apply new Configuration? | ||
| + | Applying... Done! | ||
| + | -I- Please reboot machine to load new configurations. | ||
| + | |||
| + | reboot | ||
| + | |||
| + | [root@n103 ~]# ibstat | ||
| + | CA ' | ||
| + | CA type: MT4119 | ||
| + | Number of ports: 1 | ||
| + | Firmware version: 16.34.1002 | ||
| + | Hardware version: 0 | ||
| + | Node GUID: 0x98039b03007045f2 | ||
| + | System image GUID: 0x98039b03007045f2 | ||
| + | Port 1: | ||
| + | State: Active | ||
| + | Physical state: LinkUp | ||
| + | Rate: 100 | ||
| + | Base lid: 9 | ||
| + | LMC: 0 | ||
| + | SM lid: 1 | ||
| + | Capability mask: 0xa659e848 | ||
| + | Port GUID: 0x98039b03007045f2 | ||
| + | Link layer: InfiniBand | ||
| + | |||
| + | ib0: flags=4163< | ||
| + | inet 10.11.103.103 | ||
| + | Infiniband hardware address can be incorrect! Please read BUGS section in ifconfig(8). | ||
| + | infiniband 00: | ||
| + | RX packets 210 bytes 27332 (26.6 KiB) | ||
| + | RX errors 0 dropped 0 overruns 0 frame 0 | ||
| + | TX packets 228 bytes 17944 (17.5 KiB) | ||
| + | TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 | ||
| + | |||
| + | # mount the NFSoRDMA file system | ||
| + | |||
| + | 10.11.103.243:/ | ||
| + | |||
| + | # reload this module in / | ||
| + | |||
| + | / | ||
| + | / | ||
| + | sleep 10 | ||
| + | / | ||
| + | |||
| + | </ | ||
cluster/228.1737490181.txt.gz · Last modified: by hmeij07
