This shows you the differences between two versions of the page.
Next revision | Previous revision | ||
cluster:228 [2025/01/21 20:09] hmeij07 created |
cluster:228 [2025/02/19 20:47] (current) hmeij07 |
||
---|---|---|---|
Line 66: | Line 66: | ||
</ | </ | ||
+ | ===== MFT fix ===== | ||
+ | |||
+ | We got a new raid card. The **ib0** device was not available | ||
+ | |||
+ | After stepping through all memory configs from 2 dimms to 16 dimms | ||
+ | |||
+ | Then we added the card. | ||
+ | |||
+ | The card shows up but is still down/ | ||
+ | |||
+ | |||
+ | < | ||
+ | |||
+ | # modules loaded | ||
+ | |||
+ | [root@n103 ~]# lsmod | grep mlx5 | ||
+ | mlx5_ib | ||
+ | ib_uverbs | ||
+ | ib_core | ||
+ | mlx5_core | ||
+ | mlxfw 32768 1 mlx5_core | ||
+ | pci_hyperv_intf | ||
+ | tls | ||
+ | psample | ||
+ | |||
+ | # cable and power | ||
+ | |||
+ | [root@n103 ~]# dmesg | grep -i mlx5 | ||
+ | [ 3.097033] mlx5_core 0000: | ||
+ | [ 3.097071] mlx5_core 0000: | ||
+ | [ 3.505586] mlx5_core 0000: | ||
+ | [ 3.505893] mlx5_core 0000: | ||
+ | [ 3.510968] mlx5_core 0000: | ||
+ | [ 3.511247] mlx5_core 0000: | ||
+ | [ 3.550721] mlx5_core 0000: | ||
+ | [ 3.788710] mlx5_core 0000: | ||
+ | [ | ||
+ | |||
+ | # to enable infiniband, read on for the solution... | ||
+ | |||
+ | # download mft, stored on astrostore:/ | ||
+ | https:// | ||
+ | |||
+ | # copy from n102:/ | ||
+ | # collected from a previous installation | ||
+ | | ||
+ | cd mft-4.23-rpms/ | ||
+ | [root@n103 mft-4.23-rpms]# | ||
+ | annobin-10.67-3.el8.x86_64.rpm | ||
+ | dwz-0.12-10.el8.x86_64.rpm | ||
+ | efi-srpm-macros-3-3.el8.noarch.rpm | ||
+ | elfutils-0.187-4.el8.x86_64.rpm | ||
+ | elfutils-libelf-devel-0.187-4.el8.x86_64.rpm | ||
+ | gc-7.6.4-3.el8.x86_64.rpm | ||
+ | gcc-plugin-annobin-8.5.0-16.el8_7.x86_64.rpm | ||
+ | gdb-headless-8.2-19.el8.x86_64.rpm | ||
+ | ghc-srpm-macros-1.4.2-7.el8.noarch.rpm | ||
+ | go-srpm-macros-2-17.el8.noarch.rpm | ||
+ | guile-2.0.14-7.el8.x86_64.rpm | ||
+ | kernel-devel-4.18.0-425.3.1.el8.x86_64.rpm | ||
+ | libatomic_ops-7.6.2-3.el8.x86_64.rpm | ||
+ | libbabeltrace-1.5.4-4.el8.x86_64.rpm | ||
+ | libipt-1.6.1-8.el8.x86_64.rpm | ||
+ | |||
+ | # you need these rpms | ||
+ | rpm -iv rpm-build-4.14.3-24.el8_7.x86_64.rpm elfutils-0.187-4.el8.x86_64.rpm \ | ||
+ | | ||
+ | |||
+ | # and | ||
+ | rpm -ivh rpm-build kernel-devel # for running kernel version | ||
+ | |||
+ | # gunzip, untar mft tarball | ||
+ | cd / | ||
+ | cd mft-4.23.0-104-x86_64-rpm/ | ||
+ | ./ | ||
+ | |||
+ | -I- Removing any old MFT file if exists... | ||
+ | -I- Building the MFT kernel binary RPM... | ||
+ | -I- Installing the MFT RPMs... | ||
+ | Verifying... | ||
+ | Preparing... | ||
+ | Updating / installing... | ||
+ | | ||
+ | Verifying... | ||
+ | Preparing... | ||
+ | Updating / installing... | ||
+ | | ||
+ | -I- In order to start mst, please run "mst start" | ||
+ | |||
+ | mst start | ||
+ | |||
+ | Starting MST (Mellanox Software Tools) driver set | ||
+ | Loading MST PCI module - Success | ||
+ | Loading MST PCI configuration module - Success | ||
+ | Create devices | ||
+ | Unloading MST PCI module (unused) - Success | ||
+ | |||
+ | |||
+ | # you will need | ||
+ | git clone https:// | ||
+ | cd ibswinfo/ | ||
+ | scp -p ibswinfo.sh /usr/bin/ | ||
+ | cd /usr/bin | ||
+ | ln -s ibswinfo.sh | ||
+ | |||
+ | [root@n103 src]# ibstat | ||
+ | CA ' | ||
+ | CA type: MT4119 | ||
+ | Number of ports: 1 | ||
+ | Firmware version: 16.34.1002 | ||
+ | Hardware version: 0 | ||
+ | Node GUID: 0x98039b03007045f2 | ||
+ | System image GUID: 0x98039b03007045f2 | ||
+ | Port 1: | ||
+ | State: Down | ||
+ | Physical state: Disabled | ||
+ | Rate: 40 | ||
+ | Base lid: 0 | ||
+ | LMC: 0 | ||
+ | SM lid: 0 | ||
+ | Capability mask: 0x00010000 | ||
+ | Port GUID: 0x9a039bfffe7045f2 | ||
+ | Link layer: Ethernet | ||
+ | |||
+ | # read this post, all the way to the bottom | ||
+ | https:// | ||
+ | |||
+ | # useful articles | ||
+ | https:// | ||
+ | https:// | ||
+ | |||
+ | |||
+ | rpm -qf / | ||
+ | mft-4.23.0-104.x86_64 | ||
+ | |||
+ | # now configure the port on the new Mellanox card; first, a query | ||
+ | |||
+ | mlxconfig -d / | ||
+ | |||
+ | Device #1: | ||
+ | ---------- | ||
+ | |||
+ | Device type: ConnectX5 | ||
+ | Name: | ||
+ | Description: | ||
+ | Device: | ||
+ | |||
+ | Configurations: | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | |||
+ | # switch the port from ethernet to infiniband | ||
+ | mlxconfig -d / | ||
+ | |||
+ | Device #1: | ||
+ | ---------- | ||
+ | |||
+ | Device type: ConnectX5 | ||
+ | Name: | ||
+ | Description: | ||
+ | Device: | ||
+ | |||
+ | Configurations: | ||
+ | | ||
+ | |||
+ | Apply new Configuration? | ||
+ | Applying... Done! | ||
+ | -I- Please reboot machine to load new configurations. | ||
+ | |||
+ | reboot | ||
+ | |||
+ | [root@n103 ~]# ibstat | ||
+ | CA ' | ||
+ | CA type: MT4119 | ||
+ | Number of ports: 1 | ||
+ | Firmware version: 16.34.1002 | ||
+ | Hardware version: 0 | ||
+ | Node GUID: 0x98039b03007045f2 | ||
+ | System image GUID: 0x98039b03007045f2 | ||
+ | Port 1: | ||
+ | State: Active | ||
+ | Physical state: LinkUp | ||
+ | Rate: 100 | ||
+ | Base lid: 9 | ||
+ | LMC: 0 | ||
+ | SM lid: 1 | ||
+ | Capability mask: 0xa659e848 | ||
+ | Port GUID: 0x98039b03007045f2 | ||
+ | Link layer: InfiniBand | ||
+ | |||
+ | ib0: flags=4163< | ||
+ | inet 10.11.103.103 | ||
+ | Infiniband hardware address can be incorrect! Please read BUGS section in ifconfig(8). | ||
+ | infiniband 00: | ||
+ | RX packets 210 bytes 27332 (26.6 KiB) | ||
+ | RX errors 0 dropped 0 overruns 0 frame 0 | ||
+ | TX packets 228 bytes 17944 (17.5 KiB) | ||
+ | TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 | ||
+ | |||
+ | # mount the NFSoRDMA file system | ||
+ | |||
+ | 10.11.103.243:/ | ||
+ | |||
+ | # reload this module in / | ||
+ | |||
+ | / | ||
+ | / | ||
+ | sleep 10 | ||
+ | / | ||
+ | |||
+ | </ | ||