\\
**[[cluster:
The production copy of OpenMPI is in ''/
--- //

====== HPLinpack Runs ======

The purpose here is to rerun the HPLinpack benchmarks Amol ran while configuring the cluster.

^Before^
|{{:
^During^
|{{:
^Ooops^
|{{:

FAQ [[http://

====== Problem Sizes ======

N calculation,
4 nodes, 4 GB each is 16 GB total, which yields 2G double-precision (8-byte) elements ... 2G is 2*1024*1024*1024 = 2,

N calculation, 16 nodes (infiniband or ethernet):\\
16 nodes, 4 GB each is 64 GB total, which yields 8G double-precision (8-byte) elements ... 8G is 8*1024*1024*1024 = 8,

N calculation, 4 heavy weight nodes:\\
4 nodes, 16 GB each is 64 GB total, which yields 8G double-precision (8-byte) elements ... 8G is 8*1024*1024*1024 = 8,
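
The arithmetic above can be wrapped in a small helper. This is only a sketch (the script name and the 80% headroom factor are my assumptions; 80% is consistent with the N=74145 used in the HPL.dat files below, since 0.8 * sqrt(64 GB / 8 bytes) is about 74145):

<code>
#!/bin/bash
# hpl_n.sh (hypothetical helper) - suggest an HPL problem size N
# from the node count and the memory per node.
NODES=${1:-16}        # number of nodes
GB_PER_NODE=${2:-4}   # memory per node in GB
FRACTION=${3:-0.80}   # share of total memory given to the matrix (assumed)

awk -v n="$NODES" -v g="$GB_PER_NODE" -v f="$FRACTION" 'BEGIN {
    bytes    = n * g * 1024 * 1024 * 1024   # total memory in bytes
    elements = bytes / 8                    # 8 bytes per double
    nmax     = sqrt(elements)               # largest N that fits in RAM
    printf "N_max = %d, suggested N = %d\n", nmax, int(f * nmax)
}'
</code>

For example ''./hpl_n.sh 16 4'' (or ''./hpl_n.sh 4 16'') reports a suggested N of 74145 for the 64 GB configurations, and ''./hpl_n.sh 4 4'' reports about 37072 for the 16 GB one.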
| + | |||
| + | NB calculations: | ||
| + | range of 32...256\\ | ||
| + | ood starting values are 88 132 | ||
| + | |||
| + | PxQ Grid:\\ | ||
| + | max value PxQ should equal nr of cores \\ | ||
| + | P<Q ... close for infiniband but P much smaller than Q for ethernet | ||
| + | |||
| + | LWNi (np=128): P=8, Q=16\\ | ||
| + | LWNe (np=128): P=4, Q=32 or P=2, Q=64\\ | ||
| + | HWN (np=32): P=4, Q=8 or P=2, Q=16 | ||
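
To enumerate the grid choices for a given process count, a quick sketch (a hypothetical helper, not part of the original runs):

<code>
#!/bin/bash
# grids.sh (hypothetical) - list P x Q factorizations with P <= Q.
# Near-square pairs (e.g. 8x16) suit infiniband; flatter ones
# (e.g. 4x32 or 2x64) suit ethernet, per the notes above.
NP=${1:-128}                      # total number of MPI processes

for ((p = 1; p * p <= NP; p++)); do
    if (( NP % p == 0 )); then
        echo "P=$p  Q=$((NP / p))"
    fi
done
</code>

''./grids.sh 128'' includes the 8x16, 4x32 and 2x64 grids used below; ''./grids.sh 32'' includes the 4x8 and 2x16 grids for the heavy weight nodes.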
| + | |||
| + | ===== Infiniband (16 nodes) ===== | ||
| + | |||
| + | * nodes: compute-1-1 thru compute-1-16 | ||
| + | * each dual quad 2.6 ghz PE1950 (2x4x16 totals 128 cores) | ||
| + | * each with 4 gb ram (4x16=64 gb total memory) | ||
| + | |||
| + | ===== HPL.dat ====== | ||
| + | |||
<code>
HPLinpack benchmark input file
Innovative Computing Laboratory, University of Tennessee
HPL.out      output file name (if any)
7            device out (6=stdout,7=stderr,file)
8            # of problems sizes (N)
1000 5000 10000 15000 20000 25000 30000 35000  Ns
6            # of NBs
200 300 400 500 600 700  NBs
0            PMAP process mapping (0=Row-,1=Column-major)
1            # of process grids (P x Q)
8            Ps
16           Qs
16.0         threshold
3            # of panel fact
0 1 2        PFACTs (0=left, 1=Crout, 2=Right)
2            # of recursive stopping criterium
2 4          NBMINs (>= 1)
1            # of panels in recursion
2            NDIVs
3            # of recursive panel fact.
0 1 2        RFACTs (0=left, 1=Crout, 2=Right)
1            # of broadcast
0            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
1            # of lookahead depth
0            DEPTHs (>=0)
2            SWAP (0=bin-exch,1=long,2=mix)
64           swapping threshold
0            L1 in (0=transposed,1=no-transposed) form
0            U  in (0=transposed,1=no-transposed) form
1            Equilibration (0=no,1=yes)
8            memory alignment in double (> 0)
</code>
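
HPL runs one test case per combination of the swept parameters (N, NB, grid, PFACT, NBMIN, NDIV, RFACT, BCAST, DEPTH), which is what makes this file take so long. A rough count:

<code>
# Ns x NBs x grids x PFACTs x NBMINs x NDIVs x RFACTs x BCASTs x DEPTHs
echo $(( 8 * 6 * 1 * 3 * 2 * 1 * 3 * 1 * 1 ))   # = 864 test cases
</code>

The HPL.dat files further down run only one or two combinations each at the full N=74145, which is why they finish sooner per file despite the larger problem size.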
| + | |||
| + | ===== run script ====== | ||
| + | |||
| + | **=> su delltest** | ||
| + | |||
| + | < | ||
| + | #!/bin/bash | ||
| + | |||
| + | echo " | ||
| + | |||
| + | P4_GLOBMEMSIZE=10000000 | ||
| + | export P4_GLOBMEMSIZE | ||
| + | |||
| + | echo " | ||
| + | echo "/ | ||
| + | |||
| + | date > HPL.start | ||
| + | (/ | ||
| + | </ | ||
| + | |||
| + | => with above HPL.dat file, this configuration runs for 8 hours ... | ||
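
The launch line in this copy of the script is cut off. As a rough sketch only, not the recorded command (P4_GLOBMEMSIZE is an MPICH1 ch_p4 setting, and that mpirun takes ''-np'' and ''-machinefile''; the binary path, machines file and log name below are placeholders), it plausibly looked something like:

<code>
# placeholder paths - not the actual ones used on the cluster
(mpirun -np 128 -machinefile ~/machines ./xhpl > HPL.log 2>&1) &
</code>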
| + | |||
| + | |||
| + | ===== Ethernet (16 nodes) ===== | ||
| + | |||
| + | * nodes: compute-1-17 thru compute-2-32 | ||
| + | * each dual quad 2.6 ghz PE1950 (2x4x16 totals 128 cores) | ||
| + | * each with 4 gb ram (4x16=64 gb total memory) | ||
| + | |||
| + | ===== HPL.dat ====== | ||
| + | |||
| + | < | ||
| + | Innovative Computing Laboratory, University of Tennessee | ||
| + | HPL.out | ||
| + | 7 device out (6=stdout, | ||
| + | 1 # of problems sizes (N) | ||
| + | 74145 Ns | ||
| + | 1 # of NBs | ||
| + | 88 NBs | ||
| + | 0 PMAP process mapping (0=Row-, | ||
| + | 1 # of process grids (P x Q) | ||
| + | 4 Ps | ||
| + | 32 Qs | ||
| + | 16.0 | ||
| + | 3 # of panel fact | ||
| + | 0 1 2 PFACTs (0=left, 1=Crout, 2=Right) | ||
| + | 2 # of recursive stopping criterium | ||
| + | 2 4 NBMINs (>= 1) | ||
| + | 1 # of panels in recursion | ||
| + | 2 NDIVs | ||
| + | 3 # of recursive panel fact. | ||
| + | 0 1 2 RFACTs (0=left, 1=Crout, 2=Right) | ||
| + | 1 # of broadcast | ||
| + | 0 BCASTs (0=1rg, | ||
| + | 1 # of lookahead depth | ||
| + | 0 DEPTHs (>=0) | ||
| + | 2 SWAP (0=bin-exch, | ||
| + | 64 | ||
| + | 0 L1 in (0=transposed, | ||
| + | 0 U in (0=transposed, | ||
| + | 1 Equilibration (0=no, | ||
| + | 8 memory alignment in double (> 0) | ||
| + | </ | ||
| + | |||
| + | |||
| + | |||
| + | |||
| + | |||
| + | ===== run script ====== | ||
| + | |||
| + | **=> su delltest2** | ||
| + | |||
| + | < | ||
| + | #!/bin/bash | ||
| + | |||
| + | echo " | ||
| + | |||
| + | P4_GLOBMEMSIZE=10000000 | ||
| + | export P4_GLOBMEMSIZE | ||
| + | |||
| + | date > HPL.start | ||
| + | |||
| + | |||
| + | echo " | ||
| + | echo "/ | ||
| + | / | ||
| + | |||
| + | (/ | ||
| + | / | ||
| + | </ | ||
| + | |||
| + | => runs for 4 hours ... change these lines below and it'll run for 14 hours | ||
| + | |||
| + | < | ||
| + | 2 # of problems sizes (N) | ||
| + | 74145 74145 Ns | ||
| + | 2 # of NBs | ||
| + | 88 132 NBs | ||
| + | </ | ||
| + | |||
| + | |||
| + | ===== Ethernet (4 nodes) ===== | ||
| + | |||
| + | |||
| + | * nodes: nfs-2-1 thru nfs-2-4 | ||
| + | * each dual quad 2.6 ghz PE1950 (2x4x4 totals 32 cores) | ||
| + | * each with 16 gb ram (16x4=64 gb total memory) | ||
| + | |||
| + | ===== HPL.dat ====== | ||
| + | |||
| + | < | ||
| + | Innovative Computing Laboratory, University of Tennessee | ||
| + | HPL.out | ||
| + | 7 device out (6=stdout, | ||
| + | 1 # of problems sizes (N) | ||
| + | 74145 Ns | ||
| + | 1 # of NBs | ||
| + | 88 NBs | ||
| + | 0 PMAP process mapping (0=Row-, | ||
| + | 1 # of process grids (P x Q) | ||
| + | 4 Ps | ||
| + | 8 Qs | ||
| + | 16.0 | ||
| + | 3 # of panel fact | ||
| + | 0 1 2 PFACTs (0=left, 1=Crout, 2=Right) | ||
| + | 2 # of recursive stopping criterium | ||
| + | 2 4 NBMINs (>= 1) | ||
| + | 1 # of panels in recursion | ||
| + | 2 NDIVs | ||
| + | 3 # of recursive panel fact. | ||
| + | 0 1 2 RFACTs (0=left, 1=Crout, 2=Right) | ||
| + | 1 # of broadcast | ||
| + | 0 BCASTs (0=1rg, | ||
| + | 1 # of lookahead depth | ||
| + | 0 DEPTHs (>=0) | ||
| + | 2 SWAP (0=bin-exch, | ||
| + | 64 | ||
| + | 0 L1 in (0=transposed, | ||
| + | 0 U in (0=transposed, | ||
| + | 1 Equilibration (0=no, | ||
| + | 8 memory alignment in double (> 0) | ||
| + | </ | ||
| + | |||
| + | |||
| + | |||
| + | ===== run script ====== | ||
| + | |||
| + | **=> su delltest3** | ||
| + | |||
| + | < | ||
| + | #!/bin/bash | ||
| + | |||
| + | echo " | ||
| + | |||
| + | P4_GLOBMEMSIZE=10000000 | ||
| + | export P4_GLOBMEMSIZE | ||
| + | |||
| + | date > HPL.start | ||
| + | |||
| + | |||
| + | echo " | ||
| + | echo "/ | ||
| + | / | ||
| + | |||
| + | (/ | ||
| + | / | ||
| + | </ | ||
| + | |||
| + | => runs for 14 1/2 hours ... change these lines and it'll run for 2 days | ||
| + | |||
| + | < | ||
| + | 2 # of NBs | ||
| + | 88 132 NBs | ||
| + | </ | ||
| + | |||
| + | |||
| + | ===== MPIRUN-1.2 ===== | ||
| + | |||
| + | From Sili at Platform ... | ||
| + | |||
| + | < | ||
| + | Actrually, MPICH 1 always has problems in the shared memory control. It really takes time to debug on the buggy shared memory stuff. I would rather suggest using openmpi instead of MPICH 1 to launch Ethernet linpack testings as openmpi is a newer and better MPI implementation than MPICH 1 and it is MPI-2 compatible plus it supports for both ethernet and infiniband devices. | ||
| + | |||
| + | The precedure I just tested is as follows | ||
| + | |||
| + | 1. Compile Openmpi | ||
| + | Here is the procedure I used to recompile openmpi : | ||
| + | # ./configure --prefix=/ | ||
| + | # make | ||
| + | # make install | ||
| + | |||
| + | To test the installation, | ||
| + | # cat /etc/hosts | grep compute | awk ' | ||
| + | |||
| + | Then I recompiled the hello example (the hello_c.c file can be found at the examples directory on the untar' | ||
| + | # / | ||
| + | |||
| + | And tested it : | ||
| + | # / | ||
| + | |||
| + | Please note that I used the complete path to the executables because by default, lam will be picked up. This is also why I used the --prefix option. You may want to use modules to load / unload these environment settings. Please let me know if you would like to have more information about this (open-source) software. | ||
| + | |||
| + | 2. Compile Linpack with Openmpi | ||
| + | |||
| + | # wget http:// | ||
| + | # tar zxf hpl.tgz | ||
| + | # cd hpl | ||
| + | # cp setup/ | ||
| + | edit Make.Linux_PII_CBLAS, | ||
| + | |||
| + | Then you can make linpack by | ||
| + | # make arch=Linux_PII_CBLAS | ||
| + | |||
| + | To test it, edit the HPL.dat accordingly and run by: | ||
| + | # / | ||
| + | |||
| + | </ | ||
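
The email leaves the Make.Linux_PII_CBLAS edits unspecified. A minimal sketch of the usual changes, assuming OpenMPI installed under /opt/openmpi and an ATLAS/CBLAS build under /opt/atlas (both placeholder paths, not the cluster's real ones); letting the mpicc wrapper supply the MPI include and library flags avoids filling in MPdir/MPinc/MPlib by hand:

<code>
# hypothetical Make.Linux_PII_CBLAS edits - paths are placeholders
ARCH   = Linux_PII_CBLAS
TOPdir = $(HOME)/hpl
CC     = /opt/openmpi/bin/mpicc    # OpenMPI compiler wrapper
LINKER = /opt/openmpi/bin/mpicc
MPdir  =
MPinc  =
MPlib  =
LAdir  = /opt/atlas/lib            # BLAS/CBLAS location
LAinc  =
LAlib  = $(LAdir)/libcblas.a $(LAdir)/libatlas.a
</code>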
| + | |||
| + | ===== MPIRUN-1.2 (fixes) ===== | ||
| + | |||
| + | My experience ... | ||
| + | |||
| + | < | ||
| + | |||
| + | source is in / | ||
| + | |||
| + | su delltest3 | ||
| + | |||
| + | export LD_LIBRARY_PATH="/ | ||
| + | add this to ~/.bashrc | ||
| + | |||
| + | cd / | ||
| + | ./configure --prefix / | ||
| + | make | ||
| + | make install | ||
| + | |||
| + | cd ~\\ | ||
| + | / | ||
| + | / | ||
| + | |||
| + | the machines file setup does not like ' | ||
| + | so instead addd 8 lines for each node like this ' | ||
| + | |||
| + | ldd hello | ||
| + | ldd openmpi-1.2/ | ||
| + | |||
| + | test on a single machine | ||
| + | / | ||
| + | / | ||
| + | |||
| + | cd ~ | ||
| + | (for some reason you need to do this for compilation to be successful) | ||
| + | ln -s / | ||
| + | cd hpl | ||
| + | cp ~/ | ||
| + | make arch=Linux_PII_CBLAS | ||
| + | cp bin/ | ||
| + | cp bin/ | ||
| + | |||
| + | cd ~ | ||
| + | / | ||
| + | / | ||
| + | |||
| + | </ | ||
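
Since whatever single-entry-per-node syntax was tried in the machines file did not work (the exact entry is cut off in this copy) and the workaround was 8 lines per node, here is a small sketch of generating such a file. The nfs-2-1..nfs-2-4 names are taken from the heavy weight node list above because this was run as delltest3; adjust the node range to whatever set you actually launch on:

<code>
#!/bin/bash
# build ~/machines with 8 lines per node (one line per core)
: > ~/machines
for node in nfs-2-{1..4}; do
    for i in {1..8}; do
        echo "$node" >> ~/machines
    done
done
</code>

With that file, something like ''mpirun -np 32 -machinefile ~/machines ./xhpl'' matches the HWN grid sizes above (OpenMPI's mpirun accepts ''-machinefile'' as well as ''--hostfile'').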
\\
**[[cluster: