This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision Next revision Both sides next revision | ||
cluster:164 [2017/10/24 19:22] hmeij07 [Bench] |
cluster:164 [2017/10/27 19:30] hmeij07 |
||
---|---|---|---|
Line 94: | Line 94: | ||
==== Bench ==== | ==== Bench ==== | ||
- | * Amber 16. My sample script | + | * Amber 16. Nucleosome bench runs 4.5x faster than on a K20 |
- | * Do not have enough expertise to assess this, need stats from Kelly | + | * Not sure it is representative of our work load |
+ | * Adding more MPI threads decreases performance | ||
+ | * Running across more gpus (2 or 4) decreases performance | ||
+ | * One Amber process per MPI thread per GPU is optimal | ||
+ | |||
+ | **Wow, I just realized the most important metric: Our K20 has a job throughput of 20 per unit of time. The amber128 queue will have a throughput of 4*4.5 or 18 per same unit of time. One new server matches five old ones (well, ones purchased in 2013). From an amber-only perspective.** | ||
+ | |||
+ | < | ||
+ | |||
+ | nvidia-smi -pm 0; nvidia-smi -c 0 | ||
+ | # gpu_id is done via CUDA_VISIBLE_DEVICES | ||
+ | export CUDA_VISIBLE_DEVICES=$STRING_2 | ||
+ | # on n78 | ||
+ | / | ||
+ | -n $STRING_1 $AMBERHOME/ | ||
+ | -p prmtop -c inpcrd -ref inpcrd ; grep ' | ||
+ | # on n34 | ||
+ | / | ||
+ | -np $STRING_1 | ||
+ | |||
+ | |||
+ | Nucleosome Metric ns/day, seconds/ | ||
+ | |||
+ | |||
+ | GTX on n78 | ||
+ | |||
+ | -n 1, -gpu_id 0 | ||
+ | | | ||
+ | -n 2, -gpu_id 0 | ||
+ | | | ||
+ | -n 4, -gpu_id 0 | ||
+ | | | ||
+ | -n 4, -gpu_id 01 | ||
+ | | | ||
+ | -n 8, -gpu_id 01 | ||
+ | | | ||
+ | -n 4, -gpu_id 0123 | ||
+ | | | ||
+ | -n 8, -gpu_id 0123 | ||
+ | | | ||
+ | |||
+ | |||
+ | K20 on n34 | ||
+ | |||
+ | -n 1, -gpu_id 0 | ||
+ | | | ||
+ | -n 4, -gpu_id 0 | ||
+ | | | ||
+ | -n 4, -gpu_id 0123 | ||
+ | | | ||
+ | |||
+ | |||
+ | |||
+ | </ | ||
* Gromacs 5.1.4 My (Colin' | * Gromacs 5.1.4 My (Colin' | ||
Line 145: | Line 198: | ||
</ | </ | ||
- | * Lammps | + | * Lammps |
+ | * used the colloid example, not sure if that's a good example | ||
+ | * like gromacs, lots of room for improvements | ||
+ | * used the double-double binary, | ||
+ | * single-double binary might run faster? | ||
< | < | ||
+ | nvidia-smi -pm 0; nvidia-smi -c 0 | ||
+ | # gpu_id is done via CUDA_VISIBLE_DEVICES | ||
export CUDA_VISIBLE_DEVICES=$STRING_2 | export CUDA_VISIBLE_DEVICES=$STRING_2 | ||
+ | # on n78 | ||
/ | / | ||
/ | / | ||
- | $STRING_3 -in in.colloid > /tmp/out | + | $STRING_3 -in in.colloid > / |
+ | # on n34 | ||
+ | / | ||
+ | -hostfile / | ||
+ | / | ||
+ | -suffix gpu $STRING_3 | ||
+ | |||
Created 5625 atoms | Created 5625 atoms | ||
- | -gpu_id is done via CUDA_VISIBLE_DEVICES | ||
- | lmp_mpi-double-double-with-gpu with -suffix gpu | ||
-n 1, -gpu_id 0 | -n 1, -gpu_id 0 | ||
- | Performance: | + | Performance: |
- | -n 2, -gpu_id | + | -n 2, -gpu_id |
- | Performance: | + | Performance: |
- | -n 4, gpu_id | + | -n 4, -gpu_id |
- | Performance: | + | Performance: |
-n 4, -gpu_id 01, -pk gpu 2 | -n 4, -gpu_id 01, -pk gpu 2 | ||
- | Performance: | + | Performance: |
-n 8, -gpu_id 01, -pk gpu 2 | -n 8, -gpu_id 01, -pk gpu 2 | ||
- | Performance: | + | Performance: |
-n 6, -gpu_id 0123, -pk gpu 4 | -n 6, -gpu_id 0123, -pk gpu 4 | ||
Performance: | Performance: | ||
-n 8, -gpu_id 0123, -pk gpu 4 | -n 8, -gpu_id 0123, -pk gpu 4 | ||
- | Performance: | + | Performance: |
- | -n 10, -gpu_id 0123, -pk gpu 4 | + | |
-n 16, -gpu_id 0123, -pk gpu 4 | -n 16, -gpu_id 0123, -pk gpu 4 | ||
Performance: | Performance: | ||
- | K20 n34 last example | + | K20 on n34 |
- | / | + | |
- | -hostfile / | + | -n 8, -gpu_id 0123, -pk gpu 4 | ||
- | /share/apps/ | + | Performance: |
- | -sf gpu -pk gpu 4 -in in.colloid > /tmp/out ; grep tau /tmp/out | + | |
+ | |||
+ | GTX on n78 again | ||
+ | -n 8, -gpu_id 0123, -pk gpu 4 | ||
+ | |||
+ | Created 22500 atoms | ||
+ | Performance: | ||
+ | Created 90000 atoms | ||
+ | Performance: | ||
</ | </ | ||
Line 488: | Line 560: | ||
exit $? | exit $? | ||
+ | |||
+ | </ | ||
+ | |||
+ | ==== PMMA Bench ==== | ||
+ | |||
+ | < | ||
+ | |||
+ | PMMA Benchmark Performance Metric (x nr of gpus) | ||
+ | |||
+ | |||
+ | GTX on n78 | ||
+ | |||
+ | -n 1, -gpu_id 3 | ||
+ | Performance: | ||
+ | 3, GeForce GTX 1080 Ti, 38, 219 MiB, 10953 MiB, 30 %, 1 % | ||
+ | -n 2, -gpu_id 3 | ||
+ | Performance: | ||
+ | 3, GeForce GTX 1080 Ti, 57, 358 MiB, 10814 MiB, 47 %, 3 % | ||
+ | -n 4, -gpu_id 3 | ||
+ | Performance: | ||
+ | 3, GeForce GTX 1080 Ti, 59, 690 MiB, 10482 MiB, 76 %, 4 % | ||
+ | -n 8, -gpu_id 3 | ||
+ | Performance: | ||
+ | 3, GeForce GTX 1080 Ti, 47, 1332 MiB, 9840 MiB, 90 %, 4 % | ||
+ | -n 4, -gpu_id 01 | ||
+ | Performance: | ||
+ | 0, GeForce GTX 1080 Ti, 48, 350 MiB, 10822 MiB, 50 %, 3 % | ||
+ | 1, GeForce GTX 1080 Ti, 37, 344 MiB, 10828 MiB, 49 %, 3 % | ||
+ | -n 8, -gpu_id 01 | ||
+ | Performance: | ||
+ | 0, GeForce GTX 1080 Ti, 66, 670 MiB, 10502 MiB, 77 %, 4 % | ||
+ | 1, GeForce GTX 1080 Ti, 51, 670 MiB, 10502 MiB, 81 %, 4 % | ||
+ | -n 12, -gpu_id 01 | ||
+ | Performance: | ||
+ | 0, GeForce GTX 1080 Ti, 65, 988 MiB, 10184 MiB, 82 %, 4 % | ||
+ | 1, GeForce GTX 1080 Ti, 50, 990 MiB, 10182 MiB, 85 %, 4 % | ||
+ | -n 8, -gpu_id 0123 | ||
+ | Performance: | ||
+ | 0, GeForce GTX 1080 Ti, 56, 340 MiB, 10832 MiB, 57 %, 3 % | ||
+ | 1, GeForce GTX 1080 Ti, 41, 340 MiB, 10832 MiB, 52 %, 2 % | ||
+ | 2, GeForce GTX 1080 Ti, 43, 340 MiB, 10832 MiB, 57 %, 3 % | ||
+ | 3, GeForce GTX 1080 Ti, 42, 340 MiB, 10832 MiB, 55 %, 2 % | ||
+ | -n 12, -gpu_id 0123 | ||
+ | Performance: | ||
+ | -n 16 | ||
+ | Performance: | ||
+ | |||
+ | |||
+ | |||
+ | # on n34 | ||
+ | unable to get it to run... | ||
+ | |||
+ | K20 on n34 | ||
+ | |||
+ | -n 1, -gpu_id 0 | ||
+ | -n 4, -gpu_id 0 | ||
+ | -n 4, -gpu_id 0123 | ||
+ | |||
</ | </ |