==== Bench ====
  
  * Amber 16. Nucleosome bench runs 4.5x faster than on a K20
    * Not sure it is representative of our workload
    * Adding more MPI threads decreases performance
    * Running across more GPUs (2 or 4) decreases performance
    * One Amber process per MPI thread per GPU is optimal

**Wow, I just realized the most important metric: our K20 pool has a job throughput of 20 per unit of time. The amber128 queue will have a throughput of 4 x 4.5, or 18, per the same unit of time. One new server matches five old ones (well, those were purchased in 2013). From an Amber-only perspective.**
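A quick sanity check of that throughput arithmetic (assuming one job per GPU and 20 K20 cards across the old nodes versus the 4 GTX cards in the new box; the awk lines are just illustrative, not part of the bench scripts):

<code>
# K20 pool: 20 gpus x 1.0 relative speed = 20 jobs per unit of time
awk 'BEGIN { printf "k20 throughput:      %.0f\n", 20 * 1.0 }'
# amber128 queue: 4 gpus x 4.5 relative speed = 18 jobs per unit of time
awk 'BEGIN { printf "amber128 throughput: %.0f\n", 4 * 4.5 }'
# so one new 4-gpu server roughly matches five old 4-gpu servers
</code>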

<code>

nvidia-smi -pm 0; nvidia-smi -c 0
# gpu_id is done via CUDA_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=$STRING_2
# on n78
/usr/local/mpich-3.1.4/bin/mpirun -launcher ssh -f /home/hmeij/amber/nucleosome/hostfile \
-n $STRING_1 $AMBERHOME/bin/pmemd.cuda.MPI -O -o /tmp/mdout -i mdin.GPU \
-p prmtop -c inpcrd -ref inpcrd ; grep 'ns/day' /tmp/mdout
# on n34
/cm/shared/apps/mvapich2/gcc/64/1.6/bin/mpirun_rsh -ssh -hostfile /home/hmeij/amber/nucleosome/hostfile2 \
-np $STRING_1 pmemd.cuda.MPI -O -o /tmp/mdout -i mdin.GPU -p prmtop -c inpcrd -ref inpcrd; grep 'ns/day' /tmp/mdout


Nucleosome metric: ns/day and seconds/ns across all steps, times the number of GPUs


GTX on n78

-n 1, -gpu_id 0
|         ns/day =      12.24   seconds/ns =    7058.94   x4 = 48.96  (4.5x faster than K20)
-n 2, -gpu_id 0
|         ns/day =      11.50   seconds/ns =    7509.97
-n 4, -gpu_id 0
|         ns/day =      10.54   seconds/ns =    8197.80
-n 4, -gpu_id 01
|         ns/day =      20.70   seconds/ns =    4173.55   x2 = 41.40
-n 8, -gpu_id 01
|         ns/day =      17.44   seconds/ns =    4953.04
-n 4, -gpu_id 0123
|         ns/day =      32.90   seconds/ns =    2626.27   x1
-n 8, -gpu_id 0123
|         ns/day =      28.43   seconds/ns =    3038.72   x1


K20 on n34

-n 1, -gpu_id 0
|             ns/day =       2.71   seconds/ns =   31883.03
-n 4, -gpu_id 0
|             ns/day =       1.53   seconds/ns =   56325.00
-n 4, -gpu_id 0123
|             ns/day =       5.87   seconds/ns =   14730.45


</code>
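Since one Amber process per GPU is optimal, throughput mode on n78 would be four independent single-GPU runs rather than one multi-GPU job. A minimal sketch, assuming the serial pmemd.cuda binary and reusing the nucleosome inputs above (the per-GPU /tmp/mdout.$gpu output names are made up for illustration):

<code>
# one serial pmemd.cuda per GTX card, pinned via CUDA_VISIBLE_DEVICES
for gpu in 0 1 2 3; do
  CUDA_VISIBLE_DEVICES=$gpu $AMBERHOME/bin/pmemd.cuda -O \
    -i mdin.GPU -p prmtop -c inpcrd -ref inpcrd -o /tmp/mdout.$gpu &
done
wait
grep 'ns/day' /tmp/mdout.*
</code>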
  
  * Gromacs 5.1.4. My (Colin's) multidir bench runs about 2x faster than on a K20
</code>
  
  * Lammps 11Aug17 runs about 11x faster than a K20
    * used the colloid example, not sure if that's a good example
    * like Gromacs, lots of room for improvements
    * used the double-double binary, surprised at the speed
      * the single-double binary might run faster?
  
<code>
  
nvidia-smi -pm 0; nvidia-smi -c 0
# gpu_id is done via CUDA_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=$STRING_2
# on n78
/usr/local/mpich-3.1.4/bin/mpirun -launcher ssh -f ./hostfile -n $STRING_1 \
/usr/local/lammps-11Aug17/lmp_mpi-double-double-with-gpu -suffix gpu \
$STRING_3 -in in.colloid > /tmp/out ; grep tau /tmp/out
# on n34
/cm/shared/apps/mvapich2/gcc/64/1.6/bin/mpirun_rsh -ssh \
-hostfile /home/hmeij/sharptail/hostfile2 -np $STRING_1 \
/share/apps/CENTOS6/lammps/31Mar17/lmp_gpu_double \
-suffix gpu $STRING_3 -in in.colloid > /tmp/out ; grep tau /tmp/out


Created 5625 atoms

-n 1, -gpu_id 0
Performance: 581,359 tau/day, 1,345 timesteps/s
-n 2, -gpu_id 01
Performance: 621,822 tau/day, 1,439 timesteps/s
-n 4, -gpu_id 0123
Performance: 479,795 tau/day, 1,110 timesteps/s

-n 4, -gpu_id 01, -pk gpu 2
Performance: 819,207 tau/day, 1,896 timesteps/s
-n 8, -gpu_id 01, -pk gpu 2
Performance: 519,173 tau/day, 1,201 timesteps/s
-n 6, -gpu_id 0123, -pk gpu 4
Performance: 881,981 tau/day, 2,041 timesteps/s
-n 8, -gpu_id 0123, -pk gpu 4
Performance: 932,493 tau/day, 2,158 timesteps/s  (11x K20)

-n 16, -gpu_id 0123, -pk gpu 4
Performance: 582,717 tau/day, 1,348 timesteps/s


K20 on n34

-n 8, -gpu_id 0123, -pk gpu 4
Performance: 84,985 tau/day, 196 timesteps/s


GTX on n78 again
-n 8, -gpu_id 0123, -pk gpu 4

Created 22500 atoms
Performance: 552,986 tau/day, 1,280 timesteps/s
Created 90000 atoms
Performance: 210,864 tau/day, 488 timesteps/s

</code>
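The best -n / -pk gpu combination above was found by hand; a small loop could automate that sweep. A sketch assuming the n78 paths and input from the block above, with explicit -pk gpu values substituted for $STRING_3 and all four cards exposed (the same loop could be pointed at a single-double build, if one gets compiled, to answer the question above):

<code>
# sweep MPI rank counts and gpus-per-run for the colloid bench on n78
export CUDA_VISIBLE_DEVICES=0,1,2,3
for np in 4 6 8 16; do
  for pk in 2 4; do
    /usr/local/mpich-3.1.4/bin/mpirun -launcher ssh -f ./hostfile -n $np \
      /usr/local/lammps-11Aug17/lmp_mpi-double-double-with-gpu \
      -suffix gpu -pk gpu $pk -in in.colloid > /tmp/out.$np.$pk
    echo "-n $np, -pk gpu $pk: $(grep tau /tmp/out.$np.$pk)"
  done
done
</code>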