This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision Next revision Both sides next revision | ||
cluster:164 [2017/10/24 19:22] hmeij07 [Bench] |
cluster:164 [2017/10/27 19:30] hmeij07 |
||
---|---|---|---|
Line 94: | Line 94: | ||
==== Bench ==== | ==== Bench ==== | ||
- | * Amber 16. My sample script | + | * Amber 16. Nucleosome bench runs 4.5x faster than on a K20 |
- | * Do not have enough expertise to assess this, need stats from Kelly | + | * Not sure it is representative of our work load |
+ | * Adding more MPI threads decreases performance | ||
+ | * Running across more gpus (2 or 4) decreases performance | ||
+ | * One Amber process per MPI thread per GPU is optimal | ||
+ | |||
+ | **Wow, I just realized the most important metric: Our K20 has a job throughput of 20 per unit of time. The amber128 queue will have a throughput of 4*4.5 or 18 per same unit of time. One new server matches five old ones (well, ones purchased in 2013). From an amber-only perspective.** | ||
+ | |||
+ | < | ||
+ | |||
+ | nvidia-smi -pm 0; nvidia-smi -c 0 | ||
+ | # gpu_id is done via CUDA_VISIBLE_DEVICES | ||
+ | export CUDA_VISIBLE_DEVICES=$STRING_2 | ||
+ | # on n78 | ||
+ | / | ||
+ | -n $STRING_1 $AMBERHOME/ | ||
+ | -p prmtop -c inpcrd -ref inpcrd ; grep ' | ||
+ | # on n34 | ||
+ | / | ||
+ | -np $STRING_1 | ||
+ | |||
+ | |||
+ | Nucleosome Metric ns/day, seconds/ | ||
+ | |||
+ | |||
+ | GTX on n78 | ||
+ | |||
+ | -n 1, -gpu_id 0 | ||
+ | | | ||
+ | -n 2, -gpu_id 0 | ||
+ | | | ||
+ | -n 4, -gpu_id 0 | ||
+ | | | ||
+ | -n 4, -gpu_id 01 | ||
+ | | | ||
+ | -n 8, -gpu_id 01 | ||
+ | | | ||
+ | -n 4, -gpu_id 0123 | ||
+ | | | ||
+ | -n 8, -gpu_id 0123 | ||
+ | | | ||
+ | |||
+ | |||
+ | K20 on n34 | ||
+ | |||
+ | -n 1, -gpu_id 0 | ||
+ | | | ||
+ | -n 4, -gpu_id 0 | ||
+ | | | ||
+ | -n 4, -gpu_id 0123 | ||
+ | | | ||
+ | |||
+ | |||
+ | |||
+ | </ | ||
* Gromacs 5.1.4 My (Colin' | * Gromacs 5.1.4 My (Colin' | ||
Line 145: | Line 198: | ||
</ | </ | ||
- | * Lammps | + | * Lammps |
+ | * used the colloid example, not sure if that's a good example | ||
+ | * like gromacs, lots of room for improvements | ||
+ | * used the double-double binary, | ||
+ | * single-double binary might run faster? | ||
< | < | ||
+ | nvidia-smi -pm 0; nvidia-smi -c 0 | ||
+ | # gpu_id is done via CUDA_VISIBLE_DEVICES | ||
export CUDA_VISIBLE_DEVICES=$STRING_2 | export CUDA_VISIBLE_DEVICES=$STRING_2 | ||
+ | # on n78 | ||
/ | / | ||
/ | / | ||
- | $STRING_3 -in in.colloid > /tmp/out | + | $STRING_3 -in in.colloid > / |
+ | # on n34 | ||
+ | / | ||
+ | -hostfile / | ||
+ | / | ||
+ | -suffix gpu $STRING_3 | ||
+ | |||
Created 5625 atoms | Created 5625 atoms | ||
- | -gpu_id is done via CUDA_VISIBLE_DEVICES | ||
- | lmp_mpi-double-double-with-gpu with -suffix gpu | ||
-n 1, -gpu_id 0 | -n 1, -gpu_id 0 | ||
- | Performance: | + | Performance: |
- | -n 2, -gpu_id | + | -n 2, -gpu_id |
- | Performance: | + | Performance: |
- | -n 4, gpu_id | + | -n 4, -gpu_id |
- | Performance: | + | Performance: |
-n 4, -gpu_id 01, -pk gpu 2 | -n 4, -gpu_id 01, -pk gpu 2 | ||
- | Performance: | + | Performance: |
-n 8, -gpu_id 01, -pk gpu 2 | -n 8, -gpu_id 01, -pk gpu 2 | ||
- | Performance: | + | Performance: |
-n 6, -gpu_id 0123, -pk gpu 4 | -n 6, -gpu_id 0123, -pk gpu 4 | ||
Performance: | Performance: | ||
-n 8, -gpu_id 0123, -pk gpu 4 | -n 8, -gpu_id 0123, -pk gpu 4 | ||
- | Performance: | + | Performance: |
- | -n 10, -gpu_id 0123, -pk gpu 4 | + | |
-n 16, -gpu_id 0123, -pk gpu 4 | -n 16, -gpu_id 0123, -pk gpu 4 | ||
Performance: | Performance: | ||
- | K20 n34 last example | + | K20 on n34 |
- | / | + | |
- | -hostfile / | + | -n 8, -gpu_id 0123, -pk gpu 4 | ||
- | /share/apps/ | + | Performance: |
- | -sf gpu -pk gpu 4 -in in.colloid > /tmp/out ; grep tau /tmp/out | + | |
+ | |||
+ | GTX on n78 again | ||
+ | -n 8, -gpu_id 0123, -pk gpu 4 | ||
+ | |||
+ | Created 22500 atoms | ||
+ | Performance: | ||
+ | Created 90000 atoms | ||
+ | Performance: | ||
</ | </ | ||
Line 488: | Line 560: | ||
exit $? | exit $? | ||
+ | |||
+ | </ | ||
+ | |||
+ | ==== PMMA Bench ==== | ||
+ | |||
+ | < | ||
+ | |||
+ | PMMA Benchmark Performance Metric (x nr of gpus) | ||
+ | |||
+ | |||
+ | GTX on n78 | ||
+ | |||
+ | -n 1, -gpu_id 3 | ||
+ | Performance: | ||
+ | 3, GeForce GTX 1080 Ti, 38, 219 MiB, 10953 MiB, 30 %, 1 % | ||
+ | -n 2, -gpu_id 3 | ||
+ | Performance: | ||
+ | 3, GeForce GTX 1080 Ti, 57, 358 MiB, 10814 MiB, 47 %, 3 % | ||
+ | -n 4, -gpu_id 3 | ||
+ | Performance: | ||
+ | 3, GeForce GTX 1080 Ti, 59, 690 MiB, 10482 MiB, 76 %, 4 % | ||
+ | -n 8, -gpu_id 3 | ||
+ | Performance: | ||
+ | 3, GeForce GTX 1080 Ti, 47, 1332 MiB, 9840 MiB, 90 %, 4 % | ||
+ | -n 4, -gpu_id 01 | ||
+ | Performance: | ||
+ | 0, GeForce GTX 1080 Ti, 48, 350 MiB, 10822 MiB, 50 %, 3 % | ||
+ | 1, GeForce GTX 1080 Ti, 37, 344 MiB, 10828 MiB, 49 %, 3 % | ||
+ | -n 8, -gpu_id 01 | ||
+ | Performance: | ||
+ | 0, GeForce GTX 1080 Ti, 66, 670 MiB, 10502 MiB, 77 %, 4 % | ||
+ | 1, GeForce GTX 1080 Ti, 51, 670 MiB, 10502 MiB, 81 %, 4 % | ||
+ | -n 12, -gpu_id 01 | ||
+ | Performance: | ||
+ | 0, GeForce GTX 1080 Ti, 65, 988 MiB, 10184 MiB, 82 %, 4 % | ||
+ | 1, GeForce GTX 1080 Ti, 50, 990 MiB, 10182 MiB, 85 %, 4 % | ||
+ | -n 8, -gpu_id 0123 | ||
+ | Performance: | ||
+ | 0, GeForce GTX 1080 Ti, 56, 340 MiB, 10832 MiB, 57 %, 3 % | ||
+ | 1, GeForce GTX 1080 Ti, 41, 340 MiB, 10832 MiB, 52 %, 2 % | ||
+ | 2, GeForce GTX 1080 Ti, 43, 340 MiB, 10832 MiB, 57 %, 3 % | ||
+ | 3, GeForce GTX 1080 Ti, 42, 340 MiB, 10832 MiB, 55 %, 2 % | ||
+ | -n 12, -gpu_id 0123 | ||
+ | Performance: | ||
+ | -n 16 | ||
+ | Performance: | ||
+ | |||
+ | |||
+ | |||
+ | # on n34 | ||
+ | unable to get it to run... | ||
+ | |||
+ | K20 on n34 | ||
+ | |||
+ | -n 1, -gpu_id 0 | ||
+ | -n 4, -gpu_id 0 | ||
+ | -n 4, -gpu_id 0123 | ||
+ | |||
</ | </ |