cluster:223
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| cluster:223 [2023/09/07 12:45] – hmeij07 | cluster:223 [2023/09/18 20:56] (current) – hmeij07 | ||
|---|---|---|---|
| Line 47: | Line 47: | ||
| # update link to this version: yes | # update link to this version: yes | ||
| - | # no -silent... | + | # no -silent |
| =========== | =========== | ||
| Line 116: | Line 116: | ||
| - | ==== Test ==== | + | |
| + | ==== Testing | ||
| + | |||
| Script ~hmeij/ | Script ~hmeij/ | ||
| Line 125: | Line 128: | ||
| * #SBATCH --mem-per-gpu=7168 | * #SBATCH --mem-per-gpu=7168 | ||
| - | For some reason this yields cpus=8 which is different behavior (expected cpu=1). Slurm is overriding the above settings with partition setting of DefCpuPerGPU=8. Slurm has not changed but cuda version has. Odd. | + | For some reason this yields cpus=8 which is different behavior (expected cpu=1). Slurm is overriding the above settings with partition setting of DefCpuPerGPU=8. Slurm has not changed but cuda version has. Odd. Good news is Amber runs fine, no need to recompile. |
| < | < | ||
| Line 147: | Line 150: | ||
| </ | </ | ||
| + | |||
| + | * #SBATCH --cpus-per-gpu=1 | ||
| + | |||
| + | Adding this does force Slurm to allocate just a single cpu. Now try 4 gpu jobs per node. No need for CUDA_VISIBLE_DEVICES setting. | ||
| + | |||
| + | < | ||
| + | |||
| + | JOBID | ||
| + | 1053992 mwgpu | ||
| + | |||
| + | [hmeij@cottontail2 slurm]$ for i in `seq 1 6`; do sbatch run.centos; sleep 30; squeue | grep hmeij; done | ||
| + | |||
| + | # output | ||
| + | Submitted batch job 1054000 | ||
| + | 1054000 mwgpu | ||
| + | Submitted batch job 1054001 | ||
| + | 1054001 mwgpu | ||
| + | 1054000 mwgpu | ||
| + | Submitted batch job 1054002 | ||
| + | 1054002 mwgpu | ||
| + | 1054001 mwgpu | ||
| + | 1054000 mwgpu | ||
| + | Submitted batch job 1054003 | ||
| + | 1054003 mwgpu | ||
| + | 1054002 mwgpu | ||
| + | 1054001 mwgpu | ||
| + | 1054000 mwgpu | ||
| + | Submitted batch job 1054004 | ||
| + | 1054004 mwgpu | ||
| + | 1054003 mwgpu | ||
| + | 1054002 mwgpu | ||
| + | 1054001 mwgpu | ||
| + | 1054000 mwgpu | ||
| + | Submitted batch job 1054005 | ||
| + | 1054005 mwgpu | ||
| + | 1054004 mwgpu | ||
| + | 1054003 mwgpu | ||
| + | 1054002 mwgpu | ||
| + | 1054001 mwgpu | ||
| + | 1054000 mwgpu | ||
| + | |||
| + | |||
| + | [hmeij@cottontail2 slurm]$ ssh n33 gpu-info | ||
| + | id, | ||
| + | 0, Tesla K20m, 40, 95 MiB, 4648 MiB, 100 %, 25 % | ||
| + | 1, Tesla K20m, 40, 95 MiB, 4648 MiB, 94 %, 23 % | ||
| + | 2, Tesla K20m, 35, 95 MiB, 4648 MiB, 93 %, 21 % | ||
| + | 3, Tesla K20m, 28, 95 MiB, 4648 MiB, 97 %, 25 % | ||
| + | |||
| + | </ | ||
| + | |||
+ | Other software does need to be recompiled as it links to specific versions of libraries rather than the generic libName.so (lammps). | ||
| + | |||
| + | Script ~hmeij/ | ||
| + | |||
| + | < | ||
| + | |||
| + | / | ||
| + | / | ||
| + | / | ||
| + | / | ||
| + | / | ||
| + | linux-vdso.so.1 => (0x00007ffd714ec000) | ||
| + | libjpeg.so.62 => / | ||
| + | libcudart.so.11.0 => / | ||
| + | libcuda.so.1 => / | ||
| + | libcufft.so.10 => / | ||
| + | libdl.so.2 => / | ||
| + | libmpi.so.40 => / | ||
| + | libstdc++.so.6 => / | ||
| + | libm.so.6 => / | ||
| + | libgcc_s.so.1 => / | ||
| + | libpthread.so.0 => / | ||
| + | libc.so.6 => / | ||
| + | / | ||
| + | librt.so.1 => / | ||
| + | libopen-rte.so.40 => / | ||
| + | libopen-pal.so.40 => / | ||
| + | libutil.so.1 => / | ||
| + | libz.so.1 => / | ||
| + | |||
| + | Large-scale Atomic/ | ||
| + | |||
| + | Usage example: lmp_mpi-cuda-single-single -var t 300 -echo screen -in in.alloy | ||
| + | |||
| + | List of command line options supported by this LAMMPS executable: | ||
| + | < | ||
| + | |||
| + | # hmmm, using -suffix gpu it does not jump on gpus, generic non-gpu libthread error | ||
| + | # same version rocky8/ | ||
| + | # try " | ||
| + | # libspace tarball download fails on file hash and | ||
| + | # yields a status: [1;" | ||
| + | |||
+ | # without ML-SPACE hash fails for opencl-loader third party, bad url | ||
| + | # https:// | ||
| + | # then extract in _deps/ dir | ||
| + | # and added -D GPU_LIBRARY=../ | ||
| + | # that works, cmake compile binary jumps on multiple gpus | ||
| + | |||
| + | |||
| + | [hmeij@n35 sharptail]$ mpirun -n 2 \ | ||
| + | / | ||
| + | -suffix gpu -in in.colloid | ||
| + | |||
| + | [root@greentail52 ~]# ssh n35 gpu-process | ||
| + | gpu_name, gpu_id, pid, process_name | ||
| + | Tesla K20m, 0, 9911, / | ||
| + | Tesla K20m, 1, 9912, / | ||
| + | |||
| + | # some stats, colloid example | ||
| + | |||
| + | 1 cpu, 1 gpu | ||
| + | Total wall time: 0:05:49 | ||
| + | 2 cpus, 2 gpus | ||
| + | Total wall time: 0:03:58 | ||
| + | 4 cpus, 4 gpus | ||
| + | Total wall time: 0:02:23 | ||
| + | 8 cpus, 4 gpus | ||
| + | Total wall time: 0:02:23 | ||
| + | |||
| + | # but the ML-PACE hash error is different, so no go there | ||
| + | |||
| + | </ | ||
| + | |||
| \\ | \\ | ||
| **[[cluster: | **[[cluster: | ||
cluster/223.1694090708.txt.gz · Last modified: by hmeij07
