This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
cluster:119 [2013/08/21 15:03] hmeij |
cluster:119 [2021/06/17 19:32] (current) hmeij07 |
||
---|---|---|---|
Line 1: | Line 1: | ||
\\ | \\ | ||
**[[cluster: | **[[cluster: | ||
- | |||
- | Jobs need to be submitted to the scheduler on host sharptail itself for now and will be dispatched to nodes n33-n37 in queue mwgpu. | ||
- | --- // | ||
==== Submitting GPU Jobs ==== | ==== Submitting GPU Jobs ==== | ||
+ | |||
+ | Please leave plenty of time between multiple GPU job submissions. | ||
+ | |||
+ | Jobs need to be submitted to the scheduler via cottontail to queues mwgpu, amber128, exx96. | ||
+ | |||
+ | This page is old, the gpu resource '' | ||
+ | --- // | ||
+ | |||
+ | **Articles** | ||
+ | |||
+ | * [[http:// | ||
+ | * [[http:// | ||
+ | |||
Line 44: | Line 54: | ||
</ | </ | ||
- | With '' | + | With '' |
< | < | ||
Line 58: | Line 68: | ||
3 Tesla K20m 21 C 0 % | 3 Tesla K20m 21 C 0 % | ||
==================================================== | ==================================================== | ||
+ | |||
+ | [hmeij@sharptail sharptail]$ ssh n33 gpu-free | ||
+ | 1,3,0 | ||
+ | |||
+ | |||
</ | </ | ||
Line 116: | Line 131: | ||
#!/bin/bash | #!/bin/bash | ||
# submit via 'bsub < run.gpu' | # submit via 'bsub < run.gpu' | ||
- | rm -f mdout.[0-9]* auout.[0-9]* | + | rm -f mdout.[0-9]* auout.[0-9]* apoa1out.[0-9]* |
#BSUB -e err | #BSUB -e err | ||
#BSUB -o out | #BSUB -o out | ||
Line 122: | Line 137: | ||
#BSUB -J test | #BSUB -J test | ||
- | ## leave sufficient time between job submissions (30-60 secs) | + | # from greentail we need to set up the module env |
- | ## the number of GPUs allocated matches | + | export PATH=/ |
- | ## always reserve GPU (gpu=1), setting this to 0 is a cpu job only | + | / |
- | ## reserve 6144 MB (5 GB + 20%) memory per GPU | + | / |
- | ## run all processes (1< | + | / |
+ | / | ||
+ | / | ||
+ | / | ||
+ | export LD_LIBRARY_PATH=/ | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
- | #BSUB -n 1 | ||
- | #BSUB -R " | ||
- | |||
- | # unique job scratch dirs | ||
- | MYSANSCRATCH=/ | ||
- | MYLOCALSCRATCH=/ | ||
- | export MYSANSCRATCH MYLOCALSCRATCH | ||
- | cd $MYSANSCRATCH | ||
- | |||
- | # AMBER | ||
- | # stage the data | ||
- | cp ~/ | ||
- | # feed the wrapper | ||
- | lava.mvapich2.wrapper pmemd.cuda.MPI \ | ||
- | -O -o mdout.$LSB_JOBID -inf mdinfo.1K10 -x mdcrd.1K10 -r restrt.1K10 -ref inpcrd | ||
- | # save results | ||
- | cp mdout.[0-9]* ~/ | ||
- | |||
- | # LAMMPS | ||
- | # GPUIDX=1 use allocated GPU(s), GPUIDX=0 cpu run only (view header au.inp) | ||
- | export GPUIDX=1 | ||
- | # stage the data | ||
- | cp ~/ | ||
- | # feed the wrapper | ||
- | lava.mvapich2.wrapper lmp_nVidia \ | ||
- | -c off -var GPUIDX $GPUIDX -in au.inp -l auout.$LSB_JOBID | ||
- | # save results | ||
- | cp auout.[0-9]* ~/ | ||
- | |||
- | </ | ||
- | |||
- | |||
- | ==== lava.mvapich2.wrapper ==== | ||
- | |||
- | < | ||
- | |||
- | #!/bin/bash | ||
- | # submit via 'bsub < run.gpu' | ||
- | rm -f mdout.[0-9]* auout.[0-9]* apoa1out.[0-9]* | ||
- | #BSUB -e err | ||
- | #BSUB -o out | ||
- | #BSUB -q mwgpu | ||
- | #BSUB -J test | ||
## leave sufficient time between job submissions (30-60 secs) | ## leave sufficient time between job submissions (30-60 secs) | ||
Line 217: | Line 198: | ||
# save results | # save results | ||
cp apoa1out.$LSB_JOBID ~/ | cp apoa1out.$LSB_JOBID ~/ | ||
+ | |||
+ | |||
+ | </ | ||
+ | |||
+ | |||
+ | ==== gromacs.sub ==== | ||
+ | |||
+ | < | ||
+ | |||
+ | #!/bin/bash | ||
+ | |||
+ | rm -rf gromacs.out gromacs.err \#* *.log | ||
+ | |||
+ | # from greentail we need to recreate module env | ||
+ | export PATH=/ | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | export LD_LIBRARY_PATH=/ | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | |||
+ | #BSUB -o gromacs.out | ||
+ | #BSUB -e gromacs.err | ||
+ | #BSUB -N | ||
+ | #BSUB -J 325monolayer | ||
+ | |||
+ | # read / | ||
+ | . / | ||
+ | export VMDDIR=/ | ||
+ | |||
+ | ## CPU RUN: queue mw256, n<=28, must run on one node (thread_mpi) | ||
+ | ##BSUB -q mw256 | ||
+ | ##BSUB -n 2 | ||
+ | ##BSUB -R " | ||
+ | #export PATH=/ | ||
+ | #. / | ||
+ | #mdrun -nt 2 -s 325topol.tpr -c 325monolayer.gro -e 325ener.edr -o 325traj.trr -x 325traj.xtc | ||
+ | |||
+ | ## GPU RUN: gpu (1-4), queue mwgpu, n (1-4, matches gpu count), must run on one node | ||
+ | ##BSUB -q mwgpu | ||
+ | ##BSUB -n 1 | ||
+ | ##BSUB -R " | ||
+ | ## signal GMXRC is a gpu run with: 1=thread_mpi | ||
+ | #export GMXRC=1 | ||
+ | #export PATH=/ | ||
+ | #. / | ||
+ | # | ||
+ | # | ||
+ | |||
+ | # GPU RUN: gpu (1-4), queue mwgpu, n (1-4, matches gpu count), must run on one node | ||
+ | #BSUB -q mwgpu | ||
+ | #BSUB -n 1 | ||
+ | #BSUB -R " | ||
+ | # signal GMXRC is a gpu run with: 2=mvapich2 | ||
+ | export GMXRC=2 | ||
+ | export PATH=/ | ||
+ | . / | ||
+ | lava.mvapich2.wrapper mdrun_mpi \ | ||
+ | -testverlet -s 325topol.tpr -c 325monolayer.gro -e 325ener.edr -o 325traj.trr -x 325traj.xtc | ||
+ | |||
+ | |||
+ | </ | ||
+ | |||
+ | ==== matlab.sub ==== | ||
+ | |||
+ | < | ||
+ | |||
+ | #!/bin/bash | ||
+ | |||
+ | rm -rf out err *.out | ||
+ | |||
+ | # from greentail we need to recreate module env | ||
+ | export PATH=/ | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | export PATH=/ | ||
+ | export LD_LIBRARY_PATH=/ | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | |||
+ | #BSUB -o out | ||
+ | #BSUB -e err | ||
+ | #BSUB -N | ||
+ | #BSUB -J test | ||
+ | |||
+ | # GPU RUN: (1-4), queue mwgpu, n (1-4, matches gpu count), must run on one node | ||
+ | #BSUB -q mwgpu | ||
+ | #BSUB -n 1 | ||
+ | #BSUB -R " | ||
+ | # signal MATGPU is a gpu run | ||
+ | export MATGPU=1 | ||
+ | lava.mvapich2.wrapper matlab -nodisplay | ||
+ | |||
+ | |||
+ | </ | ||
+ | |||
+ | ==== lava.mvapich2.wrapper ==== | ||
+ | |||
+ | < | ||
+ | |||
+ | #!/bin/sh | ||
+ | |||
+ | # This is a copy of lava.openmpi.wrapper which came with lava OCS kit | ||
+ | # Trying to make it work with mvapich2 | ||
+ | # -hmeij 13aug2013 | ||
+ | |||
+ | # | ||
+ | # Copyright (c) 2007 Platform Computing | ||
+ | # | ||
+ | # This script is a wrapper for openmpi mpirun | ||
+ | # it generates the machine file based on the hosts | ||
+ | # given to it by Lava. | ||
+ | # | ||
+ | |||
+ | # RLIMIT_MEMLOCK problem with libibverbs -hmeij | ||
+ | ulimit -l unlimited | ||
+ | |||
+ | |||
+ | usage() { | ||
+ | cat << | ||
+ | USAGE: | ||
+ | This command is a wrapper for mpirun (openmpi). | ||
+ | only be run within Lava using bsub e.g. | ||
+ | bsub -n # "$0 -np # {my mpi command and args}" | ||
+ | |||
+ | The wrapper will automatically generate the | ||
+ | machinefile used by mpirun. | ||
+ | |||
+ | NOTE: The list of hosts cannot exceed 4KBytes. | ||
+ | USEEOF | ||
+ | } | ||
+ | |||
+ | if [ x" | ||
+ | usage | ||
+ | exit -1 | ||
+ | fi | ||
+ | |||
+ | MYARGS=$* | ||
+ | WORKDIR=`dirname ${LSB_JOBFILENAME}` | ||
+ | MACHFILE=${WORKDIR}/ | ||
+ | ARGLIST=${WORKDIR}/ | ||
+ | |||
+ | # Check if mpirun is in the PATH -hmeij | ||
+ | T=`which --skip-alias mpirun_rsh` | ||
+ | #T=`which mpirun_rsh` | ||
+ | if [ $? -ne 0 ]; then | ||
+ | echo " | ||
+ | exit -2 | ||
+ | fi | ||
+ | |||
+ | echo " | ||
+ | #T=`grep -- -machinefile ${ARGLIST} |wc -l` | ||
+ | T=`grep -- -hostfile ${ARGLIST} |wc -l` | ||
+ | if [ $T -gt 0 ]; then | ||
+ | echo " | ||
+ | echo " | ||
+ | exit -3 | ||
+ | fi | ||
+ | |||
+ | # Make the open-mpi machine file | ||
+ | echo " | ||
+ | tr '\/ ' ' | ||
+ | |||
+ | MPIRUN=`which --skip-alias mpirun_rsh` | ||
+ | # | ||
+ | #echo " | ||
+ | |||
+ | # sanity checks number of processes 1-4 | ||
+ | np=`wc -l ${MACHFILE} | awk ' | ||
+ | if [ $np -lt 1 -o $np -gt 4 ]; then | ||
+ | echo " | ||
+ | echo " | ||
+ | exit -4 | ||
+ | fi | ||
+ | |||
+ | # sanity check single node | ||
+ | nh=`cat ${MACHFILE} | sort -u | wc -l` | ||
+ | if [ $nh -ne 1 ]; then | ||
+ | echo " | ||
+ | exit -5 | ||
+ | fi | ||
+ | |||
+ | # one host, one to four gpus | ||
+ | gpunp=`cat ${MACHFILE} | wc -l | awk ' | ||
+ | gpuhost=`cat ${MACHFILE} | sort -u | tr -d ' | ||
+ | gpuid=( $(for i in `ssh $gpuhost gpu-free | sed "s/,/ /g"`; do echo $i; done | shuf | head -$gpunp) ) | ||
+ | if [ $gpunp -eq 1 ]; then | ||
+ | CUDA_VISIBLE_DEVICES=$gpuid | ||
+ | echo "GPU allocation instance $gpuhost: | ||
+ | else | ||
+ | gpuids=`echo ${gpuid[@]} | sed "s/ /,/g"` | ||
+ | CUDA_VISIBLE_DEVICES=" | ||
+ | echo "GPU allocation instance $gpuhost: | ||
+ | fi | ||
+ | # namd ignores this | ||
+ | export CUDA_VISIBLE_DEVICES | ||
+ | #debug# setid=`ssh $gpuhost echo $CUDA_VISIBLE_DEVICES | tr ' | ||
+ | #debug# echo " | ||
+ | |||
+ | |||
+ | if [ -n " | ||
+ | # gromacs needs them from base 0, so gpu 2,3 is string 01 | ||
+ | if [ ${# | ||
+ | gmxrc_gpus=" | ||
+ | elif [ ${# | ||
+ | gmxrc_gpus=" | ||
+ | elif [ ${# | ||
+ | gmxrc_gpus=" | ||
+ | elif [ ${# | ||
+ | gmxrc_gpus=" | ||
+ | fi | ||
+ | |||
+ | if [ $GMXRC -eq 1 ]; then | ||
+ | newargs=`echo ${MYARGS} | sed " | ||
+ | echo " | ||
+ | $newargs | ||
+ | elif [ $GMXRC -eq 2 ]; then | ||
+ | newargs=`echo ${MYARGS} | sed " | ||
+ | echo " | ||
+ | ${MPIRUN} -ssh -hostfile ${MACHFILE} -np $gpunp $newargs | ||
+ | fi | ||
+ | |||
+ | elif [ -n " | ||
+ | echo " | ||
+ | ${MYARGS} | ||
+ | elif [ -n " | ||
+ | cat ${MACHFILE}.lst | tr '\/ ' ' | ||
+ | echo " | ||
+ | charmrun $NAMD_DIR/ | ||
+ | else | ||
+ | echo " | ||
+ | ${MPIRUN} -ssh -hostfile ${MACHFILE} -np $gpunp ${MYARGS} | ||
+ | fi | ||
+ | |||
+ | exit $? | ||
+ | |||
+ | |||
+ | </ | ||
+ | |||
+ | |||
+ | ===== elim code ===== | ||
+ | |||
+ | < | ||
+ | |||
+ | # | ||
+ | |||
+ | while (1) { | ||
+ | |||
+ | $gpu = 0; | ||
+ | $log = ''; | ||
+ | if (-e "/ | ||
+ | $tmp = `/ | ||
+ | @tmp = split(/ | ||
+ | foreach $i (0..$#tmp) { | ||
+ | ($a, | ||
+ | if ( $f == 0 ) { $gpu = $gpu + 1; } | ||
+ | #print "$a $f $gpu\n"; | ||
+ | $log .= " | ||
+ | } | ||
+ | } | ||
+ | # nr_of_args name1 value1 | ||
+ | $string = "1 gpu $gpu"; | ||
+ | |||
+ | $h = `hostname`; chop($h); | ||
+ | $d = `date +%m/ | ||
+ | foreach $i (' | ||
+ | if ( " | ||
+ | `echo " | ||
+ | } | ||
+ | } | ||
+ | |||
+ | # you need the \n to flush -hmeij | ||
+ | # you also need the space before the line feed -hmeij | ||
+ | print " | ||
+ | # or use | ||
+ | # | ||
+ | |||
+ | # smaller than specified in lsf.shared | ||
+ | sleep 10; | ||
+ | |||
+ | } | ||
+ | |||
</ | </ |