| 110 | |
| 111 | === Stampede Example Single Node PBS Script === |
| 112 | {{{ |
| 113 | #!/bin/bash |
| 114 | # Single-node job: runs one CommandLineInversionRunner inversion under the #SBATCH limits below. |
| 115 | #SBATCH -t 00:360:00 |
| 116 | #SBATCH -n 16 |
| 117 | #SBATCH -p normal |
| 118 | # Launch the inversion; the script then exits with the JVM's exit status. |
| 119 | /home1/00950/kevinm/java/default/bin/java -Djava.awt.headless=true -Xmx25000M -Xms25000M -cp /work/00950/kevinm/ucerf3/inversion/2013_05_03-ucerf3p3-production-first-five/OpenSHA_complete.jar:/work/00950/kevinm/ucerf3/inversion/parallelcolt-0.9.4.jar:/work/00950/kevinm/ucerf3/inversion/commons-cli-1.2.jar:/work/00950/kevinm/ucerf3/inversion/csparsej.jar scratch.UCERF3.inversion.CommandLineInversionRunner --completion-time 5h --sub-completion 1s --cool FAST_SA --nonneg LIMIT_ZERO_RATES --num-threads 5 --branch-prefix FM3_1_ABM_EllB_DsrTap_CharConst_M5Rate6.5_MMaxOff7.3_NoFix_SpatSeisU2_run0 --directory /work/00950/kevinm/ucerf3/inversion/2013_05_03-ucerf3p3-production-first-five --no-plots |
| 120 | exit $? |
| 121 | }}} |
| 122 | |
| 123 | == Bundled Large MPI Jobs == |
| 124 | Some schedulers give preference to single large jobs over many small jobs. They may also have limits on the total number of jobs that can be submitted. You can get around this limitation by submitting a single MPI job that runs many inversions on many nodes. This has the added benefit of allowing you to run multiple inversions per node if enough processors/memory is available. UCERF3 production runs were run with this method on the Stampede supercomputer with 3 inversions per node. |
| 125 | |
| 126 | To use this method, your PBS script must call scratch.UCERF3.simulatedAnnealing.hpc.MPJInversionDistributor. You must also supply the "--exact-dispatch X" argument (where X is the number of threads per node). The total number of inversions must not exceed X*NODES, so with 3 threads per node and 256 nodes, you can submit at most 768 inversions. You must also supply an XML file argument, which is described below. Additionally, you must download and install FastMPJ in your user account, as this library is required. |
| 127 | |
| 128 | === Stampede Batch PBS Script === |
| 129 | {{{ |
| 130 | #!/bin/bash |
| 131 | # Bundled MPI job: launches MPJInversionDistributor through fmpjrun on the allocated hosts. |
| 132 | #SBATCH -t 00:420:00 |
| 133 | #SBATCH -n 2048 |
| 134 | #SBATCH -p normal |
| 135 | # Expand the SLURM node list into a machine file that fmpjrun reads via -machinefile. |
| 136 | PBS_NODEFILE="/tmp/${USER}-hostfile-${SLURM_JOBID}" |
| 137 | echo "creating PBS_NODEFILE: $PBS_NODEFILE" |
| 138 | scontrol show hostnames "$SLURM_NODELIST" > "$PBS_NODEFILE" |
| 139 | |
| 140 | export FMPJ_HOME=/home1/00950/kevinm/FastMPJ |
| 141 | export PATH="$PATH:$FMPJ_HOME/bin" |
| 142 | |
| 143 | if [[ -e "$PBS_NODEFILE" ]]; then |
| 144 | # count the machine-file entries; this value is passed to fmpjrun -np |
| 145 | NP=$(wc -l < "$PBS_NODEFILE") |
| 146 | echo "Running on $NP processors: "$(cat "$PBS_NODEFILE") |
| 147 | else |
| 148 | echo "Node file $PBS_NODEFILE was not created; this script must be submitted to SLURM with 'sbatch'" |
| 149 | exit 1 |
| 150 | fi |
| 151 | |
| 152 | if [[ "$NP" -le 0 ]]; then |
| 153 | echo "invalid NP: $NP" |
| 154 | exit 1 |
| 155 | fi |
| 156 | |
| 157 | date |
| 158 | echo "RUNNING FMPJ" |
| 159 | fmpjrun -machinefile "$PBS_NODEFILE" -np "$NP" -dev niodev -Djava.library.path="$FMPJ_HOME/lib" -Djava.awt.headless=true -Xmx25000M -Xms25000M -cp /work/00950/kevinm/ucerf3/inversion/2013_05_03-ucerf3p3-production-first-five/OpenSHA_complete.jar:/work/00950/kevinm/ucerf3/inversion/parallelcolt-0.9.4.jar:/work/00950/kevinm/ucerf3/inversion/commons-cli-1.2.jar:/work/00950/kevinm/ucerf3/inversion/csparsej.jar -class scratch.UCERF3.simulatedAnnealing.hpc.MPJInversionDistributor --exact-dispatch 3 /work/00950/kevinm/ucerf3/inversion/2013_05_03-ucerf3p3-production-first-five/batch00.xml |
| 160 | ret=$? |
| 161 | |
| 162 | date |
| 163 | echo "DONE with process 0. EXIT CODE: $ret" |
| 164 | |
| 165 | exit $ret |
| 166 | }}} |
| 167 | |
| 168 | === XML Input File === |
| 169 | The XML input file simply supplies a list of arguments for each inversion. This is an example for 384 inversions. The "num" attribute at the end of each entry is a sanity check which verifies the correct number of arguments. |
| 170 | {{{ |
| 171 | <?xml version="1.0" encoding="UTF-8"?> |
| 172 | |
| 173 | <OpenSHA> |
| 174 | <InversionConfigurations num="384"> |
| 175 | <InversionConfiguration index="0" args="--completion-time 5h --sub-completion 1s --cool FAST_SA --nonneg LIMIT_ZERO_RATES --num-threads 5 --branch-prefix FM3_1_ABM_Shaw09Mod_DsrUni_CharConst_M5Rate6.5_MMaxOff7.3_NoFix_SpatSeisU2_run0 --directory /work/00950/kevinm/ucerf3/inversion/2013_05_03-ucerf3p3-production-first-five --no-plots" num="15"/> |
| 176 | <InversionConfiguration index="1" args="--completion-time 5h --sub-completion 1s --cool FAST_SA --nonneg LIMIT_ZERO_RATES --num-threads 5 --branch-prefix FM3_1_ABM_Shaw09Mod_DsrUni_CharConst_M5Rate6.5_MMaxOff7.3_NoFix_SpatSeisU3_run0 --directory /work/00950/kevinm/ucerf3/inversion/2013_05_03-ucerf3p3-production-first-five --no-plots" num="15"/> |
| 177 | <InversionConfiguration index="2" args="--completion-time 5h --sub-completion 1s --cool FAST_SA --nonneg LIMIT_ZERO_RATES --num-threads 5 --branch-prefix FM3_1_ABM_Shaw09Mod_DsrUni_CharConst_M5Rate6.5_MMaxOff7.6_NoFix_SpatSeisU2_run0 --directory /work/00950/kevinm/ucerf3/inversion/2013_05_03-ucerf3p3-production-first-five --no-plots" num="15"/> |
| 178 | ... |
| 179 | <InversionConfiguration index="383" args="--completion-time 5h --sub-completion 1s --cool FAST_SA --nonneg LIMIT_ZERO_RATES --num-threads 5 --branch-prefix FM3_1_ZENGBB_Shaw09Mod_DsrTap_CharConst_M5Rate6.5_MMaxOff7.9_NoFix_SpatSeisU3_run0 --directory /work/00950/kevinm/ucerf3/inversion/2013_05_03-ucerf3p3-production-first-five --no-plots" num="15"/> |
| 180 | </InversionConfigurations> |
| 181 | </OpenSHA> |
| 182 | |
| 183 | }}} |