
MadGraph

How to start an ATLAS Slurm+Shifter job

Warning

These instructions work only for user=kkrizka; use them as guidance only.

MadGraph is a multi-core compute task and does not need CVMFS.

ssh kkrizka@pdsf
$ sbatch oneMG.slr
$ cat oneMG.slr
#!/bin/bash

#SBATCH -t 225:00   

# Pick one of the following lines to toggle: chos or shifter or Cori
# (toggle  '#-SBATCH' is OFF vs. '#SBATCH' is ON  )

#SBATCH -J jan-shift -p shared --image=custom:pdsf-chos-sl64:v4 --ntasks=6 
#-SBATCH -J atlas-chos -p shared-chos   --ntasks=6 
#-SBATCH -J jan-cori -p debug -N1 --image=custom:pdsf-chos-sl64:v2  -C haswell
# the Python (merge step) task requires 10GB of RAM; the swap space is not protecting nodes from OOM,
# so we need the --mem flag, it will consume 6 vCores (aka 1/10 of the Haswell node)
#SBATCH --mem 10G  

#tasks to be executed
job_sh=runMad.sh
export NUM_EVE=${1-200100}
export NUM_CORE=$SLURM_CPUS_ON_NODE
export DATA_STORE=/global/project/projectdirs/atlas/kkrizka/madStoreJan3
export CODE_DIR=`pwd`
#env|grep SLURM

startSkew=300 # (seconds), random delay for each pilot
nsleep=$(($RANDOM % $startSkew)) 

# use local scratch
export WORKDIR=${SLURM_TMP}
# OR use global scratch on project (never cleaned up)
#export WORKDIR=/global/project/projectdirs/atlas/kkrizka/janTmp/job${SLURM_JOBID}

echo "start-A "`hostname`"  NUM_CORE="$NUM_CORE"  nEve="$NUM_EVE
echo 'nproc='`nproc`' nsleep='$nsleep'   WORKDIR='$WORKDIR
sleep $nsleep
date
ls -l  $CODE_DIR/${job_sh}

if [[ $SLURM_JOB_PARTITION == *"-chos" ]]
then
  echo  run-in-chos CODE_DIR=$CODE_DIR
  CHOS=sl64 chos  $CODE_DIR/${job_sh}  
else
 echo  run-in-shifter
 shifter  --volume=/global/project:/project  /bin/bash $CODE_DIR/${job_sh}
fi
echo end-A
date
mkdir -p logs/
if [ -z ${SLURM_ARRAY_JOB_ID+x} ]; then
  mv slurm-${SLURM_JOB_ID}.out logs/
else
  mv slurm-${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.out  logs/
fi
    

This example will run MadGraph on 6 cores.
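
A minimal submission sketch, assuming oneMG.slr sits in the current directory: the optional first argument overrides the default event count through NUM_EVE, and the --array form relies on the SLURM_ARRAY_* handling already present at the end of the script (the event count below is only illustrative).

$ sbatch oneMG.slr                 # one job, default 200100 events
$ sbatch oneMG.slr 50000           # one job, 50000 events
$ sbatch --array=1-10 oneMG.slr    # ten independent pilots, logs end up in logs/
$ squeue -u $USER                  # watch the queue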

Note

The oneMG.slr script is set up to run on all 3 Slurm partitions: PDSF+Chos, PDSF+Shifter, Cori+Shifter. You only need to toggle the '-' in front of SBATCH, as sketched below.
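
For example, to select PDSF+Chos instead of PDSF+Shifter, the partition directives from the listing above would be toggled like this (only the '-' moves):

#-SBATCH -J jan-shift -p shared --image=custom:pdsf-chos-sl64:v4 --ntasks=6
#SBATCH -J atlas-chos -p shared-chos   --ntasks=6
#-SBATCH -J jan-cori -p debug -N1 --image=custom:pdsf-chos-sl64:v2  -C haswell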

The bash task script runMad.sh requires sourcing of your environment if you use Shifter.
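
A minimal sketch of such environment setup near the top of runMad.sh; the setup-script path is a hypothetical placeholder, not part of the original job:

# hypothetical: source a personal setup script only when running under Shifter
if env | grep -q SHIFTER_RUNTIME; then
  source /path/to/your/setup_env.sh   # placeholder, replace with your own environment script
fi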

$  cat runMad.sh
#!/bin/bash -l

IDX=${SLURM_ARRAY_TASK_ID}
if [[ $SLURM_JOB_PARTITION == *"-chos" ]]
then
   echo  task-in-chos
   chosenv
   ls -l /proc/chos/link
else
  echo  task-in-shifter
  echo inShifter:`env|grep  SHIFTER_RUNTIME`
  cat /etc/*release
fi

# remove the stack size limit and raise the number of processes allowed per user
ulimit -s unlimited
ulimit -u 8192
ulimit -a

echo run on node `hostname`
module load python/2.7.9
module list
python -V

printf -v runidx "%05d" ${IDX}

MGDIR=${DATA_STORE-fixMe1}
echo "WORKDIR=${WORKDIR}"
echo "MGDIR=${MGDIR}"

mkdir -p ${WORKDIR}
cd ${WORKDIR}

echo Prepare a copy so MG can edit it
time ( cp /global/project/projectdirs/atlas/kkrizka/PROC_xia.tgz .; tar -zxf PROC_xia.tgz )
ls -l  .

(sleep 120;  echo Dump information after delay; date; top -ibn1 ; free -g)&
echo "launch --nb_core=$SLURM_CPUS_ON_NODE" >runX.cmd 
echo "set nevents $NUM_EVE" >>runX.cmd 

echo "# Generate a new run with:"
cat runX.cmd

head -n40 PROC_xia/Cards/run_card.dat
/usr/bin/time -v  ./PROC_xia/bin/madevent runX.cmd

# Save the output
OUTDIR=${MGDIR}/PROC_xia/Events/${SLURM_JOBID}
mkdir -p ${OUTDIR}
cp -r PROC_xia/Events/run_01 ${OUTDIR}/run_${runidx}
echo "Events copied to ${OUTDIR}/run_${runidx}"
echo "task-done  "`date`