STAR

How to start STAR Slurm+Shifter job

The root4star executable reads a raw data file and reconstructs events. It requires access to the STAR DB; this example connects to the LBL mirror of the STAR DB.

The Slurm job definition 'starOne.slr' will run on 3 NERSC subsystems:

  • PDSF w/ CHOS
  • PDSF w/ Shifter
  • Cori w/ Shifter

To select the right version, remove the '-' sign from the desired '#-SBATCH' line (and add it to the currently active '#SBATCH' line). Lines starting with '#-SBATCH' are ignored by Slurm.
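
If you prefer not to edit the file by hand, the same toggle can be scripted; a convenience sketch using sed (not part of the original recipe) that switches from the PDSF Shifter variant to the Cori variant:

$ sed -i -e 's/^#SBATCH -J star-shift/#-SBATCH -J star-shift/' \
         -e 's/^#-SBATCH -J star-cori/#SBATCH -J star-cori/'  starOne.slr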

$ ssh -X pdsf.nersc.gov
$ sbatch starOne.slr
$  cat starOne.slr
#!/bin/bash

#SBATCH -t 25:00   
#SBATCH  --account nstaff

# Pick one of the following lines to toggle: chos or shifter or Cori
# (toggle  '#-SBATCH' vs. '#SBATCH'  )
#-SBATCH -J star-chos -p shared-chos --ntasks=1
#SBATCH -J star-shift -p shared --ntasks=1 --image=custom:pdsf-sl64-star:v3
#-SBATCH -J star-cori -p debug -N1 -t 25:00 --image=custom:pdsf-sl64-star:v7  -C haswell

echo "start-A "`hostname`" in PWD="`pwd`

startSkew=30 # (seconds), random delay for task
nsleep=$(($RANDOM % $startSkew))
echo nsleep=$nsleep
sleep $nsleep

# task script to be executed
job_sh=r4sTask_bfc.csh
export NUM_EVE=9
# 1 event takes 30-60 sec (occasionally up to 5 minutes); the 1st event takes 300-1000 seconds depending on the DB load

# fixed one data file
#dataName=st_mtd_adc_16114048_raw_2500009.daq

# OR use data from list B: 10k events per daq file, 1 event takes 10-15 sec,
# but some daq files, e.g. st_physics_17133038_raw_4000014.daq, require 40-70 sec/eve
export PATH_DAQ=/project/projectdirs/mpccc/balewski/star_daq_2016
dataList=dataListB.txt
export STAR_VER=SL17a
export BFC_String="DbV20161216,P2016a,StiCA,mtd,mtdCalib,btof,PxlHit,IstHit,SstHit,beamline3D,picoWrite,PicoVtxVpd,BEmcChkStat,-evout,CorrX,OSpaceZ2,OGridLeak3D,-hitfilt"

export LOG_PATH=/global/project/projectdirs/mpccc/balewski/tmp/logs
mkdir -p $LOG_PATH
echo write r4s logs to  $LOG_PATH

# map the 0-based array task ID to a 1-based line number (head -n counts from 1);
# for a single job (no array) default to the first file
kD=$(( ${SLURM_ARRAY_TASK_ID:-0} + 1 ))
echo pick data $kD from list $dataList
dataName=${PATH_DAQ}/`head -n $kD  $dataList | tail -n1`

# pick STAR library you want to use
export EXEC_NAME=root4star

# define permanent output dir, here it is jobID dependent
export OUT_DIR=/global/project/projectdirs/mpccc/balewski/tmp/outSTAR3/${SLURM_JOB_NAME}/${SLURM_JOB_ID}
mkdir -p ${OUT_DIR}

# prepare the sandbox - it is done for you by Slurm on PDSF and must be done manually on Cori
export WRK_DIR=$SLURM_TMP
echo SLURM_CLUSTER_NAME=$SLURM_CLUSTER_NAME
if [[ $SLURM_CLUSTER_NAME != 'pdsf'* ]] ; then
    export WRK_DIR=$CSCRATCH/tmp-star/${SLURM_JOB_ID}
    echo make sandbox $WRK_DIR which will NOT be erased by Slurm
    mkdir -p  $WRK_DIR    
fi
    

# used code must be copied to the sandbox
# optional:
# it is safer to copy all code to the sandbox, so the job still runs fine even if you recompile your local code
codeDir=/global/homes/b/balewski/star-pipeline/embedPdsf1/

echo Prepare a local copy of binaries
time( cp -rpL r4sTask_bfc.csh  $WRK_DIR ; cp -rpL $codeDir/StRoot $codeDir/.sl64_gcc482/   $WRK_DIR )

echo run job in STAR_VER=$STAR_VER  WRK_DIR=$WRK_DIR
echo see vCores=$SLURM_CPUS_ON_NODE

ls -l  ${job_sh}
if [[ $SLURM_JOB_PARTITION == *"-chos" ]]
then
    echo  run-in-chos
    CHOS=sl64 chos  $WRK_DIR/${job_sh}  $dataName
else
    echo  run-in-shifter
    # a minor workaround that lets us jump into tcsh inside the Shifter image
    unset MODULE_VERSION_STACK
    unset MODULE_VERSION
    unset MODULEPATH MODULESHOME
    unset LOADEDMODULES PRGENVMODULES
    shifter   --volume=/global/project:/project   /bin/tcsh $WRK_DIR/${job_sh} $dataName
fi
echo end-A-slurm-job

# mv slurm log to final destination 
if [ -z "${SLURM_ARRAY_JOB_ID+x}" ]; then
  mv slurm-${SLURM_JOB_ID}.out $LOG_PATH
else 
  mv slurm-${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.out $LOG_PATH
fi

This example will run root4star and store the final muDst in a different location than all auxiliary root4star output.
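
After a job completes you can inspect the products; a sketch using the paths defined in the script above (replace <jobID> with your actual Slurm job id):

$ ls -l /global/project/projectdirs/mpccc/balewski/tmp/outSTAR3/star-shift/<jobID>/
$ tail /global/project/projectdirs/mpccc/balewski/tmp/logs/r4s-<jobID>.log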

The tcsh task script 'r4sTask_bfc.csh' looks like this:

$  cat r4sTask_bfc.csh
#!/bin/tcsh 
# Note: all code in this script is executed under tcsh
if (  "$SLURM_JOB_PARTITION" =~ *"chos"  ) then  
    echo  task-in-chos
    chosenv
    ls -l /proc/chos/link
else
    echo  task-in-shifter
    echo inShifter:`env|grep  SHIFTER_RUNTIME`
    cat /etc/*release
    #
    # - - - -  D O   N O T   T O U C H  T H I S   S E C T I O N- - - - 
    #
    whoami    
    echo  load STAR environment in shifter
    set NCHOS = sl64
    set SCHOS = 64
    set DECHO = 1
    set SCRATCH = $WRK_DIR/out-star1
    setenv GROUP_DIR /common/star/star${SCHOS}/group/
    source $GROUP_DIR/star_cshrc.csh    
    # Georg fix for tcsh
    setenv LD_LIBRARY_PATH /usr/common/usg/software/gcc/4.8.2/lib:/usr/common/usg/software/java/jdk1.7.0_60/lib:/usr/common/usg/software/gcc/4.8.2/lib64:/usr/common/usg/software/mpc/1.0.3/lib/:/usr/common/usg/software/gmp/6.0.0/lib/:/usr/common/usg/software/mpfr/3.1.3/lib/:$LD_LIBRARY_PATH
    echo
    echo available STAR-lib versions in this OS image:
    ls -d /common/star/star64/packages/SL*
    #
    # - - - -   Y O U   C A N   C H A N G E   B E L O W  - - - -
    #    
endif  

    #cd ${WRK_DIR} # no longer needed - Shifter was fixed
    set daqN = $argv[1]
     
    echo  starting new-r4s PATH_DAQ=$PATH_DAQ, daqN=$daqN, execName=$EXEC_NAME, NUM_EVE=$NUM_EVE, OUT_DIR=$OUT_DIR, workerName=`hostname -f`, startDate=`date`

    echo "use BFC $BFC_String "
    if ( !  -f $daqN ) then
	echo "ERROR: file ${daqN} does not exist, Aborting-33"
	exit 1
    endif
    echo size of daqN
    ls -l  $daqN 
     
    echo testing STAR setup $STAR_VER in `pwd`
    starver $STAR_VER 
    env |grep STAR

    echo 'my new STAR ver='$STAR'  test root4star '
    root4star -b -q 
    if ( $status != 0 ) then
	echo STAR environment is corrupted, aborting job
	echo $STAR
	which root4star
	exit 1
    endif
 
    #echo EEEEE ;   exit

    echo `date`" Fire: $EXEC_NAME for daqN=$daqN  numEve=$NUM_EVE  [wiat]"
    /usr/bin/time -v  $EXEC_NAME -b -q bfc.C\($NUM_EVE,\"$BFC_String\",\"$daqN\"\) >& $LOG_PATH/r4s-${SLURM_JOB_ID}.log
    echo `date`" completed job $daqN  , save results to "$OUT_DIR
    ls -l
    time cp *MuDst* $OUT_DIR
    echo 'copy done '`date`

How to start STAR job array using Slurm+Shifter

The same 'starOne.slr' can be used to launch an array of jobs, each processing a different file from 'dataListB.txt' (a way to generate such a list is sketched after the listing):

$  cat dataListB.txt
st_physics_17133005_raw_1000005.daq
st_physics_17133005_raw_1500015.daq
st_physics_17133006_raw_1000021.daq
st_physics_17133006_raw_1500013.daq
st_physics_17133006_raw_1500017.daq
st_physics_17133006_raw_3500006.daq
st_physics_17133006_raw_3500024.daq
st_physics_17133006_raw_4000018.daq
st_physics_17133007_raw_2000022.daq
st_physics_17133007_raw_2500013.daq

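Such a list can be generated from the daq directory, e.g. (a sketch assuming the files reside under the PATH_DAQ used above):

$ ls /project/projectdirs/mpccc/balewski/star_daq_2016 | grep '\.daq$' > dataListB.txt
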
To make this work, fire the same job as an array, counting from 0 to N-1:

$ ssh -X pdsf.nersc.gov
$ sbatch --array 0-9 starOne.slr

which will result in the submission of 10 independent Slurm jobs; each is assigned a different line from the list, which ends up in the variable 'dataName' and is passed as an argument to 'r4sTask_bfc.csh'.
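
The array can be monitored with the standard Slurm tools, e.g.:

$ squeue -u $USER
$ sacct -j <jobID> --format=JobID,JobName,State,Elapsed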