Skip to content


Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

Cori performance

A table

| Property | Value |
|----------|-------|
| System Type | Cray XC40 |
| Theoretical Peak Performance (System) | 31.4 PFlops |
| Theoretical Peak Performance (Haswell nodes) | 2.3 PFlops |
| Theoretical Peak Performance (Xeon Phi nodes) | 29.1 PFlops |

Basic example


Include paths are specified relative to the base directory (where mkdocs.yml is located).

The full syntax for includes is available here. Includes are done via ASCII scissors (--8<--) followed by the filename in quotes. See the source of this page for examples.

A useful program for checking thread and process affinity.

#define _GNU_SOURCE

#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sched.h>
#include <mpi.h>
#include <omp.h>

/*
 * Borrowed from util-linux-2.13-pre7/schedutils/taskset.c
 *
 * Format the CPUs that are set in `mask` as a compact, comma-separated
 * list and store it in `str` as a NUL-terminated string:
 *   - an isolated CPU is written as "N"
 *   - two adjacent CPUs are written as "N,N+1"
 *   - three or more adjacent CPUs are written as the range "N-M"
 * For example CPUs {0,2,3,5,6,7} become "0,2,3,5-7". An empty mask
 * yields the empty string.
 *
 * mask: CPU set to describe (only read).
 * str:  destination buffer supplied by the caller. No bounds checking is
 *       done here; the caller in this file passes 7 * CPU_SETSIZE bytes.
 *
 * Returns `str`.
 */
static char *cpuset_to_cstr(cpu_set_t *mask, char *str)
{
  char *ptr = str;
  int i, j, entry_made = 0;

  for (i = 0; i < CPU_SETSIZE; i++) {
    if (CPU_ISSET(i, mask)) {
      int run = 0;

      entry_made = 1;
      /* Count how many CPUs directly following i are also set. */
      for (j = i + 1; j < CPU_SETSIZE; j++) {
        if (CPU_ISSET(j, mask))
          run++;
        else
          break;
      }
      /* sprintf returns the number of characters written, so ptr always
         ends up on the terminating NUL, ready for the next entry. */
      if (!run) {
        ptr += sprintf(ptr, "%d,", i);
      } else if (run == 1) {
        ptr += sprintf(ptr, "%d,%d,", i, i + 1);
        i++; /* CPU i+1 has been printed; do not visit it again */
      } else {
        ptr += sprintf(ptr, "%d-%d,", i, i + run);
        i += run; /* skip the CPUs covered by the range */
      }
    }
  }
  /* Step back over the trailing comma, if anything was written, and
     terminate the string (this also handles the empty mask). */
  ptr -= entry_made;
  *ptr = 0;
  return str;
}

int main(int argc, char *argv[])
  int rank, thread;
  cpu_set_t coremask;
  char clbuf[7 * CPU_SETSIZE], hnbuf[64];

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  memset(clbuf, 0, sizeof(clbuf));
  memset(hnbuf, 0, sizeof(hnbuf));
  (void)gethostname(hnbuf, sizeof(hnbuf));
#pragma omp parallel private(thread, coremask, clbuf)
    thread = omp_get_thread_num();
    (void)sched_getaffinity(0, sizeof(coremask), &coremask);
    cpuset_to_cstr(&coremask, clbuf);
    #pragma omp barrier
    printf("Hello from rank %d, thread %d, on %s. (core affinity = %s)\n",
	   rank, thread, hnbuf, clbuf);


# Build every .c file in this directory into the $(TARGET) executable.
TARGET = xthi
# `cc` is the compiler command used for both compiling and linking.
CC = cc
CFLAGS = -g -Wall -qopenmp

.PHONY: default all clean

default: $(TARGET)
all: default

OBJECTS = $(patsubst %.c, %.o, $(wildcard *.c))
HEADERS = $(wildcard *.h)

# Every object depends on all headers, so changing any header rebuilds all.
%.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c $< -o $@

# Link step. LIBS is not set in this file; it expands to nothing unless it
# is supplied on the make command line or through the environment.
$(TARGET): $(OBJECTS)
	$(CC) $(CFLAGS) $(OBJECTS) -Wall $(LIBS) -o $@

# The leading '-' tells make to ignore errors from rm.
clean:
	-rm -f *.o
	-rm -f $(TARGET)

Running the executable

Submit the script with the sbatch command:

#!/bin/bash
# Batch script that runs the xthi affinity checker.
# -p: partition to submit to; -t: wall-clock time limit (HH:MM:SS).
#SBATCH -p debug
#SBATCH -t 00:05:00

#OpenMP settings:
# Treat every hardware thread as a place and spread the OpenMP threads
# across the available places.
export OMP_PLACES=threads
export OMP_PROC_BIND=spread

#run the application:
# Copy the executable to /tmp on each allocated node, then launch
# 12 tasks with 8 CPUs per task, each bound to cores; sort the output lines.
sbcast ./xthi /tmp/xthi
srun -n 12 -c 8 --cpu_bind=cores /tmp/xthi | sort


The -c and --cpu_bind= options for srun are required for hybrid jobs and for jobs that do not use all physical cores.

Some source code

Instrumented C code to measure arithmetic intensity (AI)

// Code must be built with appropriate paths for VTune include file (ittnotify.h) and library (-littnotify)
#include <ittnotify.h>

__SSC_MARK(0x111); // start SDE tracing, note it uses 2 underscores
__itt_resume(); // start VTune, again use 2 underscores

// Region of interest: only the code between the start and stop markers
// is measured. The kernel is repeated NTIMES times.
for (k=0; k<NTIMES; k++) {
  #pragma omp parallel for
  for (j=0; j<STREAM_ARRAY_SIZE; j++)
    a[j] = b[j]+scalar*c[j];
}

__itt_pause(); // stop VTune
__SSC_MARK(0x222); // stop SDE tracing

LaTeX support

$$
\frac{n!}{k!(n-k)!} = \binom{n}{k}
$$

$$
\frac{n!}{k!(n-k)!} = \binom{n}{k}
$$