Skip to content

SYCL

SYCL is a cross-platform abstraction layer that enables code for heterogeneous processors to be written using C++ with the host and kernel code for an application contained in the same source file. On NERSC systems, only the intel/2024.1.0 module supports SYCL.

Vector Addition Example

sycl-vecadd-buffer.cpp

#include <sycl/sycl.hpp>

#include <cmath>
#include <iostream>

// Vector addition with SYCL buffers: c[i] = sin^2(i) + cos^2(i), so every
// element of c should be 1 and the printed mean should be 1.
int main() {
  const int n = 100000;

  // Device-accessible buffers; the SYCL runtime manages data movement.
  sycl::buffer<double> b_a{n}, b_b{n}, b_c{n};

  // Initialize the inputs on the host. The host_accessors are scoped so
  // they are destroyed (releasing the buffers) before the kernel runs.
  {
    sycl::host_accessor a{b_a, sycl::write_only};
    sycl::host_accessor b{b_b, sycl::write_only};
    for (size_t i = 0; i < n; i++) {
      // Use the std::-qualified names: <cmath> is only guaranteed to
      // declare sin/cos/fabs in namespace std.
      a[i] = std::sin(i) * std::sin(i);
      b[i] = std::cos(i) * std::cos(i);
    }
  }

  // Queue targeting a GPU device (construction fails if none is available).
  sycl::queue q{sycl::gpu_selector_v};

  q.submit([&](sycl::handler &h) {
    // Device accessors declare the kernel's data dependencies to the runtime.
    sycl::accessor a{b_a, h, sycl::read_only};
    sycl::accessor b{b_b, h, sycl::read_only};
    sycl::accessor c{b_c, h, sycl::write_only};

    h.parallel_for(n, [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
  });

  // Constructing a host_accessor on b_c blocks until the kernel above has
  // finished, so no explicit q.wait() is needed here.
  {
    double sum = 0.0;
    sycl::host_accessor c{b_c, sycl::read_only};
    for (size_t i = 0; i < n; i++)
      sum += c[i];
    std::cout << "sum = " << sum / n << std::endl;

    // Each element should be exactly 1, so sum should equal n.
    if (!(std::fabs(sum - static_cast<double>(n)) <= 1.0e-8))
      return 1;
  }

  return 0;
}

Makefile

# Intel oneAPI C++ compiler with SYCL offload to the NVIDIA PTX backend;
# sm_80 targets compute capability 8.0 GPUs (e.g. NVIDIA A100).
CXX = icpx
CXXFLAGS = -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_80'

all: sycl-vecadd-buffer.x

# Build the example binary directly from its single source file.
sycl-vecadd-buffer.x: sycl-vecadd-buffer.cpp
    $(CXX) $(CXXFLAGS) -o $@ $^

# "make test" runs the compiled example (requires a GPU).
.PHONY: test
test:
    ./sycl-vecadd-buffer.x
$ module load intel/2024.1.0
$ make
icpx -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_80' -o sycl-vecadd-buffer.x sycl-vecadd-buffer.cpp
$ ./sycl-vecadd-buffer.x
sum = 1

oneMKL Example

sycl-gemm-usm.cpp

#include <sycl/sycl.hpp>
#include <oneapi/mkl/blas.hpp>

#include <cmath>
#include <iostream>
#include <vector>

// Row-major GEMM on a SYCL device via oneMKL, validated against a serial
// triple-loop reference: C(m x p) = A(m x n) * B(n x p).
int main() {

  const int m = 600;
  const int n = 1200;
  const int p = 2400;

  // Host matrices, stored row-major in flat vectors.
  // matC_serial is value-initialized to 0.0 for the accumulation below.
  std::vector<double> matA(m * n);
  std::vector<double> matB(n * p);
  std::vector<double> matC_serial(m * p);

  for (size_t i = 0; i < m; i++)
    for (size_t j = 0; j < n; j++)
      matA[i * n + j] = 1.0;

  for (size_t i = 0; i < n; i++)
    for (size_t j = 0; j < p; j++)
      matB[i * p + j] = 2.0;

  // Serial reference: every element of C should equal n * 1.0 * 2.0.
  for (size_t i = 0; i < m; i++) {
    for (size_t j = 0; j < p; j++) {
      for (size_t d = 0; d < n; d++) {
        matC_serial[i * p + j] += matA[i * n + d] * matB[d * p + j];
      }
    }
  }

  sycl::queue q{sycl::gpu_selector_v};

  // Device (USM) allocations; released via sycl::free before returning.
  double *dev_a = sycl::malloc_device<double>((m * n), q);
  double *dev_b = sycl::malloc_device<double>((n * p), q);
  double *dev_c = sycl::malloc_device<double>((m * p), q);
  q.memcpy(dev_a, matA.data(), sizeof(double) * m * n);
  q.memcpy(dev_b, matB.data(), sizeof(double) * n * p);
  q.wait();

  // oneMKL gemm is column-major; computing B*A with swapped dimensions
  // yields the row-major product A*B in dev_c (C^T = B^T * A^T).
  oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
  oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans;
  oneapi::mkl::blas::column_major::gemm(q, transA, transB, p, m, n, 1.0, dev_b, p, dev_a, n, 0.0, dev_c, p);
  q.wait();

  std::vector<double> matC_parallel(m * p);
  q.memcpy(matC_parallel.data(), dev_c, sizeof(double) * m * p);
  q.wait();

  // Validate the device result. Record failure in a status flag instead of
  // returning early, so the device allocations below are always freed
  // (the original early return leaked dev_a/dev_b/dev_c on mismatch).
  int status = 0;
  for (size_t i = 0; i < m && status == 0; i++) {
    for (size_t j = 0; j < p; j++) {
      if (!(std::fabs(matC_parallel[i * p + j] - matC_serial[i * p + j]) <= 1.0e-8)) {
        status = 1;
        break;
      }
    }
  }

  sycl::free(dev_a, q);
  sycl::free(dev_b, q);
  sycl::free(dev_c, q);

  return status;
}

Makefile

# Intel oneAPI C++ compiler with SYCL offload to the NVIDIA PTX backend;
# sm_80 targets compute capability 8.0 GPUs (e.g. NVIDIA A100).
CXX = icpx
CXXFLAGS = -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_80'
# Header and library paths for the experimental oneMKL build on NERSC.
INCFLAGS = -I/global/common/software/nersc9/onemkl/experimental-2024-06-20/include
LDFLAGS = -L/global/common/software/nersc9/onemkl/experimental-2024-06-20/lib -lonemkl

all: sycl-gemm-usm.x

# Build the example binary directly from its single source file.
# Use $(CXX) for consistency with the vecadd Makefile (${CXX} expands
# identically but mixes styles).
sycl-gemm-usm.x: sycl-gemm-usm.cpp
    $(CXX) $(CXXFLAGS) $(INCFLAGS) $(LDFLAGS) -o $@ $^

# "make test" runs the compiled example (requires a GPU and the oneMKL
# library on LD_LIBRARY_PATH).
.PHONY: test
test:
    ./sycl-gemm-usm.x
$ module load intel/2024.1.0
$ export LD_LIBRARY_PATH=/global/common/software/nersc9/onemkl/experimental-2024-06-20/lib:$LD_LIBRARY_PATH
$ make
icpx -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend '--cuda-gpu-arch=sm_80' -I/global/common/software/nersc9/onemkl/experimental-2024-06-20/include -L/global/common/software/nersc9/onemkl/experimental-2024-06-20/lib -lonemkl -o sycl-gemm-usm.x sycl-gemm-usm.cpp
$ ./sycl-gemm-usm.x

References

Support