Skip to content

Tests for existing collective communication technologies

HT edited this page Nov 1, 2019 · 2 revisions

NCCL

test setup: CUDA 10.0, NCCL 2.2.13

Two processes on a machine with two P100 GPUs. Process 0 creates an ncclUniqueId and saves it to a file. Process 1 reads the ncclUniqueId from the file. Then both processes call ncclCommInitRank with this ncclUniqueId. After initialization, both processes start calling ncclAllReduce continuously in a loop.

After killing one process, the other process will hang in the ncclAllReduce call. Killing it is the only way out.

Code

nccltest.cpp

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "cuda_runtime.h"
#include "nccl.h"

// Abort the process with a file:line diagnostic if a CUDA runtime call fails.
// Usage: CUDACHECK(cudaFoo(...));
// Fix vs. original: diagnostics go to stderr, not stdout, so they are not
// lost when stdout is redirected or buffered.
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    fprintf(stderr,                                 \
        "Failed: Cuda error %s:%d '%s'\n",          \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

// Abort the process with a file:line diagnostic if an NCCL call fails.
// Usage: NCCLCHECK(ncclFoo(...));
// Fix vs. original: diagnostics go to stderr, not stdout.
#define NCCLCHECK(cmd) do {                         \
  ncclResult_t r = cmd;                             \
  if (r!= ncclSuccess) {                            \
    fprintf(stderr,                                 \
        "Failed, NCCL error %s:%d '%s'\n",          \
        __FILE__,__LINE__,ncclGetErrorString(r));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

// Persist the NCCL unique id to a file so a peer process (started without
// MPI or any other bootstrap transport) can read it back and join the
// communicator. Aborts the process on any I/O failure.
//
// Fixes vs. original: fopen() and fwrite() results are checked (the original
// dereferenced a possibly-NULL FILE*), and the hard-coded 128 is replaced by
// sizeof(id.internal) so the code tracks NCCL_UNIQUE_ID_BYTES automatically.
void save_to_file(ncclUniqueId &id, const char* file_name)
{
  FILE* ofile = fopen(file_name, "wb");
  if (ofile == NULL) {
    fprintf(stderr, "Failed to open '%s' for writing\n", file_name);
    exit(EXIT_FAILURE);
  }
  if (fwrite(id.internal, sizeof(id.internal), 1, ofile) != 1) {
    fprintf(stderr, "Failed to write NCCL unique id to '%s'\n", file_name);
    fclose(ofile);
    exit(EXIT_FAILURE);
  }
  fclose(ofile);
}

// Read a previously saved NCCL unique id back from a file (counterpart of
// save_to_file). Aborts the process on any I/O failure.
//
// Fixes vs. original: fopen() and fread() results are checked (the original
// dereferenced a possibly-NULL FILE* and silently accepted short reads), and
// the hard-coded 128 is replaced by sizeof(id.internal).
// NOTE(review): there is no retry — the caller must ensure the writer (rank 0)
// has already produced the file, as the run instructions require.
void read_from_file(ncclUniqueId &id, const char* file_name)
{
  FILE* ifile = fopen(file_name, "rb");
  if (ifile == NULL) {
    fprintf(stderr, "Failed to open '%s' for reading\n", file_name);
    exit(EXIT_FAILURE);
  }
  if (fread(id.internal, sizeof(id.internal), 1, ifile) != 1) {
    fprintf(stderr, "Failed to read NCCL unique id from '%s'\n", file_name);
    fclose(ifile);
    exit(EXIT_FAILURE);
  }
  fclose(ifile);
}

int main(int argc, char* argv[])
{
  int size = 2;
  float cpu_buf[2];

  cudaStream_t s;
  ncclComm_t comm;

  float *sendbuf;
  int local_rank, num_ranks;
  int gpu_count;

  local_rank = atoi(argv[1]);
  num_ranks = atoi(argv[2]);

  ncclUniqueId id;
  char values[128];
  const char* file_name = "output.data";

  CUDACHECK(cudaGetDeviceCount(&gpu_count));
  CUDACHECK(cudaSetDevice(local_rank % gpu_count));
  printf("Total gpu: %d, current rank=%d using gpu id %d\n", gpu_count, local_rank, local_rank % gpu_count);
  CUDACHECK(cudaMalloc(&sendbuf, size * sizeof(float)));
  CUDACHECK(cudaStreamCreate(&s));

  if (local_rank == 0)
  {
      ncclGetUniqueId(&id);
      save_to_file(id, file_name);
  }
  else {
      read_from_file(id, file_name);
  }

  NCCLCHECK(ncclCommInitRank(&comm, num_ranks, id, local_rank));
  printf("Comm created\n");
  for (int i = 0; i < 100; i++)
  {
      cpu_buf[0] = local_rank * i * 0.1f;
      cpu_buf[1] = local_rank * i * 0.12f;
      CUDACHECK(cudaMemcpy(sendbuf, cpu_buf, sizeof(float) * size, cudaMemcpyHostToDevice));
      NCCLCHECK(ncclAllReduce((const void*)sendbuf, (void*)sendbuf, size, ncclFloat, ncclSum,
        comm, s));
      CUDACHECK(cudaStreamSynchronize(s));
      CUDACHECK(cudaMemcpy(cpu_buf, sendbuf, sizeof(float) * size, cudaMemcpyDeviceToHost));
      printf("[%d] allreduce result is : %g %g\n", i, cpu_buf[0], cpu_buf[1]);
      sleep(1);
  }
  CUDACHECK(cudaFree(sendbuf));
  ncclCommDestroy(comm);

  return 0;
}

Makefile

# Include/library search paths and link libraries for CUDA 10.0 and NCCL.
# Replace /path_to_nccl with the actual NCCL installation prefix.
FLAGS := -I/usr/local/cuda-10.0/include -I/path_to_nccl/include -L/path_to_nccl/lib -L/usr/local/cuda-10.0/lib64 -lnccl -lcudart
# Host-only build: no device kernels in nccltest.cpp, so plain g++ suffices
# (nvcc is not needed). FLAGS goes last so -l libraries follow the object.
nccltest: nccltest.cpp
	g++ $< -g -o $@ $(FLAGS)

Running

Start process 0 first as:

./nccltest 0 2

In another terminal, start process 1:

./nccltest 1 2

After both processes are inside the ncclAllReduce loop, kill one of them, the other will hang.