Skip to content

Tests for existing collective communication technologies

HT edited this page Nov 1, 2019 · 2 revisions

NCCL

test setup: CUDA 10.0, NCCL 2.2.13

Two processes on a machine with two P100 GPUs. Process 0 creates an ncclUniqueId and saves it to a file. Process 1 reads the ncclUniqueId from the file. Then both processes call ncclCommInitRank with this ncclUniqueId. After initialization, both processes start calling ncclAllReduce continuously in a loop.

After killing one process, the other process will hang in the ncclAllReduce call. Killing it is the only way out.

Code

nccltest.cpp

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "cuda_runtime.h"
#include "nccl.h"

// Abort the process with a file:line diagnostic if a CUDA runtime call fails.
// Usage: CUDACHECK(cudaFoo(...));
// Fix vs. original: diagnostics go to stderr, not stdout, so they are not
// lost when stdout is redirected or buffered.
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    fprintf(stderr,                                 \
        "Failed: Cuda error %s:%d '%s'\n",          \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

// Abort the process with a file:line diagnostic if an NCCL call fails.
// Usage: NCCLCHECK(ncclFoo(...));
// Fix vs. original: diagnostics go to stderr, not stdout.
#define NCCLCHECK(cmd) do {                         \
  ncclResult_t r = cmd;                             \
  if (r!= ncclSuccess) {                            \
    fprintf(stderr,                                 \
        "Failed, NCCL error %s:%d '%s'\n",          \
        __FILE__,__LINE__,ncclGetErrorString(r));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

// Persist the NCCL unique id to a file so a peer process (started without
// MPI or any other bootstrap transport) can read it back and join the
// communicator. Aborts the process on any I/O failure.
//
// Fixes vs. original: fopen() and fwrite() results are checked (the original
// dereferenced a possibly-NULL FILE*), and the hard-coded 128 is replaced by
// sizeof(id.internal) so the code tracks NCCL_UNIQUE_ID_BYTES automatically.
void save_to_file(ncclUniqueId &id, const char* file_name)
{
  FILE* ofile = fopen(file_name, "wb");
  if (ofile == NULL) {
    fprintf(stderr, "Failed to open '%s' for writing\n", file_name);
    exit(EXIT_FAILURE);
  }
  if (fwrite(id.internal, sizeof(id.internal), 1, ofile) != 1) {
    fprintf(stderr, "Failed to write NCCL unique id to '%s'\n", file_name);
    fclose(ofile);
    exit(EXIT_FAILURE);
  }
  fclose(ofile);
}

// Read a previously saved NCCL unique id back from a file (counterpart of
// save_to_file). Aborts the process on any I/O failure.
//
// Fixes vs. original: fopen() and fread() results are checked (the original
// dereferenced a possibly-NULL FILE* and silently accepted short reads), and
// the hard-coded 128 is replaced by sizeof(id.internal).
// NOTE(review): there is no retry — the caller must ensure the writer (rank 0)
// has already produced the file, as the run instructions require.
void read_from_file(ncclUniqueId &id, const char* file_name)
{
  FILE* ifile = fopen(file_name, "rb");
  if (ifile == NULL) {
    fprintf(stderr, "Failed to open '%s' for reading\n", file_name);
    exit(EXIT_FAILURE);
  }
  if (fread(id.internal, sizeof(id.internal), 1, ifile) != 1) {
    fprintf(stderr, "Failed to read NCCL unique id from '%s'\n", file_name);
    fclose(ifile);
    exit(EXIT_FAILURE);
  }
  fclose(ifile);
}

int main(int argc, char* argv[])
{
  int size = 2;
  float cpu_buf[2];

  cudaStream_t s;
  ncclComm_t comm;

  float *sendbuf;
  int local_rank, num_ranks;
  int gpu_count;

  local_rank = atoi(argv[1]);
  num_ranks = atoi(argv[2]);

  ncclUniqueId id;
  char values[128];
  const char* file_name = "output.data";

  CUDACHECK(cudaGetDeviceCount(&gpu_count));
  CUDACHECK(cudaSetDevice(local_rank % gpu_count));
  printf("Total gpu: %d, current rank=%d using gpu id %d\n", gpu_count, local_rank, local_rank % gpu_count);
  CUDACHECK(cudaMalloc(&sendbuf, size * sizeof(float)));
  CUDACHECK(cudaStreamCreate(&s));

  if (local_rank == 0)
  {
      ncclGetUniqueId(&id);
      save_to_file(id, file_name);
  }
  else {
      read_from_file(id, file_name);
  }

  NCCLCHECK(ncclCommInitRank(&comm, num_ranks, id, local_rank));
  printf("Comm created\n");
  for (int i = 0; i < 100; i++)
  {
      cpu_buf[0] = local_rank * i * 0.1f;
      cpu_buf[1] = local_rank * i * 0.12f;
      CUDACHECK(cudaMemcpy(sendbuf, cpu_buf, sizeof(float) * size, cudaMemcpyHostToDevice));
      NCCLCHECK(ncclAllReduce((const void*)sendbuf, (void*)sendbuf, size, ncclFloat, ncclSum,
        comm, s));
      CUDACHECK(cudaStreamSynchronize(s));
      CUDACHECK(cudaMemcpy(cpu_buf, sendbuf, sizeof(float) * size, cudaMemcpyDeviceToHost));
      printf("[%d] allreduce result is : %g %g\n", i, cpu_buf[0], cpu_buf[1]);
      sleep(1);
  }
  CUDACHECK(cudaFree(sendbuf));
  ncclCommDestroy(comm);

  return 0;
}

Makefile

# Include/library search paths and link libraries for CUDA 10.0 and NCCL.
# Replace /path_to_nccl with the actual NCCL installation prefix.
FLAGS := -I/usr/local/cuda-10.0/include -I/path_to_nccl/include -L/path_to_nccl/lib -L/usr/local/cuda-10.0/lib64 -lnccl -lcudart
# Host-only build: no device kernels in nccltest.cpp, so plain g++ suffices
# (nvcc is not needed). FLAGS goes last so -l libraries follow the object.
nccltest: nccltest.cpp
	g++ $< -g -o $@ $(FLAGS)

Running

Start process 0 first as:

./nccltest 0 2

In another terminal, start process 1:

./nccltest 1 2

After both processes are inside the ncclAllReduce loop, kill one of them, the other will hang.