Tests for existing collective communication technologies
Test setup: CUDA 10.0, NCCL 2.2.13

Two processes on a machine with two P100 GPUs. Process 0 creates an ncclUniqueId and saves it to a file. Process 1 reads the ncclUniqueId from the file. Then both processes call ncclCommInitRank with this ncclUniqueId. After initialization, both processes call ncclAllReduce continuously in a loop.

After one process is killed, the other process hangs inside the ncclAllReduce call; killing it is the only way out.
nccltest.cpp
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "cuda_runtime.h"
#include "nccl.h"

#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if (e != cudaSuccess) {                           \
    printf("Failed: Cuda error %s:%d '%s'\n",       \
        __FILE__, __LINE__, cudaGetErrorString(e)); \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

#define NCCLCHECK(cmd) do {                         \
  ncclResult_t r = cmd;                             \
  if (r != ncclSuccess) {                           \
    printf("Failed, NCCL error %s:%d '%s'\n",       \
        __FILE__, __LINE__, ncclGetErrorString(r)); \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

// Rank 0 writes the ncclUniqueId to a file; the file acts as the
// out-of-band channel (instead of MPI or a key-value store).
void save_to_file(ncclUniqueId &id, const char* file_name)
{
  FILE* ofile = fopen(file_name, "wb");
  if (ofile == NULL) {
    printf("Failed to open %s for writing\n", file_name);
    exit(EXIT_FAILURE);
  }
  fwrite(id.internal, sizeof(id.internal), 1, ofile);
  fclose(ofile);
}

// The other rank reads the same ncclUniqueId back from the file.
void read_from_file(ncclUniqueId &id, const char* file_name)
{
  FILE* ifile = fopen(file_name, "rb");
  if (ifile == NULL) {
    printf("Failed to open %s for reading\n", file_name);
    exit(EXIT_FAILURE);
  }
  if (fread(id.internal, sizeof(id.internal), 1, ifile) != 1) {
    printf("Failed to read ncclUniqueId from %s\n", file_name);
    exit(EXIT_FAILURE);
  }
  fclose(ifile);
}

int main(int argc, char* argv[])
{
  int size = 2;
  float cpu_buf[2];
  cudaStream_t s;
  ncclComm_t comm;
  float *sendbuf;
  int local_rank, num_ranks;
  int gpu_count;

  if (argc < 3) {
    printf("Usage: %s <local_rank> <num_ranks>\n", argv[0]);
    return EXIT_FAILURE;
  }
  local_rank = atoi(argv[1]);
  num_ranks = atoi(argv[2]);

  ncclUniqueId id;
  const char* file_name = "output.data";

  CUDACHECK(cudaGetDeviceCount(&gpu_count));
  CUDACHECK(cudaSetDevice(local_rank % gpu_count));
  printf("Total gpu: %d, current rank=%d using gpu id %d\n",
      gpu_count, local_rank, local_rank % gpu_count);

  CUDACHECK(cudaMalloc(&sendbuf, size * sizeof(float)));
  CUDACHECK(cudaStreamCreate(&s));

  // Rank 0 creates the unique id and publishes it; rank 1 reads it.
  if (local_rank == 0) {
    NCCLCHECK(ncclGetUniqueId(&id));
    save_to_file(id, file_name);
  } else {
    read_from_file(id, file_name);
  }
  NCCLCHECK(ncclCommInitRank(&comm, num_ranks, id, local_rank));
  printf("Comm created\n");

  // Allreduce repeatedly; once the peer process is killed, the next
  // ncclAllReduce never completes and cudaStreamSynchronize blocks forever.
  for (int i = 0; i < 100; i++) {
    cpu_buf[0] = local_rank * i * 0.1f;
    cpu_buf[1] = local_rank * i * 0.12f;
    CUDACHECK(cudaMemcpy(sendbuf, cpu_buf, sizeof(float) * size, cudaMemcpyHostToDevice));
    NCCLCHECK(ncclAllReduce((const void*)sendbuf, (void*)sendbuf, size, ncclFloat, ncclSum,
        comm, s));
    CUDACHECK(cudaStreamSynchronize(s));
    CUDACHECK(cudaMemcpy(cpu_buf, sendbuf, sizeof(float) * size, cudaMemcpyDeviceToHost));
    printf("[%d] allreduce result is : %g %g\n", i, cpu_buf[0], cpu_buf[1]);
    sleep(1);
  }

  CUDACHECK(cudaFree(sendbuf));
  ncclCommDestroy(comm);
  return 0;
}
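As a sanity check before killing anything: with two ranks and ncclSum, iteration i should print roughly i*0.1 and i*0.12 on both processes (rank 0 contributes zeros, rank 1 contributes i*0.1 and i*0.12), so matching output in both terminals confirms the allreduce is actually running.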
Makefile
FLAGS := -I/usr/local/cuda-10.0/include -I/path_to_nccl/include -L/path_to_nccl/lib -L/usr/local/cuda-10.0/lib64 -lnccl -lcudart

nccltest: nccltest.cpp
	g++ $< -g -o $@ $(FLAGS)
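Build with make. If NCCL is installed under a non-standard prefix (the path_to_nccl placeholder above), the dynamic loader will likely also need to find libnccl at run time, e.g.:
export LD_LIBRARY_PATH=/path_to_nccl/lib:/usr/local/cuda-10.0/lib64:$LD_LIBRARY_PATH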
Start process 0 first as:
./nccltest 0 2
In another terminal, start process 1:
./nccltest 1 2
After both processes are inside the ncclAllReduce loop, kill one of them; the other will hang.
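To make the hang observable from inside the surviving process (rather than just noticing that it stops printing), one possible variant, sketched below and not part of the original test, is to replace CUDACHECK(cudaStreamSynchronize(s)) in the loop with a polling wait on cudaStreamQuery. The sketch assumes the headers already included in nccltest.cpp; the timeout value is arbitrary. It only reports the hang: with NCCL 2.2 there is no way to abort the stuck collective, so killing the process remains the only way out.
// Sketch: poll the stream with a timeout instead of blocking forever.
// Returns true if the allreduce completed, false if it appears stuck.
bool wait_with_timeout(cudaStream_t s, int timeout_sec)
{
  for (int waited = 0; waited < timeout_sec; waited++) {
    cudaError_t e = cudaStreamQuery(s);
    if (e == cudaSuccess)
      return true;                  // allreduce finished normally
    if (e != cudaErrorNotReady) {   // a real CUDA error, not just "still running"
      printf("CUDA error: %s\n", cudaGetErrorString(e));
      exit(EXIT_FAILURE);
    }
    sleep(1);                       // still running: keep waiting
  }
  // Once the peer has been killed, we end up here on every later iteration.
  printf("allreduce did not complete within %d seconds\n", timeout_sec);
  return false;
}
Calling wait_with_timeout(s, 30) in place of the cudaStreamSynchronize call should make the surviving process print the timeout message instead of silently blocking.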