-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.cu
62 lines (45 loc) · 1.57 KB
/
main.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#include <iostream>
#include "input.hpp"
#include "debug.cu"
// Number of elements in each host/device vector: 512^3 = 134,217,728.
// Written as an exact integer product so the value is a compile-time constant
// and does not depend on floating-point pow() or on <cmath> being included.
const unsigned long long N = 512ULL * 512ULL * 512ULL;
// Element-wise bitwise NAND kernel: out[i] = ~(a[i] & b[i]) for every i in [0, n).
//
// Launch layout: any 1-D, 2-D or 3-D grid/block shape is accepted. Each thread
// flattens its block and thread coordinates into a single linear id, starts at
// that element, and then advances by the total number of launched threads until
// it passes n, so the launch does not have to cover n exactly.
//
// Parameters:
//   out - destination array, written at indices [0, n)
//   a,b - input arrays, read at indices [0, n)
//   n   - number of elements to process; n <= 0 makes the kernel a no-op
// All three pointers are dereferenced in device code.
//
// Index arithmetic is done in 64 bits: with int, the products
// blockIndex * threadSize and gridSize * threadSize, as well as `i += stride`
// when n is close to INT_MAX, could overflow.
__global__ void vector_nand(int *out, int *a, int *b, int n) {
    // Total number of blocks in the grid and threads per block.
    const long long gridSize = (long long)gridDim.x * gridDim.y * gridDim.z;
    const long long threadSize = (long long)blockDim.x * blockDim.y * blockDim.z;

    // Row-major flattening of the 3-D block and thread coordinates.
    const long long blockIndex = (long long)blockIdx.x
                               + (long long)blockIdx.y * gridDim.x
                               + (long long)blockIdx.z * gridDim.x * gridDim.y;
    const long long threadIndex = (long long)threadIdx.x
                                + (long long)threadIdx.y * blockDim.x
                                + (long long)threadIdx.z * blockDim.x * blockDim.y;

    const long long index = blockIndex * threadSize + threadIndex;
    const long long stride = gridSize * threadSize;

    for (long long i = index; i < n; i += stride) {
        out[i] = ~(a[i] & b[i]);
    }
}
// Reports a failed CUDA runtime call on std::cerr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool nand_cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Computes out[i] = ~(a[i] & b[i]) for i in [0, n) on the GPU.
//
// Parameters:
//   a, b - host input arrays holding at least n ints each
//   out  - host output array with room for at least n ints
//   n    - number of elements; n <= 0 is a no-op
//
// The function allocates three device buffers of n ints, copies the inputs to
// the device, launches vector_nand on an 8x8x8 grid of 8x8x8 blocks, copies
// the result back and releases the device buffers. Every transfer uses the
// same byte count as the allocations (n elements), so the buffers are never
// overrun when n differs from the global problem size.
//
// On any CUDA failure a message is written to std::cerr, the remaining steps
// are skipped, the device buffers are still freed, and `out` is left without
// a valid result.
void nand(int *a, int *b, int *out, int n) {
    if (n <= 0) {
        return;
    }

    const size_t bytes = sizeof(int) * static_cast<size_t>(n);
    int *d_a = 0;
    int *d_b = 0;
    int *d_out = 0;

    // Short-circuit evaluation stops at the first failing step.
    bool ok = nand_cuda_ok(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")
           && nand_cuda_ok(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")
           && nand_cuda_ok(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)")
           && nand_cuda_ok(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice),
                           "copy of a to device")
           && nand_cuda_ok(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice),
                           "copy of b to device");

    if (ok) {
        dim3 dimGrid(8, 8, 8);
        dim3 dimBlock(8, 8, 8);
        vector_nand<<<dimGrid, dimBlock>>>(d_out, d_a, d_b, n);

        // A kernel launch returns no status itself; launch errors are read
        // back with cudaGetLastError(). The blocking device-to-host copy
        // that follows reports errors raised while the kernel was running.
        ok = nand_cuda_ok(cudaGetLastError(), "vector_nand launch")
          && nand_cuda_ok(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost),
                          "copy of result to host");
    }

    // cudaFree accepts a null pointer, so this is safe after a partial failure.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
}
// Program entry point.
//
// Allocates three host arrays of N ints, fills the two inputs through
// generate_data(), runs the GPU NAND via nand(), then passes the buffers to
// dump() (with a count of 64) and the result to test().
// Returns 0 on completion, or 1 if the host buffers cannot be allocated.
int main(){
    // nand() and the fill loop work with int counts; N (512^3) fits in int.
    const int count = static_cast<int>(N);

    int *a = (int*)malloc(sizeof(int) * N);
    int *b = (int*)malloc(sizeof(int) * N);
    int *out = (int*)malloc(sizeof(int) * N);

    // Each buffer is about 512 MiB, so allocation failure is a realistic
    // outcome and must not be dereferenced.
    if (a == 0 || b == 0 || out == 0) {
        std::cerr << "Failed to allocate host buffers" << std::endl;
        free(a);   // free() accepts null pointers
        free(b);
        free(out);
        return 1;
    }

    for (int i = 0; i < count; i++) {
        generate_data(&a[i], &b[i], i);
    }

    nand(a, b, out, count);
    dump(a, b, out, 64);
    test(out, N);

    free(a);
    free(b);
    free(out);
    return 0;
}