-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcuda_matrix_mul.cu
161 lines (124 loc) · 4.52 KB
/
cuda_matrix_mul.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
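/*
 * Mapping of thread indices onto matrix elements:
 *
 *                      column
 *   A[][] =  ---------------------> threadIdx.y
 *            |
 *            |
 *            |
 *      row   |
 *            |
 *            |
 *            v
 *        threadIdx.x
 *
 * i.e. threadIdx.x selects the row and threadIdx.y selects the column.
 */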
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
// Edge length, in threads, of the square thread block launched by main; it is
// also the edge length of the shared-memory tiles in mat_mul_shared.
#define TILE_WIDTH 16
// Problem size: A is ar x ac_br and B is ac_br x bc, so C = A*B is ar x bc.
// None of these values is a multiple of TILE_WIDTH, so the boundary checks in
// the kernels are exercised.
#define ar 311
#define ac_br 312
#define bc 115
using namespace std;
/*
 * Aborts the program if the CUDA runtime has a pending error.
 *
 * msg: short description of the step that was just performed; it is echoed
 *      in the diagnostic so the failing stage can be identified.
 *
 * The pending error is fetched with cudaGetLastError(), which also clears it.
 * On failure the message and the CUDA error string are written to stderr and
 * the process terminates with exit status -1. On success nothing happens.
 */
void check_gpu_error(const char *msg)
{
    const cudaError_t status = cudaGetLastError();
    if(status == cudaSuccess)
        return; // nothing pending

    std::cerr<<"\n Error in: "<<msg<<" cuda error string: "<<cudaGetErrorString(status);
    std::exit(-1);
}// End of check_gpu_error function
/*
 * Naive matrix multiply-accumulate, one thread per element of C: C += A * B.
 *
 * d_A, d_B, d_C : device pointers to dense row-major int matrices of
 *                 rowA x colA, rowB x colB and rowC x colC elements.
 * colA          : length of the dot product (the caller must pass colA == rowB;
 *                 this is not checked here).
 * rowA, rowB    : not read by this kernel.
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension indexes rows of C and
 * the y dimension indexes columns of C. Threads that fall outside C do nothing,
 * so the grid only has to cover rowC x colC.
 *
 * The product is ADDED to whatever d_C already holds; zero d_C first to obtain
 * the plain product A * B.
 */
__global__ void mat_mul(int *d_A, int *d_B, int *d_C, int rowA, int colA, int rowB, int colB, int rowC, int colC)
{
    const int row = threadIdx.x + blockIdx.x*blockDim.x; // 0 to rowA/rowC
    const int col = threadIdx.y + blockIdx.y*blockDim.y; // 0 to colB/colC
    if(row >= rowC || col >= colC)
        return; // thread lies in the padding of the last block

    // Build the dot product in a thread-local accumulator and touch d_C only
    // once, instead of a global read-modify-write on every iteration.
    int acc = 0;
    for(int i = 0; i < colA; i++) // colA = rowB
        acc += d_A[row*colA + i]*d_B[i*colB + col];

    d_C[row*colC + col] += acc;
}// End of mat_mul function
/*
 * Tiled matrix multiply using shared memory, one thread per element of C:
 * C = A * B (d_C is overwritten).
 *
 * d_A, d_B, d_C : device pointers to dense row-major int matrices of
 *                 rowA x colA, rowB x colB and rowC x colC elements.
 *                 The caller must pass colA == rowB (not checked here).
 *
 * Launch layout: blocks must be exactly TILE_WIDTH x TILE_WIDTH threads,
 * because the tile indices are taken directly from threadIdx. The grid's x
 * dimension indexes row tiles of C and its y dimension indexes column tiles.
 * Static shared memory: two TILE_WIDTH x (TILE_WIDTH+1) int tiles per block.
 *
 * Every thread of the block takes part in loading the tiles and in both
 * barriers, including threads that lie outside C; those threads load zeros
 * and skip only the final store.
 */
__global__ void mat_mul_shared(int *d_A, int *d_B, int *d_C, int rowA, int colA, int rowB, int colB, int rowC, int colC)
{
    const int tx = threadIdx.x, ty = threadIdx.y;
    const int row = tx + blockIdx.x*TILE_WIDTH; // 0 to rowA/rowC
    const int col = ty + blockIdx.y*TILE_WIDTH; // 0 to colB/colC

    // Threads adjacent in threadIdx.x address different rows of these tiles.
    // With a row length of exactly TILE_WIDTH ints those accesses are 16 words
    // apart and pile up on the same shared-memory banks; one int of padding
    // per row staggers them across banks. The padding column is never read.
    __shared__ int s_A[TILE_WIDTH][TILE_WIDTH+1];
    __shared__ int s_B[TILE_WIDTH][TILE_WIDTH+1];

    const int numTiles = (colA+TILE_WIDTH-1)/TILE_WIDTH; // ceil(colA / TILE_WIDTH)
    int cvalue = 0;
    for(int t = 0; t < numTiles; t++)
    {
        // Each thread stages one element of A and one of B; positions that
        // fall outside the matrices are filled with 0 so they add nothing.
        const int aCol = t*TILE_WIDTH + ty;
        const int bRow = t*TILE_WIDTH + tx;
        s_A[tx][ty] = (row < rowA && aCol < colA) ? d_A[row*colA + aCol] : 0;
        s_B[tx][ty] = (bRow < rowB && col < colB) ? d_B[bRow*colB + col] : 0;
        __syncthreads(); // both tiles fully written before anyone reads them

        for(int k = 0; k < TILE_WIDTH; k++)
            cvalue += s_A[tx][k]*s_B[k][ty];
        __syncthreads(); // everyone done reading before the next overwrite
    }

    if(row < rowC && col < colC)
        d_C[row*colC + col] = cvalue;
}// End of mat_mul_shared function
/*
 * Benchmark driver.
 *
 * Fills A (ar x ac_br) and B (ac_br x bc) with random values in [0, 4],
 * multiplies them on the GPU once with mat_mul and once with mat_mul_shared,
 * prints the elapsed time of each kernel as measured with CUDA events, and
 * verifies that both kernels produced the same result.
 *
 * Returns 0 on success and -2 if the two results differ. Any CUDA error
 * terminates the program through check_gpu_error. All host buffers, device
 * buffers and events are released before a normal return.
 */
int main()
{
    const int rowA = ar,    colA = ac_br;
    const int rowB = ac_br, colB = bc;
    const int rowC = ar,    colC = bc;

    // Element and byte counts are computed in size_t so they cannot overflow int.
    const size_t countA = static_cast<size_t>(rowA)*colA;
    const size_t countB = static_cast<size_t>(rowB)*colB;
    const size_t countC = static_cast<size_t>(rowC)*colC;
    const size_t bytesA = countA*sizeof(int);
    const size_t bytesB = countB*sizeof(int);
    const size_t bytesC = countC*sizeof(int);

    // Host buffers; std::vector releases them automatically on every return path.
    std::vector<int> A(countA), B(countB), C1(countC), C2(countC);

    srand(static_cast<unsigned>(time(NULL)));
    for(size_t i = 0; i < countA; i++)
        A[i] = rand()%5;
    for(size_t i = 0; i < countB; i++)
        B[i] = rand()%5;

    int *d_A = NULL, *d_B = NULL, *d_C = NULL;
    cudaMalloc((void**)&d_A, bytesA);
    cudaMalloc((void**)&d_B, bytesB);
    cudaMalloc((void**)&d_C, bytesC);
    check_gpu_error("cuda malloc");

    // One thread per element of C: grid x covers the rows, grid y the columns.
    const dim3 dimg((rowC+TILE_WIDTH-1)/TILE_WIDTH, (colC+TILE_WIDTH-1)/TILE_WIDTH);
    const dim3 dimb(TILE_WIDTH, TILE_WIDTH);

    cudaEvent_t start, stop;
    float elapsed_time = 0.0f;
    cudaEventCreate(&start); cudaEventCreate(&stop);
    check_gpu_error("cuda event create");

    cudaMemcpy(d_A, &A[0], bytesA, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, &B[0], bytesB, cudaMemcpyHostToDevice);
    check_gpu_error("cuda memcpy host to device");

    // Without shared memory (mat_mul accumulates into d_C, so zero it first)
    cudaMemset(d_C, 0, bytesC);
    check_gpu_error("cuda memset");
    cudaEventRecord(start, 0);
    mat_mul<<<dimg, dimb>>>(d_A, d_B, d_C, rowA, colA, rowB, colB, rowC, colC);
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop);
    check_gpu_error("mat_mul kernel");
    cudaMemcpy(&C1[0], d_C, bytesC, cudaMemcpyDeviceToHost);
    check_gpu_error("cuda memcpy device to host");
    cudaEventElapsedTime(&elapsed_time, start, stop);
    std::cout<<"\n Matrix mulitplication(without shared memory): "<<elapsed_time<<" mili-seconds";

    // With Shared memory
    cudaMemset(d_C, 0, bytesC);
    check_gpu_error("cuda memset");
    cudaEventRecord(start, 0);
    mat_mul_shared<<<dimg, dimb>>>(d_A, d_B, d_C, rowA, colA, rowB, colB, rowC, colC);
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop);
    check_gpu_error("mat_mul_shared kernel");
    cudaMemcpy(&C2[0], d_C, bytesC, cudaMemcpyDeviceToHost);
    check_gpu_error("cuda memcpy device to host");
    cudaEventElapsedTime(&elapsed_time, start, stop);
    std::cout<<"\n Matrix mulitplication(with shared memory) : "<<elapsed_time<<" mili-seconds";

    // The two kernels must agree element for element.
    int exit_code = 0;
    for(size_t i = 0; i < countC; i++)
    {
        if(C1[i] != C2[i])
        {
            std::cerr<<"\n Error!!! wrong Matrix calculation is done....";
            exit_code = -2;
            break;
        }
    }
    if(exit_code == 0)
        std::cout<<"\n Matrix mulitplication done...\n";

    // Release GPU resources on both the success and the mismatch path.
    cudaEventDestroy(start); cudaEventDestroy(stop);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return exit_code;
}// End of main