-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbcht_insert_kernel.ptx
173 lines (144 loc) · 4.25 KB
/
bcht_insert_kernel.ptx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
// Written by Muhammad Awad
//
.version 7.2
//.relaxed loads require at least sm_70
.target sm_80, sm_86,
sm_75, sm_72, sm_70
.address_size 64
.visible .entry bcht_insert(
.param .u64 param_table, //uint32_t* table
.param .u64 param_keys, //uint32_t* keys_in
.param .u32 param_num_keys, //uint32_t num_keys
.param .align 4 .b8 param_hf0[12], // hash_function f0
.param .align 4 .b8 param_hf1[12], // hash_function f1
.param .align 4 .b8 param_hf2[12], // hash_function f2
.param .u32 param_num_buckets // num_buckets
)
{
// register declaration
.reg .b32 %r<4>; // 4 32-bit registers
.reg .b64 %rd<2>; // 2 64-bit registers
.reg .pred %p<3>; // 3 predicates registers
.reg .b32 %thread_id;
.reg .b32 %lane_id;
.reg .b32 %lane_key;
.reg .b32 %cuckoo_counter;
.reg .b32 %rng;
.reg .b32 %hfx<3>; // three hash functions
.reg .b32 %hfy<3>;
.reg .b32 %hf_prime;
.reg .b64 %keys_in;
.reg .b64 %results;
.reg .b64 %table;
.reg .b32 %num_buckets;
.reg .u32 %num_keys;
.reg .pred %p_to_insert;
// Prepare the above registers
mov.u32 %thread_id, %ctaid.x;
mov.u32 %r0, %ntid.x;
mov.u32 %r1, %tid.x;
mad.lo.s32 %thread_id, %thread_id, %r0, %r1;
and.b32 %lane_id, %thread_id, 0x1f;
mov.u32 %cuckoo_counter, 0;
mov.u32 %rng, 2463534242;
// input keys ptr
ld.param.u64 %keys_in, [param_keys];
cvta.to.global.u64 %keys_in, %keys_in;
// table ptr
ld.param.u64 %table, [param_table];
cvta.to.global.u64 %table, %table;
// Hash functions
ld.param.u32 %hfx0, [param_hf0];
ld.param.u32 %hfx1, [param_hf1];
ld.param.u32 %hfx2, [param_hf2];
ld.param.u32 %hfy0, [param_hf0+4];
ld.param.u32 %hfy1, [param_hf1+4];
ld.param.u32 %hfy2, [param_hf2+4];
ld.param.u32 %hf_prime, [param_hf0+8];
ld.param.u32 %num_buckets, [param_num_buckets];
ld.param.u32 %num_keys, [param_num_keys];
setp.lt.u32 %p_to_insert, %thread_id, %num_keys;
// Loading the input
mul.wide.s32 %rd0, %thread_id, 4;
add.s64 %rd1, %keys_in, %rd0;
mov.b32 %lane_key, -1;
@%p_to_insert
ld.global.u32 %lane_key, [%rd1];
.reg .b32 %cur_bucket;
.reg .b32 %cur_key;
// loop over the keys in the warp
Loop:
vote.sync.ballot.b32 %r0, %p_to_insert, -1;
setp.eq.u32 %p0, %r0, 0; //terminate if no items in the queue
@%p0
bra.uni Exit_loop;
bfind.u32 %r1, %r0;
setp.eq.u32 %p0, %lane_id, %r1;
@%p0
mov.pred %p_to_insert, 0;
shfl.sync.idx.b32 %cur_key, %lane_key, %r1, 31, -1;
// Hash function
xor.b32 %cur_bucket, %cur_key, %hfx0;
add.u32 %cur_bucket, %cur_bucket, %hfy0;
rem.u32 %cur_bucket, %cur_bucket, %hf_prime;
rem.u32 %cur_bucket, %cur_bucket, %num_buckets;
Load_bucket:
mul.wide.s32 %rd1, %lane_id, 4;
mad.wide.s32 %rd0, %cur_bucket, 32*4, %rd1; //32 is bucket size
add.s64 %rd1, %table, %rd0;
Retry:
ld.global.u32 %r0, [%rd1];
setp.ne.u32 %p0, %r0, -1;
vote.sync.ballot.b32 %r0, %p0, -1;
popc.b32 %r0, %r0; // bucket load
setp.eq.u32 %p0, %r0, 32;
@%p0
bra.uni Cuckoo;
Try_insert:
setp.eq.u32 %p0, %r0, %lane_id;
@%p0
atom.relaxed.gpu.global.cas.b32 %r1, [%rd1], -1, %cur_key;
shfl.sync.idx.b32 %r2, %r1, %r0, 31, -1;
setp.eq.u32 %p0, %r2, -1;
@%p0
bra.uni Loop;
//it is probably better to increment
//the CAS pointer position instead of re-loading
//and computing the CAS location.
//Somehow it didn't matter (very low contention?)
bra.uni Retry;
Cuckoo:
shl.b32 %r0, %rng, 13;
xor.b32 %rng, %rng, %r0;
shr.b32 %rng, %rng, 17;
shl.b32 %r0, %rng, 5;
xor.b32 %rng, %rng, %r0;
rem.u32 %r0, %rng, 32;
setp.eq.u32 %p0, %lane_id, %r0;
@%p0
atom.relaxed.gpu.global.exch.b32 %r1, [%rd1], %cur_key;
shfl.sync.idx.b32 %cur_key, %r1, %r0, 31, -1;
xor.b32 %r0, %cur_key, %hfx0;
add.u32 %r0, %r0, %hfy0;
rem.u32 %r0, %r0, %hf_prime;
rem.u32 %r0, %r0, %num_buckets;
xor.b32 %r1, %cur_key, %hfx1;
add.u32 %r1, %r1, %hfy1;
rem.u32 %r1, %r1, %hf_prime;
rem.u32 %r1, %r1, %num_buckets;
xor.b32 %r2, %cur_key, %hfx2;
add.u32 %r2, %r2, %hfy2;
rem.u32 %r2, %r2, %hf_prime;
rem.u32 %r2, %r2, %num_buckets;
mov.u32 %r3, %r0;
setp.eq.u32 %p0, %cur_bucket, %r1;
@%p0
mov.u32 %r3, %r2;
setp.eq.u32 %p0, %cur_bucket, %r0;
@%p0
mov.u32 %r3, %r1;
mov.u32 %cur_bucket, %r3;
bra.uni Load_bucket;
Exit_loop:
ret;
}