@@ -20,6 +20,7 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]
2020
2121@testset " barrier" begin
2222
23+ # work-group
2324@on_device barrier (OpenCL. LOCAL_MEM_FENCE)
2425@on_device barrier (OpenCL. GLOBAL_MEM_FENCE)
2526@on_device barrier (OpenCL. LOCAL_MEM_FENCE | OpenCL. GLOBAL_MEM_FENCE)
@@ -38,6 +39,10 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]
3839cl. memory_backend () isa cl. SVMBackend && @on_device work_group_barrier (OpenCL. LOCAL_MEM_FENCE, OpenCL. memory_scope_all_svm_devices)
3940@on_device work_group_barrier (OpenCL. LOCAL_MEM_FENCE, OpenCL. memory_scope_sub_group)
4041
42+ # sub-group
43+ @on_device sub_group_barrier (OpenCL. LOCAL_MEM_FENCE)
44+ @on_device sub_group_barrier (OpenCL. GLOBAL_MEM_FENCE)
45+ @on_device sub_group_barrier (OpenCL. LOCAL_MEM_FENCE | OpenCL. GLOBAL_MEM_FENCE)
4146end
4247
4348@testset " mem_fence" begin
@@ -166,6 +171,103 @@ end
166171 @test call_on_device (OpenCL. mad, x, y, z) ≈ x * y + z
167172end
168173
174+ if cl. sub_groups_supported (cl. device ())
175+
# Record written by `test_subgroup_kernel` for one work-item. Each field holds
# the value returned by the sub-group intrinsic of the same name (`get_<field>()`).
struct SubgroupData
    sub_group_size::UInt32      # get_sub_group_size()
    max_sub_group_size::UInt32  # get_max_sub_group_size()
    num_sub_groups::UInt32      # get_num_sub_groups()
    sub_group_id::UInt32        # get_sub_group_id(); the tests expect it 1-based
    sub_group_local_id::UInt32  # get_sub_group_local_id(); the tests expect it 1-based
end
# Device kernel: every work-item whose global id falls inside `results` stores
# one `SubgroupData` record, built from the five sub-group intrinsics, at that
# index.
function test_subgroup_kernel(results)
    gid = get_global_id(1)

    # Work-items beyond the end of `results` have nothing to write.
    gid <= length(results) || return

    # The guard above keeps `gid` within `results`, so the store skips bounds checks.
    @inbounds results[gid] = SubgroupData(
        get_sub_group_size(),
        get_max_sub_group_size(),
        get_num_sub_groups(),
        get_sub_group_id(),
        get_sub_group_local_id(),
    )
    return
end
197+
198+ @testset " Sub-groups" begin
199+ sg_size = cl. sub_group_size (cl. device ())
200+
@testset " Indexing intrinsics" begin
    # Launch geometry: two work-groups, each spanning two sub-groups' worth of
    # work-items.
    sg_n = 2
    numworkgroups = 2
    local_size = sg_size * sg_n
    N = local_size * numworkgroups

    results = CLVector{SubgroupData}(undef, N)
    kernel = @opencl launch = false test_subgroup_kernel(results)
    kernel(results; local_size, global_size = N)

    # Copy the records back to the host and compare each one with the values
    # derived from its position in the launch.
    for (i, sg_data) in enumerate(Array(results))
        offset = i - 1  # zero-based position of this record

        @test sg_data.sub_group_size == sg_size
        @test sg_data.max_sub_group_size == sg_size
        @test sg_data.num_sub_groups == sg_n

        # Sub-group id: 1-based, counted within the work-group.
        want_sub_group = div(offset % local_size, sg_size) + 1
        @test sg_data.sub_group_id == want_sub_group

        # Local id: 1-based, counted within the sub-group.
        want_local = (offset % sg_size) + 1
        @test sg_data.sub_group_local_id == want_local
    end
end
230+
@testset " shuffle idx" begin
    # Each work-item overwrites its own element with the result of shuffling
    # that element against the mirrored index `size - lane + 1`.
    function shfl_idx_kernel(d)
        lane = get_sub_group_local_id()
        mirrored = get_sub_group_size() - lane + 0x1

        d[lane] = sub_group_shuffle(d[lane], mirrored)

        return
    end

    # One sub-group's worth of work-items; the buffer must come back reversed.
    @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
        host = rand(T, sg_size)
        dev = CLArray(host)
        @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(dev)
        @test Array(dev) == reverse(host)
    end
end
@testset " shuffle xor" begin
    # Each work-item overwrites its own element with the result of
    # `sub_group_shuffle_xor` on that element with mask 1.
    # The buffer is called `buf` rather than `in` so that `Base.in` is not
    # shadowed.
    function shfl_xor_kernel(buf)
        lane = get_sub_group_local_id()

        buf[lane] = sub_group_shuffle_xor(buf[lane], 1)

        return
    end

    # Checks that each adjacent pair of values is swapped by
    # sub_group_shuffle_xor.
    @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
        input = rand(T, sg_size)
        # 1-based index of the element each position should receive: flip the
        # lowest bit of the 0-based position, then shift back to 1-based.
        idxs = xor.(0:(sg_size - 1), 1) .+ 1
        d_input = CLArray(input)
        @opencl local_size = sg_size global_size = sg_size shfl_xor_kernel(d_input)
        @test Array(d_input) == input[idxs]
    end
end
268+ end
269+ end # if cl.sub_groups_supported(cl.device())
270+
169271@testset " SIMD - $N x $T " for N in simd_ns, T in float_types
170272 # codegen emits i48 here, which SPIR-V doesn't support
171273 # XXX : fix upstream?
0 commit comments