Skip to content

Commit bea5703

Browse files
[OpenCL] Sub-group shuffle (#413)
1 parent d476f50 commit bea5703

4 files changed

Lines changed: 173 additions & 4 deletions

File tree

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "OpenCL"
22
uuid = "08131aa3-fb12-5dee-8b74-c09406e224a2"
3-
version = "0.10.8"
3+
version = "0.10.9"
44

55
[deps]
66
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -38,7 +38,7 @@ Random = "1"
3838
Random123 = "1.7.1"
3939
RandomNumbers = "1.6.0"
4040
Reexport = "1"
41-
SPIRVIntrinsics = "0.5"
41+
SPIRVIntrinsics = "0.5.7"
4242
SPIRV_LLVM_Backend_jll = "20"
4343
SPIRV_Tools_jll = "2025.1"
4444
StaticArrays = "1"

lib/cl/device.jl

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,20 @@ end
139139
return tuple([Int(r) for r in result]...)
140140
end
141141

142+
# error handling inspired by rusticl
143+
# https://gitlab.freedesktop.org/mesa/mesa/-/blob/c4385d6fb0938231114eb3023082cd33788b89b4/src/gallium/frontends/rusticl/api/device.rs#L314-320
144+
if s == :sub_group_sizes
145+
res_size = Ref{Csize_t}()
146+
err = unchecked_clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, C_NULL, C_NULL, res_size)
147+
if err == CL_SUCCESS && res_size[] > 1
148+
result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t))
149+
clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL)
150+
return tuple([Int(r) for r in result]...)
151+
else
152+
return tuple(0, 1)
153+
end
154+
end
155+
142156
if s == :max_image2d_shape
143157
width = Ref{Csize_t}()
144158
height = Ref{Csize_t}()
@@ -273,3 +287,40 @@ function cl_device_type(dtype::Symbol)
273287
end
274288
return cl_dtype
275289
end
290+
291+
sub_groups_supported(d::Device) = "cl_khr_subgroups" in d.extensions || "cl_intel_subgroups" in d.extensions
292+
function sub_group_size(d::Device)
293+
sub_groups_supported(d) || 0
294+
if "cl_amd_device_attribute_query" in d.extensions
295+
scalar = Ref{cl_uint}()
296+
clGetDeviceInfo(d, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(cl_uint), scalar, C_NULL)
297+
return Int(scalar[])
298+
elseif "cl_nv_device_attribute_query" in d.extensions
299+
scalar = Ref{cl_uint}()
300+
clGetDeviceInfo(d, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), scalar, C_NULL)
301+
return Int(scalar[])
302+
else
303+
sg_sizes = d.sub_group_sizes
304+
return if length(sg_sizes) == 1
305+
Int(only(sg_sizes))
306+
elseif 32 in sg_sizes
307+
32
308+
elseif 64 in sg_sizes
309+
64
310+
elseif 16 in sg_sizes
311+
16
312+
else
313+
Int(first(sg_sizes))
314+
end
315+
end
316+
end
317+
function sub_group_shuffle_supported_types(d::Device)
318+
if "cl_khr_subgroup_shuffle" in d.extensions
319+
res = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float32]
320+
"cl_khr_fp16" in d.extensions && push!(res, Float16)
321+
"cl_khr_fp64" in d.extensions && push!(res, Float64)
322+
res
323+
else
324+
DataType[]
325+
end
326+
end

src/compiler/compilation.jl

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
## gpucompiler interface
22

3-
struct OpenCLCompilerParams <: AbstractCompilerParams end
3+
Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
4+
sub_group_size::Int # Some devices support multiple sizes. This is used to force one when needed
5+
end
6+
47
const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
58
const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams}
69

@@ -29,6 +32,12 @@ function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
2932
Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
3033
job, mod, entry)
3134

35+
# Set the subgroup size if supported
36+
sg_size = job.config.params.sub_group_size
37+
if sg_size >= 0
38+
metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(sg_size))])
39+
end
40+
3241
# if this kernel uses our RNG, we should prime the shared state.
3342
# XXX: these transformations should really happen at the Julia IR level...
3443
if haskey(functions(mod), "julia.opencl.random_keys") && job.config.kernel
@@ -131,9 +140,16 @@ end
131140
supports_fp16 = "cl_khr_fp16" in dev.extensions
132141
supports_fp64 = "cl_khr_fp64" in dev.extensions
133142

143+
# Set to -1 if specifying a subgroup size is not supported
144+
sub_group_size = if "cl_intel_required_subgroup_size" in dev.extensions
145+
cl.sub_group_size(dev)
146+
else
147+
-1
148+
end
149+
134150
# create GPUCompiler objects
135151
target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, validate=true, kwargs...)
136-
params = OpenCLCompilerParams()
152+
params = OpenCLCompilerParams(; sub_group_size)
137153
CompilerConfig(target, params; kernel, name, always_inline)
138154
end
139155

test/intrinsics.jl

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]
2020

2121
@testset "barrier" begin
2222

23+
# work-group
2324
@on_device barrier(OpenCL.LOCAL_MEM_FENCE)
2425
@on_device barrier(OpenCL.GLOBAL_MEM_FENCE)
2526
@on_device barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
@@ -38,6 +39,10 @@ const simd_ns = (Sys.iswindows() && ispocl) ? [3, 4] : [2, 3, 4, 8, 16]
3839
cl.memory_backend() isa cl.SVMBackend && @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_all_svm_devices)
3940
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_sub_group)
4041

42+
# sub-group
43+
@on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE)
44+
@on_device sub_group_barrier(OpenCL.GLOBAL_MEM_FENCE)
45+
@on_device sub_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
4146
end
4247

4348
@testset "mem_fence" begin
@@ -166,6 +171,103 @@ end
166171
@test call_on_device(OpenCL.mad, x, y, z) x * y + z
167172
end
168173

174+
if cl.sub_groups_supported(cl.device())
175+
176+
struct SubgroupData
177+
sub_group_size::UInt32
178+
max_sub_group_size::UInt32
179+
num_sub_groups::UInt32
180+
sub_group_id::UInt32
181+
sub_group_local_id::UInt32
182+
end
183+
function test_subgroup_kernel(results)
184+
i = get_global_id(1)
185+
186+
if i <= length(results)
187+
@inbounds results[i] = SubgroupData(
188+
get_sub_group_size(),
189+
get_max_sub_group_size(),
190+
get_num_sub_groups(),
191+
get_sub_group_id(),
192+
get_sub_group_local_id()
193+
)
194+
end
195+
return
196+
end
197+
198+
@testset "Sub-groups" begin
199+
sg_size = cl.sub_group_size(cl.device())
200+
201+
@testset "Indexing intrinsics" begin
202+
# Test with small kernel
203+
sg_n = 2
204+
local_size = sg_size * sg_n
205+
numworkgroups = 2
206+
N = local_size * numworkgroups
207+
208+
results = CLVector{SubgroupData}(undef, N)
209+
kernel = @opencl launch = false test_subgroup_kernel(results)
210+
211+
kernel(results; local_size, global_size=N)
212+
213+
host_results = Array(results)
214+
215+
# Verify results make sense
216+
for (i, sg_data) in enumerate(host_results)
217+
@test sg_data.sub_group_size == sg_size
218+
@test sg_data.max_sub_group_size == sg_size
219+
@test sg_data.num_sub_groups == sg_n
220+
221+
# Group ID should be 1-based
222+
expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
223+
@test sg_data.sub_group_id == expected_sub_group
224+
225+
# Local ID should be 1-based within group
226+
expected_sg_local = ((i - 1) % sg_size) + 1
227+
@test sg_data.sub_group_local_id == expected_sg_local
228+
end
229+
end
230+
231+
@testset "shuffle idx" begin
232+
function shfl_idx_kernel(d)
233+
i = get_sub_group_local_id()
234+
j = get_sub_group_size() - i + 0x1
235+
236+
d[i] = sub_group_shuffle(d[i], j)
237+
238+
return
239+
end
240+
241+
@testset for T in cl.sub_group_shuffle_supported_types(cl.device())
242+
a = rand(T, sg_size)
243+
d_a = CLArray(a)
244+
@opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
245+
@test Array(d_a) == reverse(a)
246+
end
247+
end
248+
@testset "shuffle xor" begin
249+
function shfl_xor_kernel(in)
250+
i = get_sub_group_local_id()
251+
252+
# val = in[i]
253+
new_val = sub_group_shuffle_xor(in[i], 1)
254+
255+
in[i] = new_val
256+
return
257+
end
258+
259+
# tests that each pair of values a get swapped using sub_group_shuffle_xor
260+
@testset for T in cl.sub_group_shuffle_supported_types(cl.device())
261+
in = rand(T, sg_size)
262+
idxs = xor.(0:(sg_size - 1), 1) .+ 1
263+
d_in = CLArray(in)
264+
@opencl local_size = sg_size global_size = sg_size shfl_xor_kernel(d_in)
265+
@test Array(d_in) == in[idxs]
266+
end
267+
end
268+
end
269+
end # if cl.sub_groups_supported(cl.device())
270+
169271
@testset "SIMD - $N x $T" for N in simd_ns, T in float_types
170272
# codegen emits i48 here, which SPIR-V doesn't support
171273
# XXX: fix upstream?

0 commit comments

Comments
 (0)