Skip to content

Commit bcf9377

Browse files
authored
Memory reclaim fixes (#565)
* Sparse matrix release queue * Memory tracking for GC * Flush deferred sparse releases during memory reclaim * Format
1 parent 457a862 commit bcf9377

File tree

5 files changed

+103
-6
lines changed

5 files changed

+103
-6
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "oneAPI"
22
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
33
authors = ["Tim Besard <tim.besard@gmail.com>", "Alexis Montoison", "Michel Schanen <michel.schanen@gmail.com>"]
4-
version = "2.6.0"
4+
version = "2.6.1"
55

66
[deps]
77
AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"

lib/level-zero/utils.jl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,24 @@
11
isdebug(group) = Base.CoreLogging.current_logger_for_env(Base.CoreLogging.Debug, group, oneL0) !== nothing
22

3+
# Registered callbacks invoked during memory reclamation (e.g., flushing deferred MKL
4+
# sparse handle releases). Extensions like oneMKL can register cleanup functions here
5+
# so they run when Level Zero reports OOM or when proactive GC fires.
6+
const _reclaim_callbacks = Function[]
7+
8+
function register_reclaim_callback!(f::Function)
9+
return push!(_reclaim_callbacks, f)
10+
end
11+
12+
function _run_reclaim_callbacks()
13+
for cb in _reclaim_callbacks
14+
try
15+
cb()
16+
catch
17+
end
18+
end
19+
return
20+
end
21+
322
function retry_reclaim(f, isfailed)
423
ret = f()
524

@@ -11,6 +30,12 @@ function retry_reclaim(f, isfailed)
1130
GC.gc(false)
1231
elseif phase == 2
1332
GC.gc(true)
33+
elseif phase == 3
34+
# After GC, finalizers may have deferred resource releases (e.g., MKL
35+
# sparse handles). Flush them now, then GC again to free the memory
36+
# those releases made available.
37+
_run_reclaim_callbacks()
38+
GC.gc(true)
1439
else
1540
break
1641
end

lib/mkl/oneMKL.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ include("linalg.jl")
3131
include("interfaces.jl")
3232
include("fft.jl")
3333

34+
# Register deferred sparse handle flush as a memory reclaim callback so that OOM
35+
# recovery (retry_reclaim) and proactive GC (_maybe_gc) can free MKL internal buffers
36+
# associated with sparse matrix handles that were deferred from finalizer threads.
37+
oneL0.register_reclaim_callback!(flush_deferred_sparse_releases)
38+
3439
function version()
3540
major = Ref{Int64}()
3641
minor = Ref{Int64}()

lib/mkl/wrappers_sparse.jl

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,41 @@
1+
# Deferred release queue for sparse matrix handles.
2+
# Finalizers run on the GC thread, but onemklXsparse_release_matrix_handle submits
3+
# work to the SYCL queue. Using the same queue from the GC thread and the main thread
4+
# concurrently is not safe and causes ZE_RESULT_ERROR_DEVICE_LOST / ZE_RESULT_ERROR_UNKNOWN.
5+
# Instead, finalizers push handles here and they are released on the main thread.
6+
const _deferred_sparse_handles = Vector{matrix_handle_t}()
7+
const _deferred_sparse_handles_lock = ReentrantLock()
8+
19
function sparse_release_matrix_handle(A::oneAbstractSparseMatrix)
210
return if A.handle !== nothing
11+
lock(_deferred_sparse_handles_lock) do
12+
push!(_deferred_sparse_handles, A.handle)
13+
end
14+
end
15+
end
16+
17+
function flush_deferred_sparse_releases()
18+
handles = lock(_deferred_sparse_handles_lock) do
19+
if isempty(_deferred_sparse_handles)
20+
return matrix_handle_t[]
21+
end
22+
h = copy(_deferred_sparse_handles)
23+
empty!(_deferred_sparse_handles)
24+
return h
25+
end
26+
isempty(handles) && return
27+
dev = device()
28+
ctx = context()
29+
queue = global_queue(ctx, dev)
30+
for handle in handles
331
try
4-
queue = global_queue(context(A.nzVal), device(A.nzVal))
5-
handle_ptr = Ref{matrix_handle_t}(A.handle)
32+
handle_ptr = Ref{matrix_handle_t}(handle)
633
onemklXsparse_release_matrix_handle(sycl_queue(queue), handle_ptr)
7-
# Only synchronize after successful release to ensure completion
8-
synchronize(queue)
934
catch err
10-
# Don't let finalizer errors crash the program
1135
@warn "Error releasing sparse matrix handle" exception = err
1236
end
1337
end
38+
return synchronize(queue)
1439
end
1540

1641
for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int32),
@@ -27,6 +52,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int3
2752
rowPtr::oneVector{$intty}, colVal::oneVector{$intty},
2853
nzVal::oneVector{$elty}, dims::NTuple{2, Int}
2954
)
55+
flush_deferred_sparse_releases()
3056
handle_ptr = Ref{matrix_handle_t}()
3157
onemklXsparse_init_matrix_handle(handle_ptr)
3258
m, n = dims
@@ -47,6 +73,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int3
4773
colPtr::oneVector{$intty}, rowVal::oneVector{$intty},
4874
nzVal::oneVector{$elty}, dims::NTuple{2, Int}
4975
)
76+
flush_deferred_sparse_releases()
5077
queue = global_queue(context(nzVal), device(nzVal))
5178
handle_ptr = Ref{matrix_handle_t}()
5279
onemklXsparse_init_matrix_handle(handle_ptr)
@@ -106,6 +133,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_coo_data , :Float32 , :Int3
106133
(:onemklZsparse_set_coo_data_64, :ComplexF64, :Int64))
107134
@eval begin
108135
function oneSparseMatrixCOO(A::SparseMatrixCSC{$elty, $intty})
136+
flush_deferred_sparse_releases()
109137
handle_ptr = Ref{matrix_handle_t}()
110138
onemklXsparse_init_matrix_handle(handle_ptr)
111139
m, n = size(A)

src/pool.jl

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,41 @@
1+
# Track total allocated GPU memory (device + shared buffers) for proactive GC.
2+
# This mirrors AMDGPU.jl's approach: trigger GC before OOM so that finalizers
3+
# can free stale GPU buffers that Julia's GC hasn't collected yet (Julia's GC
4+
# only sees CPU memory pressure, not GPU memory pressure).
5+
const _allocated_bytes = Threads.Atomic{Int64}(0)
6+
const _total_mem_cache = Threads.Atomic{Int64}(0)
7+
8+
function _get_total_mem(dev)
9+
cached = _total_mem_cache[]
10+
cached > 0 && return cached
11+
total = only(oneL0.memory_properties(dev)).totalSize
12+
Threads.atomic_cas!(_total_mem_cache, Int64(0), Int64(total))
13+
return _total_mem_cache[]
14+
end
15+
16+
function _maybe_gc(dev, bytes)
17+
allocated = _allocated_bytes[]
18+
allocated <= 0 && return
19+
total_mem = _get_total_mem(dev)
20+
return if allocated + bytes > total_mem * 0.8
21+
# Flush deferred resource releases (e.g., MKL sparse handles) from previous GC
22+
# cycles first — these are safe to release now because they were deferred earlier.
23+
# Do this BEFORE GC to avoid racing with new finalizers.
24+
oneL0._run_reclaim_callbacks()
25+
# Full GC to collect old-generation objects whose finalizers free GPU memory.
26+
GC.gc(true)
27+
elseif allocated + bytes > total_mem * 0.4
28+
GC.gc(false)
29+
end
30+
end
31+
132
function allocate(::Type{oneL0.DeviceBuffer}, ctx, dev, bytes::Int, alignment::Int)
233
bytes == 0 && return oneL0.DeviceBuffer(ZE_NULL, bytes, ctx, dev)
334

35+
_maybe_gc(dev, bytes)
436
buf = device_alloc(ctx, dev, bytes, alignment)
537
make_resident(ctx, dev, buf)
38+
Threads.atomic_add!(_allocated_bytes, Int64(bytes))
639

740
return buf
841
end
@@ -12,8 +45,10 @@ function allocate(::Type{oneL0.SharedBuffer}, ctx, dev, bytes::Int, alignment::I
1245

1346
# TODO: support cross-device shared buffers (by setting `dev=nothing`)
1447

48+
_maybe_gc(dev, bytes)
1549
buf = shared_alloc(ctx, dev, bytes, alignment)
1650
make_resident(ctx, dev, buf)
51+
Threads.atomic_add!(_allocated_bytes, Int64(bytes))
1752

1853
return buf
1954
end
@@ -26,6 +61,10 @@ end
2661
function release(buf::oneL0.AbstractBuffer)
2762
sizeof(buf) == 0 && return
2863

64+
if buf isa oneL0.DeviceBuffer || buf isa oneL0.SharedBuffer
65+
Threads.atomic_sub!(_allocated_bytes, Int64(sizeof(buf)))
66+
end
67+
2968
# XXX: is it necessary to evict memory if we are going to free it?
3069
# this is racy, because eviction is not queue-ordered, and
3170
# we don't want to synchronize inside what could have been a

0 commit comments

Comments
 (0)