Skip to content

Commit bcf9377

Browse files
authored
Memory reclaim fixes (#565)
* Sparse matrix release queue * Memory tracking for GC * Flush deferred sparse releases during memory reclaim * Format
1 parent 457a862 commit bcf9377

File tree

5 files changed

+103
-6
lines changed

5 files changed

+103
-6
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "oneAPI"
22
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
33
authors = ["Tim Besard <tim.besard@gmail.com>", "Alexis Montoison", "Michel Schanen <michel.schanen@gmail.com>"]
4-
version = "2.6.0"
4+
version = "2.6.1"
55

66
[deps]
77
AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"

lib/level-zero/utils.jl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,24 @@
11
isdebug(group) = Base.CoreLogging.current_logger_for_env(Base.CoreLogging.Debug, group, oneL0) !== nothing
22

3+
# Registered callbacks invoked during memory reclamation (e.g., flushing deferred MKL
4+
# sparse handle releases). Extensions like oneMKL can register cleanup functions here
5+
# so they run when Level Zero reports OOM or when proactive GC fires.
6+
const _reclaim_callbacks = Function[]
7+
8+
function register_reclaim_callback!(f::Function)
9+
return push!(_reclaim_callbacks, f)
10+
end
11+
12+
function _run_reclaim_callbacks()
13+
for cb in _reclaim_callbacks
14+
try
15+
cb()
16+
catch
17+
end
18+
end
19+
return
20+
end
21+
322
function retry_reclaim(f, isfailed)
423
ret = f()
524

@@ -11,6 +30,12 @@ function retry_reclaim(f, isfailed)
1130
GC.gc(false)
1231
elseif phase == 2
1332
GC.gc(true)
33+
elseif phase == 3
34+
# After GC, finalizers may have deferred resource releases (e.g., MKL
35+
# sparse handles). Flush them now, then GC again to free the memory
36+
# those releases made available.
37+
_run_reclaim_callbacks()
38+
GC.gc(true)
1439
else
1540
break
1641
end

lib/mkl/oneMKL.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ include("linalg.jl")
3131
include("interfaces.jl")
3232
include("fft.jl")
3333

34+
# Register deferred sparse handle flush as a memory reclaim callback so that OOM
35+
# recovery (retry_reclaim) and proactive GC (_maybe_gc) can free MKL internal buffers
36+
# associated with sparse matrix handles that were deferred from finalizer threads.
37+
oneL0.register_reclaim_callback!(flush_deferred_sparse_releases)
38+
3439
function version()
3540
major = Ref{Int64}()
3641
minor = Ref{Int64}()

lib/mkl/wrappers_sparse.jl

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,41 @@
1+
# Deferred release queue for sparse matrix handles.
2+
# Finalizers run on the GC thread, but onemklXsparse_release_matrix_handle submits
3+
# work to the SYCL queue. Using the same queue from the GC thread and the main thread
4+
# concurrently is not safe and causes ZE_RESULT_ERROR_DEVICE_LOST / ZE_RESULT_ERROR_UNKNOWN.
5+
# Instead, finalizers push handles here and they are released on the main thread.
6+
const _deferred_sparse_handles = Vector{matrix_handle_t}()
7+
const _deferred_sparse_handles_lock = ReentrantLock()
8+
19
function sparse_release_matrix_handle(A::oneAbstractSparseMatrix)
210
return if A.handle !== nothing
11+
lock(_deferred_sparse_handles_lock) do
12+
push!(_deferred_sparse_handles, A.handle)
13+
end
14+
end
15+
end
16+
17+
function flush_deferred_sparse_releases()
18+
handles = lock(_deferred_sparse_handles_lock) do
19+
if isempty(_deferred_sparse_handles)
20+
return matrix_handle_t[]
21+
end
22+
h = copy(_deferred_sparse_handles)
23+
empty!(_deferred_sparse_handles)
24+
return h
25+
end
26+
isempty(handles) && return
27+
dev = device()
28+
ctx = context()
29+
queue = global_queue(ctx, dev)
30+
for handle in handles
331
try
4-
queue = global_queue(context(A.nzVal), device(A.nzVal))
5-
handle_ptr = Ref{matrix_handle_t}(A.handle)
32+
handle_ptr = Ref{matrix_handle_t}(handle)
633
onemklXsparse_release_matrix_handle(sycl_queue(queue), handle_ptr)
7-
# Only synchronize after successful release to ensure completion
8-
synchronize(queue)
934
catch err
10-
# Don't let finalizer errors crash the program
1135
@warn "Error releasing sparse matrix handle" exception = err
1236
end
1337
end
38+
return synchronize(queue)
1439
end
1540

1641
for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int32),
@@ -27,6 +52,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int3
2752
rowPtr::oneVector{$intty}, colVal::oneVector{$intty},
2853
nzVal::oneVector{$elty}, dims::NTuple{2, Int}
2954
)
55+
flush_deferred_sparse_releases()
3056
handle_ptr = Ref{matrix_handle_t}()
3157
onemklXsparse_init_matrix_handle(handle_ptr)
3258
m, n = dims
@@ -47,6 +73,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int3
4773
colPtr::oneVector{$intty}, rowVal::oneVector{$intty},
4874
nzVal::oneVector{$elty}, dims::NTuple{2, Int}
4975
)
76+
flush_deferred_sparse_releases()
5077
queue = global_queue(context(nzVal), device(nzVal))
5178
handle_ptr = Ref{matrix_handle_t}()
5279
onemklXsparse_init_matrix_handle(handle_ptr)
@@ -106,6 +133,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_coo_data , :Float32 , :Int3
106133
(:onemklZsparse_set_coo_data_64, :ComplexF64, :Int64))
107134
@eval begin
108135
function oneSparseMatrixCOO(A::SparseMatrixCSC{$elty, $intty})
136+
flush_deferred_sparse_releases()
109137
handle_ptr = Ref{matrix_handle_t}()
110138
onemklXsparse_init_matrix_handle(handle_ptr)
111139
m, n = size(A)

src/pool.jl

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,41 @@
1+
# Track total allocated GPU memory (device + shared buffers) for proactive GC.
2+
# This mirrors AMDGPU.jl's approach: trigger GC before OOM so that finalizers
3+
# can free stale GPU buffers that Julia's GC hasn't collected yet (Julia's GC
4+
# only sees CPU memory pressure, not GPU memory pressure).
5+
const _allocated_bytes = Threads.Atomic{Int64}(0)
6+
const _total_mem_cache = Threads.Atomic{Int64}(0)
7+
8+
function _get_total_mem(dev)
9+
cached = _total_mem_cache[]
10+
cached > 0 && return cached
11+
total = only(oneL0.memory_properties(dev)).totalSize
12+
Threads.atomic_cas!(_total_mem_cache, Int64(0), Int64(total))
13+
return _total_mem_cache[]
14+
end
15+
16+
function _maybe_gc(dev, bytes)
17+
allocated = _allocated_bytes[]
18+
allocated <= 0 && return
19+
total_mem = _get_total_mem(dev)
20+
return if allocated + bytes > total_mem * 0.8
21+
# Flush deferred resource releases (e.g., MKL sparse handles) from previous GC
22+
# cycles first — these are safe to release now because they were deferred earlier.
23+
# Do this BEFORE GC to avoid racing with new finalizers.
24+
oneL0._run_reclaim_callbacks()
25+
# Full GC to collect old-generation objects whose finalizers free GPU memory.
26+
GC.gc(true)
27+
elseif allocated + bytes > total_mem * 0.4
28+
GC.gc(false)
29+
end
30+
end
31+
132
function allocate(::Type{oneL0.DeviceBuffer}, ctx, dev, bytes::Int, alignment::Int)
233
bytes == 0 && return oneL0.DeviceBuffer(ZE_NULL, bytes, ctx, dev)
334

35+
_maybe_gc(dev, bytes)
436
buf = device_alloc(ctx, dev, bytes, alignment)
537
make_resident(ctx, dev, buf)
38+
Threads.atomic_add!(_allocated_bytes, Int64(bytes))
639

740
return buf
841
end
@@ -12,8 +45,10 @@ function allocate(::Type{oneL0.SharedBuffer}, ctx, dev, bytes::Int, alignment::I
1245

1346
# TODO: support cross-device shared buffers (by setting `dev=nothing`)
1447

48+
_maybe_gc(dev, bytes)
1549
buf = shared_alloc(ctx, dev, bytes, alignment)
1650
make_resident(ctx, dev, buf)
51+
Threads.atomic_add!(_allocated_bytes, Int64(bytes))
1752

1853
return buf
1954
end
@@ -26,6 +61,10 @@ end
2661
function release(buf::oneL0.AbstractBuffer)
2762
sizeof(buf) == 0 && return
2863

64+
if buf isa oneL0.DeviceBuffer || buf isa oneL0.SharedBuffer
65+
Threads.atomic_sub!(_allocated_bytes, Int64(sizeof(buf)))
66+
end
67+
2968
# XXX: is it necessary to evict memory if we are going to free it?
3069
# this is racy, because eviction is not queue-ordered, and
3170
# we don't want to synchronize inside what could have been a

0 commit comments

Comments
 (0)