Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/uct/ib/base/ib_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,12 @@ ucs_config_field_t uct_ib_md_config_table[] = {
"Enable DMA-BUF in GDA.",
ucs_offsetof(uct_ib_md_config_t, ext.gda_dmabuf_enable), UCS_CONFIG_TYPE_TERNARY},

{"GDA_RETAIN_INACTIVE_CTX", "n",
"Retain and use an inactive CUDA primary context to query device "
"capabilities.",
ucs_offsetof(uct_ib_md_config_t, ext.gda_retain_inactive_ctx),
UCS_CONFIG_TYPE_BOOL},

{"PCI_BW", "",
"Maximum effective data transfer rate of PCI bus connected to HCA\n",
ucs_offsetof(uct_ib_md_config_t, pci_bw), UCS_CONFIG_TYPE_ARRAY(pci_bw)},
Expand Down
2 changes: 2 additions & 0 deletions src/uct/ib/base/ib_md.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ typedef struct uct_ib_md_ext_config {
int direct_nic; /**< Direct NIC with GPU functionality */
unsigned gda_max_hca_per_gpu; /**< Threshold of IB per GPU */
int gda_dmabuf_enable; /**< Enable DMA-BUF in GDA */
    /** Retain and use an inactive CUDA primary context to query device capabilities */
int gda_retain_inactive_ctx;
} uct_ib_md_ext_config_t;


Expand Down
158 changes: 90 additions & 68 deletions src/uct/ib/mlx5/gdaki/gdaki.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <uct/ib/mlx5/rc/rc_mlx5.h>
#include <uct/cuda/cuda_copy/cuda_copy_md.h>
#include <uct/cuda/base/cuda_util.h>
#include <uct/cuda/base/cuda_ctx.h>

#include "gpunetio/common/doca_gpunetio_verbs_def.h"

Expand Down Expand Up @@ -126,29 +127,17 @@ static void uct_rc_gdaki_calc_dev_ep_layout(size_t num_channels, size_t wq_len,

static int uct_gdaki_check_umem_dmabuf(const uct_ib_md_t *md)
{
ucs_status_t status = UCS_ERR_UNSUPPORTED;
ucs_status_t ret = 0;
#if HAVE_DECL_MLX5DV_UMEM_MASK_DMABUF
struct mlx5dv_devx_umem_in umem_in = {};
struct mlx5dv_devx_umem *umem;
uct_cuda_copy_md_dmabuf_t dmabuf;
CUdeviceptr buff;
CUcontext cuda_ctx;

status = UCT_CUDADRV_FUNC_LOG_ERR(cuDevicePrimaryCtxRetain(&cuda_ctx, 0));
if (status != UCS_OK) {
if (UCT_CUDADRV_FUNC_LOG_ERR(cuMemAlloc(&buff, 1)) != UCS_OK) {
return 0;
}

status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(cuda_ctx));
if (status != UCS_OK) {
goto out_ctx_release;
}

status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemAlloc(&buff, 1));
if (status != UCS_OK) {
goto out_ctx_pop;
}

dmabuf = uct_cuda_copy_md_get_dmabuf((void*)buff, 1,
UCS_SYS_DEVICE_ID_UNKNOWN);

Expand All @@ -160,22 +149,18 @@ static int uct_gdaki_check_umem_dmabuf(const uct_ib_md_t *md)
umem_in.dmabuf_fd = dmabuf.fd;

umem = mlx5dv_devx_umem_reg_ex(md->dev.ibv_context, &umem_in);
if (umem == NULL) {
status = UCS_ERR_NO_MEMORY;
goto out_free;
if (umem != NULL) {
mlx5dv_devx_umem_dereg(umem);
ret = 1;
} else {
ret = 0;
}

mlx5dv_devx_umem_dereg(umem);
out_free:
ucs_close_fd(&dmabuf.fd);
cuMemFree(buff);
out_ctx_pop:
UCT_CUDADRV_FUNC_LOG_WARN(cuCtxPopCurrent(NULL));
out_ctx_release:
UCT_CUDADRV_FUNC_LOG_WARN(cuDevicePrimaryCtxRelease(0));
(void)UCT_CUDADRV_FUNC_LOG_WARN(cuMemFree(buff));
#endif

return status == UCS_OK;
return ret;
}

static int uct_gdaki_is_dmabuf_supported(const uct_ib_md_t *md)
Expand Down Expand Up @@ -1134,44 +1119,26 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_gdaki_iface_t, uct_iface_t, uct_md_h,

static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_gdaki_iface_t, uct_iface_t);

static ucs_status_t
uct_gdaki_md_check_uar(uct_ib_mlx5_md_t *md, CUdevice cuda_dev)
static ucs_status_t uct_gdaki_md_check_uar(uct_ib_mlx5_md_t *md)
{
struct mlx5dv_devx_uar *uar;
ucs_status_t status;
CUcontext cuda_ctx;
unsigned flags;

status = uct_ib_mlx5_devx_alloc_uar(md, 0, &uar);
if (status != UCS_OK) {
goto out;
}

status = UCT_CUDADRV_FUNC_LOG_ERR(
cuDevicePrimaryCtxRetain(&cuda_ctx, cuda_dev));
if (status != UCS_OK) {
goto out_free_uar;
}

status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(cuda_ctx));
if (status != UCS_OK) {
goto out_ctx_release;
return status;
}

flags = CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP |
CU_MEMHOSTREGISTER_IOMEMORY;
status = UCT_CUDADRV_FUNC_LOG_DEBUG(
cuMemHostRegister(uar->reg_addr, UCT_IB_MLX5_BF_REG_SIZE, flags));
if (status == UCS_OK) {
UCT_CUDADRV_FUNC_LOG_DEBUG(cuMemHostUnregister(uar->reg_addr));
UCT_CUDADRV_FUNC_LOG_WARN(cuMemHostUnregister(uar->reg_addr));
}

UCT_CUDADRV_FUNC_LOG_WARN(cuCtxPopCurrent(NULL));
out_ctx_release:
UCT_CUDADRV_FUNC_LOG_WARN(cuDevicePrimaryCtxRelease(cuda_dev));
out_free_uar:
mlx5dv_devx_free_uar(uar);
out:
return status;
}

Expand All @@ -1196,7 +1163,7 @@ static int uct_gdaki_is_peermem_loaded(const uct_ib_md_t *md)
return peermem_loaded;
}

static int uct_gdaki_is_uar_supported(uct_ib_mlx5_md_t *md, CUdevice cu_device)
static int uct_gdaki_is_uar_supported(uct_ib_mlx5_md_t *md)
{
/**
* Save the result of UAR support in a global flag to avoid the overhead of
Expand All @@ -1209,7 +1176,7 @@ static int uct_gdaki_is_uar_supported(uct_ib_mlx5_md_t *md, CUdevice cu_device)
return uar_supported;
}

uar_supported = (uct_gdaki_md_check_uar(md, cu_device) == UCS_OK);
uar_supported = (uct_gdaki_md_check_uar(md) == UCS_OK);
if (uar_supported == 0) {
ucs_diag("GDAKI not supported, please add NVreg_RegistryDwords="
"\"PeerMappingOverride=1;\" option for nvidia kernel driver");
Expand Down Expand Up @@ -1369,6 +1336,80 @@ uct_gdaki_dev_matrix_init(const uct_ib_md_t *ib_md, size_t *dmat_length_p)
return dmat;
}

/*
 * Push a CUDA primary context onto the calling thread's context stack so that
 * context-dependent device capabilities can be queried afterwards.
 *
 * The primary context of the first device that already has an active one is
 * preferred. When no device has an active primary context and
 * @a retain_inactive_ctx is nonzero, the primary context of device 0 is
 * retained and pushed instead.
 *
 * @param [in] retain_inactive_ctx  Nonzero to allow falling back to retaining
 *                                  an inactive primary context on device 0.
 *
 * @return The device whose primary context was pushed, or CU_DEVICE_INVALID
 *         on failure (in which case nothing is left on the context stack).
 */
static CUdevice uct_gdaki_push_primary_ctx(int retain_inactive_ctx)
{
    ucs_status_t push_status;
    CUdevice device;

    push_status = uct_cuda_ctx_primary_push_first_active(&device);
    if (push_status == UCS_OK) {
        return device;
    }

    /* Any failure other than "no active context anywhere" is final */
    if (push_status != UCS_ERR_NO_DEVICE) {
        return CU_DEVICE_INVALID;
    }

    if (!retain_inactive_ctx) {
        ucs_diag("no active primary CUDA context on any device. Please set "
                 "UCX_IB_GDA_RETAIN_INACTIVE_CTX=yes to retain inactive "
                 "context.");
        return CU_DEVICE_INVALID;
    }

    /* Fall back: retain and push an inactive primary context on device 0 */
    if (UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&device, 0)) != UCS_OK) {
        return CU_DEVICE_INVALID;
    }

    if (uct_cuda_ctx_primary_push(device, 1, UCS_LOG_LEVEL_ERROR) != UCS_OK) {
        return CU_DEVICE_INVALID;
    }

    return device;
}

/*
 * Check the GDA features that require a current CUDA context: availability of
 * a GPU-direct RDMA path (dmabuf or peermem) and UAR mapping support.
 *
 * A primary context is pushed for the duration of the checks and popped (and
 * released) before returning. On the dmabuf path the
 * UCT_IB_MLX5_MD_FLAG_REG_DMABUF_UMEM flag is set on @a ib_mlx5_md as a side
 * effect.
 *
 * @param [in] ib_mlx5_md  mlx5 memory domain to check and update.
 *
 * @return Nonzero if all context-dependent features are supported, 0 otherwise
 *         (including when no CUDA context could be pushed).
 */
static int
uct_gdaki_check_cuda_ctx_dependent_features(uct_ib_mlx5_md_t *ib_mlx5_md)
{
    uct_ib_md_t *ib_md = &ib_mlx5_md->super;
    int supported      = 0;
    char enable_str[8];
    CUdevice cuda_dev;

    cuda_dev = uct_gdaki_push_primary_ctx(ib_md->config.gda_retain_inactive_ctx);
    if (cuda_dev == CU_DEVICE_INVALID) {
        return 0;
    }

    /* Pick a GPU-direct RDMA path: dmabuf when allowed and available,
     * otherwise peermem; if neither works, GDA is unavailable. */
    if ((ib_md->config.gda_dmabuf_enable != UCS_NO) &&
        uct_gdaki_is_dmabuf_supported(ib_md)) {
        ib_mlx5_md->flags |= UCT_IB_MLX5_MD_FLAG_REG_DMABUF_UMEM;
        ucs_debug("%s: using dmabuf for gda transport",
                  uct_ib_device_name(&ib_md->dev));
        supported = (uct_gdaki_is_uar_supported(ib_mlx5_md) != 0);
    } else if ((ib_md->config.gda_dmabuf_enable != UCS_YES) &&
               uct_gdaki_is_peermem_loaded(ib_md)) {
        ucs_debug("%s: using peermem for gda transport",
                  uct_ib_device_name(&ib_md->dev));
        supported = (uct_gdaki_is_uar_supported(ib_mlx5_md) != 0);
    } else {
        ucs_config_sprintf_ternary_auto(enable_str, sizeof(enable_str),
                                        &ib_md->config.gda_dmabuf_enable, NULL);
        ucs_diag("%s: GPU-direct RDMA is not available (GDA_DMABUF_ENABLE=%s)",
                 uct_ib_device_name(&ib_md->dev), enable_str);
    }

    uct_cuda_ctx_primary_pop_and_release(cuda_dev);
    return supported;
}

static ucs_status_t
uct_gdaki_query_tl_devices(uct_md_h tl_md,
uct_tl_device_resource_t **tl_devices_p,
Expand All @@ -1386,7 +1427,6 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md,
ucs_sys_device_t dev;
int i;
uct_gdaki_dev_matrix_elem_t *ibdesc;
char dmabuf_str[8];

UCS_INIT_ONCE(&dmat_once) {
dmat = uct_gdaki_dev_matrix_init(ib_md, &dmat_length);
Expand All @@ -1397,20 +1437,7 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md,
goto out;
}

if ((ib_md->config.gda_dmabuf_enable != UCS_NO) &&
uct_gdaki_is_dmabuf_supported(ib_md)) {
ib_mlx5_md->flags |= UCT_IB_MLX5_MD_FLAG_REG_DMABUF_UMEM;
ucs_debug("%s: using dmabuf for gda transport",
uct_ib_device_name(&ib_md->dev));
} else if ((ib_md->config.gda_dmabuf_enable != UCS_YES) &&
uct_gdaki_is_peermem_loaded(ib_md)) {
ucs_debug("%s: using peermem for gda transport",
uct_ib_device_name(&ib_md->dev));
} else {
ucs_config_sprintf_ternary_auto(dmabuf_str, sizeof(dmabuf_str),
&ib_md->config.gda_dmabuf_enable, NULL);
ucs_diag("%s: GPU-direct RDMA is not available (GDA_DMABUF_ENABLE=%s)",
uct_ib_device_name(&ib_md->dev), dmabuf_str);
if (!uct_gdaki_check_cuda_ctx_dependent_features(ib_mlx5_md)) {
status = UCS_ERR_NO_DEVICE;
goto out;
}
Expand Down Expand Up @@ -1444,11 +1471,6 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md,
goto err;
}

if (!uct_gdaki_is_uar_supported(ib_mlx5_md, device)) {
status = UCS_ERR_NO_DEVICE;
goto err;
}

dev = uct_cuda_get_sys_dev(device);

snprintf(tl_devices[num_tl_devices].name,
Expand Down