RDMA/uverbs: Add DMABUF object type and operations

Expose DMABUF functionality to userspace through the uverbs interface,
enabling InfiniBand/RDMA devices to export PCI-based memory regions
(e.g. device memory) as DMABUF file descriptors. This allows
zero-copy sharing of RDMA memory with other subsystems that support the
dma-buf framework.

A new UVERBS_OBJECT_DMABUF object type and allocation method were
introduced.

During allocation, uverbs invokes the driver to supply the
rdma_user_mmap_entry associated with the given page offset (pgoff).

Based on the returned rdma_user_mmap_entry, uverbs requests the driver
to provide the corresponding physical-memory details as well as the
driver's PCI provider information.

Using this information, dma_buf_export() is called; if it succeeds,
uobj->object is set to the underlying file pointer returned by the
dma-buf framework.

The file descriptor number follows the standard uverbs allocation flow,
but the file pointer comes from the dma-buf subsystem, including its own
fops and private data.

When an mmap entry is removed, uverbs iterates over its associated
DMABUFs, marks them as revoked, and calls dma_buf_move_notify() so that
their importers are notified.

The same procedure applies during the disassociate flow; final cleanup
occurs when the application closes the file.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20260201-dmabuf-export-v3-2-da238b614fe3@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
This commit is contained in:
Yishai Hadas 2026-02-01 16:34:05 +02:00 committed by Leon Romanovsky
parent 9ad95a0f2b
commit 0ac6f4056c
11 changed files with 286 additions and 12 deletions

View file

@ -33,6 +33,7 @@ ib_umad-y := user_mad.o
ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
uverbs_std_types_cq.o \
uverbs_std_types_dmabuf.o \
uverbs_std_types_dmah.o \
uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
uverbs_std_types_mr.o uverbs_std_types_counters.o \

View file

@ -2765,6 +2765,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, map_mr_sg);
SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
SET_DEVICE_OP(dev_ops, mmap);
SET_DEVICE_OP(dev_ops, mmap_get_pfns);
SET_DEVICE_OP(dev_ops, mmap_free);
SET_DEVICE_OP(dev_ops, modify_ah);
SET_DEVICE_OP(dev_ops, modify_cq);
@ -2775,6 +2776,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, modify_srq);
SET_DEVICE_OP(dev_ops, modify_wq);
SET_DEVICE_OP(dev_ops, peek_cq);
SET_DEVICE_OP(dev_ops, pgoff_to_mmap_entry);
SET_DEVICE_OP(dev_ops, pre_destroy_cq);
SET_DEVICE_OP(dev_ops, poll_cq);
SET_DEVICE_OP(dev_ops, port_groups);

View file

@ -5,9 +5,13 @@
* Copyright 2019 Marvell. All rights reserved.
*/
#include <linux/xarray.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include "uverbs.h"
#include "core_priv.h"
MODULE_IMPORT_NS("DMA_BUF");
/**
* rdma_umap_priv_init() - Initialize the private data of a vma
*
@ -229,12 +233,29 @@ EXPORT_SYMBOL(rdma_user_mmap_entry_put);
*/
void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
{
	struct ib_uverbs_dmabuf_file *uverbs_dmabuf, *tmp;

	if (!entry)
		return;

	mutex_lock(&entry->dmabufs_lock);
	/* Flag removal under the xarray lock so lookups see it atomically */
	xa_lock(&entry->ucontext->mmap_xa);
	entry->driver_removed = true;
	xa_unlock(&entry->ucontext->mmap_xa);
	/*
	 * Revoke every DMABUF exported from this entry: mark it revoked so
	 * further map_dma_buf calls fail, notify importers to move off, and
	 * wait for outstanding fences before tearing it down.
	 */
	list_for_each_entry_safe(uverbs_dmabuf, tmp, &entry->dmabufs, dmabufs_elm) {
		dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL);
		list_del(&uverbs_dmabuf->dmabufs_elm);
		uverbs_dmabuf->revoked = true;
		dma_buf_move_notify(uverbs_dmabuf->dmabuf);
		dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv,
				      DMA_RESV_USAGE_BOOKKEEP, false,
				      MAX_SCHEDULE_TIMEOUT);
		dma_resv_unlock(uverbs_dmabuf->dmabuf->resv);
		/* Drop the initial kref; comp fires when all unmaps are done */
		kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done);
		wait_for_completion(&uverbs_dmabuf->comp);
	}
	mutex_unlock(&entry->dmabufs_lock);

	kref_put(&entry->ref, rdma_user_mmap_entry_free);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
@ -274,6 +295,9 @@ int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext,
return -EINVAL;
kref_init(&entry->ref);
INIT_LIST_HEAD(&entry->dmabufs);
mutex_init(&entry->dmabufs_lock);
entry->ucontext = ucontext;
/*

View file

@ -809,21 +809,10 @@ const struct uverbs_obj_type_class uverbs_idr_class = {
};
EXPORT_SYMBOL(uverbs_idr_class);
/*
* Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct
* file_operations release method.
*/
int uverbs_uobject_fd_release(struct inode *inode, struct file *filp)
int uverbs_uobject_release(struct ib_uobject *uobj)
{
struct ib_uverbs_file *ufile;
struct ib_uobject *uobj;
/*
* This can only happen if the fput came from alloc_abort_fd_uobject()
*/
if (!filp->private_data)
return 0;
uobj = filp->private_data;
ufile = uobj->ufile;
if (down_read_trylock(&ufile->hw_destroy_rwsem)) {
@ -850,6 +839,21 @@ int uverbs_uobject_fd_release(struct inode *inode, struct file *filp)
uverbs_uobject_put(uobj);
return 0;
}
/*
 * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct
 * file_operations release method.
 */
int uverbs_uobject_fd_release(struct inode *inode, struct file *filp)
{
	struct ib_uobject *uobj = filp->private_data;

	/* A NULL private_data means the fput came from alloc_abort_fd_uobject() */
	if (!uobj)
		return 0;

	return uverbs_uobject_release(uobj);
}
EXPORT_SYMBOL(uverbs_uobject_fd_release);
/*

View file

@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[];
extern const struct uapi_definition uverbs_def_obj_cq[];
extern const struct uapi_definition uverbs_def_obj_device[];
extern const struct uapi_definition uverbs_def_obj_dm[];
extern const struct uapi_definition uverbs_def_obj_dmabuf[];
extern const struct uapi_definition uverbs_def_obj_dmah[];
extern const struct uapi_definition uverbs_def_obj_flow_action[];
extern const struct uapi_definition uverbs_def_obj_intf[];

View file

@ -133,6 +133,18 @@ struct ib_uverbs_completion_event_file {
struct ib_uverbs_event_queue ev_queue;
};
/*
 * State of a DMABUF exported over uverbs. The uobject's FD is backed by
 * the dma-buf file (uobj->object points at dmabuf->file).
 */
struct ib_uverbs_dmabuf_file {
	struct ib_uobject uobj;
	struct dma_buf *dmabuf;
	/* Entry on mmap_entry->dmabufs, protected by mmap_entry->dmabufs_lock */
	struct list_head dmabufs_elm;
	/* The mmap entry whose memory this dma-buf exports; holds a reference */
	struct rdma_user_mmap_entry *mmap_entry;
	/* Physical range and PCI P2PDMA provider supplied by the driver */
	struct phys_vec phys_vec;
	struct p2pdma_provider *provider;
	/* Taken per DMA mapping; final put completes @comp (ib_uverbs_dmabuf_done) */
	struct kref kref;
	struct completion comp;
	/* Once set, further map_dma_buf attempts fail with -ENODEV */
	u8 revoked :1;
};
struct ib_uverbs_event {
union {
struct ib_uverbs_async_event_desc async;
@ -290,4 +302,13 @@ ib_uverbs_get_async_event(struct uverbs_attr_bundle *attrs,
void copy_port_attr_to_resp(struct ib_port_attr *attr,
struct ib_uverbs_query_port_resp *resp,
struct ib_device *ib_dev, u8 port_num);
/* kref release callback: signal that the last mapping reference is gone */
static inline void ib_uverbs_dmabuf_done(struct kref *kref)
{
	struct ib_uverbs_dmabuf_file *uverbs_dmabuf =
		container_of(kref, struct ib_uverbs_dmabuf_file, kref);

	complete(&uverbs_dmabuf->comp);
}
#endif /* UVERBS_H */

View file

@ -0,0 +1,200 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
*/
#include <linux/dma-buf-mapping.h>
#include <linux/pci-p2pdma.h>
#include <linux/dma-resv.h>
#include <rdma/uverbs_std_types.h>
#include "rdma_core.h"
#include "uverbs.h"
/* Only peer-to-peer capable importers may attach to this PCI memory */
static int uverbs_dmabuf_attach(struct dma_buf *dmabuf,
				struct dma_buf_attachment *attachment)
{
	return attachment->peer2peer ? 0 : -EOPNOTSUPP;
}
/*
 * Build an sg_table for the exported physical range. Fails with -ENODEV
 * once the buffer has been revoked. Takes a kref per successful mapping;
 * the matching put is in uverbs_dmabuf_unmap().
 */
static struct sg_table *
uverbs_dmabuf_map(struct dma_buf_attachment *attachment,
		  enum dma_data_direction dir)
{
	struct ib_uverbs_dmabuf_file *uverbs_dmabuf = attachment->dmabuf->priv;
	struct sg_table *sgt;

	dma_resv_assert_held(uverbs_dmabuf->dmabuf->resv);

	if (uverbs_dmabuf->revoked)
		return ERR_PTR(-ENODEV);

	sgt = dma_buf_phys_vec_to_sgt(attachment, uverbs_dmabuf->provider,
				      &uverbs_dmabuf->phys_vec, 1,
				      uverbs_dmabuf->phys_vec.len, dir);
	if (!IS_ERR(sgt))
		kref_get(&uverbs_dmabuf->kref);

	return sgt;
}
/* Tear down a mapping created by uverbs_dmabuf_map() */
static void uverbs_dmabuf_unmap(struct dma_buf_attachment *attachment,
				struct sg_table *sgt,
				enum dma_data_direction dir)
{
	struct ib_uverbs_dmabuf_file *uverbs_dmabuf = attachment->dmabuf->priv;

	dma_resv_assert_held(uverbs_dmabuf->dmabuf->resv);

	dma_buf_free_sgt(attachment, sgt, dir);
	/* Drop the mapping reference taken in uverbs_dmabuf_map() */
	kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done);
}
/* Pinning the exported memory is not supported */
static int uverbs_dmabuf_pin(struct dma_buf_attachment *attach)
{
	return -EOPNOTSUPP;
}
/* Nothing to undo: pin() always fails */
static void uverbs_dmabuf_unpin(struct dma_buf_attachment *attach)
{
}
/* Called by the dma-buf core when the last reference to the file is dropped */
static void uverbs_dmabuf_release(struct dma_buf *dmabuf)
{
	struct ib_uverbs_dmabuf_file *priv = dmabuf->priv;

	/*
	 * This can only happen if the fput came from alloc_abort_fd_uobject()
	 */
	if (!priv->uobj.context)
		return;

	uverbs_uobject_release(&priv->uobj);
}
/* dma-buf exporter ops; dmabuf->priv is the owning ib_uverbs_dmabuf_file */
static const struct dma_buf_ops uverbs_dmabuf_ops = {
	.attach = uverbs_dmabuf_attach,
	.map_dma_buf = uverbs_dmabuf_map,
	.unmap_dma_buf = uverbs_dmabuf_unmap,
	.pin = uverbs_dmabuf_pin,
	.unpin = uverbs_dmabuf_unpin,
	.release = uverbs_dmabuf_release,
};
/*
 * Allocate a DMABUF FD exporting the PCI memory backing the driver mmap
 * entry identified by the given page offset.
 */
static int UVERBS_HANDLER(UVERBS_METHOD_DMABUF_ALLOC)(
	struct uverbs_attr_bundle *attrs)
{
	struct ib_uobject *uobj =
		uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE)
			->obj_attr.uobject;
	struct ib_uverbs_dmabuf_file *uverbs_dmabuf =
		container_of(uobj, struct ib_uverbs_dmabuf_file, uobj);
	struct ib_device *ib_dev = attrs->context->device;
	struct rdma_user_mmap_entry *mmap_entry;
	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
	off_t pg_off;
	int ret;

	ret = uverbs_get_const(&pg_off, attrs, UVERBS_ATTR_ALLOC_DMABUF_PGOFF);
	if (ret)
		return ret;

	/* Takes a reference on the entry; put in the error/destroy paths */
	mmap_entry = ib_dev->ops.pgoff_to_mmap_entry(attrs->context, pg_off);
	if (!mmap_entry)
		return -EINVAL;

	/* Driver supplies the physical range and its PCI P2PDMA provider */
	ret = ib_dev->ops.mmap_get_pfns(mmap_entry, &uverbs_dmabuf->phys_vec,
					&uverbs_dmabuf->provider);
	if (ret)
		goto err;

	exp_info.ops = &uverbs_dmabuf_ops;
	exp_info.size = uverbs_dmabuf->phys_vec.len;
	exp_info.flags = O_CLOEXEC;
	exp_info.priv = uverbs_dmabuf;

	uverbs_dmabuf->dmabuf = dma_buf_export(&exp_info);
	if (IS_ERR(uverbs_dmabuf->dmabuf)) {
		ret = PTR_ERR(uverbs_dmabuf->dmabuf);
		goto err;
	}

	kref_init(&uverbs_dmabuf->kref);
	init_completion(&uverbs_dmabuf->comp);
	INIT_LIST_HEAD(&uverbs_dmabuf->dmabufs_elm);
	/* Publish on the entry's list unless the entry was already removed */
	mutex_lock(&mmap_entry->dmabufs_lock);
	if (mmap_entry->driver_removed)
		ret = -EIO;
	else
		list_add_tail(&uverbs_dmabuf->dmabufs_elm, &mmap_entry->dmabufs);
	mutex_unlock(&mmap_entry->dmabufs_lock);
	if (ret)
		goto err_revoked;

	/* The uobject's FD is backed by the dma-buf file */
	uobj->object = uverbs_dmabuf->dmabuf->file;
	uverbs_dmabuf->mmap_entry = mmap_entry;
	uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE);
	return 0;

err_revoked:
	dma_buf_put(uverbs_dmabuf->dmabuf);
err:
	rdma_user_mmap_entry_put(mmap_entry);
	return ret;
}
/* DMABUF_ALLOC attrs: new FD handle out, page offset (u64) in */
DECLARE_UVERBS_NAMED_METHOD(
	UVERBS_METHOD_DMABUF_ALLOC,
	UVERBS_ATTR_FD(UVERBS_ATTR_ALLOC_DMABUF_HANDLE,
		       UVERBS_OBJECT_DMABUF,
		       UVERBS_ACCESS_NEW,
		       UA_MANDATORY),
	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMABUF_PGOFF,
			   UVERBS_ATTR_TYPE(u64),
			   UA_MANDATORY));
/*
 * FD destroy path: revoke the export (unless mmap-entry removal already
 * did), wait for importers to unmap, then drop the mmap entry reference.
 */
static void uverbs_dmabuf_fd_destroy_uobj(struct ib_uobject *uobj,
					  enum rdma_remove_reason why)
{
	struct ib_uverbs_dmabuf_file *uverbs_dmabuf =
		container_of(uobj, struct ib_uverbs_dmabuf_file, uobj);
	bool wait_for_comp = false;

	mutex_lock(&uverbs_dmabuf->mmap_entry->dmabufs_lock);
	dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL);
	/* Skip if rdma_user_mmap_entry_remove() already revoked this buffer */
	if (!uverbs_dmabuf->revoked) {
		uverbs_dmabuf->revoked = true;
		list_del(&uverbs_dmabuf->dmabufs_elm);
		/* Tell importers their mappings are going away */
		dma_buf_move_notify(uverbs_dmabuf->dmabuf);
		dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv,
				      DMA_RESV_USAGE_BOOKKEEP, false,
				      MAX_SCHEDULE_TIMEOUT);
		wait_for_comp = true;
	}
	dma_resv_unlock(uverbs_dmabuf->dmabuf->resv);
	if (wait_for_comp) {
		/* Drop the initial kref taken at allocation */
		kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done);
		/* Let's wait till all DMA unmap are completed. */
		wait_for_completion(&uverbs_dmabuf->comp);
	}
	mutex_unlock(&uverbs_dmabuf->mmap_entry->dmabufs_lock);

	/* Matches the get done as part of pgoff_to_mmap_entry() */
	rdma_user_mmap_entry_put(uverbs_dmabuf->mmap_entry);
}
/*
 * FD-based object; the file's fops/private data come from the dma-buf
 * subsystem rather than uverbs, hence NULL fops/name here.
 */
DECLARE_UVERBS_NAMED_OBJECT(
	UVERBS_OBJECT_DMABUF,
	UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_dmabuf_file),
			     uverbs_dmabuf_fd_destroy_uobj,
			     NULL, NULL, O_RDONLY),
	&UVERBS_METHOD(UVERBS_METHOD_DMABUF_ALLOC));

/* Exposed only for devices implementing both new mmap-related ops */
const struct uapi_definition uverbs_def_obj_dmabuf[] = {
	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMABUF),
	UAPI_DEF_OBJ_NEEDS_FN(mmap_get_pfns),
	UAPI_DEF_OBJ_NEEDS_FN(pgoff_to_mmap_entry),
	{}
};

View file

@ -631,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = {
UAPI_DEF_CHAIN(uverbs_def_obj_cq),
UAPI_DEF_CHAIN(uverbs_def_obj_device),
UAPI_DEF_CHAIN(uverbs_def_obj_dm),
UAPI_DEF_CHAIN(uverbs_def_obj_dmabuf),
UAPI_DEF_CHAIN(uverbs_def_obj_dmah),
UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
UAPI_DEF_CHAIN(uverbs_def_obj_intf),

View file

@ -44,6 +44,7 @@
#include <uapi/rdma/rdma_user_ioctl.h>
#include <uapi/rdma/ib_user_ioctl_verbs.h>
#include <linux/pci-tph.h>
#include <linux/dma-buf.h>
#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN
@ -2364,6 +2365,9 @@ struct rdma_user_mmap_entry {
unsigned long start_pgoff;
size_t npages;
bool driver_removed;
/* protects access to dmabufs */
struct mutex dmabufs_lock;
struct list_head dmabufs;
};
/* Return the offset (in bytes) the user should pass to libc's mmap() */
@ -2501,6 +2505,11 @@ struct ib_device_ops {
* Therefore needs to be implemented by the driver in mmap_free.
*/
void (*mmap_free)(struct rdma_user_mmap_entry *entry);
int (*mmap_get_pfns)(struct rdma_user_mmap_entry *entry,
struct phys_vec *phys_vec,
struct p2pdma_provider **provider);
struct rdma_user_mmap_entry *(*pgoff_to_mmap_entry)(struct ib_ucontext *ucontext,
off_t pg_off);
void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);

View file

@ -186,6 +186,7 @@ struct ib_uverbs_file {
extern const struct uverbs_obj_type_class uverbs_idr_class;
extern const struct uverbs_obj_type_class uverbs_fd_class;
int uverbs_uobject_fd_release(struct inode *inode, struct file *filp);
int uverbs_uobject_release(struct ib_uobject *uobj);
#define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \
sizeof(char))

View file

@ -56,6 +56,7 @@ enum uverbs_default_objects {
UVERBS_OBJECT_COUNTERS,
UVERBS_OBJECT_ASYNC_EVENT,
UVERBS_OBJECT_DMAH,
UVERBS_OBJECT_DMABUF,
};
enum {
@ -263,6 +264,15 @@ enum uverbs_methods_dmah {
UVERBS_METHOD_DMAH_FREE,
};
/* Attributes of UVERBS_METHOD_DMABUF_ALLOC */
enum uverbs_attrs_alloc_dmabuf_cmd_attr_ids {
	UVERBS_ATTR_ALLOC_DMABUF_HANDLE,
	UVERBS_ATTR_ALLOC_DMABUF_PGOFF,
};

/* Methods of UVERBS_OBJECT_DMABUF */
enum uverbs_methods_dmabuf {
	UVERBS_METHOD_DMABUF_ALLOC,
};
enum uverbs_attrs_reg_dm_mr_cmd_attr_ids {
UVERBS_ATTR_REG_DM_MR_HANDLE,
UVERBS_ATTR_REG_DM_MR_OFFSET,