RDMA v7.0 merge window

Usual smallish cycle:
 
 - Various code improvements in irdma, rtrs, qedr, ocrdma, rxe
 
 - Small driver improvements and minor bug fixes to hns, mlx5, rxe,
   mana, irdma
 
 - Robustness improvements in completion processing for EFA
 
 - New query_port_speed() verb to move past the limited IBA-defined speed steps
 
 - Support for SG_GAPS in rtrs and many other small improvements
 
 - Rare list corruption fix in iwcm
 
 - Better support for different page sizes in rxe
 
 - Device memory support for mana
 
 - Direct bio vec to kernel MR for use by NFS-RDMA
 
 - QP rate limiting for bnxt_re
 
 - Fix for a remotely triggerable NULL pointer crash in siw
 
 - DMA-buf exporter support for RDMA mmaps like doorbells
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRRRCHOFoQz/8F5bUaFwuHvBreFYQUCaY44vgAKCRCFwuHvBreF
 YfiZAP91cMZfogN7r1FMD75xDZu55dI3Jvy8OaixyRxlWLGPcQEAjritdL0o7fZp
 YrD1OXNS/1XG//rPBVw7xj+54Aa8hAU=
 =AVcu
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma updates from Jason Gunthorpe:
 "Usual smallish cycle. The NFS biovec work to push it down into RDMA
  instead of indirecting through a scatterlist is pretty nice to see,
  been talked about for a long time now.

   - Various code improvements in irdma, rtrs, qedr, ocrdma, rxe

   - Small driver improvements and minor bug fixes to hns, mlx5, rxe,
     mana, irdma

   - Robustness improvements in completion processing for EFA

   - New query_port_speed() verb to move past the limited IBA-defined
     speed steps

   - Support for SG_GAPS in rtrs and many other small improvements

   - Rare list corruption fix in iwcm

   - Better support for different page sizes in rxe

   - Device memory support for mana

   - Direct bio vec to kernel MR for use by NFS-RDMA

   - QP rate limiting for bnxt_re

   - Fix for a remotely triggerable NULL pointer crash in siw

   - DMA-buf exporter support for RDMA mmaps like doorbells"
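
For a sense of how the new bvec path is consumed, here is a minimal
caller-side sketch pieced together from the rdma_rw_ctx_init_bvec() /
rdma_rw_ctx_destroy_bvec() signatures and kernel-doc added in the rw.c
hunks below. The qp, port_num, bvecs, nr_bvec, iter, remote_addr, rkey
and cqe are assumed to come from the ULP, the READ direction is
arbitrary, rdma_rw_ctx_post() is the pre-existing posting helper, and
error handling is trimmed:

	struct rdma_rw_ctx ctx;
	int nr_wrs, ret;

	/* Map the bio_vec array directly, no scatterlist conversion. */
	nr_wrs = rdma_rw_ctx_init_bvec(&ctx, qp, port_num, bvecs, nr_bvec,
				       iter, remote_addr, rkey,
				       DMA_FROM_DEVICE);
	if (nr_wrs < 0)
		return nr_wrs;

	/* Chain and post the work requests built by the init step. */
	ret = rdma_rw_ctx_post(&ctx, qp, port_num, &cqe, NULL);

	/* On completion or error, release MRs, IOVA space and DMA mappings. */
	rdma_rw_ctx_destroy_bvec(&ctx, qp, port_num, bvecs, nr_bvec,
				 DMA_FROM_DEVICE);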

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (66 commits)
  RDMA/mlx5: Implement DMABUF export ops
  RDMA/uverbs: Add DMABUF object type and operations
  RDMA/uverbs: Support external FD uobjects
  RDMA/siw: Fix potential NULL pointer dereference in header processing
  RDMA/umad: Reject negative data_len in ib_umad_write
  IB/core: Extend rate limit support for RC QPs
  RDMA/mlx5: Support rate limit only for Raw Packet QP
  RDMA/bnxt_re: Report QP rate limit in debugfs
  RDMA/bnxt_re: Report packet pacing capabilities when querying device
  RDMA/bnxt_re: Add support for QP rate limiting
  MAINTAINERS: Drop RDMA files from Hyper-V section
  RDMA/uverbs: Add __GFP_NOWARN to ib_uverbs_unmarshall_recv() kmalloc
  svcrdma: use bvec-based RDMA read/write API
  RDMA/core: add rdma_rw_max_sge() helper for SQ sizing
  RDMA/core: add MR support for bvec-based RDMA operations
  RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations
  RDMA/core: add bio_vec based RDMA read/write API
  RDMA/irdma: Use kvzalloc for paged memory DMA address array
  RDMA/rxe: Fix race condition in QP timer handlers
  RDMA/mana_ib: Add device-memory support
  ...
Linus Torvalds 2026-02-12 17:05:20 -08:00
commit 311aa68319
77 changed files with 2648 additions and 722 deletions


@ -11842,7 +11842,6 @@ F: arch/x86/kernel/cpu/mshyperv.c
F: drivers/clocksource/hyperv_timer.c
F: drivers/hid/hid-hyperv.c
F: drivers/hv/
F: drivers/infiniband/hw/mana/
F: drivers/input/serio/hyperv-keyboard.c
F: drivers/iommu/hyperv-iommu.c
F: drivers/net/ethernet/microsoft/
@ -11861,7 +11860,6 @@ F: include/hyperv/hvhdk_mini.h
F: include/linux/hyperv.h
F: include/net/mana
F: include/uapi/linux/hyperv.h
F: include/uapi/rdma/mana-abi.h
F: net/vmw_vsock/hyperv_transport.c
F: tools/hv/
@ -17468,6 +17466,7 @@ MICROSOFT MANA RDMA DRIVER
M: Long Li <longli@microsoft.com>
M: Konstantin Taranov <kotaranov@microsoft.com>
L: linux-rdma@vger.kernel.org
L: linux-hyperv@vger.kernel.org
S: Supported
F: drivers/infiniband/hw/mana/
F: include/net/mana


@ -33,6 +33,7 @@ ib_umad-y := user_mad.o
ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
uverbs_std_types_cq.o \
uverbs_std_types_dmabuf.o \
uverbs_std_types_dmah.o \
uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
uverbs_std_types_mr.o uverbs_std_types_counters.o \


@ -1537,7 +1537,8 @@ static void ib_cache_event_task(struct work_struct *_work)
* the cache.
*/
ret = ib_cache_update(work->event.device, work->event.element.port_num,
work->event.event == IB_EVENT_GID_CHANGE,
work->event.event == IB_EVENT_GID_CHANGE ||
work->event.event == IB_EVENT_CLIENT_REREGISTER,
work->event.event == IB_EVENT_PKEY_CHANGE,
work->enforce_security);


@ -361,34 +361,6 @@ static struct ib_device *__ib_device_get_by_name(const char *name)
return NULL;
}
/**
* ib_device_get_by_name - Find an IB device by name
* @name: The name to look for
* @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
*
* Find and hold an ib_device by its name. The caller must call
* ib_device_put() on the returned pointer.
*/
struct ib_device *ib_device_get_by_name(const char *name,
enum rdma_driver_id driver_id)
{
struct ib_device *device;
down_read(&devices_rwsem);
device = __ib_device_get_by_name(name);
if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
device->ops.driver_id != driver_id)
device = NULL;
if (device) {
if (!ib_device_try_get(device))
device = NULL;
}
up_read(&devices_rwsem);
return device;
}
EXPORT_SYMBOL(ib_device_get_by_name);
static int rename_compat_devs(struct ib_device *device)
{
struct ib_core_device *cdev;
@ -2793,6 +2765,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, map_mr_sg);
SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
SET_DEVICE_OP(dev_ops, mmap);
SET_DEVICE_OP(dev_ops, mmap_get_pfns);
SET_DEVICE_OP(dev_ops, mmap_free);
SET_DEVICE_OP(dev_ops, modify_ah);
SET_DEVICE_OP(dev_ops, modify_cq);
@ -2803,6 +2776,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, modify_srq);
SET_DEVICE_OP(dev_ops, modify_wq);
SET_DEVICE_OP(dev_ops, peek_cq);
SET_DEVICE_OP(dev_ops, pgoff_to_mmap_entry);
SET_DEVICE_OP(dev_ops, pre_destroy_cq);
SET_DEVICE_OP(dev_ops, poll_cq);
SET_DEVICE_OP(dev_ops, port_groups);
@ -2816,6 +2790,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, query_gid);
SET_DEVICE_OP(dev_ops, query_pkey);
SET_DEVICE_OP(dev_ops, query_port);
SET_DEVICE_OP(dev_ops, query_port_speed);
SET_DEVICE_OP(dev_ops, query_qp);
SET_DEVICE_OP(dev_ops, query_srq);
SET_DEVICE_OP(dev_ops, query_ucontext);
@ -2875,7 +2850,6 @@ int ib_add_sub_device(struct ib_device *parent,
return ret;
}
EXPORT_SYMBOL(ib_add_sub_device);
int ib_del_sub_device_and_put(struct ib_device *sub)
{
@ -2896,7 +2870,6 @@ int ib_del_sub_device_and_put(struct ib_device *sub)
return 0;
}
EXPORT_SYMBOL(ib_del_sub_device_and_put);
#ifdef CONFIG_INFINIBAND_VIRT_DMA
int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents)


@ -5,9 +5,13 @@
* Copyright 2019 Marvell. All rights reserved.
*/
#include <linux/xarray.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include "uverbs.h"
#include "core_priv.h"
MODULE_IMPORT_NS("DMA_BUF");
/**
* rdma_umap_priv_init() - Initialize the private data of a vma
*
@ -229,12 +233,29 @@ EXPORT_SYMBOL(rdma_user_mmap_entry_put);
*/
void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
{
struct ib_uverbs_dmabuf_file *uverbs_dmabuf, *tmp;
if (!entry)
return;
mutex_lock(&entry->dmabufs_lock);
xa_lock(&entry->ucontext->mmap_xa);
entry->driver_removed = true;
xa_unlock(&entry->ucontext->mmap_xa);
list_for_each_entry_safe(uverbs_dmabuf, tmp, &entry->dmabufs, dmabufs_elm) {
dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL);
list_del(&uverbs_dmabuf->dmabufs_elm);
uverbs_dmabuf->revoked = true;
dma_buf_move_notify(uverbs_dmabuf->dmabuf);
dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv,
DMA_RESV_USAGE_BOOKKEEP, false,
MAX_SCHEDULE_TIMEOUT);
dma_resv_unlock(uverbs_dmabuf->dmabuf->resv);
kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done);
wait_for_completion(&uverbs_dmabuf->comp);
}
mutex_unlock(&entry->dmabufs_lock);
kref_put(&entry->ref, rdma_user_mmap_entry_free);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
@ -274,6 +295,9 @@ int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext,
return -EINVAL;
kref_init(&entry->ref);
INIT_LIST_HEAD(&entry->dmabufs);
mutex_init(&entry->dmabufs_lock);
entry->ucontext = ucontext;
/*


@ -95,7 +95,6 @@ static struct workqueue_struct *iwcm_wq;
struct iwcm_work {
struct work_struct work;
struct iwcm_id_private *cm_id;
struct list_head list;
struct iw_cm_event event;
struct list_head free_list;
};
@ -178,7 +177,6 @@ static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
return -ENOMEM;
}
work->cm_id = cm_id_priv;
INIT_LIST_HEAD(&work->list);
put_work(work);
}
return 0;
@ -213,7 +211,6 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv)
static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
{
if (refcount_dec_and_test(&cm_id_priv->refcount)) {
BUG_ON(!list_empty(&cm_id_priv->work_list));
free_cm_id(cm_id_priv);
return true;
}
@ -260,7 +257,6 @@ struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
refcount_set(&cm_id_priv->refcount, 1);
init_waitqueue_head(&cm_id_priv->connect_wait);
init_completion(&cm_id_priv->destroy_comp);
INIT_LIST_HEAD(&cm_id_priv->work_list);
INIT_LIST_HEAD(&cm_id_priv->work_free_list);
return &cm_id_priv->id;
@ -1007,13 +1003,13 @@ static int process_event(struct iwcm_id_private *cm_id_priv,
}
/*
* Process events on the work_list for the cm_id. If the callback
* function requests that the cm_id be deleted, a flag is set in the
* cm_id flags to indicate that when the last reference is
* removed, the cm_id is to be destroyed. This is necessary to
* distinguish between an object that will be destroyed by the app
* thread asleep on the destroy_comp list vs. an object destroyed
* here synchronously when the last reference is removed.
* Process events for the cm_id. If the callback function requests
* that the cm_id be deleted, a flag is set in the cm_id flags to
* indicate that when the last reference is removed, the cm_id is
* to be destroyed. This is necessary to distinguish between an
* object that will be destroyed by the app thread asleep on the
* destroy_comp list vs. an object destroyed here synchronously
* when the last reference is removed.
*/
static void cm_work_handler(struct work_struct *_work)
{
@ -1024,35 +1020,26 @@ static void cm_work_handler(struct work_struct *_work)
int ret = 0;
spin_lock_irqsave(&cm_id_priv->lock, flags);
while (!list_empty(&cm_id_priv->work_list)) {
work = list_first_entry(&cm_id_priv->work_list,
struct iwcm_work, list);
list_del_init(&work->list);
levent = work->event;
put_work(work);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
ret = process_event(cm_id_priv, &levent);
if (ret) {
destroy_cm_id(&cm_id_priv->id);
WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
}
} else
pr_debug("dropping event %d\n", levent.event);
if (iwcm_deref_id(cm_id_priv))
return;
spin_lock_irqsave(&cm_id_priv->lock, flags);
}
levent = work->event;
put_work(work);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
ret = process_event(cm_id_priv, &levent);
if (ret) {
destroy_cm_id(&cm_id_priv->id);
WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
}
} else
pr_debug("dropping event %d\n", levent.event);
if (iwcm_deref_id(cm_id_priv))
return;
}
/*
* This function is called on interrupt context. Schedule events on
* the iwcm_wq thread to allow callback functions to downcall into
* the CM and/or block. Events are queued to a per-CM_ID
* work_list. If this is the first event on the work_list, the work
* element is also queued on the iwcm_wq thread.
* the CM and/or block.
*
* Each event holds a reference on the cm_id. Until the last posted
* event has been delivered and processed, the cm_id cannot be
@ -1094,7 +1081,6 @@ static int cm_event_handler(struct iw_cm_id *cm_id,
}
refcount_inc(&cm_id_priv->refcount);
list_add_tail(&work->list, &cm_id_priv->work_list);
queue_work(iwcm_wq, &work->work);
out:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);


@ -50,7 +50,6 @@ struct iwcm_id_private {
struct ib_qp *qp;
struct completion destroy_comp;
wait_queue_head_t connect_wait;
struct list_head work_list;
spinlock_t lock;
refcount_t refcount;
struct list_head work_free_list;


@ -465,7 +465,7 @@ alloc_begin_fd_uobject(const struct uverbs_api_object *obj,
fd_type =
container_of(obj->type_attrs, struct uverbs_obj_fd_type, type);
if (WARN_ON(fd_type->fops->release != &uverbs_uobject_fd_release &&
if (WARN_ON(fd_type->fops && fd_type->fops->release != &uverbs_uobject_fd_release &&
fd_type->fops->release != &uverbs_async_event_release)) {
ret = ERR_PTR(-EINVAL);
goto err_fd;
@ -477,14 +477,16 @@ alloc_begin_fd_uobject(const struct uverbs_api_object *obj,
goto err_fd;
}
/* Note that uverbs_uobject_fd_release() is called during abort */
filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL,
fd_type->flags);
if (IS_ERR(filp)) {
ret = ERR_CAST(filp);
goto err_getfile;
if (fd_type->fops) {
/* Note that uverbs_uobject_fd_release() is called during abort */
filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL,
fd_type->flags);
if (IS_ERR(filp)) {
ret = ERR_CAST(filp);
goto err_getfile;
}
uobj->object = filp;
}
uobj->object = filp;
uobj->id = new_fd;
return uobj;
@ -561,7 +563,9 @@ static void alloc_abort_fd_uobject(struct ib_uobject *uobj)
{
struct file *filp = uobj->object;
fput(filp);
if (filp)
fput(filp);
put_unused_fd(uobj->id);
}
@ -628,11 +632,14 @@ static void alloc_commit_fd_uobject(struct ib_uobject *uobj)
/* This shouldn't be used anymore. Use the file object instead */
uobj->id = 0;
/*
* NOTE: Once we install the file we loose ownership of our kref on
* uobj. It will be put by uverbs_uobject_fd_release()
*/
filp->private_data = uobj;
if (!filp->private_data) {
/*
* NOTE: Once we install the file we loose ownership of our kref on
* uobj. It will be put by uverbs_uobject_fd_release()
*/
filp->private_data = uobj;
}
fd_install(fd, filp);
}
@ -802,21 +809,10 @@ const struct uverbs_obj_type_class uverbs_idr_class = {
};
EXPORT_SYMBOL(uverbs_idr_class);
/*
* Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct
* file_operations release method.
*/
int uverbs_uobject_fd_release(struct inode *inode, struct file *filp)
int uverbs_uobject_release(struct ib_uobject *uobj)
{
struct ib_uverbs_file *ufile;
struct ib_uobject *uobj;
/*
* This can only happen if the fput came from alloc_abort_fd_uobject()
*/
if (!filp->private_data)
return 0;
uobj = filp->private_data;
ufile = uobj->ufile;
if (down_read_trylock(&ufile->hw_destroy_rwsem)) {
@ -843,6 +839,21 @@ int uverbs_uobject_fd_release(struct inode *inode, struct file *filp)
uverbs_uobject_put(uobj);
return 0;
}
/*
* Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct
* file_operations release method.
*/
int uverbs_uobject_fd_release(struct inode *inode, struct file *filp)
{
/*
* This can only happen if the fput came from alloc_abort_fd_uobject()
*/
if (!filp->private_data)
return 0;
return uverbs_uobject_release(filp->private_data);
}
EXPORT_SYMBOL(uverbs_uobject_fd_release);
/*


@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[];
extern const struct uapi_definition uverbs_def_obj_cq[];
extern const struct uapi_definition uverbs_def_obj_device[];
extern const struct uapi_definition uverbs_def_obj_dm[];
extern const struct uapi_definition uverbs_def_obj_dmabuf[];
extern const struct uapi_definition uverbs_def_obj_dmah[];
extern const struct uapi_definition uverbs_def_obj_flow_action[];
extern const struct uapi_definition uverbs_def_obj_intf[];


@ -14,6 +14,7 @@ enum {
RDMA_RW_MULTI_WR,
RDMA_RW_MR,
RDMA_RW_SIG_MR,
RDMA_RW_IOVA,
};
static bool rdma_rw_force_mr;
@ -121,6 +122,36 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num,
return count;
}
static int rdma_rw_init_reg_wr(struct rdma_rw_reg_ctx *reg,
struct rdma_rw_reg_ctx *prev, struct ib_qp *qp, u32 port_num,
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
if (prev) {
if (reg->mr->need_inval)
prev->wr.wr.next = &reg->inv_wr;
else
prev->wr.wr.next = &reg->reg_wr.wr;
}
reg->reg_wr.wr.next = &reg->wr.wr;
reg->wr.wr.sg_list = &reg->sge;
reg->wr.wr.num_sge = 1;
reg->wr.remote_addr = remote_addr;
reg->wr.rkey = rkey;
if (dir == DMA_TO_DEVICE) {
reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
} else if (!rdma_cap_read_inv(qp->device, port_num)) {
reg->wr.wr.opcode = IB_WR_RDMA_READ;
} else {
reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
}
return 1;
}
static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
@ -146,30 +177,8 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
if (ret < 0)
goto out_free;
count += ret;
if (prev) {
if (reg->mr->need_inval)
prev->wr.wr.next = &reg->inv_wr;
else
prev->wr.wr.next = &reg->reg_wr.wr;
}
reg->reg_wr.wr.next = &reg->wr.wr;
reg->wr.wr.sg_list = &reg->sge;
reg->wr.wr.num_sge = 1;
reg->wr.remote_addr = remote_addr;
reg->wr.rkey = rkey;
if (dir == DMA_TO_DEVICE) {
reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
} else if (!rdma_cap_read_inv(qp->device, port_num)) {
reg->wr.wr.opcode = IB_WR_RDMA_READ;
} else {
reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
}
count++;
count += rdma_rw_init_reg_wr(reg, prev, qp, port_num,
remote_addr, rkey, dir);
remote_addr += reg->sge.length;
sg_cnt -= nents;
for (j = 0; j < nents; j++)
@ -192,6 +201,92 @@ out:
return ret;
}
static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
struct bvec_iter *iter, u64 remote_addr, u32 rkey,
enum dma_data_direction dir)
{
struct ib_device *dev = qp->pd->device;
struct rdma_rw_reg_ctx *prev = NULL;
u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en);
struct scatterlist *sg;
int i, ret, count = 0;
u32 nents = 0;
ctx->reg = kcalloc(DIV_ROUND_UP(nr_bvec, pages_per_mr),
sizeof(*ctx->reg), GFP_KERNEL);
if (!ctx->reg)
return -ENOMEM;
/*
* Build scatterlist from bvecs using the iterator. This follows
* the pattern from __blk_rq_map_sg.
*/
ctx->reg[0].sgt.sgl = kmalloc_array(nr_bvec,
sizeof(*ctx->reg[0].sgt.sgl),
GFP_KERNEL);
if (!ctx->reg[0].sgt.sgl) {
ret = -ENOMEM;
goto out_free_reg;
}
sg_init_table(ctx->reg[0].sgt.sgl, nr_bvec);
for (sg = ctx->reg[0].sgt.sgl; iter->bi_size; sg = sg_next(sg)) {
struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);
if (nents >= nr_bvec) {
ret = -EINVAL;
goto out_free_sgl;
}
sg_set_page(sg, bv.bv_page, bv.bv_len, bv.bv_offset);
bvec_iter_advance(bvecs, iter, bv.bv_len);
nents++;
}
sg_mark_end(sg_last(ctx->reg[0].sgt.sgl, nents));
ctx->reg[0].sgt.orig_nents = nents;
/* DMA map the scatterlist */
ret = ib_dma_map_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
if (ret)
goto out_free_sgl;
ctx->nr_ops = DIV_ROUND_UP(ctx->reg[0].sgt.nents, pages_per_mr);
sg = ctx->reg[0].sgt.sgl;
nents = ctx->reg[0].sgt.nents;
for (i = 0; i < ctx->nr_ops; i++) {
struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
u32 sge_cnt = min(nents, pages_per_mr);
ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sge_cnt, 0);
if (ret < 0)
goto out_free_mrs;
count += ret;
count += rdma_rw_init_reg_wr(reg, prev, qp, port_num,
remote_addr, rkey, dir);
remote_addr += reg->sge.length;
nents -= sge_cnt;
sg += sge_cnt;
prev = reg;
}
if (prev)
prev->wr.wr.next = NULL;
ctx->type = RDMA_RW_MR;
return count;
out_free_mrs:
while (--i >= 0)
ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
out_free_sgl:
kfree(ctx->reg[0].sgt.sgl);
out_free_reg:
kfree(ctx->reg);
return ret;
}
static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
struct scatterlist *sg, u32 sg_cnt, u32 offset,
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
@ -274,6 +369,196 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
return 1;
}
static int rdma_rw_init_single_wr_bvec(struct rdma_rw_ctx *ctx,
struct ib_qp *qp, const struct bio_vec *bvecs,
struct bvec_iter *iter, u64 remote_addr, u32 rkey,
enum dma_data_direction dir)
{
struct ib_device *dev = qp->pd->device;
struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);
u64 dma_addr;
ctx->nr_ops = 1;
dma_addr = ib_dma_map_bvec(dev, &bv, dir);
if (ib_dma_mapping_error(dev, dma_addr))
return -ENOMEM;
ctx->single.sge.lkey = qp->pd->local_dma_lkey;
ctx->single.sge.addr = dma_addr;
ctx->single.sge.length = bv.bv_len;
memset(rdma_wr, 0, sizeof(*rdma_wr));
if (dir == DMA_TO_DEVICE)
rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
else
rdma_wr->wr.opcode = IB_WR_RDMA_READ;
rdma_wr->wr.sg_list = &ctx->single.sge;
rdma_wr->wr.num_sge = 1;
rdma_wr->remote_addr = remote_addr;
rdma_wr->rkey = rkey;
ctx->type = RDMA_RW_SINGLE_WR;
return 1;
}
static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
const struct bio_vec *bvecs, u32 nr_bvec, struct bvec_iter *iter,
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
struct ib_device *dev = qp->pd->device;
u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
qp->max_read_sge;
struct ib_sge *sge;
u32 total_len = 0, i, j;
u32 mapped_bvecs = 0;
u32 nr_ops = DIV_ROUND_UP(nr_bvec, max_sge);
size_t sges_size = array_size(nr_bvec, sizeof(*ctx->map.sges));
size_t wrs_offset = ALIGN(sges_size, __alignof__(*ctx->map.wrs));
size_t wrs_size = array_size(nr_ops, sizeof(*ctx->map.wrs));
void *mem;
if (sges_size == SIZE_MAX || wrs_size == SIZE_MAX ||
check_add_overflow(wrs_offset, wrs_size, &wrs_size))
return -ENOMEM;
mem = kzalloc(wrs_size, GFP_KERNEL);
if (!mem)
return -ENOMEM;
ctx->map.sges = sge = mem;
ctx->map.wrs = mem + wrs_offset;
for (i = 0; i < nr_ops; i++) {
struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
u32 nr_sge = min(nr_bvec - mapped_bvecs, max_sge);
if (dir == DMA_TO_DEVICE)
rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
else
rdma_wr->wr.opcode = IB_WR_RDMA_READ;
rdma_wr->remote_addr = remote_addr + total_len;
rdma_wr->rkey = rkey;
rdma_wr->wr.num_sge = nr_sge;
rdma_wr->wr.sg_list = sge;
for (j = 0; j < nr_sge; j++) {
struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);
u64 dma_addr;
dma_addr = ib_dma_map_bvec(dev, &bv, dir);
if (ib_dma_mapping_error(dev, dma_addr))
goto out_unmap;
mapped_bvecs++;
sge->addr = dma_addr;
sge->length = bv.bv_len;
sge->lkey = qp->pd->local_dma_lkey;
total_len += bv.bv_len;
sge++;
bvec_iter_advance_single(bvecs, iter, bv.bv_len);
}
rdma_wr->wr.next = i + 1 < nr_ops ?
&ctx->map.wrs[i + 1].wr : NULL;
}
ctx->nr_ops = nr_ops;
ctx->type = RDMA_RW_MULTI_WR;
return nr_ops;
out_unmap:
for (i = 0; i < mapped_bvecs; i++)
ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
ctx->map.sges[i].length, dir);
kfree(ctx->map.sges);
return -ENOMEM;
}
/*
* Try to use the two-step IOVA API to map bvecs into a contiguous DMA range.
* This reduces IOTLB sync overhead by doing one sync at the end instead of
* one per bvec, and produces a contiguous DMA address range that can be
* described by a single SGE.
*
* Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA
* mapping is not available, or another negative error code on failure.
*/
static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx,
struct ib_qp *qp, const struct bio_vec *bvec,
struct bvec_iter *iter, u64 remote_addr, u32 rkey,
enum dma_data_direction dir)
{
struct ib_device *dev = qp->pd->device;
struct device *dma_dev = dev->dma_device;
size_t total_len = iter->bi_size;
struct bio_vec first_bv;
size_t mapped_len = 0;
int ret;
/* Virtual DMA devices cannot support IOVA allocators */
if (ib_uses_virt_dma(dev))
return -EOPNOTSUPP;
/* Try to allocate contiguous IOVA space */
first_bv = mp_bvec_iter_bvec(bvec, *iter);
if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state,
bvec_phys(&first_bv), total_len))
return -EOPNOTSUPP;
/* Link all bvecs into the IOVA space */
while (iter->bi_size) {
struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter);
ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv),
mapped_len, bv.bv_len, dir, 0);
if (ret)
goto out_destroy;
mapped_len += bv.bv_len;
bvec_iter_advance(bvec, iter, bv.bv_len);
}
/* Sync the IOTLB once for all linked pages */
ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len);
if (ret)
goto out_destroy;
ctx->iova.mapped_len = mapped_len;
/* Single SGE covers the entire contiguous IOVA range */
ctx->iova.sge.addr = ctx->iova.state.addr;
ctx->iova.sge.length = mapped_len;
ctx->iova.sge.lkey = qp->pd->local_dma_lkey;
/* Single WR for the whole transfer */
memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr));
if (dir == DMA_TO_DEVICE)
ctx->iova.wr.wr.opcode = IB_WR_RDMA_WRITE;
else
ctx->iova.wr.wr.opcode = IB_WR_RDMA_READ;
ctx->iova.wr.wr.num_sge = 1;
ctx->iova.wr.wr.sg_list = &ctx->iova.sge;
ctx->iova.wr.remote_addr = remote_addr;
ctx->iova.wr.rkey = rkey;
ctx->type = RDMA_RW_IOVA;
ctx->nr_ops = 1;
return 1;
out_destroy:
/*
* dma_iova_destroy() expects the actual mapped length, not the
* total allocation size. It unlinks only the successfully linked
* range and frees the entire IOVA allocation.
*/
dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0);
return ret;
}
/**
* rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
* @ctx: context to initialize
@ -344,6 +629,79 @@ out_unmap_sg:
}
EXPORT_SYMBOL(rdma_rw_ctx_init);
/**
* rdma_rw_ctx_init_bvec - initialize a RDMA READ/WRITE context from bio_vec
* @ctx: context to initialize
* @qp: queue pair to operate on
* @port_num: port num to which the connection is bound
* @bvecs: bio_vec array to READ/WRITE from/to
* @nr_bvec: number of entries in @bvecs
* @iter: bvec iterator describing offset and length
* @remote_addr: remote address to read/write (relative to @rkey)
* @rkey: remote key to operate on
* @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
*
* Maps the bio_vec array directly, avoiding intermediate scatterlist
* conversion. Supports MR registration for iWARP devices and force_mr mode.
*
* Returns the number of WQEs that will be needed on the workqueue if
* successful, or a negative error code:
*
* * -EINVAL - @nr_bvec is zero or @iter.bi_size is zero
* * -ENOMEM - DMA mapping or memory allocation failed
*/
int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
struct bvec_iter iter, u64 remote_addr, u32 rkey,
enum dma_data_direction dir)
{
struct ib_device *dev = qp->pd->device;
int ret;
if (nr_bvec == 0 || iter.bi_size == 0)
return -EINVAL;
/*
* iWARP requires MR registration for all RDMA READs. The force_mr
* debug option also mandates MR usage.
*/
if (dir == DMA_FROM_DEVICE && rdma_protocol_iwarp(dev, port_num))
return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
nr_bvec, &iter, remote_addr,
rkey, dir);
if (unlikely(rdma_rw_force_mr))
return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
nr_bvec, &iter, remote_addr,
rkey, dir);
if (nr_bvec == 1)
return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter,
remote_addr, rkey, dir);
/*
* Try IOVA-based mapping first for multi-bvec transfers.
* IOVA coalesces bvecs into a single DMA-contiguous region,
* reducing the number of WRs needed and avoiding MR overhead.
*/
ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr,
rkey, dir);
if (ret != -EOPNOTSUPP)
return ret;
/*
* IOVA mapping not available. Check if MR registration provides
* better performance than multiple SGE entries.
*/
if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec))
return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
nr_bvec, &iter, remote_addr,
rkey, dir);
return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter,
remote_addr, rkey, dir);
}
EXPORT_SYMBOL(rdma_rw_ctx_init_bvec);
/**
* rdma_rw_ctx_signature_init - initialize a RW context with signature offload
* @ctx: context to initialize
@ -515,6 +873,10 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
first_wr = &ctx->reg[0].reg_wr.wr;
last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
break;
case RDMA_RW_IOVA:
first_wr = &ctx->iova.wr.wr;
last_wr = &ctx->iova.wr.wr;
break;
case RDMA_RW_MULTI_WR:
first_wr = &ctx->map.wrs[0].wr;
last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
@ -579,6 +941,8 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
switch (ctx->type) {
case RDMA_RW_MR:
/* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */
WARN_ON_ONCE(ctx->reg[0].sgt.sgl);
for (i = 0; i < ctx->nr_ops; i++)
ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
kfree(ctx->reg);
@ -589,6 +953,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
break;
case RDMA_RW_SINGLE_WR:
break;
case RDMA_RW_IOVA:
/* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */
WARN_ON_ONCE(1);
return;
default:
BUG();
break;
@ -598,6 +966,58 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
}
EXPORT_SYMBOL(rdma_rw_ctx_destroy);
/**
* rdma_rw_ctx_destroy_bvec - release resources from rdma_rw_ctx_init_bvec
* @ctx: context to release
* @qp: queue pair to operate on
* @port_num: port num to which the connection is bound (unused)
* @bvecs: bio_vec array that was used for the READ/WRITE (unused)
* @nr_bvec: number of entries in @bvecs
* @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
*
* Releases all resources allocated by a successful rdma_rw_ctx_init_bvec()
* call. Must not be called if rdma_rw_ctx_init_bvec() returned an error.
*
* The @port_num and @bvecs parameters are unused but present for API
* symmetry with rdma_rw_ctx_destroy().
*/
void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 __maybe_unused port_num,
const struct bio_vec __maybe_unused *bvecs,
u32 nr_bvec, enum dma_data_direction dir)
{
struct ib_device *dev = qp->pd->device;
u32 i;
switch (ctx->type) {
case RDMA_RW_MR:
for (i = 0; i < ctx->nr_ops; i++)
ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
kfree(ctx->reg[0].sgt.sgl);
kfree(ctx->reg);
break;
case RDMA_RW_IOVA:
dma_iova_destroy(dev->dma_device, &ctx->iova.state,
ctx->iova.mapped_len, dir, 0);
break;
case RDMA_RW_MULTI_WR:
for (i = 0; i < nr_bvec; i++)
ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,
ctx->map.sges[i].length, dir);
kfree(ctx->map.sges);
break;
case RDMA_RW_SINGLE_WR:
ib_dma_unmap_bvec(dev, ctx->single.sge.addr,
ctx->single.sge.length, dir);
break;
default:
WARN_ON_ONCE(1);
return;
}
}
EXPORT_SYMBOL(rdma_rw_ctx_destroy_bvec);
/**
* rdma_rw_ctx_destroy_signature - release all resources allocated by
* rdma_rw_ctx_signature_init
@ -651,34 +1071,57 @@ unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
}
EXPORT_SYMBOL(rdma_rw_mr_factor);
/**
* rdma_rw_max_send_wr - compute max Send WRs needed for RDMA R/W contexts
* @dev: RDMA device
* @port_num: port number
* @max_rdma_ctxs: number of rdma_rw_ctx structures
* @create_flags: QP create flags (pass IB_QP_CREATE_INTEGRITY_EN if
* data integrity will be enabled on the QP)
*
* Returns the total number of Send Queue entries needed for
* @max_rdma_ctxs. The result accounts for memory registration and
* invalidation work requests when the device requires them.
*
* ULPs use this to size Send Queues and Send CQs before creating a
* Queue Pair.
*/
unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num,
unsigned int max_rdma_ctxs, u32 create_flags)
{
unsigned int factor = 1;
unsigned int result;
if (create_flags & IB_QP_CREATE_INTEGRITY_EN ||
rdma_rw_can_use_mr(dev, port_num))
factor += 2; /* reg + inv */
if (check_mul_overflow(factor, max_rdma_ctxs, &result))
return UINT_MAX;
return result;
}
EXPORT_SYMBOL(rdma_rw_max_send_wr);
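
/*
 * Illustrative ULP-side use of the new helper, not part of this patch:
 * size the send queue before QP creation, assuming num_other_wrs covers
 * the ULP's own sends and that integrity offload is disabled:
 *
 *	init_attr.cap.max_send_wr = num_other_wrs +
 *		rdma_rw_max_send_wr(dev, port_num, max_rdma_ctxs, 0);
 */
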
void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
{
u32 factor;
unsigned int factor = 1;
WARN_ON_ONCE(attr->port_num == 0);
/*
* Each context needs at least one RDMA READ or WRITE WR.
*
* For some hardware we might need more, eventually we should ask the
* HCA driver for a multiplier here.
*/
factor = 1;
/*
* If the device needs MRs to perform RDMA READ or WRITE operations,
* we'll need two additional MRs for the registrations and the
* invalidation.
* If the device uses MRs to perform RDMA READ or WRITE operations,
* or if data integrity is enabled, account for registration and
* invalidation work requests.
*/
if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN ||
rdma_rw_can_use_mr(dev, attr->port_num))
factor += 2; /* inv + reg */
factor += 2; /* reg + inv */
attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
/*
* But maybe we were just too high in the sky and the device doesn't
* even support all we need, and we'll have to live with what we get..
* The device might not support all we need, and we'll have to
* live with what we get.
*/
attr->cap.max_send_wr =
min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);


@ -292,62 +292,22 @@ static ssize_t cap_mask_show(struct ib_device *ibdev, u32 port_num,
static ssize_t rate_show(struct ib_device *ibdev, u32 port_num,
struct ib_port_attribute *unused, char *buf)
{
struct ib_port_speed_info speed_info;
struct ib_port_attr attr;
char *speed = "";
int rate; /* in deci-Gb/sec */
ssize_t ret;
ret = ib_query_port(ibdev, port_num, &attr);
if (ret)
return ret;
switch (attr.active_speed) {
case IB_SPEED_DDR:
speed = " DDR";
rate = 50;
break;
case IB_SPEED_QDR:
speed = " QDR";
rate = 100;
break;
case IB_SPEED_FDR10:
speed = " FDR10";
rate = 100;
break;
case IB_SPEED_FDR:
speed = " FDR";
rate = 140;
break;
case IB_SPEED_EDR:
speed = " EDR";
rate = 250;
break;
case IB_SPEED_HDR:
speed = " HDR";
rate = 500;
break;
case IB_SPEED_NDR:
speed = " NDR";
rate = 1000;
break;
case IB_SPEED_XDR:
speed = " XDR";
rate = 2000;
break;
case IB_SPEED_SDR:
default: /* default to SDR for invalid rates */
speed = " SDR";
rate = 25;
break;
}
ret = ib_port_attr_to_speed_info(&attr, &speed_info);
if (ret)
return ret;
rate *= ib_width_enum_to_int(attr.active_width);
if (rate < 0)
return -EINVAL;
return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", rate / 10,
rate % 10 ? ".5" : "",
ib_width_enum_to_int(attr.active_width), speed);
return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", speed_info.rate / 10,
speed_info.rate % 10 ? ".5" : "",
ib_width_enum_to_int(attr.active_width),
speed_info.str);
}
static const char *phys_state_to_str(enum ib_port_phys_state phys_state)


@ -129,9 +129,6 @@ ib_umem_dmabuf_get_with_dma_device(struct ib_device *device,
if (check_add_overflow(offset, (unsigned long)size, &end))
return ret;
if (unlikely(!ops || !ops->move_notify))
return ret;
dmabuf = dma_buf_get(fd);
if (IS_ERR(dmabuf))
return ERR_CAST(dmabuf);


@ -514,7 +514,8 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
struct rdma_ah_attr ah_attr;
struct ib_ah *ah;
__be64 *tid;
int ret, data_len, hdr_len, copy_offset, rmpp_active;
int ret, hdr_len, copy_offset, rmpp_active;
size_t data_len;
u8 base_version;
if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
@ -588,7 +589,10 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
}
base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version;
data_len = count - hdr_size(file) - hdr_len;
if (check_sub_overflow(count, hdr_size(file) + hdr_len, &data_len)) {
ret = -EINVAL;
goto err_ah;
}
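	/*
	 * data_len is now a size_t and the subtraction is overflow-checked,
	 * so a count smaller than hdr_size(file) + hdr_len is rejected with
	 * -EINVAL up front, where the old signed int arithmetic could
	 * silently produce a negative data_len.
	 */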
packet->msg = ib_create_send_mad(agent,
be32_to_cpu(packet->mad.hdr.qpn),
packet->mad.hdr.pkey_index, rmpp_active,


@ -133,6 +133,18 @@ struct ib_uverbs_completion_event_file {
struct ib_uverbs_event_queue ev_queue;
};
struct ib_uverbs_dmabuf_file {
struct ib_uobject uobj;
struct dma_buf *dmabuf;
struct list_head dmabufs_elm;
struct rdma_user_mmap_entry *mmap_entry;
struct phys_vec phys_vec;
struct p2pdma_provider *provider;
struct kref kref;
struct completion comp;
u8 revoked :1;
};
struct ib_uverbs_event {
union {
struct ib_uverbs_async_event_desc async;
@ -290,4 +302,13 @@ ib_uverbs_get_async_event(struct uverbs_attr_bundle *attrs,
void copy_port_attr_to_resp(struct ib_port_attr *attr,
struct ib_uverbs_query_port_resp *resp,
struct ib_device *ib_dev, u8 port_num);
static inline void ib_uverbs_dmabuf_done(struct kref *kref)
{
struct ib_uverbs_dmabuf_file *priv =
container_of(kref, struct ib_uverbs_dmabuf_file, kref);
complete(&priv->comp);
}
#endif /* UVERBS_H */


@ -2049,7 +2049,10 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
if (ret)
return ret;
user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
if (cmd.wqe_size < sizeof(struct ib_uverbs_send_wr))
return -EINVAL;
user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL | __GFP_NOWARN);
if (!user_wr)
return -ENOMEM;
@ -2239,7 +2242,7 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
if (ret)
return ERR_PTR(ret);
user_wr = kmalloc(wqe_size, GFP_KERNEL);
user_wr = kmalloc(wqe_size, GFP_KERNEL | __GFP_NOWARN);
if (!user_wr)
return ERR_PTR(-ENOMEM);


@ -209,6 +209,39 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)(
&resp, sizeof(resp));
}
static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT_SPEED)(
struct uverbs_attr_bundle *attrs)
{
struct ib_ucontext *ucontext;
struct ib_device *ib_dev;
u32 port_num;
u64 speed;
int ret;
ucontext = ib_uverbs_get_ucontext(attrs);
if (IS_ERR(ucontext))
return PTR_ERR(ucontext);
ib_dev = ucontext->device;
if (!ib_dev->ops.query_port_speed)
return -EOPNOTSUPP;
ret = uverbs_get_const(&port_num, attrs,
UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM);
if (ret)
return ret;
if (!rdma_is_port_valid(ib_dev, port_num))
return -EINVAL;
ret = ib_dev->ops.query_port_speed(ib_dev, port_num, &speed);
if (ret)
return ret;
return uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_PORT_SPEED_RESP,
&speed, sizeof(speed));
}
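/*
 * A hypothetical driver-side hookup, inferred from the call above; the
 * core just copies the u64 back to userspace, and the foo_* names are
 * placeholders:
 *
 *	static int foo_query_port_speed(struct ib_device *ibdev,
 *					u32 port_num, u64 *speed)
 *	{
 *		*speed = foo_read_link_speed(ibdev, port_num);
 *		return 0;
 *	}
 *
 *	.query_port_speed = foo_query_port_speed,	(in ib_device_ops)
 */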
static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)(
struct uverbs_attr_bundle *attrs)
{
@ -469,6 +502,14 @@ DECLARE_UVERBS_NAMED_METHOD(
active_speed_ex),
UA_MANDATORY));
DECLARE_UVERBS_NAMED_METHOD(
UVERBS_METHOD_QUERY_PORT_SPEED,
UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM, u32,
UA_MANDATORY),
UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_PORT_SPEED_RESP,
UVERBS_ATTR_TYPE(u64),
UA_MANDATORY));
DECLARE_UVERBS_NAMED_METHOD(
UVERBS_METHOD_QUERY_GID_TABLE,
UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, u64,
@ -498,6 +539,7 @@ DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE,
&UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE),
&UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES),
&UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT),
&UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT_SPEED),
&UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT),
&UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_TABLE),
&UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_ENTRY));


@ -0,0 +1,200 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
*/
#include <linux/dma-buf-mapping.h>
#include <linux/pci-p2pdma.h>
#include <linux/dma-resv.h>
#include <rdma/uverbs_std_types.h>
#include "rdma_core.h"
#include "uverbs.h"
static int uverbs_dmabuf_attach(struct dma_buf *dmabuf,
struct dma_buf_attachment *attachment)
{
if (!attachment->peer2peer)
return -EOPNOTSUPP;
return 0;
}
static struct sg_table *
uverbs_dmabuf_map(struct dma_buf_attachment *attachment,
enum dma_data_direction dir)
{
struct ib_uverbs_dmabuf_file *priv = attachment->dmabuf->priv;
struct sg_table *ret;
dma_resv_assert_held(priv->dmabuf->resv);
if (priv->revoked)
return ERR_PTR(-ENODEV);
ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider,
&priv->phys_vec, 1, priv->phys_vec.len,
dir);
if (IS_ERR(ret))
return ret;
kref_get(&priv->kref);
return ret;
}
static void uverbs_dmabuf_unmap(struct dma_buf_attachment *attachment,
struct sg_table *sgt,
enum dma_data_direction dir)
{
struct ib_uverbs_dmabuf_file *priv = attachment->dmabuf->priv;
dma_resv_assert_held(priv->dmabuf->resv);
dma_buf_free_sgt(attachment, sgt, dir);
kref_put(&priv->kref, ib_uverbs_dmabuf_done);
}
static int uverbs_dmabuf_pin(struct dma_buf_attachment *attach)
{
return -EOPNOTSUPP;
}
static void uverbs_dmabuf_unpin(struct dma_buf_attachment *attach)
{
}
static void uverbs_dmabuf_release(struct dma_buf *dmabuf)
{
struct ib_uverbs_dmabuf_file *priv = dmabuf->priv;
/*
* This can only happen if the fput came from alloc_abort_fd_uobject()
*/
if (!priv->uobj.context)
return;
uverbs_uobject_release(&priv->uobj);
}
static const struct dma_buf_ops uverbs_dmabuf_ops = {
.attach = uverbs_dmabuf_attach,
.map_dma_buf = uverbs_dmabuf_map,
.unmap_dma_buf = uverbs_dmabuf_unmap,
.pin = uverbs_dmabuf_pin,
.unpin = uverbs_dmabuf_unpin,
.release = uverbs_dmabuf_release,
};
static int UVERBS_HANDLER(UVERBS_METHOD_DMABUF_ALLOC)(
struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj =
uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE)
->obj_attr.uobject;
struct ib_uverbs_dmabuf_file *uverbs_dmabuf =
container_of(uobj, struct ib_uverbs_dmabuf_file, uobj);
struct ib_device *ib_dev = attrs->context->device;
struct rdma_user_mmap_entry *mmap_entry;
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
off_t pg_off;
int ret;
ret = uverbs_get_const(&pg_off, attrs, UVERBS_ATTR_ALLOC_DMABUF_PGOFF);
if (ret)
return ret;
mmap_entry = ib_dev->ops.pgoff_to_mmap_entry(attrs->context, pg_off);
if (!mmap_entry)
return -EINVAL;
ret = ib_dev->ops.mmap_get_pfns(mmap_entry, &uverbs_dmabuf->phys_vec,
&uverbs_dmabuf->provider);
if (ret)
goto err;
exp_info.ops = &uverbs_dmabuf_ops;
exp_info.size = uverbs_dmabuf->phys_vec.len;
exp_info.flags = O_CLOEXEC;
exp_info.priv = uverbs_dmabuf;
uverbs_dmabuf->dmabuf = dma_buf_export(&exp_info);
if (IS_ERR(uverbs_dmabuf->dmabuf)) {
ret = PTR_ERR(uverbs_dmabuf->dmabuf);
goto err;
}
kref_init(&uverbs_dmabuf->kref);
init_completion(&uverbs_dmabuf->comp);
INIT_LIST_HEAD(&uverbs_dmabuf->dmabufs_elm);
mutex_lock(&mmap_entry->dmabufs_lock);
if (mmap_entry->driver_removed)
ret = -EIO;
else
list_add_tail(&uverbs_dmabuf->dmabufs_elm, &mmap_entry->dmabufs);
mutex_unlock(&mmap_entry->dmabufs_lock);
if (ret)
goto err_revoked;
uobj->object = uverbs_dmabuf->dmabuf->file;
uverbs_dmabuf->mmap_entry = mmap_entry;
uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE);
return 0;
err_revoked:
dma_buf_put(uverbs_dmabuf->dmabuf);
err:
rdma_user_mmap_entry_put(mmap_entry);
return ret;
}
DECLARE_UVERBS_NAMED_METHOD(
UVERBS_METHOD_DMABUF_ALLOC,
UVERBS_ATTR_FD(UVERBS_ATTR_ALLOC_DMABUF_HANDLE,
UVERBS_OBJECT_DMABUF,
UVERBS_ACCESS_NEW,
UA_MANDATORY),
UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMABUF_PGOFF,
UVERBS_ATTR_TYPE(u64),
UA_MANDATORY));
static void uverbs_dmabuf_fd_destroy_uobj(struct ib_uobject *uobj,
enum rdma_remove_reason why)
{
struct ib_uverbs_dmabuf_file *uverbs_dmabuf =
container_of(uobj, struct ib_uverbs_dmabuf_file, uobj);
bool wait_for_comp = false;
mutex_lock(&uverbs_dmabuf->mmap_entry->dmabufs_lock);
dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL);
if (!uverbs_dmabuf->revoked) {
uverbs_dmabuf->revoked = true;
list_del(&uverbs_dmabuf->dmabufs_elm);
dma_buf_move_notify(uverbs_dmabuf->dmabuf);
dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv,
DMA_RESV_USAGE_BOOKKEEP, false,
MAX_SCHEDULE_TIMEOUT);
wait_for_comp = true;
}
dma_resv_unlock(uverbs_dmabuf->dmabuf->resv);
if (wait_for_comp) {
kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done);
/* Let's wait till all DMA unmap are completed. */
wait_for_completion(&uverbs_dmabuf->comp);
}
mutex_unlock(&uverbs_dmabuf->mmap_entry->dmabufs_lock);
/* Matches the get done as part of pgoff_to_mmap_entry() */
rdma_user_mmap_entry_put(uverbs_dmabuf->mmap_entry);
}
DECLARE_UVERBS_NAMED_OBJECT(
UVERBS_OBJECT_DMABUF,
UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_dmabuf_file),
uverbs_dmabuf_fd_destroy_uobj,
NULL, NULL, O_RDONLY),
&UVERBS_METHOD(UVERBS_METHOD_DMABUF_ALLOC));
const struct uapi_definition uverbs_def_obj_dmabuf[] = {
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMABUF),
UAPI_DEF_OBJ_NEEDS_FN(mmap_get_pfns),
UAPI_DEF_OBJ_NEEDS_FN(pgoff_to_mmap_entry),
{}
};


@ -631,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = {
UAPI_DEF_CHAIN(uverbs_def_obj_cq),
UAPI_DEF_CHAIN(uverbs_def_obj_device),
UAPI_DEF_CHAIN(uverbs_def_obj_dm),
UAPI_DEF_CHAIN(uverbs_def_obj_dmabuf),
UAPI_DEF_CHAIN(uverbs_def_obj_dmah),
UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
UAPI_DEF_CHAIN(uverbs_def_obj_intf),


@ -78,6 +78,7 @@ static const char * const ib_events[] = {
[IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached",
[IB_EVENT_CLIENT_REREGISTER] = "client reregister",
[IB_EVENT_GID_CHANGE] = "GID changed",
[IB_EVENT_DEVICE_SPEED_CHANGE] = "device speed change"
};
const char *__attribute_const__ ib_event_msg(enum ib_event_type event)
@ -216,6 +217,57 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
}
EXPORT_SYMBOL(ib_rate_to_mbps);
struct ib_speed_attr {
const char *str;
int speed;
};
#define IB_SPEED_ATTR(speed_type, _str, _speed) \
[speed_type] = {.str = _str, .speed = _speed}
static const struct ib_speed_attr ib_speed_attrs[] = {
IB_SPEED_ATTR(IB_SPEED_SDR, " SDR", 25),
IB_SPEED_ATTR(IB_SPEED_DDR, " DDR", 50),
IB_SPEED_ATTR(IB_SPEED_QDR, " QDR", 100),
IB_SPEED_ATTR(IB_SPEED_FDR10, " FDR10", 100),
IB_SPEED_ATTR(IB_SPEED_FDR, " FDR", 140),
IB_SPEED_ATTR(IB_SPEED_EDR, " EDR", 250),
IB_SPEED_ATTR(IB_SPEED_HDR, " HDR", 500),
IB_SPEED_ATTR(IB_SPEED_NDR, " NDR", 1000),
IB_SPEED_ATTR(IB_SPEED_XDR, " XDR", 2000),
};
int ib_port_attr_to_speed_info(struct ib_port_attr *attr,
struct ib_port_speed_info *speed_info)
{
int speed_idx = attr->active_speed;
switch (attr->active_speed) {
case IB_SPEED_DDR:
case IB_SPEED_QDR:
case IB_SPEED_FDR10:
case IB_SPEED_FDR:
case IB_SPEED_EDR:
case IB_SPEED_HDR:
case IB_SPEED_NDR:
case IB_SPEED_XDR:
case IB_SPEED_SDR:
break;
default:
speed_idx = IB_SPEED_SDR; /* Default to SDR for invalid rates */
break;
}
speed_info->str = ib_speed_attrs[speed_idx].str;
speed_info->rate = ib_speed_attrs[speed_idx].speed;
speed_info->rate *= ib_width_enum_to_int(attr->active_width);
if (speed_info->rate < 0)
return -EINVAL;
return 0;
}
EXPORT_SYMBOL(ib_port_attr_to_speed_info);
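/*
 * With this helper the sysfs rate_show() shown earlier prints, for
 * example (rate is in deci-Gb/sec per lane before the width multiply):
 *
 *	EDR (250) on a 4X link -> rate = 1000 -> "100 Gb/sec (4X EDR)"
 *	SDR (25)  on a 1X link -> rate = 25   -> "2.5 Gb/sec (1X SDR)"
 */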
__attribute_const__ enum rdma_transport_type
rdma_node_get_transport(unsigned int node_type)
{
@ -1485,7 +1537,8 @@ static const struct {
IB_QP_PKEY_INDEX),
[IB_QPT_RC] = (IB_QP_ALT_PATH |
IB_QP_ACCESS_FLAGS |
IB_QP_PKEY_INDEX),
IB_QP_PKEY_INDEX |
IB_QP_RATE_LIMIT),
[IB_QPT_XRC_INI] = (IB_QP_ALT_PATH |
IB_QP_ACCESS_FLAGS |
IB_QP_PKEY_INDEX),
@ -1533,7 +1586,8 @@ static const struct {
IB_QP_ALT_PATH |
IB_QP_ACCESS_FLAGS |
IB_QP_MIN_RNR_TIMER |
IB_QP_PATH_MIG_STATE),
IB_QP_PATH_MIG_STATE |
IB_QP_RATE_LIMIT),
[IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
IB_QP_ALT_PATH |
IB_QP_ACCESS_FLAGS |
@ -1567,7 +1621,8 @@ static const struct {
IB_QP_ACCESS_FLAGS |
IB_QP_ALT_PATH |
IB_QP_PATH_MIG_STATE |
IB_QP_MIN_RNR_TIMER),
IB_QP_MIN_RNR_TIMER |
IB_QP_RATE_LIMIT),
[IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
IB_QP_ACCESS_FLAGS |
IB_QP_ALT_PATH |


@ -87,25 +87,35 @@ static ssize_t qp_info_read(struct file *filep,
size_t count, loff_t *ppos)
{
struct bnxt_re_qp *qp = filep->private_data;
struct bnxt_qplib_qp *qplib_qp;
u32 rate_limit = 0;
char *buf;
int len;
if (*ppos)
return 0;
qplib_qp = &qp->qplib_qp;
if (qplib_qp->shaper_allocation_status)
rate_limit = qplib_qp->rate_limit;
buf = kasprintf(GFP_KERNEL,
"QPN\t\t: %d\n"
"transport\t: %s\n"
"state\t\t: %s\n"
"mtu\t\t: %d\n"
"timeout\t\t: %d\n"
"remote QPN\t: %d\n",
"remote QPN\t: %d\n"
"shaper allocated : %d\n"
"rate limit\t: %d kbps\n",
qp->qplib_qp.id,
bnxt_re_qp_type_str(qp->qplib_qp.type),
bnxt_re_qp_state_str(qp->qplib_qp.state),
qp->qplib_qp.mtu,
qp->qplib_qp.timeout,
qp->qplib_qp.dest_qpn);
qp->qplib_qp.dest_qpn,
qplib_qp->shaper_allocation_status,
rate_limit);
if (!buf)
return -ENOMEM;
if (count < strlen(buf)) {


@ -186,6 +186,9 @@ int bnxt_re_query_device(struct ib_device *ibdev,
{
struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr;
struct bnxt_re_query_device_ex_resp resp = {};
size_t outlen = (udata) ? udata->outlen : 0;
int rc = 0;
memset(ib_attr, 0, sizeof(*ib_attr));
memcpy(&ib_attr->fw_ver, dev_attr->fw_ver,
@ -250,7 +253,21 @@ int bnxt_re_query_device(struct ib_device *ibdev,
ib_attr->max_pkeys = 1;
ib_attr->local_ca_ack_delay = BNXT_RE_DEFAULT_ACK_DELAY;
return 0;
if ((offsetofend(typeof(resp), packet_pacing_caps) <= outlen) &&
_is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2)) {
resp.packet_pacing_caps.qp_rate_limit_min =
dev_attr->rate_limit_min;
resp.packet_pacing_caps.qp_rate_limit_max =
dev_attr->rate_limit_max;
resp.packet_pacing_caps.supported_qpts =
1 << IB_QPT_RC;
}
if (outlen)
rc = ib_copy_to_udata(udata, &resp,
min(sizeof(resp), outlen));
return rc;
}
int bnxt_re_modify_device(struct ib_device *ibdev,
@ -2089,10 +2106,11 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
unsigned int flags;
u8 nw_type;
if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
if (qp_attr_mask & ~(IB_QP_ATTR_STANDARD_BITS | IB_QP_RATE_LIMIT))
return -EOPNOTSUPP;
qp->qplib_qp.modify_flags = 0;
qp->qplib_qp.ext_modify_flags = 0;
if (qp_attr_mask & IB_QP_STATE) {
curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state);
new_qp_state = qp_attr->qp_state;
@ -2129,6 +2147,15 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
bnxt_re_unlock_cqs(qp, flags);
}
}
if (qp_attr_mask & IB_QP_RATE_LIMIT) {
if (qp->qplib_qp.type != IB_QPT_RC ||
!_is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2))
return -EOPNOTSUPP;
qp->qplib_qp.ext_modify_flags |=
CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID;
qp->qplib_qp.rate_limit = qp_attr->rate_limit;
}
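	/*
	 * Caller-side sketch (illustrative, not from this patch): request
	 * pacing on an RC QP, assuming the kbps unit reported by the
	 * debugfs file above and a value inside the advertised
	 * qp_rate_limit_min/max window:
	 *
	 *	struct ib_qp_attr attr = { .rate_limit = 100000 };
	 *	ret = ib_modify_qp(qp, &attr, IB_QP_RATE_LIMIT);
	 */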
if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
qp->qplib_qp.modify_flags |=
CMDQ_MODIFY_QP_MODIFY_MASK_EN_SQD_ASYNC_NOTIFY;
@ -4386,6 +4413,9 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2))
resp.comp_mask |= BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED;
if (_is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2))
resp.comp_mask |= BNXT_RE_UCNTX_CMASK_QP_RATE_LIMIT_ENABLED;
if (udata->inlen >= sizeof(ureq)) {
rc = ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq)));
if (rc)


@ -1313,8 +1313,8 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
struct bnxt_qplib_cmdqmsg msg = {};
struct cmdq_modify_qp req = {};
u16 vlan_pcp_vlan_dei_vlan_id;
u32 bmask, bmask_ext;
u32 temp32[4];
u32 bmask;
int rc;
bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req,
@ -1329,9 +1329,16 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
is_optimized_state_transition(qp))
bnxt_set_mandatory_attributes(res, qp, &req);
}
bmask = qp->modify_flags;
req.modify_mask = cpu_to_le32(qp->modify_flags);
bmask_ext = qp->ext_modify_flags;
req.ext_modify_mask = cpu_to_le32(qp->ext_modify_flags);
req.qp_cid = cpu_to_le32(qp->id);
if (bmask_ext & CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID)
req.rate_limit = cpu_to_le32(qp->rate_limit);
if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_STATE) {
req.network_type_en_sqd_async_notify_new_state =
(qp->state & CMDQ_MODIFY_QP_NEW_STATE_MASK) |
@ -1429,6 +1436,9 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
rc = bnxt_qplib_rcfw_send_message(rcfw, &msg);
if (rc)
return rc;
if (bmask_ext & CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID)
qp->shaper_allocation_status = resp.shaper_allocation_status;
qp->cur_qp_state = qp->state;
return 0;
}


@ -280,6 +280,7 @@ struct bnxt_qplib_qp {
u8 state;
u8 cur_qp_state;
u64 modify_flags;
u32 ext_modify_flags;
u32 max_inline_data;
u32 mtu;
u8 path_mtu;
@ -346,6 +347,8 @@ struct bnxt_qplib_qp {
bool is_host_msn_tbl;
u8 tos_dscp;
u32 ugid_index;
u32 rate_limit;
u8 shaper_allocation_status;
};
#define BNXT_RE_MAX_MSG_SIZE 0x80000000


@ -623,4 +623,10 @@ static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2)
return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED);
}
static inline bool _is_modify_qp_rate_limit_supported(u16 dev_cap_ext_flags2)
{
return dev_cap_ext_flags2 &
CREQ_QUERY_FUNC_RESP_SB_MODIFY_QP_RATE_LIMIT_SUPPORTED;
}
#endif /* __BNXT_QPLIB_RES_H__ */


@ -193,6 +193,11 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw)
attr->max_dpi = le32_to_cpu(sb->max_dpi);
attr->is_atomic = bnxt_qplib_is_atomic_cap(rcfw);
if (_is_modify_qp_rate_limit_supported(attr->dev_cap_flags2)) {
attr->rate_limit_min = le16_to_cpu(sb->rate_limit_min);
attr->rate_limit_max = le32_to_cpu(sb->rate_limit_max);
}
bail:
dma_free_coherent(&rcfw->pdev->dev, sbuf.size,
sbuf.sb, sbuf.dma_addr);


@ -76,6 +76,8 @@ struct bnxt_qplib_dev_attr {
u16 dev_cap_flags;
u16 dev_cap_flags2;
u32 max_dpi;
u16 rate_limit_min;
u32 rate_limit_max;
};
struct bnxt_qplib_pd {


@ -690,10 +690,11 @@ struct cmdq_modify_qp {
__le32 ext_modify_mask;
#define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_EXT_STATS_CTX 0x1UL
#define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_SCHQ_ID_VALID 0x2UL
#define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID 0x8UL
__le32 ext_stats_ctx_id;
__le16 schq_id;
__le16 unused_0;
__le32 reserved32;
__le32 rate_limit;
};
/* creq_modify_qp_resp (size:128b/16B) */
@ -716,7 +717,8 @@ struct creq_modify_qp_resp {
#define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_INDEX_MASK 0xeUL
#define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_INDEX_SFT 1
#define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_STATE 0x10UL
u8 reserved8;
u8 shaper_allocation_status;
#define CREQ_MODIFY_QP_RESP_SHAPER_ALLOCATED 0x1UL
__le32 lag_src_mac;
};
@ -2179,7 +2181,7 @@ struct creq_query_func_resp {
u8 reserved48[6];
};
/* creq_query_func_resp_sb (size:1088b/136B) */
/* creq_query_func_resp_sb (size:1280b/160B) */
struct creq_query_func_resp_sb {
u8 opcode;
#define CREQ_QUERY_FUNC_RESP_SB_OPCODE_QUERY_FUNC 0x83UL
@ -2256,12 +2258,15 @@ struct creq_query_func_resp_sb {
#define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \
CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE
#define CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED 0x40UL
#define CREQ_QUERY_FUNC_RESP_SB_MODIFY_QP_RATE_LIMIT_SUPPORTED 0x400UL
#define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL
__le16 max_xp_qp_size;
__le16 create_qp_batch_size;
__le16 destroy_qp_batch_size;
__le16 max_srq_ext;
__le64 reserved64;
__le16 reserved16;
__le16 rate_limit_min;
__le32 rate_limit_max;
};
/* cmdq_set_func_resources (size:448b/56B) */


@ -3,6 +3,8 @@
* Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved.
*/
#include <linux/log2.h>
#include "efa_com.h"
#include "efa_regs_defs.h"
@ -21,6 +23,8 @@
#define EFA_CTRL_SUB_MINOR 1
enum efa_cmd_status {
EFA_CMD_UNUSED,
EFA_CMD_ALLOCATED,
EFA_CMD_SUBMITTED,
EFA_CMD_COMPLETED,
};
@ -32,7 +36,6 @@ struct efa_comp_ctx {
enum efa_cmd_status status;
u16 cmd_id;
u8 cmd_opcode;
u8 occupied;
};
static const char *efa_com_cmd_str(u8 cmd)
@ -241,7 +244,6 @@ static int efa_com_admin_init_aenq(struct efa_com_dev *edev,
return 0;
}
/* ID to be used with efa_com_get_comp_ctx */
static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq)
{
u16 ctx_id;
@ -263,36 +265,47 @@ static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq,
spin_unlock(&aq->comp_ctx_lock);
}
static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq,
struct efa_comp_ctx *comp_ctx)
static struct efa_comp_ctx *efa_com_alloc_comp_ctx(struct efa_com_admin_queue *aq)
{
u16 cmd_id = EFA_GET(&comp_ctx->user_cqe->acq_common_descriptor.command,
EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID);
u16 ctx_id = cmd_id & (aq->depth - 1);
struct efa_comp_ctx *comp_ctx;
u16 ctx_id;
ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id);
comp_ctx->occupied = 0;
efa_com_dealloc_ctx_id(aq, ctx_id);
}
ctx_id = efa_com_alloc_ctx_id(aq);
static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq,
u16 cmd_id, bool capture)
{
u16 ctx_id = cmd_id & (aq->depth - 1);
if (aq->comp_ctx[ctx_id].occupied && capture) {
ibdev_err_ratelimited(
aq->efa_dev,
"Completion context for command_id %#x is occupied\n",
cmd_id);
comp_ctx = &aq->comp_ctx[ctx_id];
if (comp_ctx->status != EFA_CMD_UNUSED) {
efa_com_dealloc_ctx_id(aq, ctx_id);
ibdev_err_ratelimited(aq->efa_dev,
"Completion context[%u] is used[%u]\n",
ctx_id, comp_ctx->status);
return NULL;
}
if (capture) {
aq->comp_ctx[ctx_id].occupied = 1;
ibdev_dbg(aq->efa_dev,
"Take completion ctxt for command_id %#x\n", cmd_id);
}
comp_ctx->status = EFA_CMD_ALLOCATED;
ibdev_dbg(aq->efa_dev, "Take completion context[%u]\n", ctx_id);
return comp_ctx;
}
static inline u16 efa_com_get_comp_ctx_id(struct efa_com_admin_queue *aq,
struct efa_comp_ctx *comp_ctx)
{
return comp_ctx - aq->comp_ctx;
}
static inline void efa_com_dealloc_comp_ctx(struct efa_com_admin_queue *aq,
struct efa_comp_ctx *comp_ctx)
{
u16 ctx_id = efa_com_get_comp_ctx_id(aq, comp_ctx);
ibdev_dbg(aq->efa_dev, "Put completion context[%u]\n", ctx_id);
comp_ctx->status = EFA_CMD_UNUSED;
efa_com_dealloc_ctx_id(aq, ctx_id);
}
static inline struct efa_comp_ctx *efa_com_get_comp_ctx_by_cmd_id(struct efa_com_admin_queue *aq,
u16 cmd_id)
{
u16 ctx_id = cmd_id & (aq->depth - 1);
return &aq->comp_ctx[ctx_id];
}
@ -310,26 +323,23 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu
u16 ctx_id;
u16 pi;
comp_ctx = efa_com_alloc_comp_ctx(aq);
if (!comp_ctx)
return ERR_PTR(-EINVAL);
queue_size_mask = aq->depth - 1;
pi = aq->sq.pc & queue_size_mask;
ctx_id = efa_com_alloc_ctx_id(aq);
ctx_id = efa_com_get_comp_ctx_id(aq, comp_ctx);
/* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */
cmd_id = ctx_id & queue_size_mask;
cmd_id |= aq->sq.pc & ~queue_size_mask;
cmd_id |= aq->sq.pc << ilog2(aq->depth);
cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK;
cmd->aq_common_descriptor.command_id = cmd_id;
EFA_SET(&cmd->aq_common_descriptor.flags,
EFA_ADMIN_AQ_COMMON_DESC_PHASE, aq->sq.phase);
comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true);
if (!comp_ctx) {
efa_com_dealloc_ctx_id(aq, ctx_id);
return ERR_PTR(-EINVAL);
}
comp_ctx->status = EFA_CMD_SUBMITTED;
comp_ctx->comp_size = comp_size_in_bytes;
comp_ctx->user_cqe = comp;
@ -370,9 +380,9 @@ static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq)
}
for (i = 0; i < aq->depth; i++) {
comp_ctx = efa_com_get_comp_ctx(aq, i, false);
if (comp_ctx)
init_completion(&comp_ctx->wait_event);
comp_ctx = &aq->comp_ctx[i];
comp_ctx->status = EFA_CMD_UNUSED;
init_completion(&comp_ctx->wait_event);
aq->comp_ctx_pool[i] = i;
}
@ -417,11 +427,12 @@ static int efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq
cmd_id = EFA_GET(&cqe->acq_common_descriptor.command,
EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID);
comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false);
if (comp_ctx->status != EFA_CMD_SUBMITTED) {
comp_ctx = efa_com_get_comp_ctx_by_cmd_id(aq, cmd_id);
if (comp_ctx->status != EFA_CMD_SUBMITTED || comp_ctx->cmd_id != cmd_id) {
ibdev_err(aq->efa_dev,
"Received completion with unexpected command id[%d], sq producer: %d, sq consumer: %d, cq consumer: %d\n",
cmd_id, aq->sq.pc, aq->sq.cc, aq->cq.cc);
"Received completion with unexpected command id[%x], status[%d] sq producer[%d], sq consumer[%d], cq consumer[%d]\n",
cmd_id, comp_ctx->status, aq->sq.pc, aq->sq.cc,
aq->cq.cc);
return -EINVAL;
}
@ -530,7 +541,7 @@ static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_c
err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status);
out:
efa_com_put_comp_ctx(aq, comp_ctx);
efa_com_dealloc_comp_ctx(aq, comp_ctx);
return err;
}
@ -580,7 +591,7 @@ static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *com
err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status);
out:
efa_com_put_comp_ctx(aq, comp_ctx);
efa_com_dealloc_comp_ctx(aq, comp_ctx);
return err;
}
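For illustration only (not from the patch): a minimal userspace-style sketch of the command-id packing used above, assuming a power-of-two queue depth; the width of the command-id field is an assumption here. The low bits carry the context slot, the high bits carry producer-counter entropy, which is what lets the completion path reject a stale completion whose recorded cmd_id no longer matches.

#include <stdint.h>

/* Stand-in for EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; 16 bits is an assumption. */
#define CMD_ID_MASK 0xffffu

static uint16_t pack_cmd_id(uint16_t ctx_id, uint32_t producer_counter, uint32_t depth)
{
	uint32_t cmd_id;

	cmd_id = ctx_id & (depth - 1);                      /* LSBs: context slot */
	cmd_id |= producer_counter << __builtin_ctz(depth); /* MSBs: producer-counter entropy
							     * (ctz == ilog2 for a power of two) */
	return cmd_id & CMD_ID_MASK;
}

static uint16_t ctx_id_from_cmd_id(uint16_t cmd_id, uint32_t depth)
{
	return cmd_id & (depth - 1); /* the lookup the completion handler performs */
}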


@ -60,7 +60,7 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
u8 tclass = get_tclass(grh);
u8 priority = 0;
u8 tc_mode = 0;
int ret;
int ret = 0;
if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) {
ret = -EOPNOTSUPP;
@ -77,19 +77,18 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
ah->av.flowlabel = grh->flow_label;
ah->av.udp_sport = get_ah_udp_sport(ah_attr);
ah->av.tclass = tclass;
ah->av.sl = rdma_ah_get_sl(ah_attr);
ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority);
if (ret == -EOPNOTSUPP)
ret = 0;
if (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority);
if (ret == -EOPNOTSUPP)
ret = 0;
else if (ret)
goto err_out;
if (ret && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
goto err_out;
if (tc_mode == HNAE3_TC_MAP_MODE_DSCP &&
grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
ah->av.sl = priority;
else
ah->av.sl = rdma_ah_get_sl(ah_attr);
if (tc_mode == HNAE3_TC_MAP_MODE_DSCP)
ah->av.sl = priority;
}
if (!check_sl_valid(hr_dev, ah->av.sl)) {
ret = -EINVAL;


@ -55,7 +55,7 @@ void hns_roce_get_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx)
{
struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device);
struct hns_roce_cq_table *cq_table = &hr_dev->cq_table;
u32 least_load = cq_table->ctx_num[0];
u32 least_load = U32_MAX;
u8 bankid = 0;
u8 i;
@ -63,7 +63,10 @@ void hns_roce_get_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx)
return;
mutex_lock(&cq_table->bank_mutex);
for (i = 1; i < HNS_ROCE_CQ_BANK_NUM; i++) {
for (i = 0; i < HNS_ROCE_CQ_BANK_NUM; i++) {
if (!(cq_table->valid_cq_bank_mask & BIT(i)))
continue;
if (cq_table->ctx_num[i] < least_load) {
least_load = cq_table->ctx_num[i];
bankid = i;
@ -581,6 +584,11 @@ void hns_roce_init_cq_table(struct hns_roce_dev *hr_dev)
cq_table->bank[i].max = hr_dev->caps.num_cqs /
HNS_ROCE_CQ_BANK_NUM - 1;
}
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK)
cq_table->valid_cq_bank_mask = VALID_CQ_BANK_MASK_LIMIT;
else
cq_table->valid_cq_bank_mask = VALID_CQ_BANK_MASK_DEFAULT;
}
void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev)


@ -103,6 +103,10 @@
#define CQ_BANKID_SHIFT 2
#define CQ_BANKID_MASK GENMASK(1, 0)
#define VALID_CQ_BANK_MASK_DEFAULT 0xF
#define VALID_CQ_BANK_MASK_LIMIT 0x9
#define VALID_EXT_SGE_QP_BANK_MASK_LIMIT 0x42
#define HNS_ROCE_MAX_CQ_COUNT 0xFFFF
#define HNS_ROCE_MAX_CQ_PERIOD 0xFFFF
@ -156,6 +160,7 @@ enum {
HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19),
HNS_ROCE_CAP_FLAG_BOND = BIT(21),
HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22),
HNS_ROCE_CAP_FLAG_LIMIT_BANK = BIT(23),
};
#define HNS_ROCE_DB_TYPE_COUNT 2
@ -500,6 +505,7 @@ struct hns_roce_cq_table {
struct hns_roce_bank bank[HNS_ROCE_CQ_BANK_NUM];
struct mutex bank_mutex;
u32 ctx_num[HNS_ROCE_CQ_BANK_NUM];
u8 valid_cq_bank_mask;
};
struct hns_roce_srq_table {


@ -876,6 +876,170 @@ out:
return ret;
}
static int hns_roce_push_drain_wr(struct hns_roce_wq *wq, struct ib_cq *cq,
u64 wr_id)
{
unsigned long flags;
int ret = 0;
spin_lock_irqsave(&wq->lock, flags);
if (hns_roce_wq_overflow(wq, 1, cq)) {
ret = -ENOMEM;
goto out;
}
wq->wrid[wq->head & (wq->wqe_cnt - 1)] = wr_id;
wq->head++;
out:
spin_unlock_irqrestore(&wq->lock, flags);
return ret;
}
struct hns_roce_drain_cqe {
struct ib_cqe cqe;
struct completion done;
};
static void hns_roce_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct hns_roce_drain_cqe *cqe = container_of(wc->wr_cqe,
struct hns_roce_drain_cqe,
cqe);
complete(&cqe->done);
}
static void handle_drain_completion(struct ib_cq *ibcq,
struct hns_roce_drain_cqe *drain,
struct hns_roce_dev *hr_dev)
{
#define TIMEOUT (HZ / 10)
struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
unsigned long flags;
bool triggered;
if (ibcq->poll_ctx == IB_POLL_DIRECT) {
while (wait_for_completion_timeout(&drain->done, TIMEOUT) <= 0)
ib_process_cq_direct(ibcq, -1);
return;
}
if (hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN)
goto waiting_done;
spin_lock_irqsave(&hr_cq->lock, flags);
triggered = hr_cq->is_armed;
hr_cq->is_armed = 1;
spin_unlock_irqrestore(&hr_cq->lock, flags);
/* Triggered means this cq is processing or has been processed
* by hns_roce_handle_device_err() or this function. We need to
* cancel the already invoked comp_handler() to avoid concurrency.
* If it has not been triggered, we can directly invoke
* comp_handler().
*/
if (triggered) {
switch (ibcq->poll_ctx) {
case IB_POLL_SOFTIRQ:
irq_poll_disable(&ibcq->iop);
irq_poll_enable(&ibcq->iop);
break;
case IB_POLL_WORKQUEUE:
case IB_POLL_UNBOUND_WORKQUEUE:
cancel_work_sync(&ibcq->work);
break;
default:
WARN_ON_ONCE(1);
}
}
if (ibcq->comp_handler)
ibcq->comp_handler(ibcq, ibcq->cq_context);
waiting_done:
if (ibcq->comp_handler)
wait_for_completion(&drain->done);
}
static void hns_roce_v2_drain_rq(struct ib_qp *ibqp)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
struct hns_roce_drain_cqe rdrain = {};
const struct ib_recv_wr *bad_rwr;
struct ib_cq *cq = ibqp->recv_cq;
struct ib_recv_wr rwr = {};
int ret;
ret = ib_modify_qp(ibqp, &attr, IB_QP_STATE);
if (ret && hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) {
ibdev_err_ratelimited(&hr_dev->ib_dev,
"failed to modify qp during drain rq, ret = %d.\n",
ret);
return;
}
rwr.wr_cqe = &rdrain.cqe;
rdrain.cqe.done = hns_roce_drain_qp_done;
init_completion(&rdrain.done);
if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)
ret = hns_roce_push_drain_wr(&hr_qp->rq, cq, rwr.wr_id);
else
ret = hns_roce_v2_post_recv(ibqp, &rwr, &bad_rwr);
if (ret) {
ibdev_err_ratelimited(&hr_dev->ib_dev,
"failed to post recv for drain rq, ret = %d.\n",
ret);
return;
}
handle_drain_completion(cq, &rdrain, hr_dev);
}
static void hns_roce_v2_drain_sq(struct ib_qp *ibqp)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
struct hns_roce_drain_cqe sdrain = {};
const struct ib_send_wr *bad_swr;
struct ib_cq *cq = ibqp->send_cq;
struct ib_rdma_wr swr = {
.wr = {
.next = NULL,
{ .wr_cqe = &sdrain.cqe, },
.opcode = IB_WR_RDMA_WRITE,
},
};
int ret;
ret = ib_modify_qp(ibqp, &attr, IB_QP_STATE);
if (ret && hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) {
ibdev_err_ratelimited(&hr_dev->ib_dev,
"failed to modify qp during drain sq, ret = %d.\n",
ret);
return;
}
sdrain.cqe.done = hns_roce_drain_qp_done;
init_completion(&sdrain.done);
if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)
ret = hns_roce_push_drain_wr(&hr_qp->sq, cq, swr.wr.wr_id);
else
ret = hns_roce_v2_post_send(ibqp, &swr.wr, &bad_swr);
if (ret) {
ibdev_err_ratelimited(&hr_dev->ib_dev,
"failed to post send for drain sq, ret = %d.\n",
ret);
return;
}
handle_drain_completion(cq, &sdrain, hr_dev);
}
static void *get_srq_wqe_buf(struct hns_roce_srq *srq, u32 n)
{
return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift);
@ -3739,6 +3903,23 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev,
HNS_ROCE_V2_CQ_DEFAULT_INTERVAL);
}
static bool left_sw_wc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
{
struct hns_roce_qp *hr_qp;
list_for_each_entry(hr_qp, &hr_cq->sq_list, sq_node) {
if (hr_qp->sq.head != hr_qp->sq.tail)
return true;
}
list_for_each_entry(hr_qp, &hr_cq->rq_list, rq_node) {
if (hr_qp->rq.head != hr_qp->rq.tail)
return true;
}
return false;
}
static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq,
enum ib_cq_notify_flags flags)
{
@ -3747,6 +3928,12 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq,
struct hns_roce_v2_db cq_db = {};
u32 notify_flag;
if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) {
if ((flags & IB_CQ_REPORT_MISSED_EVENTS) &&
left_sw_wc(hr_dev, hr_cq))
return 1;
return 0;
}
/*
* flags = 0, then notify_flag : next
* flags = 1, then notify_flag : solicited
@ -5053,20 +5240,22 @@ static int hns_roce_set_sl(struct ib_qp *ibqp,
struct ib_device *ibdev = &hr_dev->ib_dev;
int ret;
ret = hns_roce_hw_v2_get_dscp(hr_dev, get_tclass(&attr->ah_attr.grh),
&hr_qp->tc_mode, &hr_qp->priority);
if (ret && ret != -EOPNOTSUPP &&
grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
ibdev_err_ratelimited(ibdev,
"failed to get dscp, ret = %d.\n", ret);
return ret;
}
hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP &&
grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
hr_qp->sl = hr_qp->priority;
else
hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
if (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
ret = hns_roce_hw_v2_get_dscp(hr_dev,
get_tclass(&attr->ah_attr.grh),
&hr_qp->tc_mode, &hr_qp->priority);
if (ret && ret != -EOPNOTSUPP) {
ibdev_err_ratelimited(ibdev,
"failed to get dscp, ret = %d.\n",
ret);
return ret;
}
if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP)
hr_qp->sl = hr_qp->priority;
}
if (!check_sl_valid(hr_dev, hr_qp->sl))
return -EINVAL;
@ -6956,7 +7145,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work);
hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0);
hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq",
WQ_MEM_RECLAIM);
if (!hr_dev->irq_workq) {
dev_err(dev, "failed to create irq workqueue.\n");
ret = -ENOMEM;
@ -7014,6 +7204,8 @@ static const struct ib_device_ops hns_roce_v2_dev_ops = {
.post_send = hns_roce_v2_post_send,
.query_qp = hns_roce_v2_query_qp,
.req_notify_cq = hns_roce_v2_req_notify_cq,
.drain_rq = hns_roce_v2_drain_rq,
.drain_sq = hns_roce_v2_drain_sq,
};
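Not part of the diff: the caller-side view, sketched under the usual verbs contract. Once .drain_sq/.drain_rq are registered above, the core ib_drain_qp() helper dispatches to these provider callbacks instead of its generic drain path, so an hns ULP keeps using the standard teardown sequence.

#include <rdma/ib_verbs.h>

static void ulp_teardown_qp(struct ib_qp *qp)
{
	ib_drain_qp(qp);	/* flushes SQ and RQ, waiting on the drain CQEs posted above */
	ib_destroy_qp(qp);
}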
static const struct ib_device_ops hns_roce_v2_dev_srq_ops = {


@ -259,6 +259,11 @@ static int hns_roce_query_device(struct ib_device *ib_dev,
props->max_srq_sge = hr_dev->caps.max_srq_sges;
}
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) {
props->max_cq >>= 1;
props->max_qp >>= 1;
}
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR &&
hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;


@ -197,22 +197,16 @@ static u8 get_affinity_cq_bank(u8 qp_bank)
return (qp_bank >> 1) & CQ_BANKID_MASK;
}
static u8 get_least_load_bankid_for_qp(struct ib_qp_init_attr *init_attr,
struct hns_roce_bank *bank)
static u8 get_least_load_bankid_for_qp(struct hns_roce_bank *bank, u8 valid_qp_bank_mask)
{
#define INVALID_LOAD_QPNUM 0xFFFFFFFF
struct ib_cq *scq = init_attr->send_cq;
u32 least_load = INVALID_LOAD_QPNUM;
unsigned long cqn = 0;
u8 bankid = 0;
u32 bankcnt;
u8 i;
if (scq)
cqn = to_hr_cq(scq)->cqn;
for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) {
if (scq && (get_affinity_cq_bank(i) != (cqn & CQ_BANKID_MASK)))
if (!(valid_qp_bank_mask & BIT(i)))
continue;
bankcnt = bank[i].inuse;
@ -246,6 +240,42 @@ static int alloc_qpn_with_bankid(struct hns_roce_bank *bank, u8 bankid,
return 0;
}
static bool use_ext_sge(struct ib_qp_init_attr *init_attr)
{
return init_attr->cap.max_send_sge > HNS_ROCE_SGE_IN_WQE ||
init_attr->qp_type == IB_QPT_UD ||
init_attr->qp_type == IB_QPT_GSI;
}
static u8 select_qp_bankid(struct hns_roce_dev *hr_dev,
struct ib_qp_init_attr *init_attr)
{
struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
struct hns_roce_bank *bank = qp_table->bank;
struct ib_cq *scq = init_attr->send_cq;
u8 valid_qp_bank_mask = 0;
unsigned long cqn = 0;
u8 i;
if (scq)
cqn = to_hr_cq(scq)->cqn;
for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) {
if (scq && (get_affinity_cq_bank(i) != (cqn & CQ_BANKID_MASK)))
continue;
if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) &&
use_ext_sge(init_attr) &&
!(VALID_EXT_SGE_QP_BANK_MASK_LIMIT & BIT(i)))
continue;
valid_qp_bank_mask |= BIT(i);
}
return get_least_load_bankid_for_qp(bank, valid_qp_bank_mask);
}
static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
struct ib_qp_init_attr *init_attr)
{
@ -258,8 +288,7 @@ static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
num = 1;
} else {
mutex_lock(&qp_table->bank_mutex);
bankid = get_least_load_bankid_for_qp(init_attr, qp_table->bank);
bankid = select_qp_bankid(hr_dev, init_attr);
ret = alloc_qpn_with_bankid(&qp_table->bank[bankid], bankid,
&num);
if (ret) {


@ -51,7 +51,7 @@ int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq)
ret = hr_dev->hw->query_cqc(hr_dev, hr_cq->cqn, &context);
if (ret)
return -EINVAL;
return ret;
ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context);
@ -177,7 +177,7 @@ int hns_roce_fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ib_mr)
ret = hr_dev->hw->query_mpt(hr_dev, hr_mr->key, &context);
if (ret)
return -EINVAL;
return ret;
ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context);


@ -2886,15 +2886,6 @@ static int irdma_sc_resume_qp(struct irdma_sc_cqp *cqp, struct irdma_sc_qp *qp,
return 0;
}
/**
* irdma_sc_cq_ack - acknowledge completion q
* @cq: cq struct
*/
static inline void irdma_sc_cq_ack(struct irdma_sc_cq *cq)
{
writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db);
}
/**
* irdma_sc_cq_init - initialize completion q
* @cq: cq struct
@ -2956,7 +2947,7 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch,
return -ENOMEM;
set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
set_64bit_val(wqe, 8, (uintptr_t)cq >> 1);
set_64bit_val(wqe, 8, cq->cq_uk.cq_id);
set_64bit_val(wqe, 16,
FIELD_PREP(IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD, cq->shadow_read_threshold));
set_64bit_val(wqe, 32, (cq->virtual_map ? 0 : cq->cq_pa));
@ -3013,7 +3004,7 @@ int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq)
return -ENOMEM;
set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
set_64bit_val(wqe, 8, (uintptr_t)cq >> 1);
set_64bit_val(wqe, 8, cq->cq_uk.cq_id);
set_64bit_val(wqe, 40, cq->shadow_area_pa);
set_64bit_val(wqe, 48,
(cq->virtual_map ? cq->first_pm_pbl_idx : 0));
@ -3082,7 +3073,7 @@ static int irdma_sc_cq_modify(struct irdma_sc_cq *cq,
return -ENOMEM;
set_64bit_val(wqe, 0, info->cq_size);
set_64bit_val(wqe, 8, (uintptr_t)cq >> 1);
set_64bit_val(wqe, 8, cq->cq_uk.cq_id);
set_64bit_val(wqe, 16,
FIELD_PREP(IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD, info->shadow_read_threshold));
set_64bit_val(wqe, 32, info->cq_pa);
@ -3887,8 +3878,6 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq)
set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val);
spin_unlock_irqrestore(&ccq->dev->cqp_lock, flags);
dma_wmb(); /* make sure shadow area is updated before arming */
writel(ccq->cq_uk.cq_id, ccq->dev->cq_arm_db);
}
@ -4460,47 +4449,38 @@ int irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, bool post_sq)
* irdma_sc_process_ceq - process ceq
* @dev: sc device struct
* @ceq: ceq sc structure
* @cq_idx: Pointer to a CQ ID that will be populated.
*
* The caller is expected to serialize this function with cleanup_ceqes(),
* because both functions manipulate the same ceq.
*
* Return: True if cq_idx has been populated with a CQ ID.
*/
void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq)
bool irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq,
u32 *cq_idx)
{
u64 temp;
__le64 *ceqe;
struct irdma_sc_cq *cq = NULL;
struct irdma_sc_cq *temp_cq;
u8 polarity;
u32 cq_idx;
do {
cq_idx = 0;
ceqe = IRDMA_GET_CURRENT_CEQ_ELEM(ceq);
get_64bit_val(ceqe, 0, &temp);
polarity = (u8)FIELD_GET(IRDMA_CEQE_VALID, temp);
if (polarity != ceq->polarity)
return NULL;
return false;
temp_cq = (struct irdma_sc_cq *)(unsigned long)(temp << 1);
if (!temp_cq) {
cq_idx = IRDMA_INVALID_CQ_IDX;
IRDMA_RING_MOVE_TAIL(ceq->ceq_ring);
if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring))
ceq->polarity ^= 1;
continue;
}
cq = temp_cq;
/* Truncate. Discard valid bit which is MSb of temp. */
*cq_idx = temp;
if (*cq_idx >= dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt)
*cq_idx = IRDMA_INVALID_CQ_IDX;
IRDMA_RING_MOVE_TAIL(ceq->ceq_ring);
if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring))
ceq->polarity ^= 1;
} while (cq_idx == IRDMA_INVALID_CQ_IDX);
} while (*cq_idx == IRDMA_INVALID_CQ_IDX);
if (cq)
irdma_sc_cq_ack(cq);
return cq;
return true;
}
/**
@ -4514,10 +4494,10 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq)
*/
void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq)
{
struct irdma_sc_cq *next_cq;
u8 ceq_polarity = ceq->polarity;
__le64 *ceqe;
u8 polarity;
u32 cq_idx;
u64 temp;
int next;
u32 i;
@ -4532,9 +4512,10 @@ void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq)
if (polarity != ceq_polarity)
return;
next_cq = (struct irdma_sc_cq *)(unsigned long)(temp << 1);
if (cq == next_cq)
set_64bit_val(ceqe, 0, temp & IRDMA_CEQE_VALID);
cq_idx = temp;
if (cq_idx == cq->cq_uk.cq_id)
set_64bit_val(ceqe, 0, (temp & IRDMA_CEQE_VALID) |
IRDMA_INVALID_CQ_IDX);
next = IRDMA_RING_GET_NEXT_TAIL(ceq->ceq_ring, i);
if (!next)
@ -4975,7 +4956,7 @@ int irdma_sc_ccq_destroy(struct irdma_sc_cq *ccq, u64 scratch, bool post_sq)
return -ENOMEM;
set_64bit_val(wqe, 0, ccq->cq_uk.cq_size);
set_64bit_val(wqe, 8, (uintptr_t)ccq >> 1);
set_64bit_val(wqe, 8, ccq->cq_uk.cq_id);
set_64bit_val(wqe, 40, ccq->shadow_area_pa);
hdr = ccq->cq_uk.cq_id |
@ -5788,8 +5769,7 @@ static int cfg_fpm_value_gen_3(struct irdma_sc_dev *dev,
bool is_mrte_loc_mem;
loc_mem_pages = hmc_fpm_misc->loc_mem_pages;
is_mrte_loc_mem = hmc_fpm_misc->loc_mem_pages == hmc_fpm_misc->max_sds ?
true : false;
is_mrte_loc_mem = hmc_fpm_misc->loc_mem_pages == hmc_fpm_misc->max_sds;
irdma_get_rsrc_mem_config(dev, is_mrte_loc_mem);
mrte_loc = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc;
@ -6462,6 +6442,9 @@ int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev,
int ret_code = 0;
u8 db_size;
spin_lock_init(&dev->puda_cq_lock);
dev->ilq_cq = NULL;
dev->ieq_cq = NULL;
INIT_LIST_HEAD(&dev->cqp_cmd_head); /* for CQP command backlog */
mutex_init(&dev->ws_mutex);
dev->hmc_fn_id = info->hmc_fn_id;


@ -98,6 +98,74 @@ static void irdma_puda_ce_handler(struct irdma_pci_f *rf,
irdma_sc_ccq_arm(cq);
}
/**
* irdma_process_normal_ceqe - Handle a CEQE for a normal CQ.
* @rf: RDMA PCI function.
* @dev: iWARP device.
* @cq_idx: CQ ID. Must be in table bounds.
*
* Context: Atomic (CEQ lock must be held)
*/
static void irdma_process_normal_ceqe(struct irdma_pci_f *rf,
struct irdma_sc_dev *dev, u32 cq_idx)
{
/* cq_idx bounds validated in irdma_sc_process_ceq. */
struct irdma_cq *icq = READ_ONCE(rf->cq_table[cq_idx]);
struct irdma_sc_cq *cq;
if (unlikely(!icq)) {
/* Should not happen since CEQ is scrubbed upon CQ delete. */
ibdev_warn_ratelimited(to_ibdev(dev), "Stale CEQE for CQ %u",
cq_idx);
return;
}
cq = &icq->sc_cq;
if (unlikely(cq->cq_type != IRDMA_CQ_TYPE_IWARP)) {
ibdev_warn_ratelimited(to_ibdev(dev), "Unexpected CQ type %u",
cq->cq_type);
return;
}
writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db);
irdma_iwarp_ce_handler(cq);
}
/**
* irdma_process_reserved_ceqe - Handle a CEQE for a reserved CQ.
* @rf: RDMA PCI function.
* @dev: iWARP device.
* @cq_idx: CQ ID.
*
* Context: Atomic
*/
static void irdma_process_reserved_ceqe(struct irdma_pci_f *rf,
struct irdma_sc_dev *dev, u32 cq_idx)
{
struct irdma_sc_cq *cq;
if (cq_idx == IRDMA_RSVD_CQ_ID_CQP) {
cq = &rf->ccq.sc_cq;
/* CQP CQ lifetime > CEQ. */
writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db);
queue_work(rf->cqp_cmpl_wq, &rf->cqp_cmpl_work);
} else if (cq_idx == IRDMA_RSVD_CQ_ID_ILQ ||
cq_idx == IRDMA_RSVD_CQ_ID_IEQ) {
scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) {
cq = (cq_idx == IRDMA_RSVD_CQ_ID_ILQ) ?
dev->ilq_cq : dev->ieq_cq;
if (!cq) {
ibdev_warn_ratelimited(to_ibdev(dev),
"Stale ILQ/IEQ CEQE");
return;
}
writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db);
irdma_puda_ce_handler(rf, cq);
}
}
}
/**
* irdma_process_ceq - handle ceq for completions
* @rf: RDMA PCI function
@ -107,28 +175,28 @@ static void irdma_process_ceq(struct irdma_pci_f *rf, struct irdma_ceq *ceq)
{
struct irdma_sc_dev *dev = &rf->sc_dev;
struct irdma_sc_ceq *sc_ceq;
struct irdma_sc_cq *cq;
unsigned long flags;
u32 cq_idx;
sc_ceq = &ceq->sc_ceq;
do {
spin_lock_irqsave(&ceq->ce_lock, flags);
cq = irdma_sc_process_ceq(dev, sc_ceq);
if (!cq) {
if (!irdma_sc_process_ceq(dev, sc_ceq, &cq_idx)) {
spin_unlock_irqrestore(&ceq->ce_lock, flags);
break;
}
if (cq->cq_type == IRDMA_CQ_TYPE_IWARP)
irdma_iwarp_ce_handler(cq);
/* Normal CQs must be handled while holding CEQ lock. */
if (likely(cq_idx > IRDMA_RSVD_CQ_ID_IEQ)) {
irdma_process_normal_ceqe(rf, dev, cq_idx);
spin_unlock_irqrestore(&ceq->ce_lock, flags);
continue;
}
spin_unlock_irqrestore(&ceq->ce_lock, flags);
if (cq->cq_type == IRDMA_CQ_TYPE_CQP)
queue_work(rf->cqp_cmpl_wq, &rf->cqp_cmpl_work);
else if (cq->cq_type == IRDMA_CQ_TYPE_ILQ ||
cq->cq_type == IRDMA_CQ_TYPE_IEQ)
irdma_puda_ce_handler(rf, cq);
irdma_process_reserved_ceqe(rf, dev, cq_idx);
} while (1);
}
@ -1532,8 +1600,8 @@ static int irdma_initialize_ilq(struct irdma_device *iwdev)
int status;
info.type = IRDMA_PUDA_RSRC_TYPE_ILQ;
info.cq_id = 1;
info.qp_id = 1;
info.cq_id = IRDMA_RSVD_CQ_ID_ILQ;
info.qp_id = IRDMA_RSVD_QP_ID_GSI_ILQ;
info.count = 1;
info.pd_id = 1;
info.abi_ver = IRDMA_ABI_VER;
@ -1562,7 +1630,7 @@ static int irdma_initialize_ieq(struct irdma_device *iwdev)
int status;
info.type = IRDMA_PUDA_RSRC_TYPE_IEQ;
info.cq_id = 2;
info.cq_id = IRDMA_RSVD_CQ_ID_IEQ;
info.qp_id = iwdev->vsi.exception_lan_q;
info.count = 1;
info.pd_id = 2;
@ -1868,7 +1936,7 @@ int irdma_rt_init_hw(struct irdma_device *iwdev,
vsi_info.pf_data_vsi_num = iwdev->vsi_num;
vsi_info.register_qset = rf->gen_ops.register_qset;
vsi_info.unregister_qset = rf->gen_ops.unregister_qset;
vsi_info.exception_lan_q = 2;
vsi_info.exception_lan_q = IRDMA_RSVD_QP_ID_IEQ;
irdma_sc_vsi_init(&iwdev->vsi, &vsi_info);
status = irdma_setup_cm_core(iwdev, rf->rdma_ver);
@ -2099,18 +2167,18 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf)
irdma_set_hw_rsrc(rf);
set_bit(0, rf->allocated_mrs);
set_bit(0, rf->allocated_qps);
set_bit(0, rf->allocated_cqs);
set_bit(IRDMA_RSVD_QP_ID_0, rf->allocated_qps);
set_bit(IRDMA_RSVD_CQ_ID_CQP, rf->allocated_cqs);
set_bit(0, rf->allocated_srqs);
set_bit(0, rf->allocated_pds);
set_bit(0, rf->allocated_arps);
set_bit(0, rf->allocated_ahs);
set_bit(0, rf->allocated_mcgs);
set_bit(2, rf->allocated_qps); /* qp 2 IEQ */
set_bit(1, rf->allocated_qps); /* qp 1 ILQ */
set_bit(1, rf->allocated_cqs);
set_bit(IRDMA_RSVD_QP_ID_IEQ, rf->allocated_qps);
set_bit(IRDMA_RSVD_QP_ID_GSI_ILQ, rf->allocated_qps);
set_bit(IRDMA_RSVD_CQ_ID_ILQ, rf->allocated_cqs);
set_bit(1, rf->allocated_pds);
set_bit(2, rf->allocated_cqs);
set_bit(IRDMA_RSVD_CQ_ID_IEQ, rf->allocated_cqs);
set_bit(2, rf->allocated_pds);
INIT_LIST_HEAD(&rf->mc_qht_list.list);


@ -23,6 +23,7 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/io.h>
#include <linux/iopoll.h>
#include <linux/crc32c.h>
#include <linux/kthread.h>
#ifndef CONFIG_64BIT
@ -528,6 +529,7 @@ void irdma_cq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_cq *cq);
void irdma_srq_event(struct irdma_sc_srq *srq);
void irdma_srq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_srq *srq);
void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf);
int irdma_get_timeout_threshold(struct irdma_sc_dev *dev);
int irdma_hw_modify_qp(struct irdma_device *iwdev, struct irdma_qp *iwqp,
struct irdma_modify_qp_info *info, bool wait);
int irdma_qp_suspend_resume(struct irdma_sc_qp *qp, bool suspend);


@ -809,6 +809,13 @@ error:
dma_free_coherent(dev->hw->device, rsrc->cqmem.size,
rsrc->cqmem.va, rsrc->cqmem.pa);
rsrc->cqmem.va = NULL;
} else {
scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) {
if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ)
dev->ilq_cq = cq;
else
dev->ieq_cq = cq;
}
}
return ret;
@ -856,6 +863,13 @@ static void irdma_puda_free_cq(struct irdma_puda_rsrc *rsrc)
struct irdma_ccq_cqe_info compl_info;
struct irdma_sc_dev *dev = rsrc->dev;
scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) {
if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ)
dev->ilq_cq = NULL;
else
dev->ieq_cq = NULL;
}
if (rsrc->dev->ceq_valid) {
irdma_cqp_cq_destroy_cmd(dev, &rsrc->cq);
return;


@ -239,6 +239,18 @@ enum irdma_queue_type {
IRDMA_QUEUE_TYPE_SRQ,
};
enum irdma_rsvd_cq_id {
IRDMA_RSVD_CQ_ID_CQP,
IRDMA_RSVD_CQ_ID_ILQ,
IRDMA_RSVD_CQ_ID_IEQ,
};
enum irdma_rsvd_qp_id {
IRDMA_RSVD_QP_ID_0,
IRDMA_RSVD_QP_ID_GSI_ILQ,
IRDMA_RSVD_QP_ID_IEQ,
};
struct irdma_sc_dev;
struct irdma_vsi_pestat;
@ -695,6 +707,9 @@ struct irdma_sc_dev {
struct irdma_sc_aeq *aeq;
struct irdma_sc_ceq *ceq[IRDMA_CEQ_MAX_COUNT];
struct irdma_sc_cq *ccq;
spinlock_t puda_cq_lock;
struct irdma_sc_cq *ilq_cq;
struct irdma_sc_cq *ieq_cq;
const struct irdma_irq_ops *irq_ops;
struct irdma_qos qos[IRDMA_MAX_USER_PRIORITY];
struct irdma_hmc_fpm_misc hmc_fpm_misc;
@ -1332,7 +1347,8 @@ int irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, bool post_sq);
int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq,
struct irdma_ceq_init_info *info);
void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq);
void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq);
bool irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq,
u32 *cq_idx);
int irdma_sc_aeq_init(struct irdma_sc_aeq *aeq,
struct irdma_aeq_init_info *info);


@ -114,7 +114,6 @@ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx)
*/
void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp)
{
dma_wmb();
writel(qp->qp_id, qp->wqe_alloc_db);
}
@ -1107,8 +1106,6 @@ void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq,
set_64bit_val(cq->shadow_area, 32, temp_val);
dma_wmb(); /* make sure WQE is populated before valid bit is set */
writel(cq->cq_id, cq->cqe_alloc_db);
}
@ -1408,8 +1405,7 @@ exit:
* from SW for all unprocessed WQEs. For GEN3 and beyond
* FW will generate/flush these CQEs so move to the next CQE
*/
move_cq_head = qp->uk_attrs->hw_rev <= IRDMA_GEN_2 ?
false : true;
move_cq_head = qp->uk_attrs->hw_rev > IRDMA_GEN_2;
}
if (move_cq_head) {


@ -573,7 +573,7 @@ void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf)
}
}
static int irdma_get_timeout_threshold(struct irdma_sc_dev *dev)
int irdma_get_timeout_threshold(struct irdma_sc_dev *dev)
{
u16 time_s = dev->vc_caps.cqp_timeout_s;
@ -830,7 +830,8 @@ void irdma_cq_rem_ref(struct ib_cq *ibcq)
return;
}
iwdev->rf->cq_table[iwcq->cq_num] = NULL;
/* May be asynchronously sampled by CEQ ISR without holding tbl lock. */
WRITE_ONCE(iwdev->rf->cq_table[iwcq->cq_num], NULL);
spin_unlock_irqrestore(&iwdev->rf->cqtable_lock, flags);
complete(&iwcq->free_cq);
}
@ -2239,7 +2240,7 @@ void irdma_pble_free_paged_mem(struct irdma_chunk *chunk)
chunk->pg_cnt);
done:
kfree(chunk->dmainfo.dmaaddrs);
kvfree(chunk->dmainfo.dmaaddrs);
chunk->dmainfo.dmaaddrs = NULL;
vfree(chunk->vaddr);
chunk->vaddr = NULL;
@ -2256,7 +2257,7 @@ int irdma_pble_get_paged_mem(struct irdma_chunk *chunk, u32 pg_cnt)
u32 size;
void *va;
chunk->dmainfo.dmaaddrs = kzalloc(pg_cnt << 3, GFP_KERNEL);
chunk->dmainfo.dmaaddrs = kvzalloc(pg_cnt << 3, GFP_KERNEL);
if (!chunk->dmainfo.dmaaddrs)
return -ENOMEM;
@ -2277,7 +2278,7 @@ int irdma_pble_get_paged_mem(struct irdma_chunk *chunk, u32 pg_cnt)
return 0;
err:
kfree(chunk->dmainfo.dmaaddrs);
kvfree(chunk->dmainfo.dmaaddrs);
chunk->dmainfo.dmaaddrs = NULL;
return -ENOMEM;


@ -2669,9 +2669,12 @@ static int irdma_create_cq(struct ib_cq *ibcq,
goto cq_destroy;
}
}
rf->cq_table[cq_num] = iwcq;
init_completion(&iwcq->free_cq);
/* Populate table entry after CQ is fully created. */
smp_store_release(&rf->cq_table[cq_num], iwcq);
return 0;
cq_destroy:
irdma_cq_wq_destroy(rf, cq);
@ -5027,15 +5030,15 @@ static int irdma_create_hw_ah(struct irdma_device *iwdev, struct irdma_ah *ah, b
}
if (!sleep) {
int cnt = CQP_COMPL_WAIT_TIME_MS * CQP_TIMEOUT_THRESHOLD;
const u64 tmout_ms = irdma_get_timeout_threshold(&rf->sc_dev) *
CQP_COMPL_WAIT_TIME_MS;
do {
irdma_cqp_ce_handler(rf, &rf->ccq.sc_cq);
mdelay(1);
} while (!ah->sc_ah.ah_info.ah_valid && --cnt);
if (!cnt) {
ibdev_dbg(&iwdev->ibdev, "VERBS: CQP create AH timed out");
if (poll_timeout_us_atomic(irdma_cqp_ce_handler(rf,
&rf->ccq.sc_cq),
ah->sc_ah.ah_info.ah_valid, 1,
tmout_ms * USEC_PER_MSEC, false)) {
ibdev_dbg(&iwdev->ibdev,
"VERBS: CQP create AH timed out");
err = -ETIMEDOUT;
goto err_ah_create;
}


@ -24,6 +24,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors;
cq->cq_handle = INVALID_MANA_HANDLE;
is_rnic_cq = mana_ib_is_rnic(mdev);
if (udata) {
if (udata->inlen < offsetof(struct mana_ib_create_cq, flags))
@ -35,8 +36,6 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
return err;
}
is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ);
if ((!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) ||
attr->cqe > U32_MAX / COMP_ENTRY_SIZE) {
ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
@ -55,7 +54,6 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
ibucontext);
doorbell = mana_ucontext->doorbell;
} else {
is_rnic_cq = true;
if (attr->cqe > U32_MAX / COMP_ENTRY_SIZE / 2 + 1) {
ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
return -EINVAL;


@ -69,6 +69,12 @@ static const struct ib_device_ops mana_ib_device_stats_ops = {
.alloc_hw_device_stats = mana_ib_alloc_hw_device_stats,
};
const struct ib_device_ops mana_ib_dev_dm_ops = {
.alloc_dm = mana_ib_alloc_dm,
.dealloc_dm = mana_ib_dealloc_dm,
.reg_dm_mr = mana_ib_reg_dm_mr,
};
static int mana_ib_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
@ -139,6 +145,7 @@ static int mana_ib_probe(struct auxiliary_device *adev,
ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops);
if (dev->adapter_caps.feature_flags & MANA_IB_FEATURE_DEV_COUNTERS_SUPPORT)
ib_set_device_ops(&dev->ib_dev, &mana_ib_device_stats_ops);
ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_dm_ops);
ret = mana_ib_create_eqs(dev);
if (ret) {


@ -131,6 +131,11 @@ struct mana_ib_mr {
mana_handle_t mr_handle;
};
struct mana_ib_dm {
struct ib_dm ibdm;
mana_handle_t dm_handle;
};
struct mana_ib_cq {
struct ib_cq ibcq;
struct mana_ib_queue queue;
@ -735,4 +740,11 @@ struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 leng
u64 iova, int fd, int mr_access_flags,
struct ib_dmah *dmah,
struct uverbs_attr_bundle *attrs);
struct ib_dm *mana_ib_alloc_dm(struct ib_device *dev, struct ib_ucontext *context,
struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs);
int mana_ib_dealloc_dm(struct ib_dm *dm, struct uverbs_attr_bundle *attrs);
struct ib_mr *mana_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr,
struct uverbs_attr_bundle *attrs);
#endif


@ -40,6 +40,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req),
sizeof(resp));
req.hdr.req.msg_version = GDMA_MESSAGE_V2;
req.pd_handle = mr_params->pd_handle;
req.mr_type = mr_params->mr_type;
@ -55,6 +56,12 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
req.zbva.dma_region_handle = mr_params->zbva.dma_region_handle;
req.zbva.access_flags = mr_params->zbva.access_flags;
break;
case GDMA_MR_TYPE_DM:
req.da_ext.length = mr_params->da.length;
req.da.dm_handle = mr_params->da.dm_handle;
req.da.offset = mr_params->da.offset;
req.da.access_flags = mr_params->da.access_flags;
break;
default:
ibdev_dbg(&dev->ib_dev,
"invalid param (GDMA_MR_TYPE) passed, type %d\n",
@ -317,3 +324,126 @@ int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
return 0;
}
static int mana_ib_gd_alloc_dm(struct mana_ib_dev *mdev, struct mana_ib_dm *dm,
struct ib_dm_alloc_attr *attr)
{
struct gdma_context *gc = mdev_to_gc(mdev);
struct gdma_alloc_dm_resp resp = {};
struct gdma_alloc_dm_req req = {};
int err;
mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOC_DM, sizeof(req), sizeof(resp));
req.length = attr->length;
req.alignment = attr->alignment;
req.flags = attr->flags;
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
if (err || resp.hdr.status) {
if (!err)
err = -EPROTO;
return err;
}
dm->dm_handle = resp.dm_handle;
return 0;
}
struct ib_dm *mana_ib_alloc_dm(struct ib_device *ibdev,
struct ib_ucontext *context,
struct ib_dm_alloc_attr *attr,
struct uverbs_attr_bundle *attrs)
{
struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
struct mana_ib_dm *dm;
int err;
dm = kzalloc(sizeof(*dm), GFP_KERNEL);
if (!dm)
return ERR_PTR(-ENOMEM);
err = mana_ib_gd_alloc_dm(dev, dm, attr);
if (err)
goto err_free;
return &dm->ibdm;
err_free:
kfree(dm);
return ERR_PTR(err);
}
static int mana_ib_gd_destroy_dm(struct mana_ib_dev *mdev, struct mana_ib_dm *dm)
{
struct gdma_context *gc = mdev_to_gc(mdev);
struct gdma_destroy_dm_resp resp = {};
struct gdma_destroy_dm_req req = {};
int err;
mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_DM, sizeof(req), sizeof(resp));
req.dm_handle = dm->dm_handle;
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
if (err || resp.hdr.status) {
if (!err)
err = -EPROTO;
return err;
}
return 0;
}
int mana_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
{
struct mana_ib_dev *dev = container_of(ibdm->device, struct mana_ib_dev, ib_dev);
struct mana_ib_dm *dm = container_of(ibdm, struct mana_ib_dm, ibdm);
int err;
err = mana_ib_gd_destroy_dm(dev, dm);
if (err)
return err;
kfree(dm);
return 0;
}
struct ib_mr *mana_ib_reg_dm_mr(struct ib_pd *ibpd, struct ib_dm *ibdm,
struct ib_dm_mr_attr *attr,
struct uverbs_attr_bundle *attrs)
{
struct mana_ib_dev *dev = container_of(ibpd->device, struct mana_ib_dev, ib_dev);
struct mana_ib_dm *mana_dm = container_of(ibdm, struct mana_ib_dm, ibdm);
struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd);
struct gdma_create_mr_params mr_params = {};
struct mana_ib_mr *mr;
int err;
attr->access_flags &= ~IB_ACCESS_OPTIONAL;
if (attr->access_flags & ~VALID_MR_FLAGS)
return ERR_PTR(-EOPNOTSUPP);
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
if (!mr)
return ERR_PTR(-ENOMEM);
mr_params.pd_handle = pd->pd_handle;
mr_params.mr_type = GDMA_MR_TYPE_DM;
mr_params.da.dm_handle = mana_dm->dm_handle;
mr_params.da.offset = attr->offset;
mr_params.da.length = attr->length;
mr_params.da.access_flags =
mana_ib_verbs_to_gdma_access_flags(attr->access_flags);
err = mana_ib_gd_create_mr(dev, mr, &mr_params);
if (err)
goto err_free;
return &mr->ibmr;
err_free:
kfree(mr);
return ERR_PTR(err);
}
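Not part of the diff: a sketch of the userspace path these handlers serve, using the standard libibverbs device-memory calls that mana_ib now translates into the GDMA_ALLOC_DM / GDMA_MR_TYPE_DM firmware requests above. Error handling is trimmed for brevity.

#include <infiniband/verbs.h>

static struct ibv_mr *alloc_and_register_dm(struct ibv_context *ctx,
					    struct ibv_pd *pd, size_t len)
{
	struct ibv_alloc_dm_attr dm_attr = { .length = len };
	struct ibv_dm *dm;

	dm = ibv_alloc_dm(ctx, &dm_attr);
	if (!dm)
		return NULL;

	/* Device-memory MRs are registered zero-based at an offset into the DM region. */
	return ibv_reg_dm_mr(pd, dm, 0, len,
			     IBV_ACCESS_ZERO_BASED | IBV_ACCESS_LOCAL_WRITE);
}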


@ -561,12 +561,20 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num,
* of an error it will still be zeroed out.
* Use native port in case of reps
*/
if (dev->is_rep)
err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
1, 0);
else
err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
mdev_port_num, 0);
if (dev->is_rep) {
struct mlx5_eswitch_rep *rep;
rep = dev->port[port_num - 1].rep;
if (rep) {
mdev = mlx5_eswitch_get_core_dev(rep->esw);
WARN_ON(!mdev);
}
mdev_port_num = 1;
}
err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
mdev_port_num, 0);
if (err)
goto out;
ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability);
@ -1581,6 +1589,129 @@ static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
return 0;
}
static int mlx5_ib_query_port_speed_from_port(struct mlx5_ib_dev *dev,
u32 port_num, u64 *speed)
{
struct ib_port_speed_info speed_info;
struct ib_port_attr attr = {};
int err;
err = mlx5_ib_query_port(&dev->ib_dev, port_num, &attr);
if (err)
return err;
if (attr.state == IB_PORT_DOWN) {
*speed = 0;
return 0;
}
err = ib_port_attr_to_speed_info(&attr, &speed_info);
if (err)
return err;
*speed = speed_info.rate;
return 0;
}
static int mlx5_ib_query_port_speed_from_vport(struct mlx5_core_dev *mdev,
u8 op_mod, u16 vport,
u8 other_vport, u64 *speed,
struct mlx5_ib_dev *dev,
u32 port_num)
{
u32 max_tx_speed;
int err;
err = mlx5_query_vport_max_tx_speed(mdev, op_mod, vport, other_vport,
&max_tx_speed);
if (err)
return err;
if (max_tx_speed == 0)
/* Value 0 indicates field not supported, fallback */
return mlx5_ib_query_port_speed_from_port(dev, port_num,
speed);
*speed = max_tx_speed;
return 0;
}
static int mlx5_ib_query_port_speed_from_bond(struct mlx5_ib_dev *dev,
u32 port_num, u64 *speed)
{
struct mlx5_core_dev *mdev = dev->mdev;
u32 bond_speed;
int err;
err = mlx5_lag_query_bond_speed(mdev, &bond_speed);
if (err)
return err;
*speed = bond_speed / MLX5_MAX_TX_SPEED_UNIT;
return 0;
}
static int mlx5_ib_query_port_speed_non_rep(struct mlx5_ib_dev *dev,
u32 port_num, u64 *speed)
{
u16 op_mod = MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT;
if (mlx5_lag_is_roce(dev->mdev))
return mlx5_ib_query_port_speed_from_bond(dev, port_num,
speed);
return mlx5_ib_query_port_speed_from_vport(dev->mdev, op_mod, 0, false,
speed, dev, port_num);
}
static int mlx5_ib_query_port_speed_rep(struct mlx5_ib_dev *dev, u32 port_num,
u64 *speed)
{
struct mlx5_eswitch_rep *rep;
struct mlx5_core_dev *mdev;
u16 op_mod;
if (!dev->port[port_num - 1].rep) {
mlx5_ib_warn(dev, "Representor doesn't exist for port %u\n",
port_num);
return -EINVAL;
}
rep = dev->port[port_num - 1].rep;
mdev = mlx5_eswitch_get_core_dev(rep->esw);
if (!mdev)
return -ENODEV;
if (rep->vport == MLX5_VPORT_UPLINK) {
if (mlx5_lag_is_sriov(mdev))
return mlx5_ib_query_port_speed_from_bond(dev,
port_num,
speed);
return mlx5_ib_query_port_speed_from_port(dev, port_num,
speed);
}
op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT;
return mlx5_ib_query_port_speed_from_vport(dev->mdev, op_mod,
rep->vport, true, speed, dev,
port_num);
}
int mlx5_ib_query_port_speed(struct ib_device *ibdev, u32 port_num, u64 *speed)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
if (mlx5_ib_port_link_layer(ibdev, port_num) ==
IB_LINK_LAYER_INFINIBAND || mlx5_core_mp_enabled(dev->mdev))
return mlx5_ib_query_port_speed_from_port(dev, port_num, speed);
else if (!dev->is_rep)
return mlx5_ib_query_port_speed_non_rep(dev, port_num, speed);
else
return mlx5_ib_query_port_speed_rep(dev, port_num, speed);
}
static int mlx5_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
union ib_gid *gid)
{
@ -2323,6 +2454,70 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
virt_to_page(dev->mdev->clock_info));
}
static int phys_addr_to_bar(struct pci_dev *pdev, phys_addr_t pa)
{
resource_size_t start, end;
int bar;
for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
/* Skip BARs not present or not memory-mapped */
if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
continue;
start = pci_resource_start(pdev, bar);
end = pci_resource_end(pdev, bar);
if (!start || !end)
continue;
if (pa >= start && pa <= end)
return bar;
}
return -1;
}
static int mlx5_ib_mmap_get_pfns(struct rdma_user_mmap_entry *entry,
struct phys_vec *phys_vec,
struct p2pdma_provider **provider)
{
struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
struct pci_dev *pdev = to_mdev(entry->ucontext->device)->mdev->pdev;
int bar;
phys_vec->paddr = mentry->address;
phys_vec->len = entry->npages * PAGE_SIZE;
bar = phys_addr_to_bar(pdev, phys_vec->paddr);
if (bar < 0)
return -EINVAL;
*provider = pcim_p2pdma_provider(pdev, bar);
/* If the kernel was not compiled with CONFIG_PCI_P2PDMA the
* functionality is not supported.
*/
if (!*provider)
return -EOPNOTSUPP;
return 0;
}
static struct rdma_user_mmap_entry *
mlx5_ib_pgoff_to_mmap_entry(struct ib_ucontext *ucontext, off_t pg_off)
{
unsigned long entry_pgoff;
unsigned long idx;
u8 command;
pg_off = pg_off >> PAGE_SHIFT;
command = get_command(pg_off);
idx = get_extended_index(pg_off);
entry_pgoff = command << 16 | idx;
return rdma_user_mmap_entry_get_pgoff(ucontext, entry_pgoff);
}
static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
{
struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
@ -2838,6 +3033,14 @@ static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
if (ibdev->ib_active) {
struct ib_event speed_event = {};
speed_event.device = &ibdev->ib_dev;
speed_event.event = IB_EVENT_DEVICE_SPEED_CHANGE;
ib_dispatch_event(&speed_event);
}
/* In RoCE, port up/down events are handled in
* mlx5_netdev_event().
*/
@ -2878,7 +3081,6 @@ static void mlx5_ib_handle_event(struct work_struct *_work)
container_of(_work, struct mlx5_ib_event_work, work);
struct mlx5_ib_dev *ibdev;
struct ib_event ibev;
bool fatal = false;
if (work->is_slave) {
ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
@ -2889,12 +3091,6 @@ static void mlx5_ib_handle_event(struct work_struct *_work)
}
switch (work->event) {
case MLX5_DEV_EVENT_SYS_ERROR:
ibev.event = IB_EVENT_DEVICE_FATAL;
mlx5_ib_handle_internal_error(ibdev);
ibev.element.port_num = (u8)(unsigned long)work->param;
fatal = true;
break;
case MLX5_EVENT_TYPE_PORT_CHANGE:
if (handle_port_change(ibdev, work->param, &ibev))
goto out;
@ -2916,8 +3112,6 @@ static void mlx5_ib_handle_event(struct work_struct *_work)
if (ibdev->ib_active)
ib_dispatch_event(&ibev);
if (fatal)
ibdev->ib_active = false;
out:
kfree(work);
}
@ -2961,6 +3155,66 @@ static int mlx5_ib_event_slave_port(struct notifier_block *nb,
return NOTIFY_OK;
}
static void mlx5_ib_handle_sys_error_event(struct work_struct *_work)
{
struct mlx5_ib_event_work *work =
container_of(_work, struct mlx5_ib_event_work, work);
struct mlx5_ib_dev *ibdev = work->dev;
struct ib_event ibev;
ibev.event = IB_EVENT_DEVICE_FATAL;
mlx5_ib_handle_internal_error(ibdev);
ibev.element.port_num = (u8)(unsigned long)work->param;
ibev.device = &ibdev->ib_dev;
if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) {
mlx5_ib_warn(ibdev, "warning: event on port %d\n", ibev.element.port_num);
goto out;
}
if (ibdev->ib_active)
ib_dispatch_event(&ibev);
ibdev->ib_active = false;
out:
kfree(work);
}
static int mlx5_ib_sys_error_event(struct notifier_block *nb,
unsigned long event, void *param)
{
struct mlx5_ib_event_work *work;
if (event != MLX5_DEV_EVENT_SYS_ERROR)
return NOTIFY_DONE;
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (!work)
return NOTIFY_DONE;
INIT_WORK(&work->work, mlx5_ib_handle_sys_error_event);
work->dev = container_of(nb, struct mlx5_ib_dev, sys_error_events);
work->is_slave = false;
work->param = param;
work->event = event;
queue_work(mlx5_ib_event_wq, &work->work);
return NOTIFY_OK;
}
static int mlx5_ib_stage_sys_error_notifier_init(struct mlx5_ib_dev *dev)
{
dev->sys_error_events.notifier_call = mlx5_ib_sys_error_event;
mlx5_notifier_register(dev->mdev, &dev->sys_error_events);
return 0;
}
static void mlx5_ib_stage_sys_error_notifier_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_notifier_unregister(dev->mdev, &dev->sys_error_events);
}
static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane)
{
struct mlx5_hca_vport_context vport_ctx;
@ -4229,7 +4483,13 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
if (err)
goto err_mp;
err = pcim_p2pdma_init(mdev->pdev);
if (err && err != -EOPNOTSUPP)
goto err_dd;
return 0;
err_dd:
mlx5_ib_data_direct_cleanup(dev);
err_mp:
mlx5_ib_cleanup_multiport_master(dev);
err:
@ -4281,11 +4541,13 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
.mmap = mlx5_ib_mmap,
.mmap_free = mlx5_ib_mmap_free,
.mmap_get_pfns = mlx5_ib_mmap_get_pfns,
.modify_cq = mlx5_ib_modify_cq,
.modify_device = mlx5_ib_modify_device,
.modify_port = mlx5_ib_modify_port,
.modify_qp = mlx5_ib_modify_qp,
.modify_srq = mlx5_ib_modify_srq,
.pgoff_to_mmap_entry = mlx5_ib_pgoff_to_mmap_entry,
.pre_destroy_cq = mlx5_ib_pre_destroy_cq,
.poll_cq = mlx5_ib_poll_cq,
.post_destroy_cq = mlx5_ib_post_destroy_cq,
@ -4297,6 +4559,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.query_device = mlx5_ib_query_device,
.query_gid = mlx5_ib_query_gid,
.query_pkey = mlx5_ib_query_pkey,
.query_port_speed = mlx5_ib_query_port_speed,
.query_qp = mlx5_ib_query_qp,
.query_srq = mlx5_ib_query_srq,
.query_ucontext = mlx5_ib_query_ucontext,
@ -4466,12 +4729,16 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) {
err = mlx5_ib_init_ucaps(dev);
if (err)
return err;
goto err_ucaps;
}
dev->ib_dev.use_cq_dim = true;
return 0;
err_ucaps:
bitmap_free(dev->var_table.bitmap);
return err;
}
static const struct ib_device_ops mlx5_ib_dev_port_ops = {
@ -4807,6 +5074,9 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
mlx5_ib_devx_init,
mlx5_ib_devx_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_SYS_ERROR_NOTIFIER,
mlx5_ib_stage_sys_error_notifier_init,
mlx5_ib_stage_sys_error_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),
@ -4864,6 +5134,9 @@ const struct mlx5_ib_profile raw_eth_profile = {
STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
mlx5_ib_devx_init,
mlx5_ib_devx_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_SYS_ERROR_NOTIFIER,
mlx5_ib_stage_sys_error_notifier_init,
mlx5_ib_stage_sys_error_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),


@ -1007,6 +1007,7 @@ enum mlx5_ib_stages {
MLX5_IB_STAGE_BFREG,
MLX5_IB_STAGE_PRE_IB_REG_UMR,
MLX5_IB_STAGE_WHITELIST_UID,
MLX5_IB_STAGE_SYS_ERROR_NOTIFIER,
MLX5_IB_STAGE_IB_REG,
MLX5_IB_STAGE_DEVICE_NOTIFIER,
MLX5_IB_STAGE_POST_IB_REG_UMR,
@ -1165,6 +1166,7 @@ struct mlx5_ib_dev {
/* protect accessing data_direct_dev */
struct mutex data_direct_lock;
struct notifier_block mdev_events;
struct notifier_block sys_error_events;
struct notifier_block lag_events;
int num_ports;
/* serialize update of capability mask
@ -1435,6 +1437,8 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u32 port,
struct ib_port_attr *props);
int mlx5_ib_query_port(struct ib_device *ibdev, u32 port,
struct ib_port_attr *props);
int mlx5_ib_query_port_speed(struct ib_device *ibdev, u32 port_num,
u64 *speed);
void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
u64 access_flags);
int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);


@ -1646,10 +1646,13 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
offset, length, fd,
access_flags,
&mlx5_ib_dmabuf_attach_ops);
else
else if (dma_device)
umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
dma_device, offset, length,
fd, access_flags);
else
umem_dmabuf = ib_umem_dmabuf_get_pinned(
&dev->ib_dev, offset, length, fd, access_flags);
if (IS_ERR(umem_dmabuf)) {
mlx5_ib_dbg(dev, "umem_dmabuf get failed (%pe)\n", umem_dmabuf);
@ -1782,10 +1785,8 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
fd, access_flags);
return reg_user_mr_dmabuf(pd, pd->device->dma_device,
offset, length, virt_addr,
fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT,
dmah);
return reg_user_mr_dmabuf(pd, NULL, offset, length, virt_addr, fd,
access_flags, MLX5_MKC_ACCESS_MODE_MTT, dmah);
}
/*


@ -4362,6 +4362,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
optpar |= ib_mask_to_mlx5_opt(attr_mask);
optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
if (attr_mask & IB_QP_RATE_LIMIT && qp->type != IB_QPT_RAW_PACKET) {
err = -EOPNOTSUPP;
goto out;
}
if (qp->type == IB_QPT_RAW_PACKET ||
qp->flags & IB_QP_CREATE_SOURCE_QPN) {
struct mlx5_modify_raw_qp_param raw_qp_param = {};


@ -195,7 +195,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)(
int out_len = uverbs_attr_get_len(attrs,
MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH);
u32 dev_path_len;
char *dev_path;
char *dev_path = NULL;
int ret;
c = to_mucontext(ib_uverbs_get_ucontext(attrs));
@ -223,9 +223,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)(
ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path,
dev_path_len);
kfree(dev_path);
end:
kfree(dev_path);
mutex_unlock(&dev->data_direct_lock);
return ret;
}


@ -67,8 +67,6 @@
#define OC_SKH_DEVICE_VF 0x728
#define OCRDMA_MAX_AH 512
#define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME)
#define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo)
#define EQ_INTR_PER_SEC_THRSH_HI 150000
#define EQ_INTR_PER_SEC_THRSH_LOW 100000


@ -53,11 +53,8 @@
DP_NAME(dev) ? DP_NAME(dev) : "", ## __VA_ARGS__)
#define QEDR_MSG_INIT "INIT"
#define QEDR_MSG_MISC "MISC"
#define QEDR_MSG_CQ " CQ"
#define QEDR_MSG_MR " MR"
#define QEDR_MSG_RQ " RQ"
#define QEDR_MSG_SQ " SQ"
#define QEDR_MSG_QP " QP"
#define QEDR_MSG_SRQ " SRQ"
#define QEDR_MSG_GSI " GSI"
@ -65,7 +62,6 @@
#define QEDR_CQ_MAGIC_NUMBER (0x11223344)
#define FW_PAGE_SIZE (RDMA_RING_PAGE_SIZE)
#define FW_PAGE_SHIFT (12)
struct qedr_dev;
@ -178,24 +174,18 @@ struct qedr_dev {
u8 user_dpm_enabled;
};
#define QEDR_MAX_SQ_PBL (0x8000)
#define QEDR_MAX_SQ_PBL_ENTRIES (0x10000 / sizeof(void *))
#define QEDR_SQE_ELEMENT_SIZE (sizeof(struct rdma_sq_sge))
#define QEDR_MAX_SQE_ELEMENTS_PER_SQE (ROCE_REQ_MAX_SINGLE_SQ_WQE_SIZE / \
QEDR_SQE_ELEMENT_SIZE)
#define QEDR_MAX_SQE_ELEMENTS_PER_PAGE ((RDMA_RING_PAGE_SIZE) / \
QEDR_SQE_ELEMENT_SIZE)
#define QEDR_MAX_SQE ((QEDR_MAX_SQ_PBL_ENTRIES) *\
(RDMA_RING_PAGE_SIZE) / \
(QEDR_SQE_ELEMENT_SIZE) /\
(QEDR_MAX_SQE_ELEMENTS_PER_SQE))
/* RQ */
#define QEDR_MAX_RQ_PBL (0x2000)
#define QEDR_MAX_RQ_PBL_ENTRIES (0x10000 / sizeof(void *))
#define QEDR_RQE_ELEMENT_SIZE (sizeof(struct rdma_rq_sge))
#define QEDR_MAX_RQE_ELEMENTS_PER_RQE (RDMA_MAX_SGE_PER_RQ_WQE)
#define QEDR_MAX_RQE_ELEMENTS_PER_PAGE ((RDMA_RING_PAGE_SIZE) / \
QEDR_RQE_ELEMENT_SIZE)
#define QEDR_MAX_RQE ((QEDR_MAX_RQ_PBL_ENTRIES) *\
(RDMA_RING_PAGE_SIZE) / \
(QEDR_RQE_ELEMENT_SIZE) /\
@ -210,12 +200,8 @@ struct qedr_dev {
#define QEDR_ROCE_MAX_CNQ_SIZE (0x4000)
#define QEDR_MAX_PORT (1)
#define QEDR_PORT (1)
#define QEDR_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME)
#define QEDR_ROCE_PKEY_MAX 1
#define QEDR_ROCE_PKEY_TABLE_LEN 1
#define QEDR_ROCE_PKEY_DEFAULT 0xffff
@ -336,12 +322,6 @@ struct qedr_qp_hwq_info {
union db_prod32 iwarp_db2_data;
};
#define QEDR_INC_SW_IDX(p_info, index) \
do { \
p_info->index = (p_info->index + 1) & \
qed_chain_get_capacity(p_info->pbl) \
} while (0)
struct qedr_srq_hwq_info {
u32 max_sges;
u32 max_wr;


@ -119,12 +119,15 @@ void retransmit_timer(struct timer_list *t)
rxe_dbg_qp(qp, "retransmit timer fired\n");
if (!rxe_get(qp))
return;
spin_lock_irqsave(&qp->state_lock, flags);
if (qp->valid) {
qp->comp.timeout = 1;
rxe_sched_task(&qp->send_task);
}
spin_unlock_irqrestore(&qp->state_lock, flags);
rxe_put(qp);
}
void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)


@ -72,14 +72,46 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr)
mr->ibmr.type = IB_MR_TYPE_DMA;
}
/*
* Convert iova to page_info index. The page_info stores pages of size
* PAGE_SIZE, but MRs can have different page sizes. This function
* handles the conversion for all cases:
*
* 1. mr->page_size > PAGE_SIZE:
* The MR's iova may not be aligned to mr->page_size. We use the
* aligned base (mr->ibmr.iova & mr->page_mask) as the reference, then calculate
* which PAGE_SIZE sub-page the iova falls into.
*
* 2. mr->page_size <= PAGE_SIZE:
* Use simple shift arithmetic since each page_info entry corresponds
* to one or more MR pages.
*/
static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova)
{
return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift);
int idx;
if (mr_page_size(mr) > PAGE_SIZE)
idx = (iova - (mr->ibmr.iova & mr->page_mask)) >> PAGE_SHIFT;
else
idx = (iova >> mr->page_shift) -
(mr->ibmr.iova >> mr->page_shift);
WARN_ON(idx >= mr->nbuf);
return idx;
}
/*
* Convert iova to offset within the page_info entry.
*
* For mr_page_size > PAGE_SIZE, the offset is within the system page.
* For mr_page_size <= PAGE_SIZE, the offset is within the MR page size.
*/
static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova)
{
return iova & (mr_page_size(mr) - 1);
if (mr_page_size(mr) > PAGE_SIZE)
return iova & (PAGE_SIZE - 1);
else
return iova & (mr_page_size(mr) - 1);
}
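A worked example (not from the patch) of the two conversion cases above, assuming PAGE_SIZE is 4 KiB:

/*
 * Case 1: mr->page_size = 64 KiB, mr->ibmr.iova = 0x12345000 (not 64 KiB aligned).
 *   Aligned base = 0x12345000 & ~0xFFFF = 0x12340000
 *   For iova = 0x12356800:
 *     index  = (0x12356800 - 0x12340000) >> 12      = 0x16 (index 22)
 *     offset = 0x12356800 & (PAGE_SIZE - 1)         = 0x800
 *
 * Case 2: mr->page_size = 4 KiB, mr->ibmr.iova = 0x12345000.
 *   For iova = 0x12346800:
 *     index  = (0x12346800 >> 12) - (0x12345000 >> 12) = 1
 *     offset = 0x12346800 & (mr->page_size - 1)        = 0x800
 */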
static bool is_pmem_page(struct page *pg)
@ -93,37 +125,69 @@ static bool is_pmem_page(struct page *pg)
static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt)
{
XA_STATE(xas, &mr->page_list, 0);
struct sg_page_iter sg_iter;
struct page *page;
bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT);
WARN_ON(mr_page_size(mr) != PAGE_SIZE);
__sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0);
if (!__sg_page_iter_next(&sg_iter))
return 0;
do {
xas_lock(&xas);
while (true) {
page = sg_page_iter_page(&sg_iter);
while (true) {
page = sg_page_iter_page(&sg_iter);
if (persistent && !is_pmem_page(page)) {
rxe_dbg_mr(mr, "Page can't be persistent\n");
xas_set_err(&xas, -EINVAL);
break;
}
xas_store(&xas, page);
if (xas_error(&xas))
break;
xas_next(&xas);
if (!__sg_page_iter_next(&sg_iter))
break;
if (persistent && !is_pmem_page(page)) {
rxe_dbg_mr(mr, "Page can't be persistent\n");
return -EINVAL;
}
xas_unlock(&xas);
} while (xas_nomem(&xas, GFP_KERNEL));
return xas_error(&xas);
mr->page_info[mr->nbuf].page = page;
mr->page_info[mr->nbuf].offset = 0;
mr->nbuf++;
if (!__sg_page_iter_next(&sg_iter))
break;
}
return 0;
}
static int __alloc_mr_page_info(struct rxe_mr *mr, int num_pages)
{
mr->page_info = kcalloc(num_pages, sizeof(struct rxe_mr_page),
GFP_KERNEL);
if (!mr->page_info)
return -ENOMEM;
mr->max_allowed_buf = num_pages;
mr->nbuf = 0;
return 0;
}
static int alloc_mr_page_info(struct rxe_mr *mr, int num_pages)
{
int ret;
WARN_ON(mr->num_buf);
ret = __alloc_mr_page_info(mr, num_pages);
if (ret)
return ret;
mr->num_buf = num_pages;
return 0;
}
static void free_mr_page_info(struct rxe_mr *mr)
{
if (!mr->page_info)
return;
kfree(mr->page_info);
mr->page_info = NULL;
}
int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
@ -134,8 +198,6 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
rxe_mr_init(access, mr);
xa_init(&mr->page_list);
umem = ib_umem_get(&rxe->ib_dev, start, length, access);
if (IS_ERR(umem)) {
rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n",
@ -143,46 +205,24 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
return PTR_ERR(umem);
}
err = alloc_mr_page_info(mr, ib_umem_num_pages(umem));
if (err)
goto err2;
err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt);
if (err) {
ib_umem_release(umem);
return err;
}
if (err)
goto err1;
mr->umem = umem;
mr->ibmr.type = IB_MR_TYPE_USER;
mr->state = RXE_MR_STATE_VALID;
return 0;
}
static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf)
{
XA_STATE(xas, &mr->page_list, 0);
int i = 0;
int err;
xa_init(&mr->page_list);
do {
xas_lock(&xas);
while (i != num_buf) {
xas_store(&xas, XA_ZERO_ENTRY);
if (xas_error(&xas))
break;
xas_next(&xas);
i++;
}
xas_unlock(&xas);
} while (xas_nomem(&xas, GFP_KERNEL));
err = xas_error(&xas);
if (err)
return err;
mr->num_buf = num_buf;
return 0;
err1:
free_mr_page_info(mr);
err2:
ib_umem_release(umem);
return err;
}
int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr)
@ -192,7 +232,7 @@ int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr)
/* always allow remote access for FMRs */
rxe_mr_init(RXE_ACCESS_REMOTE, mr);
err = rxe_mr_alloc(mr, max_pages);
err = alloc_mr_page_info(mr, max_pages);
if (err)
goto err1;
@ -205,26 +245,43 @@ err1:
return err;
}
/*
* I) MRs with page_size >= PAGE_SIZE:
* Split each large MR page (mr->page_size) into multiple PAGE_SIZE
* sub-pages and store them in page_info; the offset is always 0.
* Each call to rxe_set_page() represents one mr->page_size region,
* which must be split into (mr->page_size >> PAGE_SHIFT) individual
* system pages.
*
* II) MRs with page_size < PAGE_SIZE:
* Save each PAGE_SIZE page and the MR page's offset within that
* system page in page_info.
*/
static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr)
{
struct rxe_mr *mr = to_rmr(ibmr);
struct page *page = ib_virt_dma_to_page(dma_addr);
bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT);
int err;
u32 i, pages_per_mr = mr_page_size(mr) >> PAGE_SHIFT;
if (persistent && !is_pmem_page(page)) {
rxe_dbg_mr(mr, "Page cannot be persistent\n");
return -EINVAL;
pages_per_mr = MAX(1, pages_per_mr);
for (i = 0; i < pages_per_mr; i++) {
u64 addr = dma_addr + i * PAGE_SIZE;
struct page *sub_page = ib_virt_dma_to_page(addr);
if (unlikely(mr->nbuf >= mr->max_allowed_buf))
return -ENOMEM;
if (persistent && !is_pmem_page(sub_page)) {
rxe_dbg_mr(mr, "Page cannot be persistent\n");
return -EINVAL;
}
mr->page_info[mr->nbuf].page = sub_page;
mr->page_info[mr->nbuf].offset = addr & (PAGE_SIZE - 1);
mr->nbuf++;
}
if (unlikely(mr->nbuf == mr->num_buf))
return -ENOMEM;
err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL));
if (err)
return err;
mr->nbuf++;
return 0;
}
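As a rough illustration of the splitting described above (not part of the patch, with the system page size assumed to be 4 KiB): a single rxe_set_page() call for a 16 KiB MR page yields four page_info entries, one per PAGE_SIZE sub-page, each with offset 0.

#include <stdint.h>
#include <stdio.h>

#define SYS_PAGE_SHIFT 12
#define SYS_PAGE_SIZE  (1UL << SYS_PAGE_SHIFT)   /* assume 4 KiB system pages */

int main(void)
{
	uint64_t dma_addr = 0x40000000ULL;       /* start of one 16 KiB MR page */
	unsigned int mr_page_size = 16 * 1024;
	unsigned int pages_per_mr = mr_page_size >> SYS_PAGE_SHIFT;
	unsigned int i;

	if (pages_per_mr < 1)                    /* mr_page_size < PAGE_SIZE case */
		pages_per_mr = 1;

	for (i = 0; i < pages_per_mr; i++) {
		uint64_t addr = dma_addr + (uint64_t)i * SYS_PAGE_SIZE;

		printf("page_info[%u]: page 0x%llx offset %llu\n", i,
		       (unsigned long long)(addr & ~(SYS_PAGE_SIZE - 1)),
		       (unsigned long long)(addr & (SYS_PAGE_SIZE - 1)));
	}
	return 0;                                /* prints four entries, offsets 0 */
}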
@ -234,10 +291,34 @@ int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl,
struct rxe_mr *mr = to_rmr(ibmr);
unsigned int page_size = mr_page_size(mr);
/*
* Ensure page_size and PAGE_SIZE are compatible for mapping.
* We require one to be a multiple of the other for correct
* iova-to-page conversion.
*/
if (!IS_ALIGNED(page_size, PAGE_SIZE) &&
!IS_ALIGNED(PAGE_SIZE, page_size)) {
rxe_dbg_mr(mr, "MR page size %u must be compatible with PAGE_SIZE %lu\n",
page_size, PAGE_SIZE);
return -EINVAL;
}
if (mr_page_size(mr) > PAGE_SIZE) {
/* resize page_info if needed */
u32 map_mr_pages = (page_size >> PAGE_SHIFT) * mr->num_buf;
if (map_mr_pages > mr->max_allowed_buf) {
rxe_dbg_mr(mr, "requested pages %u exceed max %u\n",
map_mr_pages, mr->max_allowed_buf);
free_mr_page_info(mr);
if (__alloc_mr_page_info(mr, map_mr_pages))
return -ENOMEM;
}
}
mr->nbuf = 0;
mr->page_shift = ilog2(page_size);
mr->page_mask = ~((u64)page_size - 1);
mr->page_offset = mr->ibmr.iova & (page_size - 1);
return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page);
}
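The compatibility test above only accepts page sizes where one divides the other. Below is a stand-alone sketch of the same rule (not part of the patch), using a modulo test in place of the kernel's IS_ALIGNED() mask check and assuming 4 KiB system pages.

#include <stdbool.h>
#include <stdio.h>

#define SYS_PAGE_SIZE 4096UL	/* assume 4 KiB system pages */

/* Same idea as the patch: one size must be a multiple of the other. */
static bool mr_page_size_compatible(unsigned long mr_page_size)
{
	return (mr_page_size % SYS_PAGE_SIZE == 0) ||
	       (SYS_PAGE_SIZE % mr_page_size == 0);
}

int main(void)
{
	unsigned long sizes[] = { 1024, 4096, 6144, 16384 };

	for (int i = 0; i < 4; i++)
		printf("%lu -> %s\n", sizes[i],
		       mr_page_size_compatible(sizes[i]) ? "ok" : "rejected");
	return 0;	/* 1024 ok, 4096 ok, 6144 rejected, 16384 ok */
}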
@ -245,30 +326,30 @@ int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl,
static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr,
unsigned int length, enum rxe_mr_copy_dir dir)
{
unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova);
unsigned long index = rxe_mr_iova_to_index(mr, iova);
unsigned int bytes;
struct page *page;
void *va;
u8 *va;
while (length) {
page = xa_load(&mr->page_list, index);
if (!page)
unsigned long index = rxe_mr_iova_to_index(mr, iova);
struct rxe_mr_page *info = &mr->page_info[index];
unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova);
if (!info->page)
return -EFAULT;
bytes = min_t(unsigned int, length,
mr_page_size(mr) - page_offset);
va = kmap_local_page(page);
page_offset += info->offset;
bytes = min_t(unsigned int, length, PAGE_SIZE - page_offset);
va = kmap_local_page(info->page);
if (dir == RXE_FROM_MR_OBJ)
memcpy(addr, va + page_offset, bytes);
else
memcpy(va + page_offset, addr, bytes);
kunmap_local(va);
page_offset = 0;
addr += bytes;
iova += bytes;
length -= bytes;
index++;
}
return 0;
@ -426,9 +507,6 @@ err1:
static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
{
unsigned int page_offset;
unsigned long index;
struct page *page;
unsigned int bytes;
int err;
u8 *va;
@ -438,15 +516,17 @@ static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int leng
return err;
while (length > 0) {
index = rxe_mr_iova_to_index(mr, iova);
page = xa_load(&mr->page_list, index);
page_offset = rxe_mr_iova_to_page_offset(mr, iova);
if (!page)
return -EFAULT;
bytes = min_t(unsigned int, length,
mr_page_size(mr) - page_offset);
unsigned long index = rxe_mr_iova_to_index(mr, iova);
struct rxe_mr_page *info = &mr->page_info[index];
unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova);
va = kmap_local_page(page);
if (!info->page)
return -EFAULT;
page_offset += info->offset;
bytes = min_t(unsigned int, length, PAGE_SIZE - page_offset);
va = kmap_local_page(info->page);
arch_wb_cache_pmem(va + page_offset, bytes);
kunmap_local(va);
@ -501,6 +581,7 @@ enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
} else {
unsigned long index;
int err;
struct rxe_mr_page *info;
err = mr_check_range(mr, iova, sizeof(value));
if (err) {
@ -509,9 +590,12 @@ enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
}
page_offset = rxe_mr_iova_to_page_offset(mr, iova);
index = rxe_mr_iova_to_index(mr, iova);
page = xa_load(&mr->page_list, index);
if (!page)
info = &mr->page_info[index];
if (!info->page)
return RESPST_ERR_RKEY_VIOLATION;
page_offset += info->offset;
page = info->page;
}
if (unlikely(page_offset & 0x7)) {
@ -550,6 +634,7 @@ enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
} else {
unsigned long index;
int err;
struct rxe_mr_page *info;
/* See IBA oA19-28 */
err = mr_check_range(mr, iova, sizeof(value));
@ -559,9 +644,12 @@ enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
}
page_offset = rxe_mr_iova_to_page_offset(mr, iova);
index = rxe_mr_iova_to_index(mr, iova);
page = xa_load(&mr->page_list, index);
if (!page)
info = &mr->page_info[index];
if (!info->page)
return RESPST_ERR_RKEY_VIOLATION;
page_offset += info->offset;
page = info->page;
}
/* See IBA A19.4.2 */
@ -725,5 +813,5 @@ void rxe_mr_cleanup(struct rxe_pool_elem *elem)
ib_umem_release(mr->umem);
if (mr->ibmr.type != IB_MR_TYPE_DMA)
xa_destroy(&mr->page_list);
free_mr_page_info(mr);
}

View file

@ -110,7 +110,6 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
mr->access = access_flags;
mr->ibmr.length = length;
mr->ibmr.iova = iova;
mr->page_offset = ib_umem_offset(&umem_odp->umem);
err = rxe_odp_init_pages(mr);
if (err) {

View file

@ -102,6 +102,8 @@ void rnr_nak_timer(struct timer_list *t)
rxe_dbg_qp(qp, "nak timer fired\n");
if (!rxe_get(qp))
return;
spin_lock_irqsave(&qp->state_lock, flags);
if (qp->valid) {
/* request a send queue retry */
@ -110,6 +112,7 @@ void rnr_nak_timer(struct timer_list *t)
rxe_sched_task(&qp->send_task);
}
spin_unlock_irqrestore(&qp->state_lock, flags);
rxe_put(qp);
}
static void req_check_sq_drain_done(struct rxe_qp *qp)

View file

@ -77,9 +77,6 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
goto err_free;
}
srq->rq.queue = q;
init->attr.max_wr = srq->rq.max_wr;
if (uresp) {
if (copy_to_user(&uresp->srq_num, &srq->srq_num,
sizeof(uresp->srq_num))) {
@ -88,6 +85,9 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
}
}
srq->rq.queue = q;
init->attr.max_wr = srq->rq.max_wr;
return 0;
err_free:

View file

@ -335,6 +335,11 @@ static inline int rkey_is_mw(u32 rkey)
return (index >= RXE_MIN_MW_INDEX) && (index <= RXE_MAX_MW_INDEX);
}
struct rxe_mr_page {
struct page *page;
unsigned int offset; /* offset in system page */
};
struct rxe_mr {
struct rxe_pool_elem elem;
struct ib_mr ibmr;
@ -347,14 +352,16 @@ struct rxe_mr {
int access;
atomic_t num_mw;
unsigned int page_offset;
unsigned int page_shift;
u64 page_mask;
/* number of pages requested when the MR was allocated */
u32 num_buf;
/* current capacity of the page_info array */
u32 max_allowed_buf;
u32 nbuf;
struct xarray page_list;
struct rxe_mr_page *page_info;
};
static inline unsigned int mr_page_size(struct rxe_mr *mr)

View file

@ -1435,7 +1435,8 @@ int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
}
if (unlikely(rv != 0 && rv != -EAGAIN)) {
if ((srx->state > SIW_GET_HDR ||
qp->rx_fpdu->more_ddp_segs) && run_completion)
(qp->rx_fpdu && qp->rx_fpdu->more_ddp_segs)) &&
run_completion)
siw_rdmap_complete(qp, rv);
siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,

View file

@ -439,19 +439,19 @@ int rtrs_clt_create_path_files(struct rtrs_clt_path *clt_path)
clt->kobj_paths,
"%s", str);
if (err) {
pr_err("kobject_init_and_add: %d\n", err);
pr_err("kobject_init_and_add: %pe\n", ERR_PTR(err));
kobject_put(&clt_path->kobj);
return err;
}
err = sysfs_create_group(&clt_path->kobj, &rtrs_clt_path_attr_group);
if (err) {
pr_err("sysfs_create_group(): %d\n", err);
pr_err("sysfs_create_group(): %pe\n", ERR_PTR(err));
goto put_kobj;
}
err = kobject_init_and_add(&clt_path->stats->kobj_stats, &ktype_stats,
&clt_path->kobj, "stats");
if (err) {
pr_err("kobject_init_and_add: %d\n", err);
pr_err("kobject_init_and_add: %pe\n", ERR_PTR(err));
kobject_put(&clt_path->stats->kobj_stats);
goto remove_group;
}
@ -459,7 +459,7 @@ int rtrs_clt_create_path_files(struct rtrs_clt_path *clt_path)
err = sysfs_create_group(&clt_path->stats->kobj_stats,
&rtrs_clt_stats_attr_group);
if (err) {
pr_err("failed to create stats sysfs group, err: %d\n", err);
pr_err("failed to create stats sysfs group, err: %pe\n", ERR_PTR(err));
goto put_kobj_stats;
}

View file

@ -422,8 +422,8 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
refcount_inc(&req->ref);
err = rtrs_inv_rkey(req);
if (err) {
rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %d\n",
req->mr->rkey, err);
rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %pe\n",
req->mr->rkey, ERR_PTR(err));
} else if (can_wait) {
wait_for_completion(&req->inv_comp);
}
@ -443,8 +443,8 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
if (errno) {
rtrs_err_rl(con->c.path,
"IO %s request failed: error=%d path=%s [%s:%u] notify=%d\n",
req->dir == DMA_TO_DEVICE ? "write" : "read", errno,
"IO %s request failed: error=%pe path=%s [%s:%u] notify=%d\n",
req->dir == DMA_TO_DEVICE ? "write" : "read", ERR_PTR(errno),
kobject_name(&clt_path->kobj), clt_path->hca_name,
clt_path->hca_port, notify);
}
@ -514,7 +514,7 @@ static void rtrs_clt_recv_done(struct rtrs_clt_con *con, struct ib_wc *wc)
cqe);
err = rtrs_iu_post_recv(&con->c, iu);
if (err) {
rtrs_err(con->c.path, "post iu failed %d\n", err);
rtrs_err(con->c.path, "post iu failed %pe\n", ERR_PTR(err));
rtrs_rdma_error_recovery(con);
}
}
@ -659,8 +659,8 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
else
err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
if (err) {
rtrs_err(con->c.path, "rtrs_post_recv_empty(): %d\n",
err);
rtrs_err(con->c.path, "rtrs_post_recv_empty(): %pe\n",
ERR_PTR(err));
rtrs_rdma_error_recovery(con);
}
break;
@ -731,8 +731,8 @@ static int post_recv_path(struct rtrs_clt_path *clt_path)
err = post_recv_io(to_clt_con(clt_path->s.con[cid]), q_size);
if (err) {
rtrs_err(clt_path->clt, "post_recv_io(), err: %d\n",
err);
rtrs_err(clt_path->clt, "post_recv_io(), err: %pe\n",
ERR_PTR(err));
return err;
}
}
@ -1122,8 +1122,8 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req)
ret = rtrs_map_sg_fr(req, count);
if (ret < 0) {
rtrs_err_rl(s,
"Write request failed, failed to map fast reg. data, err: %d\n",
ret);
"Write request failed, failed to map fast reg. data, err: %pe\n",
ERR_PTR(ret));
ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist,
req->sg_cnt, req->dir);
return ret;
@ -1150,9 +1150,9 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req)
imm, wr, NULL);
if (ret) {
rtrs_err_rl(s,
"Write request failed: error=%d path=%s [%s:%u]\n",
ret, kobject_name(&clt_path->kobj), clt_path->hca_name,
clt_path->hca_port);
"Write request failed: error=%pe path=%s [%s:%u]\n",
ERR_PTR(ret), kobject_name(&clt_path->kobj),
clt_path->hca_name, clt_path->hca_port);
if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
atomic_dec(&clt_path->stats->inflight);
if (req->mr->need_inval) {
@ -1208,8 +1208,8 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req)
ret = rtrs_map_sg_fr(req, count);
if (ret < 0) {
rtrs_err_rl(s,
"Read request failed, failed to map fast reg. data, err: %d\n",
ret);
"Read request failed, failed to map fast reg. data, err: %pe\n",
ERR_PTR(ret));
ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt,
req->dir);
return ret;
@ -1260,9 +1260,9 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req)
req->data_len, imm, wr);
if (ret) {
rtrs_err_rl(s,
"Read request failed: error=%d path=%s [%s:%u]\n",
ret, kobject_name(&clt_path->kobj), clt_path->hca_name,
clt_path->hca_port);
"Read request failed: error=%pe path=%s [%s:%u]\n",
ERR_PTR(ret), kobject_name(&clt_path->kobj),
clt_path->hca_name, clt_path->hca_port);
if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
atomic_dec(&clt_path->stats->inflight);
req->mr->need_inval = false;
@ -1359,7 +1359,9 @@ static void free_path_reqs(struct rtrs_clt_path *clt_path)
static int alloc_path_reqs(struct rtrs_clt_path *clt_path)
{
struct ib_device *ib_dev = clt_path->s.dev->ib_dev;
struct rtrs_clt_io_req *req;
enum ib_mr_type mr_type;
int i, err = -ENOMEM;
clt_path->reqs = kcalloc(clt_path->queue_depth,
@ -1368,6 +1370,11 @@ static int alloc_path_reqs(struct rtrs_clt_path *clt_path)
if (!clt_path->reqs)
return -ENOMEM;
if (ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
mr_type = IB_MR_TYPE_SG_GAPS;
else
mr_type = IB_MR_TYPE_MEM_REG;
for (i = 0; i < clt_path->queue_depth; ++i) {
req = &clt_path->reqs[i];
req->iu = rtrs_iu_alloc(1, clt_path->max_hdr_size, GFP_KERNEL,
@ -1381,8 +1388,7 @@ static int alloc_path_reqs(struct rtrs_clt_path *clt_path)
if (!req->sge)
goto out;
req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd,
IB_MR_TYPE_MEM_REG,
req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd, mr_type,
clt_path->max_pages_per_mr);
if (IS_ERR(req->mr)) {
err = PTR_ERR(req->mr);
@ -1775,12 +1781,12 @@ static int rtrs_rdma_addr_resolved(struct rtrs_clt_con *con)
err = create_con_cq_qp(con);
mutex_unlock(&con->con_mutex);
if (err) {
rtrs_err(s, "create_con_cq_qp(), err: %d\n", err);
rtrs_err(s, "create_con_cq_qp(), err: %pe\n", ERR_PTR(err));
return err;
}
err = rdma_resolve_route(con->c.cm_id, RTRS_CONNECT_TIMEOUT_MS);
if (err)
rtrs_err(s, "Resolving route failed, err: %d\n", err);
rtrs_err(s, "Resolving route failed, err: %pe\n", ERR_PTR(err));
return err;
}
@ -1814,7 +1820,7 @@ static int rtrs_rdma_route_resolved(struct rtrs_clt_con *con)
err = rdma_connect_locked(con->c.cm_id, &param);
if (err)
rtrs_err(clt, "rdma_connect_locked(): %d\n", err);
rtrs_err(clt, "rdma_connect_locked(): %pe\n", ERR_PTR(err));
return err;
}
@ -1847,8 +1853,8 @@ static int rtrs_rdma_conn_established(struct rtrs_clt_con *con,
}
errno = le16_to_cpu(msg->errno);
if (errno) {
rtrs_err(clt, "Invalid RTRS message: errno %d\n",
errno);
rtrs_err(clt, "Invalid RTRS message: errno %pe\n",
ERR_PTR(errno));
return -ECONNRESET;
}
if (con->c.cid == 0) {
@ -1923,7 +1929,7 @@ static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con,
struct rtrs_path *s = con->c.path;
const struct rtrs_msg_conn_rsp *msg;
const char *rej_msg;
int status, errno;
int status, errno = -ECONNRESET;
u8 data_len;
status = ev->status;
@ -1937,15 +1943,15 @@ static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con,
"Previous session is still exists on the server, please reconnect later\n");
else
rtrs_err(s,
"Connect rejected: status %d (%s), rtrs errno %d\n",
status, rej_msg, errno);
"Connect rejected: status %d (%s), rtrs errno %pe\n",
status, rej_msg, ERR_PTR(errno));
} else {
rtrs_err(s,
"Connect rejected but with malformed message: status %d (%s)\n",
status, rej_msg);
}
return -ECONNRESET;
return errno;
}
void rtrs_clt_close_conns(struct rtrs_clt_path *clt_path, bool wait)
@ -2009,27 +2015,53 @@ static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id,
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_ADDR_CHANGE:
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n",
rdma_event_msg(ev->event), ev->status);
if (ev->status < 0) {
rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n",
rdma_event_msg(ev->event), ERR_PTR(ev->status));
} else if (ev->status > 0) {
rtrs_wrn(s, "CM error (CM event: %s, err: %s)\n",
rdma_event_msg(ev->event),
rdma_reject_msg(cm_id, ev->status));
}
cm_err = -ECONNRESET;
break;
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_ERROR:
rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n",
rdma_event_msg(ev->event), ev->status);
if (ev->status < 0) {
rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n",
rdma_event_msg(ev->event),
ERR_PTR(ev->status));
} else if (ev->status > 0) {
rtrs_wrn(s, "CM error (CM event: %s, err: %s)\n",
rdma_event_msg(ev->event),
rdma_reject_msg(cm_id, ev->status));
}
cm_err = -EHOSTUNREACH;
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
/*
* Device removal is a special case. Queue close and return 0.
*/
rtrs_wrn_rl(s, "CM event: %s, status: %d\n", rdma_event_msg(ev->event),
ev->status);
if (ev->status < 0) {
rtrs_wrn_rl(s, "CM event: %s, status: %pe\n",
rdma_event_msg(ev->event),
ERR_PTR(ev->status));
} else if (ev->status > 0) {
rtrs_wrn_rl(s, "CM event: %s, status: %s\n",
rdma_event_msg(ev->event),
rdma_reject_msg(cm_id, ev->status));
}
rtrs_clt_close_conns(clt_path, false);
return 0;
default:
rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %d)\n",
rdma_event_msg(ev->event), ev->status);
if (ev->status < 0) {
rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %pe)\n",
rdma_event_msg(ev->event), ERR_PTR(ev->status));
} else if (ev->status > 0) {
rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %s)\n",
rdma_event_msg(ev->event),
rdma_reject_msg(cm_id, ev->status));
}
cm_err = -ECONNRESET;
break;
}
@ -2066,14 +2098,14 @@ static int create_cm(struct rtrs_clt_con *con)
/* allow the port to be reused */
err = rdma_set_reuseaddr(cm_id, 1);
if (err != 0) {
rtrs_err(s, "Set address reuse failed, err: %d\n", err);
rtrs_err(s, "Set address reuse failed, err: %pe\n", ERR_PTR(err));
return err;
}
err = rdma_resolve_addr(cm_id, (struct sockaddr *)&clt_path->s.src_addr,
(struct sockaddr *)&clt_path->s.dst_addr,
RTRS_CONNECT_TIMEOUT_MS);
if (err) {
rtrs_err(s, "Failed to resolve address, err: %d\n", err);
rtrs_err(s, "Failed to resolve address, err: %pe\n", ERR_PTR(err));
return err;
}
/*
@ -2548,7 +2580,7 @@ static int rtrs_send_path_info(struct rtrs_clt_path *clt_path)
/* Prepare for getting info response */
err = rtrs_iu_post_recv(&usr_con->c, rx_iu);
if (err) {
rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %d\n", err);
rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %pe\n", ERR_PTR(err));
goto out;
}
rx_iu = NULL;
@ -2564,7 +2596,7 @@ static int rtrs_send_path_info(struct rtrs_clt_path *clt_path)
/* Send info request */
err = rtrs_iu_post_send(&usr_con->c, tx_iu, sizeof(*msg), NULL);
if (err) {
rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %d\n", err);
rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %pe\n", ERR_PTR(err));
goto out;
}
tx_iu = NULL;
@ -2615,15 +2647,15 @@ static int init_path(struct rtrs_clt_path *clt_path)
err = init_conns(clt_path);
if (err) {
rtrs_err(clt_path->clt,
"init_conns() failed: err=%d path=%s [%s:%u]\n", err,
str, clt_path->hca_name, clt_path->hca_port);
"init_conns() failed: err=%pe path=%s [%s:%u]\n",
ERR_PTR(err), str, clt_path->hca_name, clt_path->hca_port);
goto out;
}
err = rtrs_send_path_info(clt_path);
if (err) {
rtrs_err(clt_path->clt,
"rtrs_send_path_info() failed: err=%d path=%s [%s:%u]\n",
err, str, clt_path->hca_name, clt_path->hca_port);
"rtrs_send_path_info() failed: err=%pe path=%s [%s:%u]\n",
ERR_PTR(err), str, clt_path->hca_name, clt_path->hca_port);
goto out;
}
rtrs_clt_path_up(clt_path);
@ -3147,8 +3179,11 @@ close_path:
void rtrs_clt_ib_event_handler(struct ib_event_handler *handler,
struct ib_event *ibevent)
{
pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event),
ibevent->event);
struct ib_device *idev = ibevent->device;
u32 port_num = ibevent->element.port_num;
pr_info("Handling event: %s (%d). HCA name: %s, port num: %u\n",
ib_event_msg(ibevent->event), ibevent->event, idev->name, port_num);
}

View file

@ -92,7 +92,6 @@ struct rtrs_permit {
* rtrs_clt_io_req - describes one inflight IO request
*/
struct rtrs_clt_io_req {
struct list_head list;
struct rtrs_iu *iu;
struct scatterlist *sglist; /* list holding user data */
unsigned int sg_cnt;
@ -103,12 +102,10 @@ struct rtrs_clt_io_req {
bool in_use;
enum rtrs_mp_policy mp_policy;
struct rtrs_clt_con *con;
struct rtrs_sg_desc *desc;
struct ib_sge *sge;
struct rtrs_permit *permit;
enum dma_data_direction dir;
void (*conf)(void *priv, int errno);
unsigned long start_jiffies;
struct ib_mr *mr;
struct ib_cqe inv_cqe;

View file

@ -176,14 +176,14 @@ static int rtrs_srv_create_once_sysfs_root_folders(struct rtrs_srv_path *srv_pat
dev_set_uevent_suppress(&srv->dev, true);
err = device_add(&srv->dev);
if (err) {
pr_err("device_add(): %d\n", err);
pr_err("device_add(): %pe\n", ERR_PTR(err));
put_device(&srv->dev);
goto unlock;
}
srv->kobj_paths = kobject_create_and_add("paths", &srv->dev.kobj);
if (!srv->kobj_paths) {
err = -ENOMEM;
pr_err("kobject_create_and_add(): %d\n", err);
pr_err("kobject_create_and_add(): %pe\n", ERR_PTR(err));
device_del(&srv->dev);
put_device(&srv->dev);
goto unlock;
@ -237,14 +237,14 @@ static int rtrs_srv_create_stats_files(struct rtrs_srv_path *srv_path)
err = kobject_init_and_add(&srv_path->stats->kobj_stats, &ktype_stats,
&srv_path->kobj, "stats");
if (err) {
rtrs_err(s, "kobject_init_and_add(): %d\n", err);
rtrs_err(s, "kobject_init_and_add(): %pe\n", ERR_PTR(err));
kobject_put(&srv_path->stats->kobj_stats);
return err;
}
err = sysfs_create_group(&srv_path->stats->kobj_stats,
&rtrs_srv_stats_attr_group);
if (err) {
rtrs_err(s, "sysfs_create_group(): %d\n", err);
rtrs_err(s, "sysfs_create_group(): %pe\n", ERR_PTR(err));
goto err;
}
@ -276,12 +276,12 @@ int rtrs_srv_create_path_files(struct rtrs_srv_path *srv_path)
err = kobject_init_and_add(&srv_path->kobj, &ktype, srv->kobj_paths,
"%s", str);
if (err) {
rtrs_err(s, "kobject_init_and_add(): %d\n", err);
rtrs_err(s, "kobject_init_and_add(): %pe\n", ERR_PTR(err));
goto destroy_root;
}
err = sysfs_create_group(&srv_path->kobj, &rtrs_srv_path_attr_group);
if (err) {
rtrs_err(s, "sysfs_create_group(): %d\n", err);
rtrs_err(s, "sysfs_create_group(): %pe\n", ERR_PTR(err));
goto put_kobj;
}
err = rtrs_srv_create_stats_files(srv_path);

View file

@ -184,7 +184,7 @@ static void rtrs_srv_reg_mr_done(struct ib_cq *cq, struct ib_wc *wc)
struct rtrs_srv_path *srv_path = to_srv_path(s);
if (wc->status != IB_WC_SUCCESS) {
rtrs_err(s, "REG MR failed: %s\n",
rtrs_err_rl(s, "REG MR failed: %s\n",
ib_wc_status_msg(wc->status));
close_path(srv_path);
return;
@ -208,7 +208,6 @@ static int rdma_write_sg(struct rtrs_srv_op *id)
size_t sg_cnt;
int err, offset;
bool need_inval;
u32 rkey = 0;
struct ib_reg_wr rwr;
struct ib_sge *plist;
struct ib_sge list;
@ -240,11 +239,6 @@ static int rdma_write_sg(struct rtrs_srv_op *id)
wr->wr.num_sge = 1;
wr->remote_addr = le64_to_cpu(id->rd_msg->desc[0].addr);
wr->rkey = le32_to_cpu(id->rd_msg->desc[0].key);
if (rkey == 0)
rkey = wr->rkey;
else
/* Only one key is actually used */
WARN_ON_ONCE(rkey != wr->rkey);
wr->wr.opcode = IB_WR_RDMA_WRITE;
wr->wr.wr_cqe = &io_comp_cqe;
@ -277,7 +271,7 @@ static int rdma_write_sg(struct rtrs_srv_op *id)
inv_wr.opcode = IB_WR_SEND_WITH_INV;
inv_wr.wr_cqe = &io_comp_cqe;
inv_wr.send_flags = 0;
inv_wr.ex.invalidate_rkey = rkey;
inv_wr.ex.invalidate_rkey = wr->rkey;
}
imm_wr.wr.next = NULL;
@ -323,8 +317,8 @@ static int rdma_write_sg(struct rtrs_srv_op *id)
err = ib_post_send(id->con->c.qp, &id->tx_wr.wr, NULL);
if (err)
rtrs_err(s,
"Posting RDMA-Write-Request to QP failed, err: %d\n",
err);
"Posting RDMA-Write-Request to QP failed, err: %pe\n",
ERR_PTR(err));
return err;
}
@ -440,8 +434,8 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id,
err = ib_post_send(id->con->c.qp, wr, NULL);
if (err)
rtrs_err_rl(s, "Posting RDMA-Reply to QP failed, err: %d\n",
err);
rtrs_err_rl(s, "Posting RDMA-Reply to QP failed, err: %pe\n",
ERR_PTR(err));
return err;
}
@ -525,8 +519,8 @@ bool rtrs_srv_resp_rdma(struct rtrs_srv_op *id, int status)
err = rdma_write_sg(id);
if (err) {
rtrs_err_rl(s, "IO response failed: %d: srv_path=%s\n", err,
kobject_name(&srv_path->kobj));
rtrs_err_rl(s, "IO response failed: %pe: srv_path=%s\n",
ERR_PTR(err), kobject_name(&srv_path->kobj));
close_path(srv_path);
}
out:
@ -568,13 +562,15 @@ static void unmap_cont_bufs(struct rtrs_srv_path *srv_path)
static int map_cont_bufs(struct rtrs_srv_path *srv_path)
{
struct ib_device *ib_dev = srv_path->s.dev->ib_dev;
struct rtrs_srv_sess *srv = srv_path->srv;
struct rtrs_path *ss = &srv_path->s;
int i, err, mrs_num;
unsigned int chunk_bits;
enum ib_mr_type mr_type;
int chunks_per_mr = 1;
struct ib_mr *mr;
struct sg_table *sgt;
struct ib_mr *mr;
/*
* Here we map queue_depth chunks to MR. Firstly we have to
@ -601,7 +597,7 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path)
srv_path->mrs_num++) {
struct rtrs_srv_mr *srv_mr = &srv_path->mrs[srv_path->mrs_num];
struct scatterlist *s;
int nr, nr_sgt, chunks;
int nr, nr_sgt, chunks, ind;
sgt = &srv_mr->sgt;
chunks = chunks_per_mr * srv_path->mrs_num;
@ -623,15 +619,20 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path)
err = -EINVAL;
goto free_sg;
}
mr = ib_alloc_mr(srv_path->s.dev->ib_pd, IB_MR_TYPE_MEM_REG,
nr_sgt);
if (ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
mr_type = IB_MR_TYPE_SG_GAPS;
else
mr_type = IB_MR_TYPE_MEM_REG;
mr = ib_alloc_mr(srv_path->s.dev->ib_pd, mr_type, nr_sgt);
if (IS_ERR(mr)) {
err = PTR_ERR(mr);
goto unmap_sg;
}
nr = ib_map_mr_sg(mr, sgt->sgl, nr_sgt,
NULL, max_chunk_size);
if (nr != nr_sgt) {
if (nr < nr_sgt) {
err = nr < 0 ? nr : -EINVAL;
goto dereg_mr;
}
@ -643,13 +644,28 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path)
DMA_TO_DEVICE, rtrs_srv_rdma_done);
if (!srv_mr->iu) {
err = -ENOMEM;
rtrs_err(ss, "rtrs_iu_alloc(), err: %d\n", err);
rtrs_err(ss, "rtrs_iu_alloc(), err: %pe\n", ERR_PTR(err));
goto dereg_mr;
}
}
/* Eventually dma addr for each chunk can be cached */
for_each_sg(sgt->sgl, s, nr_sgt, i)
srv_path->dma_addr[chunks + i] = sg_dma_address(s);
/*
* Cache the DMA address of every chunk by walking the sg entries.
* If adjacent chunks were merged into a single sg entry, the inner
* loop splits that entry back into max_chunk_size-sized addresses
* so each chunk still gets its own slot in the array.
*/
ind = chunks;
for_each_sg(sgt->sgl, s, nr_sgt, i) {
unsigned int dma_len = sg_dma_len(s);
u64 dma_addr = sg_dma_address(s);
u64 dma_addr_end = dma_addr + dma_len;
do {
srv_path->dma_addr[ind++] = dma_addr;
dma_addr += max_chunk_size;
} while (dma_addr < dma_addr_end);
}
ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
srv_mr->mr = mr;
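A minimal stand-alone model of the inner loop above (the chunk size and DMA address are made-up values): one merged sg entry covering three chunks expands into three per-chunk DMA addresses spaced max_chunk_size apart.

#include <stdint.h>
#include <stdio.h>

#define MAX_CHUNK_SIZE (128 * 1024UL)	/* hypothetical chunk size */

int main(void)
{
	/* one sg entry covering three merged chunks */
	uint64_t dma_addr = 0x80000000ULL;
	unsigned int dma_len = 3 * MAX_CHUNK_SIZE;
	uint64_t dma_addr_end = dma_addr + dma_len;
	uint64_t chunk_addrs[8];
	int ind = 0;

	do {
		chunk_addrs[ind++] = dma_addr;
		dma_addr += MAX_CHUNK_SIZE;
	} while (dma_addr < dma_addr_end);

	for (int i = 0; i < ind; i++)
		printf("chunk %d -> 0x%llx\n", i,
		       (unsigned long long)chunk_addrs[i]);
	return 0;	/* prints three addresses, MAX_CHUNK_SIZE apart */
}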
@ -804,7 +820,7 @@ static int process_info_req(struct rtrs_srv_con *con,
err = post_recv_path(srv_path);
if (err) {
rtrs_err(s, "post_recv_path(), err: %d\n", err);
rtrs_err(s, "post_recv_path(), err: %pe\n", ERR_PTR(err));
return err;
}
@ -867,7 +883,7 @@ static int process_info_req(struct rtrs_srv_con *con,
get_device(&srv_path->srv->dev);
err = rtrs_srv_change_state(srv_path, RTRS_SRV_CONNECTED);
if (!err) {
rtrs_err(s, "rtrs_srv_change_state(), err: %d\n", err);
rtrs_err(s, "rtrs_srv_change_state() failed\n");
goto iu_free;
}
@ -881,7 +897,7 @@ static int process_info_req(struct rtrs_srv_con *con,
*/
err = rtrs_srv_path_up(srv_path);
if (err) {
rtrs_err(s, "rtrs_srv_path_up(), err: %d\n", err);
rtrs_err(s, "rtrs_srv_path_up(), err: %pe\n", ERR_PTR(err));
goto iu_free;
}
@ -889,10 +905,16 @@ static int process_info_req(struct rtrs_srv_con *con,
tx_iu->dma_addr,
tx_iu->size, DMA_TO_DEVICE);
/*
* Now disable zombie-connection closing: a zombie path can only be
* stuck in the CONNECTING state, so a path that has reached this
* point (about to become CONNECTED) no longer needs the setup timeout.
*/
srv_path->connection_timeout = 0;
/* Send info response */
err = rtrs_iu_post_send(&con->c, tx_iu, tx_sz, reg_wr);
if (err) {
rtrs_err(s, "rtrs_iu_post_send(), err: %d\n", err);
rtrs_err(s, "rtrs_iu_post_send(), err: %pe\n", ERR_PTR(err));
iu_free:
rtrs_iu_free(tx_iu, srv_path->s.dev->ib_dev, 1);
}
@ -960,7 +982,7 @@ static int post_recv_info_req(struct rtrs_srv_con *con)
/* Prepare for getting info response */
err = rtrs_iu_post_recv(&con->c, rx_iu);
if (err) {
rtrs_err(s, "rtrs_iu_post_recv(), err: %d\n", err);
rtrs_err(s, "rtrs_iu_post_recv(), err: %pe\n", ERR_PTR(err));
rtrs_iu_free(rx_iu, srv_path->s.dev->ib_dev, 1);
return err;
}
@ -1006,7 +1028,7 @@ static int post_recv_path(struct rtrs_srv_path *srv_path)
err = post_recv_io(to_srv_con(srv_path->s.con[cid]), q_size);
if (err) {
rtrs_err(s, "post_recv_io(), err: %d\n", err);
rtrs_err(s, "post_recv_io(), err: %pe\n", ERR_PTR(err));
return err;
}
}
@ -1054,8 +1076,8 @@ static void process_read(struct rtrs_srv_con *con,
if (ret) {
rtrs_err_rl(s,
"Processing read request failed, user module cb reported for msg_id %d, err: %d\n",
buf_id, ret);
"Processing read request failed, user module cb reported for msg_id %d, err: %pe\n",
buf_id, ERR_PTR(ret));
goto send_err_msg;
}
@ -1065,8 +1087,8 @@ send_err_msg:
ret = send_io_resp_imm(con, id, ret);
if (ret < 0) {
rtrs_err_rl(s,
"Sending err msg for failed RDMA-Write-Req failed, msg_id %d, err: %d\n",
buf_id, ret);
"Sending err msg for failed RDMA-Write-Req failed, msg_id %d, err: %pe\n",
buf_id, ERR_PTR(ret));
close_path(srv_path);
}
rtrs_srv_put_ops_ids(srv_path);
@ -1106,8 +1128,8 @@ static void process_write(struct rtrs_srv_con *con,
data + data_len, usr_len);
if (ret) {
rtrs_err_rl(s,
"Processing write request failed, user module callback reports err: %d\n",
ret);
"Processing write request failed, user module callback reports err: %pe\n",
ERR_PTR(ret));
goto send_err_msg;
}
@ -1117,8 +1139,8 @@ send_err_msg:
ret = send_io_resp_imm(con, id, ret);
if (ret < 0) {
rtrs_err_rl(s,
"Processing write request failed, sending I/O response failed, msg_id %d, err: %d\n",
buf_id, ret);
"Processing write request failed, sending I/O response failed, msg_id %d, err: %pe\n",
buf_id, ERR_PTR(ret));
close_path(srv_path);
}
rtrs_srv_put_ops_ids(srv_path);
@ -1248,7 +1270,8 @@ static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
srv_path->s.hb_missed_cnt = 0;
err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
if (err) {
rtrs_err(s, "rtrs_post_recv(), err: %d\n", err);
rtrs_err(s, "rtrs_post_recv(), err: %pe\n",
ERR_PTR(err));
close_path(srv_path);
break;
}
@ -1273,8 +1296,8 @@ static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
mr->msg_id = msg_id;
err = rtrs_srv_inv_rkey(con, mr);
if (err) {
rtrs_err(s, "rtrs_post_recv(), err: %d\n",
err);
rtrs_err(s, "rtrs_post_recv(), err: %pe\n",
ERR_PTR(err));
close_path(srv_path);
break;
}
@ -1514,17 +1537,38 @@ static int sockaddr_cmp(const struct sockaddr *a, const struct sockaddr *b)
}
}
/* Let's close connections which have been waiting for more than 30 seconds */
#define RTRS_MAX_CONN_TIMEOUT 30000
static void rtrs_srv_check_close_path(struct rtrs_srv_path *srv_path)
{
struct rtrs_path *s = &srv_path->s;
if (srv_path->state == RTRS_SRV_CONNECTING && srv_path->connection_timeout &&
(jiffies_to_msecs(jiffies - srv_path->connection_timeout) > RTRS_MAX_CONN_TIMEOUT)) {
rtrs_err(s, "Closing zombie path\n");
close_path(srv_path);
}
}
static bool __is_path_w_addr_exists(struct rtrs_srv_sess *srv,
struct rdma_addr *addr)
{
struct rtrs_srv_path *srv_path;
list_for_each_entry(srv_path, &srv->paths_list, s.entry)
list_for_each_entry(srv_path, &srv->paths_list, s.entry) {
if (!sockaddr_cmp((struct sockaddr *)&srv_path->s.dst_addr,
(struct sockaddr *)&addr->dst_addr) &&
!sockaddr_cmp((struct sockaddr *)&srv_path->s.src_addr,
(struct sockaddr *)&addr->src_addr))
(struct sockaddr *)&addr->src_addr)) {
rtrs_err((&srv_path->s),
"Path (%s) with same addr exists (lifetime %u)\n",
rtrs_srv_state_str(srv_path->state),
(jiffies_to_msecs(jiffies - srv_path->connection_timeout)));
rtrs_srv_check_close_path(srv_path);
return true;
}
}
return false;
}
@ -1623,7 +1667,7 @@ static int rtrs_rdma_do_accept(struct rtrs_srv_path *srv_path,
err = rdma_accept(cm_id, &param);
if (err)
pr_err("rdma_accept(), err: %d\n", err);
pr_err("rdma_accept(), err: %pe\n", ERR_PTR(err));
return err;
}
@ -1641,7 +1685,7 @@ static int rtrs_rdma_do_reject(struct rdma_cm_id *cm_id, int errno)
err = rdma_reject(cm_id, &msg, sizeof(msg), IB_CM_REJ_CONSUMER_DEFINED);
if (err)
pr_err("rdma_reject(), err: %d\n", err);
pr_err("rdma_reject(), err: %pe\n", ERR_PTR(err));
/* Bounce errno back */
return errno;
@ -1717,7 +1761,7 @@ static int create_con(struct rtrs_srv_path *srv_path,
max_send_wr, max_recv_wr,
IB_POLL_WORKQUEUE);
if (err) {
rtrs_err(s, "rtrs_cq_qp_create(), err: %d\n", err);
rtrs_err(s, "rtrs_cq_qp_create(), err: %pe\n", ERR_PTR(err));
goto free_con;
}
if (con->c.cid == 0) {
@ -1762,7 +1806,6 @@ static struct rtrs_srv_path *__alloc_path(struct rtrs_srv_sess *srv,
}
if (__is_path_w_addr_exists(srv, &cm_id->route.addr)) {
err = -EEXIST;
pr_err("Path with same addr exists\n");
goto err;
}
srv_path = kzalloc(sizeof(*srv_path), GFP_KERNEL);
@ -1809,6 +1852,7 @@ static struct rtrs_srv_path *__alloc_path(struct rtrs_srv_sess *srv,
spin_lock_init(&srv_path->state_lock);
INIT_WORK(&srv_path->close_work, rtrs_srv_close_work);
rtrs_srv_init_hb(srv_path);
srv_path->connection_timeout = 0;
srv_path->s.dev = rtrs_ib_dev_find_or_add(cm_id->device, &dev_pd);
if (!srv_path->s.dev) {
@ -1914,8 +1958,10 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id,
goto reject_w_err;
}
if (s->con[cid]) {
rtrs_err(s, "Connection already exists: %d\n",
cid);
rtrs_err(s, "Connection (%s) already exists: %d (lifetime %u)\n",
rtrs_srv_state_str(srv_path->state), cid,
(jiffies_to_msecs(jiffies - srv_path->connection_timeout)));
rtrs_srv_check_close_path(srv_path);
mutex_unlock(&srv->paths_mutex);
goto reject_w_err;
}
@ -1930,9 +1976,15 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id,
goto reject_w_err;
}
}
/*
* Start of any connection creation resets the timeout for the path.
*/
srv_path->connection_timeout = jiffies;
err = create_con(srv_path, cm_id, cid);
if (err) {
rtrs_err((&srv_path->s), "create_con(), error %d\n", err);
rtrs_err((&srv_path->s), "create_con(), error %pe\n", ERR_PTR(err));
rtrs_rdma_do_reject(cm_id, err);
/*
* Since session has other connections we follow normal way
@ -1943,7 +1995,8 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id,
}
err = rtrs_rdma_do_accept(srv_path, cm_id);
if (err) {
rtrs_err((&srv_path->s), "rtrs_rdma_do_accept(), error %d\n", err);
rtrs_err((&srv_path->s), "rtrs_rdma_do_accept(), error %pe\n",
ERR_PTR(err));
rtrs_rdma_do_reject(cm_id, err);
/*
* Since current connection was successfully added to the
@ -1994,8 +2047,15 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id,
case RDMA_CM_EVENT_REJECTED:
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
rtrs_err(s, "CM error (CM event: %s, err: %d)\n",
rdma_event_msg(ev->event), ev->status);
if (ev->status < 0) {
rtrs_err(s, "CM error (CM event: %s, err: %pe)\n",
rdma_event_msg(ev->event),
ERR_PTR(ev->status));
} else if (ev->status > 0) {
rtrs_err(s, "CM error (CM event: %s, err: %s)\n",
rdma_event_msg(ev->event),
rdma_reject_msg(cm_id, ev->status));
}
fallthrough;
case RDMA_CM_EVENT_DISCONNECTED:
case RDMA_CM_EVENT_ADDR_CHANGE:
@ -2004,8 +2064,15 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id,
close_path(srv_path);
break;
default:
pr_err("Ignoring unexpected CM event %s, err %d\n",
rdma_event_msg(ev->event), ev->status);
if (ev->status < 0) {
pr_err("Ignoring unexpected CM event %s, err %pe\n",
rdma_event_msg(ev->event),
ERR_PTR(ev->status));
} else if (ev->status > 0) {
pr_err("Ignoring unexpected CM event %s, err %s\n",
rdma_event_msg(ev->event),
rdma_reject_msg(cm_id, ev->status));
}
break;
}
@ -2029,13 +2096,13 @@ static struct rdma_cm_id *rtrs_srv_cm_init(struct rtrs_srv_ctx *ctx,
}
ret = rdma_bind_addr(cm_id, addr);
if (ret) {
pr_err("Binding RDMA address failed, err: %d\n", ret);
pr_err("Binding RDMA address failed, err: %pe\n", ERR_PTR(ret));
goto err_cm;
}
ret = rdma_listen(cm_id, 64);
if (ret) {
pr_err("Listening on RDMA connection failed, err: %d\n",
ret);
pr_err("Listening on RDMA connection failed, err: %pe\n",
ERR_PTR(ret));
goto err_cm;
}
@ -2275,8 +2342,11 @@ static int check_module_params(void)
void rtrs_srv_ib_event_handler(struct ib_event_handler *handler,
struct ib_event *ibevent)
{
pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event),
ibevent->event);
struct ib_device *idev = ibevent->device;
u32 port_num = ibevent->element.port_num;
pr_info("Handling event: %s (%d). HCA name: %s, port num: %u\n",
ib_event_msg(ibevent->event), ibevent->event, idev->name, port_num);
}
static int rtrs_srv_ib_dev_init(struct rtrs_ib_dev *dev)
@ -2313,8 +2383,8 @@ static int __init rtrs_server_init(void)
err = check_module_params();
if (err) {
pr_err("Failed to load module, invalid module parameters, err: %d\n",
err);
pr_err("Failed to load module, invalid module parameters, err: %pe\n",
ERR_PTR(err));
return err;
}
err = class_register(&rtrs_dev_class);

View file

@ -89,6 +89,7 @@ struct rtrs_srv_path {
unsigned int mem_bits;
struct kobject kobj;
struct rtrs_srv_stats *stats;
unsigned long connection_timeout;
};
static inline struct rtrs_srv_path *to_srv_path(struct rtrs_path *s)

View file

@ -273,7 +273,8 @@ static int create_qp(struct rtrs_con *con, struct ib_pd *pd,
ret = rdma_create_qp(cm_id, pd, &init_attr);
if (ret) {
rtrs_err(con->path, "Creating QP failed, err: %d\n", ret);
rtrs_err(con->path, "Creating QP failed, err: %pe\n",
ERR_PTR(ret));
return ret;
}
con->qp = cm_id->qp;
@ -341,7 +342,8 @@ void rtrs_send_hb_ack(struct rtrs_path *path)
err = rtrs_post_rdma_write_imm_empty(usr_con, path->hb_cqe, imm,
NULL);
if (err) {
rtrs_err(path, "send HB ACK failed, errno: %d\n", err);
rtrs_err(path, "send HB ACK failed, errno: %pe\n",
ERR_PTR(err));
path->hb_err_handler(usr_con);
return;
}
@ -375,7 +377,8 @@ static void hb_work(struct work_struct *work)
err = rtrs_post_rdma_write_imm_empty(usr_con, path->hb_cqe, imm,
NULL);
if (err) {
rtrs_err(path, "HB send failed, errno: %d\n", err);
rtrs_err(path, "HB send failed, errno: %pe\n",
ERR_PTR(err));
path->hb_err_handler(usr_con);
return;
}

View file

@ -35,6 +35,8 @@ enum gdma_request_type {
GDMA_CREATE_MR = 31,
GDMA_DESTROY_MR = 32,
GDMA_QUERY_HWC_TIMEOUT = 84, /* 0x54 */
GDMA_ALLOC_DM = 96, /* 0x60 */
GDMA_DESTROY_DM = 97, /* 0x61 */
};
#define GDMA_RESOURCE_DOORBELL_PAGE 27
@ -866,6 +868,8 @@ enum gdma_mr_type {
GDMA_MR_TYPE_GVA = 2,
/* Guest zero-based address MRs */
GDMA_MR_TYPE_ZBVA = 4,
/* Device address MRs */
GDMA_MR_TYPE_DM = 5,
};
struct gdma_create_mr_params {
@ -881,6 +885,12 @@ struct gdma_create_mr_params {
u64 dma_region_handle;
enum gdma_mr_access_flags access_flags;
} zbva;
struct {
u64 dm_handle;
u64 offset;
u64 length;
enum gdma_mr_access_flags access_flags;
} da;
};
};
@ -895,13 +905,23 @@ struct gdma_create_mr_request {
u64 dma_region_handle;
u64 virtual_address;
enum gdma_mr_access_flags access_flags;
} gva;
} __packed gva;
struct {
u64 dma_region_handle;
enum gdma_mr_access_flags access_flags;
} zbva;
};
} __packed zbva;
struct {
u64 dm_handle;
u64 offset;
enum gdma_mr_access_flags access_flags;
} __packed da;
} __packed;
u32 reserved_2;
union {
struct {
u64 length;
} da_ext;
};
};/* HW DATA */
struct gdma_create_mr_response {
@ -920,6 +940,27 @@ struct gdma_destroy_mr_response {
struct gdma_resp_hdr hdr;
};/* HW DATA */
struct gdma_alloc_dm_req {
struct gdma_req_hdr hdr;
u64 length;
u32 alignment;
u32 flags;
}; /* HW Data */
struct gdma_alloc_dm_resp {
struct gdma_resp_hdr hdr;
u64 dm_handle;
}; /* HW Data */
struct gdma_destroy_dm_req {
struct gdma_req_hdr hdr;
u64 dm_handle;
}; /* HW Data */
struct gdma_destroy_dm_resp {
struct gdma_resp_hdr hdr;
}; /* HW Data */
int mana_gd_verify_vf_version(struct pci_dev *pdev);
int mana_gd_register_device(struct gdma_dev *gd);

View file

@ -15,6 +15,7 @@
#include <linux/ethtool.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/bvec.h>
#include <linux/dma-mapping.h>
#include <linux/kref.h>
#include <linux/list.h>
@ -43,6 +44,7 @@
#include <uapi/rdma/rdma_user_ioctl.h>
#include <uapi/rdma/ib_user_ioctl_verbs.h>
#include <linux/pci-tph.h>
#include <linux/dma-buf.h>
#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN
@ -764,6 +766,7 @@ enum ib_event_type {
IB_EVENT_CLIENT_REREGISTER,
IB_EVENT_GID_CHANGE,
IB_EVENT_WQ_FATAL,
IB_EVENT_DEVICE_SPEED_CHANGE,
};
const char *__attribute_const__ ib_event_msg(enum ib_event_type event);
@ -877,6 +880,20 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate);
*/
__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
struct ib_port_speed_info {
const char *str;
int rate; /* in deci-Gb/sec (units of 100 Mb/s) */
};
/**
* ib_port_attr_to_speed_info - Convert port attributes to speed information
* @attr: Port attributes containing active_speed and active_width
* @speed_info: Speed information to return
*
* Returns 0 on success, -EINVAL on error.
*/
int ib_port_attr_to_speed_info(struct ib_port_attr *attr,
struct ib_port_speed_info *speed_info);
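A sketch of how a consumer might use the new helper; the concrete str/rate values shown for a 4x EDR link are assumptions, not defined by this header.

/* Hypothetical caller, e.g. in a ULP or driver (kernel context,
 * <rdma/ib_verbs.h> already included).
 */
static void report_port_speed(struct ib_device *dev, u32 port_num)
{
	struct ib_port_attr attr;
	struct ib_port_speed_info info;

	if (ib_query_port(dev, port_num, &attr))
		return;
	if (ib_port_attr_to_speed_info(&attr, &info))
		return;
	/* For a 4x EDR link this would be something like
	 * info.str == "100 Gbps" and info.rate == 1000 (i.e. 100 Gb/s).
	 */
	pr_info("port %u: %s (rate %d x 100 Mb/s)\n",
		port_num, info.str, info.rate);
}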
/**
* enum ib_mr_type - memory region type
@ -2348,6 +2365,9 @@ struct rdma_user_mmap_entry {
unsigned long start_pgoff;
size_t npages;
bool driver_removed;
/* protects access to dmabufs */
struct mutex dmabufs_lock;
struct list_head dmabufs;
};
/* Return the offset (in bytes) the user should pass to libc's mmap() */
@ -2403,6 +2423,8 @@ struct ib_device_ops {
int comp_vector);
int (*query_port)(struct ib_device *device, u32 port_num,
struct ib_port_attr *port_attr);
int (*query_port_speed)(struct ib_device *device, u32 port_num,
u64 *speed);
int (*modify_port)(struct ib_device *device, u32 port_num,
int port_modify_mask,
struct ib_port_modify *port_modify);
@ -2483,6 +2505,11 @@ struct ib_device_ops {
* Therefore needs to be implemented by the driver in mmap_free.
*/
void (*mmap_free)(struct rdma_user_mmap_entry *entry);
int (*mmap_get_pfns)(struct rdma_user_mmap_entry *entry,
struct phys_vec *phys_vec,
struct p2pdma_provider **provider);
struct rdma_user_mmap_entry *(*pgoff_to_mmap_entry)(struct ib_ucontext *ucontext,
off_t pg_off);
void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
@ -4249,6 +4276,47 @@ static inline void ib_dma_unmap_page(struct ib_device *dev,
dma_unmap_page(dev->dma_device, addr, size, direction);
}
/**
* ib_dma_map_bvec - Map a bio_vec to DMA address
* @dev: The device for which the dma_addr is to be created
* @bvec: The bio_vec to map
* @direction: The direction of the DMA
*
* Returns a DMA address for the bio_vec. The caller must check the
* result with ib_dma_mapping_error() before use; a failed mapping
* must not be passed to ib_dma_unmap_bvec().
*
* For software RDMA devices (rxe, siw), returns a virtual address
* and no actual DMA mapping occurs.
*/
static inline u64 ib_dma_map_bvec(struct ib_device *dev,
struct bio_vec *bvec,
enum dma_data_direction direction)
{
if (ib_uses_virt_dma(dev))
return (uintptr_t)bvec_virt(bvec);
return dma_map_phys(dev->dma_device, bvec_phys(bvec),
bvec->bv_len, direction, 0);
}
/**
* ib_dma_unmap_bvec - Unmap a bio_vec DMA mapping
* @dev: The device for which the DMA address was created
* @addr: The DMA address returned by ib_dma_map_bvec()
* @size: The size of the region in bytes
* @direction: The direction of the DMA
*
* Releases a DMA mapping created by ib_dma_map_bvec(). For software
* RDMA devices this is a no-op since no actual mapping occurred.
*/
static inline void ib_dma_unmap_bvec(struct ib_device *dev,
u64 addr, size_t size,
enum dma_data_direction direction)
{
if (!ib_uses_virt_dma(dev))
dma_unmap_phys(dev->dma_device, addr, size, direction, 0);
}
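A minimal sketch (not from the patch) of the intended calling pattern: map one bvec, check the mapping, fill an SGE, and unmap once the work request has completed.

static int map_one_bvec(struct ib_device *dev, struct bio_vec *bv,
			struct ib_sge *sge, u32 lkey)
{
	u64 addr = ib_dma_map_bvec(dev, bv, DMA_TO_DEVICE);

	if (ib_dma_mapping_error(dev, addr))
		return -EIO;

	sge->addr = addr;
	sge->length = bv->bv_len;
	sge->lkey = lkey;
	return 0;
}

/* Call after the send WR using this SGE has completed. */
static void unmap_one_bvec(struct ib_device *dev, struct ib_sge *sge)
{
	ib_dma_unmap_bvec(dev, sge->addr, sge->length, DMA_TO_DEVICE);
}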
int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents);
static inline int ib_dma_map_sg_attrs(struct ib_device *dev,
struct scatterlist *sg, int nents,
@ -4545,8 +4613,6 @@ static inline bool ib_device_try_get(struct ib_device *dev)
void ib_device_put(struct ib_device *device);
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
enum rdma_driver_id driver_id);
struct ib_device *ib_device_get_by_name(const char *name,
enum rdma_driver_id driver_id);
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port,
u16 pkey, const union ib_gid *gid,
const struct sockaddr *addr);

View file

@ -5,6 +5,7 @@
#ifndef _RDMA_RW_H
#define _RDMA_RW_H
#include <linux/bvec.h>
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h>
@ -31,6 +32,14 @@ struct rdma_rw_ctx {
struct ib_rdma_wr *wrs;
} map;
/* for IOVA-based mapping of bvecs into contiguous DMA range: */
struct {
struct dma_iova_state state;
struct ib_sge sge;
struct ib_rdma_wr wr;
size_t mapped_len;
} iova;
/* for registering multiple WRs: */
struct rdma_rw_reg_ctx {
struct ib_sge sge;
@ -38,6 +47,7 @@ struct rdma_rw_ctx {
struct ib_reg_wr reg_wr;
struct ib_send_wr inv_wr;
struct ib_mr *mr;
struct sg_table sgt;
} *reg;
};
};
@ -49,6 +59,16 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 port_num, struct scatterlist *sg, u32 sg_cnt,
enum dma_data_direction dir);
struct bio_vec;
int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
struct bvec_iter iter, u64 remote_addr, u32 rkey,
enum dma_data_direction dir);
void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
enum dma_data_direction dir);
int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u32 port_num, struct scatterlist *sg, u32 sg_cnt,
struct scatterlist *prot_sg, u32 prot_sg_cnt,
@ -66,6 +86,8 @@ int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
unsigned int maxpages);
unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num,
unsigned int max_rdma_ctxs, u32 create_flags);
void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr);
int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr);
void rdma_rw_cleanup_mrs(struct ib_qp *qp);
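A hedged sketch of how a ULP might drive the bvec-based R/W context API declared above; the helper name, error handling, and the assumption that the context is destroyed from the completion path are illustrative only.

static int read_into_bvecs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
			   u32 port_num, struct bio_vec *bvecs, u32 nr_bvec,
			   u32 len, u64 remote_addr, u32 rkey,
			   struct ib_cqe *cqe)
{
	struct bvec_iter iter = { .bi_size = len };
	int ret;

	ret = rdma_rw_ctx_init_bvec(ctx, qp, port_num, bvecs, nr_bvec,
				    iter, remote_addr, rkey, DMA_FROM_DEVICE);
	if (ret < 0)
		return ret;

	ret = rdma_rw_ctx_post(ctx, qp, port_num, cqe, NULL);
	if (ret)
		rdma_rw_ctx_destroy_bvec(ctx, qp, port_num, bvecs, nr_bvec,
					 DMA_FROM_DEVICE);
	/* On success the ctx is destroyed later, from the completion path. */
	return ret;
}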

View file

@ -186,6 +186,7 @@ struct ib_uverbs_file {
extern const struct uverbs_obj_type_class uverbs_idr_class;
extern const struct uverbs_obj_type_class uverbs_fd_class;
int uverbs_uobject_fd_release(struct inode *inode, struct file *filp);
int uverbs_uobject_release(struct ib_uobject *uobj);
#define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \
sizeof(char))

View file

@ -56,6 +56,7 @@ enum {
BNXT_RE_UCNTX_CMASK_DBR_PACING_ENABLED = 0x08ULL,
BNXT_RE_UCNTX_CMASK_POW2_DISABLED = 0x10ULL,
BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED = 0x40,
BNXT_RE_UCNTX_CMASK_QP_RATE_LIMIT_ENABLED = 0x80ULL,
};
enum bnxt_re_wqe_mode {
@ -215,4 +216,19 @@ enum bnxt_re_toggle_mem_methods {
BNXT_RE_METHOD_GET_TOGGLE_MEM = (1U << UVERBS_ID_NS_SHIFT),
BNXT_RE_METHOD_RELEASE_TOGGLE_MEM,
};
struct bnxt_re_packet_pacing_caps {
__u32 qp_rate_limit_min;
__u32 qp_rate_limit_max; /* In kbps */
/* Corresponding bit will be set if qp type from
* 'enum ib_qp_type' is supported, e.g.
* supported_qpts |= 1 << IB_QPT_RC
*/
__u32 supported_qpts;
__u32 reserved;
};
struct bnxt_re_query_device_ex_resp {
struct bnxt_re_packet_pacing_caps packet_pacing_caps;
};
#endif /* __BNXT_RE_UVERBS_ABI_H__*/
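A small sketch (purely illustrative; names other than the ABI structs above are assumptions) of how a consumer could validate a requested rate against the advertised packet pacing capabilities:

static int qp_rate_limit_ok(const struct bnxt_re_query_device_ex_resp *resp,
			    __u32 rate_kbps)
{
	const struct bnxt_re_packet_pacing_caps *caps =
		&resp->packet_pacing_caps;

	if (!(caps->supported_qpts & (1 << IB_QPT_RC)))
		return 0;	/* rate limiting not supported for RC QPs */
	return rate_kbps >= caps->qp_rate_limit_min &&
	       rate_kbps <= caps->qp_rate_limit_max;
}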

View file

@ -56,6 +56,7 @@ enum uverbs_default_objects {
UVERBS_OBJECT_COUNTERS,
UVERBS_OBJECT_ASYNC_EVENT,
UVERBS_OBJECT_DMAH,
UVERBS_OBJECT_DMABUF,
};
enum {
@ -73,6 +74,7 @@ enum uverbs_methods_device {
UVERBS_METHOD_QUERY_CONTEXT,
UVERBS_METHOD_QUERY_GID_TABLE,
UVERBS_METHOD_QUERY_GID_ENTRY,
UVERBS_METHOD_QUERY_PORT_SPEED,
};
enum uverbs_attrs_invoke_write_cmd_attr_ids {
@ -86,6 +88,11 @@ enum uverbs_attrs_query_port_cmd_attr_ids {
UVERBS_ATTR_QUERY_PORT_RESP,
};
enum uverbs_attrs_query_port_speed_cmd_attr_ids {
UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM,
UVERBS_ATTR_QUERY_PORT_SPEED_RESP,
};
enum uverbs_attrs_get_context_attr_ids {
UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS,
UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT,
@ -257,6 +264,15 @@ enum uverbs_methods_dmah {
UVERBS_METHOD_DMAH_FREE,
};
enum uverbs_attrs_alloc_dmabuf_cmd_attr_ids {
UVERBS_ATTR_ALLOC_DMABUF_HANDLE,
UVERBS_ATTR_ALLOC_DMABUF_PGOFF,
};
enum uverbs_methods_dmabuf {
UVERBS_METHOD_DMABUF_ALLOC,
};
enum uverbs_attrs_reg_dm_mr_cmd_attr_ids {
UVERBS_ATTR_REG_DM_MR_HANDLE,
UVERBS_ATTR_REG_DM_MR_OFFSET,

View file

@ -17,6 +17,9 @@
#define MANA_IB_UVERBS_ABI_VERSION 1
enum mana_ib_create_cq_flags {
/* Reserved for backward compatibility. Legacy
* kernel versions used this flag to create CQs on the RNIC.
*/
MANA_IB_CREATE_RNIC_CQ = 1 << 0,
};

View file

@ -5,6 +5,8 @@
* Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
*/
#include <linux/bvec.h>
#include <linux/overflow.h>
#include <rdma/rw.h>
#include <linux/sunrpc/xdr.h>
@ -20,30 +22,33 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);
/* Each R/W context contains state for one chain of RDMA Read or
* Write Work Requests.
*
* Each WR chain handles a single contiguous server-side buffer,
* because scatterlist entries after the first have to start on
* page alignment. xdr_buf iovecs cannot guarantee alignment.
* Each WR chain handles a single contiguous server-side buffer.
* - each xdr_buf iovec is a single contiguous buffer
* - the xdr_buf pages array is a single contiguous buffer because the
* second through the last element always start on a page boundary
*
* Each WR chain handles only one R_key. Each RPC-over-RDMA segment
* from a client may contain a unique R_key, so each WR chain moves
* up to one segment at a time.
*
* The scatterlist makes this data structure over 4KB in size. To
* make it less likely to fail, and to handle the allocation for
* smaller I/O requests without disabling bottom-halves, these
* contexts are created on demand, but cached and reused until the
* controlling svcxprt_rdma is destroyed.
* The inline bvec array is sized to handle most I/O requests without
* additional allocation. Larger requests fall back to dynamic allocation.
* These contexts are created on demand, but cached and reused until
* the controlling svcxprt_rdma is destroyed.
*/
struct svc_rdma_rw_ctxt {
struct llist_node rw_node;
struct list_head rw_list;
struct rdma_rw_ctx rw_ctx;
unsigned int rw_nents;
unsigned int rw_first_sgl_nents;
struct sg_table rw_sg_table;
struct scatterlist rw_first_sgl[];
unsigned int rw_first_bvec_nents;
struct bio_vec *rw_bvec;
struct bio_vec rw_first_bvec[];
};
static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
struct svc_rdma_rw_ctxt *ctxt);
static inline struct svc_rdma_rw_ctxt *
svc_rdma_next_ctxt(struct list_head *list)
{
@ -52,10 +57,10 @@ svc_rdma_next_ctxt(struct list_head *list)
}
static struct svc_rdma_rw_ctxt *
svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec)
{
struct ib_device *dev = rdma->sc_cm_id->device;
unsigned int first_sgl_nents = dev->attrs.max_send_sge;
unsigned int first_bvec_nents = dev->attrs.max_send_sge;
struct svc_rdma_rw_ctxt *ctxt;
struct llist_node *node;
@ -65,33 +70,44 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
if (node) {
ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
} else {
ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents),
ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec,
first_bvec_nents),
GFP_KERNEL, ibdev_to_node(dev));
if (!ctxt)
goto out_noctx;
INIT_LIST_HEAD(&ctxt->rw_list);
ctxt->rw_first_sgl_nents = first_sgl_nents;
ctxt->rw_first_bvec_nents = first_bvec_nents;
}
ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
ctxt->rw_sg_table.sgl,
first_sgl_nents))
goto out_free;
if (nr_bvec <= ctxt->rw_first_bvec_nents) {
ctxt->rw_bvec = ctxt->rw_first_bvec;
} else {
ctxt->rw_bvec = kmalloc_array_node(nr_bvec,
sizeof(*ctxt->rw_bvec),
GFP_KERNEL,
ibdev_to_node(dev));
if (!ctxt->rw_bvec)
goto out_free;
}
return ctxt;
out_free:
kfree(ctxt);
/* Return contexts obtained from the cache back to it; free freshly allocated ones */
if (node)
svc_rdma_put_rw_ctxt(rdma, ctxt);
else
kfree(ctxt);
out_noctx:
trace_svcrdma_rwctx_empty(rdma, sges);
trace_svcrdma_rwctx_empty(rdma, nr_bvec);
return NULL;
}
static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt,
struct llist_head *list)
{
sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents);
if (ctxt->rw_bvec != ctxt->rw_first_bvec)
kfree(ctxt->rw_bvec);
llist_add(&ctxt->rw_node, list);
}
@ -123,6 +139,7 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
* @ctxt: R/W context to prepare
* @offset: RDMA offset
* @handle: RDMA tag/handle
* @length: total number of bytes in the bvec array
* @direction: I/O direction
*
* Returns on success, the number of WQEs that will be needed
@ -130,14 +147,18 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
*/
static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
struct svc_rdma_rw_ctxt *ctxt,
u64 offset, u32 handle,
u64 offset, u32 handle, unsigned int length,
enum dma_data_direction direction)
{
struct bvec_iter iter = {
.bi_size = length,
};
int ret;
ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num,
ctxt->rw_sg_table.sgl, ctxt->rw_nents,
0, offset, handle, direction);
ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp,
rdma->sc_port_num,
ctxt->rw_bvec, ctxt->rw_nents,
iter, offset, handle, direction);
if (unlikely(ret < 0)) {
trace_svcrdma_dma_map_rw_err(rdma, offset, handle,
ctxt->rw_nents, ret);
@ -175,7 +196,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
{
struct llist_node *first, *last;
struct svc_rdma_rw_ctxt *ctxt;
LLIST_HEAD(free);
trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount);
@ -183,10 +203,11 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
list_del(&ctxt->rw_list);
rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
rdma->sc_port_num, ctxt->rw_sg_table.sgl,
ctxt->rw_nents, dir);
__svc_rdma_put_rw_ctxt(ctxt, &free);
rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp,
rdma->sc_port_num,
ctxt->rw_bvec, ctxt->rw_nents, dir);
if (ctxt->rw_bvec != ctxt->rw_first_bvec)
kfree(ctxt->rw_bvec);
ctxt->rw_node.next = first;
first = &ctxt->rw_node;
@ -414,29 +435,26 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma,
return -ENOTCONN;
}
/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
/* Build a bvec that covers one kvec in an xdr_buf.
*/
static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
unsigned int len,
struct svc_rdma_rw_ctxt *ctxt)
static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info,
unsigned int len,
struct svc_rdma_rw_ctxt *ctxt)
{
struct scatterlist *sg = ctxt->rw_sg_table.sgl;
sg_set_buf(&sg[0], info->wi_base, len);
bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len);
info->wi_base += len;
ctxt->rw_nents = 1;
}
-/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
+/* Build a bvec array that covers part of an xdr_buf's pagelist.
 */
-static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
-				    unsigned int remaining,
-				    struct svc_rdma_rw_ctxt *ctxt)
+static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info,
+				      unsigned int remaining,
+				      struct svc_rdma_rw_ctxt *ctxt)
{
-	unsigned int sge_no, sge_bytes, page_off, page_no;
+	unsigned int bvec_idx, bvec_len, page_off, page_no;
	const struct xdr_buf *xdr = info->wi_xdr;
-	struct scatterlist *sg;
struct page **page;
page_off = info->wi_next_off + xdr->page_base;
@@ -444,21 +462,19 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
page_off = offset_in_page(page_off);
page = xdr->pages + page_no;
info->wi_next_off += remaining;
-	sg = ctxt->rw_sg_table.sgl;
-	sge_no = 0;
+	bvec_idx = 0;
	do {
-		sge_bytes = min_t(unsigned int, remaining,
-				  PAGE_SIZE - page_off);
-		sg_set_page(sg, *page, sge_bytes, page_off);
-		remaining -= sge_bytes;
-		sg = sg_next(sg);
+		bvec_len = min_t(unsigned int, remaining,
+				 PAGE_SIZE - page_off);
+		bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, bvec_len,
+			      page_off);
+		remaining -= bvec_len;
		page_off = 0;
-		sge_no++;
+		bvec_idx++;
		page++;
	} while (remaining);
-	ctxt->rw_nents = sge_no;
+	ctxt->rw_nents = bvec_idx;
}
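
For orientation, the two constructors above rely on the stock <linux/bvec.h> initializers rather than on anything new in this series; from memory they amount to the following (check the header for the authoritative definitions):

/* Approximate, for reference only. */
static inline void bvec_set_page(struct bio_vec *bv, struct page *page,
				 unsigned int len, unsigned int offset)
{
	bv->bv_page   = page;
	bv->bv_len    = len;
	bv->bv_offset = offset;
}

static inline void bvec_set_virt(struct bio_vec *bv, void *vaddr,
				 unsigned int len)
{
	/* a kernel-virtual buffer is described by its backing page */
	bvec_set_page(bv, virt_to_page(vaddr), len, offset_in_page(vaddr));
}
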
/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
@@ -496,7 +512,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
constructor(info, write_len, ctxt);
offset = seg->rs_offset + info->wi_seg_off;
ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle,
-				   DMA_TO_DEVICE);
+				   write_len, DMA_TO_DEVICE);
if (ret < 0)
return -EIO;
percpu_counter_inc(&svcrdma_stat_write);
@@ -535,7 +551,7 @@ static int svc_rdma_iov_write(struct svc_rdma_write_info *info,
const struct kvec *iov)
{
info->wi_base = iov->iov_base;
-	return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+	return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec,
iov->iov_len);
}
@@ -559,7 +575,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
{
info->wi_xdr = xdr;
info->wi_next_off = offset - xdr->head[0].iov_len;
-	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec,
length);
}
@@ -734,29 +750,29 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
{
struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
-	unsigned int sge_no, seg_len, len;
+	unsigned int bvec_idx, nr_bvec, seg_len, len, total;
	struct svc_rdma_rw_ctxt *ctxt;
-	struct scatterlist *sg;
	int ret;
	len = segment->rs_length;
-	sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT;
-	ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no);
+	if (check_add_overflow(head->rc_pageoff, len, &total))
+		return -EINVAL;
+	nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT;
+	ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec);
	if (!ctxt)
		return -ENOMEM;
-	ctxt->rw_nents = sge_no;
+	ctxt->rw_nents = nr_bvec;
-	sg = ctxt->rw_sg_table.sgl;
-	for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) {
+	for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) {
		seg_len = min_t(unsigned int, len,
				PAGE_SIZE - head->rc_pageoff);
		if (!head->rc_pageoff)
			head->rc_page_count++;
-		sg_set_page(sg, rqstp->rq_pages[head->rc_curpage],
-			    seg_len, head->rc_pageoff);
-		sg = sg_next(sg);
+		bvec_set_page(&ctxt->rw_bvec[bvec_idx],
+			      rqstp->rq_pages[head->rc_curpage],
+			      seg_len, head->rc_pageoff);
head->rc_pageoff += seg_len;
if (head->rc_pageoff == PAGE_SIZE) {
@@ -770,7 +786,8 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
}
ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset,
-				   segment->rs_handle, DMA_FROM_DEVICE);
+				   segment->rs_handle, segment->rs_length,
+				   DMA_FROM_DEVICE);
if (ret < 0)
return -EIO;
percpu_counter_inc(&svcrdma_stat_read);
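
The arithmetic in svc_rdma_build_read_segment() above is worth spelling out: check_add_overflow() (the stock <linux/overflow.h> macro) rejects a segment whose length would wrap rc_pageoff, and the aligned sum gives the number of bio_vecs needed. A small worked example with made-up numbers:

#include <linux/overflow.h>
#include <linux/mm.h>

/* Sketch: how many bio_vecs a segment needs, overflow-checked. */
static int demo_nr_bvec(unsigned int pageoff, unsigned int len,
			unsigned int *nr_bvec)
{
	unsigned int total;

	if (check_add_overflow(pageoff, len, &total))
		return -EINVAL;		/* length would wrap around */

	/* e.g. pageoff = 4000, len = 9000: total = 13000,
	 * PAGE_ALIGN(13000) = 16384 with 4 KiB pages, so 4 bio_vecs.
	 */
	*nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT;
	return 0;
}
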


@@ -462,7 +462,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_bc_requests = 2;
}
-	/* Arbitrary estimate of the needed number of rdma_rw contexts.
+	/* Estimate the needed number of rdma_rw contexts. The maximum
+	 * Read and Write chunks have one segment each. Each request
+	 * can involve one Read chunk and either a Write chunk or Reply
+	 * chunk; thus a factor of three.
*/
maxpayload = min(xprt->xpt_server->sv_max_payload,
RPCSVC_MAXPAYLOAD_RDMA);
@@ -470,7 +473,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
rdma_rw_mr_factor(dev, newxprt->sc_port_num,
maxpayload >> PAGE_SHIFT);
-	newxprt->sc_sq_depth = rq_depth + ctxts;
+	newxprt->sc_sq_depth = rq_depth +
+		rdma_rw_max_send_wr(dev, newxprt->sc_port_num, ctxts, 0);
if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
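
Finally, a hedged sketch of the send-queue sizing performed in the hunk above. rdma_rw_max_send_wr() appears to be one of the new SQ-sizing helpers introduced by this pull, so its exact semantics are assumed here from the call site rather than quoted from its definition:

/* Sketch of the SQ-depth computation; helper semantics assumed from the hunk. */
static u32 demo_sq_depth(struct ib_device *dev, u32 port_num,
			 u32 rq_depth, u32 ctxts)
{
	u32 sq_depth = rq_depth +
		       rdma_rw_max_send_wr(dev, port_num, ctxts, 0);

	if (sq_depth > dev->attrs.max_qp_wr)
		sq_depth = dev->attrs.max_qp_wr;	/* device limit wins */
	return sq_depth;
}
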