diff --git a/MAINTAINERS b/MAINTAINERS index 28d6fd75d43a..9b746cccef03 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11842,7 +11842,6 @@ F: arch/x86/kernel/cpu/mshyperv.c F: drivers/clocksource/hyperv_timer.c F: drivers/hid/hid-hyperv.c F: drivers/hv/ -F: drivers/infiniband/hw/mana/ F: drivers/input/serio/hyperv-keyboard.c F: drivers/iommu/hyperv-iommu.c F: drivers/net/ethernet/microsoft/ @@ -11861,7 +11860,6 @@ F: include/hyperv/hvhdk_mini.h F: include/linux/hyperv.h F: include/net/mana F: include/uapi/linux/hyperv.h -F: include/uapi/rdma/mana-abi.h F: net/vmw_vsock/hyperv_transport.c F: tools/hv/ @@ -17468,6 +17466,7 @@ MICROSOFT MANA RDMA DRIVER M: Long Li M: Konstantin Taranov L: linux-rdma@vger.kernel.org +L: linux-hyperv@vger.kernel.org S: Supported F: drivers/infiniband/hw/mana/ F: include/net/mana diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index f483e0c12444..a2a7a9d2e0d3 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -33,6 +33,7 @@ ib_umad-y := user_mad.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_std_types_cq.o \ + uverbs_std_types_dmabuf.o \ uverbs_std_types_dmah.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 81cf3c902e81..0fc1c5bce2f0 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1537,7 +1537,8 @@ static void ib_cache_event_task(struct work_struct *_work) * the cache. */ ret = ib_cache_update(work->event.device, work->event.element.port_num, - work->event.event == IB_EVENT_GID_CHANGE, + work->event.event == IB_EVENT_GID_CHANGE || + work->event.event == IB_EVENT_CLIENT_REREGISTER, work->event.event == IB_EVENT_PKEY_CHANGE, work->enforce_security); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 1174ab7da629..2f2081e75bce 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -361,34 +361,6 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } -/** - * ib_device_get_by_name - Find an IB device by name - * @name: The name to look for - * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) - * - * Find and hold an ib_device by its name. The caller must call - * ib_device_put() on the returned pointer. 
- */ -struct ib_device *ib_device_get_by_name(const char *name, - enum rdma_driver_id driver_id) -{ - struct ib_device *device; - - down_read(&devices_rwsem); - device = __ib_device_get_by_name(name); - if (device && driver_id != RDMA_DRIVER_UNKNOWN && - device->ops.driver_id != driver_id) - device = NULL; - - if (device) { - if (!ib_device_try_get(device)) - device = NULL; - } - up_read(&devices_rwsem); - return device; -} -EXPORT_SYMBOL(ib_device_get_by_name); - static int rename_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; @@ -2793,6 +2765,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, mmap); + SET_DEVICE_OP(dev_ops, mmap_get_pfns); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); @@ -2803,6 +2776,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); + SET_DEVICE_OP(dev_ops, pgoff_to_mmap_entry); SET_DEVICE_OP(dev_ops, pre_destroy_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); @@ -2816,6 +2790,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, query_gid); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); + SET_DEVICE_OP(dev_ops, query_port_speed); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); SET_DEVICE_OP(dev_ops, query_ucontext); @@ -2875,7 +2850,6 @@ int ib_add_sub_device(struct ib_device *parent, return ret; } -EXPORT_SYMBOL(ib_add_sub_device); int ib_del_sub_device_and_put(struct ib_device *sub) { @@ -2896,7 +2870,6 @@ int ib_del_sub_device_and_put(struct ib_device *sub) return 0; } -EXPORT_SYMBOL(ib_del_sub_device_and_put); #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c index b51bd7087a88..1de72ff4610c 100644 --- a/drivers/infiniband/core/ib_core_uverbs.c +++ b/drivers/infiniband/core/ib_core_uverbs.c @@ -5,9 +5,13 @@ * Copyright 2019 Marvell. All rights reserved. 
*/ #include +#include +#include #include "uverbs.h" #include "core_priv.h" +MODULE_IMPORT_NS("DMA_BUF"); + /** * rdma_umap_priv_init() - Initialize the private data of a vma * @@ -229,12 +233,29 @@ EXPORT_SYMBOL(rdma_user_mmap_entry_put); */ void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry) { + struct ib_uverbs_dmabuf_file *uverbs_dmabuf, *tmp; + if (!entry) return; + mutex_lock(&entry->dmabufs_lock); xa_lock(&entry->ucontext->mmap_xa); entry->driver_removed = true; xa_unlock(&entry->ucontext->mmap_xa); + list_for_each_entry_safe(uverbs_dmabuf, tmp, &entry->dmabufs, dmabufs_elm) { + dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL); + list_del(&uverbs_dmabuf->dmabufs_elm); + uverbs_dmabuf->revoked = true; + dma_buf_move_notify(uverbs_dmabuf->dmabuf); + dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv, + DMA_RESV_USAGE_BOOKKEEP, false, + MAX_SCHEDULE_TIMEOUT); + dma_resv_unlock(uverbs_dmabuf->dmabuf->resv); + kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done); + wait_for_completion(&uverbs_dmabuf->comp); + } + mutex_unlock(&entry->dmabufs_lock); + kref_put(&entry->ref, rdma_user_mmap_entry_free); } EXPORT_SYMBOL(rdma_user_mmap_entry_remove); @@ -274,6 +295,9 @@ int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext, return -EINVAL; kref_init(&entry->ref); + INIT_LIST_HEAD(&entry->dmabufs); + mutex_init(&entry->dmabufs_lock); + entry->ucontext = ucontext; /* diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 62410578dec3..eb942ab9c405 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -95,7 +95,6 @@ static struct workqueue_struct *iwcm_wq; struct iwcm_work { struct work_struct work; struct iwcm_id_private *cm_id; - struct list_head list; struct iw_cm_event event; struct list_head free_list; }; @@ -178,7 +177,6 @@ static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) return -ENOMEM; } work->cm_id = cm_id_priv; - INIT_LIST_HEAD(&work->list); put_work(work); } return 0; @@ -213,7 +211,6 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv) static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { if (refcount_dec_and_test(&cm_id_priv->refcount)) { - BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); return true; } @@ -260,7 +257,6 @@ struct iw_cm_id *iw_create_cm_id(struct ib_device *device, refcount_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); init_completion(&cm_id_priv->destroy_comp); - INIT_LIST_HEAD(&cm_id_priv->work_list); INIT_LIST_HEAD(&cm_id_priv->work_free_list); return &cm_id_priv->id; @@ -1007,13 +1003,13 @@ static int process_event(struct iwcm_id_private *cm_id_priv, } /* - * Process events on the work_list for the cm_id. If the callback - * function requests that the cm_id be deleted, a flag is set in the - * cm_id flags to indicate that when the last reference is - * removed, the cm_id is to be destroyed. This is necessary to - * distinguish between an object that will be destroyed by the app - * thread asleep on the destroy_comp list vs. an object destroyed - * here synchronously when the last reference is removed. + * Process events for the cm_id. If the callback function requests + * that the cm_id be deleted, a flag is set in the cm_id flags to + * indicate that when the last reference is removed, the cm_id is + * to be destroyed. This is necessary to distinguish between an + * object that will be destroyed by the app thread asleep on the + * destroy_comp list vs. 
an object destroyed here synchronously + * when the last reference is removed. */ static void cm_work_handler(struct work_struct *_work) { @@ -1024,35 +1020,26 @@ static void cm_work_handler(struct work_struct *_work) int ret = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); - while (!list_empty(&cm_id_priv->work_list)) { - work = list_first_entry(&cm_id_priv->work_list, - struct iwcm_work, list); - list_del_init(&work->list); - levent = work->event; - put_work(work); - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - - if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { - ret = process_event(cm_id_priv, &levent); - if (ret) { - destroy_cm_id(&cm_id_priv->id); - WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); - } - } else - pr_debug("dropping event %d\n", levent.event); - if (iwcm_deref_id(cm_id_priv)) - return; - spin_lock_irqsave(&cm_id_priv->lock, flags); - } + levent = work->event; + put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { + ret = process_event(cm_id_priv, &levent); + if (ret) { + destroy_cm_id(&cm_id_priv->id); + WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); + } + } else + pr_debug("dropping event %d\n", levent.event); + if (iwcm_deref_id(cm_id_priv)) + return; } /* * This function is called on interrupt context. Schedule events on * the iwcm_wq thread to allow callback functions to downcall into - * the CM and/or block. Events are queued to a per-CM_ID - * work_list. If this is the first event on the work_list, the work - * element is also queued on the iwcm_wq thread. + * the CM and/or block. * * Each event holds a reference on the cm_id. Until the last posted * event has been delivered and processed, the cm_id cannot be @@ -1094,7 +1081,6 @@ static int cm_event_handler(struct iw_cm_id *cm_id, } refcount_inc(&cm_id_priv->refcount); - list_add_tail(&work->list, &cm_id_priv->work_list); queue_work(iwcm_wq, &work->work); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h index bf74639be128..b56fb12edece 100644 --- a/drivers/infiniband/core/iwcm.h +++ b/drivers/infiniband/core/iwcm.h @@ -50,7 +50,6 @@ struct iwcm_id_private { struct ib_qp *qp; struct completion destroy_comp; wait_queue_head_t connect_wait; - struct list_head work_list; spinlock_t lock; refcount_t refcount; struct list_head work_free_list; diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 18918f463361..3e0a8b9cd288 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -465,7 +465,7 @@ alloc_begin_fd_uobject(const struct uverbs_api_object *obj, fd_type = container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); - if (WARN_ON(fd_type->fops->release != &uverbs_uobject_fd_release && + if (WARN_ON(fd_type->fops && fd_type->fops->release != &uverbs_uobject_fd_release && fd_type->fops->release != &uverbs_async_event_release)) { ret = ERR_PTR(-EINVAL); goto err_fd; @@ -477,14 +477,16 @@ alloc_begin_fd_uobject(const struct uverbs_api_object *obj, goto err_fd; } - /* Note that uverbs_uobject_fd_release() is called during abort */ - filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL, - fd_type->flags); - if (IS_ERR(filp)) { - ret = ERR_CAST(filp); - goto err_getfile; + if (fd_type->fops) { + /* Note that uverbs_uobject_fd_release() is called during abort */ + filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL, + fd_type->flags); + if (IS_ERR(filp)) { + ret = 
ERR_CAST(filp); + goto err_getfile; + } + uobj->object = filp; } - uobj->object = filp; uobj->id = new_fd; return uobj; @@ -561,7 +563,9 @@ static void alloc_abort_fd_uobject(struct ib_uobject *uobj) { struct file *filp = uobj->object; - fput(filp); + if (filp) + fput(filp); + put_unused_fd(uobj->id); } @@ -628,11 +632,14 @@ static void alloc_commit_fd_uobject(struct ib_uobject *uobj) /* This shouldn't be used anymore. Use the file object instead */ uobj->id = 0; - /* - * NOTE: Once we install the file we loose ownership of our kref on - * uobj. It will be put by uverbs_uobject_fd_release() - */ - filp->private_data = uobj; + if (!filp->private_data) { + /* + * NOTE: Once we install the file we loose ownership of our kref on + * uobj. It will be put by uverbs_uobject_fd_release() + */ + filp->private_data = uobj; + } + fd_install(fd, filp); } @@ -802,21 +809,10 @@ const struct uverbs_obj_type_class uverbs_idr_class = { }; EXPORT_SYMBOL(uverbs_idr_class); -/* - * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct - * file_operations release method. - */ -int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) +int uverbs_uobject_release(struct ib_uobject *uobj) { struct ib_uverbs_file *ufile; - struct ib_uobject *uobj; - /* - * This can only happen if the fput came from alloc_abort_fd_uobject() - */ - if (!filp->private_data) - return 0; - uobj = filp->private_data; ufile = uobj->ufile; if (down_read_trylock(&ufile->hw_destroy_rwsem)) { @@ -843,6 +839,21 @@ int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) uverbs_uobject_put(uobj); return 0; } + +/* + * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct + * file_operations release method. + */ +int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) +{ + /* + * This can only happen if the fput came from alloc_abort_fd_uobject() + */ + if (!filp->private_data) + return 0; + + return uverbs_uobject_release(filp->private_data); +} EXPORT_SYMBOL(uverbs_uobject_fd_release); /* diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index a59b087611cb..55f1e3558856 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[]; extern const struct uapi_definition uverbs_def_obj_cq[]; extern const struct uapi_definition uverbs_def_obj_device[]; extern const struct uapi_definition uverbs_def_obj_dm[]; +extern const struct uapi_definition uverbs_def_obj_dmabuf[]; extern const struct uapi_definition uverbs_def_obj_dmah[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 6354ddf2a274..518095d82d5d 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -14,6 +14,7 @@ enum { RDMA_RW_MULTI_WR, RDMA_RW_MR, RDMA_RW_SIG_MR, + RDMA_RW_IOVA, }; static bool rdma_rw_force_mr; @@ -121,6 +122,36 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num, return count; } +static int rdma_rw_init_reg_wr(struct rdma_rw_reg_ctx *reg, + struct rdma_rw_reg_ctx *prev, struct ib_qp *qp, u32 port_num, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + if (prev) { + if (reg->mr->need_inval) + prev->wr.wr.next = ®->inv_wr; + else + prev->wr.wr.next = ®->reg_wr.wr; + } + + reg->reg_wr.wr.next = ®->wr.wr; + + reg->wr.wr.sg_list = ®->sge; + reg->wr.wr.num_sge = 
1; + reg->wr.remote_addr = remote_addr; + reg->wr.rkey = rkey; + + if (dir == DMA_TO_DEVICE) { + reg->wr.wr.opcode = IB_WR_RDMA_WRITE; + } else if (!rdma_cap_read_inv(qp->device, port_num)) { + reg->wr.wr.opcode = IB_WR_RDMA_READ; + } else { + reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; + reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; + } + + return 1; +} + static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) @@ -146,30 +177,8 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, if (ret < 0) goto out_free; count += ret; - - if (prev) { - if (reg->mr->need_inval) - prev->wr.wr.next = ®->inv_wr; - else - prev->wr.wr.next = ®->reg_wr.wr; - } - - reg->reg_wr.wr.next = ®->wr.wr; - - reg->wr.wr.sg_list = ®->sge; - reg->wr.wr.num_sge = 1; - reg->wr.remote_addr = remote_addr; - reg->wr.rkey = rkey; - if (dir == DMA_TO_DEVICE) { - reg->wr.wr.opcode = IB_WR_RDMA_WRITE; - } else if (!rdma_cap_read_inv(qp->device, port_num)) { - reg->wr.wr.opcode = IB_WR_RDMA_READ; - } else { - reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; - reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; - } - count++; - + count += rdma_rw_init_reg_wr(reg, prev, qp, port_num, + remote_addr, rkey, dir); remote_addr += reg->sge.length; sg_cnt -= nents; for (j = 0; j < nents; j++) @@ -192,6 +201,92 @@ out: return ret; } +static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, + struct bvec_iter *iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct rdma_rw_reg_ctx *prev = NULL; + u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en); + struct scatterlist *sg; + int i, ret, count = 0; + u32 nents = 0; + + ctx->reg = kcalloc(DIV_ROUND_UP(nr_bvec, pages_per_mr), + sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) + return -ENOMEM; + + /* + * Build scatterlist from bvecs using the iterator. This follows + * the pattern from __blk_rq_map_sg. 
+ */ + ctx->reg[0].sgt.sgl = kmalloc_array(nr_bvec, + sizeof(*ctx->reg[0].sgt.sgl), + GFP_KERNEL); + if (!ctx->reg[0].sgt.sgl) { + ret = -ENOMEM; + goto out_free_reg; + } + sg_init_table(ctx->reg[0].sgt.sgl, nr_bvec); + + for (sg = ctx->reg[0].sgt.sgl; iter->bi_size; sg = sg_next(sg)) { + struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter); + + if (nents >= nr_bvec) { + ret = -EINVAL; + goto out_free_sgl; + } + sg_set_page(sg, bv.bv_page, bv.bv_len, bv.bv_offset); + bvec_iter_advance(bvecs, iter, bv.bv_len); + nents++; + } + sg_mark_end(sg_last(ctx->reg[0].sgt.sgl, nents)); + ctx->reg[0].sgt.orig_nents = nents; + + /* DMA map the scatterlist */ + ret = ib_dma_map_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); + if (ret) + goto out_free_sgl; + + ctx->nr_ops = DIV_ROUND_UP(ctx->reg[0].sgt.nents, pages_per_mr); + + sg = ctx->reg[0].sgt.sgl; + nents = ctx->reg[0].sgt.nents; + for (i = 0; i < ctx->nr_ops; i++) { + struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; + u32 sge_cnt = min(nents, pages_per_mr); + + ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sge_cnt, 0); + if (ret < 0) + goto out_free_mrs; + count += ret; + count += rdma_rw_init_reg_wr(reg, prev, qp, port_num, + remote_addr, rkey, dir); + remote_addr += reg->sge.length; + nents -= sge_cnt; + sg += sge_cnt; + prev = reg; + } + + if (prev) + prev->wr.wr.next = NULL; + + ctx->type = RDMA_RW_MR; + return count; + +out_free_mrs: + while (--i >= 0) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); +out_free_sgl: + kfree(ctx->reg[0].sgt.sgl); +out_free_reg: + kfree(ctx->reg); + return ret; +} + static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) @@ -274,6 +369,196 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, return 1; } +static int rdma_rw_init_single_wr_bvec(struct rdma_rw_ctx *ctx, + struct ib_qp *qp, const struct bio_vec *bvecs, + struct bvec_iter *iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct ib_rdma_wr *rdma_wr = &ctx->single.wr; + struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter); + u64 dma_addr; + + ctx->nr_ops = 1; + + dma_addr = ib_dma_map_bvec(dev, &bv, dir); + if (ib_dma_mapping_error(dev, dma_addr)) + return -ENOMEM; + + ctx->single.sge.lkey = qp->pd->local_dma_lkey; + ctx->single.sge.addr = dma_addr; + ctx->single.sge.length = bv.bv_len; + + memset(rdma_wr, 0, sizeof(*rdma_wr)); + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->wr.sg_list = &ctx->single.sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + + ctx->type = RDMA_RW_SINGLE_WR; + return 1; +} + +static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + const struct bio_vec *bvecs, u32 nr_bvec, struct bvec_iter *iter, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 max_sge = dir == DMA_TO_DEVICE ? 
qp->max_write_sge : + qp->max_read_sge; + struct ib_sge *sge; + u32 total_len = 0, i, j; + u32 mapped_bvecs = 0; + u32 nr_ops = DIV_ROUND_UP(nr_bvec, max_sge); + size_t sges_size = array_size(nr_bvec, sizeof(*ctx->map.sges)); + size_t wrs_offset = ALIGN(sges_size, __alignof__(*ctx->map.wrs)); + size_t wrs_size = array_size(nr_ops, sizeof(*ctx->map.wrs)); + void *mem; + + if (sges_size == SIZE_MAX || wrs_size == SIZE_MAX || + check_add_overflow(wrs_offset, wrs_size, &wrs_size)) + return -ENOMEM; + + mem = kzalloc(wrs_size, GFP_KERNEL); + if (!mem) + return -ENOMEM; + + ctx->map.sges = sge = mem; + ctx->map.wrs = mem + wrs_offset; + + for (i = 0; i < nr_ops; i++) { + struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i]; + u32 nr_sge = min(nr_bvec - mapped_bvecs, max_sge); + + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->remote_addr = remote_addr + total_len; + rdma_wr->rkey = rkey; + rdma_wr->wr.num_sge = nr_sge; + rdma_wr->wr.sg_list = sge; + + for (j = 0; j < nr_sge; j++) { + struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter); + u64 dma_addr; + + dma_addr = ib_dma_map_bvec(dev, &bv, dir); + if (ib_dma_mapping_error(dev, dma_addr)) + goto out_unmap; + + mapped_bvecs++; + sge->addr = dma_addr; + sge->length = bv.bv_len; + sge->lkey = qp->pd->local_dma_lkey; + + total_len += bv.bv_len; + sge++; + + bvec_iter_advance_single(bvecs, iter, bv.bv_len); + } + + rdma_wr->wr.next = i + 1 < nr_ops ? + &ctx->map.wrs[i + 1].wr : NULL; + } + + ctx->nr_ops = nr_ops; + ctx->type = RDMA_RW_MULTI_WR; + return nr_ops; + +out_unmap: + for (i = 0; i < mapped_bvecs; i++) + ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr, + ctx->map.sges[i].length, dir); + kfree(ctx->map.sges); + return -ENOMEM; +} + +/* + * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range. + * This reduces IOTLB sync overhead by doing one sync at the end instead of + * one per bvec, and produces a contiguous DMA address range that can be + * described by a single SGE. + * + * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA + * mapping is not available, or another negative error code on failure. 
+ */ +static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx, + struct ib_qp *qp, const struct bio_vec *bvec, + struct bvec_iter *iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct device *dma_dev = dev->dma_device; + size_t total_len = iter->bi_size; + struct bio_vec first_bv; + size_t mapped_len = 0; + int ret; + + /* Virtual DMA devices cannot support IOVA allocators */ + if (ib_uses_virt_dma(dev)) + return -EOPNOTSUPP; + + /* Try to allocate contiguous IOVA space */ + first_bv = mp_bvec_iter_bvec(bvec, *iter); + if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state, + bvec_phys(&first_bv), total_len)) + return -EOPNOTSUPP; + + /* Link all bvecs into the IOVA space */ + while (iter->bi_size) { + struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter); + + ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv), + mapped_len, bv.bv_len, dir, 0); + if (ret) + goto out_destroy; + + mapped_len += bv.bv_len; + bvec_iter_advance(bvec, iter, bv.bv_len); + } + + /* Sync the IOTLB once for all linked pages */ + ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len); + if (ret) + goto out_destroy; + + ctx->iova.mapped_len = mapped_len; + + /* Single SGE covers the entire contiguous IOVA range */ + ctx->iova.sge.addr = ctx->iova.state.addr; + ctx->iova.sge.length = mapped_len; + ctx->iova.sge.lkey = qp->pd->local_dma_lkey; + + /* Single WR for the whole transfer */ + memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr)); + if (dir == DMA_TO_DEVICE) + ctx->iova.wr.wr.opcode = IB_WR_RDMA_WRITE; + else + ctx->iova.wr.wr.opcode = IB_WR_RDMA_READ; + ctx->iova.wr.wr.num_sge = 1; + ctx->iova.wr.wr.sg_list = &ctx->iova.sge; + ctx->iova.wr.remote_addr = remote_addr; + ctx->iova.wr.rkey = rkey; + + ctx->type = RDMA_RW_IOVA; + ctx->nr_ops = 1; + return 1; + +out_destroy: + /* + * dma_iova_destroy() expects the actual mapped length, not the + * total allocation size. It unlinks only the successfully linked + * range and frees the entire IOVA allocation. + */ + dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0); + return ret; +} + /** * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context * @ctx: context to initialize @@ -344,6 +629,79 @@ out_unmap_sg: } EXPORT_SYMBOL(rdma_rw_ctx_init); +/** + * rdma_rw_ctx_init_bvec - initialize a RDMA READ/WRITE context from bio_vec + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @bvecs: bio_vec array to READ/WRITE from/to + * @nr_bvec: number of entries in @bvecs + * @iter: bvec iterator describing offset and length + * @remote_addr: remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Maps the bio_vec array directly, avoiding intermediate scatterlist + * conversion. Supports MR registration for iWARP devices and force_mr mode. 
+ * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code: + * + * * -EINVAL - @nr_bvec is zero or @iter.bi_size is zero + * * -ENOMEM - DMA mapping or memory allocation failed + */ +int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, + struct bvec_iter iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + int ret; + + if (nr_bvec == 0 || iter.bi_size == 0) + return -EINVAL; + + /* + * iWARP requires MR registration for all RDMA READs. The force_mr + * debug option also mandates MR usage. + */ + if (dir == DMA_FROM_DEVICE && rdma_protocol_iwarp(dev, port_num)) + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, + nr_bvec, &iter, remote_addr, + rkey, dir); + if (unlikely(rdma_rw_force_mr)) + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, + nr_bvec, &iter, remote_addr, + rkey, dir); + + if (nr_bvec == 1) + return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter, + remote_addr, rkey, dir); + + /* + * Try IOVA-based mapping first for multi-bvec transfers. + * IOVA coalesces bvecs into a single DMA-contiguous region, + * reducing the number of WRs needed and avoiding MR overhead. + */ + ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr, + rkey, dir); + if (ret != -EOPNOTSUPP) + return ret; + + /* + * IOVA mapping not available. Check if MR registration provides + * better performance than multiple SGE entries. + */ + if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec)) + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, + nr_bvec, &iter, remote_addr, + rkey, dir); + + return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter, + remote_addr, rkey, dir); +} +EXPORT_SYMBOL(rdma_rw_ctx_init_bvec); + /** * rdma_rw_ctx_signature_init - initialize a RW context with signature offload * @ctx: context to initialize @@ -515,6 +873,10 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, first_wr = &ctx->reg[0].reg_wr.wr; last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr; break; + case RDMA_RW_IOVA: + first_wr = &ctx->iova.wr.wr; + last_wr = &ctx->iova.wr.wr; + break; case RDMA_RW_MULTI_WR: first_wr = &ctx->map.wrs[0].wr; last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr; @@ -579,6 +941,8 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, switch (ctx->type) { case RDMA_RW_MR: + /* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */ + WARN_ON_ONCE(ctx->reg[0].sgt.sgl); for (i = 0; i < ctx->nr_ops; i++) ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); kfree(ctx->reg); @@ -589,6 +953,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, break; case RDMA_RW_SINGLE_WR: break; + case RDMA_RW_IOVA: + /* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */ + WARN_ON_ONCE(1); + return; default: BUG(); break; @@ -598,6 +966,58 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, } EXPORT_SYMBOL(rdma_rw_ctx_destroy); +/** + * rdma_rw_ctx_destroy_bvec - release resources from rdma_rw_ctx_init_bvec + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound (unused) + * @bvecs: bio_vec array that was used for the READ/WRITE (unused) + * @nr_bvec: number of entries in @bvecs + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Releases all resources allocated by a successful rdma_rw_ctx_init_bvec() + * call. 
Must not be called if rdma_rw_ctx_init_bvec() returned an error. + * + * The @port_num and @bvecs parameters are unused but present for API + * symmetry with rdma_rw_ctx_destroy(). + */ +void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 __maybe_unused port_num, + const struct bio_vec __maybe_unused *bvecs, + u32 nr_bvec, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 i; + + switch (ctx->type) { + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); + kfree(ctx->reg[0].sgt.sgl); + kfree(ctx->reg); + break; + case RDMA_RW_IOVA: + dma_iova_destroy(dev->dma_device, &ctx->iova.state, + ctx->iova.mapped_len, dir, 0); + break; + case RDMA_RW_MULTI_WR: + for (i = 0; i < nr_bvec; i++) + ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr, + ctx->map.sges[i].length, dir); + kfree(ctx->map.sges); + break; + case RDMA_RW_SINGLE_WR: + ib_dma_unmap_bvec(dev, ctx->single.sge.addr, + ctx->single.sge.length, dir); + break; + default: + WARN_ON_ONCE(1); + return; + } +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy_bvec); + /** * rdma_rw_ctx_destroy_signature - release all resources allocated by * rdma_rw_ctx_signature_init @@ -651,34 +1071,57 @@ unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, } EXPORT_SYMBOL(rdma_rw_mr_factor); +/** + * rdma_rw_max_send_wr - compute max Send WRs needed for RDMA R/W contexts + * @dev: RDMA device + * @port_num: port number + * @max_rdma_ctxs: number of rdma_rw_ctx structures + * @create_flags: QP create flags (pass IB_QP_CREATE_INTEGRITY_EN if + * data integrity will be enabled on the QP) + * + * Returns the total number of Send Queue entries needed for + * @max_rdma_ctxs. The result accounts for memory registration and + * invalidation work requests when the device requires them. + * + * ULPs use this to size Send Queues and Send CQs before creating a + * Queue Pair. + */ +unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num, + unsigned int max_rdma_ctxs, u32 create_flags) +{ + unsigned int factor = 1; + unsigned int result; + + if (create_flags & IB_QP_CREATE_INTEGRITY_EN || + rdma_rw_can_use_mr(dev, port_num)) + factor += 2; /* reg + inv */ + + if (check_mul_overflow(factor, max_rdma_ctxs, &result)) + return UINT_MAX; + return result; +} +EXPORT_SYMBOL(rdma_rw_max_send_wr); + void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) { - u32 factor; + unsigned int factor = 1; WARN_ON_ONCE(attr->port_num == 0); /* - * Each context needs at least one RDMA READ or WRITE WR. - * - * For some hardware we might need more, eventually we should ask the - * HCA driver for a multiplier here. - */ - factor = 1; - - /* - * If the device needs MRs to perform RDMA READ or WRITE operations, - * we'll need two additional MRs for the registrations and the - * invalidation. + * If the device uses MRs to perform RDMA READ or WRITE operations, + * or if data integrity is enabled, account for registration and + * invalidation work requests. */ if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN || rdma_rw_can_use_mr(dev, attr->port_num)) - factor += 2; /* inv + reg */ + factor += 2; /* reg + inv */ attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; /* - * But maybe we were just too high in the sky and the device doesn't - * even support all we need, and we'll have to live with what we get.. 
+ * The device might not support all we need, and we'll have to + * live with what we get. */ attr->cap.max_send_wr = min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 0ed862b38b44..bfaca07933d8 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -292,62 +292,22 @@ static ssize_t cap_mask_show(struct ib_device *ibdev, u32 port_num, static ssize_t rate_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { + struct ib_port_speed_info speed_info; struct ib_port_attr attr; - char *speed = ""; - int rate; /* in deci-Gb/sec */ ssize_t ret; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - switch (attr.active_speed) { - case IB_SPEED_DDR: - speed = " DDR"; - rate = 50; - break; - case IB_SPEED_QDR: - speed = " QDR"; - rate = 100; - break; - case IB_SPEED_FDR10: - speed = " FDR10"; - rate = 100; - break; - case IB_SPEED_FDR: - speed = " FDR"; - rate = 140; - break; - case IB_SPEED_EDR: - speed = " EDR"; - rate = 250; - break; - case IB_SPEED_HDR: - speed = " HDR"; - rate = 500; - break; - case IB_SPEED_NDR: - speed = " NDR"; - rate = 1000; - break; - case IB_SPEED_XDR: - speed = " XDR"; - rate = 2000; - break; - case IB_SPEED_SDR: - default: /* default to SDR for invalid rates */ - speed = " SDR"; - rate = 25; - break; - } + ret = ib_port_attr_to_speed_info(&attr, &speed_info); + if (ret) + return ret; - rate *= ib_width_enum_to_int(attr.active_width); - if (rate < 0) - return -EINVAL; - - return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", rate / 10, - rate % 10 ? ".5" : "", - ib_width_enum_to_int(attr.active_width), speed); + return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", speed_info.rate / 10, + speed_info.rate % 10 ? 
".5" : "", + ib_width_enum_to_int(attr.active_width), + speed_info.str); } static const char *phys_state_to_str(enum ib_port_phys_state phys_state) diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 0ec2e4120cc9..939da49b0dcc 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -129,9 +129,6 @@ ib_umem_dmabuf_get_with_dma_device(struct ib_device *device, if (check_add_overflow(offset, (unsigned long)size, &end)) return ret; - if (unlikely(!ops || !ops->move_notify)) - return ret; - dmabuf = dma_buf_get(fd); if (IS_ERR(dmabuf)) return ERR_CAST(dmabuf); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index fd67fc9fe85a..2f7e3c4483fc 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -514,7 +514,8 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, struct rdma_ah_attr ah_attr; struct ib_ah *ah; __be64 *tid; - int ret, data_len, hdr_len, copy_offset, rmpp_active; + int ret, hdr_len, copy_offset, rmpp_active; + size_t data_len; u8 base_version; if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) @@ -588,7 +589,10 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, } base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version; - data_len = count - hdr_size(file) - hdr_len; + if (check_sub_overflow(count, hdr_size(file) + hdr_len, &data_len)) { + ret = -EINVAL; + goto err_ah; + } packet->msg = ib_create_send_mad(agent, be32_to_cpu(packet->mad.hdr.qpn), packet->mad.hdr.pkey_index, rmpp_active, diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 797e2fcc8072..6d4295277e0e 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -133,6 +133,18 @@ struct ib_uverbs_completion_event_file { struct ib_uverbs_event_queue ev_queue; }; +struct ib_uverbs_dmabuf_file { + struct ib_uobject uobj; + struct dma_buf *dmabuf; + struct list_head dmabufs_elm; + struct rdma_user_mmap_entry *mmap_entry; + struct phys_vec phys_vec; + struct p2pdma_provider *provider; + struct kref kref; + struct completion comp; + u8 revoked :1; +}; + struct ib_uverbs_event { union { struct ib_uverbs_async_event_desc async; @@ -290,4 +302,13 @@ ib_uverbs_get_async_event(struct uverbs_attr_bundle *attrs, void copy_port_attr_to_resp(struct ib_port_attr *attr, struct ib_uverbs_query_port_resp *resp, struct ib_device *ib_dev, u8 port_num); + +static inline void ib_uverbs_dmabuf_done(struct kref *kref) +{ + struct ib_uverbs_dmabuf_file *priv = + container_of(kref, struct ib_uverbs_dmabuf_file, kref); + + complete(&priv->comp); +} + #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index ce16404cdfb8..f4616deeca54 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2049,7 +2049,10 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) if (ret) return ret; - user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL); + if (cmd.wqe_size < sizeof(struct ib_uverbs_send_wr)) + return -EINVAL; + + user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL | __GFP_NOWARN); if (!user_wr) return -ENOMEM; @@ -2239,7 +2242,7 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, if (ret) return ERR_PTR(ret); - user_wr = kmalloc(wqe_size, GFP_KERNEL); + user_wr = kmalloc(wqe_size, GFP_KERNEL | __GFP_NOWARN); if (!user_wr) return ERR_PTR(-ENOMEM); diff --git 
a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index c0fd283d9d6c..a28f9f21bed8 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -209,6 +209,39 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)( &resp, sizeof(resp)); } +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT_SPEED)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + u32 port_num; + u64 speed; + int ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!ib_dev->ops.query_port_speed) + return -EOPNOTSUPP; + + ret = uverbs_get_const(&port_num, attrs, + UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM); + if (ret) + return ret; + + if (!rdma_is_port_valid(ib_dev, port_num)) + return -EINVAL; + + ret = ib_dev->ops.query_port_speed(ib_dev, port_num, &speed); + if (ret) + return ret; + + return uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_PORT_SPEED_RESP, + &speed, sizeof(speed)); +} + static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)( struct uverbs_attr_bundle *attrs) { @@ -469,6 +502,14 @@ DECLARE_UVERBS_NAMED_METHOD( active_speed_ex), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_PORT_SPEED, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM, u32, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_PORT_SPEED_RESP, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_QUERY_GID_TABLE, UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, u64, @@ -498,6 +539,7 @@ DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE, &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE), &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES), &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT_SPEED), &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT), &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_TABLE), &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_ENTRY)); diff --git a/drivers/infiniband/core/uverbs_std_types_dmabuf.c b/drivers/infiniband/core/uverbs_std_types_dmabuf.c new file mode 100644 index 000000000000..dfdfcd1d1a44 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_dmabuf.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include +#include +#include +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_dmabuf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + if (!attachment->peer2peer) + return -EOPNOTSUPP; + + return 0; +} + +static struct sg_table * +uverbs_dmabuf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct ib_uverbs_dmabuf_file *priv = attachment->dmabuf->priv; + struct sg_table *ret; + + dma_resv_assert_held(priv->dmabuf->resv); + + if (priv->revoked) + return ERR_PTR(-ENODEV); + + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider, + &priv->phys_vec, 1, priv->phys_vec.len, + dir); + if (IS_ERR(ret)) + return ret; + + kref_get(&priv->kref); + return ret; +} + +static void uverbs_dmabuf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct ib_uverbs_dmabuf_file *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(priv->dmabuf->resv); + dma_buf_free_sgt(attachment, sgt, dir); + kref_put(&priv->kref, ib_uverbs_dmabuf_done); +} + +static int uverbs_dmabuf_pin(struct dma_buf_attachment *attach) +{ + return -EOPNOTSUPP; +} + +static void uverbs_dmabuf_unpin(struct dma_buf_attachment *attach) +{ +} + +static void uverbs_dmabuf_release(struct dma_buf *dmabuf) +{ + struct ib_uverbs_dmabuf_file *priv = dmabuf->priv; + + /* + * This can only happen if the fput came from alloc_abort_fd_uobject() + */ + if (!priv->uobj.context) + return; + + uverbs_uobject_release(&priv->uobj); +} + +static const struct dma_buf_ops uverbs_dmabuf_ops = { + .attach = uverbs_dmabuf_attach, + .map_dma_buf = uverbs_dmabuf_map, + .unmap_dma_buf = uverbs_dmabuf_unmap, + .pin = uverbs_dmabuf_pin, + .unpin = uverbs_dmabuf_unpin, + .release = uverbs_dmabuf_release, +}; + +static int UVERBS_HANDLER(UVERBS_METHOD_DMABUF_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE) + ->obj_attr.uobject; + struct ib_uverbs_dmabuf_file *uverbs_dmabuf = + container_of(uobj, struct ib_uverbs_dmabuf_file, uobj); + struct ib_device *ib_dev = attrs->context->device; + struct rdma_user_mmap_entry *mmap_entry; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + off_t pg_off; + int ret; + + ret = uverbs_get_const(&pg_off, attrs, UVERBS_ATTR_ALLOC_DMABUF_PGOFF); + if (ret) + return ret; + + mmap_entry = ib_dev->ops.pgoff_to_mmap_entry(attrs->context, pg_off); + if (!mmap_entry) + return -EINVAL; + + ret = ib_dev->ops.mmap_get_pfns(mmap_entry, &uverbs_dmabuf->phys_vec, + &uverbs_dmabuf->provider); + if (ret) + goto err; + + exp_info.ops = &uverbs_dmabuf_ops; + exp_info.size = uverbs_dmabuf->phys_vec.len; + exp_info.flags = O_CLOEXEC; + exp_info.priv = uverbs_dmabuf; + + uverbs_dmabuf->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(uverbs_dmabuf->dmabuf)) { + ret = PTR_ERR(uverbs_dmabuf->dmabuf); + goto err; + } + + kref_init(&uverbs_dmabuf->kref); + init_completion(&uverbs_dmabuf->comp); + INIT_LIST_HEAD(&uverbs_dmabuf->dmabufs_elm); + mutex_lock(&mmap_entry->dmabufs_lock); + if (mmap_entry->driver_removed) + ret = -EIO; + else + list_add_tail(&uverbs_dmabuf->dmabufs_elm, &mmap_entry->dmabufs); + mutex_unlock(&mmap_entry->dmabufs_lock); + if (ret) + goto err_revoked; + + uobj->object = uverbs_dmabuf->dmabuf->file; + uverbs_dmabuf->mmap_entry = mmap_entry; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE); + return 0; + +err_revoked: + dma_buf_put(uverbs_dmabuf->dmabuf); +err: + 
rdma_user_mmap_entry_put(mmap_entry); + return ret; +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_DMABUF_ALLOC, + UVERBS_ATTR_FD(UVERBS_ATTR_ALLOC_DMABUF_HANDLE, + UVERBS_OBJECT_DMABUF, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMABUF_PGOFF, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + +static void uverbs_dmabuf_fd_destroy_uobj(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + struct ib_uverbs_dmabuf_file *uverbs_dmabuf = + container_of(uobj, struct ib_uverbs_dmabuf_file, uobj); + bool wait_for_comp = false; + + mutex_lock(&uverbs_dmabuf->mmap_entry->dmabufs_lock); + dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL); + if (!uverbs_dmabuf->revoked) { + uverbs_dmabuf->revoked = true; + list_del(&uverbs_dmabuf->dmabufs_elm); + dma_buf_move_notify(uverbs_dmabuf->dmabuf); + dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv, + DMA_RESV_USAGE_BOOKKEEP, false, + MAX_SCHEDULE_TIMEOUT); + wait_for_comp = true; + } + dma_resv_unlock(uverbs_dmabuf->dmabuf->resv); + if (wait_for_comp) { + kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done); + /* Let's wait till all DMA unmap are completed. */ + wait_for_completion(&uverbs_dmabuf->comp); + } + mutex_unlock(&uverbs_dmabuf->mmap_entry->dmabufs_lock); + + /* Matches the get done as part of pgoff_to_mmap_entry() */ + rdma_user_mmap_entry_put(uverbs_dmabuf->mmap_entry); +} + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_DMABUF, + UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_dmabuf_file), + uverbs_dmabuf_fd_destroy_uobj, + NULL, NULL, O_RDONLY), + &UVERBS_METHOD(UVERBS_METHOD_DMABUF_ALLOC)); + +const struct uapi_definition uverbs_def_obj_dmabuf[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMABUF), + UAPI_DEF_OBJ_NEEDS_FN(mmap_get_pfns), + UAPI_DEF_OBJ_NEEDS_FN(pgoff_to_mmap_entry), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index e00ea63175bd..38d0bbbee796 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -631,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_cq), UAPI_DEF_CHAIN(uverbs_def_obj_device), UAPI_DEF_CHAIN(uverbs_def_obj_dm), + UAPI_DEF_CHAIN(uverbs_def_obj_dmabuf), UAPI_DEF_CHAIN(uverbs_def_obj_dmah), UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index ee390928511a..a2670a031faf 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -78,6 +78,7 @@ static const char * const ib_events[] = { [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", [IB_EVENT_CLIENT_REREGISTER] = "client reregister", [IB_EVENT_GID_CHANGE] = "GID changed", + [IB_EVENT_DEVICE_SPEED_CHANGE] = "device speed change" }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event) @@ -216,6 +217,57 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) } EXPORT_SYMBOL(ib_rate_to_mbps); +struct ib_speed_attr { + const char *str; + int speed; +}; + +#define IB_SPEED_ATTR(speed_type, _str, _speed) \ + [speed_type] = {.str = _str, .speed = _speed} + +static const struct ib_speed_attr ib_speed_attrs[] = { + IB_SPEED_ATTR(IB_SPEED_SDR, " SDR", 25), + IB_SPEED_ATTR(IB_SPEED_DDR, " DDR", 50), + IB_SPEED_ATTR(IB_SPEED_QDR, " QDR", 100), + IB_SPEED_ATTR(IB_SPEED_FDR10, " FDR10", 100), + IB_SPEED_ATTR(IB_SPEED_FDR, " FDR", 140), + IB_SPEED_ATTR(IB_SPEED_EDR, " EDR", 250), + IB_SPEED_ATTR(IB_SPEED_HDR, " HDR", 500), + 
IB_SPEED_ATTR(IB_SPEED_NDR, " NDR", 1000), + IB_SPEED_ATTR(IB_SPEED_XDR, " XDR", 2000), +}; + +int ib_port_attr_to_speed_info(struct ib_port_attr *attr, + struct ib_port_speed_info *speed_info) +{ + int speed_idx = attr->active_speed; + + switch (attr->active_speed) { + case IB_SPEED_DDR: + case IB_SPEED_QDR: + case IB_SPEED_FDR10: + case IB_SPEED_FDR: + case IB_SPEED_EDR: + case IB_SPEED_HDR: + case IB_SPEED_NDR: + case IB_SPEED_XDR: + case IB_SPEED_SDR: + break; + default: + speed_idx = IB_SPEED_SDR; /* Default to SDR for invalid rates */ + break; + } + + speed_info->str = ib_speed_attrs[speed_idx].str; + speed_info->rate = ib_speed_attrs[speed_idx].speed; + speed_info->rate *= ib_width_enum_to_int(attr->active_width); + if (speed_info->rate < 0) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(ib_port_attr_to_speed_info); + __attribute_const__ enum rdma_transport_type rdma_node_get_transport(unsigned int node_type) { @@ -1485,7 +1537,8 @@ static const struct { IB_QP_PKEY_INDEX), [IB_QPT_RC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), + IB_QP_PKEY_INDEX | + IB_QP_RATE_LIMIT), [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), @@ -1533,7 +1586,8 @@ static const struct { IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), + IB_QP_PATH_MIG_STATE | + IB_QP_RATE_LIMIT), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | @@ -1567,7 +1621,8 @@ static const struct { IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | - IB_QP_MIN_RNR_TIMER), + IB_QP_MIN_RNR_TIMER | + IB_QP_RATE_LIMIT), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index 88817c86ae24..e025217861c2 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -87,25 +87,35 @@ static ssize_t qp_info_read(struct file *filep, size_t count, loff_t *ppos) { struct bnxt_re_qp *qp = filep->private_data; + struct bnxt_qplib_qp *qplib_qp; + u32 rate_limit = 0; char *buf; int len; if (*ppos) return 0; + qplib_qp = &qp->qplib_qp; + if (qplib_qp->shaper_allocation_status) + rate_limit = qplib_qp->rate_limit; + buf = kasprintf(GFP_KERNEL, "QPN\t\t: %d\n" "transport\t: %s\n" "state\t\t: %s\n" "mtu\t\t: %d\n" "timeout\t\t: %d\n" - "remote QPN\t: %d\n", + "remote QPN\t: %d\n" + "shaper allocated : %d\n" + "rate limit\t: %d kbps\n", qp->qplib_qp.id, bnxt_re_qp_type_str(qp->qplib_qp.type), bnxt_re_qp_state_str(qp->qplib_qp.state), qp->qplib_qp.mtu, qp->qplib_qp.timeout, - qp->qplib_qp.dest_qpn); + qp->qplib_qp.dest_qpn, + qplib_qp->shaper_allocation_status, + rate_limit); if (!buf) return -ENOMEM; if (count < strlen(buf)) { diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index ff91511bd338..be3c3f1f87f7 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -186,6 +186,9 @@ int bnxt_re_query_device(struct ib_device *ibdev, { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; + struct bnxt_re_query_device_ex_resp resp = {}; + size_t outlen = (udata) ? 
udata->outlen : 0; + int rc = 0; memset(ib_attr, 0, sizeof(*ib_attr)); memcpy(&ib_attr->fw_ver, dev_attr->fw_ver, @@ -250,7 +253,21 @@ int bnxt_re_query_device(struct ib_device *ibdev, ib_attr->max_pkeys = 1; ib_attr->local_ca_ack_delay = BNXT_RE_DEFAULT_ACK_DELAY; - return 0; + + if ((offsetofend(typeof(resp), packet_pacing_caps) <= outlen) && + _is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2)) { + resp.packet_pacing_caps.qp_rate_limit_min = + dev_attr->rate_limit_min; + resp.packet_pacing_caps.qp_rate_limit_max = + dev_attr->rate_limit_max; + resp.packet_pacing_caps.supported_qpts = + 1 << IB_QPT_RC; + } + if (outlen) + rc = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), outlen)); + + return rc; } int bnxt_re_modify_device(struct ib_device *ibdev, @@ -2089,10 +2106,11 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, unsigned int flags; u8 nw_type; - if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS) + if (qp_attr_mask & ~(IB_QP_ATTR_STANDARD_BITS | IB_QP_RATE_LIMIT)) return -EOPNOTSUPP; qp->qplib_qp.modify_flags = 0; + qp->qplib_qp.ext_modify_flags = 0; if (qp_attr_mask & IB_QP_STATE) { curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state); new_qp_state = qp_attr->qp_state; @@ -2129,6 +2147,15 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, bnxt_re_unlock_cqs(qp, flags); } } + + if (qp_attr_mask & IB_QP_RATE_LIMIT) { + if (qp->qplib_qp.type != IB_QPT_RC || + !_is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2)) + return -EOPNOTSUPP; + qp->qplib_qp.ext_modify_flags |= + CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID; + qp->qplib_qp.rate_limit = qp_attr->rate_limit; + } if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_EN_SQD_ASYNC_NOTIFY; @@ -4386,6 +4413,9 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED; + if (_is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2)) + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_QP_RATE_LIMIT_ENABLED; + if (udata->inlen >= sizeof(ureq)) { rc = ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq))); if (rc) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index c88f049136fc..3e44311bf939 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1313,8 +1313,8 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_modify_qp req = {}; u16 vlan_pcp_vlan_dei_vlan_id; + u32 bmask, bmask_ext; u32 temp32[4]; - u32 bmask; int rc; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, @@ -1329,9 +1329,16 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) is_optimized_state_transition(qp)) bnxt_set_mandatory_attributes(res, qp, &req); } + bmask = qp->modify_flags; req.modify_mask = cpu_to_le32(qp->modify_flags); + bmask_ext = qp->ext_modify_flags; + req.ext_modify_mask = cpu_to_le32(qp->ext_modify_flags); req.qp_cid = cpu_to_le32(qp->id); + + if (bmask_ext & CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID) + req.rate_limit = cpu_to_le32(qp->rate_limit); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_STATE) { req.network_type_en_sqd_async_notify_new_state = (qp->state & CMDQ_MODIFY_QP_NEW_STATE_MASK) | @@ -1429,6 +1436,9 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct 
bnxt_qplib_qp *qp) rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); if (rc) return rc; + + if (bmask_ext & CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID) + qp->shaper_allocation_status = resp.shaper_allocation_status; qp->cur_qp_state = qp->state; return 0; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 1b414a73b46d..30c3f99be07b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -280,6 +280,7 @@ struct bnxt_qplib_qp { u8 state; u8 cur_qp_state; u64 modify_flags; + u32 ext_modify_flags; u32 max_inline_data; u32 mtu; u8 path_mtu; @@ -346,6 +347,8 @@ struct bnxt_qplib_qp { bool is_host_msn_tbl; u8 tos_dscp; u32 ugid_index; + u32 rate_limit; + u8 shaper_allocation_status; }; #define BNXT_RE_MAX_MSG_SIZE 0x80000000 diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 2ea3b7f232a3..9a5dcf97b6f4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -623,4 +623,10 @@ static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2) return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED); } +static inline bool _is_modify_qp_rate_limit_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & + CREQ_QUERY_FUNC_RESP_SB_MODIFY_QP_RATE_LIMIT_SUPPORTED; +} + #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 408a34df2667..ec9eb52a8ebf 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -193,6 +193,11 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) attr->max_dpi = le32_to_cpu(sb->max_dpi); attr->is_atomic = bnxt_qplib_is_atomic_cap(rcfw); + + if (_is_modify_qp_rate_limit_supported(attr->dev_cap_flags2)) { + attr->rate_limit_min = le16_to_cpu(sb->rate_limit_min); + attr->rate_limit_max = le32_to_cpu(sb->rate_limit_max); + } bail: dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 5a45c55c6464..9fadd637cb5b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -76,6 +76,8 @@ struct bnxt_qplib_dev_attr { u16 dev_cap_flags; u16 dev_cap_flags2; u32 max_dpi; + u16 rate_limit_min; + u32 rate_limit_max; }; struct bnxt_qplib_pd { diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 99ecd72e72e2..aac338f2afd8 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -690,10 +690,11 @@ struct cmdq_modify_qp { __le32 ext_modify_mask; #define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_EXT_STATS_CTX 0x1UL #define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_SCHQ_ID_VALID 0x2UL + #define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID 0x8UL __le32 ext_stats_ctx_id; __le16 schq_id; __le16 unused_0; - __le32 reserved32; + __le32 rate_limit; }; /* creq_modify_qp_resp (size:128b/16B) */ @@ -716,7 +717,8 @@ struct creq_modify_qp_resp { #define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_INDEX_MASK 0xeUL #define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_INDEX_SFT 1 #define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_STATE 0x10UL - u8 reserved8; + u8 shaper_allocation_status; + #define CREQ_MODIFY_QP_RESP_SHAPER_ALLOCATED 0x1UL __le32 lag_src_mac; }; @@ -2179,7 +2181,7 @@ struct creq_query_func_resp { u8 reserved48[6]; }; 
-/* creq_query_func_resp_sb (size:1088b/136B) */ +/* creq_query_func_resp_sb (size:1280b/160B) */ struct creq_query_func_resp_sb { u8 opcode; #define CREQ_QUERY_FUNC_RESP_SB_OPCODE_QUERY_FUNC 0x83UL @@ -2256,12 +2258,15 @@ struct creq_query_func_resp_sb { #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE #define CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED 0x40UL + #define CREQ_QUERY_FUNC_RESP_SB_MODIFY_QP_RATE_LIMIT_SUPPORTED 0x400UL #define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL __le16 max_xp_qp_size; __le16 create_qp_batch_size; __le16 destroy_qp_batch_size; __le16 max_srq_ext; - __le64 reserved64; + __le16 reserved16; + __le16 rate_limit_min; + __le32 rate_limit_max; }; /* cmdq_set_func_resources (size:448b/56B) */ diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index 0e979ca10d24..229b0ad3b0cb 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -3,6 +3,8 @@ * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. */ +#include + #include "efa_com.h" #include "efa_regs_defs.h" @@ -21,6 +23,8 @@ #define EFA_CTRL_SUB_MINOR 1 enum efa_cmd_status { + EFA_CMD_UNUSED, + EFA_CMD_ALLOCATED, EFA_CMD_SUBMITTED, EFA_CMD_COMPLETED, }; @@ -32,7 +36,6 @@ struct efa_comp_ctx { enum efa_cmd_status status; u16 cmd_id; u8 cmd_opcode; - u8 occupied; }; static const char *efa_com_cmd_str(u8 cmd) @@ -241,7 +244,6 @@ static int efa_com_admin_init_aenq(struct efa_com_dev *edev, return 0; } -/* ID to be used with efa_com_get_comp_ctx */ static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq) { u16 ctx_id; @@ -263,36 +265,47 @@ static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq, spin_unlock(&aq->comp_ctx_lock); } -static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq, - struct efa_comp_ctx *comp_ctx) +static struct efa_comp_ctx *efa_com_alloc_comp_ctx(struct efa_com_admin_queue *aq) { - u16 cmd_id = EFA_GET(&comp_ctx->user_cqe->acq_common_descriptor.command, - EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); - u16 ctx_id = cmd_id & (aq->depth - 1); + struct efa_comp_ctx *comp_ctx; + u16 ctx_id; - ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id); - comp_ctx->occupied = 0; - efa_com_dealloc_ctx_id(aq, ctx_id); -} + ctx_id = efa_com_alloc_ctx_id(aq); -static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq, - u16 cmd_id, bool capture) -{ - u16 ctx_id = cmd_id & (aq->depth - 1); - - if (aq->comp_ctx[ctx_id].occupied && capture) { - ibdev_err_ratelimited( - aq->efa_dev, - "Completion context for command_id %#x is occupied\n", - cmd_id); + comp_ctx = &aq->comp_ctx[ctx_id]; + if (comp_ctx->status != EFA_CMD_UNUSED) { + efa_com_dealloc_ctx_id(aq, ctx_id); + ibdev_err_ratelimited(aq->efa_dev, + "Completion context[%u] is used[%u]\n", + ctx_id, comp_ctx->status); return NULL; } - if (capture) { - aq->comp_ctx[ctx_id].occupied = 1; - ibdev_dbg(aq->efa_dev, - "Take completion ctxt for command_id %#x\n", cmd_id); - } + comp_ctx->status = EFA_CMD_ALLOCATED; + ibdev_dbg(aq->efa_dev, "Take completion context[%u]\n", ctx_id); + return comp_ctx; +} + +static inline u16 efa_com_get_comp_ctx_id(struct efa_com_admin_queue *aq, + struct efa_comp_ctx *comp_ctx) +{ + return comp_ctx - aq->comp_ctx; +} + +static inline void efa_com_dealloc_comp_ctx(struct efa_com_admin_queue *aq, + struct efa_comp_ctx *comp_ctx) +{ + u16 ctx_id = efa_com_get_comp_ctx_id(aq, 
comp_ctx); + + ibdev_dbg(aq->efa_dev, "Put completion context[%u]\n", ctx_id); + comp_ctx->status = EFA_CMD_UNUSED; + efa_com_dealloc_ctx_id(aq, ctx_id); +} + +static inline struct efa_comp_ctx *efa_com_get_comp_ctx_by_cmd_id(struct efa_com_admin_queue *aq, + u16 cmd_id) +{ + u16 ctx_id = cmd_id & (aq->depth - 1); return &aq->comp_ctx[ctx_id]; } @@ -310,26 +323,23 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu u16 ctx_id; u16 pi; + comp_ctx = efa_com_alloc_comp_ctx(aq); + if (!comp_ctx) + return ERR_PTR(-EINVAL); + queue_size_mask = aq->depth - 1; pi = aq->sq.pc & queue_size_mask; - - ctx_id = efa_com_alloc_ctx_id(aq); + ctx_id = efa_com_get_comp_ctx_id(aq, comp_ctx); /* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */ cmd_id = ctx_id & queue_size_mask; - cmd_id |= aq->sq.pc & ~queue_size_mask; + cmd_id |= aq->sq.pc << ilog2(aq->depth); cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; cmd->aq_common_descriptor.command_id = cmd_id; EFA_SET(&cmd->aq_common_descriptor.flags, EFA_ADMIN_AQ_COMMON_DESC_PHASE, aq->sq.phase); - comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true); - if (!comp_ctx) { - efa_com_dealloc_ctx_id(aq, ctx_id); - return ERR_PTR(-EINVAL); - } - comp_ctx->status = EFA_CMD_SUBMITTED; comp_ctx->comp_size = comp_size_in_bytes; comp_ctx->user_cqe = comp; @@ -370,9 +380,9 @@ static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq) } for (i = 0; i < aq->depth; i++) { - comp_ctx = efa_com_get_comp_ctx(aq, i, false); - if (comp_ctx) - init_completion(&comp_ctx->wait_event); + comp_ctx = &aq->comp_ctx[i]; + comp_ctx->status = EFA_CMD_UNUSED; + init_completion(&comp_ctx->wait_event); aq->comp_ctx_pool[i] = i; } @@ -417,11 +427,12 @@ static int efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq cmd_id = EFA_GET(&cqe->acq_common_descriptor.command, EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); - comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); - if (comp_ctx->status != EFA_CMD_SUBMITTED) { + comp_ctx = efa_com_get_comp_ctx_by_cmd_id(aq, cmd_id); + if (comp_ctx->status != EFA_CMD_SUBMITTED || comp_ctx->cmd_id != cmd_id) { ibdev_err(aq->efa_dev, - "Received completion with unexpected command id[%d], sq producer: %d, sq consumer: %d, cq consumer: %d\n", - cmd_id, aq->sq.pc, aq->sq.cc, aq->cq.cc); + "Received completion with unexpected command id[%x], status[%d] sq producer[%d], sq consumer[%d], cq consumer[%d]\n", + cmd_id, comp_ctx->status, aq->sq.pc, aq->sq.cc, + aq->cq.cc); return -EINVAL; } @@ -530,7 +541,7 @@ static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_c err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); out: - efa_com_put_comp_ctx(aq, comp_ctx); + efa_com_dealloc_comp_ctx(aq, comp_ctx); return err; } @@ -580,7 +591,7 @@ static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *com err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); out: - efa_com_put_comp_ctx(aq, comp_ctx); + efa_com_dealloc_comp_ctx(aq, comp_ctx); return err; } diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 0c1c32d23c88..8a605da8a93c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -60,7 +60,7 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, u8 tclass = get_tclass(grh); u8 priority = 0; u8 tc_mode = 0; - int ret; + int ret = 0; if (hr_dev->pci_dev->revision == 
PCI_REVISION_ID_HIP08 && udata) { ret = -EOPNOTSUPP; @@ -77,19 +77,18 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, ah->av.flowlabel = grh->flow_label; ah->av.udp_sport = get_ah_udp_sport(ah_attr); ah->av.tclass = tclass; + ah->av.sl = rdma_ah_get_sl(ah_attr); - ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority); - if (ret == -EOPNOTSUPP) - ret = 0; + if (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority); + if (ret == -EOPNOTSUPP) + ret = 0; + else if (ret) + goto err_out; - if (ret && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - goto err_out; - - if (tc_mode == HNAE3_TC_MAP_MODE_DSCP && - grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - ah->av.sl = priority; - else - ah->av.sl = rdma_ah_get_sl(ah_attr); + if (tc_mode == HNAE3_TC_MAP_MODE_DSCP) + ah->av.sl = priority; + } if (!check_sl_valid(hr_dev, ah->av.sl)) { ret = -EINVAL; diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 6aa82fe9dd3d..857a913326cd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -55,7 +55,7 @@ void hns_roce_get_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx) { struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device); struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; - u32 least_load = cq_table->ctx_num[0]; + u32 least_load = U32_MAX; u8 bankid = 0; u8 i; @@ -63,7 +63,10 @@ void hns_roce_get_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx) return; mutex_lock(&cq_table->bank_mutex); - for (i = 1; i < HNS_ROCE_CQ_BANK_NUM; i++) { + for (i = 0; i < HNS_ROCE_CQ_BANK_NUM; i++) { + if (!(cq_table->valid_cq_bank_mask & BIT(i))) + continue; + if (cq_table->ctx_num[i] < least_load) { least_load = cq_table->ctx_num[i]; bankid = i; @@ -581,6 +584,11 @@ void hns_roce_init_cq_table(struct hns_roce_dev *hr_dev) cq_table->bank[i].max = hr_dev->caps.num_cqs / HNS_ROCE_CQ_BANK_NUM - 1; } + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) + cq_table->valid_cq_bank_mask = VALID_CQ_BANK_MASK_LIMIT; + else + cq_table->valid_cq_bank_mask = VALID_CQ_BANK_MASK_DEFAULT; } void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 318f18cf37aa..3f032b8038af 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -103,6 +103,10 @@ #define CQ_BANKID_SHIFT 2 #define CQ_BANKID_MASK GENMASK(1, 0) +#define VALID_CQ_BANK_MASK_DEFAULT 0xF +#define VALID_CQ_BANK_MASK_LIMIT 0x9 + +#define VALID_EXT_SGE_QP_BANK_MASK_LIMIT 0x42 #define HNS_ROCE_MAX_CQ_COUNT 0xFFFF #define HNS_ROCE_MAX_CQ_PERIOD 0xFFFF @@ -156,6 +160,7 @@ enum { HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19), HNS_ROCE_CAP_FLAG_BOND = BIT(21), HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22), + HNS_ROCE_CAP_FLAG_LIMIT_BANK = BIT(23), }; #define HNS_ROCE_DB_TYPE_COUNT 2 @@ -500,6 +505,7 @@ struct hns_roce_cq_table { struct hns_roce_bank bank[HNS_ROCE_CQ_BANK_NUM]; struct mutex bank_mutex; u32 ctx_num[HNS_ROCE_CQ_BANK_NUM]; + u8 valid_cq_bank_mask; }; struct hns_roce_srq_table { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2d6ae89e525b..5d0a8662249d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -876,6 +876,170 @@ out: return ret; } +static int 
hns_roce_push_drain_wr(struct hns_roce_wq *wq, struct ib_cq *cq, + u64 wr_id) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&wq->lock, flags); + if (hns_roce_wq_overflow(wq, 1, cq)) { + ret = -ENOMEM; + goto out; + } + + wq->wrid[wq->head & (wq->wqe_cnt - 1)] = wr_id; + wq->head++; + +out: + spin_unlock_irqrestore(&wq->lock, flags); + return ret; +} + +struct hns_roce_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void hns_roce_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct hns_roce_drain_cqe *cqe = container_of(wc->wr_cqe, + struct hns_roce_drain_cqe, + cqe); + complete(&cqe->done); +} + +static void handle_drain_completion(struct ib_cq *ibcq, + struct hns_roce_drain_cqe *drain, + struct hns_roce_dev *hr_dev) +{ +#define TIMEOUT (HZ / 10) + struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); + unsigned long flags; + bool triggered; + + if (ibcq->poll_ctx == IB_POLL_DIRECT) { + while (wait_for_completion_timeout(&drain->done, TIMEOUT) <= 0) + ib_process_cq_direct(ibcq, -1); + return; + } + + if (hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) + goto waiting_done; + + spin_lock_irqsave(&hr_cq->lock, flags); + triggered = hr_cq->is_armed; + hr_cq->is_armed = 1; + spin_unlock_irqrestore(&hr_cq->lock, flags); + + /* Triggered means this cq is processing or has been processed + * by hns_roce_handle_device_err() or this function. We need to + * cancel the already invoked comp_handler() to avoid concurrency. + * If it has not been triggered, we can directly invoke + * comp_handler(). + */ + if (triggered) { + switch (ibcq->poll_ctx) { + case IB_POLL_SOFTIRQ: + irq_poll_disable(&ibcq->iop); + irq_poll_enable(&ibcq->iop); + break; + case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: + cancel_work_sync(&ibcq->work); + break; + default: + WARN_ON_ONCE(1); + } + } + + if (ibcq->comp_handler) + ibcq->comp_handler(ibcq, ibcq->cq_context); + +waiting_done: + if (ibcq->comp_handler) + wait_for_completion(&drain->done); +} + +static void hns_roce_v2_drain_rq(struct ib_qp *ibqp) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_drain_cqe rdrain = {}; + const struct ib_recv_wr *bad_rwr; + struct ib_cq *cq = ibqp->recv_cq; + struct ib_recv_wr rwr = {}; + int ret; + + ret = ib_modify_qp(ibqp, &attr, IB_QP_STATE); + if (ret && hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to modify qp during drain rq, ret = %d.\n", + ret); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = hns_roce_drain_qp_done; + init_completion(&rdrain.done); + + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) + ret = hns_roce_push_drain_wr(&hr_qp->rq, cq, rwr.wr_id); + else + ret = hns_roce_v2_post_recv(ibqp, &rwr, &bad_rwr); + if (ret) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to post recv for drain rq, ret = %d.\n", + ret); + return; + } + + handle_drain_completion(cq, &rdrain, hr_dev); +} + +static void hns_roce_v2_drain_sq(struct ib_qp *ibqp) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_drain_cqe sdrain = {}; + const struct ib_send_wr *bad_swr; + struct ib_cq *cq = ibqp->send_cq; + struct ib_rdma_wr swr = { + .wr = { + .next = NULL, + { .wr_cqe = &sdrain.cqe, }, + .opcode = IB_WR_RDMA_WRITE, + }, + }; + int ret; + + ret = ib_modify_qp(ibqp, 
&attr, IB_QP_STATE); + if (ret && hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to modify qp during drain sq, ret = %d.\n", + ret); + return; + } + + sdrain.cqe.done = hns_roce_drain_qp_done; + init_completion(&sdrain.done); + + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) + ret = hns_roce_push_drain_wr(&hr_qp->sq, cq, swr.wr.wr_id); + else + ret = hns_roce_v2_post_send(ibqp, &swr.wr, &bad_swr); + if (ret) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to post send for drain sq, ret = %d.\n", + ret); + return; + } + + handle_drain_completion(cq, &sdrain, hr_dev); +} + static void *get_srq_wqe_buf(struct hns_roce_srq *srq, u32 n) { return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift); @@ -3739,6 +3903,23 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, HNS_ROCE_V2_CQ_DEFAULT_INTERVAL); } +static bool left_sw_wc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) +{ + struct hns_roce_qp *hr_qp; + + list_for_each_entry(hr_qp, &hr_cq->sq_list, sq_node) { + if (hr_qp->sq.head != hr_qp->sq.tail) + return true; + } + + list_for_each_entry(hr_qp, &hr_cq->rq_list, rq_node) { + if (hr_qp->rq.head != hr_qp->rq.tail) + return true; + } + + return false; +} + static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { @@ -3747,6 +3928,12 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, struct hns_roce_v2_db cq_db = {}; u32 notify_flag; + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) { + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && + left_sw_wc(hr_dev, hr_cq)) + return 1; + return 0; + } /* * flags = 0, then notify_flag : next * flags = 1, then notify flag : solocited @@ -5053,20 +5240,22 @@ static int hns_roce_set_sl(struct ib_qp *ibqp, struct ib_device *ibdev = &hr_dev->ib_dev; int ret; - ret = hns_roce_hw_v2_get_dscp(hr_dev, get_tclass(&attr->ah_attr.grh), - &hr_qp->tc_mode, &hr_qp->priority); - if (ret && ret != -EOPNOTSUPP && - grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { - ibdev_err_ratelimited(ibdev, - "failed to get dscp, ret = %d.\n", ret); - return ret; - } + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); - if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP && - grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - hr_qp->sl = hr_qp->priority; - else - hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + if (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + ret = hns_roce_hw_v2_get_dscp(hr_dev, + get_tclass(&attr->ah_attr.grh), + &hr_qp->tc_mode, &hr_qp->priority); + if (ret && ret != -EOPNOTSUPP) { + ibdev_err_ratelimited(ibdev, + "failed to get dscp, ret = %d.\n", + ret); + return ret; + } + + if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP) + hr_qp->sl = hr_qp->priority; + } if (!check_sl_valid(hr_dev, hr_qp->sl)) return -EINVAL; @@ -6956,7 +7145,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work); - hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0); + hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", + WQ_MEM_RECLAIM); if (!hr_dev->irq_workq) { dev_err(dev, "failed to create irq workqueue.\n"); ret = -ENOMEM; @@ -7014,6 +7204,8 @@ static const struct ib_device_ops hns_roce_v2_dev_ops = { .post_send = hns_roce_v2_post_send, .query_qp = hns_roce_v2_query_qp, .req_notify_cq = hns_roce_v2_req_notify_cq, + .drain_rq = hns_roce_v2_drain_rq, + .drain_sq = hns_roce_v2_drain_sq, }; static const struct ib_device_ops hns_roce_v2_dev_srq_ops = { 
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 2f4864ab7d4e..a3490bab297a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -259,6 +259,11 @@ static int hns_roce_query_device(struct ib_device *ib_dev, props->max_srq_sge = hr_dev->caps.max_srq_sges; } + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) { + props->max_cq >>= 1; + props->max_qp >>= 1; + } + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR && hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index d1640c5fbaab..5f7ea6c16644 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -197,22 +197,16 @@ static u8 get_affinity_cq_bank(u8 qp_bank) return (qp_bank >> 1) & CQ_BANKID_MASK; } -static u8 get_least_load_bankid_for_qp(struct ib_qp_init_attr *init_attr, - struct hns_roce_bank *bank) +static u8 get_least_load_bankid_for_qp(struct hns_roce_bank *bank, u8 valid_qp_bank_mask) { #define INVALID_LOAD_QPNUM 0xFFFFFFFF - struct ib_cq *scq = init_attr->send_cq; u32 least_load = INVALID_LOAD_QPNUM; - unsigned long cqn = 0; u8 bankid = 0; u32 bankcnt; u8 i; - if (scq) - cqn = to_hr_cq(scq)->cqn; - for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) { - if (scq && (get_affinity_cq_bank(i) != (cqn & CQ_BANKID_MASK))) + if (!(valid_qp_bank_mask & BIT(i))) continue; bankcnt = bank[i].inuse; @@ -246,6 +240,42 @@ static int alloc_qpn_with_bankid(struct hns_roce_bank *bank, u8 bankid, return 0; } + +static bool use_ext_sge(struct ib_qp_init_attr *init_attr) +{ + return init_attr->cap.max_send_sge > HNS_ROCE_SGE_IN_WQE || + init_attr->qp_type == IB_QPT_UD || + init_attr->qp_type == IB_QPT_GSI; +} + +static u8 select_qp_bankid(struct hns_roce_dev *hr_dev, + struct ib_qp_init_attr *init_attr) +{ + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; + struct hns_roce_bank *bank = qp_table->bank; + struct ib_cq *scq = init_attr->send_cq; + u8 valid_qp_bank_mask = 0; + unsigned long cqn = 0; + u8 i; + + if (scq) + cqn = to_hr_cq(scq)->cqn; + + for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) { + if (scq && (get_affinity_cq_bank(i) != (cqn & CQ_BANKID_MASK))) + continue; + + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) && + use_ext_sge(init_attr) && + !(VALID_EXT_SGE_QP_BANK_MASK_LIMIT & BIT(i))) + continue; + + valid_qp_bank_mask |= BIT(i); + } + + return get_least_load_bankid_for_qp(bank, valid_qp_bank_mask); +} + static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_qp_init_attr *init_attr) { @@ -258,8 +288,7 @@ static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, num = 1; } else { mutex_lock(&qp_table->bank_mutex); - bankid = get_least_load_bankid_for_qp(init_attr, qp_table->bank); - + bankid = select_qp_bankid(hr_dev, init_attr); ret = alloc_qpn_with_bankid(&qp_table->bank[bankid], bankid, &num); if (ret) { diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c index 230187dda6a0..085791cc617c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_restrack.c +++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c @@ -51,7 +51,7 @@ int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq) ret = hr_dev->hw->query_cqc(hr_dev, hr_cq->cqn, &context); if (ret) - return -EINVAL; + return ret; ret = nla_put(msg, 
RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context); @@ -177,7 +177,7 @@ int hns_roce_fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ib_mr) ret = hr_dev->hw->query_mpt(hr_dev, hr_mr->key, &context); if (ret) - return -EINVAL; + return ret; ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context); diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index ce5cf89c463c..45c7433c96f3 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -2886,15 +2886,6 @@ static int irdma_sc_resume_qp(struct irdma_sc_cqp *cqp, struct irdma_sc_qp *qp, return 0; } -/** - * irdma_sc_cq_ack - acknowledge completion q - * @cq: cq struct - */ -static inline void irdma_sc_cq_ack(struct irdma_sc_cq *cq) -{ - writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); -} - /** * irdma_sc_cq_init - initialize completion q * @cq: cq struct @@ -2956,7 +2947,7 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, return -ENOMEM; set_64bit_val(wqe, 0, cq->cq_uk.cq_size); - set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); + set_64bit_val(wqe, 8, cq->cq_uk.cq_id); set_64bit_val(wqe, 16, FIELD_PREP(IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD, cq->shadow_read_threshold)); set_64bit_val(wqe, 32, (cq->virtual_map ? 0 : cq->cq_pa)); @@ -3013,7 +3004,7 @@ int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq) return -ENOMEM; set_64bit_val(wqe, 0, cq->cq_uk.cq_size); - set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); + set_64bit_val(wqe, 8, cq->cq_uk.cq_id); set_64bit_val(wqe, 40, cq->shadow_area_pa); set_64bit_val(wqe, 48, (cq->virtual_map ? cq->first_pm_pbl_idx : 0)); @@ -3082,7 +3073,7 @@ static int irdma_sc_cq_modify(struct irdma_sc_cq *cq, return -ENOMEM; set_64bit_val(wqe, 0, info->cq_size); - set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); + set_64bit_val(wqe, 8, cq->cq_uk.cq_id); set_64bit_val(wqe, 16, FIELD_PREP(IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD, info->shadow_read_threshold)); set_64bit_val(wqe, 32, info->cq_pa); @@ -3887,8 +3878,6 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val); spin_unlock_irqrestore(&ccq->dev->cqp_lock, flags); - dma_wmb(); /* make sure shadow area is updated before arming */ - writel(ccq->cq_uk.cq_id, ccq->dev->cq_arm_db); } @@ -4460,47 +4449,38 @@ int irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, bool post_sq) * irdma_sc_process_ceq - process ceq * @dev: sc device struct * @ceq: ceq sc structure + * @cq_idx: Pointer to a CQ ID that will be populated. * * It is expected caller serializes this function with cleanup_ceqes() * because these functions manipulate the same ceq + * + * Return: True if cq_idx has been populated with a CQ ID. */ -void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) +bool irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq, + u32 *cq_idx) { u64 temp; __le64 *ceqe; - struct irdma_sc_cq *cq = NULL; - struct irdma_sc_cq *temp_cq; u8 polarity; - u32 cq_idx; do { - cq_idx = 0; ceqe = IRDMA_GET_CURRENT_CEQ_ELEM(ceq); get_64bit_val(ceqe, 0, &temp); polarity = (u8)FIELD_GET(IRDMA_CEQE_VALID, temp); if (polarity != ceq->polarity) - return NULL; + return false; - temp_cq = (struct irdma_sc_cq *)(unsigned long)(temp << 1); - if (!temp_cq) { - cq_idx = IRDMA_INVALID_CQ_IDX; - IRDMA_RING_MOVE_TAIL(ceq->ceq_ring); - - if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring)) - ceq->polarity ^= 1; - continue; - } - - cq = temp_cq; + /* Truncate. Discard valid bit which is MSb of temp. 
*/ + *cq_idx = temp; + if (*cq_idx >= dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt) + *cq_idx = IRDMA_INVALID_CQ_IDX; IRDMA_RING_MOVE_TAIL(ceq->ceq_ring); if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring)) ceq->polarity ^= 1; - } while (cq_idx == IRDMA_INVALID_CQ_IDX); + } while (*cq_idx == IRDMA_INVALID_CQ_IDX); - if (cq) - irdma_sc_cq_ack(cq); - return cq; + return true; } /** @@ -4514,10 +4494,10 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) */ void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq) { - struct irdma_sc_cq *next_cq; u8 ceq_polarity = ceq->polarity; __le64 *ceqe; u8 polarity; + u32 cq_idx; u64 temp; int next; u32 i; @@ -4532,9 +4512,10 @@ void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq) if (polarity != ceq_polarity) return; - next_cq = (struct irdma_sc_cq *)(unsigned long)(temp << 1); - if (cq == next_cq) - set_64bit_val(ceqe, 0, temp & IRDMA_CEQE_VALID); + cq_idx = temp; + if (cq_idx == cq->cq_uk.cq_id) + set_64bit_val(ceqe, 0, (temp & IRDMA_CEQE_VALID) | + IRDMA_INVALID_CQ_IDX); next = IRDMA_RING_GET_NEXT_TAIL(ceq->ceq_ring, i); if (!next) @@ -4975,7 +4956,7 @@ int irdma_sc_ccq_destroy(struct irdma_sc_cq *ccq, u64 scratch, bool post_sq) return -ENOMEM; set_64bit_val(wqe, 0, ccq->cq_uk.cq_size); - set_64bit_val(wqe, 8, (uintptr_t)ccq >> 1); + set_64bit_val(wqe, 8, ccq->cq_uk.cq_id); set_64bit_val(wqe, 40, ccq->shadow_area_pa); hdr = ccq->cq_uk.cq_id | @@ -5788,8 +5769,7 @@ static int cfg_fpm_value_gen_3(struct irdma_sc_dev *dev, bool is_mrte_loc_mem; loc_mem_pages = hmc_fpm_misc->loc_mem_pages; - is_mrte_loc_mem = hmc_fpm_misc->loc_mem_pages == hmc_fpm_misc->max_sds ? - true : false; + is_mrte_loc_mem = hmc_fpm_misc->loc_mem_pages == hmc_fpm_misc->max_sds; irdma_get_rsrc_mem_config(dev, is_mrte_loc_mem); mrte_loc = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc; @@ -6462,6 +6442,9 @@ int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, int ret_code = 0; u8 db_size; + spin_lock_init(&dev->puda_cq_lock); + dev->ilq_cq = NULL; + dev->ieq_cq = NULL; INIT_LIST_HEAD(&dev->cqp_cmd_head); /* for CQP command backlog */ mutex_init(&dev->ws_mutex); dev->hmc_fn_id = info->hmc_fn_id; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index d1fc5726b979..31c67b753fc0 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -98,6 +98,74 @@ static void irdma_puda_ce_handler(struct irdma_pci_f *rf, irdma_sc_ccq_arm(cq); } +/** + * irdma_process_normal_ceqe - Handle a CEQE for a normal CQ. + * @rf: RDMA PCI function. + * @dev: iWARP device. + * @cq_idx: CQ ID. Must be in table bounds. + * + * Context: Atomic (CEQ lock must be held) + */ +static void irdma_process_normal_ceqe(struct irdma_pci_f *rf, + struct irdma_sc_dev *dev, u32 cq_idx) +{ + /* cq_idx bounds validated in irdma_sc_process_ceq. */ + struct irdma_cq *icq = READ_ONCE(rf->cq_table[cq_idx]); + struct irdma_sc_cq *cq; + + if (unlikely(!icq)) { + /* Should not happen since CEQ is scrubbed upon CQ delete. */ + ibdev_warn_ratelimited(to_ibdev(dev), "Stale CEQE for CQ %u", + cq_idx); + return; + } + + cq = &icq->sc_cq; + + if (unlikely(cq->cq_type != IRDMA_CQ_TYPE_IWARP)) { + ibdev_warn_ratelimited(to_ibdev(dev), "Unexpected CQ type %u", + cq->cq_type); + return; + } + + writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); + irdma_iwarp_ce_handler(cq); +} + +/** + * irdma_process_reserved_ceqe - Handle a CEQE for a reserved CQ. + * @rf: RDMA PCI function. + * @dev: iWARP device. 
+ * @cq_idx: CQ ID. + * + * Context: Atomic + */ +static void irdma_process_reserved_ceqe(struct irdma_pci_f *rf, + struct irdma_sc_dev *dev, u32 cq_idx) +{ + struct irdma_sc_cq *cq; + + if (cq_idx == IRDMA_RSVD_CQ_ID_CQP) { + cq = &rf->ccq.sc_cq; + /* CQP CQ lifetime > CEQ. */ + writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); + queue_work(rf->cqp_cmpl_wq, &rf->cqp_cmpl_work); + } else if (cq_idx == IRDMA_RSVD_CQ_ID_ILQ || + cq_idx == IRDMA_RSVD_CQ_ID_IEQ) { + scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) { + cq = (cq_idx == IRDMA_RSVD_CQ_ID_ILQ) ? + dev->ilq_cq : dev->ieq_cq; + if (!cq) { + ibdev_warn_ratelimited(to_ibdev(dev), + "Stale ILQ/IEQ CEQE"); + return; + } + writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); + irdma_puda_ce_handler(rf, cq); + } + } +} + /** * irdma_process_ceq - handle ceq for completions * @rf: RDMA PCI function @@ -107,28 +175,28 @@ static void irdma_process_ceq(struct irdma_pci_f *rf, struct irdma_ceq *ceq) { struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_sc_ceq *sc_ceq; - struct irdma_sc_cq *cq; unsigned long flags; + u32 cq_idx; sc_ceq = &ceq->sc_ceq; do { spin_lock_irqsave(&ceq->ce_lock, flags); - cq = irdma_sc_process_ceq(dev, sc_ceq); - if (!cq) { + + if (!irdma_sc_process_ceq(dev, sc_ceq, &cq_idx)) { spin_unlock_irqrestore(&ceq->ce_lock, flags); break; } - if (cq->cq_type == IRDMA_CQ_TYPE_IWARP) - irdma_iwarp_ce_handler(cq); + /* Normal CQs must be handled while holding CEQ lock. */ + if (likely(cq_idx > IRDMA_RSVD_CQ_ID_IEQ)) { + irdma_process_normal_ceqe(rf, dev, cq_idx); + spin_unlock_irqrestore(&ceq->ce_lock, flags); + continue; + } spin_unlock_irqrestore(&ceq->ce_lock, flags); - if (cq->cq_type == IRDMA_CQ_TYPE_CQP) - queue_work(rf->cqp_cmpl_wq, &rf->cqp_cmpl_work); - else if (cq->cq_type == IRDMA_CQ_TYPE_ILQ || - cq->cq_type == IRDMA_CQ_TYPE_IEQ) - irdma_puda_ce_handler(rf, cq); + irdma_process_reserved_ceqe(rf, dev, cq_idx); } while (1); } @@ -1532,8 +1600,8 @@ static int irdma_initialize_ilq(struct irdma_device *iwdev) int status; info.type = IRDMA_PUDA_RSRC_TYPE_ILQ; - info.cq_id = 1; - info.qp_id = 1; + info.cq_id = IRDMA_RSVD_CQ_ID_ILQ; + info.qp_id = IRDMA_RSVD_QP_ID_GSI_ILQ; info.count = 1; info.pd_id = 1; info.abi_ver = IRDMA_ABI_VER; @@ -1562,7 +1630,7 @@ static int irdma_initialize_ieq(struct irdma_device *iwdev) int status; info.type = IRDMA_PUDA_RSRC_TYPE_IEQ; - info.cq_id = 2; + info.cq_id = IRDMA_RSVD_CQ_ID_IEQ; info.qp_id = iwdev->vsi.exception_lan_q; info.count = 1; info.pd_id = 2; @@ -1868,7 +1936,7 @@ int irdma_rt_init_hw(struct irdma_device *iwdev, vsi_info.pf_data_vsi_num = iwdev->vsi_num; vsi_info.register_qset = rf->gen_ops.register_qset; vsi_info.unregister_qset = rf->gen_ops.unregister_qset; - vsi_info.exception_lan_q = 2; + vsi_info.exception_lan_q = IRDMA_RSVD_QP_ID_IEQ; irdma_sc_vsi_init(&iwdev->vsi, &vsi_info); status = irdma_setup_cm_core(iwdev, rf->rdma_ver); @@ -2099,18 +2167,18 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) irdma_set_hw_rsrc(rf); set_bit(0, rf->allocated_mrs); - set_bit(0, rf->allocated_qps); - set_bit(0, rf->allocated_cqs); + set_bit(IRDMA_RSVD_QP_ID_0, rf->allocated_qps); + set_bit(IRDMA_RSVD_CQ_ID_CQP, rf->allocated_cqs); set_bit(0, rf->allocated_srqs); set_bit(0, rf->allocated_pds); set_bit(0, rf->allocated_arps); set_bit(0, rf->allocated_ahs); set_bit(0, rf->allocated_mcgs); - set_bit(2, rf->allocated_qps); /* qp 2 IEQ */ - set_bit(1, rf->allocated_qps); /* qp 1 ILQ */ - set_bit(1, rf->allocated_cqs); + set_bit(IRDMA_RSVD_QP_ID_IEQ, rf->allocated_qps); + 
set_bit(IRDMA_RSVD_QP_ID_GSI_ILQ, rf->allocated_qps); + set_bit(IRDMA_RSVD_CQ_ID_ILQ, rf->allocated_cqs); set_bit(1, rf->allocated_pds); - set_bit(2, rf->allocated_cqs); + set_bit(IRDMA_RSVD_CQ_ID_IEQ, rf->allocated_cqs); set_bit(2, rf->allocated_pds); INIT_LIST_HEAD(&rf->mc_qht_list.list); diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index baab61e424a2..d320d1a228b3 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #ifndef CONFIG_64BIT @@ -528,6 +529,7 @@ void irdma_cq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_cq *cq); void irdma_srq_event(struct irdma_sc_srq *srq); void irdma_srq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_srq *srq); void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf); +int irdma_get_timeout_threshold(struct irdma_sc_dev *dev); int irdma_hw_modify_qp(struct irdma_device *iwdev, struct irdma_qp *iwqp, struct irdma_modify_qp_info *info, bool wait); int irdma_qp_suspend_resume(struct irdma_sc_qp *qp, bool suspend); diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index cee47ddbd1b5..4f1a8c97faf1 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -809,6 +809,13 @@ error: dma_free_coherent(dev->hw->device, rsrc->cqmem.size, rsrc->cqmem.va, rsrc->cqmem.pa); rsrc->cqmem.va = NULL; + } else { + scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) { + if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ) + dev->ilq_cq = cq; + else + dev->ieq_cq = cq; + } } return ret; @@ -856,6 +863,13 @@ static void irdma_puda_free_cq(struct irdma_puda_rsrc *rsrc) struct irdma_ccq_cqe_info compl_info; struct irdma_sc_dev *dev = rsrc->dev; + scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) { + if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ) + dev->ilq_cq = NULL; + else + dev->ieq_cq = NULL; + } + if (rsrc->dev->ceq_valid) { irdma_cqp_cq_destroy_cmd(dev, &rsrc->cq); return; diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index cab4896640a1..da8c54d1f035 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -239,6 +239,18 @@ enum irdma_queue_type { IRDMA_QUEUE_TYPE_SRQ, }; +enum irdma_rsvd_cq_id { + IRDMA_RSVD_CQ_ID_CQP, + IRDMA_RSVD_CQ_ID_ILQ, + IRDMA_RSVD_CQ_ID_IEQ, +}; + +enum irdma_rsvd_qp_id { + IRDMA_RSVD_QP_ID_0, + IRDMA_RSVD_QP_ID_GSI_ILQ, + IRDMA_RSVD_QP_ID_IEQ, +}; + struct irdma_sc_dev; struct irdma_vsi_pestat; @@ -695,6 +707,9 @@ struct irdma_sc_dev { struct irdma_sc_aeq *aeq; struct irdma_sc_ceq *ceq[IRDMA_CEQ_MAX_COUNT]; struct irdma_sc_cq *ccq; + spinlock_t puda_cq_lock; + struct irdma_sc_cq *ilq_cq; + struct irdma_sc_cq *ieq_cq; const struct irdma_irq_ops *irq_ops; struct irdma_qos qos[IRDMA_MAX_USER_PRIORITY]; struct irdma_hmc_fpm_misc hmc_fpm_misc; @@ -1332,7 +1347,8 @@ int irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, bool post_sq); int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, struct irdma_ceq_init_info *info); void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq); -void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq); +bool irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq, + u32 *cq_idx); int irdma_sc_aeq_init(struct irdma_sc_aeq *aeq, struct irdma_aeq_init_info *info); diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index f0846b800913..ac3721a5747a 
100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -114,7 +114,6 @@ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx) */ void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp) { - dma_wmb(); writel(qp->qp_id, qp->wqe_alloc_db); } @@ -1107,8 +1106,6 @@ void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq, set_64bit_val(cq->shadow_area, 32, temp_val); - dma_wmb(); /* make sure WQE is populated before valid bit is set */ - writel(cq->cq_id, cq->cqe_alloc_db); } @@ -1408,8 +1405,7 @@ exit: * from SW for all unprocessed WQEs. For GEN3 and beyond * FW will generate/flush these CQEs so move to the next CQE */ - move_cq_head = qp->uk_attrs->hw_rev <= IRDMA_GEN_2 ? - false : true; + move_cq_head = qp->uk_attrs->hw_rev > IRDMA_GEN_2; } if (move_cq_head) { diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 13d7499131d4..960432bf7fc9 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -573,7 +573,7 @@ void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf) } } -static int irdma_get_timeout_threshold(struct irdma_sc_dev *dev) +int irdma_get_timeout_threshold(struct irdma_sc_dev *dev) { u16 time_s = dev->vc_caps.cqp_timeout_s; @@ -830,7 +830,8 @@ void irdma_cq_rem_ref(struct ib_cq *ibcq) return; } - iwdev->rf->cq_table[iwcq->cq_num] = NULL; + /* May be asynchronously sampled by CEQ ISR without holding tbl lock. */ + WRITE_ONCE(iwdev->rf->cq_table[iwcq->cq_num], NULL); spin_unlock_irqrestore(&iwdev->rf->cqtable_lock, flags); complete(&iwcq->free_cq); } @@ -2239,7 +2240,7 @@ void irdma_pble_free_paged_mem(struct irdma_chunk *chunk) chunk->pg_cnt); done: - kfree(chunk->dmainfo.dmaaddrs); + kvfree(chunk->dmainfo.dmaaddrs); chunk->dmainfo.dmaaddrs = NULL; vfree(chunk->vaddr); chunk->vaddr = NULL; @@ -2256,7 +2257,7 @@ int irdma_pble_get_paged_mem(struct irdma_chunk *chunk, u32 pg_cnt) u32 size; void *va; - chunk->dmainfo.dmaaddrs = kzalloc(pg_cnt << 3, GFP_KERNEL); + chunk->dmainfo.dmaaddrs = kvzalloc(pg_cnt << 3, GFP_KERNEL); if (!chunk->dmainfo.dmaaddrs) return -ENOMEM; @@ -2277,7 +2278,7 @@ int irdma_pble_get_paged_mem(struct irdma_chunk *chunk, u32 pg_cnt) return 0; err: - kfree(chunk->dmainfo.dmaaddrs); + kvfree(chunk->dmainfo.dmaaddrs); chunk->dmainfo.dmaaddrs = NULL; return -ENOMEM; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 6d9af41a2884..cf8d19150574 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2669,9 +2669,12 @@ static int irdma_create_cq(struct ib_cq *ibcq, goto cq_destroy; } } - rf->cq_table[cq_num] = iwcq; + init_completion(&iwcq->free_cq); + /* Populate table entry after CQ is fully created. 
*/ + smp_store_release(&rf->cq_table[cq_num], iwcq); + return 0; cq_destroy: irdma_cq_wq_destroy(rf, cq); @@ -5027,15 +5030,15 @@ static int irdma_create_hw_ah(struct irdma_device *iwdev, struct irdma_ah *ah, b } if (!sleep) { - int cnt = CQP_COMPL_WAIT_TIME_MS * CQP_TIMEOUT_THRESHOLD; + const u64 tmout_ms = irdma_get_timeout_threshold(&rf->sc_dev) * + CQP_COMPL_WAIT_TIME_MS; - do { - irdma_cqp_ce_handler(rf, &rf->ccq.sc_cq); - mdelay(1); - } while (!ah->sc_ah.ah_info.ah_valid && --cnt); - - if (!cnt) { - ibdev_dbg(&iwdev->ibdev, "VERBS: CQP create AH timed out"); + if (poll_timeout_us_atomic(irdma_cqp_ce_handler(rf, + &rf->ccq.sc_cq), + ah->sc_ah.ah_info.ah_valid, 1, + tmout_ms * USEC_PER_MSEC, false)) { + ibdev_dbg(&iwdev->ibdev, + "VERBS: CQP create AH timed out"); err = -ETIMEDOUT; goto err_ah_create; } diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 7600412b0739..974dd610dcbf 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -24,6 +24,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors; cq->cq_handle = INVALID_MANA_HANDLE; + is_rnic_cq = mana_ib_is_rnic(mdev); if (udata) { if (udata->inlen < offsetof(struct mana_ib_create_cq, flags)) @@ -35,8 +36,6 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, return err; } - is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); - if ((!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) || attr->cqe > U32_MAX / COMP_ENTRY_SIZE) { ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); @@ -55,7 +54,6 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, ibucontext); doorbell = mana_ucontext->doorbell; } else { - is_rnic_cq = true; if (attr->cqe > U32_MAX / COMP_ENTRY_SIZE / 2 + 1) { ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); return -EINVAL; diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index bdeddb642b87..ccc2279ca63c 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -69,6 +69,12 @@ static const struct ib_device_ops mana_ib_device_stats_ops = { .alloc_hw_device_stats = mana_ib_alloc_hw_device_stats, }; +const struct ib_device_ops mana_ib_dev_dm_ops = { + .alloc_dm = mana_ib_alloc_dm, + .dealloc_dm = mana_ib_dealloc_dm, + .reg_dm_mr = mana_ib_reg_dm_mr, +}; + static int mana_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -139,6 +145,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops); if (dev->adapter_caps.feature_flags & MANA_IB_FEATURE_DEV_COUNTERS_SUPPORT) ib_set_device_ops(&dev->ib_dev, &mana_ib_device_stats_ops); + ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_dm_ops); ret = mana_ib_create_eqs(dev); if (ret) { diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 9d36232ed880..e447acfd2071 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -131,6 +131,11 @@ struct mana_ib_mr { mana_handle_t mr_handle; }; +struct mana_ib_dm { + struct ib_dm ibdm; + mana_handle_t dm_handle; +}; + struct mana_ib_cq { struct ib_cq ibcq; struct mana_ib_queue queue; @@ -735,4 +740,11 @@ struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 leng u64 iova, int fd, int mr_access_flags, struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); + 
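+/*
+ * Device memory (DM) support (summary of the additions below): mana_ib_alloc_dm()
+ * allocates device memory through a GDMA_ALLOC_DM request and keeps the returned
+ * handle in mana_ib_dm.dm_handle; mana_ib_reg_dm_mr() then registers an MR of type
+ * GDMA_MR_TYPE_DM against that handle.
+ */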
+struct ib_dm *mana_ib_alloc_dm(struct ib_device *dev, struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs); +int mana_ib_dealloc_dm(struct ib_dm *dm, struct uverbs_attr_bundle *attrs); +struct ib_mr *mana_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs); + #endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 3d0245a4c1ed..f979f26adc3b 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -40,6 +40,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req), sizeof(resp)); + req.hdr.req.msg_version = GDMA_MESSAGE_V2; req.pd_handle = mr_params->pd_handle; req.mr_type = mr_params->mr_type; @@ -55,6 +56,12 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, req.zbva.dma_region_handle = mr_params->zbva.dma_region_handle; req.zbva.access_flags = mr_params->zbva.access_flags; break; + case GDMA_MR_TYPE_DM: + req.da_ext.length = mr_params->da.length; + req.da.dm_handle = mr_params->da.dm_handle; + req.da.offset = mr_params->da.offset; + req.da.access_flags = mr_params->da.access_flags; + break; default: ibdev_dbg(&dev->ib_dev, "invalid param (GDMA_MR_TYPE) passed, type %d\n", @@ -317,3 +324,126 @@ int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return 0; } + +static int mana_ib_gd_alloc_dm(struct mana_ib_dev *mdev, struct mana_ib_dm *dm, + struct ib_dm_alloc_attr *attr) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct gdma_alloc_dm_resp resp = {}; + struct gdma_alloc_dm_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOC_DM, sizeof(req), sizeof(resp)); + req.length = attr->length; + req.alignment = attr->alignment; + req.flags = attr->flags; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + if (!err) + err = -EPROTO; + + return err; + } + + dm->dm_handle = resp.dm_handle; + + return 0; +} + +struct ib_dm *mana_ib_alloc_dm(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + struct mana_ib_dm *dm; + int err; + + dm = kzalloc(sizeof(*dm), GFP_KERNEL); + if (!dm) + return ERR_PTR(-ENOMEM); + + err = mana_ib_gd_alloc_dm(dev, dm, attr); + if (err) + goto err_free; + + return &dm->ibdm; + +err_free: + kfree(dm); + return ERR_PTR(err); +} + +static int mana_ib_gd_destroy_dm(struct mana_ib_dev *mdev, struct mana_ib_dm *dm) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct gdma_destroy_dm_resp resp = {}; + struct gdma_destroy_dm_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_DM, sizeof(req), sizeof(resp)); + req.dm_handle = dm->dm_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + if (!err) + err = -EPROTO; + + return err; + } + + return 0; +} + +int mana_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs) +{ + struct mana_ib_dev *dev = container_of(ibdm->device, struct mana_ib_dev, ib_dev); + struct mana_ib_dm *dm = container_of(ibdm, struct mana_ib_dm, ibdm); + int err; + + err = mana_ib_gd_destroy_dm(dev, dm); + if (err) + return err; + + kfree(dm); + return 0; +} + +struct ib_mr *mana_ib_reg_dm_mr(struct ib_pd *ibpd, struct ib_dm *ibdm, + struct 
ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mana_ib_dev *dev = container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_dm *mana_dm = container_of(ibdm, struct mana_ib_dm, ibdm); + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct mana_ib_mr *mr; + int err; + + attr->access_flags &= ~IB_ACCESS_OPTIONAL; + if (attr->access_flags & ~VALID_MR_FLAGS) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_DM; + mr_params.da.dm_handle = mana_dm->dm_handle; + mr_params.da.offset = attr->offset; + mr_params.da.length = attr->length; + mr_params.da.access_flags = + mana_ib_verbs_to_gdma_access_flags(attr->access_flags); + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_free; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 947faacd75bb..5e06177ace26 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -561,12 +561,20 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, * of an error it will still be zeroed out. * Use native port in case of reps */ - if (dev->is_rep) - err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, - 1, 0); - else - err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, - mdev_port_num, 0); + if (dev->is_rep) { + struct mlx5_eswitch_rep *rep; + + rep = dev->port[port_num - 1].rep; + if (rep) { + mdev = mlx5_eswitch_get_core_dev(rep->esw); + WARN_ON(!mdev); + } + mdev_port_num = 1; + } + + err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, + mdev_port_num, 0); + if (err) goto out; ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability); @@ -1581,6 +1589,129 @@ static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u32 port, u16 index, return 0; } +static int mlx5_ib_query_port_speed_from_port(struct mlx5_ib_dev *dev, + u32 port_num, u64 *speed) +{ + struct ib_port_speed_info speed_info; + struct ib_port_attr attr = {}; + int err; + + err = mlx5_ib_query_port(&dev->ib_dev, port_num, &attr); + if (err) + return err; + + if (attr.state == IB_PORT_DOWN) { + *speed = 0; + return 0; + } + + err = ib_port_attr_to_speed_info(&attr, &speed_info); + if (err) + return err; + + *speed = speed_info.rate; + return 0; +} + +static int mlx5_ib_query_port_speed_from_vport(struct mlx5_core_dev *mdev, + u8 op_mod, u16 vport, + u8 other_vport, u64 *speed, + struct mlx5_ib_dev *dev, + u32 port_num) +{ + u32 max_tx_speed; + int err; + + err = mlx5_query_vport_max_tx_speed(mdev, op_mod, vport, other_vport, + &max_tx_speed); + if (err) + return err; + + if (max_tx_speed == 0) + /* Value 0 indicates field not supported, fallback */ + return mlx5_ib_query_port_speed_from_port(dev, port_num, + speed); + + *speed = max_tx_speed; + return 0; +} + +static int mlx5_ib_query_port_speed_from_bond(struct mlx5_ib_dev *dev, + u32 port_num, u64 *speed) +{ + struct mlx5_core_dev *mdev = dev->mdev; + u32 bond_speed; + int err; + + err = mlx5_lag_query_bond_speed(mdev, &bond_speed); + if (err) + return err; + + *speed = bond_speed / MLX5_MAX_TX_SPEED_UNIT; + + return 0; +} + +static int mlx5_ib_query_port_speed_non_rep(struct mlx5_ib_dev *dev, + u32 port_num, u64 *speed) +{ + u16 op_mod = MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT; + 
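+	/*
+	 * When RoCE LAG is active the IB port is backed by the bond, so report
+	 * the bond speed; otherwise query the max TX speed of this function's
+	 * own vNIC vport (vport 0, other_vport = false).
+	 */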
+ if (mlx5_lag_is_roce(dev->mdev)) + return mlx5_ib_query_port_speed_from_bond(dev, port_num, + speed); + + return mlx5_ib_query_port_speed_from_vport(dev->mdev, op_mod, 0, false, + speed, dev, port_num); +} + +static int mlx5_ib_query_port_speed_rep(struct mlx5_ib_dev *dev, u32 port_num, + u64 *speed) +{ + struct mlx5_eswitch_rep *rep; + struct mlx5_core_dev *mdev; + u16 op_mod; + + if (!dev->port[port_num - 1].rep) { + mlx5_ib_warn(dev, "Representor doesn't exist for port %u\n", + port_num); + return -EINVAL; + } + + rep = dev->port[port_num - 1].rep; + mdev = mlx5_eswitch_get_core_dev(rep->esw); + if (!mdev) + return -ENODEV; + + if (rep->vport == MLX5_VPORT_UPLINK) { + if (mlx5_lag_is_sriov(mdev)) + return mlx5_ib_query_port_speed_from_bond(dev, + port_num, + speed); + + return mlx5_ib_query_port_speed_from_port(dev, port_num, + speed); + } + + op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + return mlx5_ib_query_port_speed_from_vport(dev->mdev, op_mod, + rep->vport, true, speed, dev, + port_num); +} + +int mlx5_ib_query_port_speed(struct ib_device *ibdev, u32 port_num, u64 *speed) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + if (mlx5_ib_port_link_layer(ibdev, port_num) == + IB_LINK_LAYER_INFINIBAND || mlx5_core_mp_enabled(dev->mdev)) + return mlx5_ib_query_port_speed_from_port(dev, port_num, speed); + else if (!dev->is_rep) + return mlx5_ib_query_port_speed_non_rep(dev, port_num, speed); + else + return mlx5_ib_query_port_speed_rep(dev, port_num, speed); +} + static int mlx5_ib_query_gid(struct ib_device *ibdev, u32 port, int index, union ib_gid *gid) { @@ -2323,6 +2454,70 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, virt_to_page(dev->mdev->clock_info)); } +static int phys_addr_to_bar(struct pci_dev *pdev, phys_addr_t pa) +{ + resource_size_t start, end; + int bar; + + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { + /* Skip BARs not present or not memory-mapped */ + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) + continue; + + start = pci_resource_start(pdev, bar); + end = pci_resource_end(pdev, bar); + + if (!start || !end) + continue; + + if (pa >= start && pa <= end) + return bar; + } + + return -1; +} + +static int mlx5_ib_mmap_get_pfns(struct rdma_user_mmap_entry *entry, + struct phys_vec *phys_vec, + struct p2pdma_provider **provider) +{ + struct mlx5_user_mmap_entry *mentry = to_mmmap(entry); + struct pci_dev *pdev = to_mdev(entry->ucontext->device)->mdev->pdev; + int bar; + + phys_vec->paddr = mentry->address; + phys_vec->len = entry->npages * PAGE_SIZE; + + bar = phys_addr_to_bar(pdev, phys_vec->paddr); + if (bar < 0) + return -EINVAL; + + *provider = pcim_p2pdma_provider(pdev, bar); + /* If the kernel was not compiled with CONFIG_PCI_P2PDMA the + * functionality is not supported. 
+ */ + if (!*provider) + return -EOPNOTSUPP; + + return 0; +} + +static struct rdma_user_mmap_entry * +mlx5_ib_pgoff_to_mmap_entry(struct ib_ucontext *ucontext, off_t pg_off) +{ + unsigned long entry_pgoff; + unsigned long idx; + u8 command; + + pg_off = pg_off >> PAGE_SHIFT; + command = get_command(pg_off); + idx = get_extended_index(pg_off); + + entry_pgoff = command << 16 | idx; + + return rdma_user_mmap_entry_get_pgoff(ucontext, entry_pgoff); +} + static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry) { struct mlx5_user_mmap_entry *mentry = to_mmmap(entry); @@ -2838,6 +3033,14 @@ static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe, case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: case MLX5_PORT_CHANGE_SUBTYPE_DOWN: case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED: + if (ibdev->ib_active) { + struct ib_event speed_event = {}; + + speed_event.device = &ibdev->ib_dev; + speed_event.event = IB_EVENT_DEVICE_SPEED_CHANGE; + ib_dispatch_event(&speed_event); + } + /* In RoCE, port up/down events are handled in * mlx5_netdev_event(). */ @@ -2878,7 +3081,6 @@ static void mlx5_ib_handle_event(struct work_struct *_work) container_of(_work, struct mlx5_ib_event_work, work); struct mlx5_ib_dev *ibdev; struct ib_event ibev; - bool fatal = false; if (work->is_slave) { ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi); @@ -2889,12 +3091,6 @@ static void mlx5_ib_handle_event(struct work_struct *_work) } switch (work->event) { - case MLX5_DEV_EVENT_SYS_ERROR: - ibev.event = IB_EVENT_DEVICE_FATAL; - mlx5_ib_handle_internal_error(ibdev); - ibev.element.port_num = (u8)(unsigned long)work->param; - fatal = true; - break; case MLX5_EVENT_TYPE_PORT_CHANGE: if (handle_port_change(ibdev, work->param, &ibev)) goto out; @@ -2916,8 +3112,6 @@ static void mlx5_ib_handle_event(struct work_struct *_work) if (ibdev->ib_active) ib_dispatch_event(&ibev); - if (fatal) - ibdev->ib_active = false; out: kfree(work); } @@ -2961,6 +3155,66 @@ static int mlx5_ib_event_slave_port(struct notifier_block *nb, return NOTIFY_OK; } +static void mlx5_ib_handle_sys_error_event(struct work_struct *_work) +{ + struct mlx5_ib_event_work *work = + container_of(_work, struct mlx5_ib_event_work, work); + struct mlx5_ib_dev *ibdev = work->dev; + struct ib_event ibev; + + ibev.event = IB_EVENT_DEVICE_FATAL; + mlx5_ib_handle_internal_error(ibdev); + ibev.element.port_num = (u8)(unsigned long)work->param; + ibev.device = &ibdev->ib_dev; + + if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", ibev.element.port_num); + goto out; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); + + ibdev->ib_active = false; +out: + kfree(work); +} + +static int mlx5_ib_sys_error_event(struct notifier_block *nb, + unsigned long event, void *param) +{ + struct mlx5_ib_event_work *work; + + if (event != MLX5_DEV_EVENT_SYS_ERROR) + return NOTIFY_DONE; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return NOTIFY_DONE; + + INIT_WORK(&work->work, mlx5_ib_handle_sys_error_event); + work->dev = container_of(nb, struct mlx5_ib_dev, sys_error_events); + work->is_slave = false; + work->param = param; + work->event = event; + + queue_work(mlx5_ib_event_wq, &work->work); + + return NOTIFY_OK; +} + +static int mlx5_ib_stage_sys_error_notifier_init(struct mlx5_ib_dev *dev) +{ + dev->sys_error_events.notifier_call = mlx5_ib_sys_error_event; + mlx5_notifier_register(dev->mdev, &dev->sys_error_events); + return 0; +} + +static void mlx5_ib_stage_sys_error_notifier_cleanup(struct 
mlx5_ib_dev *dev) +{ + mlx5_notifier_unregister(dev->mdev, &dev->sys_error_events); +} + static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) { struct mlx5_hca_vport_context vport_ctx; @@ -4229,7 +4483,13 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) if (err) goto err_mp; + err = pcim_p2pdma_init(mdev->pdev); + if (err && err != -EOPNOTSUPP) + goto err_dd; + return 0; +err_dd: + mlx5_ib_data_direct_cleanup(dev); err_mp: mlx5_ib_cleanup_multiport_master(dev); err: @@ -4281,11 +4541,13 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi, .mmap = mlx5_ib_mmap, .mmap_free = mlx5_ib_mmap_free, + .mmap_get_pfns = mlx5_ib_mmap_get_pfns, .modify_cq = mlx5_ib_modify_cq, .modify_device = mlx5_ib_modify_device, .modify_port = mlx5_ib_modify_port, .modify_qp = mlx5_ib_modify_qp, .modify_srq = mlx5_ib_modify_srq, + .pgoff_to_mmap_entry = mlx5_ib_pgoff_to_mmap_entry, .pre_destroy_cq = mlx5_ib_pre_destroy_cq, .poll_cq = mlx5_ib_poll_cq, .post_destroy_cq = mlx5_ib_post_destroy_cq, @@ -4297,6 +4559,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .query_device = mlx5_ib_query_device, .query_gid = mlx5_ib_query_gid, .query_pkey = mlx5_ib_query_pkey, + .query_port_speed = mlx5_ib_query_port_speed, .query_qp = mlx5_ib_query_qp, .query_srq = mlx5_ib_query_srq, .query_ucontext = mlx5_ib_query_ucontext, @@ -4466,12 +4729,16 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) { err = mlx5_ib_init_ucaps(dev); if (err) - return err; + goto err_ucaps; } dev->ib_dev.use_cq_dim = true; return 0; + +err_ucaps: + bitmap_free(dev->var_table.bitmap); + return err; } static const struct ib_device_ops mlx5_ib_dev_port_ops = { @@ -4807,6 +5074,9 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID, mlx5_ib_devx_init, mlx5_ib_devx_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_SYS_ERROR_NOTIFIER, + mlx5_ib_stage_sys_error_notifier_init, + mlx5_ib_stage_sys_error_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), @@ -4864,6 +5134,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID, mlx5_ib_devx_init, mlx5_ib_devx_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_SYS_ERROR_NOTIFIER, + mlx5_ib_stage_sys_error_notifier_init, + mlx5_ib_stage_sys_error_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 09d82d5f95e3..4f4114d95130 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1007,6 +1007,7 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_BFREG, MLX5_IB_STAGE_PRE_IB_REG_UMR, MLX5_IB_STAGE_WHITELIST_UID, + MLX5_IB_STAGE_SYS_ERROR_NOTIFIER, MLX5_IB_STAGE_IB_REG, MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_POST_IB_REG_UMR, @@ -1165,6 +1166,7 @@ struct mlx5_ib_dev { /* protect accessing data_direct_dev */ struct mutex data_direct_lock; struct notifier_block mdev_events; + struct notifier_block sys_error_events; struct notifier_block lag_events; int num_ports; /* serialize update of capability mask @@ -1435,6 +1437,8 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props); int mlx5_ib_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props); +int mlx5_ib_query_port_speed(struct ib_device *ibdev, u32 port_num, + u64 
*speed); void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas, u64 access_flags); int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 325fa04cbe8a..a7b37e3df072 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1646,10 +1646,13 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, offset, length, fd, access_flags, &mlx5_ib_dmabuf_attach_ops); - else + else if (dma_device) umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, dma_device, offset, length, fd, access_flags); + else + umem_dmabuf = ib_umem_dmabuf_get_pinned( + &dev->ib_dev, offset, length, fd, access_flags); if (IS_ERR(umem_dmabuf)) { mlx5_ib_dbg(dev, "umem_dmabuf get failed (%pe)\n", umem_dmabuf); @@ -1782,10 +1785,8 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, fd, access_flags); - return reg_user_mr_dmabuf(pd, pd->device->dma_device, - offset, length, virt_addr, - fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT, - dmah); + return reg_user_mr_dmabuf(pd, NULL, offset, length, virt_addr, fd, + access_flags, MLX5_MKC_ACCESS_MODE_MTT, dmah); } /* diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 69af20790481..0324909e3151 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4362,6 +4362,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, optpar |= ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; + if (attr_mask & IB_QP_RATE_LIMIT && qp->type != IB_QPT_RAW_PACKET) { + err = -EOPNOTSUPP; + goto out; + } + if (qp->type == IB_QPT_RAW_PACKET || qp->flags & IB_QP_CREATE_SOURCE_QPN) { struct mlx5_modify_raw_qp_param raw_qp_param = {}; diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index 2fcf553044e1..1ee31611b4b3 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -195,7 +195,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)( int out_len = uverbs_attr_get_len(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH); u32 dev_path_len; - char *dev_path; + char *dev_path = NULL; int ret; c = to_mucontext(ib_uverbs_get_ucontext(attrs)); @@ -223,9 +223,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)( ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path, dev_path_len); - kfree(dev_path); end: + kfree(dev_path); mutex_unlock(&dev->data_direct_lock); return ret; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 5eb61c110090..5584b781e2e8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -67,8 +67,6 @@ #define OC_SKH_DEVICE_VF 0x728 #define OCRDMA_MAX_AH 512 -#define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) - #define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo) #define EQ_INTR_PER_SEC_THRSH_HI 150000 #define EQ_INTR_PER_SEC_THRSH_LOW 100000 diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index db9ef3e1eb97..a6c9a4d9ab93 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -53,11 +53,8 @@ DP_NAME(dev) ? 
DP_NAME(dev) : "", ## __VA_ARGS__) #define QEDR_MSG_INIT "INIT" -#define QEDR_MSG_MISC "MISC" #define QEDR_MSG_CQ " CQ" #define QEDR_MSG_MR " MR" -#define QEDR_MSG_RQ " RQ" -#define QEDR_MSG_SQ " SQ" #define QEDR_MSG_QP " QP" #define QEDR_MSG_SRQ " SRQ" #define QEDR_MSG_GSI " GSI" @@ -65,7 +62,6 @@ #define QEDR_CQ_MAGIC_NUMBER (0x11223344) -#define FW_PAGE_SIZE (RDMA_RING_PAGE_SIZE) #define FW_PAGE_SHIFT (12) struct qedr_dev; @@ -178,24 +174,18 @@ struct qedr_dev { u8 user_dpm_enabled; }; -#define QEDR_MAX_SQ_PBL (0x8000) #define QEDR_MAX_SQ_PBL_ENTRIES (0x10000 / sizeof(void *)) #define QEDR_SQE_ELEMENT_SIZE (sizeof(struct rdma_sq_sge)) #define QEDR_MAX_SQE_ELEMENTS_PER_SQE (ROCE_REQ_MAX_SINGLE_SQ_WQE_SIZE / \ QEDR_SQE_ELEMENT_SIZE) -#define QEDR_MAX_SQE_ELEMENTS_PER_PAGE ((RDMA_RING_PAGE_SIZE) / \ - QEDR_SQE_ELEMENT_SIZE) #define QEDR_MAX_SQE ((QEDR_MAX_SQ_PBL_ENTRIES) *\ (RDMA_RING_PAGE_SIZE) / \ (QEDR_SQE_ELEMENT_SIZE) /\ (QEDR_MAX_SQE_ELEMENTS_PER_SQE)) /* RQ */ -#define QEDR_MAX_RQ_PBL (0x2000) #define QEDR_MAX_RQ_PBL_ENTRIES (0x10000 / sizeof(void *)) #define QEDR_RQE_ELEMENT_SIZE (sizeof(struct rdma_rq_sge)) #define QEDR_MAX_RQE_ELEMENTS_PER_RQE (RDMA_MAX_SGE_PER_RQ_WQE) -#define QEDR_MAX_RQE_ELEMENTS_PER_PAGE ((RDMA_RING_PAGE_SIZE) / \ - QEDR_RQE_ELEMENT_SIZE) #define QEDR_MAX_RQE ((QEDR_MAX_RQ_PBL_ENTRIES) *\ (RDMA_RING_PAGE_SIZE) / \ (QEDR_RQE_ELEMENT_SIZE) /\ @@ -210,12 +200,8 @@ struct qedr_dev { #define QEDR_ROCE_MAX_CNQ_SIZE (0x4000) -#define QEDR_MAX_PORT (1) #define QEDR_PORT (1) -#define QEDR_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) - -#define QEDR_ROCE_PKEY_MAX 1 #define QEDR_ROCE_PKEY_TABLE_LEN 1 #define QEDR_ROCE_PKEY_DEFAULT 0xffff @@ -336,12 +322,6 @@ struct qedr_qp_hwq_info { union db_prod32 iwarp_db2_data; }; -#define QEDR_INC_SW_IDX(p_info, index) \ - do { \ - p_info->index = (p_info->index + 1) & \ - qed_chain_get_capacity(p_info->pbl) \ - } while (0) - struct qedr_srq_hwq_info { u32 max_sges; u32 max_wr; diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index a5b2b62f596b..1390e861bd1d 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -119,12 +119,15 @@ void retransmit_timer(struct timer_list *t) rxe_dbg_qp(qp, "retransmit timer fired\n"); + if (!rxe_get(qp)) + return; spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { qp->comp.timeout = 1; rxe_sched_task(&qp->send_task); } spin_unlock_irqrestore(&qp->state_lock, flags); + rxe_put(qp); } void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index b1df05238848..c71ab780e379 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -72,14 +72,46 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr) mr->ibmr.type = IB_MR_TYPE_DMA; } +/* + * Convert iova to page_info index. The page_info stores pages of size + * PAGE_SIZE, but MRs can have different page sizes. This function + * handles the conversion for all cases: + * + * 1. mr->page_size > PAGE_SIZE: + * The MR's iova may not be aligned to mr->page_size. We use the + * aligned base (iova & page_mask) as reference, then calculate + * which PAGE_SIZE sub-page the iova falls into. + * + * 2. mr->page_size <= PAGE_SIZE: + * Use simple shift arithmetic since each page_info entry corresponds + * to one or more MR pages. 
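+ *
+ * Worked example for case 1 (illustrative numbers, not taken from this
+ * patch): with a 4K PAGE_SIZE and a 16K MR page size, an MR whose iova
+ * starts at 0x11000 has the aligned base 0x10000, so iova 0x13000 maps
+ * to page_info index (0x13000 - 0x10000) >> PAGE_SHIFT = 3.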
+ */ static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova) { - return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift); + int idx; + + if (mr_page_size(mr) > PAGE_SIZE) + idx = (iova - (mr->ibmr.iova & mr->page_mask)) >> PAGE_SHIFT; + else + idx = (iova >> mr->page_shift) - + (mr->ibmr.iova >> mr->page_shift); + + WARN_ON(idx >= mr->nbuf); + return idx; } +/* + * Convert iova to offset within the page_info entry. + * + * For mr_page_size > PAGE_SIZE, the offset is within the system page. + * For mr_page_size <= PAGE_SIZE, the offset is within the MR page size. + */ static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova) { - return iova & (mr_page_size(mr) - 1); + if (mr_page_size(mr) > PAGE_SIZE) + return iova & (PAGE_SIZE - 1); + else + return iova & (mr_page_size(mr) - 1); } static bool is_pmem_page(struct page *pg) @@ -93,37 +125,69 @@ static bool is_pmem_page(struct page *pg) static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt) { - XA_STATE(xas, &mr->page_list, 0); struct sg_page_iter sg_iter; struct page *page; bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); + WARN_ON(mr_page_size(mr) != PAGE_SIZE); + __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0); if (!__sg_page_iter_next(&sg_iter)) return 0; - do { - xas_lock(&xas); - while (true) { - page = sg_page_iter_page(&sg_iter); + while (true) { + page = sg_page_iter_page(&sg_iter); - if (persistent && !is_pmem_page(page)) { - rxe_dbg_mr(mr, "Page can't be persistent\n"); - xas_set_err(&xas, -EINVAL); - break; - } - - xas_store(&xas, page); - if (xas_error(&xas)) - break; - xas_next(&xas); - if (!__sg_page_iter_next(&sg_iter)) - break; + if (persistent && !is_pmem_page(page)) { + rxe_dbg_mr(mr, "Page can't be persistent\n"); + return -EINVAL; } - xas_unlock(&xas); - } while (xas_nomem(&xas, GFP_KERNEL)); - return xas_error(&xas); + mr->page_info[mr->nbuf].page = page; + mr->page_info[mr->nbuf].offset = 0; + mr->nbuf++; + + if (!__sg_page_iter_next(&sg_iter)) + break; + } + + return 0; +} + +static int __alloc_mr_page_info(struct rxe_mr *mr, int num_pages) +{ + mr->page_info = kcalloc(num_pages, sizeof(struct rxe_mr_page), + GFP_KERNEL); + if (!mr->page_info) + return -ENOMEM; + + mr->max_allowed_buf = num_pages; + mr->nbuf = 0; + + return 0; +} + +static int alloc_mr_page_info(struct rxe_mr *mr, int num_pages) +{ + int ret; + + WARN_ON(mr->num_buf); + ret = __alloc_mr_page_info(mr, num_pages); + if (ret) + return ret; + + mr->num_buf = num_pages; + + return 0; +} + +static void free_mr_page_info(struct rxe_mr *mr) +{ + if (!mr->page_info) + return; + + kfree(mr->page_info); + mr->page_info = NULL; } int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, @@ -134,8 +198,6 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, rxe_mr_init(access, mr); - xa_init(&mr->page_list); - umem = ib_umem_get(&rxe->ib_dev, start, length, access); if (IS_ERR(umem)) { rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", @@ -143,46 +205,24 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, return PTR_ERR(umem); } + err = alloc_mr_page_info(mr, ib_umem_num_pages(umem)); + if (err) + goto err2; + err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt); - if (err) { - ib_umem_release(umem); - return err; - } + if (err) + goto err1; mr->umem = umem; mr->ibmr.type = IB_MR_TYPE_USER; mr->state = RXE_MR_STATE_VALID; return 0; -} - -static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) -{ - XA_STATE(xas, 
&mr->page_list, 0); - int i = 0; - int err; - - xa_init(&mr->page_list); - - do { - xas_lock(&xas); - while (i != num_buf) { - xas_store(&xas, XA_ZERO_ENTRY); - if (xas_error(&xas)) - break; - xas_next(&xas); - i++; - } - xas_unlock(&xas); - } while (xas_nomem(&xas, GFP_KERNEL)); - - err = xas_error(&xas); - if (err) - return err; - - mr->num_buf = num_buf; - - return 0; +err1: + free_mr_page_info(mr); +err2: + ib_umem_release(umem); + return err; } int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) @@ -192,7 +232,7 @@ int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) /* always allow remote access for FMRs */ rxe_mr_init(RXE_ACCESS_REMOTE, mr); - err = rxe_mr_alloc(mr, max_pages); + err = alloc_mr_page_info(mr, max_pages); if (err) goto err1; @@ -205,26 +245,43 @@ err1: return err; } +/* + * I) MRs with page_size >= PAGE_SIZE, + * Split a large MR page (mr->page_size) into multiple PAGE_SIZE + * sub-pages and store them in page_info, offset is always 0. + * + * Called when mr->page_size > PAGE_SIZE. Each call to rxe_set_page() + * represents one mr->page_size region, which we must split into + * (mr->page_size >> PAGE_SHIFT) individual pages. + * + * II) MRs with page_size < PAGE_SIZE, + * Save each PAGE_SIZE page and its offset within the system page in page_info. + */ static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr) { struct rxe_mr *mr = to_rmr(ibmr); - struct page *page = ib_virt_dma_to_page(dma_addr); bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); - int err; + u32 i, pages_per_mr = mr_page_size(mr) >> PAGE_SHIFT; - if (persistent && !is_pmem_page(page)) { - rxe_dbg_mr(mr, "Page cannot be persistent\n"); - return -EINVAL; + pages_per_mr = MAX(1, pages_per_mr); + + for (i = 0; i < pages_per_mr; i++) { + u64 addr = dma_addr + i * PAGE_SIZE; + struct page *sub_page = ib_virt_dma_to_page(addr); + + if (unlikely(mr->nbuf >= mr->max_allowed_buf)) + return -ENOMEM; + + if (persistent && !is_pmem_page(sub_page)) { + rxe_dbg_mr(mr, "Page cannot be persistent\n"); + return -EINVAL; + } + + mr->page_info[mr->nbuf].page = sub_page; + mr->page_info[mr->nbuf].offset = addr & (PAGE_SIZE - 1); + mr->nbuf++; } - if (unlikely(mr->nbuf == mr->num_buf)) - return -ENOMEM; - - err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL)); - if (err) - return err; - - mr->nbuf++; return 0; } @@ -234,10 +291,34 @@ int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl, struct rxe_mr *mr = to_rmr(ibmr); unsigned int page_size = mr_page_size(mr); + /* + * Ensure page_size and PAGE_SIZE are compatible for mapping. + * We require one to be a multiple of the other for correct + * iova-to-page conversion. 
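+ * For example (illustrative sizes): a 64K MR page on a 4K PAGE_SIZE
+ * kernel is split by rxe_set_page() into 16 page_info entries, while an
+ * MR page size smaller than PAGE_SIZE keeps one entry per MR page and
+ * records that page's offset within the shared system page.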
+ */ + if (!IS_ALIGNED(page_size, PAGE_SIZE) && + !IS_ALIGNED(PAGE_SIZE, page_size)) { + rxe_dbg_mr(mr, "MR page size %u must be compatible with PAGE_SIZE %lu\n", + page_size, PAGE_SIZE); + return -EINVAL; + } + + if (mr_page_size(mr) > PAGE_SIZE) { + /* resize page_info if needed */ + u32 map_mr_pages = (page_size >> PAGE_SHIFT) * mr->num_buf; + + if (map_mr_pages > mr->max_allowed_buf) { + rxe_dbg_mr(mr, "requested pages %u exceed max %u\n", + map_mr_pages, mr->max_allowed_buf); + free_mr_page_info(mr); + if (__alloc_mr_page_info(mr, map_mr_pages)) + return -ENOMEM; + } + } + mr->nbuf = 0; mr->page_shift = ilog2(page_size); mr->page_mask = ~((u64)page_size - 1); - mr->page_offset = mr->ibmr.iova & (page_size - 1); return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page); } @@ -245,30 +326,30 @@ int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl, static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr, unsigned int length, enum rxe_mr_copy_dir dir) { - unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); - unsigned long index = rxe_mr_iova_to_index(mr, iova); unsigned int bytes; - struct page *page; - void *va; + u8 *va; while (length) { - page = xa_load(&mr->page_list, index); - if (!page) + unsigned long index = rxe_mr_iova_to_index(mr, iova); + struct rxe_mr_page *info = &mr->page_info[index]; + unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); + + if (!info->page) return -EFAULT; - bytes = min_t(unsigned int, length, - mr_page_size(mr) - page_offset); - va = kmap_local_page(page); + page_offset += info->offset; + bytes = min_t(unsigned int, length, PAGE_SIZE - page_offset); + va = kmap_local_page(info->page); + if (dir == RXE_FROM_MR_OBJ) memcpy(addr, va + page_offset, bytes); else memcpy(va + page_offset, addr, bytes); kunmap_local(va); - page_offset = 0; addr += bytes; + iova += bytes; length -= bytes; - index++; } return 0; @@ -426,9 +507,6 @@ err1: static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) { - unsigned int page_offset; - unsigned long index; - struct page *page; unsigned int bytes; int err; u8 *va; @@ -438,15 +516,17 @@ static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int leng return err; while (length > 0) { - index = rxe_mr_iova_to_index(mr, iova); - page = xa_load(&mr->page_list, index); - page_offset = rxe_mr_iova_to_page_offset(mr, iova); - if (!page) - return -EFAULT; - bytes = min_t(unsigned int, length, - mr_page_size(mr) - page_offset); + unsigned long index = rxe_mr_iova_to_index(mr, iova); + struct rxe_mr_page *info = &mr->page_info[index]; + unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); - va = kmap_local_page(page); + if (!info->page) + return -EFAULT; + + page_offset += info->offset; + bytes = min_t(unsigned int, length, PAGE_SIZE - page_offset); + + va = kmap_local_page(info->page); arch_wb_cache_pmem(va + page_offset, bytes); kunmap_local(va); @@ -501,6 +581,7 @@ enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, } else { unsigned long index; int err; + struct rxe_mr_page *info; err = mr_check_range(mr, iova, sizeof(value)); if (err) { @@ -509,9 +590,12 @@ enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, } page_offset = rxe_mr_iova_to_page_offset(mr, iova); index = rxe_mr_iova_to_index(mr, iova); - page = xa_load(&mr->page_list, index); - if (!page) + info = &mr->page_info[index]; + if (!info->page) return RESPST_ERR_RKEY_VIOLATION; + + page_offset += info->offset; + 
page = info->page; } if (unlikely(page_offset & 0x7)) { @@ -550,6 +634,7 @@ enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) } else { unsigned long index; int err; + struct rxe_mr_page *info; /* See IBA oA19-28 */ err = mr_check_range(mr, iova, sizeof(value)); @@ -559,9 +644,12 @@ enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) } page_offset = rxe_mr_iova_to_page_offset(mr, iova); index = rxe_mr_iova_to_index(mr, iova); - page = xa_load(&mr->page_list, index); - if (!page) + info = &mr->page_info[index]; + if (!info->page) return RESPST_ERR_RKEY_VIOLATION; + + page_offset += info->offset; + page = info->page; } /* See IBA A19.4.2 */ @@ -725,5 +813,5 @@ void rxe_mr_cleanup(struct rxe_pool_elem *elem) ib_umem_release(mr->umem); if (mr->ibmr.type != IB_MR_TYPE_DMA) - xa_destroy(&mr->page_list); + free_mr_page_info(mr); } diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index c928cbf2e35f..d3a54bfaf92f 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -110,7 +110,6 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, mr->access = access_flags; mr->ibmr.length = length; mr->ibmr.iova = iova; - mr->page_offset = ib_umem_offset(&umem_odp->umem); err = rxe_odp_init_pages(mr); if (err) { diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 373b03f223be..12d03f390b09 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -102,6 +102,8 @@ void rnr_nak_timer(struct timer_list *t) rxe_dbg_qp(qp, "nak timer fired\n"); + if (!rxe_get(qp)) + return; spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { /* request a send queue retry */ @@ -110,6 +112,7 @@ void rnr_nak_timer(struct timer_list *t) rxe_sched_task(&qp->send_task); } spin_unlock_irqrestore(&qp->state_lock, flags); + rxe_put(qp); } static void req_check_sq_drain_done(struct rxe_qp *qp) diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 2a234f26ac10..c9a7cd38953d 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -77,9 +77,6 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, goto err_free; } - srq->rq.queue = q; - init->attr.max_wr = srq->rq.max_wr; - if (uresp) { if (copy_to_user(&uresp->srq_num, &srq->srq_num, sizeof(uresp->srq_num))) { @@ -88,6 +85,9 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, } } + srq->rq.queue = q; + init->attr.max_wr = srq->rq.max_wr; + return 0; err_free: diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index fd48075810dd..fb149f37e91d 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -335,6 +335,11 @@ static inline int rkey_is_mw(u32 rkey) return (index >= RXE_MIN_MW_INDEX) && (index <= RXE_MAX_MW_INDEX); } +struct rxe_mr_page { + struct page *page; + unsigned int offset; /* offset in system page */ +}; + struct rxe_mr { struct rxe_pool_elem elem; struct ib_mr ibmr; @@ -347,14 +352,16 @@ struct rxe_mr { int access; atomic_t num_mw; - unsigned int page_offset; unsigned int page_shift; u64 page_mask; + /* size of page_info when mr allocated */ u32 num_buf; + /* real size of page_info */ + u32 max_allowed_buf; u32 nbuf; - struct xarray page_list; + struct rxe_mr_page *page_info; }; static inline unsigned int mr_page_size(struct rxe_mr *mr) diff --git 
a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index a10820e33887..e8a88b378d51 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -1435,7 +1435,8 @@ int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb, } if (unlikely(rv != 0 && rv != -EAGAIN)) { if ((srx->state > SIW_GET_HDR || - qp->rx_fpdu->more_ddp_segs) && run_completion) + (qp->rx_fpdu && qp->rx_fpdu->more_ddp_segs)) && + run_completion) siw_rdmap_complete(qp, rv); siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv, diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c index 4aa80c9388f0..287e0ea43287 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c @@ -439,19 +439,19 @@ int rtrs_clt_create_path_files(struct rtrs_clt_path *clt_path) clt->kobj_paths, "%s", str); if (err) { - pr_err("kobject_init_and_add: %d\n", err); + pr_err("kobject_init_and_add: %pe\n", ERR_PTR(err)); kobject_put(&clt_path->kobj); return err; } err = sysfs_create_group(&clt_path->kobj, &rtrs_clt_path_attr_group); if (err) { - pr_err("sysfs_create_group(): %d\n", err); + pr_err("sysfs_create_group(): %pe\n", ERR_PTR(err)); goto put_kobj; } err = kobject_init_and_add(&clt_path->stats->kobj_stats, &ktype_stats, &clt_path->kobj, "stats"); if (err) { - pr_err("kobject_init_and_add: %d\n", err); + pr_err("kobject_init_and_add: %pe\n", ERR_PTR(err)); kobject_put(&clt_path->stats->kobj_stats); goto remove_group; } @@ -459,7 +459,7 @@ int rtrs_clt_create_path_files(struct rtrs_clt_path *clt_path) err = sysfs_create_group(&clt_path->stats->kobj_stats, &rtrs_clt_stats_attr_group); if (err) { - pr_err("failed to create stats sysfs group, err: %d\n", err); + pr_err("failed to create stats sysfs group, err: %pe\n", ERR_PTR(err)); goto put_kobj_stats; } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 2b397a544cb9..59e30640f94a 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -422,8 +422,8 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, refcount_inc(&req->ref); err = rtrs_inv_rkey(req); if (err) { - rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %d\n", - req->mr->rkey, err); + rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %pe\n", + req->mr->rkey, ERR_PTR(err)); } else if (can_wait) { wait_for_completion(&req->inv_comp); } @@ -443,8 +443,8 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, if (errno) { rtrs_err_rl(con->c.path, - "IO %s request failed: error=%d path=%s [%s:%u] notify=%d\n", - req->dir == DMA_TO_DEVICE ? "write" : "read", errno, + "IO %s request failed: error=%pe path=%s [%s:%u] notify=%d\n", + req->dir == DMA_TO_DEVICE ? 
"write" : "read", ERR_PTR(errno), kobject_name(&clt_path->kobj), clt_path->hca_name, clt_path->hca_port, notify); } @@ -514,7 +514,7 @@ static void rtrs_clt_recv_done(struct rtrs_clt_con *con, struct ib_wc *wc) cqe); err = rtrs_iu_post_recv(&con->c, iu); if (err) { - rtrs_err(con->c.path, "post iu failed %d\n", err); + rtrs_err(con->c.path, "post iu failed %pe\n", ERR_PTR(err)); rtrs_rdma_error_recovery(con); } } @@ -659,8 +659,8 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) else err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); if (err) { - rtrs_err(con->c.path, "rtrs_post_recv_empty(): %d\n", - err); + rtrs_err(con->c.path, "rtrs_post_recv_empty(): %pe\n", + ERR_PTR(err)); rtrs_rdma_error_recovery(con); } break; @@ -731,8 +731,8 @@ static int post_recv_path(struct rtrs_clt_path *clt_path) err = post_recv_io(to_clt_con(clt_path->s.con[cid]), q_size); if (err) { - rtrs_err(clt_path->clt, "post_recv_io(), err: %d\n", - err); + rtrs_err(clt_path->clt, "post_recv_io(), err: %pe\n", + ERR_PTR(err)); return err; } } @@ -1122,8 +1122,8 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) ret = rtrs_map_sg_fr(req, count); if (ret < 0) { rtrs_err_rl(s, - "Write request failed, failed to map fast reg. data, err: %d\n", - ret); + "Write request failed, failed to map fast reg. data, err: %pe\n", + ERR_PTR(ret)); ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist, req->sg_cnt, req->dir); return ret; @@ -1150,9 +1150,9 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) imm, wr, NULL); if (ret) { rtrs_err_rl(s, - "Write request failed: error=%d path=%s [%s:%u]\n", - ret, kobject_name(&clt_path->kobj), clt_path->hca_name, - clt_path->hca_port); + "Write request failed: error=%pe path=%s [%s:%u]\n", + ERR_PTR(ret), kobject_name(&clt_path->kobj), + clt_path->hca_name, clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); if (req->mr->need_inval) { @@ -1208,8 +1208,8 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) ret = rtrs_map_sg_fr(req, count); if (ret < 0) { rtrs_err_rl(s, - "Read request failed, failed to map fast reg. data, err: %d\n", - ret); + "Read request failed, failed to map fast reg. 
data, err: %pe\n", + ERR_PTR(ret)); ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt, req->dir); return ret; @@ -1260,9 +1260,9 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) req->data_len, imm, wr); if (ret) { rtrs_err_rl(s, - "Read request failed: error=%d path=%s [%s:%u]\n", - ret, kobject_name(&clt_path->kobj), clt_path->hca_name, - clt_path->hca_port); + "Read request failed: error=%pe path=%s [%s:%u]\n", + ERR_PTR(ret), kobject_name(&clt_path->kobj), + clt_path->hca_name, clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); req->mr->need_inval = false; @@ -1359,7 +1359,9 @@ static void free_path_reqs(struct rtrs_clt_path *clt_path) static int alloc_path_reqs(struct rtrs_clt_path *clt_path) { + struct ib_device *ib_dev = clt_path->s.dev->ib_dev; struct rtrs_clt_io_req *req; + enum ib_mr_type mr_type; int i, err = -ENOMEM; clt_path->reqs = kcalloc(clt_path->queue_depth, @@ -1368,6 +1370,11 @@ static int alloc_path_reqs(struct rtrs_clt_path *clt_path) if (!clt_path->reqs) return -ENOMEM; + if (ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + for (i = 0; i < clt_path->queue_depth; ++i) { req = &clt_path->reqs[i]; req->iu = rtrs_iu_alloc(1, clt_path->max_hdr_size, GFP_KERNEL, @@ -1381,8 +1388,7 @@ static int alloc_path_reqs(struct rtrs_clt_path *clt_path) if (!req->sge) goto out; - req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd, - IB_MR_TYPE_MEM_REG, + req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd, mr_type, clt_path->max_pages_per_mr); if (IS_ERR(req->mr)) { err = PTR_ERR(req->mr); @@ -1775,12 +1781,12 @@ static int rtrs_rdma_addr_resolved(struct rtrs_clt_con *con) err = create_con_cq_qp(con); mutex_unlock(&con->con_mutex); if (err) { - rtrs_err(s, "create_con_cq_qp(), err: %d\n", err); + rtrs_err(s, "create_con_cq_qp(), err: %pe\n", ERR_PTR(err)); return err; } err = rdma_resolve_route(con->c.cm_id, RTRS_CONNECT_TIMEOUT_MS); if (err) - rtrs_err(s, "Resolving route failed, err: %d\n", err); + rtrs_err(s, "Resolving route failed, err: %pe\n", ERR_PTR(err)); return err; } @@ -1814,7 +1820,7 @@ static int rtrs_rdma_route_resolved(struct rtrs_clt_con *con) err = rdma_connect_locked(con->c.cm_id, ¶m); if (err) - rtrs_err(clt, "rdma_connect_locked(): %d\n", err); + rtrs_err(clt, "rdma_connect_locked(): %pe\n", ERR_PTR(err)); return err; } @@ -1847,8 +1853,8 @@ static int rtrs_rdma_conn_established(struct rtrs_clt_con *con, } errno = le16_to_cpu(msg->errno); if (errno) { - rtrs_err(clt, "Invalid RTRS message: errno %d\n", - errno); + rtrs_err(clt, "Invalid RTRS message: errno %pe\n", + ERR_PTR(errno)); return -ECONNRESET; } if (con->c.cid == 0) { @@ -1923,7 +1929,7 @@ static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con, struct rtrs_path *s = con->c.path; const struct rtrs_msg_conn_rsp *msg; const char *rej_msg; - int status, errno; + int status, errno = -ECONNRESET; u8 data_len; status = ev->status; @@ -1937,15 +1943,15 @@ static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con, "Previous session is still exists on the server, please reconnect later\n"); else rtrs_err(s, - "Connect rejected: status %d (%s), rtrs errno %d\n", - status, rej_msg, errno); + "Connect rejected: status %d (%s), rtrs errno %pe\n", + status, rej_msg, ERR_PTR(errno)); } else { rtrs_err(s, "Connect rejected but with malformed message: status %d (%s)\n", status, rej_msg); } - return -ECONNRESET; + return errno; } void rtrs_clt_close_conns(struct rtrs_clt_path *clt_path, 
bool wait) @@ -2009,27 +2015,53 @@ static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_ADDR_CHANGE: case RDMA_CM_EVENT_TIMEWAIT_EXIT: - rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n", - rdma_event_msg(ev->event), ev->status); + if (ev->status < 0) { + rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), ERR_PTR(ev->status)); + } else if (ev->status > 0) { + rtrs_wrn(s, "CM error (CM event: %s, err: %s)\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } cm_err = -ECONNRESET; break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: - rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n", - rdma_event_msg(ev->event), ev->status); + if (ev->status < 0) { + rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), + ERR_PTR(ev->status)); + } else if (ev->status > 0) { + rtrs_wrn(s, "CM error (CM event: %s, err: %s)\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } cm_err = -EHOSTUNREACH; break; case RDMA_CM_EVENT_DEVICE_REMOVAL: /* * Device removal is a special case. Queue close and return 0. */ - rtrs_wrn_rl(s, "CM event: %s, status: %d\n", rdma_event_msg(ev->event), - ev->status); + if (ev->status < 0) { + rtrs_wrn_rl(s, "CM event: %s, status: %pe\n", + rdma_event_msg(ev->event), + ERR_PTR(ev->status)); + } else if (ev->status > 0) { + rtrs_wrn_rl(s, "CM event: %s, status: %s\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } rtrs_clt_close_conns(clt_path, false); return 0; default: - rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %d)\n", - rdma_event_msg(ev->event), ev->status); + if (ev->status < 0) { + rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), ERR_PTR(ev->status)); + } else if (ev->status > 0) { + rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %s)\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } cm_err = -ECONNRESET; break; } @@ -2066,14 +2098,14 @@ static int create_cm(struct rtrs_clt_con *con) /* allow the port to be reused */ err = rdma_set_reuseaddr(cm_id, 1); if (err != 0) { - rtrs_err(s, "Set address reuse failed, err: %d\n", err); + rtrs_err(s, "Set address reuse failed, err: %pe\n", ERR_PTR(err)); return err; } err = rdma_resolve_addr(cm_id, (struct sockaddr *)&clt_path->s.src_addr, (struct sockaddr *)&clt_path->s.dst_addr, RTRS_CONNECT_TIMEOUT_MS); if (err) { - rtrs_err(s, "Failed to resolve address, err: %d\n", err); + rtrs_err(s, "Failed to resolve address, err: %pe\n", ERR_PTR(err)); return err; } /* @@ -2548,7 +2580,7 @@ static int rtrs_send_path_info(struct rtrs_clt_path *clt_path) /* Prepare for getting info response */ err = rtrs_iu_post_recv(&usr_con->c, rx_iu); if (err) { - rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %d\n", err); + rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %pe\n", ERR_PTR(err)); goto out; } rx_iu = NULL; @@ -2564,7 +2596,7 @@ static int rtrs_send_path_info(struct rtrs_clt_path *clt_path) /* Send info request */ err = rtrs_iu_post_send(&usr_con->c, tx_iu, sizeof(*msg), NULL); if (err) { - rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %d\n", err); + rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %pe\n", ERR_PTR(err)); goto out; } tx_iu = NULL; @@ -2615,15 +2647,15 @@ static int init_path(struct rtrs_clt_path *clt_path) err = init_conns(clt_path); if (err) { rtrs_err(clt_path->clt, - "init_conns() failed: err=%d path=%s [%s:%u]\n", err, - 
str, clt_path->hca_name, clt_path->hca_port); + "init_conns() failed: err=%pe path=%s [%s:%u]\n", + ERR_PTR(err), str, clt_path->hca_name, clt_path->hca_port); goto out; } err = rtrs_send_path_info(clt_path); if (err) { rtrs_err(clt_path->clt, - "rtrs_send_path_info() failed: err=%d path=%s [%s:%u]\n", - err, str, clt_path->hca_name, clt_path->hca_port); + "rtrs_send_path_info() failed: err=%pe path=%s [%s:%u]\n", + ERR_PTR(err), str, clt_path->hca_name, clt_path->hca_port); goto out; } rtrs_clt_path_up(clt_path); @@ -3147,8 +3179,11 @@ close_path: void rtrs_clt_ib_event_handler(struct ib_event_handler *handler, struct ib_event *ibevent) { - pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), - ibevent->event); + struct ib_device *idev = ibevent->device; + u32 port_num = ibevent->element.port_num; + + pr_info("Handling event: %s (%d). HCA name: %s, port num: %u\n", + ib_event_msg(ibevent->event), ibevent->event, idev->name, port_num); } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h index 0f57759b3080..986239ed2d3b 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h @@ -92,7 +92,6 @@ struct rtrs_permit { * rtrs_clt_io_req - describes one inflight IO request */ struct rtrs_clt_io_req { - struct list_head list; struct rtrs_iu *iu; struct scatterlist *sglist; /* list holding user data */ unsigned int sg_cnt; @@ -103,12 +102,10 @@ struct rtrs_clt_io_req { bool in_use; enum rtrs_mp_policy mp_policy; struct rtrs_clt_con *con; - struct rtrs_sg_desc *desc; struct ib_sge *sge; struct rtrs_permit *permit; enum dma_data_direction dir; void (*conf)(void *priv, int errno); - unsigned long start_jiffies; struct ib_mr *mr; struct ib_cqe inv_cqe; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index 3f305e694fe8..51727c7d710c 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -176,14 +176,14 @@ static int rtrs_srv_create_once_sysfs_root_folders(struct rtrs_srv_path *srv_pat dev_set_uevent_suppress(&srv->dev, true); err = device_add(&srv->dev); if (err) { - pr_err("device_add(): %d\n", err); + pr_err("device_add(): %pe\n", ERR_PTR(err)); put_device(&srv->dev); goto unlock; } srv->kobj_paths = kobject_create_and_add("paths", &srv->dev.kobj); if (!srv->kobj_paths) { err = -ENOMEM; - pr_err("kobject_create_and_add(): %d\n", err); + pr_err("kobject_create_and_add(): %pe\n", ERR_PTR(err)); device_del(&srv->dev); put_device(&srv->dev); goto unlock; @@ -237,14 +237,14 @@ static int rtrs_srv_create_stats_files(struct rtrs_srv_path *srv_path) err = kobject_init_and_add(&srv_path->stats->kobj_stats, &ktype_stats, &srv_path->kobj, "stats"); if (err) { - rtrs_err(s, "kobject_init_and_add(): %d\n", err); + rtrs_err(s, "kobject_init_and_add(): %pe\n", ERR_PTR(err)); kobject_put(&srv_path->stats->kobj_stats); return err; } err = sysfs_create_group(&srv_path->stats->kobj_stats, &rtrs_srv_stats_attr_group); if (err) { - rtrs_err(s, "sysfs_create_group(): %d\n", err); + rtrs_err(s, "sysfs_create_group(): %pe\n", ERR_PTR(err)); goto err; } @@ -276,12 +276,12 @@ int rtrs_srv_create_path_files(struct rtrs_srv_path *srv_path) err = kobject_init_and_add(&srv_path->kobj, &ktype, srv->kobj_paths, "%s", str); if (err) { - rtrs_err(s, "kobject_init_and_add(): %d\n", err); + rtrs_err(s, "kobject_init_and_add(): %pe\n", ERR_PTR(err)); goto destroy_root; } err = sysfs_create_group(&srv_path->kobj, 
&rtrs_srv_path_attr_group); if (err) { - rtrs_err(s, "sysfs_create_group(): %d\n", err); + rtrs_err(s, "sysfs_create_group(): %pe\n", ERR_PTR(err)); goto put_kobj; } err = rtrs_srv_create_stats_files(srv_path); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 9ecc6343455d..2e09811a10b2 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -184,7 +184,7 @@ static void rtrs_srv_reg_mr_done(struct ib_cq *cq, struct ib_wc *wc) struct rtrs_srv_path *srv_path = to_srv_path(s); if (wc->status != IB_WC_SUCCESS) { - rtrs_err(s, "REG MR failed: %s\n", + rtrs_err_rl(s, "REG MR failed: %s\n", ib_wc_status_msg(wc->status)); close_path(srv_path); return; @@ -208,7 +208,6 @@ static int rdma_write_sg(struct rtrs_srv_op *id) size_t sg_cnt; int err, offset; bool need_inval; - u32 rkey = 0; struct ib_reg_wr rwr; struct ib_sge *plist; struct ib_sge list; @@ -240,11 +239,6 @@ static int rdma_write_sg(struct rtrs_srv_op *id) wr->wr.num_sge = 1; wr->remote_addr = le64_to_cpu(id->rd_msg->desc[0].addr); wr->rkey = le32_to_cpu(id->rd_msg->desc[0].key); - if (rkey == 0) - rkey = wr->rkey; - else - /* Only one key is actually used */ - WARN_ON_ONCE(rkey != wr->rkey); wr->wr.opcode = IB_WR_RDMA_WRITE; wr->wr.wr_cqe = &io_comp_cqe; @@ -277,7 +271,7 @@ static int rdma_write_sg(struct rtrs_srv_op *id) inv_wr.opcode = IB_WR_SEND_WITH_INV; inv_wr.wr_cqe = &io_comp_cqe; inv_wr.send_flags = 0; - inv_wr.ex.invalidate_rkey = rkey; + inv_wr.ex.invalidate_rkey = wr->rkey; } imm_wr.wr.next = NULL; @@ -323,8 +317,8 @@ static int rdma_write_sg(struct rtrs_srv_op *id) err = ib_post_send(id->con->c.qp, &id->tx_wr.wr, NULL); if (err) rtrs_err(s, - "Posting RDMA-Write-Request to QP failed, err: %d\n", - err); + "Posting RDMA-Write-Request to QP failed, err: %pe\n", + ERR_PTR(err)); return err; } @@ -440,8 +434,8 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, err = ib_post_send(id->con->c.qp, wr, NULL); if (err) - rtrs_err_rl(s, "Posting RDMA-Reply to QP failed, err: %d\n", - err); + rtrs_err_rl(s, "Posting RDMA-Reply to QP failed, err: %pe\n", + ERR_PTR(err)); return err; } @@ -525,8 +519,8 @@ bool rtrs_srv_resp_rdma(struct rtrs_srv_op *id, int status) err = rdma_write_sg(id); if (err) { - rtrs_err_rl(s, "IO response failed: %d: srv_path=%s\n", err, - kobject_name(&srv_path->kobj)); + rtrs_err_rl(s, "IO response failed: %pe: srv_path=%s\n", + ERR_PTR(err), kobject_name(&srv_path->kobj)); close_path(srv_path); } out: @@ -568,13 +562,15 @@ static void unmap_cont_bufs(struct rtrs_srv_path *srv_path) static int map_cont_bufs(struct rtrs_srv_path *srv_path) { + struct ib_device *ib_dev = srv_path->s.dev->ib_dev; struct rtrs_srv_sess *srv = srv_path->srv; struct rtrs_path *ss = &srv_path->s; int i, err, mrs_num; unsigned int chunk_bits; + enum ib_mr_type mr_type; int chunks_per_mr = 1; - struct ib_mr *mr; struct sg_table *sgt; + struct ib_mr *mr; /* * Here we map queue_depth chunks to MR. 
Firstly we have to @@ -601,7 +597,7 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) srv_path->mrs_num++) { struct rtrs_srv_mr *srv_mr = &srv_path->mrs[srv_path->mrs_num]; struct scatterlist *s; - int nr, nr_sgt, chunks; + int nr, nr_sgt, chunks, ind; sgt = &srv_mr->sgt; chunks = chunks_per_mr * srv_path->mrs_num; @@ -623,15 +619,20 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) err = -EINVAL; goto free_sg; } - mr = ib_alloc_mr(srv_path->s.dev->ib_pd, IB_MR_TYPE_MEM_REG, - nr_sgt); + + if (ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + + mr = ib_alloc_mr(srv_path->s.dev->ib_pd, mr_type, nr_sgt); if (IS_ERR(mr)) { err = PTR_ERR(mr); goto unmap_sg; } nr = ib_map_mr_sg(mr, sgt->sgl, nr_sgt, NULL, max_chunk_size); - if (nr != nr_sgt) { + if (nr < nr_sgt) { err = nr < 0 ? nr : -EINVAL; goto dereg_mr; } @@ -643,13 +644,28 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) DMA_TO_DEVICE, rtrs_srv_rdma_done); if (!srv_mr->iu) { err = -ENOMEM; - rtrs_err(ss, "rtrs_iu_alloc(), err: %d\n", err); + rtrs_err(ss, "rtrs_iu_alloc(), err: %pe\n", ERR_PTR(err)); goto dereg_mr; } } - /* Eventually dma addr for each chunk can be cached */ - for_each_sg(sgt->sgl, s, nr_sgt, i) - srv_path->dma_addr[chunks + i] = sg_dma_address(s); + + /* + * Cache DMA addresses by traversing sg entries. If + * regions were merged, an inner loop is required to + * populate the DMA address array by traversing larger + * regions. + */ + ind = chunks; + for_each_sg(sgt->sgl, s, nr_sgt, i) { + unsigned int dma_len = sg_dma_len(s); + u64 dma_addr = sg_dma_address(s); + u64 dma_addr_end = dma_addr + dma_len; + + do { + srv_path->dma_addr[ind++] = dma_addr; + dma_addr += max_chunk_size; + } while (dma_addr < dma_addr_end); + } ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); srv_mr->mr = mr; @@ -804,7 +820,7 @@ static int process_info_req(struct rtrs_srv_con *con, err = post_recv_path(srv_path); if (err) { - rtrs_err(s, "post_recv_path(), err: %d\n", err); + rtrs_err(s, "post_recv_path(), err: %pe\n", ERR_PTR(err)); return err; } @@ -867,7 +883,7 @@ static int process_info_req(struct rtrs_srv_con *con, get_device(&srv_path->srv->dev); err = rtrs_srv_change_state(srv_path, RTRS_SRV_CONNECTED); if (!err) { - rtrs_err(s, "rtrs_srv_change_state(), err: %d\n", err); + rtrs_err(s, "rtrs_srv_change_state() failed\n"); goto iu_free; } @@ -881,7 +897,7 @@ static int process_info_req(struct rtrs_srv_con *con, */ err = rtrs_srv_path_up(srv_path); if (err) { - rtrs_err(s, "rtrs_srv_path_up(), err: %d\n", err); + rtrs_err(s, "rtrs_srv_path_up(), err: %pe\n", ERR_PTR(err)); goto iu_free; } @@ -889,10 +905,16 @@ static int process_info_req(struct rtrs_srv_con *con, tx_iu->dma_addr, tx_iu->size, DMA_TO_DEVICE); + /* + * Now disable zombie connection closing. Since from the logs and code, + * we know that it can never be in CONNECTED state. 
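+ * Clearing connection_timeout makes rtrs_srv_check_close_path() skip
+ * this path, since the zombie check only applies while the timestamp is
+ * set and the path is still in RTRS_SRV_CONNECTING state.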
+ */ + srv_path->connection_timeout = 0; + /* Send info response */ err = rtrs_iu_post_send(&con->c, tx_iu, tx_sz, reg_wr); if (err) { - rtrs_err(s, "rtrs_iu_post_send(), err: %d\n", err); + rtrs_err(s, "rtrs_iu_post_send(), err: %pe\n", ERR_PTR(err)); iu_free: rtrs_iu_free(tx_iu, srv_path->s.dev->ib_dev, 1); } @@ -960,7 +982,7 @@ static int post_recv_info_req(struct rtrs_srv_con *con) /* Prepare for getting info response */ err = rtrs_iu_post_recv(&con->c, rx_iu); if (err) { - rtrs_err(s, "rtrs_iu_post_recv(), err: %d\n", err); + rtrs_err(s, "rtrs_iu_post_recv(), err: %pe\n", ERR_PTR(err)); rtrs_iu_free(rx_iu, srv_path->s.dev->ib_dev, 1); return err; } @@ -1006,7 +1028,7 @@ static int post_recv_path(struct rtrs_srv_path *srv_path) err = post_recv_io(to_srv_con(srv_path->s.con[cid]), q_size); if (err) { - rtrs_err(s, "post_recv_io(), err: %d\n", err); + rtrs_err(s, "post_recv_io(), err: %pe\n", ERR_PTR(err)); return err; } } @@ -1054,8 +1076,8 @@ static void process_read(struct rtrs_srv_con *con, if (ret) { rtrs_err_rl(s, - "Processing read request failed, user module cb reported for msg_id %d, err: %d\n", - buf_id, ret); + "Processing read request failed, user module cb reported for msg_id %d, err: %pe\n", + buf_id, ERR_PTR(ret)); goto send_err_msg; } @@ -1065,8 +1087,8 @@ send_err_msg: ret = send_io_resp_imm(con, id, ret); if (ret < 0) { rtrs_err_rl(s, - "Sending err msg for failed RDMA-Write-Req failed, msg_id %d, err: %d\n", - buf_id, ret); + "Sending err msg for failed RDMA-Write-Req failed, msg_id %d, err: %pe\n", + buf_id, ERR_PTR(ret)); close_path(srv_path); } rtrs_srv_put_ops_ids(srv_path); @@ -1106,8 +1128,8 @@ static void process_write(struct rtrs_srv_con *con, data + data_len, usr_len); if (ret) { rtrs_err_rl(s, - "Processing write request failed, user module callback reports err: %d\n", - ret); + "Processing write request failed, user module callback reports err: %pe\n", + ERR_PTR(ret)); goto send_err_msg; } @@ -1117,8 +1139,8 @@ send_err_msg: ret = send_io_resp_imm(con, id, ret); if (ret < 0) { rtrs_err_rl(s, - "Processing write request failed, sending I/O response failed, msg_id %d, err: %d\n", - buf_id, ret); + "Processing write request failed, sending I/O response failed, msg_id %d, err: %pe\n", + buf_id, ERR_PTR(ret)); close_path(srv_path); } rtrs_srv_put_ops_ids(srv_path); @@ -1248,7 +1270,8 @@ static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc) srv_path->s.hb_missed_cnt = 0; err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); if (err) { - rtrs_err(s, "rtrs_post_recv(), err: %d\n", err); + rtrs_err(s, "rtrs_post_recv(), err: %pe\n", + ERR_PTR(err)); close_path(srv_path); break; } @@ -1273,8 +1296,8 @@ static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc) mr->msg_id = msg_id; err = rtrs_srv_inv_rkey(con, mr); if (err) { - rtrs_err(s, "rtrs_post_recv(), err: %d\n", - err); + rtrs_err(s, "rtrs_post_recv(), err: %pe\n", + ERR_PTR(err)); close_path(srv_path); break; } @@ -1514,17 +1537,38 @@ static int sockaddr_cmp(const struct sockaddr *a, const struct sockaddr *b) } } +/* Let's close connections which have been waiting for more than 30 seconds */ +#define RTRS_MAX_CONN_TIMEOUT 30000 + +static void rtrs_srv_check_close_path(struct rtrs_srv_path *srv_path) +{ + struct rtrs_path *s = &srv_path->s; + + if (srv_path->state == RTRS_SRV_CONNECTING && srv_path->connection_timeout && + (jiffies_to_msecs(jiffies - srv_path->connection_timeout) > RTRS_MAX_CONN_TIMEOUT)) { + rtrs_err(s, "Closing zombie path\n"); + close_path(srv_path); + } +} + static bool 
__is_path_w_addr_exists(struct rtrs_srv_sess *srv, struct rdma_addr *addr) { struct rtrs_srv_path *srv_path; - list_for_each_entry(srv_path, &srv->paths_list, s.entry) + list_for_each_entry(srv_path, &srv->paths_list, s.entry) { if (!sockaddr_cmp((struct sockaddr *)&srv_path->s.dst_addr, (struct sockaddr *)&addr->dst_addr) && !sockaddr_cmp((struct sockaddr *)&srv_path->s.src_addr, - (struct sockaddr *)&addr->src_addr)) + (struct sockaddr *)&addr->src_addr)) { + rtrs_err((&srv_path->s), + "Path (%s) with same addr exists (lifetime %u)\n", + rtrs_srv_state_str(srv_path->state), + (jiffies_to_msecs(jiffies - srv_path->connection_timeout))); + rtrs_srv_check_close_path(srv_path); return true; + } + } return false; } @@ -1623,7 +1667,7 @@ static int rtrs_rdma_do_accept(struct rtrs_srv_path *srv_path, err = rdma_accept(cm_id, ¶m); if (err) - pr_err("rdma_accept(), err: %d\n", err); + pr_err("rdma_accept(), err: %pe\n", ERR_PTR(err)); return err; } @@ -1641,7 +1685,7 @@ static int rtrs_rdma_do_reject(struct rdma_cm_id *cm_id, int errno) err = rdma_reject(cm_id, &msg, sizeof(msg), IB_CM_REJ_CONSUMER_DEFINED); if (err) - pr_err("rdma_reject(), err: %d\n", err); + pr_err("rdma_reject(), err: %pe\n", ERR_PTR(err)); /* Bounce errno back */ return errno; @@ -1717,7 +1761,7 @@ static int create_con(struct rtrs_srv_path *srv_path, max_send_wr, max_recv_wr, IB_POLL_WORKQUEUE); if (err) { - rtrs_err(s, "rtrs_cq_qp_create(), err: %d\n", err); + rtrs_err(s, "rtrs_cq_qp_create(), err: %pe\n", ERR_PTR(err)); goto free_con; } if (con->c.cid == 0) { @@ -1762,7 +1806,6 @@ static struct rtrs_srv_path *__alloc_path(struct rtrs_srv_sess *srv, } if (__is_path_w_addr_exists(srv, &cm_id->route.addr)) { err = -EEXIST; - pr_err("Path with same addr exists\n"); goto err; } srv_path = kzalloc(sizeof(*srv_path), GFP_KERNEL); @@ -1809,6 +1852,7 @@ static struct rtrs_srv_path *__alloc_path(struct rtrs_srv_sess *srv, spin_lock_init(&srv_path->state_lock); INIT_WORK(&srv_path->close_work, rtrs_srv_close_work); rtrs_srv_init_hb(srv_path); + srv_path->connection_timeout = 0; srv_path->s.dev = rtrs_ib_dev_find_or_add(cm_id->device, &dev_pd); if (!srv_path->s.dev) { @@ -1914,8 +1958,10 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id, goto reject_w_err; } if (s->con[cid]) { - rtrs_err(s, "Connection already exists: %d\n", - cid); + rtrs_err(s, "Connection (%s) already exists: %d (lifetime %u)\n", + rtrs_srv_state_str(srv_path->state), cid, + (jiffies_to_msecs(jiffies - srv_path->connection_timeout))); + rtrs_srv_check_close_path(srv_path); mutex_unlock(&srv->paths_mutex); goto reject_w_err; } @@ -1930,9 +1976,15 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id, goto reject_w_err; } } + + /* + * Start of any connection creation resets the timeout for the path. 
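+ * rtrs_srv_check_close_path() compares this timestamp against
+ * RTRS_MAX_CONN_TIMEOUT to decide whether a path stuck in
+ * RTRS_SRV_CONNECTING should be closed as a zombie.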
+ */ + srv_path->connection_timeout = jiffies; + err = create_con(srv_path, cm_id, cid); if (err) { - rtrs_err((&srv_path->s), "create_con(), error %d\n", err); + rtrs_err((&srv_path->s), "create_con(), error %pe\n", ERR_PTR(err)); rtrs_rdma_do_reject(cm_id, err); /* * Since session has other connections we follow normal way @@ -1943,7 +1995,8 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id, } err = rtrs_rdma_do_accept(srv_path, cm_id); if (err) { - rtrs_err((&srv_path->s), "rtrs_rdma_do_accept(), error %d\n", err); + rtrs_err((&srv_path->s), "rtrs_rdma_do_accept(), error %pe\n", + ERR_PTR(err)); rtrs_rdma_do_reject(cm_id, err); /* * Since current connection was successfully added to the @@ -1994,8 +2047,15 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_REJECTED: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: - rtrs_err(s, "CM error (CM event: %s, err: %d)\n", - rdma_event_msg(ev->event), ev->status); + if (ev->status < 0) { + rtrs_err(s, "CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), + ERR_PTR(ev->status)); + } else if (ev->status > 0) { + rtrs_err(s, "CM error (CM event: %s, err: %s)\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } fallthrough; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_ADDR_CHANGE: @@ -2004,8 +2064,15 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id, close_path(srv_path); break; default: - pr_err("Ignoring unexpected CM event %s, err %d\n", - rdma_event_msg(ev->event), ev->status); + if (ev->status < 0) { + pr_err("Ignoring unexpected CM event %s, err %pe\n", + rdma_event_msg(ev->event), + ERR_PTR(ev->status)); + } else if (ev->status > 0) { + pr_err("Ignoring unexpected CM event %s, err %s\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } break; } @@ -2029,13 +2096,13 @@ static struct rdma_cm_id *rtrs_srv_cm_init(struct rtrs_srv_ctx *ctx, } ret = rdma_bind_addr(cm_id, addr); if (ret) { - pr_err("Binding RDMA address failed, err: %d\n", ret); + pr_err("Binding RDMA address failed, err: %pe\n", ERR_PTR(ret)); goto err_cm; } ret = rdma_listen(cm_id, 64); if (ret) { - pr_err("Listening on RDMA connection failed, err: %d\n", - ret); + pr_err("Listening on RDMA connection failed, err: %pe\n", + ERR_PTR(ret)); goto err_cm; } @@ -2275,8 +2342,11 @@ static int check_module_params(void) void rtrs_srv_ib_event_handler(struct ib_event_handler *handler, struct ib_event *ibevent) { - pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), - ibevent->event); + struct ib_device *idev = ibevent->device; + u32 port_num = ibevent->element.port_num; + + pr_info("Handling event: %s (%d). 
HCA name: %s, port num: %u\n", + ib_event_msg(ibevent->event), ibevent->event, idev->name, port_num); } static int rtrs_srv_ib_dev_init(struct rtrs_ib_dev *dev) @@ -2313,8 +2383,8 @@ static int __init rtrs_server_init(void) err = check_module_params(); if (err) { - pr_err("Failed to load module, invalid module parameters, err: %d\n", - err); + pr_err("Failed to load module, invalid module parameters, err: %pe\n", + ERR_PTR(err)); return err; } err = class_register(&rtrs_dev_class); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.h b/drivers/infiniband/ulp/rtrs/rtrs-srv.h index 014f85681f37..3d36876527f5 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.h @@ -89,6 +89,7 @@ struct rtrs_srv_path { unsigned int mem_bits; struct kobject kobj; struct rtrs_srv_stats *stats; + unsigned long connection_timeout; }; static inline struct rtrs_srv_path *to_srv_path(struct rtrs_path *s) diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c index bf38ac6f87c4..bc1208ae8216 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -273,7 +273,8 @@ static int create_qp(struct rtrs_con *con, struct ib_pd *pd, ret = rdma_create_qp(cm_id, pd, &init_attr); if (ret) { - rtrs_err(con->path, "Creating QP failed, err: %d\n", ret); + rtrs_err(con->path, "Creating QP failed, err: %pe\n", + ERR_PTR(ret)); return ret; } con->qp = cm_id->qp; @@ -341,7 +342,8 @@ void rtrs_send_hb_ack(struct rtrs_path *path) err = rtrs_post_rdma_write_imm_empty(usr_con, path->hb_cqe, imm, NULL); if (err) { - rtrs_err(path, "send HB ACK failed, errno: %d\n", err); + rtrs_err(path, "send HB ACK failed, errno: %pe\n", + ERR_PTR(err)); path->hb_err_handler(usr_con); return; } @@ -375,7 +377,8 @@ static void hb_work(struct work_struct *work) err = rtrs_post_rdma_write_imm_empty(usr_con, path->hb_cqe, imm, NULL); if (err) { - rtrs_err(path, "HB send failed, errno: %d\n", err); + rtrs_err(path, "HB send failed, errno: %pe\n", + ERR_PTR(err)); path->hb_err_handler(usr_con); return; } diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index a59bd4035a99..766f4fb25e26 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -35,6 +35,8 @@ enum gdma_request_type { GDMA_CREATE_MR = 31, GDMA_DESTROY_MR = 32, GDMA_QUERY_HWC_TIMEOUT = 84, /* 0x54 */ + GDMA_ALLOC_DM = 96, /* 0x60 */ + GDMA_DESTROY_DM = 97, /* 0x61 */ }; #define GDMA_RESOURCE_DOORBELL_PAGE 27 @@ -866,6 +868,8 @@ enum gdma_mr_type { GDMA_MR_TYPE_GVA = 2, /* Guest zero-based address MRs */ GDMA_MR_TYPE_ZBVA = 4, + /* Device address MRs */ + GDMA_MR_TYPE_DM = 5, }; struct gdma_create_mr_params { @@ -881,6 +885,12 @@ struct gdma_create_mr_params { u64 dma_region_handle; enum gdma_mr_access_flags access_flags; } zbva; + struct { + u64 dm_handle; + u64 offset; + u64 length; + enum gdma_mr_access_flags access_flags; + } da; }; }; @@ -895,13 +905,23 @@ struct gdma_create_mr_request { u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; - } gva; + } __packed gva; struct { u64 dma_region_handle; enum gdma_mr_access_flags access_flags; - } zbva; - }; + } __packed zbva; + struct { + u64 dm_handle; + u64 offset; + enum gdma_mr_access_flags access_flags; + } __packed da; + } __packed; u32 reserved_2; + union { + struct { + u64 length; + } da_ext; + }; };/* HW DATA */ struct gdma_create_mr_response { @@ -920,6 +940,27 @@ struct gdma_destroy_mr_response { struct gdma_resp_hdr hdr; };/* HW DATA */ +struct gdma_alloc_dm_req { + struct 
gdma_req_hdr hdr; + u64 length; + u32 alignment; + u32 flags; +}; /* HW Data */ + +struct gdma_alloc_dm_resp { + struct gdma_resp_hdr hdr; + u64 dm_handle; +}; /* HW Data */ + +struct gdma_destroy_dm_req { + struct gdma_req_hdr hdr; + u64 dm_handle; +}; /* HW Data */ + +struct gdma_destroy_dm_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + int mana_gd_verify_vf_version(struct pci_dev *pdev); int mana_gd_register_device(struct gdma_dev *gd); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6aad66bc5dd7..3f3827e1c711 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ #include #include #include +#include #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN @@ -764,6 +766,7 @@ enum ib_event_type { IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, IB_EVENT_WQ_FATAL, + IB_EVENT_DEVICE_SPEED_CHANGE, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); @@ -877,6 +880,20 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); +struct ib_port_speed_info { + const char *str; + int rate; /* in deci-Gb/sec (100 MBps units) */ +}; + +/** + * ib_port_attr_to_speed_info - Convert port attributes to speed information + * @attr: Port attributes containing active_speed and active_width + * @speed_info: Speed information to return + * + * Returns 0 on success, -EINVAL on error. + */ +int ib_port_attr_to_speed_info(struct ib_port_attr *attr, + struct ib_port_speed_info *speed_info); /** * enum ib_mr_type - memory region type @@ -2348,6 +2365,9 @@ struct rdma_user_mmap_entry { unsigned long start_pgoff; size_t npages; bool driver_removed; + /* protects access to dmabufs */ + struct mutex dmabufs_lock; + struct list_head dmabufs; }; /* Return the offset (in bytes) the user should pass to libc's mmap() */ @@ -2403,6 +2423,8 @@ struct ib_device_ops { int comp_vector); int (*query_port)(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); + int (*query_port_speed)(struct ib_device *device, u32 port_num, + u64 *speed); int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); @@ -2483,6 +2505,11 @@ struct ib_device_ops { * Therefore needs to be implemented by the driver in mmap_free. */ void (*mmap_free)(struct rdma_user_mmap_entry *entry); + int (*mmap_get_pfns)(struct rdma_user_mmap_entry *entry, + struct phys_vec *phys_vec, + struct p2pdma_provider **provider); + struct rdma_user_mmap_entry *(*pgoff_to_mmap_entry)(struct ib_ucontext *ucontext, + off_t pg_off); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata); @@ -4249,6 +4276,47 @@ static inline void ib_dma_unmap_page(struct ib_device *dev, dma_unmap_page(dev->dma_device, addr, size, direction); } +/** + * ib_dma_map_bvec - Map a bio_vec to DMA address + * @dev: The device for which the dma_addr is to be created + * @bvec: The bio_vec to map + * @direction: The direction of the DMA + * + * Returns a DMA address for the bio_vec. The caller must check the + * result with ib_dma_mapping_error() before use; a failed mapping + * must not be passed to ib_dma_unmap_bvec(). + * + * For software RDMA devices (rxe, siw), returns a virtual address + * and no actual DMA mapping occurs. 
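+ *
+ * Minimal usage sketch (illustrative only; the bvec, PD and work
+ * request setup are assumed to exist elsewhere):
+ *
+ *	u64 addr = ib_dma_map_bvec(dev, &bvec, DMA_TO_DEVICE);
+ *
+ *	if (ib_dma_mapping_error(dev, addr))
+ *		return -ENOMEM;
+ *	sge.addr   = addr;
+ *	sge.length = bvec.bv_len;
+ *	sge.lkey   = pd->local_dma_lkey;
+ *	... post the work request and wait for completion ...
+ *	ib_dma_unmap_bvec(dev, addr, bvec.bv_len, DMA_TO_DEVICE);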
+ */ +static inline u64 ib_dma_map_bvec(struct ib_device *dev, + struct bio_vec *bvec, + enum dma_data_direction direction) +{ + if (ib_uses_virt_dma(dev)) + return (uintptr_t)bvec_virt(bvec); + return dma_map_phys(dev->dma_device, bvec_phys(bvec), + bvec->bv_len, direction, 0); +} + +/** + * ib_dma_unmap_bvec - Unmap a bio_vec DMA mapping + * @dev: The device for which the DMA address was created + * @addr: The DMA address returned by ib_dma_map_bvec() + * @size: The size of the region in bytes + * @direction: The direction of the DMA + * + * Releases a DMA mapping created by ib_dma_map_bvec(). For software + * RDMA devices this is a no-op since no actual mapping occurred. + */ +static inline void ib_dma_unmap_bvec(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + if (!ib_uses_virt_dma(dev)) + dma_unmap_phys(dev->dma_device, addr, size, direction, 0); +} + int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents); static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, @@ -4545,8 +4613,6 @@ static inline bool ib_device_try_get(struct ib_device *dev) void ib_device_put(struct ib_device *device); struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id); -struct ib_device *ib_device_get_by_name(const char *name, - enum rdma_driver_id driver_id); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); diff --git a/include/rdma/rw.h b/include/rdma/rw.h index d606cac48233..6a1d08614e09 100644 --- a/include/rdma/rw.h +++ b/include/rdma/rw.h @@ -5,6 +5,7 @@ #ifndef _RDMA_RW_H #define _RDMA_RW_H +#include #include #include #include @@ -31,6 +32,14 @@ struct rdma_rw_ctx { struct ib_rdma_wr *wrs; } map; + /* for IOVA-based mapping of bvecs into contiguous DMA range: */ + struct { + struct dma_iova_state state; + struct ib_sge sge; + struct ib_rdma_wr wr; + size_t mapped_len; + } iova; + /* for registering multiple WRs: */ struct rdma_rw_reg_ctx { struct ib_sge sge; @@ -38,6 +47,7 @@ struct rdma_rw_ctx { struct ib_reg_wr reg_wr; struct ib_send_wr inv_wr; struct ib_mr *mr; + struct sg_table sgt; } *reg; }; }; @@ -49,6 +59,16 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir); +struct bio_vec; + +int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, + struct bvec_iter iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir); +void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, + enum dma_data_direction dir); + int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, @@ -66,6 +86,8 @@ int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, unsigned int maxpages); +unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num, + unsigned int max_rdma_ctxs, u32 create_flags); void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr); int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr); void rdma_rw_cleanup_mrs(struct ib_qp *qp); diff --git a/include/rdma/uverbs_types.h 
b/include/rdma/uverbs_types.h index 26ba919ac245..6a253b7dc5ea 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -186,6 +186,7 @@ struct ib_uverbs_file { extern const struct uverbs_obj_type_class uverbs_idr_class; extern const struct uverbs_obj_type_class uverbs_fd_class; int uverbs_uobject_fd_release(struct inode *inode, struct file *filp); +int uverbs_uobject_release(struct ib_uobject *uobj); #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ sizeof(char)) diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index faa9d62b3b30..f24edf1c75eb 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -56,6 +56,7 @@ enum { BNXT_RE_UCNTX_CMASK_DBR_PACING_ENABLED = 0x08ULL, BNXT_RE_UCNTX_CMASK_POW2_DISABLED = 0x10ULL, BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED = 0x40, + BNXT_RE_UCNTX_CMASK_QP_RATE_LIMIT_ENABLED = 0x80ULL, }; enum bnxt_re_wqe_mode { @@ -215,4 +216,19 @@ enum bnxt_re_toggle_mem_methods { BNXT_RE_METHOD_GET_TOGGLE_MEM = (1U << UVERBS_ID_NS_SHIFT), BNXT_RE_METHOD_RELEASE_TOGGLE_MEM, }; + +struct bnxt_re_packet_pacing_caps { + __u32 qp_rate_limit_min; + __u32 qp_rate_limit_max; /* In kbps */ + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_RC + */ + __u32 supported_qpts; + __u32 reserved; +}; + +struct bnxt_re_query_device_ex_resp { + struct bnxt_re_packet_pacing_caps packet_pacing_caps; +}; #endif /* __BNXT_RE_UVERBS_ABI_H__*/ diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index de6f5a94f1e3..72041c1b0ea5 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -56,6 +56,7 @@ enum uverbs_default_objects { UVERBS_OBJECT_COUNTERS, UVERBS_OBJECT_ASYNC_EVENT, UVERBS_OBJECT_DMAH, + UVERBS_OBJECT_DMABUF, }; enum { @@ -73,6 +74,7 @@ enum uverbs_methods_device { UVERBS_METHOD_QUERY_CONTEXT, UVERBS_METHOD_QUERY_GID_TABLE, UVERBS_METHOD_QUERY_GID_ENTRY, + UVERBS_METHOD_QUERY_PORT_SPEED, }; enum uverbs_attrs_invoke_write_cmd_attr_ids { @@ -86,6 +88,11 @@ enum uverbs_attrs_query_port_cmd_attr_ids { UVERBS_ATTR_QUERY_PORT_RESP, }; +enum uverbs_attrs_query_port_speed_cmd_attr_ids { + UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM, + UVERBS_ATTR_QUERY_PORT_SPEED_RESP, +}; + enum uverbs_attrs_get_context_attr_ids { UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, @@ -257,6 +264,15 @@ enum uverbs_methods_dmah { UVERBS_METHOD_DMAH_FREE, }; +enum uverbs_attrs_alloc_dmabuf_cmd_attr_ids { + UVERBS_ATTR_ALLOC_DMABUF_HANDLE, + UVERBS_ATTR_ALLOC_DMABUF_PGOFF, +}; + +enum uverbs_methods_dmabuf { + UVERBS_METHOD_DMABUF_ALLOC, +}; + enum uverbs_attrs_reg_dm_mr_cmd_attr_ids { UVERBS_ATTR_REG_DM_MR_HANDLE, UVERBS_ATTR_REG_DM_MR_OFFSET, diff --git a/include/uapi/rdma/mana-abi.h b/include/uapi/rdma/mana-abi.h index 45c2df619f07..a75bf32b8cfb 100644 --- a/include/uapi/rdma/mana-abi.h +++ b/include/uapi/rdma/mana-abi.h @@ -17,6 +17,9 @@ #define MANA_IB_UVERBS_ABI_VERSION 1 enum mana_ib_create_cq_flags { + /* Reserved for backward compatibility. Legacy + * kernel versions use it to create CQs in RNIC + */ MANA_IB_CREATE_RNIC_CQ = 1 << 0, }; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 310de7a80be5..4ec2f9ae06aa 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -5,6 +5,8 @@ * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. 
*/ +#include +#include #include #include @@ -20,30 +22,33 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); /* Each R/W context contains state for one chain of RDMA Read or * Write Work Requests. * - * Each WR chain handles a single contiguous server-side buffer, - * because scatterlist entries after the first have to start on - * page alignment. xdr_buf iovecs cannot guarantee alignment. + * Each WR chain handles a single contiguous server-side buffer. + * - each xdr_buf iovec is a single contiguous buffer + * - the xdr_buf pages array is a single contiguous buffer because the + * second through the last element always start on a page boundary * * Each WR chain handles only one R_key. Each RPC-over-RDMA segment * from a client may contain a unique R_key, so each WR chain moves * up to one segment at a time. * - * The scatterlist makes this data structure over 4KB in size. To - * make it less likely to fail, and to handle the allocation for - * smaller I/O requests without disabling bottom-halves, these - * contexts are created on demand, but cached and reused until the - * controlling svcxprt_rdma is destroyed. + * The inline bvec array is sized to handle most I/O requests without + * additional allocation. Larger requests fall back to dynamic allocation. + * These contexts are created on demand, but cached and reused until + * the controlling svcxprt_rdma is destroyed. */ struct svc_rdma_rw_ctxt { struct llist_node rw_node; struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; - unsigned int rw_first_sgl_nents; - struct sg_table rw_sg_table; - struct scatterlist rw_first_sgl[]; + unsigned int rw_first_bvec_nents; + struct bio_vec *rw_bvec; + struct bio_vec rw_first_bvec[]; }; +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt); + static inline struct svc_rdma_rw_ctxt * svc_rdma_next_ctxt(struct list_head *list) { @@ -52,10 +57,10 @@ svc_rdma_next_ctxt(struct list_head *list) } static struct svc_rdma_rw_ctxt * -svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) +svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec) { struct ib_device *dev = rdma->sc_cm_id->device; - unsigned int first_sgl_nents = dev->attrs.max_send_sge; + unsigned int first_bvec_nents = dev->attrs.max_send_sge; struct svc_rdma_rw_ctxt *ctxt; struct llist_node *node; @@ -65,33 +70,44 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) if (node) { ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents), + ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec, + first_bvec_nents), GFP_KERNEL, ibdev_to_node(dev)); if (!ctxt) goto out_noctx; INIT_LIST_HEAD(&ctxt->rw_list); - ctxt->rw_first_sgl_nents = first_sgl_nents; + ctxt->rw_first_bvec_nents = first_bvec_nents; } - ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; - if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, - ctxt->rw_sg_table.sgl, - first_sgl_nents)) - goto out_free; + if (nr_bvec <= ctxt->rw_first_bvec_nents) { + ctxt->rw_bvec = ctxt->rw_first_bvec; + } else { + ctxt->rw_bvec = kmalloc_array_node(nr_bvec, + sizeof(*ctxt->rw_bvec), + GFP_KERNEL, + ibdev_to_node(dev)); + if (!ctxt->rw_bvec) + goto out_free; + } return ctxt; out_free: - kfree(ctxt); + /* Return cached contexts to cache; free freshly allocated ones */ + if (node) + svc_rdma_put_rw_ctxt(rdma, ctxt); + else + kfree(ctxt); out_noctx: - trace_svcrdma_rwctx_empty(rdma, sges); + 
trace_svcrdma_rwctx_empty(rdma, nr_bvec); return NULL; } static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, struct llist_head *list) { - sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents); + if (ctxt->rw_bvec != ctxt->rw_first_bvec) + kfree(ctxt->rw_bvec); llist_add(&ctxt->rw_node, list); } @@ -123,6 +139,7 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) * @ctxt: R/W context to prepare * @offset: RDMA offset * @handle: RDMA tag/handle + * @length: total number of bytes in the bvec array * @direction: I/O direction * * Returns on success, the number of WQEs that will be needed @@ -130,14 +147,18 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) */ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, struct svc_rdma_rw_ctxt *ctxt, - u64 offset, u32 handle, + u64 offset, u32 handle, unsigned int length, enum dma_data_direction direction) { + struct bvec_iter iter = { + .bi_size = length, + }; int ret; - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, - ctxt->rw_sg_table.sgl, ctxt->rw_nents, - 0, offset, handle, direction); + ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, + ctxt->rw_bvec, ctxt->rw_nents, + iter, offset, handle, direction); if (unlikely(ret < 0)) { trace_svcrdma_dma_map_rw_err(rdma, offset, handle, ctxt->rw_nents, ret); @@ -175,7 +196,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, { struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; - LLIST_HEAD(free); trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount); @@ -183,10 +203,11 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); - rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, ctxt->rw_sg_table.sgl, - ctxt->rw_nents, dir); - __svc_rdma_put_rw_ctxt(ctxt, &free); + rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, + ctxt->rw_bvec, ctxt->rw_nents, dir); + if (ctxt->rw_bvec != ctxt->rw_first_bvec) + kfree(ctxt->rw_bvec); ctxt->rw_node.next = first; first = &ctxt->rw_node; @@ -414,29 +435,26 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma, return -ENOTCONN; } -/* Build and DMA-map an SGL that covers one kvec in an xdr_buf +/* Build a bvec that covers one kvec in an xdr_buf. */ -static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, - unsigned int len, - struct svc_rdma_rw_ctxt *ctxt) +static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt) { - struct scatterlist *sg = ctxt->rw_sg_table.sgl; - - sg_set_buf(&sg[0], info->wi_base, len); + bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len); info->wi_base += len; ctxt->rw_nents = 1; } -/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. +/* Build a bvec array that covers part of an xdr_buf's pagelist. 
*/ -static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, - unsigned int remaining, - struct svc_rdma_rw_ctxt *ctxt) +static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info, + unsigned int remaining, + struct svc_rdma_rw_ctxt *ctxt) { - unsigned int sge_no, sge_bytes, page_off, page_no; + unsigned int bvec_idx, bvec_len, page_off, page_no; const struct xdr_buf *xdr = info->wi_xdr; - struct scatterlist *sg; struct page **page; page_off = info->wi_next_off + xdr->page_base; @@ -444,21 +462,19 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, page_off = offset_in_page(page_off); page = xdr->pages + page_no; info->wi_next_off += remaining; - sg = ctxt->rw_sg_table.sgl; - sge_no = 0; + bvec_idx = 0; do { - sge_bytes = min_t(unsigned int, remaining, - PAGE_SIZE - page_off); - sg_set_page(sg, *page, sge_bytes, page_off); - - remaining -= sge_bytes; - sg = sg_next(sg); + bvec_len = min_t(unsigned int, remaining, + PAGE_SIZE - page_off); + bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, bvec_len, + page_off); + remaining -= bvec_len; page_off = 0; - sge_no++; + bvec_idx++; page++; } while (remaining); - ctxt->rw_nents = sge_no; + ctxt->rw_nents = bvec_idx; } /* Construct RDMA Write WRs to send a portion of an xdr_buf containing @@ -496,7 +512,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, constructor(info, write_len, ctxt); offset = seg->rs_offset + info->wi_seg_off; ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, - DMA_TO_DEVICE); + write_len, DMA_TO_DEVICE); if (ret < 0) return -EIO; percpu_counter_inc(&svcrdma_stat_write); @@ -535,7 +551,7 @@ static int svc_rdma_iov_write(struct svc_rdma_write_info *info, const struct kvec *iov) { info->wi_base = iov->iov_base; - return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, + return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec, iov->iov_len); } @@ -559,7 +575,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info, { info->wi_xdr = xdr; info->wi_next_off = offset - xdr->head[0].iov_len; - return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec, length); } @@ -734,29 +750,29 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, { struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; - unsigned int sge_no, seg_len, len; + unsigned int bvec_idx, nr_bvec, seg_len, len, total; struct svc_rdma_rw_ctxt *ctxt; - struct scatterlist *sg; int ret; len = segment->rs_length; - sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT; - ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no); + if (check_add_overflow(head->rc_pageoff, len, &total)) + return -EINVAL; + nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT; + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec); if (!ctxt) return -ENOMEM; - ctxt->rw_nents = sge_no; + ctxt->rw_nents = nr_bvec; - sg = ctxt->rw_sg_table.sgl; - for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { + for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) { seg_len = min_t(unsigned int, len, PAGE_SIZE - head->rc_pageoff); if (!head->rc_pageoff) head->rc_page_count++; - sg_set_page(sg, rqstp->rq_pages[head->rc_curpage], - seg_len, head->rc_pageoff); - sg = sg_next(sg); + bvec_set_page(&ctxt->rw_bvec[bvec_idx], + rqstp->rq_pages[head->rc_curpage], + seg_len, head->rc_pageoff); head->rc_pageoff += seg_len; if (head->rc_pageoff == PAGE_SIZE) { @@ -770,7 +786,8 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, } ret = 
svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, - segment->rs_handle, DMA_FROM_DEVICE); + segment->rs_handle, segment->rs_length, + DMA_FROM_DEVICE); if (ret < 0) return -EIO; percpu_counter_inc(&svcrdma_stat_read); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b7b318ad25c4..9b623849723e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -462,7 +462,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_bc_requests = 2; } - /* Arbitrary estimate of the needed number of rdma_rw contexts. + /* Estimate the needed number of rdma_rw contexts. The maximum + * Read and Write chunks have one segment each. Each request + * can involve one Read chunk and either a Write chunk or Reply + * chunk; thus a factor of three. */ maxpayload = min(xprt->xpt_server->sv_max_payload, RPCSVC_MAXPAYLOAD_RDMA); @@ -470,7 +473,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) rdma_rw_mr_factor(dev, newxprt->sc_port_num, maxpayload >> PAGE_SHIFT); - newxprt->sc_sq_depth = rq_depth + ctxts; + newxprt->sc_sq_depth = rq_depth + + rdma_rw_max_send_wr(dev, newxprt->sc_port_num, ctxts, 0); if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
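
Illustrative usage sketch (not part of the patch): how a ULP might drive the bvec-based R/W API added to include/rdma/rw.h above. The my_write_ctx structure, the my_write_post()/my_write_done() helpers, and the page-array layout are hypothetical; only rdma_rw_ctx_init_bvec(), rdma_rw_ctx_post(), and rdma_rw_ctx_destroy_bvec() come from this series, and keeping the context alive until the send completion runs mirrors how svc_rdma holds its rw_ctxt until svc_rdma_cc_release().

#include <linux/bvec.h>
#include <linux/slab.h>
#include <rdma/rw.h>

struct my_write_ctx {
	struct rdma_rw_ctx	rw_ctx;
	struct bio_vec		*bvecs;
	unsigned int		nr_bvec;
};

/* Build one bvec per page and post an RDMA Write covering @len bytes. */
static int my_write_post(struct my_write_ctx *wctx, struct ib_qp *qp,
			 u32 port_num, struct page **pages,
			 unsigned int npages, unsigned int len,
			 u64 remote_addr, u32 rkey, struct ib_cqe *cqe)
{
	struct bvec_iter iter = { .bi_size = len };
	unsigned int i, done = 0;
	int ret;

	wctx->bvecs = kcalloc(npages, sizeof(*wctx->bvecs), GFP_KERNEL);
	if (!wctx->bvecs)
		return -ENOMEM;
	wctx->nr_bvec = npages;

	/* Full pages except possibly the last entry. */
	for (i = 0; i < npages; i++) {
		unsigned int seg = min_t(unsigned int, len - done, PAGE_SIZE);

		bvec_set_page(&wctx->bvecs[i], pages[i], seg, 0);
		done += seg;
	}

	/* DMA-map the bvec array and build the RDMA Write WR chain. */
	ret = rdma_rw_ctx_init_bvec(&wctx->rw_ctx, qp, port_num, wctx->bvecs,
				    wctx->nr_bvec, iter, remote_addr, rkey,
				    DMA_TO_DEVICE);
	if (ret < 0)
		goto out_free;

	/* Completion is signalled through @cqe; wctx must stay alive. */
	ret = rdma_rw_ctx_post(&wctx->rw_ctx, qp, port_num, cqe, NULL);
	if (!ret)
		return 0;

	rdma_rw_ctx_destroy_bvec(&wctx->rw_ctx, qp, port_num, wctx->bvecs,
				 wctx->nr_bvec, DMA_TO_DEVICE);
out_free:
	kfree(wctx->bvecs);
	return ret;
}

/* Called from the send completion handler once @cqe has completed. */
static void my_write_done(struct my_write_ctx *wctx, struct ib_qp *qp,
			  u32 port_num)
{
	rdma_rw_ctx_destroy_bvec(&wctx->rw_ctx, qp, port_num, wctx->bvecs,
				 wctx->nr_bvec, DMA_TO_DEVICE);
	kfree(wctx->bvecs);
}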