From 80351761facb63d6fab1e42c77d7565047bc10ad Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 4 Dec 2025 17:24:13 +0800 Subject: [PATCH 01/66] RDMA/irdma: Simplify bool conversion ./drivers/infiniband/hw/irdma/ctrl.c:5792:10-15: WARNING: conversion to bool not needed here. ./drivers/infiniband/hw/irdma/uk.c:1412:6-11: WARNING: conversion to bool not needed here. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=27521 Signed-off-by: Jiapeng Chong Link: https://patch.msgid.link/20251204092414.1261795-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/ctrl.c | 3 +-- drivers/infiniband/hw/irdma/uk.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index ce5cf89c463c..081551da763a 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -5788,8 +5788,7 @@ static int cfg_fpm_value_gen_3(struct irdma_sc_dev *dev, bool is_mrte_loc_mem; loc_mem_pages = hmc_fpm_misc->loc_mem_pages; - is_mrte_loc_mem = hmc_fpm_misc->loc_mem_pages == hmc_fpm_misc->max_sds ? - true : false; + is_mrte_loc_mem = hmc_fpm_misc->loc_mem_pages == hmc_fpm_misc->max_sds; irdma_get_rsrc_mem_config(dev, is_mrte_loc_mem); mrte_loc = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc; diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index f0846b800913..91669326d464 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -1408,8 +1408,7 @@ exit: * from SW for all unprocessed WQEs. For GEN3 and beyond * FW will generate/flush these CQEs so move to the next CQE */ - move_cq_head = qp->uk_attrs->hw_rev <= IRDMA_GEN_2 ? - false : true; + move_cq_head = qp->uk_attrs->hw_rev > IRDMA_GEN_2; } if (move_cq_head) { From 4b01ec0f133b3fe1038dc538d6bfcbd72462d2f0 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Wed, 10 Dec 2025 13:06:13 +0000 Subject: [PATCH 02/66] RDMA/efa: Check stored completion CTX command ID with received one In admin command completion, we receive a CQE with the command ID, which is constructed from the context index and entropy bits from the admin queue producer counter. To try to detect memory corruptions in the received CQE, validate the full command ID of the fetched context with the CQE command ID. If there is a mismatch, complete the CQE with an error. Also use the LSBs of the admin queue producer counter, which change with every command, to better detect entropy mismatches even when only a small number of commands has been issued. Reviewed-by: Daniel Kranzdorf Reviewed-by: Michael Margolin Signed-off-by: Yonatan Nachum Link: https://patch.msgid.link/20251210130614.36460-2-ynachum@amazon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa_com.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index 0e979ca10d24..b31478f3a121 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -3,6 +3,8 @@ * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved.
*/ +#include <linux/log2.h> + #include "efa_com.h" #include "efa_regs_defs.h" @@ -317,7 +319,7 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu /* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */ cmd_id = ctx_id & queue_size_mask; - cmd_id |= aq->sq.pc & ~queue_size_mask; + cmd_id |= aq->sq.pc << ilog2(aq->depth); cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; cmd->aq_common_descriptor.command_id = cmd_id; @@ -418,7 +420,7 @@ static int efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); - if (comp_ctx->status != EFA_CMD_SUBMITTED) { + if (comp_ctx->status != EFA_CMD_SUBMITTED || comp_ctx->cmd_id != cmd_id) { ibdev_err(aq->efa_dev, "Received completion with unexpected command id[%d], sq producer: %d, sq consumer: %d, cq consumer: %d\n", cmd_id, aq->sq.pc, aq->sq.cc, aq->cq.cc); From dab5825491f7b0ea92a09390f39df0a51100f12f Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Wed, 10 Dec 2025 13:06:14 +0000 Subject: [PATCH 03/66] RDMA/efa: Improve admin completion context state machine Add a new unused state to the admin completion context state machine instead of the occupied field. This improves the completion validity check because it now enforces the context to be in the submitted state prior to completing it. Also add an allocated state as an intermediate state between unused and submitted. Reviewed-by: Daniel Kranzdorf Reviewed-by: Michael Margolin Signed-off-by: Yonatan Nachum Link: https://patch.msgid.link/20251210130614.36460-3-ynachum@amazon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa_com.c | 93 ++++++++++++++++------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index b31478f3a121..229b0ad3b0cb 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -23,6 +23,8 @@ #define EFA_CTRL_SUB_MINOR 1 enum efa_cmd_status { + EFA_CMD_UNUSED, + EFA_CMD_ALLOCATED, EFA_CMD_SUBMITTED, EFA_CMD_COMPLETED, }; @@ -34,7 +36,6 @@ struct efa_comp_ctx { enum efa_cmd_status status; u16 cmd_id; u8 cmd_opcode; - u8 occupied; }; static const char *efa_com_cmd_str(u8 cmd) @@ -243,7 +244,6 @@ static int efa_com_admin_init_aenq(struct efa_com_dev *edev, return 0; } -/* ID to be used with efa_com_get_comp_ctx */ static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq) { u16 ctx_id; @@ -265,36 +265,47 @@ static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq, spin_unlock(&aq->comp_ctx_lock); } -static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq, - struct efa_comp_ctx *comp_ctx) +static struct efa_comp_ctx *efa_com_alloc_comp_ctx(struct efa_com_admin_queue *aq) { - u16 cmd_id = EFA_GET(&comp_ctx->user_cqe->acq_common_descriptor.command, - EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); - u16 ctx_id = cmd_id & (aq->depth - 1); + struct efa_comp_ctx *comp_ctx; + u16 ctx_id; - ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id); - comp_ctx->occupied = 0; - efa_com_dealloc_ctx_id(aq, ctx_id); -} + ctx_id = efa_com_alloc_ctx_id(aq); -static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq, - u16 cmd_id, bool capture) -{ - u16 ctx_id = cmd_id & (aq->depth - 1); - - if (aq->comp_ctx[ctx_id].occupied && capture) { - ibdev_err_ratelimited( - aq->efa_dev, - "Completion context for command_id %#x is occupied\n", - cmd_id); + comp_ctx = &aq->comp_ctx[ctx_id]; +
if (comp_ctx->status != EFA_CMD_UNUSED) { + efa_com_dealloc_ctx_id(aq, ctx_id); + ibdev_err_ratelimited(aq->efa_dev, + "Completion context[%u] is used[%u]\n", + ctx_id, comp_ctx->status); return NULL; } - if (capture) { - aq->comp_ctx[ctx_id].occupied = 1; - ibdev_dbg(aq->efa_dev, - "Take completion ctxt for command_id %#x\n", cmd_id); - } + comp_ctx->status = EFA_CMD_ALLOCATED; + ibdev_dbg(aq->efa_dev, "Take completion context[%u]\n", ctx_id); + return comp_ctx; +} + +static inline u16 efa_com_get_comp_ctx_id(struct efa_com_admin_queue *aq, + struct efa_comp_ctx *comp_ctx) +{ + return comp_ctx - aq->comp_ctx; +} + +static inline void efa_com_dealloc_comp_ctx(struct efa_com_admin_queue *aq, + struct efa_comp_ctx *comp_ctx) +{ + u16 ctx_id = efa_com_get_comp_ctx_id(aq, comp_ctx); + + ibdev_dbg(aq->efa_dev, "Put completion context[%u]\n", ctx_id); + comp_ctx->status = EFA_CMD_UNUSED; + efa_com_dealloc_ctx_id(aq, ctx_id); +} + +static inline struct efa_comp_ctx *efa_com_get_comp_ctx_by_cmd_id(struct efa_com_admin_queue *aq, + u16 cmd_id) +{ + u16 ctx_id = cmd_id & (aq->depth - 1); return &aq->comp_ctx[ctx_id]; } @@ -312,10 +323,13 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu u16 ctx_id; u16 pi; + comp_ctx = efa_com_alloc_comp_ctx(aq); + if (!comp_ctx) + return ERR_PTR(-EINVAL); + queue_size_mask = aq->depth - 1; pi = aq->sq.pc & queue_size_mask; - - ctx_id = efa_com_alloc_ctx_id(aq); + ctx_id = efa_com_get_comp_ctx_id(aq, comp_ctx); /* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */ cmd_id = ctx_id & queue_size_mask; @@ -326,12 +340,6 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu EFA_SET(&cmd->aq_common_descriptor.flags, EFA_ADMIN_AQ_COMMON_DESC_PHASE, aq->sq.phase); - comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true); - if (!comp_ctx) { - efa_com_dealloc_ctx_id(aq, ctx_id); - return ERR_PTR(-EINVAL); - } - comp_ctx->status = EFA_CMD_SUBMITTED; comp_ctx->comp_size = comp_size_in_bytes; comp_ctx->user_cqe = comp; @@ -372,9 +380,9 @@ static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq) } for (i = 0; i < aq->depth; i++) { - comp_ctx = efa_com_get_comp_ctx(aq, i, false); - if (comp_ctx) - init_completion(&comp_ctx->wait_event); + comp_ctx = &aq->comp_ctx[i]; + comp_ctx->status = EFA_CMD_UNUSED; + init_completion(&comp_ctx->wait_event); aq->comp_ctx_pool[i] = i; } @@ -419,11 +427,12 @@ static int efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq cmd_id = EFA_GET(&cqe->acq_common_descriptor.command, EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); - comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); + comp_ctx = efa_com_get_comp_ctx_by_cmd_id(aq, cmd_id); if (comp_ctx->status != EFA_CMD_SUBMITTED || comp_ctx->cmd_id != cmd_id) { ibdev_err(aq->efa_dev, - "Received completion with unexpected command id[%d], sq producer: %d, sq consumer: %d, cq consumer: %d\n", - cmd_id, aq->sq.pc, aq->sq.cc, aq->cq.cc); + "Received completion with unexpected command id[%x], status[%d] sq producer[%d], sq consumer[%d], cq consumer[%d]\n", + cmd_id, comp_ctx->status, aq->sq.pc, aq->sq.cc, + aq->cq.cc); return -EINVAL; } @@ -532,7 +541,7 @@ static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_c err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); out: - efa_com_put_comp_ctx(aq, comp_ctx); + efa_com_dealloc_comp_ctx(aq, comp_ctx); return err; } @@ -582,7 +591,7 @@ static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx 
*com err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); out: - efa_com_put_comp_ctx(aq, comp_ctx); + efa_com_dealloc_comp_ctx(aq, comp_ctx); return err; } From a3572bdc3a028ca47f77d7166ac95b719cf77d50 Mon Sep 17 00:00:00 2001 From: Honggang LI Date: Wed, 24 Dec 2025 10:38:19 +0800 Subject: [PATCH 04/66] RDMA/rtrs: server: remove dead code As rkey had been initialized to zero, the WARN_ON_ONCE can never be triggered. Remove it. Fixes: 9cb837480424 ("RDMA/rtrs: server: main functionality") Signed-off-by: Honggang LI Link: https://patch.msgid.link/20251224023819.138846-1-honggangli@163.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 9ecc6343455d..7a402eb8e0bf 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -208,7 +208,6 @@ static int rdma_write_sg(struct rtrs_srv_op *id) size_t sg_cnt; int err, offset; bool need_inval; - u32 rkey = 0; struct ib_reg_wr rwr; struct ib_sge *plist; struct ib_sge list; @@ -240,11 +239,6 @@ static int rdma_write_sg(struct rtrs_srv_op *id) wr->wr.num_sge = 1; wr->remote_addr = le64_to_cpu(id->rd_msg->desc[0].addr); wr->rkey = le32_to_cpu(id->rd_msg->desc[0].key); - if (rkey == 0) - rkey = wr->rkey; - else - /* Only one key is actually used */ - WARN_ON_ONCE(rkey != wr->rkey); wr->wr.opcode = IB_WR_RDMA_WRITE; wr->wr.wr_cqe = &io_comp_cqe; @@ -277,7 +271,7 @@ static int rdma_write_sg(struct rtrs_srv_op *id) inv_wr.opcode = IB_WR_SEND_WITH_INV; inv_wr.wr_cqe = &io_comp_cqe; inv_wr.send_flags = 0; - inv_wr.ex.invalidate_rkey = rkey; + inv_wr.ex.invalidate_rkey = wr->rkey; } imm_wr.wr.next = NULL; From 8818ffb04bfa168dfe5056cd24cee5211dcc4b3c Mon Sep 17 00:00:00 2001 From: Lianfa Weng Date: Tue, 30 Dec 2025 23:49:11 +0800 Subject: [PATCH 05/66] RDMA/hns: Introduce limit_bank mode with better performance In limit_bank mode, QPs/CQs are restricted to using half of the banks. HW concentrates resources on these banks, thereby improving performance compared to the default mode. Switch between limit_bank mode and default mode by setting the cap flag in FW. Since the number of QPs and CQs will be halved, this mode is suitable for scenarios where fewer QPs and CQs are required.
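For illustration only, bank selection in this mode reduces to a least-loaded scan over the banks permitted by a validity bitmask; a minimal sketch (pick_least_loaded_bank() is a placeholder name, not driver code -- the real logic is in the hns_roce_cq.c/hns_roce_qp.c hunks below):

	/* Sketch: choose the least-loaded bank among those allowed by
	 * the validity mask (e.g. VALID_CQ_BANK_MASK_LIMIT = 0x9). */
	static u8 pick_least_loaded_bank(const u32 *load, u8 nbanks,
					 u8 valid_mask)
	{
		u32 least = U32_MAX;
		u8 bankid = 0, i;

		for (i = 0; i < nbanks; i++) {
			if (!(valid_mask & BIT(i)))
				continue;
			if (load[i] < least) {
				least = load[i];
				bankid = i;
			}
		}
		return bankid;
	}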
Signed-off-by: Lianfa Weng Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20251230154911.3397584-1-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_cq.c | 12 ++++- drivers/infiniband/hw/hns/hns_roce_device.h | 6 +++ drivers/infiniband/hw/hns/hns_roce_main.c | 5 +++ drivers/infiniband/hw/hns/hns_roce_qp.c | 49 ++++++++++++++++----- 4 files changed, 60 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 6aa82fe9dd3d..857a913326cd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -55,7 +55,7 @@ void hns_roce_get_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx) { struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device); struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; - u32 least_load = cq_table->ctx_num[0]; + u32 least_load = U32_MAX; u8 bankid = 0; u8 i; @@ -63,7 +63,10 @@ void hns_roce_get_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx) return; mutex_lock(&cq_table->bank_mutex); - for (i = 1; i < HNS_ROCE_CQ_BANK_NUM; i++) { + for (i = 0; i < HNS_ROCE_CQ_BANK_NUM; i++) { + if (!(cq_table->valid_cq_bank_mask & BIT(i))) + continue; + if (cq_table->ctx_num[i] < least_load) { least_load = cq_table->ctx_num[i]; bankid = i; @@ -581,6 +584,11 @@ void hns_roce_init_cq_table(struct hns_roce_dev *hr_dev) cq_table->bank[i].max = hr_dev->caps.num_cqs / HNS_ROCE_CQ_BANK_NUM - 1; } + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) + cq_table->valid_cq_bank_mask = VALID_CQ_BANK_MASK_LIMIT; + else + cq_table->valid_cq_bank_mask = VALID_CQ_BANK_MASK_DEFAULT; } void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 318f18cf37aa..3f032b8038af 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -103,6 +103,10 @@ #define CQ_BANKID_SHIFT 2 #define CQ_BANKID_MASK GENMASK(1, 0) +#define VALID_CQ_BANK_MASK_DEFAULT 0xF +#define VALID_CQ_BANK_MASK_LIMIT 0x9 + +#define VALID_EXT_SGE_QP_BANK_MASK_LIMIT 0x42 #define HNS_ROCE_MAX_CQ_COUNT 0xFFFF #define HNS_ROCE_MAX_CQ_PERIOD 0xFFFF @@ -156,6 +160,7 @@ enum { HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19), HNS_ROCE_CAP_FLAG_BOND = BIT(21), HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22), + HNS_ROCE_CAP_FLAG_LIMIT_BANK = BIT(23), }; #define HNS_ROCE_DB_TYPE_COUNT 2 @@ -500,6 +505,7 @@ struct hns_roce_cq_table { struct hns_roce_bank bank[HNS_ROCE_CQ_BANK_NUM]; struct mutex bank_mutex; u32 ctx_num[HNS_ROCE_CQ_BANK_NUM]; + u8 valid_cq_bank_mask; }; struct hns_roce_srq_table { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 2f4864ab7d4e..a3490bab297a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -259,6 +259,11 @@ static int hns_roce_query_device(struct ib_device *ib_dev, props->max_srq_sge = hr_dev->caps.max_srq_sges; } + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) { + props->max_cq >>= 1; + props->max_qp >>= 1; + } + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR && hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index d1640c5fbaab..5f7ea6c16644 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ 
b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -197,22 +197,16 @@ static u8 get_affinity_cq_bank(u8 qp_bank) return (qp_bank >> 1) & CQ_BANKID_MASK; } -static u8 get_least_load_bankid_for_qp(struct ib_qp_init_attr *init_attr, - struct hns_roce_bank *bank) +static u8 get_least_load_bankid_for_qp(struct hns_roce_bank *bank, u8 valid_qp_bank_mask) { #define INVALID_LOAD_QPNUM 0xFFFFFFFF - struct ib_cq *scq = init_attr->send_cq; u32 least_load = INVALID_LOAD_QPNUM; - unsigned long cqn = 0; u8 bankid = 0; u32 bankcnt; u8 i; - if (scq) - cqn = to_hr_cq(scq)->cqn; - for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) { - if (scq && (get_affinity_cq_bank(i) != (cqn & CQ_BANKID_MASK))) + if (!(valid_qp_bank_mask & BIT(i))) continue; bankcnt = bank[i].inuse; @@ -246,6 +240,42 @@ static int alloc_qpn_with_bankid(struct hns_roce_bank *bank, u8 bankid, return 0; } + +static bool use_ext_sge(struct ib_qp_init_attr *init_attr) +{ + return init_attr->cap.max_send_sge > HNS_ROCE_SGE_IN_WQE || + init_attr->qp_type == IB_QPT_UD || + init_attr->qp_type == IB_QPT_GSI; +} + +static u8 select_qp_bankid(struct hns_roce_dev *hr_dev, + struct ib_qp_init_attr *init_attr) +{ + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; + struct hns_roce_bank *bank = qp_table->bank; + struct ib_cq *scq = init_attr->send_cq; + u8 valid_qp_bank_mask = 0; + unsigned long cqn = 0; + u8 i; + + if (scq) + cqn = to_hr_cq(scq)->cqn; + + for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) { + if (scq && (get_affinity_cq_bank(i) != (cqn & CQ_BANKID_MASK))) + continue; + + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) && + use_ext_sge(init_attr) && + !(VALID_EXT_SGE_QP_BANK_MASK_LIMIT & BIT(i))) + continue; + + valid_qp_bank_mask |= BIT(i); + } + + return get_least_load_bankid_for_qp(bank, valid_qp_bank_mask); +} + static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_qp_init_attr *init_attr) { @@ -258,8 +288,7 @@ static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, num = 1; } else { mutex_lock(&qp_table->bank_mutex); - bankid = get_least_load_bankid_for_qp(init_attr, qp_table->bank); - + bankid = select_qp_bankid(hr_dev, init_attr); ret = alloc_qpn_with_bankid(&qp_table->bank[bankid], bankid, &num); if (ret) { From ddd6c8c873e912cb1ead79def54de5e24ff71c80 Mon Sep 17 00:00:00 2001 From: Etienne AUJAMES Date: Wed, 31 Dec 2025 14:07:45 +0100 Subject: [PATCH 06/66] IB/cache: update gid cache on client reregister event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some HCAs (e.g. ConnectX4) do not trigger an IB_EVENT_GID_CHANGE on a subnet prefix update from the SM (PortInfo). Since commit d58c23c92548 ("IB/core: Only update PKEY and GID caches on respective events"), the GID cache is updated exclusively on IB_EVENT_GID_CHANGE. If this event is not emitted, the subnet prefix in the IPoIB interface’s hardware address remains set to its default value (0xfe80000000000000). rdma_bind_addr() then fails because it relies on the hardware address to find the port GID (subnet_prefix + port GUID). This patch fixes this issue by updating the GID cache on the IB_EVENT_CLIENT_REREGISTER event (emitted on PortInfo::ClientReregister=1).
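For reference, a consumer can observe both events through the standard IB event handler API; a minimal sketch (my_event_handler is a placeholder name):

	static void my_event_handler(struct ib_event_handler *handler,
				     struct ib_event *event)
	{
		/* Both events now refresh the GID cache in ib_core. */
		if (event->event == IB_EVENT_GID_CHANGE ||
		    event->event == IB_EVENT_CLIENT_REREGISTER)
			pr_info("port %u: GID data may have changed\n",
				event->element.port_num);
	}

	INIT_IB_EVENT_HANDLER(&handler, device, my_event_handler);
	ib_register_event_handler(&handler);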
Fixes: d58c23c92548 ("IB/core: Only update PKEY and GID caches on respective events") Signed-off-by: Etienne AUJAMES Link: https://patch.msgid.link/aVUfsO58QIDn5bGX@eaujamesFR0130 Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 81cf3c902e81..0fc1c5bce2f0 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1537,7 +1537,8 @@ static void ib_cache_event_task(struct work_struct *_work) * the cache. */ ret = ib_cache_update(work->event.device, work->event.element.port_num, - work->event.event == IB_EVENT_GID_CHANGE, + work->event.event == IB_EVENT_GID_CHANGE || + work->event.event == IB_EVENT_CLIENT_REREGISTER, work->event.event == IB_EVENT_PKEY_CHANGE, work->enforce_security); From c0a26bbd3f99b7b03f072e3409aff4e6ec8af6f6 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sun, 4 Jan 2026 14:40:54 +0800 Subject: [PATCH 07/66] RDMA/hns: Fix WQ_MEM_RECLAIM warning When sunrpc is used, if a reset is triggered, our wq may lead to the following trace: workqueue: WQ_MEM_RECLAIM xprtiod:xprt_rdma_connect_worker [rpcrdma] is flushing !WQ_MEM_RECLAIM hns_roce_irq_workq:flush_work_handle [hns_roce_hw_v2] WARNING: CPU: 0 PID: 8250 at kernel/workqueue.c:2644 check_flush_dependency+0xe0/0x144 Call trace: check_flush_dependency+0xe0/0x144 start_flush_work.constprop.0+0x1d0/0x2f0 __flush_work.isra.0+0x40/0xb0 flush_work+0x14/0x30 hns_roce_v2_destroy_qp+0xac/0x1e0 [hns_roce_hw_v2] ib_destroy_qp_user+0x9c/0x2b4 rdma_destroy_qp+0x34/0xb0 rpcrdma_ep_destroy+0x28/0xcc [rpcrdma] rpcrdma_ep_put+0x74/0xb4 [rpcrdma] rpcrdma_xprt_disconnect+0x1d8/0x260 [rpcrdma] xprt_rdma_connect_worker+0xc0/0x120 [rpcrdma] process_one_work+0x1cc/0x4d0 worker_thread+0x154/0x414 kthread+0x104/0x144 ret_from_fork+0x10/0x18 Since QP destruction frees memory, this wq should have the WQ_MEM_RECLAIM flag. Fixes: ffd541d45726 ("RDMA/hns: Add the workqueue framework for flush cqe handler") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20260104064057.1582216-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2d6ae89e525b..f95442798ddb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6956,7 +6956,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work); - hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0); + hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", + WQ_MEM_RECLAIM); if (!hr_dev->irq_workq) { dev_err(dev, "failed to create irq workqueue.\n"); ret = -ENOMEM; From 8cda8acbb1f8c6c0fec45b7166bb558b5af59da8 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Sun, 4 Jan 2026 14:40:55 +0800 Subject: [PATCH 08/66] RDMA/hns: Return actual error code instead of fixed EINVAL query_cqc() and query_mpt() may return various error codes in different cases. Return the actual error code instead of a fixed EINVAL.
Fixes: f2b070f36d1b ("RDMA/hns: Support CQ's restrack raw ops for hns driver") Fixes: 3d67e7e236ad ("RDMA/hns: Support MR's restrack raw ops for hns driver") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20260104064057.1582216-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_restrack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c index 230187dda6a0..085791cc617c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_restrack.c +++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c @@ -51,7 +51,7 @@ int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq) ret = hr_dev->hw->query_cqc(hr_dev, hr_cq->cqn, &context); if (ret) - return -EINVAL; + return ret; ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context); @@ -177,7 +177,7 @@ int hns_roce_fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ib_mr) ret = hr_dev->hw->query_mpt(hr_dev, hr_mr->key, &context); if (ret) - return -EINVAL; + return ret; ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context); From 84bd5d60f0a2b9c763c5e6d0b3d8f4f61f6c5470 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Sun, 4 Jan 2026 14:40:56 +0800 Subject: [PATCH 09/66] RDMA/hns: Fix RoCEv1 failure due to DSCP DSCP is not supported in RoCEv1, but get_dscp() is still called. If get_dscp() returns an error, it'll eventually cause create_ah to fail even when using RoCEv1. Correct the return value and avoid calling get_dscp() when using RoCEv1. Fixes: ee20cc17e9d8 ("RDMA/hns: Support DSCP") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20260104064057.1582216-4-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_ah.c | 23 +++++++++--------- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 28 ++++++++++++---------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 0c1c32d23c88..8a605da8a93c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -60,7 +60,7 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, u8 tclass = get_tclass(grh); u8 priority = 0; u8 tc_mode = 0; - int ret; + int ret = 0; if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) { ret = -EOPNOTSUPP; @@ -77,19 +77,18 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, ah->av.flowlabel = grh->flow_label; ah->av.udp_sport = get_ah_udp_sport(ah_attr); ah->av.tclass = tclass; + ah->av.sl = rdma_ah_get_sl(ah_attr); - ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority); - if (ret == -EOPNOTSUPP) - ret = 0; + if (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority); + if (ret == -EOPNOTSUPP) + ret = 0; + else if (ret) + goto err_out; - if (ret && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - goto err_out; - - if (tc_mode == HNAE3_TC_MAP_MODE_DSCP && - grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - ah->av.sl = priority; - else - ah->av.sl = rdma_ah_get_sl(ah_attr); + if (tc_mode == HNAE3_TC_MAP_MODE_DSCP) + ah->av.sl = priority; + } if (!check_sl_valid(hr_dev, ah->av.sl)) { ret = -EINVAL; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 
f95442798ddb..1f37d74b466b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5053,20 +5053,22 @@ static int hns_roce_set_sl(struct ib_qp *ibqp, struct ib_device *ibdev = &hr_dev->ib_dev; int ret; - ret = hns_roce_hw_v2_get_dscp(hr_dev, get_tclass(&attr->ah_attr.grh), - &hr_qp->tc_mode, &hr_qp->priority); - if (ret && ret != -EOPNOTSUPP && - grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { - ibdev_err_ratelimited(ibdev, - "failed to get dscp, ret = %d.\n", ret); - return ret; - } + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); - if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP && - grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - hr_qp->sl = hr_qp->priority; - else - hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + if (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + ret = hns_roce_hw_v2_get_dscp(hr_dev, + get_tclass(&attr->ah_attr.grh), + &hr_qp->tc_mode, &hr_qp->priority); + if (ret && ret != -EOPNOTSUPP) { + ibdev_err_ratelimited(ibdev, + "failed to get dscp, ret = %d.\n", + ret); + return ret; + } + + if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP) + hr_qp->sl = hr_qp->priority; + } if (!check_sl_valid(hr_dev, hr_qp->sl)) return -EINVAL; From 0789f929900d85b80b343c5f04f8b9444e991384 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sun, 4 Jan 2026 14:40:57 +0800 Subject: [PATCH 10/66] RDMA/hns: Notify ULP of remaining soft-WCs during reset During a reset, software-generated WCs cannot be reported via interrupts. This may cause the ULP to miss some WCs. To avoid this, add a check to the CQ arm process: if a hardware reset has occurred and there are still unreported soft-WCs, notify the ULP to handle the remaining WCs, thereby preventing any loss of completions. Fixes: 626903e9355b ("RDMA/hns: Add support for reporting wc as software mode") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20260104064057.1582216-5-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 1f37d74b466b..a2ae4f33e459 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3739,6 +3739,23 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, HNS_ROCE_V2_CQ_DEFAULT_INTERVAL); } +static bool left_sw_wc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) +{ + struct hns_roce_qp *hr_qp; + + list_for_each_entry(hr_qp, &hr_cq->sq_list, sq_node) { + if (hr_qp->sq.head != hr_qp->sq.tail) + return true; + } + + list_for_each_entry(hr_qp, &hr_cq->rq_list, rq_node) { + if (hr_qp->rq.head != hr_qp->rq.tail) + return true; + } + + return false; +} + static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { @@ -3747,6 +3764,12 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, struct hns_roce_v2_db cq_db = {}; u32 notify_flag; + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) { + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && + left_sw_wc(hr_dev, hr_cq)) + return 1; + return 0; + } /* * flags = 0, then notify_flag : next * flags = 1, then notify flag : solocited From 263d1d9975b5ac2b813c3107fe21c957b57f0a59 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:26 +0200 Subject: [PATCH 11/66] IB/core: Add async event on device speed change Add
IB_EVENT_DEVICE_SPEED_CHANGE to notify user applications of changes in the speed of the device's ports. Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/verbs.c | 1 + include/rdma/ib_verbs.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 11b1a194de44..f495a2182c84 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -78,6 +78,7 @@ static const char * const ib_events[] = { [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", [IB_EVENT_CLIENT_REREGISTER] = "client reregister", [IB_EVENT_GID_CHANGE] = "GID changed", + [IB_EVENT_DEVICE_SPEED_CHANGE] = "device speed change" }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6aad66bc5dd7..95f1e557cbb8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -764,6 +764,7 @@ enum ib_event_type { IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, IB_EVENT_WQ_FATAL, + IB_EVENT_DEVICE_SPEED_CHANGE, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); From 2941abac6d0bffd6bf8f438135505b39168a0a08 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:32 +0200 Subject: [PATCH 12/66] IB/core: Add helper to convert port attributes to data rate Introduce ib_port_attr_to_speed_info() to compute the data rate in 100 Mbps units (deci-Gb/sec) from a port's active_speed and active_width attributes. This generic helper removes the speed-to-rate calculations duplicated between sysfs and the upcoming new verb. Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/verbs.c | 51 +++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 14 +++++++++ 2 files changed, 65 insertions(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index f495a2182c84..8b56b6b62352 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -217,6 +217,57 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) } EXPORT_SYMBOL(ib_rate_to_mbps); +struct ib_speed_attr { + const char *str; + int speed; +}; + +#define IB_SPEED_ATTR(speed_type, _str, _speed) \ + [speed_type] = {.str = _str, .speed = _speed} + +static const struct ib_speed_attr ib_speed_attrs[] = { + IB_SPEED_ATTR(IB_SPEED_SDR, " SDR", 25), + IB_SPEED_ATTR(IB_SPEED_DDR, " DDR", 50), + IB_SPEED_ATTR(IB_SPEED_QDR, " QDR", 100), + IB_SPEED_ATTR(IB_SPEED_FDR10, " FDR10", 100), + IB_SPEED_ATTR(IB_SPEED_FDR, " FDR", 140), + IB_SPEED_ATTR(IB_SPEED_EDR, " EDR", 250), + IB_SPEED_ATTR(IB_SPEED_HDR, " HDR", 500), + IB_SPEED_ATTR(IB_SPEED_NDR, " NDR", 1000), + IB_SPEED_ATTR(IB_SPEED_XDR, " XDR", 2000), +}; + +int ib_port_attr_to_speed_info(struct ib_port_attr *attr, + struct ib_port_speed_info *speed_info) +{ + int speed_idx = attr->active_speed; + + switch (attr->active_speed) { + case IB_SPEED_DDR: + case IB_SPEED_QDR: + case IB_SPEED_FDR10: + case IB_SPEED_FDR: + case IB_SPEED_EDR: + case IB_SPEED_HDR: + case IB_SPEED_NDR: + case IB_SPEED_XDR: + case IB_SPEED_SDR: + break; + default: + speed_idx = IB_SPEED_SDR; /* Default to SDR for invalid rates */ + break; + } + + speed_info->str = ib_speed_attrs[speed_idx].str; + speed_info->rate = ib_speed_attrs[speed_idx].speed; + speed_info->rate *= ib_width_enum_to_int(attr->active_width); + if (speed_info->rate < 0) +
return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(ib_port_attr_to_speed_info); + __attribute_const__ enum rdma_transport_type rdma_node_get_transport(unsigned int node_type) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 95f1e557cbb8..b984f9581a73 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -878,6 +878,20 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); +struct ib_port_speed_info { + const char *str; + int rate; /* in deci-Gb/sec (100 Mbps units) */ +}; + +/** + * ib_port_attr_to_speed_info - Convert port attributes to speed information + * @attr: Port attributes containing active_speed and active_width + * @speed_info: Speed information to return + * + * Returns 0 on success, -EINVAL on error. + */ +int ib_port_attr_to_speed_info(struct ib_port_attr *attr, + struct ib_port_speed_info *speed_info); /** * enum ib_mr_type - memory region type From d4adeff26c3e8f1a9bc86d5dfb14f227c9041070 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:40 +0200 Subject: [PATCH 13/66] IB/core: Refactor rate_show to use ib_port_attr_to_speed_info() Update sysfs rate_show() to rely on ib_port_attr_to_speed_info() for converting IB port speed and width attributes to a data rate and speed string. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/sysfs.c | 56 +++++----------------------- 1 file changed, 8 insertions(+), 48 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 0ed862b38b44..bfaca07933d8 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -292,62 +292,22 @@ static ssize_t cap_mask_show(struct ib_device *ibdev, u32 port_num, static ssize_t rate_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { + struct ib_port_speed_info speed_info; struct ib_port_attr attr; - char *speed = ""; - int rate; /* in deci-Gb/sec */ ssize_t ret; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - switch (attr.active_speed) { - case IB_SPEED_DDR: - speed = " DDR"; - rate = 50; - break; - case IB_SPEED_QDR: - speed = " QDR"; - rate = 100; - break; - case IB_SPEED_FDR10: - speed = " FDR10"; - rate = 100; - break; - case IB_SPEED_FDR: - speed = " FDR"; - rate = 140; - break; - case IB_SPEED_EDR: - speed = " EDR"; - rate = 250; - break; - case IB_SPEED_HDR: - speed = " HDR"; - rate = 500; - break; - case IB_SPEED_NDR: - speed = " NDR"; - rate = 1000; - break; - case IB_SPEED_XDR: - speed = " XDR"; - rate = 2000; - break; - case IB_SPEED_SDR: - default: /* default to SDR for invalid rates */ - speed = " SDR"; - rate = 25; - break; - } + ret = ib_port_attr_to_speed_info(&attr, &speed_info); + if (ret) + return ret; - rate *= ib_width_enum_to_int(attr.active_width); - if (rate < 0) - return -EINVAL; - - return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", rate / 10, - rate % 10 ? ".5" : "", - ib_width_enum_to_int(attr.active_width), speed); + return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", speed_info.rate / 10, + speed_info.rate % 10 ?
".5" : "", + ib_width_enum_to_int(attr.active_width), + speed_info.str); } static const char *phys_state_to_str(enum ib_port_phys_state phys_state) From 51a07ce2fefd061edf4ba552a741c85f07b3e6dd Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:46 +0200 Subject: [PATCH 14/66] IB/core: Add query_port_speed verb Add new ibv_query_port_speed() verb to enable applications to query the effective bandwidth of a port. This verb is particularly useful when the speed is not a multiplication of IB speed and width where width is 2^n. Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 1 + .../infiniband/core/uverbs_std_types_device.c | 42 +++++++++++++++++++ include/rdma/ib_verbs.h | 2 + include/uapi/rdma/ib_user_ioctl_cmds.h | 6 +++ 4 files changed, 51 insertions(+) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 13e8a1714bbd..04edc57592aa 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2816,6 +2816,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, query_gid); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); + SET_DEVICE_OP(dev_ops, query_port_speed); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); SET_DEVICE_OP(dev_ops, query_ucontext); diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index c0fd283d9d6c..a28f9f21bed8 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -209,6 +209,39 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)( &resp, sizeof(resp)); } +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT_SPEED)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + u32 port_num; + u64 speed; + int ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!ib_dev->ops.query_port_speed) + return -EOPNOTSUPP; + + ret = uverbs_get_const(&port_num, attrs, + UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM); + if (ret) + return ret; + + if (!rdma_is_port_valid(ib_dev, port_num)) + return -EINVAL; + + ret = ib_dev->ops.query_port_speed(ib_dev, port_num, &speed); + if (ret) + return ret; + + return uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_PORT_SPEED_RESP, + &speed, sizeof(speed)); +} + static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)( struct uverbs_attr_bundle *attrs) { @@ -469,6 +502,14 @@ DECLARE_UVERBS_NAMED_METHOD( active_speed_ex), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_PORT_SPEED, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM, u32, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_PORT_SPEED_RESP, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_QUERY_GID_TABLE, UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, u64, @@ -498,6 +539,7 @@ DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE, &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE), &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES), &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT_SPEED), &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT), &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_TABLE), &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_ENTRY)); diff --git a/include/rdma/ib_verbs.h 
b/include/rdma/ib_verbs.h index b984f9581a73..a4786395328a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2418,6 +2418,8 @@ struct ib_device_ops { int comp_vector); int (*query_port)(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); + int (*query_port_speed)(struct ib_device *device, u32 port_num, + u64 *speed); int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index de6f5a94f1e3..35da4026f452 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -73,6 +73,7 @@ enum uverbs_methods_device { UVERBS_METHOD_QUERY_CONTEXT, UVERBS_METHOD_QUERY_GID_TABLE, UVERBS_METHOD_QUERY_GID_ENTRY, + UVERBS_METHOD_QUERY_PORT_SPEED, }; enum uverbs_attrs_invoke_write_cmd_attr_ids { @@ -86,6 +87,11 @@ enum uverbs_attrs_query_port_cmd_attr_ids { UVERBS_ATTR_QUERY_PORT_RESP, }; +enum uverbs_attrs_query_port_speed_cmd_attr_ids { + UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM, + UVERBS_ATTR_QUERY_PORT_SPEED_RESP, +}; + enum uverbs_attrs_get_context_attr_ids { UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, From 3fd984d5cd8f0df8e79337fbd9283c31dee9ed31 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:53 +0200 Subject: [PATCH 15/66] RDMA/mlx5: Raise async event on device speed change Raise IB_EVENT_DEVICE_SPEED_CHANGE whenever the speed of one of the device's ports changes. Usually all ports of the device change together. This ensures user applications and upper-layer software are immediately notified when bandwidth changes, improving traffic management in dynamic environments. This is especially useful for vports that are part of a LAG configuration, to know whether the effective speed of the LAG has changed. Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 40284bbb45d6..bea42acbeaad 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2838,6 +2838,14 @@ static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe, case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: case MLX5_PORT_CHANGE_SUBTYPE_DOWN: case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED: + if (ibdev->ib_active) { + struct ib_event speed_event = {}; + + speed_event.device = &ibdev->ib_dev; + speed_event.event = IB_EVENT_DEVICE_SPEED_CHANGE; + ib_dispatch_event(&speed_event); + } + /* In RoCE, port up/down events are handled in * mlx5_netdev_event(). */ From aaecff5e13cd98d9a9260bf2ec83ef3a1bda86a6 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:59:00 +0200 Subject: [PATCH 16/66] RDMA/mlx5: Implement query_port_speed callback Implement the query_port_speed callback for the mlx5 driver to support querying the effective port bandwidth. For LAG configurations, query the aggregated speed from the LAG layer or from the modified vport max_tx_speed.
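A condensed sketch of the dispatch implemented in the diff below (helper names are the ones this patch adds; surrounding function and error handling elided):

	/* Pick the speed source based on the port type. */
	if (mlx5_ib_port_link_layer(ibdev, port_num) ==
	    IB_LINK_LAYER_INFINIBAND || mlx5_core_mp_enabled(dev->mdev))
		/* IB or multi-port: derive speed * width from port attrs */
		err = mlx5_ib_query_port_speed_from_port(dev, port_num, &speed);
	else if (!dev->is_rep)
		/* plain RoCE: LAG bond speed or vport max_tx_speed */
		err = mlx5_ib_query_port_speed_non_rep(dev, port_num, &speed);
	else
		/* representor: uplink LAG speed or e-switch vport speed */
		err = mlx5_ib_query_port_speed_rep(dev, port_num, &speed);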
Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 124 +++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + 2 files changed, 126 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bea42acbeaad..47c19d527fa2 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1581,6 +1581,129 @@ static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u32 port, u16 index, return 0; } +static int mlx5_ib_query_port_speed_from_port(struct mlx5_ib_dev *dev, + u32 port_num, u64 *speed) +{ + struct ib_port_speed_info speed_info; + struct ib_port_attr attr = {}; + int err; + + err = mlx5_ib_query_port(&dev->ib_dev, port_num, &attr); + if (err) + return err; + + if (attr.state == IB_PORT_DOWN) { + *speed = 0; + return 0; + } + + err = ib_port_attr_to_speed_info(&attr, &speed_info); + if (err) + return err; + + *speed = speed_info.rate; + return 0; +} + +static int mlx5_ib_query_port_speed_from_vport(struct mlx5_core_dev *mdev, + u8 op_mod, u16 vport, + u8 other_vport, u64 *speed, + struct mlx5_ib_dev *dev, + u32 port_num) +{ + u32 max_tx_speed; + int err; + + err = mlx5_query_vport_max_tx_speed(mdev, op_mod, vport, other_vport, + &max_tx_speed); + if (err) + return err; + + if (max_tx_speed == 0) + /* Value 0 indicates field not supported, fallback */ + return mlx5_ib_query_port_speed_from_port(dev, port_num, + speed); + + *speed = max_tx_speed; + return 0; +} + +static int mlx5_ib_query_port_speed_from_bond(struct mlx5_ib_dev *dev, + u32 port_num, u64 *speed) +{ + struct mlx5_core_dev *mdev = dev->mdev; + u32 bond_speed; + int err; + + err = mlx5_lag_query_bond_speed(mdev, &bond_speed); + if (err) + return err; + + *speed = bond_speed / MLX5_MAX_TX_SPEED_UNIT; + + return 0; +} + +static int mlx5_ib_query_port_speed_non_rep(struct mlx5_ib_dev *dev, + u32 port_num, u64 *speed) +{ + u16 op_mod = MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT; + + if (mlx5_lag_is_roce(dev->mdev)) + return mlx5_ib_query_port_speed_from_bond(dev, port_num, + speed); + + return mlx5_ib_query_port_speed_from_vport(dev->mdev, op_mod, 0, false, + speed, dev, port_num); +} + +static int mlx5_ib_query_port_speed_rep(struct mlx5_ib_dev *dev, u32 port_num, + u64 *speed) +{ + struct mlx5_eswitch_rep *rep; + struct mlx5_core_dev *mdev; + u16 op_mod; + + if (!dev->port[port_num - 1].rep) { + mlx5_ib_warn(dev, "Representor doesn't exist for port %u\n", + port_num); + return -EINVAL; + } + + rep = dev->port[port_num - 1].rep; + mdev = mlx5_eswitch_get_core_dev(rep->esw); + if (!mdev) + return -ENODEV; + + if (rep->vport == MLX5_VPORT_UPLINK) { + if (mlx5_lag_is_sriov(mdev)) + return mlx5_ib_query_port_speed_from_bond(dev, + port_num, + speed); + + return mlx5_ib_query_port_speed_from_port(dev, port_num, + speed); + } + + op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + return mlx5_ib_query_port_speed_from_vport(dev->mdev, op_mod, + rep->vport, true, speed, dev, + port_num); +} + +int mlx5_ib_query_port_speed(struct ib_device *ibdev, u32 port_num, u64 *speed) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + if (mlx5_ib_port_link_layer(ibdev, port_num) == + IB_LINK_LAYER_INFINIBAND || mlx5_core_mp_enabled(dev->mdev)) + return mlx5_ib_query_port_speed_from_port(dev, port_num, speed); + else if (!dev->is_rep) + return mlx5_ib_query_port_speed_non_rep(dev, port_num, speed); + else + return mlx5_ib_query_port_speed_rep(dev, port_num, speed); +} + static int 
mlx5_ib_query_gid(struct ib_device *ibdev, u32 port, int index, union ib_gid *gid) { @@ -4305,6 +4428,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .query_device = mlx5_ib_query_device, .query_gid = mlx5_ib_query_gid, .query_pkey = mlx5_ib_query_pkey, + .query_port_speed = mlx5_ib_query_port_speed, .query_qp = mlx5_ib_query_qp, .query_srq = mlx5_ib_query_srq, .query_ucontext = mlx5_ib_query_ucontext, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 09d82d5f95e3..cc6b3b6c713c 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1435,6 +1435,8 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props); int mlx5_ib_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props); +int mlx5_ib_query_port_speed(struct ib_device *ibdev, u32 port_num, + u64 *speed); void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas, u64 access_flags); int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); From ac7dea328ab52a6dce40361bb478b80d5004abe0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Jan 2026 15:51:33 +0200 Subject: [PATCH 17/66] RDMA/umem: Remove redundant DMABUF ops check ib_umem_dmabuf_get_with_dma_device() is an in-kernel function and does not require a defensive check for the .move_notify callback. All current callers guarantee that this callback is always present. Link: https://patch.msgid.link/20260104-ib-core-misc-v1-1-00367f77f3a8@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_dmabuf.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 0ec2e4120cc9..939da49b0dcc 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -129,9 +129,6 @@ ib_umem_dmabuf_get_with_dma_device(struct ib_device *device, if (check_add_overflow(offset, (unsigned long)size, &end)) return ret; - if (unlikely(!ops || !ops->move_notify)) - return ret; - dmabuf = dma_buf_get(fd); if (IS_ERR(dmabuf)) return ERR_CAST(dmabuf); From 8d466b155f83890f2f2d4cf6d0f623ac2d455b12 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 4 Jan 2026 15:51:34 +0200 Subject: [PATCH 18/66] RDMA/core: Avoid exporting module local functions and remove not-used ones Some of the functions are local to the module and some are not used starting from commit 36783dec8d79 ("RDMA/rxe: Delete deprecated module parameters interface"). Delete and avoid exporting them. Signed-off-by: Parav Pandit Link: https://patch.msgid.link/20260104-ib-core-misc-v1-2-00367f77f3a8@nvidia.com Reviewed-by: Zhu Yanjun Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 30 ------------------------------ include/rdma/ib_verbs.h | 2 -- 2 files changed, 32 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 04edc57592aa..4e09f6e0995e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -361,34 +361,6 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } -/** - * ib_device_get_by_name - Find an IB device by name - * @name: The name to look for - * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) - * - * Find and hold an ib_device by its name. The caller must call - * ib_device_put() on the returned pointer. 
- */ -struct ib_device *ib_device_get_by_name(const char *name, - enum rdma_driver_id driver_id) -{ - struct ib_device *device; - - down_read(&devices_rwsem); - device = __ib_device_get_by_name(name); - if (device && driver_id != RDMA_DRIVER_UNKNOWN && - device->ops.driver_id != driver_id) - device = NULL; - - if (device) { - if (!ib_device_try_get(device)) - device = NULL; - } - up_read(&devices_rwsem); - return device; -} -EXPORT_SYMBOL(ib_device_get_by_name); - static int rename_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; @@ -2876,7 +2848,6 @@ int ib_add_sub_device(struct ib_device *parent, return ret; } -EXPORT_SYMBOL(ib_add_sub_device); int ib_del_sub_device_and_put(struct ib_device *sub) { @@ -2895,7 +2866,6 @@ int ib_del_sub_device_and_put(struct ib_device *sub) return 0; } -EXPORT_SYMBOL(ib_del_sub_device_and_put); #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a4786395328a..6c372a37c482 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4562,8 +4562,6 @@ static inline bool ib_device_try_get(struct ib_device *dev) void ib_device_put(struct ib_device *device); struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id); -struct ib_device *ib_device_get_by_name(const char *name, - enum rdma_driver_id driver_id); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); From 6dc78c53de99e4ed9868d4f0fc6da6e46f52fe4d Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Sun, 4 Jan 2026 15:51:35 +0200 Subject: [PATCH 19/66] RDMA/mlx5: Fix ucaps init error flow In mlx5_ib_stage_caps_init(), if mlx5_ib_init_ucaps() fails after mlx5_ib_init_var_table() succeeds, the VAR bitmap is leaked since the function returns without cleanup. Thus, clean up the VAR table bitmap before exiting when ucaps initialization fails, preventing the leak described above. Fixes: cf7174e8982f ("RDMA/mlx5: Create UCAP char devices for supported device capabilities") Signed-off-by: Maher Sanalla Reviewed-by: Yishai Hadas Link: https://patch.msgid.link/20260104-ib-core-misc-v1-3-00367f77f3a8@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 47c19d527fa2..e81080622283 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4598,12 +4598,16 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) { err = mlx5_ib_init_ucaps(dev); if (err) - return err; + goto err_ucaps; } dev->ib_dev.use_cq_dim = true; return 0; + +err_ucaps: + bitmap_free(dev->var_table.bitmap); + return err; } static const struct ib_device_ops mlx5_ib_dev_port_ops = { From 522a5c1c56fbf71693cf2e4c726b200f6d703679 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Jan 2026 15:51:36 +0200 Subject: [PATCH 20/66] RDMA/mlx5: Avoid direct access to DMA device pointer The dma_device field is marked as internal and must not be accessed by drivers or ULPs. Remove all direct mlx5 references to this field.
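A condensed sketch of the resulting calling convention in reg_user_mr_dmabuf(), taken from the diff below: callers pass a NULL dma_device unless an explicit one is required, and the helper falls back to the plain pinned-umem API instead of touching pd->device->dma_device:

	if (dma_device)
		/* explicit DMA device (e.g. the data-direct flow) */
		umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(
				&dev->ib_dev, dma_device, offset, length,
				fd, access_flags);
	else
		/* common case: ib_core resolves the DMA device itself */
		umem_dmabuf = ib_umem_dmabuf_get_pinned(
				&dev->ib_dev, offset, length, fd, access_flags);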
Link: https://patch.msgid.link/20260104-ib-core-misc-v1-4-00367f77f3a8@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 325fa04cbe8a..a7b37e3df072 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1646,10 +1646,13 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, offset, length, fd, access_flags, &mlx5_ib_dmabuf_attach_ops); - else + else if (dma_device) umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, dma_device, offset, length, fd, access_flags); + else + umem_dmabuf = ib_umem_dmabuf_get_pinned( + &dev->ib_dev, offset, length, fd, access_flags); if (IS_ERR(umem_dmabuf)) { mlx5_ib_dbg(dev, "umem_dmabuf get failed (%pe)\n", umem_dmabuf); @@ -1782,10 +1785,8 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, fd, access_flags); - return reg_user_mr_dmabuf(pd, pd->device->dma_device, - offset, length, virt_addr, - fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT, - dmah); + return reg_user_mr_dmabuf(pd, NULL, offset, length, virt_addr, fd, + access_flags, MLX5_MKC_ACCESS_MODE_MTT, dmah); } /* From cc016ebeb146d050f8426ade79d4d71771b643c4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Jan 2026 15:51:37 +0200 Subject: [PATCH 21/66] RDMA/qedr: Remove unused defines Perform basic cleanup by removing unused defines from qedr.h. Link: https://patch.msgid.link/20260104-ib-core-misc-v1-5-00367f77f3a8@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qedr/qedr.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index db9ef3e1eb97..a6c9a4d9ab93 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -53,11 +53,8 @@ DP_NAME(dev) ? 
DP_NAME(dev) : "", ## __VA_ARGS__) #define QEDR_MSG_INIT "INIT" -#define QEDR_MSG_MISC "MISC" #define QEDR_MSG_CQ " CQ" #define QEDR_MSG_MR " MR" -#define QEDR_MSG_RQ " RQ" -#define QEDR_MSG_SQ " SQ" #define QEDR_MSG_QP " QP" #define QEDR_MSG_SRQ " SRQ" #define QEDR_MSG_GSI " GSI" @@ -65,7 +62,6 @@ #define QEDR_CQ_MAGIC_NUMBER (0x11223344) -#define FW_PAGE_SIZE (RDMA_RING_PAGE_SIZE) #define FW_PAGE_SHIFT (12) struct qedr_dev; @@ -178,24 +174,18 @@ struct qedr_dev { u8 user_dpm_enabled; }; -#define QEDR_MAX_SQ_PBL (0x8000) #define QEDR_MAX_SQ_PBL_ENTRIES (0x10000 / sizeof(void *)) #define QEDR_SQE_ELEMENT_SIZE (sizeof(struct rdma_sq_sge)) #define QEDR_MAX_SQE_ELEMENTS_PER_SQE (ROCE_REQ_MAX_SINGLE_SQ_WQE_SIZE / \ QEDR_SQE_ELEMENT_SIZE) -#define QEDR_MAX_SQE_ELEMENTS_PER_PAGE ((RDMA_RING_PAGE_SIZE) / \ - QEDR_SQE_ELEMENT_SIZE) #define QEDR_MAX_SQE ((QEDR_MAX_SQ_PBL_ENTRIES) *\ (RDMA_RING_PAGE_SIZE) / \ (QEDR_SQE_ELEMENT_SIZE) /\ (QEDR_MAX_SQE_ELEMENTS_PER_SQE)) /* RQ */ -#define QEDR_MAX_RQ_PBL (0x2000) #define QEDR_MAX_RQ_PBL_ENTRIES (0x10000 / sizeof(void *)) #define QEDR_RQE_ELEMENT_SIZE (sizeof(struct rdma_rq_sge)) #define QEDR_MAX_RQE_ELEMENTS_PER_RQE (RDMA_MAX_SGE_PER_RQ_WQE) -#define QEDR_MAX_RQE_ELEMENTS_PER_PAGE ((RDMA_RING_PAGE_SIZE) / \ - QEDR_RQE_ELEMENT_SIZE) #define QEDR_MAX_RQE ((QEDR_MAX_RQ_PBL_ENTRIES) *\ (RDMA_RING_PAGE_SIZE) / \ (QEDR_RQE_ELEMENT_SIZE) /\ @@ -210,12 +200,8 @@ struct qedr_dev { #define QEDR_ROCE_MAX_CNQ_SIZE (0x4000) -#define QEDR_MAX_PORT (1) #define QEDR_PORT (1) -#define QEDR_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) - -#define QEDR_ROCE_PKEY_MAX 1 #define QEDR_ROCE_PKEY_TABLE_LEN 1 #define QEDR_ROCE_PKEY_DEFAULT 0xffff @@ -336,12 +322,6 @@ struct qedr_qp_hwq_info { union db_prod32 iwarp_db2_data; }; -#define QEDR_INC_SW_IDX(p_info, index) \ - do { \ - p_info->index = (p_info->index + 1) & \ - qed_chain_get_capacity(p_info->pbl) \ - } while (0) - struct qedr_srq_hwq_info { u32 max_sges; u32 max_wr; From 325e3b5431ddd27c5f93156b36838a351e3b2f72 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Jan 2026 15:51:38 +0200 Subject: [PATCH 22/66] RDMA/ocrdma: Remove unused OCRDMA_UVERBS definition The OCRDMA_UVERBS() macro is unused, so remove it to clean up the code. Link: https://patch.msgid.link/20260104-ib-core-misc-v1-6-00367f77f3a8@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/ocrdma/ocrdma.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 5eb61c110090..5584b781e2e8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -67,8 +67,6 @@ #define OC_SKH_DEVICE_VF 0x728 #define OCRDMA_MAX_AH 512 -#define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) - #define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo) #define EQ_INTR_PER_SEC_THRSH_HI 150000 #define EQ_INTR_PER_SEC_THRSH_LOW 100000 From 83835f7c07b523c7ca2a5ad0a511670b5810539e Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Wed, 7 Jan 2026 17:15:08 +0100 Subject: [PATCH 23/66] RDMA/rtrs-srv: fix SG mapping This fixes the following error on the server side: RTRS server session allocation failed: -EINVAL The error is caused by the caller of `ib_dma_map_sg()` not expecting fewer mapped entries than requested. Returning fewer entries is perfectly normal and can be easily reproduced on a machine with an enabled IOMMU.
The fix is to treat any positive number of mapped sg entries as a successful mapping and to cache the DMA addresses by traversing the modified SG table. Fixes: 9cb837480424 ("RDMA/rtrs: server: main functionality") Signed-off-by: Roman Penyaev Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20260107161517.56357-2-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 7a402eb8e0bf..adb798e2a54a 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -595,7 +595,7 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) srv_path->mrs_num++) { struct rtrs_srv_mr *srv_mr = &srv_path->mrs[srv_path->mrs_num]; struct scatterlist *s; - int nr, nr_sgt, chunks; + int nr, nr_sgt, chunks, ind; sgt = &srv_mr->sgt; chunks = chunks_per_mr * srv_path->mrs_num; @@ -625,7 +625,7 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) } nr = ib_map_mr_sg(mr, sgt->sgl, nr_sgt, NULL, max_chunk_size); - if (nr != nr_sgt) { + if (nr < nr_sgt) { err = nr < 0 ? nr : -EINVAL; goto dereg_mr; } @@ -641,9 +641,24 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) goto dereg_mr; } } - /* Eventually dma addr for each chunk can be cached */ - for_each_sg(sgt->sgl, s, nr_sgt, i) - srv_path->dma_addr[chunks + i] = sg_dma_address(s); + + /* + * Cache DMA addresses by traversing sg entries. If + * regions were merged, an inner loop is required to + * populate the DMA address array by traversing larger + * regions. + */ + ind = chunks; + for_each_sg(sgt->sgl, s, nr_sgt, i) { + unsigned int dma_len = sg_dma_len(s); + u64 dma_addr = sg_dma_address(s); + u64 dma_addr_end = dma_addr + dma_len; + + do { + srv_path->dma_addr[ind++] = dma_addr; + dma_addr += max_chunk_size; + } while (dma_addr < dma_addr_end); + } ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); srv_mr->mr = mr; From d6cc7b0d6191e3762296dd32a8d9c2e276b950dd Mon Sep 17 00:00:00 2001 From: Kim Zhu Date: Wed, 7 Jan 2026 17:15:09 +0100 Subject: [PATCH 24/66] RDMA/rtrs: Add error description to the logs Print the error description instead of the error number.
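For reference, the idiom this series adopts is printk's %pe format
specifier, which prints a symbolic error name when handed an
ERR_PTR()-encoded value. A minimal sketch of the before/after pattern
(the function name is illustrative, not taken from the driver):

#include <linux/err.h>
#include <linux/printk.h>

static void report_failure(int err)
{
	/* Before: prints "failed: -12", leaving the reader to decode it. */
	pr_err("failed: %d\n", err);

	/* After: prints "failed: -ENOMEM", no decoding needed. */
	pr_err("failed: %pe\n", ERR_PTR(err));
}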
Signed-off-by: Kim Zhu Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20260107161517.56357-3-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c | 8 +- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 88 ++++++++++---------- drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c | 12 +-- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 78 ++++++++--------- drivers/infiniband/ulp/rtrs/rtrs.c | 9 +- 5 files changed, 100 insertions(+), 95 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c index 4aa80c9388f0..287e0ea43287 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c @@ -439,19 +439,19 @@ int rtrs_clt_create_path_files(struct rtrs_clt_path *clt_path) clt->kobj_paths, "%s", str); if (err) { - pr_err("kobject_init_and_add: %d\n", err); + pr_err("kobject_init_and_add: %pe\n", ERR_PTR(err)); kobject_put(&clt_path->kobj); return err; } err = sysfs_create_group(&clt_path->kobj, &rtrs_clt_path_attr_group); if (err) { - pr_err("sysfs_create_group(): %d\n", err); + pr_err("sysfs_create_group(): %pe\n", ERR_PTR(err)); goto put_kobj; } err = kobject_init_and_add(&clt_path->stats->kobj_stats, &ktype_stats, &clt_path->kobj, "stats"); if (err) { - pr_err("kobject_init_and_add: %d\n", err); + pr_err("kobject_init_and_add: %pe\n", ERR_PTR(err)); kobject_put(&clt_path->stats->kobj_stats); goto remove_group; } @@ -459,7 +459,7 @@ int rtrs_clt_create_path_files(struct rtrs_clt_path *clt_path) err = sysfs_create_group(&clt_path->stats->kobj_stats, &rtrs_clt_stats_attr_group); if (err) { - pr_err("failed to create stats sysfs group, err: %d\n", err); + pr_err("failed to create stats sysfs group, err: %pe\n", ERR_PTR(err)); goto put_kobj_stats; } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 71387811b281..ee7d505ff016 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -422,8 +422,8 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, refcount_inc(&req->ref); err = rtrs_inv_rkey(req); if (err) { - rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %d\n", - req->mr->rkey, err); + rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %pe\n", + req->mr->rkey, ERR_PTR(err)); } else if (can_wait) { wait_for_completion(&req->inv_comp); } @@ -443,8 +443,8 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, if (errno) { rtrs_err_rl(con->c.path, - "IO %s request failed: error=%d path=%s [%s:%u] notify=%d\n", - req->dir == DMA_TO_DEVICE ? "write" : "read", errno, + "IO %s request failed: error=%pe path=%s [%s:%u] notify=%d\n", + req->dir == DMA_TO_DEVICE ? 
"write" : "read", ERR_PTR(errno), kobject_name(&clt_path->kobj), clt_path->hca_name, clt_path->hca_port, notify); } @@ -514,7 +514,7 @@ static void rtrs_clt_recv_done(struct rtrs_clt_con *con, struct ib_wc *wc) cqe); err = rtrs_iu_post_recv(&con->c, iu); if (err) { - rtrs_err(con->c.path, "post iu failed %d\n", err); + rtrs_err(con->c.path, "post iu failed %pe\n", ERR_PTR(err)); rtrs_rdma_error_recovery(con); } } @@ -659,8 +659,8 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) else err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); if (err) { - rtrs_err(con->c.path, "rtrs_post_recv_empty(): %d\n", - err); + rtrs_err(con->c.path, "rtrs_post_recv_empty(): %pe\n", + ERR_PTR(err)); rtrs_rdma_error_recovery(con); } break; @@ -731,8 +731,8 @@ static int post_recv_path(struct rtrs_clt_path *clt_path) err = post_recv_io(to_clt_con(clt_path->s.con[cid]), q_size); if (err) { - rtrs_err(clt_path->clt, "post_recv_io(), err: %d\n", - err); + rtrs_err(clt_path->clt, "post_recv_io(), err: %pe\n", + ERR_PTR(err)); return err; } } @@ -1122,8 +1122,8 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) ret = rtrs_map_sg_fr(req, count); if (ret < 0) { rtrs_err_rl(s, - "Write request failed, failed to map fast reg. data, err: %d\n", - ret); + "Write request failed, failed to map fast reg. data, err: %pe\n", + ERR_PTR(ret)); ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist, req->sg_cnt, req->dir); return ret; @@ -1150,9 +1150,9 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) imm, wr, NULL); if (ret) { rtrs_err_rl(s, - "Write request failed: error=%d path=%s [%s:%u]\n", - ret, kobject_name(&clt_path->kobj), clt_path->hca_name, - clt_path->hca_port); + "Write request failed: error=%pe path=%s [%s:%u]\n", + ERR_PTR(ret), kobject_name(&clt_path->kobj), + clt_path->hca_name, clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); if (req->mr->need_inval) { @@ -1208,8 +1208,8 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) ret = rtrs_map_sg_fr(req, count); if (ret < 0) { rtrs_err_rl(s, - "Read request failed, failed to map fast reg. data, err: %d\n", - ret); + "Read request failed, failed to map fast reg. 
data, err: %pe\n", + ERR_PTR(ret)); ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt, req->dir); return ret; @@ -1260,9 +1260,9 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) req->data_len, imm, wr); if (ret) { rtrs_err_rl(s, - "Read request failed: error=%d path=%s [%s:%u]\n", - ret, kobject_name(&clt_path->kobj), clt_path->hca_name, - clt_path->hca_port); + "Read request failed: error=%pe path=%s [%s:%u]\n", + ERR_PTR(ret), kobject_name(&clt_path->kobj), + clt_path->hca_name, clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); req->mr->need_inval = false; @@ -1774,12 +1774,12 @@ static int rtrs_rdma_addr_resolved(struct rtrs_clt_con *con) err = create_con_cq_qp(con); mutex_unlock(&con->con_mutex); if (err) { - rtrs_err(s, "create_con_cq_qp(), err: %d\n", err); + rtrs_err(s, "create_con_cq_qp(), err: %pe\n", ERR_PTR(err)); return err; } err = rdma_resolve_route(con->c.cm_id, RTRS_CONNECT_TIMEOUT_MS); if (err) - rtrs_err(s, "Resolving route failed, err: %d\n", err); + rtrs_err(s, "Resolving route failed, err: %pe\n", ERR_PTR(err)); return err; } @@ -1813,7 +1813,7 @@ static int rtrs_rdma_route_resolved(struct rtrs_clt_con *con) err = rdma_connect_locked(con->c.cm_id, ¶m); if (err) - rtrs_err(clt, "rdma_connect_locked(): %d\n", err); + rtrs_err(clt, "rdma_connect_locked(): %pe\n", ERR_PTR(err)); return err; } @@ -1846,8 +1846,8 @@ static int rtrs_rdma_conn_established(struct rtrs_clt_con *con, } errno = le16_to_cpu(msg->errno); if (errno) { - rtrs_err(clt, "Invalid RTRS message: errno %d\n", - errno); + rtrs_err(clt, "Invalid RTRS message: errno %pe\n", + ERR_PTR(errno)); return -ECONNRESET; } if (con->c.cid == 0) { @@ -1936,12 +1936,12 @@ static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con, "Previous session is still exists on the server, please reconnect later\n"); else rtrs_err(s, - "Connect rejected: status %d (%s), rtrs errno %d\n", - status, rej_msg, errno); + "Connect rejected: status %d (%s), rtrs errno %pe\n", + status, rej_msg, ERR_PTR(errno)); } else { rtrs_err(s, - "Connect rejected but with malformed message: status %d (%s)\n", - status, rej_msg); + "Connect rejected but with malformed message: status %pe (%s)\n", + ERR_PTR(status), rej_msg); } return -ECONNRESET; @@ -2008,27 +2008,27 @@ static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_ADDR_CHANGE: case RDMA_CM_EVENT_TIMEWAIT_EXIT: - rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n", - rdma_event_msg(ev->event), ev->status); + rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), ERR_PTR(ev->status)); cm_err = -ECONNRESET; break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: - rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n", - rdma_event_msg(ev->event), ev->status); + rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), ERR_PTR(ev->status)); cm_err = -EHOSTUNREACH; break; case RDMA_CM_EVENT_DEVICE_REMOVAL: /* * Device removal is a special case. Queue close and return 0. 
*/ - rtrs_wrn_rl(s, "CM event: %s, status: %d\n", rdma_event_msg(ev->event), - ev->status); + rtrs_wrn_rl(s, "CM event: %s, status: %pe\n", rdma_event_msg(ev->event), + ERR_PTR(ev->status)); rtrs_clt_close_conns(clt_path, false); return 0; default: - rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %d)\n", - rdma_event_msg(ev->event), ev->status); + rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), ERR_PTR(ev->status)); cm_err = -ECONNRESET; break; } @@ -2065,14 +2065,14 @@ static int create_cm(struct rtrs_clt_con *con) /* allow the port to be reused */ err = rdma_set_reuseaddr(cm_id, 1); if (err != 0) { - rtrs_err(s, "Set address reuse failed, err: %d\n", err); + rtrs_err(s, "Set address reuse failed, err: %pe\n", ERR_PTR(err)); return err; } err = rdma_resolve_addr(cm_id, (struct sockaddr *)&clt_path->s.src_addr, (struct sockaddr *)&clt_path->s.dst_addr, RTRS_CONNECT_TIMEOUT_MS); if (err) { - rtrs_err(s, "Failed to resolve address, err: %d\n", err); + rtrs_err(s, "Failed to resolve address, err: %pe\n", ERR_PTR(err)); return err; } /* @@ -2547,7 +2547,7 @@ static int rtrs_send_path_info(struct rtrs_clt_path *clt_path) /* Prepare for getting info response */ err = rtrs_iu_post_recv(&usr_con->c, rx_iu); if (err) { - rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %d\n", err); + rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %pe\n", ERR_PTR(err)); goto out; } rx_iu = NULL; @@ -2563,7 +2563,7 @@ static int rtrs_send_path_info(struct rtrs_clt_path *clt_path) /* Send info request */ err = rtrs_iu_post_send(&usr_con->c, tx_iu, sizeof(*msg), NULL); if (err) { - rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %d\n", err); + rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %pe\n", ERR_PTR(err)); goto out; } tx_iu = NULL; @@ -2614,15 +2614,15 @@ static int init_path(struct rtrs_clt_path *clt_path) err = init_conns(clt_path); if (err) { rtrs_err(clt_path->clt, - "init_conns() failed: err=%d path=%s [%s:%u]\n", err, - str, clt_path->hca_name, clt_path->hca_port); + "init_conns() failed: err=%pe path=%s [%s:%u]\n", + ERR_PTR(err), str, clt_path->hca_name, clt_path->hca_port); goto out; } err = rtrs_send_path_info(clt_path); if (err) { rtrs_err(clt_path->clt, - "rtrs_send_path_info() failed: err=%d path=%s [%s:%u]\n", - err, str, clt_path->hca_name, clt_path->hca_port); + "rtrs_send_path_info() failed: err=%pe path=%s [%s:%u]\n", + ERR_PTR(err), str, clt_path->hca_name, clt_path->hca_port); goto out; } rtrs_clt_path_up(clt_path); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index 3f305e694fe8..51727c7d710c 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -176,14 +176,14 @@ static int rtrs_srv_create_once_sysfs_root_folders(struct rtrs_srv_path *srv_pat dev_set_uevent_suppress(&srv->dev, true); err = device_add(&srv->dev); if (err) { - pr_err("device_add(): %d\n", err); + pr_err("device_add(): %pe\n", ERR_PTR(err)); put_device(&srv->dev); goto unlock; } srv->kobj_paths = kobject_create_and_add("paths", &srv->dev.kobj); if (!srv->kobj_paths) { err = -ENOMEM; - pr_err("kobject_create_and_add(): %d\n", err); + pr_err("kobject_create_and_add(): %pe\n", ERR_PTR(err)); device_del(&srv->dev); put_device(&srv->dev); goto unlock; @@ -237,14 +237,14 @@ static int rtrs_srv_create_stats_files(struct rtrs_srv_path *srv_path) err = kobject_init_and_add(&srv_path->stats->kobj_stats, &ktype_stats, &srv_path->kobj, "stats"); if (err) 
{ - rtrs_err(s, "kobject_init_and_add(): %d\n", err); + rtrs_err(s, "kobject_init_and_add(): %pe\n", ERR_PTR(err)); kobject_put(&srv_path->stats->kobj_stats); return err; } err = sysfs_create_group(&srv_path->stats->kobj_stats, &rtrs_srv_stats_attr_group); if (err) { - rtrs_err(s, "sysfs_create_group(): %d\n", err); + rtrs_err(s, "sysfs_create_group(): %pe\n", ERR_PTR(err)); goto err; } @@ -276,12 +276,12 @@ int rtrs_srv_create_path_files(struct rtrs_srv_path *srv_path) err = kobject_init_and_add(&srv_path->kobj, &ktype, srv->kobj_paths, "%s", str); if (err) { - rtrs_err(s, "kobject_init_and_add(): %d\n", err); + rtrs_err(s, "kobject_init_and_add(): %pe\n", ERR_PTR(err)); goto destroy_root; } err = sysfs_create_group(&srv_path->kobj, &rtrs_srv_path_attr_group); if (err) { - rtrs_err(s, "sysfs_create_group(): %d\n", err); + rtrs_err(s, "sysfs_create_group(): %pe\n", ERR_PTR(err)); goto put_kobj; } err = rtrs_srv_create_stats_files(srv_path); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index adb798e2a54a..be44fd1b9944 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -317,8 +317,8 @@ static int rdma_write_sg(struct rtrs_srv_op *id) err = ib_post_send(id->con->c.qp, &id->tx_wr.wr, NULL); if (err) rtrs_err(s, - "Posting RDMA-Write-Request to QP failed, err: %d\n", - err); + "Posting RDMA-Write-Request to QP failed, err: %pe\n", + ERR_PTR(err)); return err; } @@ -434,8 +434,8 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, err = ib_post_send(id->con->c.qp, wr, NULL); if (err) - rtrs_err_rl(s, "Posting RDMA-Reply to QP failed, err: %d\n", - err); + rtrs_err_rl(s, "Posting RDMA-Reply to QP failed, err: %pe\n", + ERR_PTR(err)); return err; } @@ -519,8 +519,8 @@ bool rtrs_srv_resp_rdma(struct rtrs_srv_op *id, int status) err = rdma_write_sg(id); if (err) { - rtrs_err_rl(s, "IO response failed: %d: srv_path=%s\n", err, - kobject_name(&srv_path->kobj)); + rtrs_err_rl(s, "IO response failed: %pe: srv_path=%s\n", + ERR_PTR(err), kobject_name(&srv_path->kobj)); close_path(srv_path); } out: @@ -637,7 +637,7 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) DMA_TO_DEVICE, rtrs_srv_rdma_done); if (!srv_mr->iu) { err = -ENOMEM; - rtrs_err(ss, "rtrs_iu_alloc(), err: %d\n", err); + rtrs_err(ss, "rtrs_iu_alloc(), err: %pe\n", ERR_PTR(err)); goto dereg_mr; } } @@ -813,7 +813,7 @@ static int process_info_req(struct rtrs_srv_con *con, err = post_recv_path(srv_path); if (err) { - rtrs_err(s, "post_recv_path(), err: %d\n", err); + rtrs_err(s, "post_recv_path(), err: %pe\n", ERR_PTR(err)); return err; } @@ -876,7 +876,7 @@ static int process_info_req(struct rtrs_srv_con *con, get_device(&srv_path->srv->dev); err = rtrs_srv_change_state(srv_path, RTRS_SRV_CONNECTED); if (!err) { - rtrs_err(s, "rtrs_srv_change_state(), err: %d\n", err); + rtrs_err(s, "rtrs_srv_change_state(), err: %pe\n", ERR_PTR(err)); goto iu_free; } @@ -890,7 +890,7 @@ static int process_info_req(struct rtrs_srv_con *con, */ err = rtrs_srv_path_up(srv_path); if (err) { - rtrs_err(s, "rtrs_srv_path_up(), err: %d\n", err); + rtrs_err(s, "rtrs_srv_path_up(), err: %pe\n", ERR_PTR(err)); goto iu_free; } @@ -901,7 +901,7 @@ static int process_info_req(struct rtrs_srv_con *con, /* Send info response */ err = rtrs_iu_post_send(&con->c, tx_iu, tx_sz, reg_wr); if (err) { - rtrs_err(s, "rtrs_iu_post_send(), err: %d\n", err); + rtrs_err(s, "rtrs_iu_post_send(), err: %pe\n", ERR_PTR(err)); iu_free: rtrs_iu_free(tx_iu, 
srv_path->s.dev->ib_dev, 1); } @@ -969,7 +969,7 @@ static int post_recv_info_req(struct rtrs_srv_con *con) /* Prepare for getting info response */ err = rtrs_iu_post_recv(&con->c, rx_iu); if (err) { - rtrs_err(s, "rtrs_iu_post_recv(), err: %d\n", err); + rtrs_err(s, "rtrs_iu_post_recv(), err: %pe\n", ERR_PTR(err)); rtrs_iu_free(rx_iu, srv_path->s.dev->ib_dev, 1); return err; } @@ -1015,7 +1015,7 @@ static int post_recv_path(struct rtrs_srv_path *srv_path) err = post_recv_io(to_srv_con(srv_path->s.con[cid]), q_size); if (err) { - rtrs_err(s, "post_recv_io(), err: %d\n", err); + rtrs_err(s, "post_recv_io(), err: %pe\n", ERR_PTR(err)); return err; } } @@ -1063,8 +1063,8 @@ static void process_read(struct rtrs_srv_con *con, if (ret) { rtrs_err_rl(s, - "Processing read request failed, user module cb reported for msg_id %d, err: %d\n", - buf_id, ret); + "Processing read request failed, user module cb reported for msg_id %d, err: %pe\n", + buf_id, ERR_PTR(ret)); goto send_err_msg; } @@ -1074,8 +1074,8 @@ send_err_msg: ret = send_io_resp_imm(con, id, ret); if (ret < 0) { rtrs_err_rl(s, - "Sending err msg for failed RDMA-Write-Req failed, msg_id %d, err: %d\n", - buf_id, ret); + "Sending err msg for failed RDMA-Write-Req failed, msg_id %d, err: %pe\n", + buf_id, ERR_PTR(ret)); close_path(srv_path); } rtrs_srv_put_ops_ids(srv_path); @@ -1115,8 +1115,8 @@ static void process_write(struct rtrs_srv_con *con, data + data_len, usr_len); if (ret) { rtrs_err_rl(s, - "Processing write request failed, user module callback reports err: %d\n", - ret); + "Processing write request failed, user module callback reports err: %pe\n", + ERR_PTR(ret)); goto send_err_msg; } @@ -1126,8 +1126,8 @@ send_err_msg: ret = send_io_resp_imm(con, id, ret); if (ret < 0) { rtrs_err_rl(s, - "Processing write request failed, sending I/O response failed, msg_id %d, err: %d\n", - buf_id, ret); + "Processing write request failed, sending I/O response failed, msg_id %d, err: %pe\n", + buf_id, ERR_PTR(ret)); close_path(srv_path); } rtrs_srv_put_ops_ids(srv_path); @@ -1257,7 +1257,8 @@ static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc) srv_path->s.hb_missed_cnt = 0; err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); if (err) { - rtrs_err(s, "rtrs_post_recv(), err: %d\n", err); + rtrs_err(s, "rtrs_post_recv(), err: %pe\n", + ERR_PTR(err)); close_path(srv_path); break; } @@ -1282,8 +1283,8 @@ static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc) mr->msg_id = msg_id; err = rtrs_srv_inv_rkey(con, mr); if (err) { - rtrs_err(s, "rtrs_post_recv(), err: %d\n", - err); + rtrs_err(s, "rtrs_post_recv(), err: %pe\n", + ERR_PTR(err)); close_path(srv_path); break; } @@ -1632,7 +1633,7 @@ static int rtrs_rdma_do_accept(struct rtrs_srv_path *srv_path, err = rdma_accept(cm_id, ¶m); if (err) - pr_err("rdma_accept(), err: %d\n", err); + pr_err("rdma_accept(), err: %pe\n", ERR_PTR(err)); return err; } @@ -1650,7 +1651,7 @@ static int rtrs_rdma_do_reject(struct rdma_cm_id *cm_id, int errno) err = rdma_reject(cm_id, &msg, sizeof(msg), IB_CM_REJ_CONSUMER_DEFINED); if (err) - pr_err("rdma_reject(), err: %d\n", err); + pr_err("rdma_reject(), err: %pe\n", ERR_PTR(err)); /* Bounce errno back */ return errno; @@ -1726,7 +1727,7 @@ static int create_con(struct rtrs_srv_path *srv_path, max_send_wr, max_recv_wr, IB_POLL_WORKQUEUE); if (err) { - rtrs_err(s, "rtrs_cq_qp_create(), err: %d\n", err); + rtrs_err(s, "rtrs_cq_qp_create(), err: %pe\n", ERR_PTR(err)); goto free_con; } if (con->c.cid == 0) { @@ -1941,7 +1942,7 @@ static int 
rtrs_rdma_connect(struct rdma_cm_id *cm_id, } err = create_con(srv_path, cm_id, cid); if (err) { - rtrs_err((&srv_path->s), "create_con(), error %d\n", err); + rtrs_err((&srv_path->s), "create_con(), error %pe\n", ERR_PTR(err)); rtrs_rdma_do_reject(cm_id, err); /* * Since session has other connections we follow normal way @@ -1952,7 +1953,8 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id, } err = rtrs_rdma_do_accept(srv_path, cm_id); if (err) { - rtrs_err((&srv_path->s), "rtrs_rdma_do_accept(), error %d\n", err); + rtrs_err((&srv_path->s), "rtrs_rdma_do_accept(), error %pe\n", + ERR_PTR(err)); rtrs_rdma_do_reject(cm_id, err); /* * Since current connection was successfully added to the @@ -2003,8 +2005,8 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_REJECTED: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: - rtrs_err(s, "CM error (CM event: %s, err: %d)\n", - rdma_event_msg(ev->event), ev->status); + rtrs_err(s, "CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), ERR_PTR(ev->status)); fallthrough; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_ADDR_CHANGE: @@ -2013,8 +2015,8 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id, close_path(srv_path); break; default: - pr_err("Ignoring unexpected CM event %s, err %d\n", - rdma_event_msg(ev->event), ev->status); + pr_err("Ignoring unexpected CM event %s, err %pe\n", + rdma_event_msg(ev->event), ERR_PTR(ev->status)); break; } @@ -2038,13 +2040,13 @@ static struct rdma_cm_id *rtrs_srv_cm_init(struct rtrs_srv_ctx *ctx, } ret = rdma_bind_addr(cm_id, addr); if (ret) { - pr_err("Binding RDMA address failed, err: %d\n", ret); + pr_err("Binding RDMA address failed, err: %pe\n", ERR_PTR(ret)); goto err_cm; } ret = rdma_listen(cm_id, 64); if (ret) { - pr_err("Listening on RDMA connection failed, err: %d\n", - ret); + pr_err("Listening on RDMA connection failed, err: %pe\n", + ERR_PTR(ret)); goto err_cm; } @@ -2322,8 +2324,8 @@ static int __init rtrs_server_init(void) err = check_module_params(); if (err) { - pr_err("Failed to load module, invalid module parameters, err: %d\n", - err); + pr_err("Failed to load module, invalid module parameters, err: %pe\n", + ERR_PTR(err)); return err; } err = class_register(&rtrs_dev_class); diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c index bf38ac6f87c4..bc1208ae8216 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -273,7 +273,8 @@ static int create_qp(struct rtrs_con *con, struct ib_pd *pd, ret = rdma_create_qp(cm_id, pd, &init_attr); if (ret) { - rtrs_err(con->path, "Creating QP failed, err: %d\n", ret); + rtrs_err(con->path, "Creating QP failed, err: %pe\n", + ERR_PTR(ret)); return ret; } con->qp = cm_id->qp; @@ -341,7 +342,8 @@ void rtrs_send_hb_ack(struct rtrs_path *path) err = rtrs_post_rdma_write_imm_empty(usr_con, path->hb_cqe, imm, NULL); if (err) { - rtrs_err(path, "send HB ACK failed, errno: %d\n", err); + rtrs_err(path, "send HB ACK failed, errno: %pe\n", + ERR_PTR(err)); path->hb_err_handler(usr_con); return; } @@ -375,7 +377,8 @@ static void hb_work(struct work_struct *work) err = rtrs_post_rdma_write_imm_empty(usr_con, path->hb_cqe, imm, NULL); if (err) { - rtrs_err(path, "HB send failed, errno: %d\n", err); + rtrs_err(path, "HB send failed, errno: %pe\n", + ERR_PTR(err)); path->hb_err_handler(usr_con); return; } From 9293e042782df38434191de8f3703fe2cb808ad6 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Wed, 7 Jan 2026 
17:15:10 +0100 Subject: [PATCH 25/66] RDMA/rtrs: Add optional support for IB_MR_TYPE_SG_GAPS Support IB_MR_TYPE_SG_GAPS, which has fewer limitations than the standard IB_MR_TYPE_MEM_REG; a few ULPs already support it. Signed-off-by: Md Haris Iqbal Signed-off-by: Kim Zhu Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20260107161517.56357-4-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 10 ++++++++-- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 13 ++++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index ee7d505ff016..58042d835045 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1359,7 +1359,9 @@ static void free_path_reqs(struct rtrs_clt_path *clt_path) static int alloc_path_reqs(struct rtrs_clt_path *clt_path) { + struct ib_device *ib_dev = clt_path->s.dev->ib_dev; struct rtrs_clt_io_req *req; + enum ib_mr_type mr_type; int i, err = -ENOMEM; clt_path->reqs = kcalloc(clt_path->queue_depth, @@ -1368,6 +1370,11 @@ static int alloc_path_reqs(struct rtrs_clt_path *clt_path) if (!clt_path->reqs) return -ENOMEM; + if (ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + for (i = 0; i < clt_path->queue_depth; ++i) { req = &clt_path->reqs[i]; req->iu = rtrs_iu_alloc(1, clt_path->max_hdr_size, GFP_KERNEL, @@ -1381,8 +1388,7 @@ static int alloc_path_reqs(struct rtrs_clt_path *clt_path) if (!req->sge) goto out; - req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd, - IB_MR_TYPE_MEM_REG, + req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd, mr_type, clt_path->max_pages_per_mr); if (IS_ERR(req->mr)) { err = PTR_ERR(req->mr); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index be44fd1b9944..7ed8910ef7f5 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -562,13 +562,15 @@ static void unmap_cont_bufs(struct rtrs_srv_path *srv_path) static int map_cont_bufs(struct rtrs_srv_path *srv_path) { + struct ib_device *ib_dev = srv_path->s.dev->ib_dev; struct rtrs_srv_sess *srv = srv_path->srv; struct rtrs_path *ss = &srv_path->s; int i, err, mrs_num; unsigned int chunk_bits; + enum ib_mr_type mr_type; int chunks_per_mr = 1; - struct ib_mr *mr; struct sg_table *sgt; + struct ib_mr *mr; /* * Here we map queue_depth chunks to MR. Firstly we have to @@ -617,8 +619,13 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) err = -EINVAL; goto free_sg; } - mr = ib_alloc_mr(srv_path->s.dev->ib_pd, IB_MR_TYPE_MEM_REG, - nr_sgt); + + if (ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + + mr = ib_alloc_mr(srv_path->s.dev->ib_pd, mr_type, nr_sgt); if (IS_ERR(mr)) { err = PTR_ERR(mr); goto unmap_sg; From f85febf57bb567b59b41a13c9bf845a73b616d10 Mon Sep 17 00:00:00 2001 From: Kim Zhu Date: Wed, 7 Jan 2026 17:15:11 +0100 Subject: [PATCH 26/66] RDMA/rtrs: Improve error logging for RDMA cm events The member variable status in struct rdma_cm_event is used for both Linux error codes and the errors defined in the RDMA stack.
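In other words, negative status values are Linux errno codes that %pe
can print symbolically, while positive values are reject codes that
rdma_reject_msg() can translate. A minimal sketch of the resulting
dispatch (the helper below is hypothetical; the real changes live in
the CM event handlers shown in the diff):

#include <linux/err.h>
#include <linux/printk.h>
#include <rdma/rdma_cm.h>

static void log_cm_status(struct rdma_cm_id *cm_id, int status)
{
	if (status < 0)		/* Linux errno, e.g. -ETIMEDOUT */
		pr_warn("CM error: %pe\n", ERR_PTR(status));
	else if (status > 0)	/* reject reason, e.g. consumer defined */
		pr_warn("CM error: %s\n", rdma_reject_msg(cm_id, status));
}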
Signed-off-by: Kim Zhu Reviewed-by: Md Haris Iqbal Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20260107161517.56357-5-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 46 ++++++++++++++++++++------ drivers/infiniband/ulp/rtrs/rtrs-srv.c | 22 +++++++++--- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 58042d835045..1cd4d333d417 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1946,8 +1946,8 @@ static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con, status, rej_msg, ERR_PTR(errno)); } else { rtrs_err(s, - "Connect rejected but with malformed message: status %pe (%s)\n", - ERR_PTR(status), rej_msg); + "Connect rejected but with malformed message: status %d (%s)\n", + status, rej_msg); } return -ECONNRESET; @@ -2014,27 +2014,53 @@ static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_ADDR_CHANGE: case RDMA_CM_EVENT_TIMEWAIT_EXIT: - rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n", - rdma_event_msg(ev->event), ERR_PTR(ev->status)); + if (ev->status < 0) { + rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), ERR_PTR(ev->status)); + } else if (ev->status > 0) { + rtrs_wrn(s, "CM error (CM event: %s, err: %s)\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } cm_err = -ECONNRESET; break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: - rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n", - rdma_event_msg(ev->event), ERR_PTR(ev->status)); + if (ev->status < 0) { + rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), + ERR_PTR(ev->status)); + } else if (ev->status > 0) { + rtrs_wrn(s, "CM error (CM event: %s, err: %s)\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } cm_err = -EHOSTUNREACH; break; case RDMA_CM_EVENT_DEVICE_REMOVAL: /* * Device removal is a special case. Queue close and return 0. 
*/ - rtrs_wrn_rl(s, "CM event: %s, status: %pe\n", rdma_event_msg(ev->event), - ERR_PTR(ev->status)); + if (ev->status < 0) { + rtrs_wrn_rl(s, "CM event: %s, status: %pe\n", + rdma_event_msg(ev->event), + ERR_PTR(ev->status)); + } else if (ev->status > 0) { + rtrs_wrn_rl(s, "CM event: %s, status: %s\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } rtrs_clt_close_conns(clt_path, false); return 0; default: - rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %pe)\n", - rdma_event_msg(ev->event), ERR_PTR(ev->status)); + if (ev->status < 0) { + rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), ERR_PTR(ev->status)); + } else if (ev->status > 0) { + rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %s)\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } cm_err = -ECONNRESET; break; } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 7ed8910ef7f5..9b8567e5ea38 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -2012,8 +2012,15 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_REJECTED: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: - rtrs_err(s, "CM error (CM event: %s, err: %pe)\n", - rdma_event_msg(ev->event), ERR_PTR(ev->status)); + if (ev->status < 0) { + rtrs_err(s, "CM error (CM event: %s, err: %pe)\n", + rdma_event_msg(ev->event), + ERR_PTR(ev->status)); + } else if (ev->status > 0) { + rtrs_err(s, "CM error (CM event: %s, err: %s)\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } fallthrough; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_ADDR_CHANGE: @@ -2022,8 +2029,15 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id, close_path(srv_path); break; default: - pr_err("Ignoring unexpected CM event %s, err %pe\n", - rdma_event_msg(ev->event), ERR_PTR(ev->status)); + if (ev->status < 0) { + pr_err("Ignoring unexpected CM event %s, err %pe\n", + rdma_event_msg(ev->event), + ERR_PTR(ev->status)); + } else if (ev->status > 0) { + pr_err("Ignoring unexpected CM event %s, err %s\n", + rdma_event_msg(ev->event), + rdma_reject_msg(cm_id, ev->status)); + } break; } From 781c35b5d570d3dd242cf0578a92c93ca63fc14f Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 7 Jan 2026 17:15:12 +0100 Subject: [PATCH 27/66] RDMA/rtrs-clt: Remove unused members in rtrs_clt_io_req Remove unused members from rtrs_clt_io_req. 
Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20260107161517.56357-6-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h index 0f57759b3080..986239ed2d3b 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h @@ -92,7 +92,6 @@ struct rtrs_permit { * rtrs_clt_io_req - describes one inflight IO request */ struct rtrs_clt_io_req { - struct list_head list; struct rtrs_iu *iu; struct scatterlist *sglist; /* list holding user data */ unsigned int sg_cnt; @@ -103,12 +102,10 @@ struct rtrs_clt_io_req { bool in_use; enum rtrs_mp_policy mp_policy; struct rtrs_clt_con *con; - struct rtrs_sg_desc *desc; struct ib_sge *sge; struct rtrs_permit *permit; enum dma_data_direction dir; void (*conf)(void *priv, int errno); - unsigned long start_jiffies; struct ib_mr *mr; struct ib_cqe inv_cqe; From c32eaba2d760ef0ec5426b207cf0ce750064cf36 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Wed, 7 Jan 2026 17:15:13 +0100 Subject: [PATCH 28/66] RDMA/rtrs-srv: Add check and closure for possible zombie paths During several network incidents, a number of RTRS paths for a session went through a disconnect and reconnect phase. However, some of those did not auto-reconnect successfully. Instead they failed with the following logs, On client, kernel: rtrs_client L1991: : Connect rejected: status 28 (consumer defined), rtrs errno -104 kernel: rtrs_client L2698: : init_conns() failed: err=-104 path=gid:@gid: [mlx4_0:1] On server, (log a) kernel: ibtrs_server L1868: <>: Connection already exists: 0 When the misbehaving path was removed, and add_path was called to re-add the path, the log on the client side changed to, (log b) kernel: rtrs_client L1991: : Connect rejected: status 28 (consumer defined), rtrs errno -17 There was no log on the server side for this, which is expected since there is no logging in that path, if (unlikely(__is_path_w_addr_exists(srv, &cm_id->route.addr))) { err = -EEXIST; goto err; Because of the following check on the server side, if (unlikely(sess->state != IBTRS_SRV_CONNECTING)) { ibtrs_err(s, "Session in wrong state: %s\n", .. we know that the path in (log a) was in CONNECTING state. The above state of the path persists for as long as we leave the session be. This means that the path is in some zombie state, probably waiting for the info_req packet to arrive, which never does. The changes in this commit do two things. 1) Add logs at places where we see the errors happening. The logs would shed more light on the state and lifetime of such zombie paths. 2) Close such zombie sessions, only if they are in CONNECTING state, and after an inactivity period of 30 seconds. i) The state check prevents closure of paths which are CONNECTED. Also, from the above logs and code, we already know that the path could only be in CONNECTING state, so we play it safe and narrow our impact surface area by closing only CONNECTING paths. ii) The inactivity period is to allow requests for other cids to finish processing, or for any stray packets to arrive/fail.
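The inactivity check itself reduces to the standard jiffies idiom,
sketched below with illustrative names (the actual implementation,
rtrs_srv_check_close_path(), is in the diff that follows):

#include <linux/jiffies.h>

#define MAX_CONN_TIMEOUT_MS 30000	/* close after 30 s of inactivity */

/* 'started' is the jiffies stamp taken when connection setup began;
 * zero means the check is disabled. */
static bool conn_timed_out(unsigned long started)
{
	return started &&
	       jiffies_to_msecs(jiffies - started) > MAX_CONN_TIMEOUT_MS;
}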
Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20260107161517.56357-7-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 45 +++++++++++++++++++++++--- drivers/infiniband/ulp/rtrs/rtrs-srv.h | 1 + 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 9b8567e5ea38..4e49c15fa970 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -905,6 +905,12 @@ static int process_info_req(struct rtrs_srv_con *con, tx_iu->dma_addr, tx_iu->size, DMA_TO_DEVICE); + /* + * Now disable zombie connection closing. Since from the logs and code, + * we know that it can never be in CONNECTED state. + */ + srv_path->connection_timeout = 0; + /* Send info response */ err = rtrs_iu_post_send(&con->c, tx_iu, tx_sz, reg_wr); if (err) { @@ -1531,17 +1537,38 @@ static int sockaddr_cmp(const struct sockaddr *a, const struct sockaddr *b) } } +/* Let's close connections which have been waiting for more than 30 seconds */ +#define RTRS_MAX_CONN_TIMEOUT 30000 + +static void rtrs_srv_check_close_path(struct rtrs_srv_path *srv_path) +{ + struct rtrs_path *s = &srv_path->s; + + if (srv_path->state == RTRS_SRV_CONNECTING && srv_path->connection_timeout && + (jiffies_to_msecs(jiffies - srv_path->connection_timeout) > RTRS_MAX_CONN_TIMEOUT)) { + rtrs_err(s, "Closing zombie path\n"); + close_path(srv_path); + } +} + static bool __is_path_w_addr_exists(struct rtrs_srv_sess *srv, struct rdma_addr *addr) { struct rtrs_srv_path *srv_path; - list_for_each_entry(srv_path, &srv->paths_list, s.entry) + list_for_each_entry(srv_path, &srv->paths_list, s.entry) { if (!sockaddr_cmp((struct sockaddr *)&srv_path->s.dst_addr, (struct sockaddr *)&addr->dst_addr) && !sockaddr_cmp((struct sockaddr *)&srv_path->s.src_addr, - (struct sockaddr *)&addr->src_addr)) + (struct sockaddr *)&addr->src_addr)) { + rtrs_err((&srv_path->s), + "Path (%s) with same addr exists (lifetime %u)\n", + rtrs_srv_state_str(srv_path->state), + (jiffies_to_msecs(jiffies - srv_path->connection_timeout))); + rtrs_srv_check_close_path(srv_path); return true; + } + } return false; } @@ -1779,7 +1806,6 @@ static struct rtrs_srv_path *__alloc_path(struct rtrs_srv_sess *srv, } if (__is_path_w_addr_exists(srv, &cm_id->route.addr)) { err = -EEXIST; - pr_err("Path with same addr exists\n"); goto err; } srv_path = kzalloc(sizeof(*srv_path), GFP_KERNEL); @@ -1826,6 +1852,7 @@ static struct rtrs_srv_path *__alloc_path(struct rtrs_srv_sess *srv, spin_lock_init(&srv_path->state_lock); INIT_WORK(&srv_path->close_work, rtrs_srv_close_work); rtrs_srv_init_hb(srv_path); + srv_path->connection_timeout = 0; srv_path->s.dev = rtrs_ib_dev_find_or_add(cm_id->device, &dev_pd); if (!srv_path->s.dev) { @@ -1931,8 +1958,10 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id, goto reject_w_err; } if (s->con[cid]) { - rtrs_err(s, "Connection already exists: %d\n", - cid); + rtrs_err(s, "Connection (%s) already exists: %d (lifetime %u)\n", + rtrs_srv_state_str(srv_path->state), cid, + (jiffies_to_msecs(jiffies - srv_path->connection_timeout))); + rtrs_srv_check_close_path(srv_path); mutex_unlock(&srv->paths_mutex); goto reject_w_err; } @@ -1947,6 +1976,12 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id, goto reject_w_err; } } + + /* + * Start of any connection creation resets the timeout for the path. 
+ */ + srv_path->connection_timeout = jiffies; + err = create_con(srv_path, cm_id, cid); if (err) { rtrs_err((&srv_path->s), "create_con(), error %pe\n", ERR_PTR(err)); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.h b/drivers/infiniband/ulp/rtrs/rtrs-srv.h index 014f85681f37..3d36876527f5 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.h @@ -89,6 +89,7 @@ struct rtrs_srv_path { unsigned int mem_bits; struct kobject kobj; struct rtrs_srv_stats *stats; + unsigned long connection_timeout; }; static inline struct rtrs_srv_path *to_srv_path(struct rtrs_path *s) From b034a10fdfc4fff547a4ee0602538a214534c426 Mon Sep 17 00:00:00 2001 From: Kim Zhu Date: Wed, 7 Jan 2026 17:15:14 +0100 Subject: [PATCH 29/66] RDMA/rtrs-srv: Rate-limit I/O path error logging Excessive error logging is making it difficult to identify the root cause of issues. Implement rate limiting to improve log clarity. Signed-off-by: Kim Zhu Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20260107161517.56357-8-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 4e49c15fa970..d5189f12d2f7 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -184,7 +184,7 @@ static void rtrs_srv_reg_mr_done(struct ib_cq *cq, struct ib_wc *wc) struct rtrs_srv_path *srv_path = to_srv_path(s); if (wc->status != IB_WC_SUCCESS) { - rtrs_err(s, "REG MR failed: %s\n", + rtrs_err_rl(s, "REG MR failed: %s\n", ib_wc_status_msg(wc->status)); close_path(srv_path); return; From 6405f72e7a3ad7567d16ad5b52d086f573c39548 Mon Sep 17 00:00:00 2001 From: Kim Zhu Date: Wed, 7 Jan 2026 17:15:15 +0100 Subject: [PATCH 30/66] RDMA/rtrs: Extend log message when a port fails Add the HCA name and port of this HCA. This would help with analysing and debugging the logs. The logs would look something like this, rtrs_server L2516: Handling event: port error (10). HCA name: mlx4_0, port num: 2 rtrs_client L3326: Handling event: port error (10). HCA name: mlx4_0, port num: 1 Signed-off-by: Kim Zhu Signed-off-by: Md Haris Iqbal Signed-off-by: Grzegorz Prajsner Link: https://patch.msgid.link/20260107161517.56357-9-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 7 +++++-- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 1cd4d333d417..20fd170b434b 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -3178,8 +3178,11 @@ close_path: void rtrs_clt_ib_event_handler(struct ib_event_handler *handler, struct ib_event *ibevent) { - pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), - ibevent->event); + struct ib_device *idev = ibevent->device; + u32 port_num = ibevent->element.port_num; + + pr_info("Handling event: %s (%d).
HCA name: %s, port num: %u\n", + ib_event_msg(ibevent->event), ibevent->event, idev->name, port_num); } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index d5189f12d2f7..09f4a16b4403 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -2342,8 +2342,11 @@ static int check_module_params(void) void rtrs_srv_ib_event_handler(struct ib_event_handler *handler, struct ib_event *ibevent) { - pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), - ibevent->event); + struct ib_device *idev = ibevent->device; + u32 port_num = ibevent->element.port_num; + + pr_info("Handling event: %s (%d). HCA name: %s, port num: %u\n", + ib_event_msg(ibevent->event), ibevent->event, idev->name, port_num); } static int rtrs_srv_ib_dev_init(struct rtrs_ib_dev *dev) From fc290630702b530c2969061e7ef0d869a5b6dc4f Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Wed, 7 Jan 2026 17:15:16 +0100 Subject: [PATCH 31/66] RDMA/rtrs-clt: For conn rejection use actual err number When the connection establishment request is rejected by the server, the actual error number sent back should be used. Signed-off-by: Md Haris Iqbal Link: https://patch.msgid.link/20260107161517.56357-10-haris.iqbal@ionos.com Reviewed-by: Grzegorz Prajsner Reviewed-by: Jack Wang Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 20fd170b434b..e0b263458825 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1928,7 +1928,7 @@ static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con, struct rtrs_path *s = con->c.path; const struct rtrs_msg_conn_rsp *msg; const char *rej_msg; - int status, errno; + int status, errno = -ECONNRESET; u8 data_len; status = ev->status; @@ -1950,7 +1950,7 @@ static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con, status, rej_msg); } - return -ECONNRESET; + return errno; } void rtrs_clt_close_conns(struct rtrs_clt_path *clt_path, bool wait) From 88f2bf22d99b4a89f5ec3d3dec07271368499c3c Mon Sep 17 00:00:00 2001 From: Grzegorz Prajsner Date: Wed, 7 Jan 2026 17:15:17 +0100 Subject: [PATCH 32/66] RDMA/rtrs-srv: Fix error print in process_info_req() rtrs_srv_change_state() returns bool (true on success), so there is no reason to print the error value when it fails, as it will always be 0.
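The pitfall is easy to see in isolation: when a function returns bool,
the failure value is false (0), so logging it as an errno always prints
0. A hedged illustration with a stand-in function:

#include <linux/printk.h>
#include <linux/types.h>

static bool change_state(void)	/* stand-in for rtrs_srv_change_state() */
{
	return false;		/* failure */
}

static void example(void)
{
	bool err = change_state();

	if (!err)
		/* Always prints "err: 0" -- the value carries no information. */
		pr_err("change_state(), err: %d\n", err);
}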
Signed-off-by: Grzegorz Prajsner Signed-off-by: Md Haris Iqbal Link: https://patch.msgid.link/20260107161517.56357-11-haris.iqbal@ionos.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 09f4a16b4403..2e09811a10b2 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -883,7 +883,7 @@ static int process_info_req(struct rtrs_srv_con *con, get_device(&srv_path->srv->dev); err = rtrs_srv_change_state(srv_path, RTRS_SRV_CONNECTED); if (!err) { - rtrs_err(s, "rtrs_srv_change_state(), err: %pe\n", ERR_PTR(err)); + rtrs_err(s, "rtrs_srv_change_state() failed\n"); goto iu_free; } From 52f3d34c292b62ec151c6a487d267341d47eefa4 Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Sat, 3 Jan 2026 17:25:17 +0000 Subject: [PATCH 33/66] RDMA/irdma: Remove redundant dma_wmb() before writel() A dma_wmb() is not necessary before a writel() because writel() already has an even stronger store barrier. A dma_wmb() is only required to order writes to consistent/DMA memory whereas the barrier in writel() is specified to order writes to DMA memory as well as MMIO. Signed-off-by: Jacob Moroni Link: https://patch.msgid.link/20260103172517.2088895-1-jmoroni@google.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/ctrl.c | 2 -- drivers/infiniband/hw/irdma/uk.c | 3 --- 2 files changed, 5 deletions(-) diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 081551da763a..022fcdfab339 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -3887,8 +3887,6 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val); spin_unlock_irqrestore(&ccq->dev->cqp_lock, flags); - dma_wmb(); /* make sure shadow area is updated before arming */ - writel(ccq->cq_uk.cq_id, ccq->dev->cq_arm_db); } diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 91669326d464..ac3721a5747a 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -114,7 +114,6 @@ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx) */ void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp) { - dma_wmb(); writel(qp->qp_id, qp->wqe_alloc_db); } @@ -1107,8 +1106,6 @@ void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq, set_64bit_val(cq->shadow_area, 32, temp_val); - dma_wmb(); /* make sure WQE is populated before valid bit is set */ - writel(cq->cq_id, cq->cqe_alloc_db); } From 5c3f795d17dc57a58a1fc1c1b449812e26ad85a3 Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Mon, 5 Jan 2026 18:05:50 +0000 Subject: [PATCH 34/66] RDMA/irdma: Remove fixed 1 ms delay during AH wait loop The AH CQP command wait loop executes in an atomic context and was using a fixed 1 ms delay. Since many AH create commands can complete much faster than 1 ms, use poll_timeout_us_atomic with a 1 us delay. Also, use the timeout value indicated during the capability exchange rather than a hard-coded value. 
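For readers unfamiliar with the iopoll helpers: these macros poll an
expression until a condition holds or a timeout elapses, returning 0 on
success and -ETIMEDOUT otherwise. A sketch using the long-standing
read_poll_timeout_atomic() relative from <linux/iopoll.h> (the register
and bit below are illustrative, not from irdma):

#include <linux/bits.h>
#include <linux/io.h>
#include <linux/iopoll.h>

/* Wait up to 2 ms for bit 0 of a status register to be set, polling
 * every 1 us without sleeping, which is safe in atomic context. */
static int wait_ready(void __iomem *status_reg)
{
	u32 val;

	return read_poll_timeout_atomic(readl, val, val & BIT(0),
					1, 2000, false, status_reg);
}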
Signed-off-by: Jacob Moroni Link: https://patch.msgid.link/20260105180550.2907858-1-jmoroni@google.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/main.h | 2 ++ drivers/infiniband/hw/irdma/utils.c | 2 +- drivers/infiniband/hw/irdma/verbs.c | 16 ++++++++-------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index baab61e424a2..d320d1a228b3 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -23,6 +23,7 @@ #include #include #include +#include <linux/iopoll.h> #include #include #ifndef CONFIG_64BIT @@ -528,6 +529,7 @@ void irdma_cq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_cq *cq); void irdma_srq_event(struct irdma_sc_srq *srq); void irdma_srq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_srq *srq); void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf); +int irdma_get_timeout_threshold(struct irdma_sc_dev *dev); int irdma_hw_modify_qp(struct irdma_device *iwdev, struct irdma_qp *iwqp, struct irdma_modify_qp_info *info, bool wait); int irdma_qp_suspend_resume(struct irdma_sc_qp *qp, bool suspend); diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index cc2a12f735d3..3bac7c2588ae 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -572,7 +572,7 @@ void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf) } } -static int irdma_get_timeout_threshold(struct irdma_sc_dev *dev) +int irdma_get_timeout_threshold(struct irdma_sc_dev *dev) { u16 time_s = dev->vc_caps.cqp_timeout_s; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 6d9af41a2884..1f1efd4971a9 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -5027,15 +5027,15 @@ static int irdma_create_hw_ah(struct irdma_device *iwdev, struct irdma_ah *ah, b } if (!sleep) { - int cnt = CQP_COMPL_WAIT_TIME_MS * CQP_TIMEOUT_THRESHOLD; + const u64 tmout_ms = irdma_get_timeout_threshold(&rf->sc_dev) * + CQP_COMPL_WAIT_TIME_MS; - do { - irdma_cqp_ce_handler(rf, &rf->ccq.sc_cq); - mdelay(1); - } while (!ah->sc_ah.ah_info.ah_valid && --cnt); - - if (!cnt) { - ibdev_dbg(&iwdev->ibdev, "VERBS: CQP create AH timed out"); + if (poll_timeout_us_atomic(irdma_cqp_ce_handler(rf, + &rf->ccq.sc_cq), + ah->sc_ah.ah_info.ah_valid, 1, + tmout_ms * USEC_PER_MSEC, false)) { + ibdev_dbg(&iwdev->ibdev, + "VERBS: CQP create AH timed out"); err = -ETIMEDOUT; goto err_ah_create; } From 354e7a6d448b5744362bf33a24315d4d1d0bb7ef Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Thu, 8 Jan 2026 19:30:32 +0800 Subject: [PATCH 35/66] RDMA/hns: Support drain SQ and RQ Some ULPs, e.g. rpcrdma, rely on drain_qp() to ensure all outstanding requests are completed before releasing related memory. If drain_qp() fails, ULPs may release memory directly, and in-flight WRs may later be flushed after the memory is freed, potentially leading to UAF. drain_qp() failures can happen when HW enters an error state or is reset. Add support to drain SQ and RQ in such cases by posting a fake WR during reset, so the driver can process all remaining WRs in sequence and generate corresponding completions. Always invoke comp_handler() in the drain process to ensure completions are not lost under concurrency (e.g. concurrent post_send() and reset, or QPs created during reset). If the CQ has already been processed, cancel any already scheduled comp_handler() to avoid concurrency issues.
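The "fake WR" approach mirrors the core ib_drain_qp() pattern: move the
QP to the error state, post a marker WR whose completion callback fires
a struct completion, and wait for it; once the marker completes, every
earlier WR has been flushed. A condensed sketch of that generic pattern
(not the hns-specific code, which follows in the diff; error handling
trimmed):

#include <linux/completion.h>
#include <rdma/ib_verbs.h>

struct drain_cqe {
	struct ib_cqe cqe;
	struct completion done;
};

static void drain_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct drain_cqe *d = container_of(wc->wr_cqe, struct drain_cqe, cqe);

	complete(&d->done);
}

static void drain_sq_sketch(struct ib_qp *qp)
{
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
	struct drain_cqe sdrain;
	struct ib_rdma_wr swr = {
		.wr = {
			.next = NULL,
			{ .wr_cqe = &sdrain.cqe, },
			.opcode = IB_WR_RDMA_WRITE,
		},
	};
	const struct ib_send_wr *bad_swr;

	sdrain.cqe.done = drain_done;
	init_completion(&sdrain.done);

	if (ib_modify_qp(qp, &attr, IB_QP_STATE))
		return;
	if (ib_post_send(qp, &swr.wr, &bad_swr))
		return;
	wait_for_completion(&sdrain.done);
}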
Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20260108113032.856306-1-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 166 +++++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a2ae4f33e459..5d0a8662249d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -876,6 +876,170 @@ out: return ret; } +static int hns_roce_push_drain_wr(struct hns_roce_wq *wq, struct ib_cq *cq, + u64 wr_id) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&wq->lock, flags); + if (hns_roce_wq_overflow(wq, 1, cq)) { + ret = -ENOMEM; + goto out; + } + + wq->wrid[wq->head & (wq->wqe_cnt - 1)] = wr_id; + wq->head++; + +out: + spin_unlock_irqrestore(&wq->lock, flags); + return ret; +} + +struct hns_roce_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void hns_roce_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct hns_roce_drain_cqe *cqe = container_of(wc->wr_cqe, + struct hns_roce_drain_cqe, + cqe); + complete(&cqe->done); +} + +static void handle_drain_completion(struct ib_cq *ibcq, + struct hns_roce_drain_cqe *drain, + struct hns_roce_dev *hr_dev) +{ +#define TIMEOUT (HZ / 10) + struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); + unsigned long flags; + bool triggered; + + if (ibcq->poll_ctx == IB_POLL_DIRECT) { + while (wait_for_completion_timeout(&drain->done, TIMEOUT) <= 0) + ib_process_cq_direct(ibcq, -1); + return; + } + + if (hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) + goto waiting_done; + + spin_lock_irqsave(&hr_cq->lock, flags); + triggered = hr_cq->is_armed; + hr_cq->is_armed = 1; + spin_unlock_irqrestore(&hr_cq->lock, flags); + + /* Triggered means this cq is processing or has been processed + * by hns_roce_handle_device_err() or this function. We need to + * cancel the already invoked comp_handler() to avoid concurrency. + * If it has not been triggered, we can directly invoke + * comp_handler(). 
+ */ + if (triggered) { + switch (ibcq->poll_ctx) { + case IB_POLL_SOFTIRQ: + irq_poll_disable(&ibcq->iop); + irq_poll_enable(&ibcq->iop); + break; + case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: + cancel_work_sync(&ibcq->work); + break; + default: + WARN_ON_ONCE(1); + } + } + + if (ibcq->comp_handler) + ibcq->comp_handler(ibcq, ibcq->cq_context); + +waiting_done: + if (ibcq->comp_handler) + wait_for_completion(&drain->done); +} + +static void hns_roce_v2_drain_rq(struct ib_qp *ibqp) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_drain_cqe rdrain = {}; + const struct ib_recv_wr *bad_rwr; + struct ib_cq *cq = ibqp->recv_cq; + struct ib_recv_wr rwr = {}; + int ret; + + ret = ib_modify_qp(ibqp, &attr, IB_QP_STATE); + if (ret && hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to modify qp during drain rq, ret = %d.\n", + ret); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = hns_roce_drain_qp_done; + init_completion(&rdrain.done); + + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) + ret = hns_roce_push_drain_wr(&hr_qp->rq, cq, rwr.wr_id); + else + ret = hns_roce_v2_post_recv(ibqp, &rwr, &bad_rwr); + if (ret) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to post recv for drain rq, ret = %d.\n", + ret); + return; + } + + handle_drain_completion(cq, &rdrain, hr_dev); +} + +static void hns_roce_v2_drain_sq(struct ib_qp *ibqp) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_drain_cqe sdrain = {}; + const struct ib_send_wr *bad_swr; + struct ib_cq *cq = ibqp->send_cq; + struct ib_rdma_wr swr = { + .wr = { + .next = NULL, + { .wr_cqe = &sdrain.cqe, }, + .opcode = IB_WR_RDMA_WRITE, + }, + }; + int ret; + + ret = ib_modify_qp(ibqp, &attr, IB_QP_STATE); + if (ret && hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to modify qp during drain sq, ret = %d.\n", + ret); + return; + } + + sdrain.cqe.done = hns_roce_drain_qp_done; + init_completion(&sdrain.done); + + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) + ret = hns_roce_push_drain_wr(&hr_qp->sq, cq, swr.wr.wr_id); + else + ret = hns_roce_v2_post_send(ibqp, &swr.wr, &bad_swr); + if (ret) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to post send for drain sq, ret = %d.\n", + ret); + return; + } + + handle_drain_completion(cq, &sdrain, hr_dev); +} + static void *get_srq_wqe_buf(struct hns_roce_srq *srq, u32 n) { return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift); @@ -7040,6 +7204,8 @@ static const struct ib_device_ops hns_roce_v2_dev_ops = { .post_send = hns_roce_v2_post_send, .query_qp = hns_roce_v2_query_qp, .req_notify_cq = hns_roce_v2_req_notify_cq, + .drain_rq = hns_roce_v2_drain_rq, + .drain_sq = hns_roce_v2_drain_sq, }; static const struct ib_device_ops hns_roce_v2_dev_srq_ops = { From 0beefd0e15d962f497aad750b2d5e9c3570b66d1 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Mon, 12 Jan 2026 01:54:12 +0000 Subject: [PATCH 36/66] RDMA/rxe: Fix double free in rxe_srq_from_init In rxe_srq_from_init(), the queue pointer 'q' is assigned to 'srq->rq.queue' before copying the SRQ number to user space. 
If copy_to_user() fails, the function calls rxe_queue_cleanup() to free the queue, but leaves the now-invalid pointer in 'srq->rq.queue'. The caller of rxe_srq_from_init() (rxe_create_srq) eventually calls rxe_srq_cleanup() upon receiving the error, which triggers a second rxe_queue_cleanup() on the same memory, leading to a double free. The call trace looks like this: kmem_cache_free+0x.../0x... rxe_queue_cleanup+0x1a/0x30 [rdma_rxe] rxe_srq_cleanup+0x42/0x60 [rdma_rxe] rxe_elem_release+0x31/0x70 [rdma_rxe] rxe_create_srq+0x12b/0x1a0 [rdma_rxe] ib_create_srq_user+0x9a/0x150 [ib_core] Fix this by moving 'srq->rq.queue = q' after copy_to_user. Fixes: aae0484e15f0 ("IB/rxe: avoid srq memory leak") Signed-off-by: Jiasheng Jiang Link: https://patch.msgid.link/20260112015412.29458-1-jiashengjiangcool@gmail.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_srq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 2a234f26ac10..c9a7cd38953d 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -77,9 +77,6 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, goto err_free; } - srq->rq.queue = q; - init->attr.max_wr = srq->rq.max_wr; - if (uresp) { if (copy_to_user(&uresp->srq_num, &srq->srq_num, sizeof(uresp->srq_num))) { @@ -88,6 +85,9 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, } } + srq->rq.queue = q; + init->attr.max_wr = srq->rq.max_wr; + return 0; err_free: From 7874eeacfa42177565c01d5198726671acf7adf2 Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Mon, 12 Jan 2026 02:00:06 +0000 Subject: [PATCH 37/66] RDMA/iwcm: Fix workqueue list corruption by removing work_list The commit e1168f0 ("RDMA/iwcm: Simplify cm_event_handler()") changed the work submission logic to unconditionally call queue_work() with the expectation that queue_work() would have no effect if work was already pending. The problem is that a free list of struct iwcm_work is used (for which struct work_struct is embedded), so each call to queue_work() is basically unique and therefore does indeed queue the work. This causes a problem in the work handler which walks the work_list until it's empty to process entries. This means that a single run of the work handler could process item N+1 and release it back to the free list while the actual workqueue entry is still queued. It could then get reused (INIT_WORK...) and lead to list corruption in the workqueue logic. Fix this by just removing the work_list. The workqueue already does this for us. This fixes the following error that was observed when stress testing with ucmatose on an Intel E830 in iWARP mode: [ 151.465780] list_del corruption. next->prev should be ffff9f0915c69c08, but was ffff9f0a1116be08. (next=ffff9f0a15b11c08) [ 151.466639] ------------[ cut here ]------------ [ 151.466986] kernel BUG at lib/list_debug.c:67! 
[ 151.467349] Oops: invalid opcode: 0000 [#1] SMP NOPTI [ 151.467753] CPU: 14 UID: 0 PID: 2306 Comm: kworker/u64:18 Not tainted 6.19.0-rc4+ #1 PREEMPT(voluntary) [ 151.468466] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 151.469192] Workqueue: 0x0 (iw_cm_wq) [ 151.469478] RIP: 0010:__list_del_entry_valid_or_report+0xf0/0x100 [ 151.469942] Code: c7 58 5f 4c b2 e8 10 50 aa ff 0f 0b 48 89 ef e8 36 57 cb ff 48 8b 55 08 48 89 e9 48 89 de 48 c7 c7 a8 5f 4c b2 e8 f0 4f aa ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90 90 90 90 90 [ 151.471323] RSP: 0000:ffffb15644e7bd68 EFLAGS: 00010046 [ 151.471712] RAX: 000000000000006d RBX: ffff9f0915c69c08 RCX: 0000000000000027 [ 151.472243] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9f0a37d9c600 [ 151.472768] RBP: ffff9f0a15b11c08 R08: 0000000000000000 R09: c0000000ffff7fff [ 151.473294] R10: 0000000000000001 R11: ffffb15644e7bba8 R12: ffff9f092339ee68 [ 151.473817] R13: ffff9f0900059c28 R14: ffff9f092339ee78 R15: 0000000000000000 [ 151.474344] FS: 0000000000000000(0000) GS:ffff9f0a847b5000(0000) knlGS:0000000000000000 [ 151.474934] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 151.475362] CR2: 0000559e233a9088 CR3: 000000020296b004 CR4: 0000000000770ef0 [ 151.475895] PKRU: 55555554 [ 151.476118] Call Trace: [ 151.476331] [ 151.476497] move_linked_works+0x49/0xa0 [ 151.476792] __pwq_activate_work.isra.46+0x2f/0xa0 [ 151.477151] pwq_dec_nr_in_flight+0x1e0/0x2f0 [ 151.477479] process_scheduled_works+0x1c8/0x410 [ 151.477823] worker_thread+0x125/0x260 [ 151.478108] ? __pfx_worker_thread+0x10/0x10 [ 151.478430] kthread+0xfe/0x240 [ 151.478671] ? __pfx_kthread+0x10/0x10 [ 151.478955] ? __pfx_kthread+0x10/0x10 [ 151.479240] ret_from_fork+0x208/0x270 [ 151.479523] ? 
__pfx_kthread+0x10/0x10 [ 151.479806] ret_from_fork_asm+0x1a/0x30 [ 151.480103] Fixes: e1168f09b331 ("RDMA/iwcm: Simplify cm_event_handler()") Signed-off-by: Jacob Moroni Link: https://patch.msgid.link/20260112020006.1352438-1-jmoroni@google.com Reviewed-by: Bart Van Assche Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/iwcm.c | 56 +++++++++++++--------------------- drivers/infiniband/core/iwcm.h | 1 - 2 files changed, 21 insertions(+), 36 deletions(-) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 62410578dec3..eb942ab9c405 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -95,7 +95,6 @@ static struct workqueue_struct *iwcm_wq; struct iwcm_work { struct work_struct work; struct iwcm_id_private *cm_id; - struct list_head list; struct iw_cm_event event; struct list_head free_list; }; @@ -178,7 +177,6 @@ static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) return -ENOMEM; } work->cm_id = cm_id_priv; - INIT_LIST_HEAD(&work->list); put_work(work); } return 0; @@ -213,7 +211,6 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv) static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { if (refcount_dec_and_test(&cm_id_priv->refcount)) { - BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); return true; } @@ -260,7 +257,6 @@ struct iw_cm_id *iw_create_cm_id(struct ib_device *device, refcount_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); init_completion(&cm_id_priv->destroy_comp); - INIT_LIST_HEAD(&cm_id_priv->work_list); INIT_LIST_HEAD(&cm_id_priv->work_free_list); return &cm_id_priv->id; @@ -1007,13 +1003,13 @@ static int process_event(struct iwcm_id_private *cm_id_priv, } /* - * Process events on the work_list for the cm_id. If the callback - * function requests that the cm_id be deleted, a flag is set in the - * cm_id flags to indicate that when the last reference is - * removed, the cm_id is to be destroyed. This is necessary to - * distinguish between an object that will be destroyed by the app - * thread asleep on the destroy_comp list vs. an object destroyed - * here synchronously when the last reference is removed. + * Process events for the cm_id. If the callback function requests + * that the cm_id be deleted, a flag is set in the cm_id flags to + * indicate that when the last reference is removed, the cm_id is + * to be destroyed. This is necessary to distinguish between an + * object that will be destroyed by the app thread asleep on the + * destroy_comp list vs. an object destroyed here synchronously + * when the last reference is removed. 
*/ static void cm_work_handler(struct work_struct *_work) { @@ -1024,35 +1020,26 @@ static void cm_work_handler(struct work_struct *_work) int ret = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); - while (!list_empty(&cm_id_priv->work_list)) { - work = list_first_entry(&cm_id_priv->work_list, - struct iwcm_work, list); - list_del_init(&work->list); - levent = work->event; - put_work(work); - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - - if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { - ret = process_event(cm_id_priv, &levent); - if (ret) { - destroy_cm_id(&cm_id_priv->id); - WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); - } - } else - pr_debug("dropping event %d\n", levent.event); - if (iwcm_deref_id(cm_id_priv)) - return; - spin_lock_irqsave(&cm_id_priv->lock, flags); - } + levent = work->event; + put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { + ret = process_event(cm_id_priv, &levent); + if (ret) { + destroy_cm_id(&cm_id_priv->id); + WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); + } + } else + pr_debug("dropping event %d\n", levent.event); + if (iwcm_deref_id(cm_id_priv)) + return; } /* * This function is called on interrupt context. Schedule events on * the iwcm_wq thread to allow callback functions to downcall into - * the CM and/or block. Events are queued to a per-CM_ID - * work_list. If this is the first event on the work_list, the work - * element is also queued on the iwcm_wq thread. + * the CM and/or block. * * Each event holds a reference on the cm_id. Until the last posted * event has been delivered and processed, the cm_id cannot be @@ -1094,7 +1081,6 @@ static int cm_event_handler(struct iw_cm_id *cm_id, } refcount_inc(&cm_id_priv->refcount); - list_add_tail(&work->list, &cm_id_priv->work_list); queue_work(iwcm_wq, &work->work); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h index bf74639be128..b56fb12edece 100644 --- a/drivers/infiniband/core/iwcm.h +++ b/drivers/infiniband/core/iwcm.h @@ -50,7 +50,6 @@ struct iwcm_id_private { struct ib_qp *qp; struct completion destroy_comp; wait_queue_head_t connect_wait; - struct list_head work_list; spinlock_t lock; refcount_t refcount; struct list_head work_free_list; From f972bde7326e9cd3498c137a052f2034f975ebae Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Thu, 15 Jan 2026 01:36:25 -0800 Subject: [PATCH 38/66] RDMA/mana_ib: Take CQ type from the device type Get CQ type from the used gdma device. The MANA_IB_CREATE_RNIC_CQ flag is ignored. It was used in older kernel versions where the mana_ib was shared between ethernet and rnic. 
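For illustration, a minimal standalone C sketch of the new behavior (all mock_* names are invented for this example and are not the mana_ib API): the CQ flavor is a property of the gdma device the IB device is bound to, so it is derived from the device type and the legacy userspace flag is ignored.

  #include <stdbool.h>
  #include <stdio.h>

  enum gdma_dev_type { GDMA_DEV_ETH, GDMA_DEV_RNIC };

  struct mock_ib_dev { enum gdma_dev_type gdma_type; };

  /* Stand-in for mana_ib_is_rnic(): the device, not a ucmd flag, decides. */
  static bool mock_is_rnic(const struct mock_ib_dev *dev)
  {
      return dev->gdma_type == GDMA_DEV_RNIC;
  }

  int main(void)
  {
      struct mock_ib_dev dev = { .gdma_type = GDMA_DEV_RNIC };
      unsigned int ucmd_flags = 0; /* legacy MANA_IB_CREATE_RNIC_CQ not set */

      /* Before: is_rnic_cq = !!(ucmd_flags & 1), trusting userspace.
       * After: derived from the device; the flag is accepted but ignored. */
      bool is_rnic_cq = mock_is_rnic(&dev);

      printf("is_rnic_cq=%d (flags=%#x ignored)\n", is_rnic_cq, ucmd_flags);
      return 0;
  }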
Fixes: d4293f96ce0b ("RDMA/mana_ib: unify mana_ib functions to support any gdma device") Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/20260115093625.177306-1-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/cq.c | 4 +--- include/uapi/rdma/mana-abi.h | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 1becc8779123..2dce1b677115 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -24,6 +24,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors; cq->cq_handle = INVALID_MANA_HANDLE; + is_rnic_cq = mana_ib_is_rnic(mdev); if (udata) { if (udata->inlen < offsetof(struct mana_ib_create_cq, flags)) @@ -35,8 +36,6 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, return err; } - is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); - if ((!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) || attr->cqe > U32_MAX / COMP_ENTRY_SIZE) { ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); @@ -55,7 +54,6 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, ibucontext); doorbell = mana_ucontext->doorbell; } else { - is_rnic_cq = true; buf_size = MANA_PAGE_ALIGN(roundup_pow_of_two(attr->cqe * COMP_ENTRY_SIZE)); cq->cqe = buf_size / COMP_ENTRY_SIZE; err = mana_ib_create_kernel_queue(mdev, buf_size, GDMA_CQ, &cq->queue); diff --git a/include/uapi/rdma/mana-abi.h b/include/uapi/rdma/mana-abi.h index 45c2df619f07..a75bf32b8cfb 100644 --- a/include/uapi/rdma/mana-abi.h +++ b/include/uapi/rdma/mana-abi.h @@ -17,6 +17,9 @@ #define MANA_IB_UVERBS_ABI_VERSION 1 enum mana_ib_create_cq_flags { + /* Reserved for backward compatibility. Legacy + * kernel versions use it to create CQs in RNIC + */ MANA_IB_CREATE_RNIC_CQ = 1 << 0, }; From ebc2164a4cd4314503f1a0c8e7aaf76d7e5fa211 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Tue, 13 Jan 2026 15:37:10 +0200 Subject: [PATCH 39/66] RDMA/mlx5: Fix UMR hang in LAG error state unload During firmware reset in LAG mode, a race condition causes the driver to hang indefinitely while waiting for UMR completion during device unload. See [1]. In LAG mode the bond device is only registered on the master, so it never sees sys_error events from the slave. During firmware reset this causes UMR waits to hang forever on unload as the slave is dead but the master hasn't entered error state yet, so UMR posts succeed but completions never arrive. Fix this by adding a sys_error notifier that gets registered before MLX5_IB_STAGE_IB_REG and stays alive until after ib_unregister_device(). This ensures error events reach the bond device throughout teardown. [1] Call Trace: __schedule+0x2bd/0x760 schedule+0x37/0xa0 schedule_preempt_disabled+0xa/0x10 __mutex_lock.isra.6+0x2b5/0x4a0 __mlx5_ib_dereg_mr+0x606/0x870 [mlx5_ib] ? __xa_erase+0x4a/0xa0 ? _cond_resched+0x15/0x30 ? wait_for_completion+0x31/0x100 ib_dereg_mr_user+0x48/0xc0 [ib_core] ? 
rdmacg_uncharge_hierarchy+0xa0/0x100 destroy_hw_idr_uobject+0x20/0x50 [ib_uverbs] uverbs_destroy_uobject+0x37/0x150 [ib_uverbs] __uverbs_cleanup_ufile+0xda/0x140 [ib_uverbs] uverbs_destroy_ufile_hw+0x3a/0xf0 [ib_uverbs] ib_uverbs_remove_one+0xc3/0x140 [ib_uverbs] remove_client_context+0x8b/0xd0 [ib_core] disable_device+0x8c/0x130 [ib_core] __ib_unregister_device+0x10d/0x180 [ib_core] ib_unregister_device+0x21/0x30 [ib_core] __mlx5_ib_remove+0x1e4/0x1f0 [mlx5_ib] auxiliary_bus_remove+0x1e/0x30 device_release_driver_internal+0x103/0x1f0 bus_remove_device+0xf7/0x170 device_del+0x181/0x410 mlx5_rescan_drivers_locked.part.10+0xa9/0x1d0 [mlx5_core] mlx5_disable_lag+0x253/0x260 [mlx5_core] mlx5_lag_disable_change+0x89/0xc0 [mlx5_core] mlx5_eswitch_disable+0x67/0xa0 [mlx5_core] mlx5_unload+0x15/0xd0 [mlx5_core] mlx5_unload_one+0x71/0xc0 [mlx5_core] mlx5_sync_reset_reload_work+0x83/0x100 [mlx5_core] process_one_work+0x1a7/0x360 worker_thread+0x30/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x116/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x22/0x40 Fixes: ede132a5cf55 ("RDMA/mlx5: Move events notifier registration to be after device registration") Signed-off-by: Chiara Meiohas Signed-off-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260113-umr-hand-lag-fix-v1-1-3dc476e00cd9@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 75 ++++++++++++++++++++++++---- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + 2 files changed, 68 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e81080622283..e83a5f12e6bc 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3009,7 +3009,6 @@ static void mlx5_ib_handle_event(struct work_struct *_work) container_of(_work, struct mlx5_ib_event_work, work); struct mlx5_ib_dev *ibdev; struct ib_event ibev; - bool fatal = false; if (work->is_slave) { ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi); @@ -3020,12 +3019,6 @@ static void mlx5_ib_handle_event(struct work_struct *_work) } switch (work->event) { - case MLX5_DEV_EVENT_SYS_ERROR: - ibev.event = IB_EVENT_DEVICE_FATAL; - mlx5_ib_handle_internal_error(ibdev); - ibev.element.port_num = (u8)(unsigned long)work->param; - fatal = true; - break; case MLX5_EVENT_TYPE_PORT_CHANGE: if (handle_port_change(ibdev, work->param, &ibev)) goto out; @@ -3047,8 +3040,6 @@ static void mlx5_ib_handle_event(struct work_struct *_work) if (ibdev->ib_active) ib_dispatch_event(&ibev); - if (fatal) - ibdev->ib_active = false; out: kfree(work); } @@ -3092,6 +3083,66 @@ static int mlx5_ib_event_slave_port(struct notifier_block *nb, return NOTIFY_OK; } +static void mlx5_ib_handle_sys_error_event(struct work_struct *_work) +{ + struct mlx5_ib_event_work *work = + container_of(_work, struct mlx5_ib_event_work, work); + struct mlx5_ib_dev *ibdev = work->dev; + struct ib_event ibev; + + ibev.event = IB_EVENT_DEVICE_FATAL; + mlx5_ib_handle_internal_error(ibdev); + ibev.element.port_num = (u8)(unsigned long)work->param; + ibev.device = &ibdev->ib_dev; + + if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", ibev.element.port_num); + goto out; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); + + ibdev->ib_active = false; +out: + kfree(work); +} + +static int mlx5_ib_sys_error_event(struct notifier_block *nb, + unsigned long event, void *param) +{ + struct mlx5_ib_event_work *work; + + if 
(event != MLX5_DEV_EVENT_SYS_ERROR) + return NOTIFY_DONE; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return NOTIFY_DONE; + + INIT_WORK(&work->work, mlx5_ib_handle_sys_error_event); + work->dev = container_of(nb, struct mlx5_ib_dev, sys_error_events); + work->is_slave = false; + work->param = param; + work->event = event; + + queue_work(mlx5_ib_event_wq, &work->work); + + return NOTIFY_OK; +} + +static int mlx5_ib_stage_sys_error_notifier_init(struct mlx5_ib_dev *dev) +{ + dev->sys_error_events.notifier_call = mlx5_ib_sys_error_event; + mlx5_notifier_register(dev->mdev, &dev->sys_error_events); + return 0; +} + +static void mlx5_ib_stage_sys_error_notifier_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_notifier_unregister(dev->mdev, &dev->sys_error_events); +} + static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) { struct mlx5_hca_vport_context vport_ctx; @@ -4943,6 +4994,9 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID, mlx5_ib_devx_init, mlx5_ib_devx_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_SYS_ERROR_NOTIFIER, + mlx5_ib_stage_sys_error_notifier_init, + mlx5_ib_stage_sys_error_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), @@ -5000,6 +5054,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID, mlx5_ib_devx_init, mlx5_ib_devx_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_SYS_ERROR_NOTIFIER, + mlx5_ib_stage_sys_error_notifier_init, + mlx5_ib_stage_sys_error_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index cc6b3b6c713c..4f4114d95130 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1007,6 +1007,7 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_BFREG, MLX5_IB_STAGE_PRE_IB_REG_UMR, MLX5_IB_STAGE_WHITELIST_UID, + MLX5_IB_STAGE_SYS_ERROR_NOTIFIER, MLX5_IB_STAGE_IB_REG, MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_POST_IB_REG_UMR, @@ -1165,6 +1166,7 @@ struct mlx5_ib_dev { /* protect accessing data_direct_dev */ struct mutex data_direct_lock; struct notifier_block mdev_events; + struct notifier_block sys_error_events; struct notifier_block lag_events; int num_ports; /* serialize update of capability mask From 18ea78e2ae83d1d86a72d21d9511927e57e2c0e1 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 15 Jan 2026 14:26:45 +0200 Subject: [PATCH 40/66] IB/mlx5: Fix port speed query for representors When querying speed information for a representor in switchdev mode, the code previously used the first device in the eswitch, which may not match the device that actually owns the representor. In setups such as multi-port eswitch or LAG, this led to incorrect port attributes being reported. Fix this by retrieving the correct core device from the representor's eswitch before querying its port attributes. 
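To make the ownership chain concrete, here is a compilable userspace sketch with mock types (the mock_* names are invented for this example and are not the mlx5 API): the core device is resolved through the representor's own eswitch instead of defaulting to the first device in the eswitch.

  #include <assert.h>
  #include <stdio.h>

  struct mock_core_dev { int id; };
  struct mock_eswitch  { struct mock_core_dev *core_dev; };
  struct mock_rep      { struct mock_eswitch *esw; };

  /* Stand-in for mlx5_eswitch_get_core_dev() */
  static struct mock_core_dev *esw_core_dev(struct mock_eswitch *esw)
  {
      return esw->core_dev;
  }

  int main(void)
  {
      struct mock_core_dev dev0 = { 0 }, dev1 = { 1 };
      struct mock_eswitch esw1 = { &dev1 };
      struct mock_rep rep = { &esw1 }; /* representor owned by dev1, not dev0 */

      /* Before the fix the first device (dev0) was queried; after the fix
       * the owner is resolved through the representor's eswitch. */
      struct mock_core_dev *mdev = esw_core_dev(rep.esw);

      assert(mdev == &dev1 && mdev != &dev0);
      printf("query PTYS on core dev %d, native port 1\n", mdev->id);
      return 0;
  }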
Fixes: 27f9e0ccb6da ("net/mlx5: Lag, Add single RDMA device in multiport mode")
Signed-off-by: Or Har-Toov
Reviewed-by: Mark Bloch
Signed-off-by: Edward Srouji
Link: https://patch.msgid.link/20260115-port-speed-query-fix-v2-1-3bde6a3c78e7@nvidia.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/mlx5/main.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index e83a5f12e6bc..eba023b7af0f 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -561,12 +561,20 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num,
 	 * of an error it will still be zeroed out.
 	 * Use native port in case of reps
 	 */
-	if (dev->is_rep)
-		err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
-					   1, 0);
-	else
-		err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
-					   mdev_port_num, 0);
+	if (dev->is_rep) {
+		struct mlx5_eswitch_rep *rep;
+
+		rep = dev->port[port_num - 1].rep;
+		if (rep) {
+			mdev = mlx5_eswitch_get_core_dev(rep->esw);
+			WARN_ON(!mdev);
+		}
+		mdev_port_num = 1;
+	}
+
+	err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
+				   mdev_port_num, 0);
+
 	if (err)
 		goto out;
 	ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability);

From d3922f6dad69b3d1f7656c9035bd0e82f73091b7 Mon Sep 17 00:00:00 2001
From: Li Zhijian
Date: Fri, 16 Jan 2026 11:28:33 +0800
Subject: [PATCH 41/66] RDMA/rxe: Remove unused page_offset member

In rxe_map_mr_sg(), the `page_offset` member of the `rxe_mr` struct was
initialized from `ibmr.iova`, which is later updated inside
ib_sg_to_pages(). Consequently, the value assigned to `page_offset` was
incorrect. However, since `page_offset` was never used anywhere in the
code, it can safely be removed to clean up the codebase and avoid
future confusion.
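The ordering hazard described above can be reproduced with a small standalone C sketch (mock names and arbitrary illustrative addresses; in the driver the rewrite happens inside ib_sg_to_pages()): a value derived from iova before the mapping step is stale as soon as the mapping rewrites iova.

  #include <stdint.h>
  #include <stdio.h>

  struct mock_mr { uint64_t iova; unsigned int page_offset; };

  /* Stand-in for the part of ib_sg_to_pages() that updates ibmr->iova. */
  static void mock_sg_to_pages(struct mock_mr *mr)
  {
      mr->iova = 0x173000; /* new iova chosen during mapping */
  }

  int main(void)
  {
      struct mock_mr mr = { .iova = 0x181800 };
      unsigned int page_size = 4096;

      mr.page_offset = mr.iova & (page_size - 1); /* snapshots 0x800... */
      mock_sg_to_pages(&mr);                      /* ...then iova changes */

      printf("page_offset=%#x, but iova=%#llx now implies offset %#llx\n",
             mr.page_offset, (unsigned long long)mr.iova,
             (unsigned long long)(mr.iova & (page_size - 1)));
      return 0;
  }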
Signed-off-by: Li Zhijian
Link: https://patch.msgid.link/20260116032833.2574627-1-lizhijian@fujitsu.com
Reviewed-by: Zhu Yanjun
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/sw/rxe/rxe_mr.c    | 1 -
 drivers/infiniband/sw/rxe/rxe_odp.c   | 1 -
 drivers/infiniband/sw/rxe/rxe_verbs.h | 1 -
 3 files changed, 3 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index b1df05238848..05710d785a7e 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -237,7 +237,6 @@ int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl,
 	mr->nbuf = 0;
 	mr->page_shift = ilog2(page_size);
 	mr->page_mask = ~((u64)page_size - 1);
-	mr->page_offset = mr->ibmr.iova & (page_size - 1);

 	return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page);
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index ae71812bea82..64295f77563f 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -110,7 +110,6 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
 	mr->access = access_flags;
 	mr->ibmr.length = length;
 	mr->ibmr.iova = iova;
-	mr->page_offset = ib_umem_offset(&umem_odp->umem);

 	err = rxe_odp_init_pages(mr);
 	if (err) {
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index fd48075810dd..f94ce85eb807 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -347,7 +347,6 @@ struct rxe_mr {
 	int access;
 	atomic_t num_mw;

-	unsigned int page_offset;
 	unsigned int page_shift;
 	u64 page_mask;

From 12985e5915a0b8354796efadaaeb201eed115377 Mon Sep 17 00:00:00 2001
From: Li Zhijian
Date: Fri, 16 Jan 2026 11:27:53 +0800
Subject: [PATCH 42/66] RDMA/rxe: Fix iova-to-va conversion for MR page sizes != PAGE_SIZE

The current implementation incorrectly handles memory regions (MRs)
with page sizes different from the system PAGE_SIZE. The core issue is
that rxe_set_page() is called in mr->page_size increments, but the
page_list stores individual struct page pointers, each representing
PAGE_SIZE of memory.

ib_sg_to_pages() ensures that, for i >= 1, either
a) SG[i-1].dma_end and SG[i].dma_addr are contiguous, or
b) SG[i-1].dma_end and SG[i].dma_addr are mr->page_size aligned.

This leads to incorrect iova-to-va conversion in the following
scenarios:

1) page_size < PAGE_SIZE (e.g., MR: 4K, system: 64K):
   ibmr->iova = 0x181800
   sg[0]: dma_addr=0x181800, len=0x800
   sg[1]: dma_addr=0x173000, len=0x1000

   Access iova = 0x181800 + 0x810 = 0x182010
   Expected VA: 0x173010 (second SG, offset 0x10)
   Before fix:
   - index = (0x182010 >> 12) - (0x181800 >> 12) = 1
   - page_offset = 0x182010 & 0xFFF = 0x10
   - xarray[1] stores system page base 0x170000
   - Resulting VA: 0x170000 + 0x10 = 0x170010 (wrong)

2) page_size > PAGE_SIZE (e.g., MR: 64K, system: 4K):
   ibmr->iova = 0x18f800
   sg[0]: dma_addr=0x18f800, len=0x800
   sg[1]: dma_addr=0x170000, len=0x1000

   Access iova = 0x18f800 + 0x810 = 0x190010
   Expected VA: 0x170010 (second SG, offset 0x10)
   Before fix:
   - index = (0x190010 >> 16) - (0x18f800 >> 16) = 1
   - page_offset = 0x190010 & 0xFFFF = 0x10
   - xarray[1] stores system page for dma_addr 0x170000
   - Resulting VA: system page of 0x170000 + 0x10 = 0x170010 (wrong)

Yi Zhang reported a kernel panic[1] years ago related to this defect.

Solution:
1. Replace xarray with pre-allocated rxe_mr_page array for sequential
   indexing (all MR page indices are contiguous)
2.
Each rxe_mr_page stores both struct page* and offset within the system page 3. Handle MR page_size != PAGE_SIZE relationships: - page_size > PAGE_SIZE: Split MR pages into multiple system pages - page_size <= PAGE_SIZE: Store offset within system page 4. Add boundary checks and compatibility validation This ensures correct iova-to-va conversion regardless of MR page size and system PAGE_SIZE relationship, while improving performance through array-based sequential access. Tests on 4K and 64K PAGE_SIZE hosts: - rdma-core/pytests $ ./build/bin/run_tests.py --dev eth0_rxe - blktest: $ TIMEOUT=30 QUICK_RUN=1 USE_RXE=1 NVMET_TRTYPES=rdma ./check nvme srp rnbd [1] https://lore.kernel.org/all/CAHj4cs9XRqE25jyVw9rj9YugffLn5+f=1znaBEnu1usLOciD+g@mail.gmail.com/T/ Fixes: 592627ccbdff ("RDMA/rxe: Replace rxe_map and rxe_phys_buf by xarray") Signed-off-by: Li Zhijian Link: https://patch.msgid.link/20260116032753.2574363-1-lizhijian@fujitsu.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_mr.c | 285 +++++++++++++++++--------- drivers/infiniband/sw/rxe/rxe_verbs.h | 10 +- 2 files changed, 196 insertions(+), 99 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 05710d785a7e..c71ab780e379 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -72,14 +72,46 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr) mr->ibmr.type = IB_MR_TYPE_DMA; } +/* + * Convert iova to page_info index. The page_info stores pages of size + * PAGE_SIZE, but MRs can have different page sizes. This function + * handles the conversion for all cases: + * + * 1. mr->page_size > PAGE_SIZE: + * The MR's iova may not be aligned to mr->page_size. We use the + * aligned base (iova & page_mask) as reference, then calculate + * which PAGE_SIZE sub-page the iova falls into. + * + * 2. mr->page_size <= PAGE_SIZE: + * Use simple shift arithmetic since each page_info entry corresponds + * to one or more MR pages. + */ static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova) { - return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift); + int idx; + + if (mr_page_size(mr) > PAGE_SIZE) + idx = (iova - (mr->ibmr.iova & mr->page_mask)) >> PAGE_SHIFT; + else + idx = (iova >> mr->page_shift) - + (mr->ibmr.iova >> mr->page_shift); + + WARN_ON(idx >= mr->nbuf); + return idx; } +/* + * Convert iova to offset within the page_info entry. + * + * For mr_page_size > PAGE_SIZE, the offset is within the system page. + * For mr_page_size <= PAGE_SIZE, the offset is within the MR page size. 
+ */ static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova) { - return iova & (mr_page_size(mr) - 1); + if (mr_page_size(mr) > PAGE_SIZE) + return iova & (PAGE_SIZE - 1); + else + return iova & (mr_page_size(mr) - 1); } static bool is_pmem_page(struct page *pg) @@ -93,37 +125,69 @@ static bool is_pmem_page(struct page *pg) static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt) { - XA_STATE(xas, &mr->page_list, 0); struct sg_page_iter sg_iter; struct page *page; bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); + WARN_ON(mr_page_size(mr) != PAGE_SIZE); + __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0); if (!__sg_page_iter_next(&sg_iter)) return 0; - do { - xas_lock(&xas); - while (true) { - page = sg_page_iter_page(&sg_iter); + while (true) { + page = sg_page_iter_page(&sg_iter); - if (persistent && !is_pmem_page(page)) { - rxe_dbg_mr(mr, "Page can't be persistent\n"); - xas_set_err(&xas, -EINVAL); - break; - } - - xas_store(&xas, page); - if (xas_error(&xas)) - break; - xas_next(&xas); - if (!__sg_page_iter_next(&sg_iter)) - break; + if (persistent && !is_pmem_page(page)) { + rxe_dbg_mr(mr, "Page can't be persistent\n"); + return -EINVAL; } - xas_unlock(&xas); - } while (xas_nomem(&xas, GFP_KERNEL)); - return xas_error(&xas); + mr->page_info[mr->nbuf].page = page; + mr->page_info[mr->nbuf].offset = 0; + mr->nbuf++; + + if (!__sg_page_iter_next(&sg_iter)) + break; + } + + return 0; +} + +static int __alloc_mr_page_info(struct rxe_mr *mr, int num_pages) +{ + mr->page_info = kcalloc(num_pages, sizeof(struct rxe_mr_page), + GFP_KERNEL); + if (!mr->page_info) + return -ENOMEM; + + mr->max_allowed_buf = num_pages; + mr->nbuf = 0; + + return 0; +} + +static int alloc_mr_page_info(struct rxe_mr *mr, int num_pages) +{ + int ret; + + WARN_ON(mr->num_buf); + ret = __alloc_mr_page_info(mr, num_pages); + if (ret) + return ret; + + mr->num_buf = num_pages; + + return 0; +} + +static void free_mr_page_info(struct rxe_mr *mr) +{ + if (!mr->page_info) + return; + + kfree(mr->page_info); + mr->page_info = NULL; } int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, @@ -134,8 +198,6 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, rxe_mr_init(access, mr); - xa_init(&mr->page_list); - umem = ib_umem_get(&rxe->ib_dev, start, length, access); if (IS_ERR(umem)) { rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", @@ -143,46 +205,24 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, return PTR_ERR(umem); } + err = alloc_mr_page_info(mr, ib_umem_num_pages(umem)); + if (err) + goto err2; + err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt); - if (err) { - ib_umem_release(umem); - return err; - } + if (err) + goto err1; mr->umem = umem; mr->ibmr.type = IB_MR_TYPE_USER; mr->state = RXE_MR_STATE_VALID; return 0; -} - -static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) -{ - XA_STATE(xas, &mr->page_list, 0); - int i = 0; - int err; - - xa_init(&mr->page_list); - - do { - xas_lock(&xas); - while (i != num_buf) { - xas_store(&xas, XA_ZERO_ENTRY); - if (xas_error(&xas)) - break; - xas_next(&xas); - i++; - } - xas_unlock(&xas); - } while (xas_nomem(&xas, GFP_KERNEL)); - - err = xas_error(&xas); - if (err) - return err; - - mr->num_buf = num_buf; - - return 0; +err1: + free_mr_page_info(mr); +err2: + ib_umem_release(umem); + return err; } int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) @@ -192,7 +232,7 @@ int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) /* always allow 
remote access for FMRs */ rxe_mr_init(RXE_ACCESS_REMOTE, mr); - err = rxe_mr_alloc(mr, max_pages); + err = alloc_mr_page_info(mr, max_pages); if (err) goto err1; @@ -205,26 +245,43 @@ err1: return err; } +/* + * I) MRs with page_size >= PAGE_SIZE, + * Split a large MR page (mr->page_size) into multiple PAGE_SIZE + * sub-pages and store them in page_info, offset is always 0. + * + * Called when mr->page_size > PAGE_SIZE. Each call to rxe_set_page() + * represents one mr->page_size region, which we must split into + * (mr->page_size >> PAGE_SHIFT) individual pages. + * + * II) MRs with page_size < PAGE_SIZE, + * Save each PAGE_SIZE page and its offset within the system page in page_info. + */ static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr) { struct rxe_mr *mr = to_rmr(ibmr); - struct page *page = ib_virt_dma_to_page(dma_addr); bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); - int err; + u32 i, pages_per_mr = mr_page_size(mr) >> PAGE_SHIFT; - if (persistent && !is_pmem_page(page)) { - rxe_dbg_mr(mr, "Page cannot be persistent\n"); - return -EINVAL; + pages_per_mr = MAX(1, pages_per_mr); + + for (i = 0; i < pages_per_mr; i++) { + u64 addr = dma_addr + i * PAGE_SIZE; + struct page *sub_page = ib_virt_dma_to_page(addr); + + if (unlikely(mr->nbuf >= mr->max_allowed_buf)) + return -ENOMEM; + + if (persistent && !is_pmem_page(sub_page)) { + rxe_dbg_mr(mr, "Page cannot be persistent\n"); + return -EINVAL; + } + + mr->page_info[mr->nbuf].page = sub_page; + mr->page_info[mr->nbuf].offset = addr & (PAGE_SIZE - 1); + mr->nbuf++; } - if (unlikely(mr->nbuf == mr->num_buf)) - return -ENOMEM; - - err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL)); - if (err) - return err; - - mr->nbuf++; return 0; } @@ -234,6 +291,31 @@ int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl, struct rxe_mr *mr = to_rmr(ibmr); unsigned int page_size = mr_page_size(mr); + /* + * Ensure page_size and PAGE_SIZE are compatible for mapping. + * We require one to be a multiple of the other for correct + * iova-to-page conversion. 
+ */ + if (!IS_ALIGNED(page_size, PAGE_SIZE) && + !IS_ALIGNED(PAGE_SIZE, page_size)) { + rxe_dbg_mr(mr, "MR page size %u must be compatible with PAGE_SIZE %lu\n", + page_size, PAGE_SIZE); + return -EINVAL; + } + + if (mr_page_size(mr) > PAGE_SIZE) { + /* resize page_info if needed */ + u32 map_mr_pages = (page_size >> PAGE_SHIFT) * mr->num_buf; + + if (map_mr_pages > mr->max_allowed_buf) { + rxe_dbg_mr(mr, "requested pages %u exceed max %u\n", + map_mr_pages, mr->max_allowed_buf); + free_mr_page_info(mr); + if (__alloc_mr_page_info(mr, map_mr_pages)) + return -ENOMEM; + } + } + mr->nbuf = 0; mr->page_shift = ilog2(page_size); mr->page_mask = ~((u64)page_size - 1); @@ -244,30 +326,30 @@ int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl, static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr, unsigned int length, enum rxe_mr_copy_dir dir) { - unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); - unsigned long index = rxe_mr_iova_to_index(mr, iova); unsigned int bytes; - struct page *page; - void *va; + u8 *va; while (length) { - page = xa_load(&mr->page_list, index); - if (!page) + unsigned long index = rxe_mr_iova_to_index(mr, iova); + struct rxe_mr_page *info = &mr->page_info[index]; + unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); + + if (!info->page) return -EFAULT; - bytes = min_t(unsigned int, length, - mr_page_size(mr) - page_offset); - va = kmap_local_page(page); + page_offset += info->offset; + bytes = min_t(unsigned int, length, PAGE_SIZE - page_offset); + va = kmap_local_page(info->page); + if (dir == RXE_FROM_MR_OBJ) memcpy(addr, va + page_offset, bytes); else memcpy(va + page_offset, addr, bytes); kunmap_local(va); - page_offset = 0; addr += bytes; + iova += bytes; length -= bytes; - index++; } return 0; @@ -425,9 +507,6 @@ err1: static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) { - unsigned int page_offset; - unsigned long index; - struct page *page; unsigned int bytes; int err; u8 *va; @@ -437,15 +516,17 @@ static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int leng return err; while (length > 0) { - index = rxe_mr_iova_to_index(mr, iova); - page = xa_load(&mr->page_list, index); - page_offset = rxe_mr_iova_to_page_offset(mr, iova); - if (!page) - return -EFAULT; - bytes = min_t(unsigned int, length, - mr_page_size(mr) - page_offset); + unsigned long index = rxe_mr_iova_to_index(mr, iova); + struct rxe_mr_page *info = &mr->page_info[index]; + unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); - va = kmap_local_page(page); + if (!info->page) + return -EFAULT; + + page_offset += info->offset; + bytes = min_t(unsigned int, length, PAGE_SIZE - page_offset); + + va = kmap_local_page(info->page); arch_wb_cache_pmem(va + page_offset, bytes); kunmap_local(va); @@ -500,6 +581,7 @@ enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, } else { unsigned long index; int err; + struct rxe_mr_page *info; err = mr_check_range(mr, iova, sizeof(value)); if (err) { @@ -508,9 +590,12 @@ enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, } page_offset = rxe_mr_iova_to_page_offset(mr, iova); index = rxe_mr_iova_to_index(mr, iova); - page = xa_load(&mr->page_list, index); - if (!page) + info = &mr->page_info[index]; + if (!info->page) return RESPST_ERR_RKEY_VIOLATION; + + page_offset += info->offset; + page = info->page; } if (unlikely(page_offset & 0x7)) { @@ -549,6 +634,7 @@ enum resp_states rxe_mr_do_atomic_write(struct 
rxe_mr *mr, u64 iova, u64 value) } else { unsigned long index; int err; + struct rxe_mr_page *info; /* See IBA oA19-28 */ err = mr_check_range(mr, iova, sizeof(value)); @@ -558,9 +644,12 @@ enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) } page_offset = rxe_mr_iova_to_page_offset(mr, iova); index = rxe_mr_iova_to_index(mr, iova); - page = xa_load(&mr->page_list, index); - if (!page) + info = &mr->page_info[index]; + if (!info->page) return RESPST_ERR_RKEY_VIOLATION; + + page_offset += info->offset; + page = info->page; } /* See IBA A19.4.2 */ @@ -724,5 +813,5 @@ void rxe_mr_cleanup(struct rxe_pool_elem *elem) ib_umem_release(mr->umem); if (mr->ibmr.type != IB_MR_TYPE_DMA) - xa_destroy(&mr->page_list); + free_mr_page_info(mr); } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index f94ce85eb807..fb149f37e91d 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -335,6 +335,11 @@ static inline int rkey_is_mw(u32 rkey) return (index >= RXE_MIN_MW_INDEX) && (index <= RXE_MAX_MW_INDEX); } +struct rxe_mr_page { + struct page *page; + unsigned int offset; /* offset in system page */ +}; + struct rxe_mr { struct rxe_pool_elem elem; struct ib_mr ibmr; @@ -350,10 +355,13 @@ struct rxe_mr { unsigned int page_shift; u64 page_mask; + /* size of page_info when mr allocated */ u32 num_buf; + /* real size of page_info */ + u32 max_allowed_buf; u32 nbuf; - struct xarray page_list; + struct rxe_mr_page *page_info; }; static inline unsigned int mr_page_size(struct rxe_mr *mr) From 2b7c2ba1308a545a2be2d3d041c2c1cde95b700a Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Tue, 20 Jan 2026 21:25:45 +0000 Subject: [PATCH 43/66] RDMA/irdma: Add enum defs for reserved CQs/QPs Added definitions for the special reserved CQs and QPs. 
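As a standalone sketch of the pattern (generic C with a mock bitmap, not the irdma code): naming the implicitly numbered reserved IDs turns opaque set_bit(1, ...)/set_bit(2, ...) calls into self-documenting reservations. The enum values below mirror the implicit 0/1/2 assignments added by this patch.

  #include <stdio.h>

  enum rsvd_cq_id { RSVD_CQ_ID_CQP, RSVD_CQ_ID_ILQ, RSVD_CQ_ID_IEQ };
  enum rsvd_qp_id { RSVD_QP_ID_0, RSVD_QP_ID_GSI_ILQ, RSVD_QP_ID_IEQ };

  /* Minimal stand-in for the kernel's set_bit() */
  static void set_bit_mock(unsigned int nr, unsigned long *map)
  {
      map[nr / (8 * sizeof(long))] |= 1UL << (nr % (8 * sizeof(long)));
  }

  int main(void)
  {
      unsigned long cqs[1] = { 0 }, qps[1] = { 0 };

      /* Reserve the fixed IDs by name rather than by magic number. */
      set_bit_mock(RSVD_CQ_ID_CQP, cqs);
      set_bit_mock(RSVD_CQ_ID_ILQ, cqs);
      set_bit_mock(RSVD_CQ_ID_IEQ, cqs);
      set_bit_mock(RSVD_QP_ID_0, qps);
      set_bit_mock(RSVD_QP_ID_GSI_ILQ, qps);
      set_bit_mock(RSVD_QP_ID_IEQ, qps);

      printf("cq map %#lx, qp map %#lx\n", cqs[0], qps[0]); /* 0x7, 0x7 */
      return 0;
  }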
Signed-off-by: Jacob Moroni Link: https://patch.msgid.link/20260120212546.1893076-1-jmoroni@google.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/hw.c | 20 ++++++++++---------- drivers/infiniband/hw/irdma/type.h | 12 ++++++++++++ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index d1fc5726b979..5d418ef5cdca 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -1532,8 +1532,8 @@ static int irdma_initialize_ilq(struct irdma_device *iwdev) int status; info.type = IRDMA_PUDA_RSRC_TYPE_ILQ; - info.cq_id = 1; - info.qp_id = 1; + info.cq_id = IRDMA_RSVD_CQ_ID_ILQ; + info.qp_id = IRDMA_RSVD_QP_ID_GSI_ILQ; info.count = 1; info.pd_id = 1; info.abi_ver = IRDMA_ABI_VER; @@ -1562,7 +1562,7 @@ static int irdma_initialize_ieq(struct irdma_device *iwdev) int status; info.type = IRDMA_PUDA_RSRC_TYPE_IEQ; - info.cq_id = 2; + info.cq_id = IRDMA_RSVD_CQ_ID_IEQ; info.qp_id = iwdev->vsi.exception_lan_q; info.count = 1; info.pd_id = 2; @@ -1868,7 +1868,7 @@ int irdma_rt_init_hw(struct irdma_device *iwdev, vsi_info.pf_data_vsi_num = iwdev->vsi_num; vsi_info.register_qset = rf->gen_ops.register_qset; vsi_info.unregister_qset = rf->gen_ops.unregister_qset; - vsi_info.exception_lan_q = 2; + vsi_info.exception_lan_q = IRDMA_RSVD_QP_ID_IEQ; irdma_sc_vsi_init(&iwdev->vsi, &vsi_info); status = irdma_setup_cm_core(iwdev, rf->rdma_ver); @@ -2099,18 +2099,18 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) irdma_set_hw_rsrc(rf); set_bit(0, rf->allocated_mrs); - set_bit(0, rf->allocated_qps); - set_bit(0, rf->allocated_cqs); + set_bit(IRDMA_RSVD_QP_ID_0, rf->allocated_qps); + set_bit(IRDMA_RSVD_CQ_ID_CQP, rf->allocated_cqs); set_bit(0, rf->allocated_srqs); set_bit(0, rf->allocated_pds); set_bit(0, rf->allocated_arps); set_bit(0, rf->allocated_ahs); set_bit(0, rf->allocated_mcgs); - set_bit(2, rf->allocated_qps); /* qp 2 IEQ */ - set_bit(1, rf->allocated_qps); /* qp 1 ILQ */ - set_bit(1, rf->allocated_cqs); + set_bit(IRDMA_RSVD_QP_ID_IEQ, rf->allocated_qps); + set_bit(IRDMA_RSVD_QP_ID_GSI_ILQ, rf->allocated_qps); + set_bit(IRDMA_RSVD_CQ_ID_ILQ, rf->allocated_cqs); set_bit(1, rf->allocated_pds); - set_bit(2, rf->allocated_cqs); + set_bit(IRDMA_RSVD_CQ_ID_IEQ, rf->allocated_cqs); set_bit(2, rf->allocated_pds); INIT_LIST_HEAD(&rf->mc_qht_list.list); diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index cab4896640a1..3de9240b727f 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -239,6 +239,18 @@ enum irdma_queue_type { IRDMA_QUEUE_TYPE_SRQ, }; +enum irdma_rsvd_cq_id { + IRDMA_RSVD_CQ_ID_CQP, + IRDMA_RSVD_CQ_ID_ILQ, + IRDMA_RSVD_CQ_ID_IEQ, +}; + +enum irdma_rsvd_qp_id { + IRDMA_RSVD_QP_ID_0, + IRDMA_RSVD_QP_ID_GSI_ILQ, + IRDMA_RSVD_QP_ID_IEQ, +}; + struct irdma_sc_dev; struct irdma_vsi_pestat; From 2529aead51673814ebf464723626ac608b8635a5 Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Tue, 20 Jan 2026 21:25:46 +0000 Subject: [PATCH 44/66] RDMA/irdma: Use CQ ID for CEQE context The hardware allows for an opaque CQ context field to be carried over into CEQEs for the CQ. Previously, a pointer to the CQ was used for this context. In the normal CQ destroy flow, the CEQ ring is scrubbed to remove any preexisting CEQEs for the CQ that may not have been processed yet so that the CQ structure is not dereferenced in the CEQ ISR after the CQ has been freed. 
However, in some cases, it is possible for a CEQE to be in flight in HW even after the CQ destroy command completion is received, so it could be missed during the scrub. To protect against this, we can take advantage of the CQ table that already exists and use the CQ ID for this context rather than a CQ pointer. Signed-off-by: Jacob Moroni Link: https://patch.msgid.link/20260120212546.1893076-2-jmoroni@google.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/ctrl.c | 62 ++++++++------------ drivers/infiniband/hw/irdma/hw.c | 88 +++++++++++++++++++++++++---- drivers/infiniband/hw/irdma/puda.c | 14 +++++ drivers/infiniband/hw/irdma/type.h | 6 +- drivers/infiniband/hw/irdma/utils.c | 3 +- drivers/infiniband/hw/irdma/verbs.c | 5 +- 6 files changed, 127 insertions(+), 51 deletions(-) diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 022fcdfab339..45c7433c96f3 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -2886,15 +2886,6 @@ static int irdma_sc_resume_qp(struct irdma_sc_cqp *cqp, struct irdma_sc_qp *qp, return 0; } -/** - * irdma_sc_cq_ack - acknowledge completion q - * @cq: cq struct - */ -static inline void irdma_sc_cq_ack(struct irdma_sc_cq *cq) -{ - writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); -} - /** * irdma_sc_cq_init - initialize completion q * @cq: cq struct @@ -2956,7 +2947,7 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, return -ENOMEM; set_64bit_val(wqe, 0, cq->cq_uk.cq_size); - set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); + set_64bit_val(wqe, 8, cq->cq_uk.cq_id); set_64bit_val(wqe, 16, FIELD_PREP(IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD, cq->shadow_read_threshold)); set_64bit_val(wqe, 32, (cq->virtual_map ? 0 : cq->cq_pa)); @@ -3013,7 +3004,7 @@ int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq) return -ENOMEM; set_64bit_val(wqe, 0, cq->cq_uk.cq_size); - set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); + set_64bit_val(wqe, 8, cq->cq_uk.cq_id); set_64bit_val(wqe, 40, cq->shadow_area_pa); set_64bit_val(wqe, 48, (cq->virtual_map ? cq->first_pm_pbl_idx : 0)); @@ -3082,7 +3073,7 @@ static int irdma_sc_cq_modify(struct irdma_sc_cq *cq, return -ENOMEM; set_64bit_val(wqe, 0, info->cq_size); - set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); + set_64bit_val(wqe, 8, cq->cq_uk.cq_id); set_64bit_val(wqe, 16, FIELD_PREP(IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD, info->shadow_read_threshold)); set_64bit_val(wqe, 32, info->cq_pa); @@ -4458,47 +4449,38 @@ int irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, bool post_sq) * irdma_sc_process_ceq - process ceq * @dev: sc device struct * @ceq: ceq sc structure + * @cq_idx: Pointer to a CQ ID that will be populated. * * It is expected caller serializes this function with cleanup_ceqes() * because these functions manipulate the same ceq + * + * Return: True if cq_idx has been populated with a CQ ID. 
*/ -void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) +bool irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq, + u32 *cq_idx) { u64 temp; __le64 *ceqe; - struct irdma_sc_cq *cq = NULL; - struct irdma_sc_cq *temp_cq; u8 polarity; - u32 cq_idx; do { - cq_idx = 0; ceqe = IRDMA_GET_CURRENT_CEQ_ELEM(ceq); get_64bit_val(ceqe, 0, &temp); polarity = (u8)FIELD_GET(IRDMA_CEQE_VALID, temp); if (polarity != ceq->polarity) - return NULL; + return false; - temp_cq = (struct irdma_sc_cq *)(unsigned long)(temp << 1); - if (!temp_cq) { - cq_idx = IRDMA_INVALID_CQ_IDX; - IRDMA_RING_MOVE_TAIL(ceq->ceq_ring); - - if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring)) - ceq->polarity ^= 1; - continue; - } - - cq = temp_cq; + /* Truncate. Discard valid bit which is MSb of temp. */ + *cq_idx = temp; + if (*cq_idx >= dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt) + *cq_idx = IRDMA_INVALID_CQ_IDX; IRDMA_RING_MOVE_TAIL(ceq->ceq_ring); if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring)) ceq->polarity ^= 1; - } while (cq_idx == IRDMA_INVALID_CQ_IDX); + } while (*cq_idx == IRDMA_INVALID_CQ_IDX); - if (cq) - irdma_sc_cq_ack(cq); - return cq; + return true; } /** @@ -4512,10 +4494,10 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) */ void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq) { - struct irdma_sc_cq *next_cq; u8 ceq_polarity = ceq->polarity; __le64 *ceqe; u8 polarity; + u32 cq_idx; u64 temp; int next; u32 i; @@ -4530,9 +4512,10 @@ void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq) if (polarity != ceq_polarity) return; - next_cq = (struct irdma_sc_cq *)(unsigned long)(temp << 1); - if (cq == next_cq) - set_64bit_val(ceqe, 0, temp & IRDMA_CEQE_VALID); + cq_idx = temp; + if (cq_idx == cq->cq_uk.cq_id) + set_64bit_val(ceqe, 0, (temp & IRDMA_CEQE_VALID) | + IRDMA_INVALID_CQ_IDX); next = IRDMA_RING_GET_NEXT_TAIL(ceq->ceq_ring, i); if (!next) @@ -4973,7 +4956,7 @@ int irdma_sc_ccq_destroy(struct irdma_sc_cq *ccq, u64 scratch, bool post_sq) return -ENOMEM; set_64bit_val(wqe, 0, ccq->cq_uk.cq_size); - set_64bit_val(wqe, 8, (uintptr_t)ccq >> 1); + set_64bit_val(wqe, 8, ccq->cq_uk.cq_id); set_64bit_val(wqe, 40, ccq->shadow_area_pa); hdr = ccq->cq_uk.cq_id | @@ -6459,6 +6442,9 @@ int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, int ret_code = 0; u8 db_size; + spin_lock_init(&dev->puda_cq_lock); + dev->ilq_cq = NULL; + dev->ieq_cq = NULL; INIT_LIST_HEAD(&dev->cqp_cmd_head); /* for CQP command backlog */ mutex_init(&dev->ws_mutex); dev->hmc_fn_id = info->hmc_fn_id; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 5d418ef5cdca..31c67b753fc0 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -98,6 +98,74 @@ static void irdma_puda_ce_handler(struct irdma_pci_f *rf, irdma_sc_ccq_arm(cq); } +/** + * irdma_process_normal_ceqe - Handle a CEQE for a normal CQ. + * @rf: RDMA PCI function. + * @dev: iWARP device. + * @cq_idx: CQ ID. Must be in table bounds. + * + * Context: Atomic (CEQ lock must be held) + */ +static void irdma_process_normal_ceqe(struct irdma_pci_f *rf, + struct irdma_sc_dev *dev, u32 cq_idx) +{ + /* cq_idx bounds validated in irdma_sc_process_ceq. */ + struct irdma_cq *icq = READ_ONCE(rf->cq_table[cq_idx]); + struct irdma_sc_cq *cq; + + if (unlikely(!icq)) { + /* Should not happen since CEQ is scrubbed upon CQ delete. 
*/ + ibdev_warn_ratelimited(to_ibdev(dev), "Stale CEQE for CQ %u", + cq_idx); + return; + } + + cq = &icq->sc_cq; + + if (unlikely(cq->cq_type != IRDMA_CQ_TYPE_IWARP)) { + ibdev_warn_ratelimited(to_ibdev(dev), "Unexpected CQ type %u", + cq->cq_type); + return; + } + + writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); + irdma_iwarp_ce_handler(cq); +} + +/** + * irdma_process_reserved_ceqe - Handle a CEQE for a reserved CQ. + * @rf: RDMA PCI function. + * @dev: iWARP device. + * @cq_idx: CQ ID. + * + * Context: Atomic + */ +static void irdma_process_reserved_ceqe(struct irdma_pci_f *rf, + struct irdma_sc_dev *dev, u32 cq_idx) +{ + struct irdma_sc_cq *cq; + + if (cq_idx == IRDMA_RSVD_CQ_ID_CQP) { + cq = &rf->ccq.sc_cq; + /* CQP CQ lifetime > CEQ. */ + writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); + queue_work(rf->cqp_cmpl_wq, &rf->cqp_cmpl_work); + } else if (cq_idx == IRDMA_RSVD_CQ_ID_ILQ || + cq_idx == IRDMA_RSVD_CQ_ID_IEQ) { + scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) { + cq = (cq_idx == IRDMA_RSVD_CQ_ID_ILQ) ? + dev->ilq_cq : dev->ieq_cq; + if (!cq) { + ibdev_warn_ratelimited(to_ibdev(dev), + "Stale ILQ/IEQ CEQE"); + return; + } + writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); + irdma_puda_ce_handler(rf, cq); + } + } +} + /** * irdma_process_ceq - handle ceq for completions * @rf: RDMA PCI function @@ -107,28 +175,28 @@ static void irdma_process_ceq(struct irdma_pci_f *rf, struct irdma_ceq *ceq) { struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_sc_ceq *sc_ceq; - struct irdma_sc_cq *cq; unsigned long flags; + u32 cq_idx; sc_ceq = &ceq->sc_ceq; do { spin_lock_irqsave(&ceq->ce_lock, flags); - cq = irdma_sc_process_ceq(dev, sc_ceq); - if (!cq) { + + if (!irdma_sc_process_ceq(dev, sc_ceq, &cq_idx)) { spin_unlock_irqrestore(&ceq->ce_lock, flags); break; } - if (cq->cq_type == IRDMA_CQ_TYPE_IWARP) - irdma_iwarp_ce_handler(cq); + /* Normal CQs must be handled while holding CEQ lock. 
*/ + if (likely(cq_idx > IRDMA_RSVD_CQ_ID_IEQ)) { + irdma_process_normal_ceqe(rf, dev, cq_idx); + spin_unlock_irqrestore(&ceq->ce_lock, flags); + continue; + } spin_unlock_irqrestore(&ceq->ce_lock, flags); - if (cq->cq_type == IRDMA_CQ_TYPE_CQP) - queue_work(rf->cqp_cmpl_wq, &rf->cqp_cmpl_work); - else if (cq->cq_type == IRDMA_CQ_TYPE_ILQ || - cq->cq_type == IRDMA_CQ_TYPE_IEQ) - irdma_puda_ce_handler(rf, cq); + irdma_process_reserved_ceqe(rf, dev, cq_idx); } while (1); } diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index cee47ddbd1b5..4f1a8c97faf1 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -809,6 +809,13 @@ error: dma_free_coherent(dev->hw->device, rsrc->cqmem.size, rsrc->cqmem.va, rsrc->cqmem.pa); rsrc->cqmem.va = NULL; + } else { + scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) { + if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ) + dev->ilq_cq = cq; + else + dev->ieq_cq = cq; + } } return ret; @@ -856,6 +863,13 @@ static void irdma_puda_free_cq(struct irdma_puda_rsrc *rsrc) struct irdma_ccq_cqe_info compl_info; struct irdma_sc_dev *dev = rsrc->dev; + scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) { + if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ) + dev->ilq_cq = NULL; + else + dev->ieq_cq = NULL; + } + if (rsrc->dev->ceq_valid) { irdma_cqp_cq_destroy_cmd(dev, &rsrc->cq); return; diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index 3de9240b727f..da8c54d1f035 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -707,6 +707,9 @@ struct irdma_sc_dev { struct irdma_sc_aeq *aeq; struct irdma_sc_ceq *ceq[IRDMA_CEQ_MAX_COUNT]; struct irdma_sc_cq *ccq; + spinlock_t puda_cq_lock; + struct irdma_sc_cq *ilq_cq; + struct irdma_sc_cq *ieq_cq; const struct irdma_irq_ops *irq_ops; struct irdma_qos qos[IRDMA_MAX_USER_PRIORITY]; struct irdma_hmc_fpm_misc hmc_fpm_misc; @@ -1344,7 +1347,8 @@ int irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, bool post_sq); int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, struct irdma_ceq_init_info *info); void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq); -void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq); +bool irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq, + u32 *cq_idx); int irdma_sc_aeq_init(struct irdma_sc_aeq *aeq, struct irdma_aeq_init_info *info); diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 3bac7c2588ae..6a385cea6b2c 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -829,7 +829,8 @@ void irdma_cq_rem_ref(struct ib_cq *ibcq) return; } - iwdev->rf->cq_table[iwcq->cq_num] = NULL; + /* May be asynchronously sampled by CEQ ISR without holding tbl lock. */ + WRITE_ONCE(iwdev->rf->cq_table[iwcq->cq_num], NULL); spin_unlock_irqrestore(&iwdev->rf->cqtable_lock, flags); complete(&iwcq->free_cq); } diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 1f1efd4971a9..cf8d19150574 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2669,9 +2669,12 @@ static int irdma_create_cq(struct ib_cq *ibcq, goto cq_destroy; } } - rf->cq_table[cq_num] = iwcq; + init_completion(&iwcq->free_cq); + /* Populate table entry after CQ is fully created. 
*/ + smp_store_release(&rf->cq_table[cq_num], iwcq); + return 0; cq_destroy: irdma_cq_wq_destroy(rf, cq); From 1956f0a74ccf5dc9c3ef717f2985c3ed3400aab0 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 22 Jan 2026 22:29:00 +0800 Subject: [PATCH 45/66] RDMA/uverbs: Validate wqe_size before using it in ib_uverbs_post_send ib_uverbs_post_send() uses cmd.wqe_size from userspace without any validation before passing it to kmalloc() and using the allocated buffer as struct ib_uverbs_send_wr. If a user provides a small wqe_size value (e.g., 1), kmalloc() will succeed, but subsequent accesses to user_wr->opcode, user_wr->num_sge, and other fields will read beyond the allocated buffer, resulting in an out-of-bounds read from kernel heap memory. This could potentially leak sensitive kernel information to userspace. Additionally, providing an excessively large wqe_size can trigger a WARNING in the memory allocation path, as reported by syzkaller. This is inconsistent with ib_uverbs_unmarshall_recv() which properly validates that wqe_size >= sizeof(struct ib_uverbs_recv_wr) before proceeding. Add the same validation for ib_uverbs_post_send() to ensure wqe_size is at least sizeof(struct ib_uverbs_send_wr). Fixes: c3bea3d2dc53 ("RDMA/uverbs: Use the iterator for ib_uverbs_unmarshall_recv()") Signed-off-by: Yi Liu Link: https://patch.msgid.link/20260122142900.2356276-2-liuy22@mails.tsinghua.edu.cn Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_cmd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index ce16404cdfb8..3259e9848cc7 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2049,7 +2049,10 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) if (ret) return ret; - user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL); + if (cmd.wqe_size < sizeof(struct ib_uverbs_send_wr)) + return -EINVAL; + + user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL | __GFP_NOWARN); if (!user_wr) return -ENOMEM; From 9b9d253908478f504297ac283c514e5953ddafa6 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Mon, 26 Jan 2026 07:48:01 +0000 Subject: [PATCH 46/66] RDMA/mlx5: Fix memory leak in GET_DATA_DIRECT_SYSFS_PATH handler The UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH) function allocates memory for the device path using kobject_get_path(). If the length of the device path exceeds the output buffer length, the function returns -ENOSPC but does not free the allocated memory, resulting in a memory leak. Add a kfree() call to the error path to ensure the allocated memory is properly freed. Compile tested only. Issue found using a prototype static analysis tool and code review. 
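The shape of the fix is the usual single-exit cleanup pattern. A minimal userspace analogue (free() standing in for kfree(), strdup() for kobject_get_path(); the function name and path are invented for this example):

  #include <errno.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  static int get_path(char *out, size_t out_len)
  {
      char *dev_path = NULL; /* NULL-init: safe to free on every path */
      int ret;

      dev_path = strdup("/sys/devices/mockdev"); /* stand-in allocation */
      if (!dev_path) {
          ret = -ENOMEM;
          goto end;
      }

      if (strlen(dev_path) + 1 > out_len) {
          ret = -ENOSPC; /* this exit previously leaked dev_path */
          goto end;
      }

      memcpy(out, dev_path, strlen(dev_path) + 1);
      ret = 0;
  end:
      free(dev_path); /* single cleanup point, as in the patch */
      return ret;
  }

  int main(void)
  {
      char small[4];

      printf("ret=%d\n", get_path(small, sizeof(small))); /* -ENOSPC, no leak */
      return 0;
  }

Because free(NULL), like kfree(NULL), is a defined no-op, NULL-initializing the pointer lets every exit path share one cleanup site.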
Fixes: ec7ad6530909 ("RDMA/mlx5: Introduce GET_DATA_DIRECT_SYSFS_PATH ioctl") Signed-off-by: Zilin Guan Link: https://patch.msgid.link/20260126074801.627898-1-zilin@seu.edu.cn Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/std_types.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index 2fcf553044e1..1ee31611b4b3 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -195,7 +195,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)( int out_len = uverbs_attr_get_len(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH); u32 dev_path_len; - char *dev_path; + char *dev_path = NULL; int ret; c = to_mucontext(ib_uverbs_get_ucontext(attrs)); @@ -223,9 +223,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)( ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path, dev_path_len); - kfree(dev_path); end: + kfree(dev_path); mutex_unlock(&dev->data_direct_lock); return ret; } From a01745ccf7c41043c503546cae7ba7b0ff499d38 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Tue, 27 Jan 2026 00:26:49 -0800 Subject: [PATCH 47/66] =?UTF-8?q?RDMA/mana=5Fib:=20Add=20device=E2=80=91me?= =?UTF-8?q?mory=20support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a basic DM implementation that enables creating and registering device memory, and using the associated memory keys for networking operations. Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/20260127082649.429018-1-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 7 ++ drivers/infiniband/hw/mana/mana_ib.h | 12 +++ drivers/infiniband/hw/mana/mr.c | 130 +++++++++++++++++++++++++++ include/net/mana/gdma.h | 47 +++++++++- 4 files changed, 193 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index bdeddb642b87..ccc2279ca63c 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -69,6 +69,12 @@ static const struct ib_device_ops mana_ib_device_stats_ops = { .alloc_hw_device_stats = mana_ib_alloc_hw_device_stats, }; +const struct ib_device_ops mana_ib_dev_dm_ops = { + .alloc_dm = mana_ib_alloc_dm, + .dealloc_dm = mana_ib_dealloc_dm, + .reg_dm_mr = mana_ib_reg_dm_mr, +}; + static int mana_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -139,6 +145,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops); if (dev->adapter_caps.feature_flags & MANA_IB_FEATURE_DEV_COUNTERS_SUPPORT) ib_set_device_ops(&dev->ib_dev, &mana_ib_device_stats_ops); + ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_dm_ops); ret = mana_ib_create_eqs(dev); if (ret) { diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 9d36232ed880..e447acfd2071 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -131,6 +131,11 @@ struct mana_ib_mr { mana_handle_t mr_handle; }; +struct mana_ib_dm { + struct ib_dm ibdm; + mana_handle_t dm_handle; +}; + struct mana_ib_cq { struct ib_cq ibcq; struct mana_ib_queue queue; @@ -735,4 +740,11 @@ struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 leng u64 iova, int fd, int mr_access_flags, struct ib_dmah *dmah, struct 
uverbs_attr_bundle *attrs); + +struct ib_dm *mana_ib_alloc_dm(struct ib_device *dev, struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs); +int mana_ib_dealloc_dm(struct ib_dm *dm, struct uverbs_attr_bundle *attrs); +struct ib_mr *mana_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs); + #endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 3d0245a4c1ed..f979f26adc3b 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -40,6 +40,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req), sizeof(resp)); + req.hdr.req.msg_version = GDMA_MESSAGE_V2; req.pd_handle = mr_params->pd_handle; req.mr_type = mr_params->mr_type; @@ -55,6 +56,12 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, req.zbva.dma_region_handle = mr_params->zbva.dma_region_handle; req.zbva.access_flags = mr_params->zbva.access_flags; break; + case GDMA_MR_TYPE_DM: + req.da_ext.length = mr_params->da.length; + req.da.dm_handle = mr_params->da.dm_handle; + req.da.offset = mr_params->da.offset; + req.da.access_flags = mr_params->da.access_flags; + break; default: ibdev_dbg(&dev->ib_dev, "invalid param (GDMA_MR_TYPE) passed, type %d\n", @@ -317,3 +324,126 @@ int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return 0; } + +static int mana_ib_gd_alloc_dm(struct mana_ib_dev *mdev, struct mana_ib_dm *dm, + struct ib_dm_alloc_attr *attr) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct gdma_alloc_dm_resp resp = {}; + struct gdma_alloc_dm_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOC_DM, sizeof(req), sizeof(resp)); + req.length = attr->length; + req.alignment = attr->alignment; + req.flags = attr->flags; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + if (!err) + err = -EPROTO; + + return err; + } + + dm->dm_handle = resp.dm_handle; + + return 0; +} + +struct ib_dm *mana_ib_alloc_dm(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + struct mana_ib_dm *dm; + int err; + + dm = kzalloc(sizeof(*dm), GFP_KERNEL); + if (!dm) + return ERR_PTR(-ENOMEM); + + err = mana_ib_gd_alloc_dm(dev, dm, attr); + if (err) + goto err_free; + + return &dm->ibdm; + +err_free: + kfree(dm); + return ERR_PTR(err); +} + +static int mana_ib_gd_destroy_dm(struct mana_ib_dev *mdev, struct mana_ib_dm *dm) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct gdma_destroy_dm_resp resp = {}; + struct gdma_destroy_dm_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_DM, sizeof(req), sizeof(resp)); + req.dm_handle = dm->dm_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + if (!err) + err = -EPROTO; + + return err; + } + + return 0; +} + +int mana_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs) +{ + struct mana_ib_dev *dev = container_of(ibdm->device, struct mana_ib_dev, ib_dev); + struct mana_ib_dm *dm = container_of(ibdm, struct mana_ib_dm, ibdm); + int err; + + err = mana_ib_gd_destroy_dm(dev, dm); + if (err) + return err; + + kfree(dm); + return 0; +} + +struct ib_mr *mana_ib_reg_dm_mr(struct ib_pd 
*ibpd, struct ib_dm *ibdm, + struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mana_ib_dev *dev = container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_dm *mana_dm = container_of(ibdm, struct mana_ib_dm, ibdm); + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct mana_ib_mr *mr; + int err; + + attr->access_flags &= ~IB_ACCESS_OPTIONAL; + if (attr->access_flags & ~VALID_MR_FLAGS) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_DM; + mr_params.da.dm_handle = mana_dm->dm_handle; + mr_params.da.offset = attr->offset; + mr_params.da.length = attr->length; + mr_params.da.access_flags = + mana_ib_verbs_to_gdma_access_flags(attr->access_flags); + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_free; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index eaa27483f99b..8649eb789c0e 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -35,6 +35,8 @@ enum gdma_request_type { GDMA_CREATE_MR = 31, GDMA_DESTROY_MR = 32, GDMA_QUERY_HWC_TIMEOUT = 84, /* 0x54 */ + GDMA_ALLOC_DM = 96, /* 0x60 */ + GDMA_DESTROY_DM = 97, /* 0x61 */ }; #define GDMA_RESOURCE_DOORBELL_PAGE 27 @@ -861,6 +863,8 @@ enum gdma_mr_type { GDMA_MR_TYPE_GVA = 2, /* Guest zero-based address MRs */ GDMA_MR_TYPE_ZBVA = 4, + /* Device address MRs */ + GDMA_MR_TYPE_DM = 5, }; struct gdma_create_mr_params { @@ -876,6 +880,12 @@ struct gdma_create_mr_params { u64 dma_region_handle; enum gdma_mr_access_flags access_flags; } zbva; + struct { + u64 dm_handle; + u64 offset; + u64 length; + enum gdma_mr_access_flags access_flags; + } da; }; }; @@ -890,13 +900,23 @@ struct gdma_create_mr_request { u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; - } gva; struct { u64 dma_region_handle; enum gdma_mr_access_flags access_flags; - } zbva; - }; + } __packed gva; + struct { + u64 dm_handle; + u64 offset; + enum gdma_mr_access_flags access_flags; + } __packed da; + } __packed; u32 reserved_2; + union { + struct { + u64 length; + } da_ext; + }; };/* HW DATA */ struct gdma_create_mr_response { @@ -915,6 +935,27 @@ struct gdma_destroy_mr_response { struct gdma_resp_hdr hdr; };/* HW DATA */ +struct gdma_alloc_dm_req { + struct gdma_req_hdr hdr; + u64 length; + u32 alignment; + u32 flags; +}; /* HW Data */ + +struct gdma_alloc_dm_resp { + struct gdma_resp_hdr hdr; + u64 dm_handle; +}; /* HW Data */ + +struct gdma_destroy_dm_req { + struct gdma_req_hdr hdr; + u64 dm_handle; +}; /* HW Data */ + +struct gdma_destroy_dm_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + int mana_gd_verify_vf_version(struct pci_dev *pdev); int mana_gd_register_device(struct gdma_dev *gd); From 87bf646921430e303176edc4eb07c30160361b73 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 20 Jan 2026 15:44:37 +0800 Subject: [PATCH 48/66] RDMA/rxe: Fix race condition in QP timer handlers I encountered the following warning: WARNING: drivers/infiniband/sw/rxe/rxe_task.c:249 at rxe_sched_task+0x1c8/0x238 [rdma_rxe], CPU#0: swapper/0/0 ...
libsha1 [last unloaded: ip6_udp_tunnel] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G C 6.19.0-rc5-64k-v8+ #37 PREEMPT Tainted: [C]=CRAP Hardware name: Raspberry Pi 4 Model B Rev 1.2 Call trace: rxe_sched_task+0x1c8/0x238 [rdma_rxe] (P) retransmit_timer+0x130/0x188 [rdma_rxe] call_timer_fn+0x68/0x4d0 __run_timers+0x630/0x888 ... WARNING: drivers/infiniband/sw/rxe/rxe_task.c:38 at rxe_sched_task+0x1c0/0x238 [rdma_rxe], CPU#0: swapper/0/0 ... WARNING: drivers/infiniband/sw/rxe/rxe_task.c:111 at do_work+0x488/0x5c8 [rdma_rxe], CPU#3: kworker/u17:4/93400 ... refcount_t: underflow; use-after-free. WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x138/0x1a0, CPU#3: kworker/u17:4/93400 The issue is caused by a race condition between retransmit_timer() and rxe_destroy_qp, leading to the Queue Pair's (QP) reference count dropping to zero during timer handler execution. It seems this warning is harmless because rxe_qp_do_cleanup() will flush all pending timers and requests. Example of flow causing the issue: CPU0 CPU1 retransmit_timer() { spin_lock_irqsave rxe_destroy_qp() __rxe_cleanup() __rxe_put() // qp->ref_count decrease to 0 rxe_qp_do_cleanup() { if (qp->valid) { rxe_sched_task() { WARN_ON(rxe_read(task->qp) <= 0); } } spin_unlock_irqrestore } spin_lock_irqsave qp->valid = 0 spin_unlock_irqrestore } Ensure the QP's reference count is maintained and its validity is checked within the timer callbacks by adding calls to rxe_get(qp) and corresponding rxe_put(qp) after use. Signed-off-by: Li Zhijian Fixes: d94671632572 ("RDMA/rxe: Rewrite rxe_task.c") Link: https://patch.msgid.link/20260120074437.623018-1-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_comp.c | 3 +++ drivers/infiniband/sw/rxe/rxe_req.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index a5b2b62f596b..1390e861bd1d 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -119,12 +119,15 @@ void retransmit_timer(struct timer_list *t) rxe_dbg_qp(qp, "retransmit timer fired\n"); + if (!rxe_get(qp)) + return; spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { qp->comp.timeout = 1; rxe_sched_task(&qp->send_task); } spin_unlock_irqrestore(&qp->state_lock, flags); + rxe_put(qp); } void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 373b03f223be..12d03f390b09 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -102,6 +102,8 @@ void rnr_nak_timer(struct timer_list *t) rxe_dbg_qp(qp, "nak timer fired\n"); + if (!rxe_get(qp)) + return; spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { /* request a send queue retry */ @@ -110,6 +112,7 @@ void rnr_nak_timer(struct timer_list *t) rxe_sched_task(&qp->send_task); } spin_unlock_irqrestore(&qp->state_lock, flags); + rxe_put(qp); } static void req_check_sq_drain_done(struct rxe_qp *qp) From 959d2c356e32abde9c5b95c7e83236cded94251a Mon Sep 17 00:00:00 2001 From: Carlos Bilbao Date: Tue, 27 Jan 2026 17:44:46 -0800 Subject: [PATCH 49/66] RDMA/irdma: Use kvzalloc for paged memory DMA address array Allocate array chunk->dmainfo.dmaaddrs using kvzalloc() to allow the allocation to fall back to vmalloc when contiguous memory is unavailable (instead of failing and logging page allocation warnings). 
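As a sketch of the allocator semantics (not the patch's code; the helpers and the nr_pages parameter are invented for this example): kvzalloc() tries kmalloc() first and transparently falls back to vzalloc() when contiguous pages are unavailable, so the buffer must always be released with kvfree(), which handles both cases.

    #include <linux/slab.h>         /* kvzalloc(), kvfree() */
    #include <linux/overflow.h>     /* array_size() */

    static dma_addr_t *alloc_dma_addr_array(u32 nr_pages)
    {
            /* array_size() saturates on overflow instead of wrapping */
            return kvzalloc(array_size(nr_pages, sizeof(dma_addr_t)),
                            GFP_KERNEL);
    }

    static void free_dma_addr_array(dma_addr_t *addrs)
    {
            kvfree(addrs);          /* never plain kfree() for kv* memory */
    }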
Acked-by: Tatyana Nikolova Signed-off-by: Carlos Bilbao (Lambda) Link: https://patch.msgid.link/20260128014446.405247-1-carlos.bilbao@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/utils.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 6a385cea6b2c..9d737f9e52df 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2239,7 +2239,7 @@ void irdma_pble_free_paged_mem(struct irdma_chunk *chunk) chunk->pg_cnt); done: - kfree(chunk->dmainfo.dmaaddrs); + kvfree(chunk->dmainfo.dmaaddrs); chunk->dmainfo.dmaaddrs = NULL; vfree(chunk->vaddr); chunk->vaddr = NULL; @@ -2256,7 +2256,7 @@ int irdma_pble_get_paged_mem(struct irdma_chunk *chunk, u32 pg_cnt) u32 size; void *va; - chunk->dmainfo.dmaaddrs = kzalloc(pg_cnt << 3, GFP_KERNEL); + chunk->dmainfo.dmaaddrs = kvzalloc(pg_cnt << 3, GFP_KERNEL); if (!chunk->dmainfo.dmaaddrs) return -ENOMEM; @@ -2277,7 +2277,7 @@ int irdma_pble_get_paged_mem(struct irdma_chunk *chunk, u32 pg_cnt) return 0; err: - kfree(chunk->dmainfo.dmaaddrs); + kvfree(chunk->dmainfo.dmaaddrs); chunk->dmainfo.dmaaddrs = NULL; return -ENOMEM; From 5e541553588d493bd9317bc8a8c1ab85cbddc2c5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Jan 2026 19:53:56 -0500 Subject: [PATCH 50/66] RDMA/core: add bio_vec based RDMA read/write API The existing rdma_rw_ctx_init() API requires callers to construct a scatterlist, which is then DMA-mapped page by page. Callers that already have data in bio_vec form (such as the NVMe-oF target) must first convert to scatterlist, adding overhead and complexity. Introduce rdma_rw_ctx_init_bvec() and rdma_rw_ctx_destroy_bvec() to accept bio_vec arrays directly. The new helpers use dma_map_phys() for hardware RDMA devices and virtual addressing for software RDMA devices (rxe, siw), avoiding intermediate scatterlist construction. Memory registration (MR) path support is deferred to a follow-up series; callers requiring MR-based transfers (iWARP devices or force_mr=1) receive -EOPNOTSUPP and should use the scatterlist API. 
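To make the intended call pattern concrete, here is a hedged sketch of a ULP caller built only from the signatures this patch adds; the completion plumbing (cqe) and the total-length bookkeeping are the caller's responsibility, and error handling is trimmed:

    #include <rdma/rw.h>

    static int post_rdma_read(struct ib_qp *qp, u32 port_num,
                              const struct bio_vec *bvecs, u32 nr_bvec,
                              u32 total_len, u64 raddr, u32 rkey,
                              struct ib_cqe *cqe)
    {
            struct bvec_iter iter = { .bi_size = total_len };
            struct rdma_rw_ctx ctx;
            int ret;

            ret = rdma_rw_ctx_init_bvec(&ctx, qp, port_num, bvecs, nr_bvec,
                                        iter, raddr, rkey, DMA_FROM_DEVICE);
            if (ret < 0)
                    return ret;     /* -EOPNOTSUPP: fall back to the
                                     * scatterlist-based rdma_rw_ctx_init() */

            ret = rdma_rw_ctx_post(&ctx, qp, port_num, cqe, NULL);
            if (ret < 0)
                    rdma_rw_ctx_destroy_bvec(&ctx, qp, port_num, bvecs,
                                             nr_bvec, DMA_FROM_DEVICE);
            return ret;
    }

On successful completion the caller releases the mapping with the same rdma_rw_ctx_destroy_bvec() call from its CQ handler.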
Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever Link: https://patch.msgid.link/20260128005400.25147-2-cel@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/rw.c | 197 +++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 42 ++++++++ include/rdma/rw.h | 11 ++ 3 files changed, 250 insertions(+) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 6354ddf2a274..39ca21d18d7b 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -274,6 +274,115 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, return 1; } +static int rdma_rw_init_single_wr_bvec(struct rdma_rw_ctx *ctx, + struct ib_qp *qp, const struct bio_vec *bvecs, + struct bvec_iter *iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct ib_rdma_wr *rdma_wr = &ctx->single.wr; + struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter); + u64 dma_addr; + + ctx->nr_ops = 1; + + dma_addr = ib_dma_map_bvec(dev, &bv, dir); + if (ib_dma_mapping_error(dev, dma_addr)) + return -ENOMEM; + + ctx->single.sge.lkey = qp->pd->local_dma_lkey; + ctx->single.sge.addr = dma_addr; + ctx->single.sge.length = bv.bv_len; + + memset(rdma_wr, 0, sizeof(*rdma_wr)); + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->wr.sg_list = &ctx->single.sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + + ctx->type = RDMA_RW_SINGLE_WR; + return 1; +} + +static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + const struct bio_vec *bvecs, u32 nr_bvec, struct bvec_iter *iter, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge : + qp->max_read_sge; + struct ib_sge *sge; + u32 total_len = 0, i, j; + u32 mapped_bvecs = 0; + u32 nr_ops = DIV_ROUND_UP(nr_bvec, max_sge); + size_t sges_size = array_size(nr_bvec, sizeof(*ctx->map.sges)); + size_t wrs_offset = ALIGN(sges_size, __alignof__(*ctx->map.wrs)); + size_t wrs_size = array_size(nr_ops, sizeof(*ctx->map.wrs)); + void *mem; + + if (sges_size == SIZE_MAX || wrs_size == SIZE_MAX || + check_add_overflow(wrs_offset, wrs_size, &wrs_size)) + return -ENOMEM; + + mem = kzalloc(wrs_size, GFP_KERNEL); + if (!mem) + return -ENOMEM; + + ctx->map.sges = sge = mem; + ctx->map.wrs = mem + wrs_offset; + + for (i = 0; i < nr_ops; i++) { + struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i]; + u32 nr_sge = min(nr_bvec - mapped_bvecs, max_sge); + + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->remote_addr = remote_addr + total_len; + rdma_wr->rkey = rkey; + rdma_wr->wr.num_sge = nr_sge; + rdma_wr->wr.sg_list = sge; + + for (j = 0; j < nr_sge; j++) { + struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter); + u64 dma_addr; + + dma_addr = ib_dma_map_bvec(dev, &bv, dir); + if (ib_dma_mapping_error(dev, dma_addr)) + goto out_unmap; + + mapped_bvecs++; + sge->addr = dma_addr; + sge->length = bv.bv_len; + sge->lkey = qp->pd->local_dma_lkey; + + total_len += bv.bv_len; + sge++; + + bvec_iter_advance_single(bvecs, iter, bv.bv_len); + } + + rdma_wr->wr.next = i + 1 < nr_ops ? 
+ &ctx->map.wrs[i + 1].wr : NULL; + } + + ctx->nr_ops = nr_ops; + ctx->type = RDMA_RW_MULTI_WR; + return nr_ops; + +out_unmap: + for (i = 0; i < mapped_bvecs; i++) + ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr, + ctx->map.sges[i].length, dir); + kfree(ctx->map.sges); + return -ENOMEM; +} + /** * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context * @ctx: context to initialize @@ -344,6 +453,53 @@ out_unmap_sg: } EXPORT_SYMBOL(rdma_rw_ctx_init); +/** + * rdma_rw_ctx_init_bvec - initialize a RDMA READ/WRITE context from bio_vec + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @bvecs: bio_vec array to READ/WRITE from/to + * @nr_bvec: number of entries in @bvecs + * @iter: bvec iterator describing offset and length + * @remote_addr: remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Accepts bio_vec arrays directly, avoiding scatterlist conversion for + * callers that already have data in bio_vec form. Prefer this over + * rdma_rw_ctx_init() when the source data is a bio_vec array. + * + * This function does not support devices requiring memory registration. + * iWARP devices and configurations with force_mr=1 should use + * rdma_rw_ctx_init() with a scatterlist instead. + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code: + * + * * -EINVAL - @nr_bvec is zero or @iter.bi_size is zero + * * -EOPNOTSUPP - device requires MR path (iWARP or force_mr=1) + * * -ENOMEM - DMA mapping or memory allocation failed + */ +int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, + struct bvec_iter iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + if (nr_bvec == 0 || iter.bi_size == 0) + return -EINVAL; + + /* MR path not supported for bvec - reject iWARP and force_mr */ + if (rdma_rw_io_needs_mr(qp->device, port_num, dir, nr_bvec)) + return -EOPNOTSUPP; + + if (nr_bvec == 1) + return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter, + remote_addr, rkey, dir); + return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter, + remote_addr, rkey, dir); +} +EXPORT_SYMBOL(rdma_rw_ctx_init_bvec); + /** * rdma_rw_ctx_signature_init - initialize a RW context with signature offload * @ctx: context to initialize @@ -598,6 +754,47 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, } EXPORT_SYMBOL(rdma_rw_ctx_destroy); +/** + * rdma_rw_ctx_destroy_bvec - release resources from rdma_rw_ctx_init_bvec + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound (unused) + * @bvecs: bio_vec array that was used for the READ/WRITE (unused) + * @nr_bvec: number of entries in @bvecs + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Releases all resources allocated by a successful rdma_rw_ctx_init_bvec() + * call. Must not be called if rdma_rw_ctx_init_bvec() returned an error. + * + * The @port_num and @bvecs parameters are unused but present for API + * symmetry with rdma_rw_ctx_destroy(). 
+ */ +void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 __maybe_unused port_num, + const struct bio_vec __maybe_unused *bvecs, + u32 nr_bvec, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 i; + + switch (ctx->type) { + case RDMA_RW_MULTI_WR: + for (i = 0; i < nr_bvec; i++) + ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr, + ctx->map.sges[i].length, dir); + kfree(ctx->map.sges); + break; + case RDMA_RW_SINGLE_WR: + ib_dma_unmap_bvec(dev, ctx->single.sge.addr, + ctx->single.sge.length, dir); + break; + default: + WARN_ON_ONCE(1); + return; + } +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy_bvec); + /** * rdma_rw_ctx_destroy_signature - release all resources allocated by * rdma_rw_ctx_signature_init diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6c372a37c482..8bd020da7745 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -4266,6 +4267,47 @@ static inline void ib_dma_unmap_page(struct ib_device *dev, dma_unmap_page(dev->dma_device, addr, size, direction); } +/** + * ib_dma_map_bvec - Map a bio_vec to DMA address + * @dev: The device for which the dma_addr is to be created + * @bvec: The bio_vec to map + * @direction: The direction of the DMA + * + * Returns a DMA address for the bio_vec. The caller must check the + * result with ib_dma_mapping_error() before use; a failed mapping + * must not be passed to ib_dma_unmap_bvec(). + * + * For software RDMA devices (rxe, siw), returns a virtual address + * and no actual DMA mapping occurs. + */ +static inline u64 ib_dma_map_bvec(struct ib_device *dev, + struct bio_vec *bvec, + enum dma_data_direction direction) +{ + if (ib_uses_virt_dma(dev)) + return (uintptr_t)bvec_virt(bvec); + return dma_map_phys(dev->dma_device, bvec_phys(bvec), + bvec->bv_len, direction, 0); +} + +/** + * ib_dma_unmap_bvec - Unmap a bio_vec DMA mapping + * @dev: The device for which the DMA address was created + * @addr: The DMA address returned by ib_dma_map_bvec() + * @size: The size of the region in bytes + * @direction: The direction of the DMA + * + * Releases a DMA mapping created by ib_dma_map_bvec(). For software + * RDMA devices this is a no-op since no actual mapping occurred. 
+ */ +static inline void ib_dma_unmap_bvec(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + if (!ib_uses_virt_dma(dev)) + dma_unmap_phys(dev->dma_device, addr, size, direction, 0); +} + int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents); static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, diff --git a/include/rdma/rw.h b/include/rdma/rw.h index d606cac48233..b2fc3e2373d7 100644 --- a/include/rdma/rw.h +++ b/include/rdma/rw.h @@ -5,6 +5,7 @@ #ifndef _RDMA_RW_H #define _RDMA_RW_H +#include #include #include #include @@ -49,6 +50,16 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir); +struct bio_vec; + +int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, + struct bvec_iter iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir); +void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, + enum dma_data_direction dir); + int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, From 853e892076ba5666c81afbc86552e008280f9768 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Jan 2026 19:53:57 -0500 Subject: [PATCH 51/66] RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations The bvec RDMA API maps each bvec individually via dma_map_phys(), requiring an IOTLB sync for each mapping. For large I/O operations with many bvecs, this overhead becomes significant. The two-step IOVA API (dma_iova_try_alloc / dma_iova_link / dma_iova_sync) allocates a contiguous IOVA range upfront, links all physical pages without IOTLB syncs, then performs a single sync at the end. This reduces IOTLB flushes from O(n) to O(1). It also requires only a single output dma_addr_t compared to extra per-input element storage in struct scatterlist. Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever Link: https://patch.msgid.link/20260128005400.25147-3-cel@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/rw.c | 106 +++++++++++++++++++++++++++++++++++ include/rdma/rw.h | 8 +++ 2 files changed, 114 insertions(+) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 39ca21d18d7b..c2fc8cba972e 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -14,6 +14,7 @@ enum { RDMA_RW_MULTI_WR, RDMA_RW_MR, RDMA_RW_SIG_MR, + RDMA_RW_IOVA, }; static bool rdma_rw_force_mr; @@ -383,6 +384,87 @@ out_unmap: return -ENOMEM; } +/* + * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range. + * This reduces IOTLB sync overhead by doing one sync at the end instead of + * one per bvec, and produces a contiguous DMA address range that can be + * described by a single SGE. + * + * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA + * mapping is not available, or another negative error code on failure. 
+ */ +static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx, + struct ib_qp *qp, const struct bio_vec *bvec, + struct bvec_iter *iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct device *dma_dev = dev->dma_device; + size_t total_len = iter->bi_size; + struct bio_vec first_bv; + size_t mapped_len = 0; + int ret; + + /* Virtual DMA devices cannot support IOVA allocators */ + if (ib_uses_virt_dma(dev)) + return -EOPNOTSUPP; + + /* Try to allocate contiguous IOVA space */ + first_bv = mp_bvec_iter_bvec(bvec, *iter); + if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state, + bvec_phys(&first_bv), total_len)) + return -EOPNOTSUPP; + + /* Link all bvecs into the IOVA space */ + while (iter->bi_size) { + struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter); + + ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv), + mapped_len, bv.bv_len, dir, 0); + if (ret) + goto out_destroy; + + mapped_len += bv.bv_len; + bvec_iter_advance(bvec, iter, bv.bv_len); + } + + /* Sync the IOTLB once for all linked pages */ + ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len); + if (ret) + goto out_destroy; + + ctx->iova.mapped_len = mapped_len; + + /* Single SGE covers the entire contiguous IOVA range */ + ctx->iova.sge.addr = ctx->iova.state.addr; + ctx->iova.sge.length = mapped_len; + ctx->iova.sge.lkey = qp->pd->local_dma_lkey; + + /* Single WR for the whole transfer */ + memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr)); + if (dir == DMA_TO_DEVICE) + ctx->iova.wr.wr.opcode = IB_WR_RDMA_WRITE; + else + ctx->iova.wr.wr.opcode = IB_WR_RDMA_READ; + ctx->iova.wr.wr.num_sge = 1; + ctx->iova.wr.wr.sg_list = &ctx->iova.sge; + ctx->iova.wr.remote_addr = remote_addr; + ctx->iova.wr.rkey = rkey; + + ctx->type = RDMA_RW_IOVA; + ctx->nr_ops = 1; + return 1; + +out_destroy: + /* + * dma_iova_destroy() expects the actual mapped length, not the + * total allocation size. It unlinks only the successfully linked + * range and frees the entire IOVA allocation. + */ + dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0); + return ret; +} + /** * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context * @ctx: context to initialize @@ -485,6 +567,8 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct bvec_iter iter, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { + int ret; + if (nr_bvec == 0 || iter.bi_size == 0) return -EINVAL; @@ -495,6 +579,16 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, if (nr_bvec == 1) return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter, remote_addr, rkey, dir); + + /* + * Try IOVA-based mapping first for multi-bvec transfers. + * This reduces IOTLB sync overhead by batching all mappings. 
+ */ + ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr, + rkey, dir); + if (ret != -EOPNOTSUPP) + return ret; + return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter, remote_addr, rkey, dir); } @@ -671,6 +765,10 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, first_wr = &ctx->reg[0].reg_wr.wr; last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr; break; + case RDMA_RW_IOVA: + first_wr = &ctx->iova.wr.wr; + last_wr = &ctx->iova.wr.wr; + break; case RDMA_RW_MULTI_WR: first_wr = &ctx->map.wrs[0].wr; last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr; @@ -745,6 +843,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, break; case RDMA_RW_SINGLE_WR: break; + case RDMA_RW_IOVA: + /* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */ + WARN_ON_ONCE(1); + return; default: BUG(); break; @@ -778,6 +880,10 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 i; switch (ctx->type) { + case RDMA_RW_IOVA: + dma_iova_destroy(dev->dma_device, &ctx->iova.state, + ctx->iova.mapped_len, dir, 0); + break; case RDMA_RW_MULTI_WR: for (i = 0; i < nr_bvec; i++) ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr, diff --git a/include/rdma/rw.h b/include/rdma/rw.h index b2fc3e2373d7..205e16ed6cd8 100644 --- a/include/rdma/rw.h +++ b/include/rdma/rw.h @@ -32,6 +32,14 @@ struct rdma_rw_ctx { struct ib_rdma_wr *wrs; } map; + /* for IOVA-based mapping of bvecs into contiguous DMA range: */ + struct { + struct dma_iova_state state; + struct ib_sge sge; + struct ib_rdma_wr wr; + size_t mapped_len; + } iova; + /* for registering multiple WRs: */ struct rdma_rw_reg_ctx { struct ib_sge sge; From bea28ac14cab25d79ea759138def79aa82e0b428 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Jan 2026 19:53:58 -0500 Subject: [PATCH 52/66] RDMA/core: add MR support for bvec-based RDMA operations The bvec-based RDMA API currently returns -EOPNOTSUPP when Memory Region registration is required. This prevents iWARP devices from using the bvec path, since iWARP requires MR registration for RDMA READ operations. The force_mr debug parameter is also unusable with bvec input. Add rdma_rw_init_mr_wrs_bvec() to handle MR registration for bvec arrays. The approach creates a synthetic scatterlist populated with DMA addresses from the bvecs, then reuses the existing ib_map_mr_sg() infrastructure. This avoids driver changes while keeping the implementation small. The synthetic scatterlist is stored in the rdma_rw_ctx for cleanup. On destroy, the MRs are returned to the pool and the bvec DMA mappings are released using the stored addresses. 
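The heart of the approach - one scatterlist entry per bio_vec, since both carry (page, length, offset) - can be pictured in isolation as below. This is a simplified sketch with an invented helper name; the real code walks a bvec_iter and feeds the table to ib_dma_map_sgtable_attrs() and ib_map_mr_sg():

    #include <linux/scatterlist.h>
    #include <linux/bvec.h>

    static void bvecs_to_sgl(struct scatterlist *sgl,
                             const struct bio_vec *bvecs, u32 nr_bvec)
    {
            u32 i;

            /* sg_init_table() also marks the final entry as the end */
            sg_init_table(sgl, nr_bvec);
            for (i = 0; i < nr_bvec; i++)
                    sg_set_page(&sgl[i], bvecs[i].bv_page,
                                bvecs[i].bv_len, bvecs[i].bv_offset);
    }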
Signed-off-by: Chuck Lever Link: https://patch.msgid.link/20260128005400.25147-4-cel@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/rw.c | 189 ++++++++++++++++++++++++++++------- include/rdma/rw.h | 1 + 2 files changed, 154 insertions(+), 36 deletions(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index c2fc8cba972e..2c148457b589 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -122,6 +122,36 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num, return count; } +static int rdma_rw_init_reg_wr(struct rdma_rw_reg_ctx *reg, + struct rdma_rw_reg_ctx *prev, struct ib_qp *qp, u32 port_num, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + if (prev) { + if (reg->mr->need_inval) + prev->wr.wr.next = ®->inv_wr; + else + prev->wr.wr.next = ®->reg_wr.wr; + } + + reg->reg_wr.wr.next = ®->wr.wr; + + reg->wr.wr.sg_list = ®->sge; + reg->wr.wr.num_sge = 1; + reg->wr.remote_addr = remote_addr; + reg->wr.rkey = rkey; + + if (dir == DMA_TO_DEVICE) { + reg->wr.wr.opcode = IB_WR_RDMA_WRITE; + } else if (!rdma_cap_read_inv(qp->device, port_num)) { + reg->wr.wr.opcode = IB_WR_RDMA_READ; + } else { + reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; + reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; + } + + return 1; +} + static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) @@ -147,30 +177,8 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, if (ret < 0) goto out_free; count += ret; - - if (prev) { - if (reg->mr->need_inval) - prev->wr.wr.next = ®->inv_wr; - else - prev->wr.wr.next = ®->reg_wr.wr; - } - - reg->reg_wr.wr.next = ®->wr.wr; - - reg->wr.wr.sg_list = ®->sge; - reg->wr.wr.num_sge = 1; - reg->wr.remote_addr = remote_addr; - reg->wr.rkey = rkey; - if (dir == DMA_TO_DEVICE) { - reg->wr.wr.opcode = IB_WR_RDMA_WRITE; - } else if (!rdma_cap_read_inv(qp->device, port_num)) { - reg->wr.wr.opcode = IB_WR_RDMA_READ; - } else { - reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; - reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; - } - count++; - + count += rdma_rw_init_reg_wr(reg, prev, qp, port_num, + remote_addr, rkey, dir); remote_addr += reg->sge.length; sg_cnt -= nents; for (j = 0; j < nents; j++) @@ -193,6 +201,92 @@ out: return ret; } +static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, + struct bvec_iter *iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct rdma_rw_reg_ctx *prev = NULL; + u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en); + struct scatterlist *sg; + int i, ret, count = 0; + u32 nents = 0; + + ctx->reg = kcalloc(DIV_ROUND_UP(nr_bvec, pages_per_mr), + sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) + return -ENOMEM; + + /* + * Build scatterlist from bvecs using the iterator. This follows + * the pattern from __blk_rq_map_sg. 
+ */ + ctx->reg[0].sgt.sgl = kmalloc_array(nr_bvec, + sizeof(*ctx->reg[0].sgt.sgl), + GFP_KERNEL); + if (!ctx->reg[0].sgt.sgl) { + ret = -ENOMEM; + goto out_free_reg; + } + sg_init_table(ctx->reg[0].sgt.sgl, nr_bvec); + + for (sg = ctx->reg[0].sgt.sgl; iter->bi_size; sg = sg_next(sg)) { + struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter); + + if (nents >= nr_bvec) { + ret = -EINVAL; + goto out_free_sgl; + } + sg_set_page(sg, bv.bv_page, bv.bv_len, bv.bv_offset); + bvec_iter_advance(bvecs, iter, bv.bv_len); + nents++; + } + sg_mark_end(sg_last(ctx->reg[0].sgt.sgl, nents)); + ctx->reg[0].sgt.orig_nents = nents; + + /* DMA map the scatterlist */ + ret = ib_dma_map_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); + if (ret) + goto out_free_sgl; + + ctx->nr_ops = DIV_ROUND_UP(ctx->reg[0].sgt.nents, pages_per_mr); + + sg = ctx->reg[0].sgt.sgl; + nents = ctx->reg[0].sgt.nents; + for (i = 0; i < ctx->nr_ops; i++) { + struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; + u32 sge_cnt = min(nents, pages_per_mr); + + ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sge_cnt, 0); + if (ret < 0) + goto out_free_mrs; + count += ret; + count += rdma_rw_init_reg_wr(reg, prev, qp, port_num, + remote_addr, rkey, dir); + remote_addr += reg->sge.length; + nents -= sge_cnt; + sg += sge_cnt; + prev = reg; + } + + if (prev) + prev->wr.wr.next = NULL; + + ctx->type = RDMA_RW_MR; + return count; + +out_free_mrs: + while (--i >= 0) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); +out_free_sgl: + kfree(ctx->reg[0].sgt.sgl); +out_free_reg: + kfree(ctx->reg); + return ret; +} + static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) @@ -547,19 +641,13 @@ EXPORT_SYMBOL(rdma_rw_ctx_init); * @rkey: remote key to operate on * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ * - * Accepts bio_vec arrays directly, avoiding scatterlist conversion for - * callers that already have data in bio_vec form. Prefer this over - * rdma_rw_ctx_init() when the source data is a bio_vec array. - * - * This function does not support devices requiring memory registration. - * iWARP devices and configurations with force_mr=1 should use - * rdma_rw_ctx_init() with a scatterlist instead. + * Maps the bio_vec array directly, avoiding intermediate scatterlist + * conversion. Supports MR registration for iWARP devices and force_mr mode. * * Returns the number of WQEs that will be needed on the workqueue if * successful, or a negative error code: * * * -EINVAL - @nr_bvec is zero or @iter.bi_size is zero - * * -EOPNOTSUPP - device requires MR path (iWARP or force_mr=1) * * -ENOMEM - DMA mapping or memory allocation failed */ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, @@ -567,14 +655,24 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct bvec_iter iter, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { + struct ib_device *dev = qp->pd->device; int ret; if (nr_bvec == 0 || iter.bi_size == 0) return -EINVAL; - /* MR path not supported for bvec - reject iWARP and force_mr */ - if (rdma_rw_io_needs_mr(qp->device, port_num, dir, nr_bvec)) - return -EOPNOTSUPP; + /* + * iWARP requires MR registration for all RDMA READs. The force_mr + * debug option also mandates MR usage. 
+ */ + if (dir == DMA_FROM_DEVICE && rdma_protocol_iwarp(dev, port_num)) + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, + nr_bvec, &iter, remote_addr, + rkey, dir); + if (unlikely(rdma_rw_force_mr)) + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, + nr_bvec, &iter, remote_addr, + rkey, dir); if (nr_bvec == 1) return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter, @@ -582,13 +680,23 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, /* * Try IOVA-based mapping first for multi-bvec transfers. - * This reduces IOTLB sync overhead by batching all mappings. + * IOVA coalesces bvecs into a single DMA-contiguous region, + * reducing the number of WRs needed and avoiding MR overhead. */ ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr, rkey, dir); if (ret != -EOPNOTSUPP) return ret; + /* + * IOVA mapping not available. Check if MR registration provides + * better performance than multiple SGE entries. + */ + if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec)) + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, + nr_bvec, &iter, remote_addr, + rkey, dir); + return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter, remote_addr, rkey, dir); } @@ -833,6 +941,8 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, switch (ctx->type) { case RDMA_RW_MR: + /* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */ + WARN_ON_ONCE(ctx->reg[0].sgt.sgl); for (i = 0; i < ctx->nr_ops; i++) ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); kfree(ctx->reg); @@ -880,6 +990,13 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 i; switch (ctx->type) { + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); + kfree(ctx->reg[0].sgt.sgl); + kfree(ctx->reg); + break; case RDMA_RW_IOVA: dma_iova_destroy(dev->dma_device, &ctx->iova.state, ctx->iova.mapped_len, dir, 0); diff --git a/include/rdma/rw.h b/include/rdma/rw.h index 205e16ed6cd8..3400c017bfb6 100644 --- a/include/rdma/rw.h +++ b/include/rdma/rw.h @@ -47,6 +47,7 @@ struct rdma_rw_ctx { struct ib_reg_wr reg_wr; struct ib_send_wr inv_wr; struct ib_mr *mr; + struct sg_table sgt; } *reg; }; }; From afcae7d7b8a278a6c29e064f99e5bafd4ac1fb37 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Jan 2026 19:53:59 -0500 Subject: [PATCH 53/66] RDMA/core: add rdma_rw_max_send_wr() helper for SQ sizing svc_rdma_accept() computes sc_sq_depth as the sum of rq_depth and the number of rdma_rw contexts (ctxts). This value is used to allocate the Send CQ and to initialize the sc_sq_avail credit pool. However, when the device uses memory registration for RDMA operations, rdma_rw_init_qp() inflates the QP's max_send_wr by a factor of three per context to account for REG and INV work requests. The Send CQ and credit pool remain sized for only one work request per context, causing Send Queue exhaustion under heavy NFS WRITE workloads. Introduce rdma_rw_max_send_wr() to compute the actual number of Send Queue entries required for a given number of rdma_rw contexts. Upper layer protocols call this helper before creating a Queue Pair so that their Send CQs and credit accounting match the QP's true capacity. Update svc_rdma_accept() to use rdma_rw_max_send_wr() when computing sc_sq_depth, ensuring the credit pool reflects the work requests that rdma_rw_init_qp() will reserve.
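The sizing rule the helper encodes is easy to check by hand; a self-contained user-space illustration (values are examples only):

    #include <stdio.h>

    /* one SQ entry per R/W context, or three (REG + RDMA op + INV)
     * when the device registers MRs for RDMA operations */
    static unsigned int max_send_wr(unsigned int nr_ctxs, int needs_mr)
    {
            return (needs_mr ? 3u : 1u) * nr_ctxs;
    }

    int main(void)
    {
            /* 128 contexts: 128 SQ entries without MRs, 384 with */
            printf("%u %u\n", max_send_wr(128, 0), max_send_wr(128, 1));
            return 0;
    }

The kernel helper additionally guards the multiplication with check_mul_overflow() and returns UINT_MAX when it would wrap.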
Reviewed-by: Christoph Hellwig Fixes: 00bd1439f464 ("RDMA/rw: Support threshold for registration vs scattering to local pages") Signed-off-by: Chuck Lever Link: https://patch.msgid.link/20260128005400.25147-5-cel@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/rw.c | 53 +++++++++++++++++------- include/rdma/rw.h | 2 + net/sunrpc/xprtrdma/svc_rdma_transport.c | 8 +++- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 2c148457b589..518095d82d5d 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -1071,34 +1071,57 @@ unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, } EXPORT_SYMBOL(rdma_rw_mr_factor); +/** + * rdma_rw_max_send_wr - compute max Send WRs needed for RDMA R/W contexts + * @dev: RDMA device + * @port_num: port number + * @max_rdma_ctxs: number of rdma_rw_ctx structures + * @create_flags: QP create flags (pass IB_QP_CREATE_INTEGRITY_EN if + * data integrity will be enabled on the QP) + * + * Returns the total number of Send Queue entries needed for + * @max_rdma_ctxs. The result accounts for memory registration and + * invalidation work requests when the device requires them. + * + * ULPs use this to size Send Queues and Send CQs before creating a + * Queue Pair. + */ +unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num, + unsigned int max_rdma_ctxs, u32 create_flags) +{ + unsigned int factor = 1; + unsigned int result; + + if (create_flags & IB_QP_CREATE_INTEGRITY_EN || + rdma_rw_can_use_mr(dev, port_num)) + factor += 2; /* reg + inv */ + + if (check_mul_overflow(factor, max_rdma_ctxs, &result)) + return UINT_MAX; + return result; +} +EXPORT_SYMBOL(rdma_rw_max_send_wr); + void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) { - u32 factor; + unsigned int factor = 1; WARN_ON_ONCE(attr->port_num == 0); /* - * Each context needs at least one RDMA READ or WRITE WR. - * - * For some hardware we might need more, eventually we should ask the - * HCA driver for a multiplier here. - */ - factor = 1; - - /* - * If the device needs MRs to perform RDMA READ or WRITE operations, - * we'll need two additional MRs for the registrations and the - * invalidation. + * If the device uses MRs to perform RDMA READ or WRITE operations, + * or if data integrity is enabled, account for registration and + * invalidation work requests. */ if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN || rdma_rw_can_use_mr(dev, attr->port_num)) - factor += 2; /* inv + reg */ + factor += 2; /* reg + inv */ attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; /* - * But maybe we were just too high in the sky and the device doesn't - * even support all we need, and we'll have to live with what we get.. + * The device might not support all we need, and we'll have to + * live with what we get. 
*/ attr->cap.max_send_wr = min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr); diff --git a/include/rdma/rw.h b/include/rdma/rw.h index 3400c017bfb6..6a1d08614e09 100644 --- a/include/rdma/rw.h +++ b/include/rdma/rw.h @@ -86,6 +86,8 @@ int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, unsigned int maxpages); +unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num, + unsigned int max_rdma_ctxs, u32 create_flags); void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr); int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr); void rdma_rw_cleanup_mrs(struct ib_qp *qp); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b7b318ad25c4..9b623849723e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -462,7 +462,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_bc_requests = 2; } - /* Arbitrary estimate of the needed number of rdma_rw contexts. + /* Estimate the needed number of rdma_rw contexts. The maximum + * Read and Write chunks have one segment each. Each request + * can involve one Read chunk and either a Write chunk or Reply + * chunk; thus a factor of three. */ maxpayload = min(xprt->xpt_server->sv_max_payload, RPCSVC_MAXPAYLOAD_RDMA); @@ -470,7 +473,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) rdma_rw_mr_factor(dev, newxprt->sc_port_num, maxpayload >> PAGE_SHIFT); - newxprt->sc_sq_depth = rq_depth + ctxts; + newxprt->sc_sq_depth = rq_depth + + rdma_rw_max_send_wr(dev, newxprt->sc_port_num, ctxts, 0); if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); From 5ee62b4a91137557ee4b09d1604f1dfd0b4344a8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 Jan 2026 19:54:00 -0500 Subject: [PATCH 54/66] svcrdma: use bvec-based RDMA read/write API Convert svcrdma to the bvec-based RDMA API introduced earlier in this series. The bvec-based RDMA API eliminates the intermediate scatterlist conversion step, allowing direct DMA mapping from bio_vec arrays. This simplifies the svc_rdma_rw_ctxt structure by removing the chained SG table management. The structure retains an inline array approach similar to the previous scatterlist implementation: an inline bvec array sized to max_send_sge handles most I/O operations without additional allocation. Larger requests fall back to dynamic allocation. This preserves the allocation-free fast path for typical NFS operations while supporting arbitrarily large transfers. The bvec API handles all device types internally, including iWARP devices which require memory registration. No explicit fallback path is needed. Signed-off-by: Chuck Lever Link: https://patch.msgid.link/20260128005400.25147-6-cel@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Leon Romanovsky --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 155 +++++++++++++++++------------- 1 file changed, 86 insertions(+), 69 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 310de7a80be5..4ec2f9ae06aa 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -5,6 +5,8 @@ * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. 
*/ +#include +#include #include #include @@ -20,30 +22,33 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); /* Each R/W context contains state for one chain of RDMA Read or * Write Work Requests. * - * Each WR chain handles a single contiguous server-side buffer, - * because scatterlist entries after the first have to start on - * page alignment. xdr_buf iovecs cannot guarantee alignment. + * Each WR chain handles a single contiguous server-side buffer. + * - each xdr_buf iovec is a single contiguous buffer + * - the xdr_buf pages array is a single contiguous buffer because the + * second through the last element always start on a page boundary * * Each WR chain handles only one R_key. Each RPC-over-RDMA segment * from a client may contain a unique R_key, so each WR chain moves * up to one segment at a time. * - * The scatterlist makes this data structure over 4KB in size. To - * make it less likely to fail, and to handle the allocation for - * smaller I/O requests without disabling bottom-halves, these - * contexts are created on demand, but cached and reused until the - * controlling svcxprt_rdma is destroyed. + * The inline bvec array is sized to handle most I/O requests without + * additional allocation. Larger requests fall back to dynamic allocation. + * These contexts are created on demand, but cached and reused until + * the controlling svcxprt_rdma is destroyed. */ struct svc_rdma_rw_ctxt { struct llist_node rw_node; struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; - unsigned int rw_first_sgl_nents; - struct sg_table rw_sg_table; - struct scatterlist rw_first_sgl[]; + unsigned int rw_first_bvec_nents; + struct bio_vec *rw_bvec; + struct bio_vec rw_first_bvec[]; }; +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt); + static inline struct svc_rdma_rw_ctxt * svc_rdma_next_ctxt(struct list_head *list) { @@ -52,10 +57,10 @@ svc_rdma_next_ctxt(struct list_head *list) } static struct svc_rdma_rw_ctxt * -svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) +svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec) { struct ib_device *dev = rdma->sc_cm_id->device; - unsigned int first_sgl_nents = dev->attrs.max_send_sge; + unsigned int first_bvec_nents = dev->attrs.max_send_sge; struct svc_rdma_rw_ctxt *ctxt; struct llist_node *node; @@ -65,33 +70,44 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) if (node) { ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents), + ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec, + first_bvec_nents), GFP_KERNEL, ibdev_to_node(dev)); if (!ctxt) goto out_noctx; INIT_LIST_HEAD(&ctxt->rw_list); - ctxt->rw_first_sgl_nents = first_sgl_nents; + ctxt->rw_first_bvec_nents = first_bvec_nents; } - ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; - if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, - ctxt->rw_sg_table.sgl, - first_sgl_nents)) - goto out_free; + if (nr_bvec <= ctxt->rw_first_bvec_nents) { + ctxt->rw_bvec = ctxt->rw_first_bvec; + } else { + ctxt->rw_bvec = kmalloc_array_node(nr_bvec, + sizeof(*ctxt->rw_bvec), + GFP_KERNEL, + ibdev_to_node(dev)); + if (!ctxt->rw_bvec) + goto out_free; + } return ctxt; out_free: - kfree(ctxt); + /* Return cached contexts to cache; free freshly allocated ones */ + if (node) + svc_rdma_put_rw_ctxt(rdma, ctxt); + else + kfree(ctxt); out_noctx: - trace_svcrdma_rwctx_empty(rdma, sges); + 
trace_svcrdma_rwctx_empty(rdma, nr_bvec); return NULL; } static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, struct llist_head *list) { - sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents); + if (ctxt->rw_bvec != ctxt->rw_first_bvec) + kfree(ctxt->rw_bvec); llist_add(&ctxt->rw_node, list); } @@ -123,6 +139,7 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) * @ctxt: R/W context to prepare * @offset: RDMA offset * @handle: RDMA tag/handle + * @length: total number of bytes in the bvec array * @direction: I/O direction * * Returns on success, the number of WQEs that will be needed @@ -130,14 +147,18 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) */ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, struct svc_rdma_rw_ctxt *ctxt, - u64 offset, u32 handle, + u64 offset, u32 handle, unsigned int length, enum dma_data_direction direction) { + struct bvec_iter iter = { + .bi_size = length, + }; int ret; - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, - ctxt->rw_sg_table.sgl, ctxt->rw_nents, - 0, offset, handle, direction); + ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, + ctxt->rw_bvec, ctxt->rw_nents, + iter, offset, handle, direction); if (unlikely(ret < 0)) { trace_svcrdma_dma_map_rw_err(rdma, offset, handle, ctxt->rw_nents, ret); @@ -175,7 +196,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, { struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; - LLIST_HEAD(free); trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount); @@ -183,10 +203,11 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); - rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, ctxt->rw_sg_table.sgl, - ctxt->rw_nents, dir); - __svc_rdma_put_rw_ctxt(ctxt, &free); + rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, + ctxt->rw_bvec, ctxt->rw_nents, dir); + if (ctxt->rw_bvec != ctxt->rw_first_bvec) + kfree(ctxt->rw_bvec); ctxt->rw_node.next = first; first = &ctxt->rw_node; @@ -414,29 +435,26 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma, return -ENOTCONN; } -/* Build and DMA-map an SGL that covers one kvec in an xdr_buf +/* Build a bvec that covers one kvec in an xdr_buf. */ -static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, - unsigned int len, - struct svc_rdma_rw_ctxt *ctxt) +static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt) { - struct scatterlist *sg = ctxt->rw_sg_table.sgl; - - sg_set_buf(&sg[0], info->wi_base, len); + bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len); info->wi_base += len; ctxt->rw_nents = 1; } -/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. +/* Build a bvec array that covers part of an xdr_buf's pagelist. 
*/ -static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, - unsigned int remaining, - struct svc_rdma_rw_ctxt *ctxt) +static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info, + unsigned int remaining, + struct svc_rdma_rw_ctxt *ctxt) { - unsigned int sge_no, sge_bytes, page_off, page_no; + unsigned int bvec_idx, bvec_len, page_off, page_no; const struct xdr_buf *xdr = info->wi_xdr; - struct scatterlist *sg; struct page **page; page_off = info->wi_next_off + xdr->page_base; @@ -444,21 +462,19 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, page_off = offset_in_page(page_off); page = xdr->pages + page_no; info->wi_next_off += remaining; - sg = ctxt->rw_sg_table.sgl; - sge_no = 0; + bvec_idx = 0; do { - sge_bytes = min_t(unsigned int, remaining, - PAGE_SIZE - page_off); - sg_set_page(sg, *page, sge_bytes, page_off); - - remaining -= sge_bytes; - sg = sg_next(sg); + bvec_len = min_t(unsigned int, remaining, + PAGE_SIZE - page_off); + bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, bvec_len, + page_off); + remaining -= bvec_len; page_off = 0; - sge_no++; + bvec_idx++; page++; } while (remaining); - ctxt->rw_nents = sge_no; + ctxt->rw_nents = bvec_idx; } /* Construct RDMA Write WRs to send a portion of an xdr_buf containing @@ -496,7 +512,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, constructor(info, write_len, ctxt); offset = seg->rs_offset + info->wi_seg_off; ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, - DMA_TO_DEVICE); + write_len, DMA_TO_DEVICE); if (ret < 0) return -EIO; percpu_counter_inc(&svcrdma_stat_write); @@ -535,7 +551,7 @@ static int svc_rdma_iov_write(struct svc_rdma_write_info *info, const struct kvec *iov) { info->wi_base = iov->iov_base; - return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, + return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec, iov->iov_len); } @@ -559,7 +575,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info, { info->wi_xdr = xdr; info->wi_next_off = offset - xdr->head[0].iov_len; - return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec, length); } @@ -734,29 +750,29 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, { struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; - unsigned int sge_no, seg_len, len; + unsigned int bvec_idx, nr_bvec, seg_len, len, total; struct svc_rdma_rw_ctxt *ctxt; - struct scatterlist *sg; int ret; len = segment->rs_length; - sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT; - ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no); + if (check_add_overflow(head->rc_pageoff, len, &total)) + return -EINVAL; + nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT; + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec); if (!ctxt) return -ENOMEM; - ctxt->rw_nents = sge_no; + ctxt->rw_nents = nr_bvec; - sg = ctxt->rw_sg_table.sgl; - for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { + for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) { seg_len = min_t(unsigned int, len, PAGE_SIZE - head->rc_pageoff); if (!head->rc_pageoff) head->rc_page_count++; - sg_set_page(sg, rqstp->rq_pages[head->rc_curpage], - seg_len, head->rc_pageoff); - sg = sg_next(sg); + bvec_set_page(&ctxt->rw_bvec[bvec_idx], + rqstp->rq_pages[head->rc_curpage], + seg_len, head->rc_pageoff); head->rc_pageoff += seg_len; if (head->rc_pageoff == PAGE_SIZE) { @@ -770,7 +786,8 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, } ret = 
svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset,
-				     segment->rs_handle, DMA_FROM_DEVICE);
+				     segment->rs_handle, segment->rs_length,
+				     DMA_FROM_DEVICE);
 	if (ret < 0)
 		return -EIO;
 	percpu_counter_inc(&svcrdma_stat_read);

From 58b604dfc7bb753f91bc0ccd3fa705e14e6edfb4 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Thu, 29 Jan 2026 17:49:00 +0800
Subject: [PATCH 55/66] RDMA/uverbs: Add __GFP_NOWARN to
 ib_uverbs_unmarshall_recv() kmalloc

wqe_size in ib_uverbs_unmarshall_recv() is user-provided and, although
it is validated, it can still be large. Add __GFP_NOWARN to suppress
memory allocation warnings for large sizes, consistent with the similar
fix in ib_uverbs_post_send().

Fixes: 67cdb40ca444 ("[IB] uverbs: Implement more commands")
Signed-off-by: Yi Liu
Link: https://patch.msgid.link/20260129094900.3517706-1-liuy22@mails.tsinghua.edu.cn
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/core/uverbs_cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 3259e9848cc7..f4616deeca54 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2242,7 +2242,7 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
 	if (ret)
 		return ERR_PTR(ret);
 
-	user_wr = kmalloc(wqe_size, GFP_KERNEL);
+	user_wr = kmalloc(wqe_size, GFP_KERNEL | __GFP_NOWARN);
 	if (!user_wr)
 		return ERR_PTR(-ENOMEM);
 
From e5b0cfa32b1c3e7f153373bfdc20ccdd3c342de2 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Wed, 28 Jan 2026 11:55:25 +0200
Subject: [PATCH 56/66] MAINTAINERS: Drop RDMA files from Hyper-V section

MAINTAINERS entries are organized by subsystem ownership, and the RDMA
files belong under drivers/infiniband. Remove the overly broad mana_ib
entries from the Hyper-V section, and instead add the Hyper-V mailing
list to CC on mana_ib patches.

This makes get_maintainer.pl behave more sensibly when running it on
mana_ib patches.

Fixes: 428ca2d4c6aa ("MAINTAINERS: Add Long Li as a Hyper-V maintainer")
Link: https://patch.msgid.link/20260128-get-maintainers-fix-v1-1-fc5e58ce9f02@nvidia.com
Reviewed-by: Long Li
Signed-off-by: Leon Romanovsky
---
 MAINTAINERS | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 12f49de7fe03..d2e3353a1d29 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11739,7 +11739,6 @@ F:	arch/x86/kernel/cpu/mshyperv.c
 F:	drivers/clocksource/hyperv_timer.c
 F:	drivers/hid/hid-hyperv.c
 F:	drivers/hv/
-F:	drivers/infiniband/hw/mana/
 F:	drivers/input/serio/hyperv-keyboard.c
 F:	drivers/iommu/hyperv-iommu.c
 F:	drivers/net/ethernet/microsoft/
@@ -11758,7 +11757,6 @@ F:	include/hyperv/hvhdk_mini.h
 F:	include/linux/hyperv.h
 F:	include/net/mana
 F:	include/uapi/linux/hyperv.h
-F:	include/uapi/rdma/mana-abi.h
 F:	net/vmw_vsock/hyperv_transport.c
 F:	tools/hv/
 
@@ -17318,6 +17316,7 @@ MICROSOFT MANA RDMA DRIVER
 M:	Long Li
 M:	Konstantin Taranov
 L:	linux-rdma@vger.kernel.org
+L:	linux-hyperv@vger.kernel.org
 S:	Supported
 F:	drivers/infiniband/hw/mana/
 F:	include/net/mana

From e72d45d274d8edd8c1c365444403a02147f484a6 Mon Sep 17 00:00:00 2001
From: Kalesh AP
Date: Mon, 2 Feb 2026 19:04:09 +0530
Subject: [PATCH 57/66] RDMA/bnxt_re: Add support for QP rate limiting

Broadcom P7 chips support applying a rate limit to RC QPs. This allows
adjusting shaper rate values during the INIT -> RTR, RTR -> RTS and
RTS -> RTS state changes, or after the QP transitions to RTR or RTS.
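As an illustration (not part of this patch), a kernel consumer would
request the shaper through the existing modify-QP attribute mask; a
minimal sketch, with a made-up rate value and the other mandatory
attributes for the transition omitted:

	struct ib_qp_attr attr = {
		.qp_state   = IB_QPS_RTS,
		.rate_limit = 100000,	/* kbps; illustrative value */
	};
	int ret;

	/* the shaper request piggybacks on a normal state transition */
	ret = ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_RATE_LIMIT);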
Signed-off-by: Damodharam Ammepalli Reviewed-by: Hongguang Gao Signed-off-by: Kalesh AP Link: https://patch.msgid.link/20260202133413.3182578-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 12 +++++++++++- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 12 +++++++++++- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 3 +++ drivers/infiniband/hw/bnxt_re/qplib_res.h | 6 ++++++ drivers/infiniband/hw/bnxt_re/qplib_sp.c | 5 +++++ drivers/infiniband/hw/bnxt_re/qplib_sp.h | 2 ++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 13 +++++++++---- 7 files changed, 47 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index f19b55c13d58..39dd18af86eb 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2089,10 +2089,11 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, unsigned int flags; u8 nw_type; - if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS) + if (qp_attr_mask & ~(IB_QP_ATTR_STANDARD_BITS | IB_QP_RATE_LIMIT)) return -EOPNOTSUPP; qp->qplib_qp.modify_flags = 0; + qp->qplib_qp.ext_modify_flags = 0; if (qp_attr_mask & IB_QP_STATE) { curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state); new_qp_state = qp_attr->qp_state; @@ -2129,6 +2130,15 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, bnxt_re_unlock_cqs(qp, flags); } } + + if (qp_attr_mask & IB_QP_RATE_LIMIT) { + if (qp->qplib_qp.type != IB_QPT_RC || + !_is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2)) + return -EOPNOTSUPP; + qp->qplib_qp.ext_modify_flags |= + CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID; + qp->qplib_qp.rate_limit = qp_attr->rate_limit; + } if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_EN_SQD_ASYNC_NOTIFY; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index c88f049136fc..3e44311bf939 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1313,8 +1313,8 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_modify_qp req = {}; u16 vlan_pcp_vlan_dei_vlan_id; + u32 bmask, bmask_ext; u32 temp32[4]; - u32 bmask; int rc; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, @@ -1329,9 +1329,16 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) is_optimized_state_transition(qp)) bnxt_set_mandatory_attributes(res, qp, &req); } + bmask = qp->modify_flags; req.modify_mask = cpu_to_le32(qp->modify_flags); + bmask_ext = qp->ext_modify_flags; + req.ext_modify_mask = cpu_to_le32(qp->ext_modify_flags); req.qp_cid = cpu_to_le32(qp->id); + + if (bmask_ext & CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID) + req.rate_limit = cpu_to_le32(qp->rate_limit); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_STATE) { req.network_type_en_sqd_async_notify_new_state = (qp->state & CMDQ_MODIFY_QP_NEW_STATE_MASK) | @@ -1429,6 +1436,9 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); if (rc) return rc; + + if (bmask_ext & CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID) + qp->shaper_allocation_status = resp.shaper_allocation_status; qp->cur_qp_state = qp->state; return 0; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 
1b414a73b46d..30c3f99be07b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -280,6 +280,7 @@ struct bnxt_qplib_qp { u8 state; u8 cur_qp_state; u64 modify_flags; + u32 ext_modify_flags; u32 max_inline_data; u32 mtu; u8 path_mtu; @@ -346,6 +347,8 @@ struct bnxt_qplib_qp { bool is_host_msn_tbl; u8 tos_dscp; u32 ugid_index; + u32 rate_limit; + u8 shaper_allocation_status; }; #define BNXT_RE_MAX_MSG_SIZE 0x80000000 diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 2ea3b7f232a3..9a5dcf97b6f4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -623,4 +623,10 @@ static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2) return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED); } +static inline bool _is_modify_qp_rate_limit_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & + CREQ_QUERY_FUNC_RESP_SB_MODIFY_QP_RATE_LIMIT_SUPPORTED; +} + #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 408a34df2667..ec9eb52a8ebf 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -193,6 +193,11 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) attr->max_dpi = le32_to_cpu(sb->max_dpi); attr->is_atomic = bnxt_qplib_is_atomic_cap(rcfw); + + if (_is_modify_qp_rate_limit_supported(attr->dev_cap_flags2)) { + attr->rate_limit_min = le16_to_cpu(sb->rate_limit_min); + attr->rate_limit_max = le32_to_cpu(sb->rate_limit_max); + } bail: dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 5a45c55c6464..9fadd637cb5b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -76,6 +76,8 @@ struct bnxt_qplib_dev_attr { u16 dev_cap_flags; u16 dev_cap_flags2; u32 max_dpi; + u16 rate_limit_min; + u32 rate_limit_max; }; struct bnxt_qplib_pd { diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 99ecd72e72e2..aac338f2afd8 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -690,10 +690,11 @@ struct cmdq_modify_qp { __le32 ext_modify_mask; #define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_EXT_STATS_CTX 0x1UL #define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_SCHQ_ID_VALID 0x2UL + #define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID 0x8UL __le32 ext_stats_ctx_id; __le16 schq_id; __le16 unused_0; - __le32 reserved32; + __le32 rate_limit; }; /* creq_modify_qp_resp (size:128b/16B) */ @@ -716,7 +717,8 @@ struct creq_modify_qp_resp { #define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_INDEX_MASK 0xeUL #define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_INDEX_SFT 1 #define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_STATE 0x10UL - u8 reserved8; + u8 shaper_allocation_status; + #define CREQ_MODIFY_QP_RESP_SHAPER_ALLOCATED 0x1UL __le32 lag_src_mac; }; @@ -2179,7 +2181,7 @@ struct creq_query_func_resp { u8 reserved48[6]; }; -/* creq_query_func_resp_sb (size:1088b/136B) */ +/* creq_query_func_resp_sb (size:1280b/160B) */ struct creq_query_func_resp_sb { u8 opcode; #define CREQ_QUERY_FUNC_RESP_SB_OPCODE_QUERY_FUNC 0x83UL @@ -2256,12 +2258,15 @@ struct creq_query_func_resp_sb { #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ 
CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE
 	#define CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED 0x40UL
+	#define CREQ_QUERY_FUNC_RESP_SB_MODIFY_QP_RATE_LIMIT_SUPPORTED 0x400UL
 	#define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL
 	__le16 max_xp_qp_size;
 	__le16 create_qp_batch_size;
 	__le16 destroy_qp_batch_size;
 	__le16 max_srq_ext;
-	__le64 reserved64;
+	__le16 reserved16;
+	__le16 rate_limit_min;
+	__le32 rate_limit_max;
 };
 
 /* cmdq_set_func_resources (size:448b/56B) */

From 13edc7d4e0aa4abb5d50a062b61b9bffb01b0327 Mon Sep 17 00:00:00 2001
From: Kalesh AP
Date: Mon, 2 Feb 2026 19:04:10 +0530
Subject: [PATCH 58/66] RDMA/bnxt_re: Report packet pacing capabilities when
 querying device

Enable support for reporting packet pacing capabilities from the kernel
to user space. Packet pacing allows limiting the rate to any value
between the minimum and maximum. The capabilities are exposed to user
space through query_device. The following capabilities are reported:

1. The maximum and minimum rate limit in kbps.
2. Bitmap showing which QP types support rate limit.

Signed-off-by: Damodharam Ammepalli
Signed-off-by: Kalesh AP
Link: https://patch.msgid.link/20260202133413.3182578-3-kalesh-anakkur.purayil@broadcom.com
Reviewed-by: Anantha Prabhu
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 22 +++++++++++++++++++++-
 include/uapi/rdma/bnxt_re-abi.h          | 16 ++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 39dd18af86eb..c146f43ae875 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -186,6 +186,9 @@ int bnxt_re_query_device(struct ib_device *ibdev,
 {
 	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
 	struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr;
+	struct bnxt_re_query_device_ex_resp resp = {};
+	size_t outlen = (udata) ?
udata->outlen : 0; + int rc = 0; memset(ib_attr, 0, sizeof(*ib_attr)); memcpy(&ib_attr->fw_ver, dev_attr->fw_ver, @@ -250,7 +253,21 @@ int bnxt_re_query_device(struct ib_device *ibdev, ib_attr->max_pkeys = 1; ib_attr->local_ca_ack_delay = BNXT_RE_DEFAULT_ACK_DELAY; - return 0; + + if ((offsetofend(typeof(resp), packet_pacing_caps) <= outlen) && + _is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2)) { + resp.packet_pacing_caps.qp_rate_limit_min = + dev_attr->rate_limit_min; + resp.packet_pacing_caps.qp_rate_limit_max = + dev_attr->rate_limit_max; + resp.packet_pacing_caps.supported_qpts = + 1 << IB_QPT_RC; + } + if (outlen) + rc = ib_copy_to_udata(udata, &resp, + min(sizeof(resp), outlen)); + + return rc; } int bnxt_re_modify_device(struct ib_device *ibdev, @@ -4401,6 +4418,9 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED; + if (_is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2)) + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_QP_RATE_LIMIT_ENABLED; + if (udata->inlen >= sizeof(ureq)) { rc = ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq))); if (rc) diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index faa9d62b3b30..f24edf1c75eb 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -56,6 +56,7 @@ enum { BNXT_RE_UCNTX_CMASK_DBR_PACING_ENABLED = 0x08ULL, BNXT_RE_UCNTX_CMASK_POW2_DISABLED = 0x10ULL, BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED = 0x40, + BNXT_RE_UCNTX_CMASK_QP_RATE_LIMIT_ENABLED = 0x80ULL, }; enum bnxt_re_wqe_mode { @@ -215,4 +216,19 @@ enum bnxt_re_toggle_mem_methods { BNXT_RE_METHOD_GET_TOGGLE_MEM = (1U << UVERBS_ID_NS_SHIFT), BNXT_RE_METHOD_RELEASE_TOGGLE_MEM, }; + +struct bnxt_re_packet_pacing_caps { + __u32 qp_rate_limit_min; + __u32 qp_rate_limit_max; /* In kbps */ + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_RC + */ + __u32 supported_qpts; + __u32 reserved; +}; + +struct bnxt_re_query_device_ex_resp { + struct bnxt_re_packet_pacing_caps packet_pacing_caps; +}; #endif /* __BNXT_RE_UVERBS_ABI_H__*/ From 949e7c062d3769d9522e8a4abe080fb92ddd61bf Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 2 Feb 2026 19:04:11 +0530 Subject: [PATCH 59/66] RDMA/bnxt_re: Report QP rate limit in debugfs Update QP info debugfs hook to report the rate limit applied on the QP. 0 means unlimited. 
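With this change applied, reading the per-QP info file would look
roughly as follows (illustrative values, following the format strings
added above):

	QPN		: 42
	transport	: RC
	state		: RTS
	mtu		: 4096
	timeout		: 14
	remote QPN	: 43
	shaper allocated : 1
	rate limit	: 100000 kbps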
Signed-off-by: Damodharam Ammepalli
Signed-off-by: Kalesh AP
Link: https://patch.msgid.link/20260202133413.3182578-4-kalesh-anakkur.purayil@broadcom.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/bnxt_re/debugfs.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c
index 88817c86ae24..e025217861c2 100644
--- a/drivers/infiniband/hw/bnxt_re/debugfs.c
+++ b/drivers/infiniband/hw/bnxt_re/debugfs.c
@@ -87,25 +87,35 @@ static ssize_t qp_info_read(struct file *filep,
 				size_t count, loff_t *ppos)
 {
 	struct bnxt_re_qp *qp = filep->private_data;
+	struct bnxt_qplib_qp *qplib_qp;
+	u32 rate_limit = 0;
 	char *buf;
 	int len;
 
 	if (*ppos)
 		return 0;
 
+	qplib_qp = &qp->qplib_qp;
+	if (qplib_qp->shaper_allocation_status)
+		rate_limit = qplib_qp->rate_limit;
+
 	buf = kasprintf(GFP_KERNEL,
 			"QPN\t\t: %d\n"
 			"transport\t: %s\n"
 			"state\t\t: %s\n"
 			"mtu\t\t: %d\n"
 			"timeout\t\t: %d\n"
-			"remote QPN\t: %d\n",
+			"remote QPN\t: %d\n"
+			"shaper allocated : %d\n"
+			"rate limit\t: %d kbps\n",
 			qp->qplib_qp.id,
 			bnxt_re_qp_type_str(qp->qplib_qp.type),
 			bnxt_re_qp_state_str(qp->qplib_qp.state),
 			qp->qplib_qp.mtu,
 			qp->qplib_qp.timeout,
-			qp->qplib_qp.dest_qpn);
+			qp->qplib_qp.dest_qpn,
+			qplib_qp->shaper_allocation_status,
+			rate_limit);
 	if (!buf)
 		return -ENOMEM;
 	if (count < strlen(buf)) {

From cae42d97d94e9c9803554ca6d577297a2270d471 Mon Sep 17 00:00:00 2001
From: Kalesh AP
Date: Mon, 2 Feb 2026 19:04:12 +0530
Subject: [PATCH 60/66] RDMA/mlx5: Support rate limit only for Raw Packet QP

mlx5 based hardware supports rate limiting only on Raw Ethernet QPs,
while rate limit support in the core stack has now been extended to RC
QPs as well. Add an explicit check to fail the operation for any other
QP type.

Compile tested only.

CC: Leon Romanovsky
Signed-off-by: Kalesh AP
Signed-off-by: Selvin Xavier
Link: https://patch.msgid.link/20260202133413.3182578-5-kalesh-anakkur.purayil@broadcom.com
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/mlx5/qp.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 69af20790481..0324909e3151 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -4362,6 +4362,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	optpar |= ib_mask_to_mlx5_opt(attr_mask);
 	optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
 
+	if (attr_mask & IB_QP_RATE_LIMIT && qp->type != IB_QPT_RAW_PACKET) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
 	if (qp->type == IB_QPT_RAW_PACKET ||
 	    qp->flags & IB_QP_CREATE_SOURCE_QPN) {
 		struct mlx5_modify_raw_qp_param raw_qp_param = {};

From 42e3aac65c1c9eb36cdee0d8312a326196e0822f Mon Sep 17 00:00:00 2001
From: Kalesh AP
Date: Mon, 2 Feb 2026 19:04:13 +0530
Subject: [PATCH 61/66] IB/core: Extend rate limit support for RC QPs

Broadcom devices support setting the rate limit while changing the RC
QP state from INIT to RTR, RTR to RTS or RTS to RTS.
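For reference, user space reaches the same attribute through
libibverbs; a minimal sketch, assuming the provider advertises packet
pacing for RC QPs (the rate value is illustrative):

	struct ibv_qp_rate_limit_attr rl_attr = {
		.rate_limit = 100000,	/* kbps; illustrative value */
	};
	int ret;

	/* returns an errno value if the provider rejects the QP type or rate */
	ret = ibv_modify_qp_rate_limit(qp, &rl_attr);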
Signed-off-by: Kalesh AP Link: https://patch.msgid.link/20260202133413.3182578-6-kalesh-anakkur.purayil@broadcom.com Reviewed-by: Damodharam Ammepalli Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/verbs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 8b56b6b62352..02ebc3e52196 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1537,7 +1537,8 @@ static const struct { IB_QP_PKEY_INDEX), [IB_QPT_RC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), + IB_QP_PKEY_INDEX | + IB_QP_RATE_LIMIT), [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), @@ -1585,7 +1586,8 @@ static const struct { IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), + IB_QP_PATH_MIG_STATE | + IB_QP_RATE_LIMIT), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | @@ -1619,7 +1621,8 @@ static const struct { IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | - IB_QP_MIN_RNR_TIMER), + IB_QP_MIN_RNR_TIMER | + IB_QP_RATE_LIMIT), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | From 5551b02fdbfd85a325bb857f3a8f9c9f33397ed2 Mon Sep 17 00:00:00 2001 From: YunJe Shin Date: Tue, 3 Feb 2026 19:06:21 +0900 Subject: [PATCH 62/66] RDMA/umad: Reject negative data_len in ib_umad_write ib_umad_write computes data_len from user-controlled count and the MAD header sizes. With a mismatched user MAD header size and RMPP header length, data_len can become negative and reach ib_create_send_mad(). This can make the padding calculation exceed the segment size and trigger an out-of-bounds memset in alloc_send_rmpp_list(). Add an explicit check to reject negative data_len before creating the send buffer. 
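A minimal sketch of the failure mode, with made-up sizes (the real
values depend on the agent and the RMPP header):

	size_t data_len;
	size_t count = 256;	/* user write length; illustrative */
	size_t hdrs = 224 + 40;	/* hdr_size(file) + hdr_len; illustrative */

	/* before: (int)(256 - 224 - 40) == -8 reached ib_create_send_mad() */
	if (check_sub_overflow(count, hdrs, &data_len))
		return -EINVAL;	/* after: the wrap is detected and rejected */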
KASAN splat: [ 211.363464] BUG: KASAN: slab-out-of-bounds in ib_create_send_mad+0xa01/0x11b0 [ 211.364077] Write of size 220 at addr ffff88800c3fa1f8 by task spray_thread/102 [ 211.365867] ib_create_send_mad+0xa01/0x11b0 [ 211.365887] ib_umad_write+0x853/0x1c80 Fixes: 2be8e3ee8efd ("IB/umad: Add P_Key index support") Signed-off-by: YunJe Shin Link: https://patch.msgid.link/20260203100628.1215408-1-ioerts@kookmin.ac.kr Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/user_mad.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index fd67fc9fe85a..2f7e3c4483fc 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -514,7 +514,8 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, struct rdma_ah_attr ah_attr; struct ib_ah *ah; __be64 *tid; - int ret, data_len, hdr_len, copy_offset, rmpp_active; + int ret, hdr_len, copy_offset, rmpp_active; + size_t data_len; u8 base_version; if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) @@ -588,7 +589,10 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, } base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version; - data_len = count - hdr_size(file) - hdr_len; + if (check_sub_overflow(count, hdr_size(file) + hdr_len, &data_len)) { + ret = -EINVAL; + goto err_ah; + } packet->msg = ib_create_send_mad(agent, be32_to_cpu(packet->mad.hdr.qpn), packet->mad.hdr.pkey_index, rmpp_active, From 14ab3da122bd18920ad57428f6cf4fade8385142 Mon Sep 17 00:00:00 2001 From: YunJe Shin Date: Wed, 4 Feb 2026 18:24:57 +0900 Subject: [PATCH 63/66] RDMA/siw: Fix potential NULL pointer dereference in header processing If siw_get_hdr() returns -EINVAL before set_rx_fpdu_context(), qp->rx_fpdu can be NULL. The error path in siw_tcp_rx_data() dereferences qp->rx_fpdu->more_ddp_segs without checking, which may lead to a NULL pointer deref. Only check more_ddp_segs when rx_fpdu is present. KASAN splat: [ 101.384271] KASAN: null-ptr-deref in range [0x00000000000000c0-0x00000000000000c7] [ 101.385869] RIP: 0010:siw_tcp_rx_data+0x13ad/0x1e50 Fixes: 8b6a361b8c48 ("rdma/siw: receive path") Signed-off-by: YunJe Shin Link: https://patch.msgid.link/20260204092546.489842-1-ioerts@kookmin.ac.kr Acked-by: Bernard Metzler Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/siw_qp_rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index a10820e33887..e8a88b378d51 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -1435,7 +1435,8 @@ int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb, } if (unlikely(rv != 0 && rv != -EAGAIN)) { if ((srx->state > SIW_GET_HDR || - qp->rx_fpdu->more_ddp_segs) && run_completion) + (qp->rx_fpdu && qp->rx_fpdu->more_ddp_segs)) && + run_completion) siw_rdmap_complete(qp, rv); siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv, From 9ad95a0f2b75a788325249f341694e0343facf7b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 1 Feb 2026 16:34:04 +0200 Subject: [PATCH 64/66] RDMA/uverbs: Support external FD uobjects Add support for uobjects that wrap externally allocated file descriptors (FDs). In this mode, the FD number still follows the standard uverbs allocation flow, but the file pointer is allocated externally and has its own fops and private data. 
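A sketch of the two allocation modes, with hypothetical object and
callback names (the fops/name arguments are what distinguish them):

	/* classic FD uobject: uverbs allocates the anon inode file itself */
	UVERBS_TYPE_ALLOC_FD(sizeof(struct my_ev_file),
			     my_ev_destroy, &my_ev_fops, "[my-ev]", O_RDWR);

	/* external FD uobject: only the FD number comes from uverbs; the
	 * file, its fops and its private data are created by another
	 * subsystem */
	UVERBS_TYPE_ALLOC_FD(sizeof(struct my_ext_file),
			     my_ext_destroy, NULL, NULL, O_RDONLY);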
As a result, alloc_begin_fd_uobject() must handle cases where fd_type->fops is NULL, and both alloc_commit_fd_uobject() and alloc_abort_fd_uobject() must account for whether filp->private_data exists, since it is populated outside the standard uverbs flow. Signed-off-by: Yishai Hadas Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260201-dmabuf-export-v3-1-da238b614fe3@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/rdma_core.c | 35 +++++++++++++++++------------ 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 18918f463361..b6eda2fb0911 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -465,7 +465,7 @@ alloc_begin_fd_uobject(const struct uverbs_api_object *obj, fd_type = container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); - if (WARN_ON(fd_type->fops->release != &uverbs_uobject_fd_release && + if (WARN_ON(fd_type->fops && fd_type->fops->release != &uverbs_uobject_fd_release && fd_type->fops->release != &uverbs_async_event_release)) { ret = ERR_PTR(-EINVAL); goto err_fd; @@ -477,14 +477,16 @@ alloc_begin_fd_uobject(const struct uverbs_api_object *obj, goto err_fd; } - /* Note that uverbs_uobject_fd_release() is called during abort */ - filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL, - fd_type->flags); - if (IS_ERR(filp)) { - ret = ERR_CAST(filp); - goto err_getfile; + if (fd_type->fops) { + /* Note that uverbs_uobject_fd_release() is called during abort */ + filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL, + fd_type->flags); + if (IS_ERR(filp)) { + ret = ERR_CAST(filp); + goto err_getfile; + } + uobj->object = filp; } - uobj->object = filp; uobj->id = new_fd; return uobj; @@ -561,7 +563,9 @@ static void alloc_abort_fd_uobject(struct ib_uobject *uobj) { struct file *filp = uobj->object; - fput(filp); + if (filp) + fput(filp); + put_unused_fd(uobj->id); } @@ -628,11 +632,14 @@ static void alloc_commit_fd_uobject(struct ib_uobject *uobj) /* This shouldn't be used anymore. Use the file object instead */ uobj->id = 0; - /* - * NOTE: Once we install the file we loose ownership of our kref on - * uobj. It will be put by uverbs_uobject_fd_release() - */ - filp->private_data = uobj; + if (!filp->private_data) { + /* + * NOTE: Once we install the file we loose ownership of our kref on + * uobj. It will be put by uverbs_uobject_fd_release() + */ + filp->private_data = uobj; + } + fd_install(fd, filp); } From 0ac6f4056c4a257f4b230b910e3e6fee6c6fc9b9 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 1 Feb 2026 16:34:05 +0200 Subject: [PATCH 65/66] RDMA/uverbs: Add DMABUF object type and operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose DMABUF functionality to userspace through the uverbs interface, enabling InfiniBand/RDMA devices to export PCI based memory regions (e.g. device memory) as DMABUF file descriptors. This allows zero-copy sharing of RDMA memory with other subsystems that support the dma-buf framework. A new UVERBS_OBJECT_DMABUF object type and allocation method were introduced. During allocation, uverbs invokes the driver to supply the rdma_user_mmap_entry associated with the given page offset (pgoff). Based on the returned rdma_user_mmap_entry, uverbs requests the driver to provide the corresponding physical-memory details as well as the driver’s PCI provider information. 
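In condensed form, the allocation handler performs the following (op
names are from this series; error handling omitted):

	/* resolve the user-supplied page offset to a driver mmap entry */
	entry = ib_dev->ops.pgoff_to_mmap_entry(ucontext, pg_off);

	/* fetch the physical range and its p2pdma provider from the driver */
	ret = ib_dev->ops.mmap_get_pfns(entry, &phys_vec, &provider);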
Using this information, dma_buf_export() is called; if it succeeds, uobj->object is set to the underlying file pointer returned by the dma-buf framework. The file descriptor number follows the standard uverbs allocation flow, but the file pointer comes from the dma-buf subsystem, including its own fops and private data. When an mmap entry is removed, uverbs iterates over its associated DMABUFs, marks them as revoked, and calls dma_buf_move_notify() so that their importers are notified. The same procedure applies during the disassociate flow; final cleanup occurs when the application closes the file. Signed-off-by: Yishai Hadas Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260201-dmabuf-export-v3-2-da238b614fe3@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/Makefile | 1 + drivers/infiniband/core/device.c | 2 + drivers/infiniband/core/ib_core_uverbs.c | 24 +++ drivers/infiniband/core/rdma_core.c | 28 +-- drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/uverbs.h | 21 ++ .../infiniband/core/uverbs_std_types_dmabuf.c | 200 ++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 1 + include/rdma/ib_verbs.h | 9 + include/rdma/uverbs_types.h | 1 + include/uapi/rdma/ib_user_ioctl_cmds.h | 10 + 11 files changed, 286 insertions(+), 12 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_dmabuf.c diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index f483e0c12444..a2a7a9d2e0d3 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -33,6 +33,7 @@ ib_umad-y := user_mad.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_std_types_cq.o \ + uverbs_std_types_dmabuf.o \ uverbs_std_types_dmah.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 4e09f6e0995e..416242b9c158 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2765,6 +2765,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, mmap); + SET_DEVICE_OP(dev_ops, mmap_get_pfns); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); @@ -2775,6 +2776,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); + SET_DEVICE_OP(dev_ops, pgoff_to_mmap_entry); SET_DEVICE_OP(dev_ops, pre_destroy_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c index b51bd7087a88..1de72ff4610c 100644 --- a/drivers/infiniband/core/ib_core_uverbs.c +++ b/drivers/infiniband/core/ib_core_uverbs.c @@ -5,9 +5,13 @@ * Copyright 2019 Marvell. All rights reserved. 
*/ #include +#include +#include #include "uverbs.h" #include "core_priv.h" +MODULE_IMPORT_NS("DMA_BUF"); + /** * rdma_umap_priv_init() - Initialize the private data of a vma * @@ -229,12 +233,29 @@ EXPORT_SYMBOL(rdma_user_mmap_entry_put); */ void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry) { + struct ib_uverbs_dmabuf_file *uverbs_dmabuf, *tmp; + if (!entry) return; + mutex_lock(&entry->dmabufs_lock); xa_lock(&entry->ucontext->mmap_xa); entry->driver_removed = true; xa_unlock(&entry->ucontext->mmap_xa); + list_for_each_entry_safe(uverbs_dmabuf, tmp, &entry->dmabufs, dmabufs_elm) { + dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL); + list_del(&uverbs_dmabuf->dmabufs_elm); + uverbs_dmabuf->revoked = true; + dma_buf_move_notify(uverbs_dmabuf->dmabuf); + dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv, + DMA_RESV_USAGE_BOOKKEEP, false, + MAX_SCHEDULE_TIMEOUT); + dma_resv_unlock(uverbs_dmabuf->dmabuf->resv); + kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done); + wait_for_completion(&uverbs_dmabuf->comp); + } + mutex_unlock(&entry->dmabufs_lock); + kref_put(&entry->ref, rdma_user_mmap_entry_free); } EXPORT_SYMBOL(rdma_user_mmap_entry_remove); @@ -274,6 +295,9 @@ int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext, return -EINVAL; kref_init(&entry->ref); + INIT_LIST_HEAD(&entry->dmabufs); + mutex_init(&entry->dmabufs_lock); + entry->ucontext = ucontext; /* diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index b6eda2fb0911..3e0a8b9cd288 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -809,21 +809,10 @@ const struct uverbs_obj_type_class uverbs_idr_class = { }; EXPORT_SYMBOL(uverbs_idr_class); -/* - * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct - * file_operations release method. - */ -int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) +int uverbs_uobject_release(struct ib_uobject *uobj) { struct ib_uverbs_file *ufile; - struct ib_uobject *uobj; - /* - * This can only happen if the fput came from alloc_abort_fd_uobject() - */ - if (!filp->private_data) - return 0; - uobj = filp->private_data; ufile = uobj->ufile; if (down_read_trylock(&ufile->hw_destroy_rwsem)) { @@ -850,6 +839,21 @@ int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) uverbs_uobject_put(uobj); return 0; } + +/* + * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct + * file_operations release method. 
+ */ +int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) +{ + /* + * This can only happen if the fput came from alloc_abort_fd_uobject() + */ + if (!filp->private_data) + return 0; + + return uverbs_uobject_release(filp->private_data); +} EXPORT_SYMBOL(uverbs_uobject_fd_release); /* diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index a59b087611cb..55f1e3558856 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[]; extern const struct uapi_definition uverbs_def_obj_cq[]; extern const struct uapi_definition uverbs_def_obj_device[]; extern const struct uapi_definition uverbs_def_obj_dm[]; +extern const struct uapi_definition uverbs_def_obj_dmabuf[]; extern const struct uapi_definition uverbs_def_obj_dmah[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 797e2fcc8072..6d4295277e0e 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -133,6 +133,18 @@ struct ib_uverbs_completion_event_file { struct ib_uverbs_event_queue ev_queue; }; +struct ib_uverbs_dmabuf_file { + struct ib_uobject uobj; + struct dma_buf *dmabuf; + struct list_head dmabufs_elm; + struct rdma_user_mmap_entry *mmap_entry; + struct phys_vec phys_vec; + struct p2pdma_provider *provider; + struct kref kref; + struct completion comp; + u8 revoked :1; +}; + struct ib_uverbs_event { union { struct ib_uverbs_async_event_desc async; @@ -290,4 +302,13 @@ ib_uverbs_get_async_event(struct uverbs_attr_bundle *attrs, void copy_port_attr_to_resp(struct ib_port_attr *attr, struct ib_uverbs_query_port_resp *resp, struct ib_device *ib_dev, u8 port_num); + +static inline void ib_uverbs_dmabuf_done(struct kref *kref) +{ + struct ib_uverbs_dmabuf_file *priv = + container_of(kref, struct ib_uverbs_dmabuf_file, kref); + + complete(&priv->comp); +} + #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_std_types_dmabuf.c b/drivers/infiniband/core/uverbs_std_types_dmabuf.c new file mode 100644 index 000000000000..dfdfcd1d1a44 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_dmabuf.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include +#include +#include +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_dmabuf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + if (!attachment->peer2peer) + return -EOPNOTSUPP; + + return 0; +} + +static struct sg_table * +uverbs_dmabuf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct ib_uverbs_dmabuf_file *priv = attachment->dmabuf->priv; + struct sg_table *ret; + + dma_resv_assert_held(priv->dmabuf->resv); + + if (priv->revoked) + return ERR_PTR(-ENODEV); + + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider, + &priv->phys_vec, 1, priv->phys_vec.len, + dir); + if (IS_ERR(ret)) + return ret; + + kref_get(&priv->kref); + return ret; +} + +static void uverbs_dmabuf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct ib_uverbs_dmabuf_file *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(priv->dmabuf->resv); + dma_buf_free_sgt(attachment, sgt, dir); + kref_put(&priv->kref, ib_uverbs_dmabuf_done); +} + +static int uverbs_dmabuf_pin(struct dma_buf_attachment *attach) +{ + return -EOPNOTSUPP; +} + +static void uverbs_dmabuf_unpin(struct dma_buf_attachment *attach) +{ +} + +static void uverbs_dmabuf_release(struct dma_buf *dmabuf) +{ + struct ib_uverbs_dmabuf_file *priv = dmabuf->priv; + + /* + * This can only happen if the fput came from alloc_abort_fd_uobject() + */ + if (!priv->uobj.context) + return; + + uverbs_uobject_release(&priv->uobj); +} + +static const struct dma_buf_ops uverbs_dmabuf_ops = { + .attach = uverbs_dmabuf_attach, + .map_dma_buf = uverbs_dmabuf_map, + .unmap_dma_buf = uverbs_dmabuf_unmap, + .pin = uverbs_dmabuf_pin, + .unpin = uverbs_dmabuf_unpin, + .release = uverbs_dmabuf_release, +}; + +static int UVERBS_HANDLER(UVERBS_METHOD_DMABUF_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE) + ->obj_attr.uobject; + struct ib_uverbs_dmabuf_file *uverbs_dmabuf = + container_of(uobj, struct ib_uverbs_dmabuf_file, uobj); + struct ib_device *ib_dev = attrs->context->device; + struct rdma_user_mmap_entry *mmap_entry; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + off_t pg_off; + int ret; + + ret = uverbs_get_const(&pg_off, attrs, UVERBS_ATTR_ALLOC_DMABUF_PGOFF); + if (ret) + return ret; + + mmap_entry = ib_dev->ops.pgoff_to_mmap_entry(attrs->context, pg_off); + if (!mmap_entry) + return -EINVAL; + + ret = ib_dev->ops.mmap_get_pfns(mmap_entry, &uverbs_dmabuf->phys_vec, + &uverbs_dmabuf->provider); + if (ret) + goto err; + + exp_info.ops = &uverbs_dmabuf_ops; + exp_info.size = uverbs_dmabuf->phys_vec.len; + exp_info.flags = O_CLOEXEC; + exp_info.priv = uverbs_dmabuf; + + uverbs_dmabuf->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(uverbs_dmabuf->dmabuf)) { + ret = PTR_ERR(uverbs_dmabuf->dmabuf); + goto err; + } + + kref_init(&uverbs_dmabuf->kref); + init_completion(&uverbs_dmabuf->comp); + INIT_LIST_HEAD(&uverbs_dmabuf->dmabufs_elm); + mutex_lock(&mmap_entry->dmabufs_lock); + if (mmap_entry->driver_removed) + ret = -EIO; + else + list_add_tail(&uverbs_dmabuf->dmabufs_elm, &mmap_entry->dmabufs); + mutex_unlock(&mmap_entry->dmabufs_lock); + if (ret) + goto err_revoked; + + uobj->object = uverbs_dmabuf->dmabuf->file; + uverbs_dmabuf->mmap_entry = mmap_entry; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE); + return 0; + +err_revoked: + dma_buf_put(uverbs_dmabuf->dmabuf); +err: + 
rdma_user_mmap_entry_put(mmap_entry); + return ret; +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_DMABUF_ALLOC, + UVERBS_ATTR_FD(UVERBS_ATTR_ALLOC_DMABUF_HANDLE, + UVERBS_OBJECT_DMABUF, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMABUF_PGOFF, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + +static void uverbs_dmabuf_fd_destroy_uobj(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + struct ib_uverbs_dmabuf_file *uverbs_dmabuf = + container_of(uobj, struct ib_uverbs_dmabuf_file, uobj); + bool wait_for_comp = false; + + mutex_lock(&uverbs_dmabuf->mmap_entry->dmabufs_lock); + dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL); + if (!uverbs_dmabuf->revoked) { + uverbs_dmabuf->revoked = true; + list_del(&uverbs_dmabuf->dmabufs_elm); + dma_buf_move_notify(uverbs_dmabuf->dmabuf); + dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv, + DMA_RESV_USAGE_BOOKKEEP, false, + MAX_SCHEDULE_TIMEOUT); + wait_for_comp = true; + } + dma_resv_unlock(uverbs_dmabuf->dmabuf->resv); + if (wait_for_comp) { + kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done); + /* Let's wait till all DMA unmap are completed. */ + wait_for_completion(&uverbs_dmabuf->comp); + } + mutex_unlock(&uverbs_dmabuf->mmap_entry->dmabufs_lock); + + /* Matches the get done as part of pgoff_to_mmap_entry() */ + rdma_user_mmap_entry_put(uverbs_dmabuf->mmap_entry); +} + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_DMABUF, + UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_dmabuf_file), + uverbs_dmabuf_fd_destroy_uobj, + NULL, NULL, O_RDONLY), + &UVERBS_METHOD(UVERBS_METHOD_DMABUF_ALLOC)); + +const struct uapi_definition uverbs_def_obj_dmabuf[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMABUF), + UAPI_DEF_OBJ_NEEDS_FN(mmap_get_pfns), + UAPI_DEF_OBJ_NEEDS_FN(pgoff_to_mmap_entry), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index e00ea63175bd..38d0bbbee796 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -631,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_cq), UAPI_DEF_CHAIN(uverbs_def_obj_device), UAPI_DEF_CHAIN(uverbs_def_obj_dm), + UAPI_DEF_CHAIN(uverbs_def_obj_dmabuf), UAPI_DEF_CHAIN(uverbs_def_obj_dmah), UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8bd020da7745..3f3827e1c711 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -44,6 +44,7 @@ #include #include #include +#include #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN @@ -2364,6 +2365,9 @@ struct rdma_user_mmap_entry { unsigned long start_pgoff; size_t npages; bool driver_removed; + /* protects access to dmabufs */ + struct mutex dmabufs_lock; + struct list_head dmabufs; }; /* Return the offset (in bytes) the user should pass to libc's mmap() */ @@ -2501,6 +2505,11 @@ struct ib_device_ops { * Therefore needs to be implemented by the driver in mmap_free. 
*/ void (*mmap_free)(struct rdma_user_mmap_entry *entry); + int (*mmap_get_pfns)(struct rdma_user_mmap_entry *entry, + struct phys_vec *phys_vec, + struct p2pdma_provider **provider); + struct rdma_user_mmap_entry *(*pgoff_to_mmap_entry)(struct ib_ucontext *ucontext, + off_t pg_off); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata); diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index 26ba919ac245..6a253b7dc5ea 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -186,6 +186,7 @@ struct ib_uverbs_file { extern const struct uverbs_obj_type_class uverbs_idr_class; extern const struct uverbs_obj_type_class uverbs_fd_class; int uverbs_uobject_fd_release(struct inode *inode, struct file *filp); +int uverbs_uobject_release(struct ib_uobject *uobj); #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ sizeof(char)) diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 35da4026f452..72041c1b0ea5 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -56,6 +56,7 @@ enum uverbs_default_objects { UVERBS_OBJECT_COUNTERS, UVERBS_OBJECT_ASYNC_EVENT, UVERBS_OBJECT_DMAH, + UVERBS_OBJECT_DMABUF, }; enum { @@ -263,6 +264,15 @@ enum uverbs_methods_dmah { UVERBS_METHOD_DMAH_FREE, }; +enum uverbs_attrs_alloc_dmabuf_cmd_attr_ids { + UVERBS_ATTR_ALLOC_DMABUF_HANDLE, + UVERBS_ATTR_ALLOC_DMABUF_PGOFF, +}; + +enum uverbs_methods_dmabuf { + UVERBS_METHOD_DMABUF_ALLOC, +}; + enum uverbs_attrs_reg_dm_mr_cmd_attr_ids { UVERBS_ATTR_REG_DM_MR_HANDLE, UVERBS_ATTR_REG_DM_MR_OFFSET, From d6c58f4eb3d00a695f5a610ea780cad322ec714e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 1 Feb 2026 16:34:06 +0200 Subject: [PATCH 66/66] RDMA/mlx5: Implement DMABUF export ops Enable p2pdma on the mlx5 PCI device to allow DMABUF-based peer-to-peer DMA mappings. Add implementation of the mmap_get_pfns and pgoff_to_mmap_entry device operations required for DMABUF support in the mlx5 RDMA driver. The pgoff_to_mmap_entry operation converts a page offset to the corresponding rdma_user_mmap_entry by extracting the command and index from the offset and looking it up in the ucontext's mmap_xa. The mmap_get_pfns operation retrieves the physical address and length from the mmap entry and obtains the p2pdma provider for the underlying PCI device, which is needed for peer-to-peer DMA operations with DMABUFs. 
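From the application's point of view the flow would look roughly like
this (hypothetical wrapper name; the offset is the same value that
would otherwise be passed to mmap() for the resource):

	/* offset returned by the driver for e.g. a UAR mmap entry;
	 * assumed to be driver specific */
	off = uar_mmap_offset;

	/* hand it to UVERBS_METHOD_DMABUF_ALLOC instead of mmap() and
	 * receive a dma-buf FD exporting the same PCI memory */
	fd = my_alloc_dmabuf(context, off);	/* hypothetical wrapper */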
Signed-off-by: Yishai Hadas Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260201-dmabuf-export-v3-3-da238b614fe3@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 72 +++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index eba023b7af0f..88cbd534771c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2454,6 +2454,70 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, virt_to_page(dev->mdev->clock_info)); } +static int phys_addr_to_bar(struct pci_dev *pdev, phys_addr_t pa) +{ + resource_size_t start, end; + int bar; + + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { + /* Skip BARs not present or not memory-mapped */ + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) + continue; + + start = pci_resource_start(pdev, bar); + end = pci_resource_end(pdev, bar); + + if (!start || !end) + continue; + + if (pa >= start && pa <= end) + return bar; + } + + return -1; +} + +static int mlx5_ib_mmap_get_pfns(struct rdma_user_mmap_entry *entry, + struct phys_vec *phys_vec, + struct p2pdma_provider **provider) +{ + struct mlx5_user_mmap_entry *mentry = to_mmmap(entry); + struct pci_dev *pdev = to_mdev(entry->ucontext->device)->mdev->pdev; + int bar; + + phys_vec->paddr = mentry->address; + phys_vec->len = entry->npages * PAGE_SIZE; + + bar = phys_addr_to_bar(pdev, phys_vec->paddr); + if (bar < 0) + return -EINVAL; + + *provider = pcim_p2pdma_provider(pdev, bar); + /* If the kernel was not compiled with CONFIG_PCI_P2PDMA the + * functionality is not supported. + */ + if (!*provider) + return -EOPNOTSUPP; + + return 0; +} + +static struct rdma_user_mmap_entry * +mlx5_ib_pgoff_to_mmap_entry(struct ib_ucontext *ucontext, off_t pg_off) +{ + unsigned long entry_pgoff; + unsigned long idx; + u8 command; + + pg_off = pg_off >> PAGE_SHIFT; + command = get_command(pg_off); + idx = get_extended_index(pg_off); + + entry_pgoff = command << 16 | idx; + + return rdma_user_mmap_entry_get_pgoff(ucontext, entry_pgoff); +} + static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry) { struct mlx5_user_mmap_entry *mentry = to_mmmap(entry); @@ -4419,7 +4483,13 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) if (err) goto err_mp; + err = pcim_p2pdma_init(mdev->pdev); + if (err && err != -EOPNOTSUPP) + goto err_dd; + return 0; +err_dd: + mlx5_ib_data_direct_cleanup(dev); err_mp: mlx5_ib_cleanup_multiport_master(dev); err: @@ -4471,11 +4541,13 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi, .mmap = mlx5_ib_mmap, .mmap_free = mlx5_ib_mmap_free, + .mmap_get_pfns = mlx5_ib_mmap_get_pfns, .modify_cq = mlx5_ib_modify_cq, .modify_device = mlx5_ib_modify_device, .modify_port = mlx5_ib_modify_port, .modify_qp = mlx5_ib_modify_qp, .modify_srq = mlx5_ib_modify_srq, + .pgoff_to_mmap_entry = mlx5_ib_pgoff_to_mmap_entry, .pre_destroy_cq = mlx5_ib_pre_destroy_cq, .poll_cq = mlx5_ib_poll_cq, .post_destroy_cq = mlx5_ib_post_destroy_cq,