net/rds: new extension header: rdma bytes

Introduce a new extension header type, RDS_EXTHDR_RDMA_BYTES, that lets
an RDMA initiator report RDMA byte counts to its target.
Currently, RDS cannot precisely account for how many bytes a
peer has just transferred via RDMA, which limits per-connection statistics
and future policy (e.g., monitoring or rate/cgroup accounting of RDMA
traffic).

In this patch we extend rds_message_add_extension() so that multiple
extensions can be added to one header, add a new RDS header flag,
RDS_FLAG_EXTHDR_EXTENSION, and add a new RDS extension header,
rds_ext_header_rdma_bytes.

Signed-off-by: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Signed-off-by: Guangyu Sun <guangyu.sun@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Link: https://patch.msgid.link/20260203055723.1085751-2-achender@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Shamir Rabinovitch 2026-02-02 22:57:16 -07:00 committed by Jakub Kicinski
parent acd21dd2da
commit 46f257ee69
4 changed files with 107 additions and 29 deletions

View file

@ -577,16 +577,42 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
if (rm->rdma.op_active) {
struct rds_ext_header_rdma ext_hdr;
struct rds_ext_header_rdma ext_hdr = {};
struct rds_ext_header_rdma_bytes
rdma_bytes_ext_hdr = {};
ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
if (rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA,
&ext_hdr)) {
/* prepare the rdma bytes ext header */
rdma_bytes_ext_hdr.h_rflags =
rm->rdma.op_write ?
RDS_FLAG_RDMA_WR_BYTES :
RDS_FLAG_RDMA_RD_BYTES;
rdma_bytes_ext_hdr.h_rdma_bytes =
cpu_to_be32(rm->rdma.op_bytes);
} else {
rdsdebug("RDS_EXTHDR_RDMA dropped");
}
if (rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA_BYTES,
&rdma_bytes_ext_hdr)) {
/* rdma bytes ext header was added successfully,
* notify the remote side via flag in header
*/
rm->m_inc.i_hdr.h_flags |=
RDS_FLAG_EXTHDR_EXTENSION;
} else {
rdsdebug("RDS_EXTHDR_RDMA_BYTES dropped");
}
}
if (rm->m_rdma_cookie) {
rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
rds_rdma_cookie_key(rm->m_rdma_cookie),
rds_rdma_cookie_offset(rm->m_rdma_cookie));
if (rm->m_rdma_cookie &&
!rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
rds_rdma_cookie_key(rm->m_rdma_cookie),
rds_rdma_cookie_offset(rm->m_rdma_cookie))) {
rdsdebug("RDS_EXTHDR_RDMA_DEST dropped\n");
}
/* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so

View file

@ -44,6 +44,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
[RDS_EXTHDR_NPATHS] = sizeof(__be16),
[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
};
@ -191,31 +192,69 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
hdr->h_sport = sport;
hdr->h_dport = dport;
hdr->h_sequence = cpu_to_be64(seq);
hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
/* Fill the whole extension space with RDS_EXTHDR_NONE:
 * rds_find_next_ext_space() treats the first RDS_EXTHDR_NONE byte it
 * reaches as the start of the free space.
 */
memset(hdr->h_exthdr, RDS_EXTHDR_NONE, RDS_HEADER_EXT_SPACE);
}
EXPORT_SYMBOL_GPL(rds_message_populate_header);
int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
const void *data, unsigned int len)
/*
 * Locate the first free slot in the RDS extension header area that can
 * hold one more extension of @len payload bytes (plus its 1-byte type).
 *
 * Extensions are stored back to back; the number after each colon is a
 * size in bytes:
 *
 * [ type1:1 dta1:len1 [ type2:1 dta2:len2 ] ... ] RDS_EXTHDR_NONE
 *
 * The terminating RDS_EXTHDR_NONE is absent when the extensions occupy
 * the entire extension header space (16 bytes).
 *
 * On success, stores the address of the free slot in @ext_start and
 * returns 0. Returns -EINVAL if an already-present extension has a type
 * with no known size, and -ENOSPC if the new extension does not fit.
 */
static int rds_find_next_ext_space(struct rds_header *hdr, unsigned int len,
				   u8 **ext_start)
{
	int pos;

	for (pos = 0; (pos + 1 + len) <= RDS_HEADER_EXT_SPACE; ) {
		u8 *slot = hdr->h_exthdr + pos;
		unsigned int cur_type = *slot;
		unsigned int cur_len = 0;

		/* First unused byte: this is where the new extension goes. */
		if (cur_type == RDS_EXTHDR_NONE) {
			*ext_start = slot;
			return 0;
		}

		/* Occupied slot: its size is needed to step over it. */
		if (cur_type < __RDS_EXTHDR_MAX)
			cur_len = rds_exthdr_size[cur_type];

		if (WARN_ONCE(!cur_len, "Unknown ext hdr type %d\n", cur_type))
			return -EINVAL;

		/* Skip the type byte and the payload of this extension. */
		pos += 1 + cur_len;
	}

	/* The remaining space cannot hold type byte + @len payload. */
	return -ENOSPC;
}
/* The ext hdr space is prefilled with zero from the kzalloc() */
int rds_message_add_extension(struct rds_header *hdr,
unsigned int type, const void *data)
{
unsigned int ext_len = sizeof(u8) + len;
unsigned char *dst;
unsigned int len;
/* For now, refuse to add more than one extension header */
if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
if (!len)
return 0;
if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
if (rds_find_next_ext_space(hdr, len, &dst))
return 0;
if (ext_len >= RDS_HEADER_EXT_SPACE)
return 0;
dst = hdr->h_exthdr;
*dst++ = type;
memcpy(dst, data, len);
dst[len] = RDS_EXTHDR_NONE;
return 1;
}
EXPORT_SYMBOL_GPL(rds_message_add_extension);
@ -272,7 +311,7 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
ext_hdr.h_rdma_offset = cpu_to_be32(offset);
return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr);
}
EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);

View file

@ -183,10 +183,11 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net)
write_pnet(&conn->c_net, net);
}
#define RDS_FLAG_CONG_BITMAP 0x01
#define RDS_FLAG_ACK_REQUIRED 0x02
#define RDS_FLAG_RETRANSMITTED 0x04
#define RDS_MAX_ADV_CREDIT 255
#define RDS_FLAG_CONG_BITMAP 0x01
#define RDS_FLAG_ACK_REQUIRED 0x02
#define RDS_FLAG_RETRANSMITTED 0x04
#define RDS_FLAG_EXTHDR_EXTENSION 0x20
#define RDS_MAX_ADV_CREDIT 255
/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
* probe to exchange control information before establishing a connection.
@ -258,6 +259,20 @@ struct rds_ext_header_rdma_dest {
__be32 h_rdma_offset;
};
/*
 * This extension header tells the peer about delivered RDMA byte count,
 * together with the direction (write or read) of the RDMA operation.
 */
#define RDS_EXTHDR_RDMA_BYTES 4
struct rds_ext_header_rdma_bytes {
__be32 h_rdma_bytes; /* byte count, big-endian */
u8 h_rflags; /* direction of RDMA, write or read: one of RDS_FLAG_RDMA_*_BYTES */
u8 h_pad[3]; /* NOTE(review): presumably padding only — confirm */
};
/* Values for rds_ext_header_rdma_bytes.h_rflags */
#define RDS_FLAG_RDMA_WR_BYTES 0x01 /* byte count belongs to an RDMA write */
#define RDS_FLAG_RDMA_RD_BYTES 0x02 /* byte count belongs to an RDMA read */
/* Extension header announcing number of paths.
* Implicit length = 2 bytes.
*/
@ -871,7 +886,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq);
int rds_message_add_extension(struct rds_header *hdr,
unsigned int type, const void *data, unsigned int len);
unsigned int type, const void *data);
int rds_message_next_extension(struct rds_header *hdr,
unsigned int *pos, void *buf, unsigned int *buflen);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);

View file

@ -1459,12 +1459,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_NPATHS, &npaths,
sizeof(npaths));
RDS_EXTHDR_NPATHS, &npaths);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_GEN_NUM,
&my_gen_num,
sizeof(u32));
&my_gen_num);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);