net/rds: Encode cp_index in TCP source port

Upon "sendmsg", RDS/TCP selects a backend connection based
on a hash calculated from the source-port ("RDS_MPATH_HASH").

However, "rds_tcp_accept_one" accepts connections
in the order they arrive, which is non-deterministic.

Therefore the mapping of the sender's "cp->cp_index"
to that of the receiver changes if the backend
connections are dropped and reconnected.

However, connection state that's preserved across reconnects
(e.g. "cp_next_rx_seq") relies on that sender<->receiver
mapping never changing.

So we make sure that client and server of the TCP connection
have the exact same "cp->cp_index" across reconnects by
encoding "cp->cp_index" in the lower three bits of the
client's TCP source port.

A new extension, "RDS_EXTHDR_SPORT_IDX", is introduced
that allows the server to distinguish clients that
encode "cp->cp_index" in the source port from
legacy clients that pick source ports randomly.

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Link: https://patch.msgid.link/20260203055723.1085751-3-achender@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Gerd Rausch 2026-02-02 22:57:17 -07:00 committed by Jakub Kicinski
parent 46f257ee69
commit a20a699255
7 changed files with 77 additions and 7 deletions

View file

@ -47,6 +47,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
[RDS_EXTHDR_NPATHS] = sizeof(__be16),
[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
[RDS_EXTHDR_SPORT_IDX] = 1,
};
void rds_message_addref(struct rds_message *rm)

View file

@ -147,6 +147,7 @@ struct rds_connection {
c_ping_triggered:1,
c_pad_to_32:29;
int c_npaths;
bool c_with_sport_idx;
struct rds_connection *c_passive;
struct rds_transport *c_trans;
@ -278,8 +279,10 @@ struct rds_ext_header_rdma_bytes {
*/
#define RDS_EXTHDR_NPATHS 5
#define RDS_EXTHDR_GEN_NUM 6
#define RDS_EXTHDR_SPORT_IDX 8
#define __RDS_EXTHDR_MAX 16 /* for now */
#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
#define RDS_MSG_RX_HDR 0
#define RDS_MSG_RX_START 1

View file

@ -204,7 +204,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
struct rds_ext_header_version version;
__be16 rds_npaths;
__be32 rds_gen_num;
u8 dummy;
} buffer;
bool new_with_sport_idx = false;
u32 new_peer_gen_num = 0;
while (1) {
@ -221,11 +223,16 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
case RDS_EXTHDR_GEN_NUM:
new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
break;
case RDS_EXTHDR_SPORT_IDX:
new_with_sport_idx = true;
break;
default:
pr_warn_ratelimited("ignoring unknown exthdr type "
"0x%x\n", type);
}
}
conn->c_with_sport_idx = new_with_sport_idx;
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
conn->c_npaths = max_t(int, conn->c_npaths, 1);
conn->c_ping_triggered = 0;

View file

@ -1457,12 +1457,16 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
cp->cp_conn->c_trans->t_mp_capable) {
__be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
u8 dummy = 0;
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_NPATHS, &npaths);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_GEN_NUM,
&my_gen_num);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_SPORT_IDX,
&dummy);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);

View file

@ -34,6 +34,7 @@ struct rds_tcp_connection {
*/
struct mutex t_conn_path_lock;
struct socket *t_sock;
u32 t_client_port_group;
struct rds_tcp_net *t_rtn;
void *t_orig_write_space;
void *t_orig_data_ready;

View file

@ -93,6 +93,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
struct sockaddr_in6 sin6;
struct sockaddr_in sin;
struct sockaddr *addr;
int port_low, port_high, port;
int port_groups, groups_left;
int addrlen;
bool isv6;
int ret;
@ -145,7 +147,26 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
addrlen = sizeof(sin);
}
ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen);
/* encode cp->cp_index in lowest bits of source-port */
inet_get_local_port_range(rds_conn_net(conn), &port_low, &port_high);
port_low = ALIGN(port_low, RDS_MPATH_WORKERS);
port_groups = (port_high - port_low + 1) / RDS_MPATH_WORKERS;
ret = -EADDRINUSE;
groups_left = port_groups;
while (groups_left-- > 0 && ret) {
if (++tc->t_client_port_group >= port_groups)
tc->t_client_port_group = 0;
port = port_low +
tc->t_client_port_group * RDS_MPATH_WORKERS +
cp->cp_index;
if (isv6)
sin6.sin6_port = htons(port);
else
sin.sin_port = htons(port);
ret = kernel_bind(sock, (struct sockaddr_unsized *)addr,
addrlen);
}
if (ret) {
rdsdebug("bind failed with %d at address %pI6c\n",
ret, &conn->c_laddr);

View file

@ -62,19 +62,52 @@ void rds_tcp_keepalive(struct socket *sock)
* we special case cp_index 0 is to allow the rds probe ping itself to itself
* get through efficiently.
*/
static
struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
static struct rds_tcp_connection *
rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock)
{
int i;
int npaths = max_t(int, 1, conn->c_npaths);
union {
struct sockaddr_storage storage;
struct sockaddr addr;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
} saddr;
int sport, npaths, i_min, i_max, i;
for (i = 0; i < npaths; i++) {
if (conn->c_with_sport_idx &&
kernel_getpeername(sock, &saddr.addr) >= 0) {
/* cp->cp_index is encoded in lowest bits of source-port */
switch (saddr.addr.sa_family) {
case AF_INET:
sport = ntohs(saddr.sin.sin_port);
break;
case AF_INET6:
sport = ntohs(saddr.sin6.sin6_port);
break;
default:
sport = -1;
}
} else {
sport = -1;
}
npaths = max_t(int, 1, conn->c_npaths);
if (sport >= 0) {
i_min = sport % npaths;
i_max = i_min;
} else {
i_min = 0;
i_max = npaths - 1;
}
for (i = i_min; i <= i_max; i++) {
struct rds_conn_path *cp = &conn->c_path[i];
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
RDS_CONN_CONNECTING))
return cp->cp_transport_data;
}
return NULL;
}
@ -199,7 +232,7 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn)
* to and discarded by the sender.
* We must not throw those away!
*/
rs_tcp = rds_tcp_accept_one_path(conn);
rs_tcp = rds_tcp_accept_one_path(conn, new_sock);
if (!rs_tcp) {
/* It's okay to stash "new_sock", since
* "rds_tcp_conn_slots_available" triggers