mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 04:44:45 +01:00
net/rds: Encode cp_index in TCP source port
Upon "sendmsg", RDS/TCP selects a backend connection based
on a hash calculated from the source-port ("RDS_MPATH_HASH").
However, "rds_tcp_accept_one" accepts connections
in the order they arrive, which is non-deterministic.
Therefore, the mapping of the sender's "cp->cp_index"
to that of the receiver can change if the backend
connections are dropped and reconnected.
However, connection state that's preserved across reconnects
(e.g. "cp_next_rx_seq") relies on that sender<->receiver
mapping to never change.
So we make sure that client and server of the TCP connection
have the exact same "cp->cp_index" across reconnects by
encoding "cp->cp_index" in the lower three bits of the
client's TCP source port.
A new extension "RDS_EXTHDR_SPORT_IDX" is introduced
that allows the server to tell the difference between
clients that do the "cp->cp_index" encoding and
legacy clients that pick source ports randomly.
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Link: https://patch.msgid.link/20260203055723.1085751-3-achender@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
parent
46f257ee69
commit
a20a699255
7 changed files with 77 additions and 7 deletions
|
|
@ -47,6 +47,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
|
|||
[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
|
||||
[RDS_EXTHDR_NPATHS] = sizeof(__be16),
|
||||
[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
|
||||
[RDS_EXTHDR_SPORT_IDX] = 1,
|
||||
};
|
||||
|
||||
void rds_message_addref(struct rds_message *rm)
|
||||
|
|
|
|||
|
|
@ -147,6 +147,7 @@ struct rds_connection {
|
|||
c_ping_triggered:1,
|
||||
c_pad_to_32:29;
|
||||
int c_npaths;
|
||||
bool c_with_sport_idx;
|
||||
struct rds_connection *c_passive;
|
||||
struct rds_transport *c_trans;
|
||||
|
||||
|
|
@ -278,8 +279,10 @@ struct rds_ext_header_rdma_bytes {
|
|||
*/
|
||||
#define RDS_EXTHDR_NPATHS 5
|
||||
#define RDS_EXTHDR_GEN_NUM 6
|
||||
#define RDS_EXTHDR_SPORT_IDX 8
|
||||
|
||||
#define __RDS_EXTHDR_MAX 16 /* for now */
|
||||
|
||||
#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
|
||||
#define RDS_MSG_RX_HDR 0
|
||||
#define RDS_MSG_RX_START 1
|
||||
|
|
|
|||
|
|
@ -204,7 +204,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
|
|||
struct rds_ext_header_version version;
|
||||
__be16 rds_npaths;
|
||||
__be32 rds_gen_num;
|
||||
u8 dummy;
|
||||
} buffer;
|
||||
bool new_with_sport_idx = false;
|
||||
u32 new_peer_gen_num = 0;
|
||||
|
||||
while (1) {
|
||||
|
|
@ -221,11 +223,16 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
|
|||
case RDS_EXTHDR_GEN_NUM:
|
||||
new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
|
||||
break;
|
||||
case RDS_EXTHDR_SPORT_IDX:
|
||||
new_with_sport_idx = true;
|
||||
break;
|
||||
default:
|
||||
pr_warn_ratelimited("ignoring unknown exthdr type "
|
||||
"0x%x\n", type);
|
||||
}
|
||||
}
|
||||
|
||||
conn->c_with_sport_idx = new_with_sport_idx;
|
||||
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
|
||||
conn->c_npaths = max_t(int, conn->c_npaths, 1);
|
||||
conn->c_ping_triggered = 0;
|
||||
|
|
|
|||
|
|
@ -1457,12 +1457,16 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
|
|||
cp->cp_conn->c_trans->t_mp_capable) {
|
||||
__be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
|
||||
__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
|
||||
u8 dummy = 0;
|
||||
|
||||
rds_message_add_extension(&rm->m_inc.i_hdr,
|
||||
RDS_EXTHDR_NPATHS, &npaths);
|
||||
rds_message_add_extension(&rm->m_inc.i_hdr,
|
||||
RDS_EXTHDR_GEN_NUM,
|
||||
&my_gen_num);
|
||||
rds_message_add_extension(&rm->m_inc.i_hdr,
|
||||
RDS_EXTHDR_SPORT_IDX,
|
||||
&dummy);
|
||||
}
|
||||
spin_unlock_irqrestore(&cp->cp_lock, flags);
|
||||
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ struct rds_tcp_connection {
|
|||
*/
|
||||
struct mutex t_conn_path_lock;
|
||||
struct socket *t_sock;
|
||||
u32 t_client_port_group;
|
||||
struct rds_tcp_net *t_rtn;
|
||||
void *t_orig_write_space;
|
||||
void *t_orig_data_ready;
|
||||
|
|
|
|||
|
|
@ -93,6 +93,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
|
|||
struct sockaddr_in6 sin6;
|
||||
struct sockaddr_in sin;
|
||||
struct sockaddr *addr;
|
||||
int port_low, port_high, port;
|
||||
int port_groups, groups_left;
|
||||
int addrlen;
|
||||
bool isv6;
|
||||
int ret;
|
||||
|
|
@ -145,7 +147,26 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
|
|||
addrlen = sizeof(sin);
|
||||
}
|
||||
|
||||
ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen);
|
||||
/* encode cp->cp_index in lowest bits of source-port */
|
||||
inet_get_local_port_range(rds_conn_net(conn), &port_low, &port_high);
|
||||
port_low = ALIGN(port_low, RDS_MPATH_WORKERS);
|
||||
port_groups = (port_high - port_low + 1) / RDS_MPATH_WORKERS;
|
||||
ret = -EADDRINUSE;
|
||||
groups_left = port_groups;
|
||||
while (groups_left-- > 0 && ret) {
|
||||
if (++tc->t_client_port_group >= port_groups)
|
||||
tc->t_client_port_group = 0;
|
||||
port = port_low +
|
||||
tc->t_client_port_group * RDS_MPATH_WORKERS +
|
||||
cp->cp_index;
|
||||
|
||||
if (isv6)
|
||||
sin6.sin6_port = htons(port);
|
||||
else
|
||||
sin.sin_port = htons(port);
|
||||
ret = kernel_bind(sock, (struct sockaddr_unsized *)addr,
|
||||
addrlen);
|
||||
}
|
||||
if (ret) {
|
||||
rdsdebug("bind failed with %d at address %pI6c\n",
|
||||
ret, &conn->c_laddr);
|
||||
|
|
|
|||
|
|
@ -62,19 +62,52 @@ void rds_tcp_keepalive(struct socket *sock)
|
|||
* we special case cp_index 0 is to allow the rds probe ping itself to itself
|
||||
* get through efficiently.
|
||||
*/
|
||||
static
|
||||
struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
|
||||
static struct rds_tcp_connection *
|
||||
rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock)
|
||||
{
|
||||
int i;
|
||||
int npaths = max_t(int, 1, conn->c_npaths);
|
||||
union {
|
||||
struct sockaddr_storage storage;
|
||||
struct sockaddr addr;
|
||||
struct sockaddr_in sin;
|
||||
struct sockaddr_in6 sin6;
|
||||
} saddr;
|
||||
int sport, npaths, i_min, i_max, i;
|
||||
|
||||
for (i = 0; i < npaths; i++) {
|
||||
if (conn->c_with_sport_idx &&
|
||||
kernel_getpeername(sock, &saddr.addr) >= 0) {
|
||||
/* cp->cp_index is encoded in lowest bits of source-port */
|
||||
switch (saddr.addr.sa_family) {
|
||||
case AF_INET:
|
||||
sport = ntohs(saddr.sin.sin_port);
|
||||
break;
|
||||
case AF_INET6:
|
||||
sport = ntohs(saddr.sin6.sin6_port);
|
||||
break;
|
||||
default:
|
||||
sport = -1;
|
||||
}
|
||||
} else {
|
||||
sport = -1;
|
||||
}
|
||||
|
||||
npaths = max_t(int, 1, conn->c_npaths);
|
||||
|
||||
if (sport >= 0) {
|
||||
i_min = sport % npaths;
|
||||
i_max = i_min;
|
||||
} else {
|
||||
i_min = 0;
|
||||
i_max = npaths - 1;
|
||||
}
|
||||
|
||||
for (i = i_min; i <= i_max; i++) {
|
||||
struct rds_conn_path *cp = &conn->c_path[i];
|
||||
|
||||
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
|
||||
RDS_CONN_CONNECTING))
|
||||
return cp->cp_transport_data;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
@ -199,7 +232,7 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn)
|
|||
* to and discarded by the sender.
|
||||
* We must not throw those away!
|
||||
*/
|
||||
rs_tcp = rds_tcp_accept_one_path(conn);
|
||||
rs_tcp = rds_tcp_accept_one_path(conn, new_sock);
|
||||
if (!rs_tcp) {
|
||||
/* It's okay to stash "new_sock", since
|
||||
* "rds_tcp_conn_slots_available" triggers
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue