linux/net/rds/tcp.h
Gerd Rausch 9d27a0fb12 net/rds: Trigger rds_send_ping() more than once
Even though a peer may have received a non-zero value for
"RDS_EXTHDR_NPATHS" from a node in the past, the current
incarnation of that peer (e.g. after a module reload) may not have.

Therefore it is important to initiate another rds_send_ping()
after a re-connect to any peer:
It is unknown at that time if we're still talking to the same
instance of RDS kernel modules on the other side.

Otherwise, the peer may just operate on a single lane
("c_npaths == 0"), not knowing that more lanes are available.

However, if "c_with_sport_idx" is supported,
we also need to check that the connection we accepted on lane#0
meets the proper source port modulo requirement, as we fan out:

Since the exchange of "RDS_EXTHDR_NPATHS" and "RDS_EXTHDR_SPORT_IDX"
is asynchronous, initially we have no choice but to accept an incoming
connection (via "accept") in the first slot ("cp_index == 0")
for backwards compatibility.

But that very connection may have come from a different lane
with "cp_index != 0", since the peer thought that we already understood
and handled "c_with_sport_idx" properly, as indicated by a previous
exchange before a module was reloaded.

In short:
If a module gets reloaded, we recover from that, but do *not*
allow a downgrade to support fewer lanes.

Downgrades would require us to merge messages from separate lanes,
which is rather tricky with the current RDS design.
Each lane has its own sequence number space and all messages
would need to be re-sequenced as we merge, all while
handling "RDS_FLAG_RETRANSMITTED" and "cp_retrans" properly.

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Link: https://patch.msgid.link/20260203055723.1085751-9-achender@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2026-02-04 20:46:39 -08:00

120 lines
3.8 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _RDS_TCP_H
#define _RDS_TCP_H
/* TCP port the RDS-over-TCP listen socket binds to. */
#define RDS_TCP_PORT 16385
/* per-network namespace private data for this module */
struct rds_tcp_net {
/* serialize "rds_tcp_accept_one" with "rds_tcp_accept_lock"
 * to protect "rds_tcp_accepted_sock"
 */
struct mutex rds_tcp_accept_lock;
/* listen socket for this netns; incoming connections arrive here */
struct socket *rds_tcp_listen_sock;
/* socket handed from the listener to the accept path; guarded by
 * rds_tcp_accept_lock per the comment above
 */
struct socket *rds_tcp_accepted_sock;
/* deferred-work item for accepting connections off the softirq path
 * (see rds_tcp_accept_work() / rds_tcp_accept_one() below)
 */
struct work_struct rds_tcp_accept_w;
/* sysctl registration handle and its table */
struct ctl_table_header *rds_tcp_sysctl;
const struct ctl_table *ctl_table;
/* per-netns socket buffer sizes; presumably the tunables exposed by
 * the sysctl table above -- confirm in tcp.c
 */
int sndbuf_size;
int rcvbuf_size;
};
/* An inbound RDS message in the process of being received over TCP. */
struct rds_tcp_incoming {
struct rds_incoming ti_inc; /* generic RDS incoming-message state (embedded) */
struct sk_buff_head ti_skb_list; /* skbs holding the data received so far */
};
/* Per-connection-path state for the RDS TCP transport: one of these
 * backs each struct rds_conn_path carried over a TCP socket.
 */
struct rds_tcp_connection {
struct list_head t_tcp_node; /* linkage on the module's connection list */
bool t_tcp_node_detached; /* true once t_tcp_node was removed from that list */
struct rds_conn_path *t_cpath; /* the RDS connection path we transport */
/* t_conn_path_lock synchronizes the connection establishment between
 * rds_tcp_accept_one and rds_tcp_conn_path_connect
 */
struct mutex t_conn_path_lock;
struct socket *t_sock; /* underlying kernel TCP socket, NULL when down */
/* NOTE(review): presumably the source-port group used for the
 * multipath sport-modulo scheme (c_with_sport_idx) -- confirm in
 * tcp_connect.c
 */
u32 t_client_port_group;
struct rds_tcp_net *t_rtn; /* owning per-netns state */
/* original sk callbacks, saved by rds_tcp_set_callbacks() so
 * rds_tcp_restore_callbacks() can put them back
 */
void *t_orig_write_space;
void *t_orig_data_ready;
void *t_orig_state_change;
struct rds_tcp_incoming *t_tinc; /* message currently being reassembled */
size_t t_tinc_hdr_rem; /* header bytes still to receive for t_tinc */
size_t t_tinc_data_rem; /* payload bytes still to receive for t_tinc */
/* XXX error report? */
struct work_struct t_conn_w; /* connect worker */
struct work_struct t_send_w; /* transmit worker */
struct work_struct t_down_w; /* shutdown/teardown worker */
struct work_struct t_recv_w; /* receive worker */
/* for info exporting only */
struct list_head t_list_item;
u32 t_last_sent_nxt;
u32 t_last_expected_una;
u32 t_last_seen_una;
/* for rds_tcp_conn_path_shutdown */
wait_queue_head_t t_recv_done_waitq;
};
/* Per-CPU transport counters, exported via rds_tcp_stats_info_copy(). */
struct rds_tcp_statistics {
uint64_t s_tcp_data_ready_calls; /* data_ready callback invocations */
uint64_t s_tcp_write_space_calls; /* write_space callback invocations */
uint64_t s_tcp_sndbuf_full; /* sends deferred because sndbuf was full */
uint64_t s_tcp_connect_raced; /* connect lost the race with an accept */
uint64_t s_tcp_listen_closed_stale; /* stale sockets closed on the listener */
};
/* tcp.c */
extern int rds_tcp_netid; /* pernet-subsys id for looking up struct rds_tcp_net */
bool rds_tcp_tune(struct socket *sock);
/* install/replace/restore sk callbacks on the TCP socket backing a path */
void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp);
void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp);
void rds_tcp_restore_callbacks(struct socket *sock,
struct rds_tcp_connection *tc);
/* snapshot TCP sequence state (used for ack bookkeeping and info export) */
u32 rds_tcp_write_seq(struct rds_tcp_connection *tc);
u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
extern struct rds_transport rds_tcp_transport;
void rds_tcp_accept_work(struct rds_tcp_net *rtn);
int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
__u32 scope_id);
/* tcp_connect.c */
int rds_tcp_conn_path_connect(struct rds_conn_path *cp);
void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn);
void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
void rds_tcp_listen_data_ready(struct sock *sk);
void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out);
int rds_tcp_accept_one(struct rds_tcp_net *rtn);
void rds_tcp_keepalive(struct socket *sock);
void *rds_tcp_listen_sock_def_readable(struct net *net);
/* tcp_recv.c */
int rds_tcp_recv_init(void);
void rds_tcp_recv_exit(void);
void rds_tcp_data_ready(struct sock *sk);
int rds_tcp_recv_path(struct rds_conn_path *cp);
void rds_tcp_inc_free(struct rds_incoming *inc);
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
/* tcp_send.c */
void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp);
void rds_tcp_xmit_path_complete(struct rds_conn_path *cp);
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack);
void rds_tcp_write_space(struct sock *sk);
/* tcp_stats.c */
DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
#endif