From 93686c472eb7b09a51b97a096449e7092fefcd1f Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Fri, 30 Jan 2026 18:32:48 +0100 Subject: [PATCH 01/85] ovpn: set sk_user_data before overriding callbacks During initialization, we override socket callbacks and set sk_user_data to an ovpn_socket instance. Currently, these two operations are decoupled: callbacks are overridden before sk_user_data is set. While existing callbacks perform safety checks for NULL or non-ovpn sk_user_data, this condition causes a "half-formed" state where valid packets arriving during attachment trigger error logs (e.g., "invoked on non ovpn socket"). Set sk_user_data before overriding the callbacks so that it can be accessed safely from them. Since we already check that the socket has no sk_user_data before setting it, this remains safe even if an interrupt accesses the socket after sk_user_data is set but before the callbacks are overridden. This also requires initializing all protocol-specific fields (such as tcp_tx_work and peer links) before calling ovpn_socket_attach, ensuring the ovpn_socket is fully formed before it becomes visible to any callback. Fixes: f6226ae7a0cd ("ovpn: introduce the ovpn_socket object") Signed-off-by: Ralf Lici Reviewed-by: Sabrina Dubroca Signed-off-by: Antonio Quartulli --- drivers/net/ovpn/socket.c | 41 +++++++++++++++++++++------------------ drivers/net/ovpn/tcp.c | 9 +++++++-- drivers/net/ovpn/udp.c | 1 + 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c index 9750871ab65c..448cee3b3f9f 100644 --- a/drivers/net/ovpn/socket.c +++ b/drivers/net/ovpn/socket.c @@ -200,24 +200,6 @@ struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer) ovpn_sock->sk = sk; kref_init(&ovpn_sock->refcount); - /* the newly created ovpn_socket is holding reference to sk, - * therefore we increase its refcounter. - * - * This ovpn_socket instance is referenced by all peers - * using the same socket. - * - * ovpn_socket_release() will take care of dropping the reference. - */ - sock_hold(sk); - - ret = ovpn_socket_attach(ovpn_sock, sock, peer); - if (ret < 0) { - sock_put(sk); - kfree(ovpn_sock); - ovpn_sock = ERR_PTR(ret); - goto sock_release; - } - /* TCP sockets are per-peer, therefore they are linked to their unique * peer */ @@ -234,7 +216,28 @@ struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer) GFP_KERNEL); } - rcu_assign_sk_user_data(sk, ovpn_sock); + /* the newly created ovpn_socket is holding reference to sk, + * therefore we increase its refcounter. + * + * This ovpn_socket instance is referenced by all peers + * using the same socket. + * + * ovpn_socket_release() will take care of dropping the reference. + */ + sock_hold(sk); + + ret = ovpn_socket_attach(ovpn_sock, sock, peer); + if (ret < 0) { + if (sk->sk_protocol == IPPROTO_TCP) + ovpn_peer_put(peer); + else if (sk->sk_protocol == IPPROTO_UDP) + netdev_put(peer->ovpn->dev, &ovpn_sock->dev_tracker); + + sock_put(sk); + kfree(ovpn_sock); + ovpn_sock = ERR_PTR(ret); + } + sock_release: release_sock(sk); return ovpn_sock; diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c index 0d7f30360d87..f0b4e07ba924 100644 --- a/drivers/net/ovpn/tcp.c +++ b/drivers/net/ovpn/tcp.c @@ -487,6 +487,7 @@ int ovpn_tcp_socket_attach(struct ovpn_socket *ovpn_sock, /* make sure no pre-existing encapsulation handler exists */ if (ovpn_sock->sk->sk_user_data) return -EBUSY; + rcu_assign_sk_user_data(ovpn_sock->sk, ovpn_sock); /* only a fully connected socket is expected. Connection should be * handled in userspace @@ -495,13 +496,14 @@ int ovpn_tcp_socket_attach(struct ovpn_socket *ovpn_sock, net_err_ratelimited("%s: provided TCP socket is not in ESTABLISHED state: %d\n", netdev_name(peer->ovpn->dev), ovpn_sock->sk->sk_state); - return -EINVAL; + ret = -EINVAL; + goto err; } ret = strp_init(&peer->tcp.strp, ovpn_sock->sk, &cb); if (ret < 0) { DEBUG_NET_WARN_ON_ONCE(1); - return ret; + goto err; } INIT_WORK(&peer->tcp.defer_del_work, ovpn_tcp_peer_del_work); @@ -536,6 +538,9 @@ int ovpn_tcp_socket_attach(struct ovpn_socket *ovpn_sock, strp_check_rcv(&peer->tcp.strp); return 0; +err: + rcu_assign_sk_user_data(ovpn_sock->sk, NULL); + return ret; } static void ovpn_tcp_close(struct sock *sk, long timeout) diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c index d6a0f7a0b75d..272b535ecaad 100644 --- a/drivers/net/ovpn/udp.c +++ b/drivers/net/ovpn/udp.c @@ -386,6 +386,7 @@ int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock, struct socket *sock, struct ovpn_priv *ovpn) { struct udp_tunnel_sock_cfg cfg = { + .sk_user_data = ovpn_sock, .encap_type = UDP_ENCAP_OVPNINUDP, .encap_rcv = ovpn_udp_encap_recv, .encap_destroy = ovpn_udp_encap_destroy, From a5ec7baa44ea3a1d6aa0ca31c0ad82edf9affe41 Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Fri, 30 Jan 2026 18:32:49 +0100 Subject: [PATCH 02/85] ovpn: fix possible use-after-free in ovpn_net_xmit When building the skb_list in ovpn_net_xmit, skb_share_check will free the original skb if it is shared. The current implementation continues to use the stale skb pointer for subsequent operations: - peer lookup, - skb_dst_drop (even though all segments produced by skb_gso_segment will have a dst attached), - ovpn_peer_stats_increment_tx. Fix this by moving the peer lookup and skb_dst_drop before segmentation so that the original skb is still valid when used. Return early if all segments fail skb_share_check and the list ends up empty. Also switch ovpn_peer_stats_increment_tx to use skb_list.next; the next patch fixes the stats logic. Fixes: 08857b5ec5d9 ("ovpn: implement basic TX path (UDP)") Signed-off-by: Ralf Lici Reviewed-by: Sabrina Dubroca Signed-off-by: Antonio Quartulli --- drivers/net/ovpn/io.c | 54 +++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index 3e9e7f8444b3..f70c58b10599 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -365,7 +365,27 @@ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) /* verify IP header size in network packet */ proto = ovpn_ip_check_protocol(skb); if (unlikely(!proto || skb->protocol != proto)) - goto drop; + goto drop_no_peer; + + /* retrieve peer serving the destination IP of this packet */ + peer = ovpn_peer_get_by_dst(ovpn, skb); + if (unlikely(!peer)) { + switch (skb->protocol) { + case htons(ETH_P_IP): + net_dbg_ratelimited("%s: no peer to send data to dst=%pI4\n", + netdev_name(ovpn->dev), + &ip_hdr(skb)->daddr); + break; + case htons(ETH_P_IPV6): + net_dbg_ratelimited("%s: no peer to send data to dst=%pI6c\n", + netdev_name(ovpn->dev), + &ipv6_hdr(skb)->daddr); + break; + } + goto drop_no_peer; + } + /* dst was needed for peer selection - it can now be dropped */ + skb_dst_drop(skb); if (skb_is_gso(skb)) { segments = skb_gso_segment(skb, 0); @@ -396,34 +416,24 @@ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) __skb_queue_tail(&skb_list, curr); } + + /* no segments survived: don't jump to 'drop' because we already + * incremented the counter for each failure in the loop + */ + if (unlikely(skb_queue_empty(&skb_list))) { + ovpn_peer_put(peer); + return NETDEV_TX_OK; + } skb_list.prev->next = NULL; - /* retrieve peer serving the destination IP of this packet */ - peer = ovpn_peer_get_by_dst(ovpn, skb); - if (unlikely(!peer)) { - switch (skb->protocol) { - case htons(ETH_P_IP): - net_dbg_ratelimited("%s: no peer to send data to dst=%pI4\n", - netdev_name(ovpn->dev), - &ip_hdr(skb)->daddr); - break; - case htons(ETH_P_IPV6): - net_dbg_ratelimited("%s: no peer to send data to dst=%pI6c\n", - netdev_name(ovpn->dev), - &ipv6_hdr(skb)->daddr); - break; - } - goto drop; - } - /* dst was needed for peer selection - it can now be dropped */ - skb_dst_drop(skb); - - ovpn_peer_stats_increment_tx(&peer->vpn_stats, skb->len); + ovpn_peer_stats_increment_tx(&peer->vpn_stats, skb_list.next->len); ovpn_send(ovpn, skb_list.next, peer); return NETDEV_TX_OK; drop: + ovpn_peer_put(peer); +drop_no_peer: dev_dstats_tx_dropped(ovpn->dev); skb_tx_error(skb); kfree_skb_list(skb); From b660b13d4c6379ca6360f24aaef8c5807fefd237 Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Fri, 30 Jan 2026 18:32:50 +0100 Subject: [PATCH 03/85] ovpn: fix VPN TX bytes counting In ovpn_net_xmit, after GSO segmentation and segment processing, the first segment on the list is used to increment VPN TX statistics, which fails to account for any subsequent segments in the chain. Fix this by accumulating the length of every segment that successfully passes skb_share_check into a tx_bytes variable. This ensures the peer statistics accurately reflect the total data volume sent, regardless of whether the original packet was segmented. Fixes: 04ca14955f9a ("ovpn: store tunnel and transport statistics") Signed-off-by: Ralf Lici Reviewed-by: Sabrina Dubroca Signed-off-by: Antonio Quartulli --- drivers/net/ovpn/io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index f70c58b10599..955c9a37e1f8 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -355,6 +355,7 @@ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) struct ovpn_priv *ovpn = netdev_priv(dev); struct sk_buff *segments, *curr, *next; struct sk_buff_head skb_list; + unsigned int tx_bytes = 0; struct ovpn_peer *peer; __be16 proto; int ret; @@ -414,6 +415,8 @@ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) continue; } + /* only count what we actually send */ + tx_bytes += curr->len; __skb_queue_tail(&skb_list, curr); } @@ -426,7 +429,7 @@ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) } skb_list.prev->next = NULL; - ovpn_peer_stats_increment_tx(&peer->vpn_stats, skb_list.next->len); + ovpn_peer_stats_increment_tx(&peer->vpn_stats, tx_bytes); ovpn_send(ovpn, skb_list.next, peer); return NETDEV_TX_OK; From 056166710543a01ebcf25449bb462388e44b6ef6 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 5 Feb 2026 10:23:04 +0100 Subject: [PATCH 04/85] net: phy: initialize the port support based on the PHY's for OF ports With the phy_port infrastructure came an ethernet-connector binding, allowing to represent the MDI of a PHY in devicetree. This allows specifying the mediums and pairs of a port. Let's initialize the port's supported list based on what the PHY reports, so that we can then filter it with what the connector allows using. Signed-off-by: Maxime Chevallier Reviewed-by: Christophe Leroy (CS GROUP) Link: https://patch.msgid.link/20260205092317.755906-2-maxime.chevallier@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 8a3eb1839a3d..9b8eaac63b90 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3499,6 +3499,9 @@ static int of_phy_ports(struct phy_device *phydev) port->parent_type = PHY_PORT_PHY; port->phy = phydev; + + linkmode_copy(port->supported, phydev->supported); + err = phy_add_port(phydev, port); if (err) { phy_port_destroy(port); From 6248f3dc4ec536163bf8e1761f3c08d6f31c3cfe Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 5 Feb 2026 10:23:05 +0100 Subject: [PATCH 05/85] net: phy: phy_port: Cleanup the of-parsing logic for phy_port We don't need to maintain a mediums bitfield, let's drop it and drop a bogus check for empty mediums, as we already check it above. Signed-off-by: Maxime Chevallier Reviewed-by: Christophe Leroy (CS GROUP) Link: https://patch.msgid.link/20260205092317.755906-3-maxime.chevallier@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_port.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/phy_port.c b/drivers/net/phy/phy_port.c index ec93c8ca051e..a269e9ea12c4 100644 --- a/drivers/net/phy/phy_port.c +++ b/drivers/net/phy/phy_port.c @@ -53,7 +53,7 @@ struct phy_port *phy_of_parse_port(struct device_node *dn) enum ethtool_link_medium medium; struct phy_port *port; const char *med_str; - u32 pairs = 0, mediums = 0; + u32 pairs = 0; int ret; ret = fwnode_property_read_string(fwnode, "media", &med_str); @@ -85,17 +85,12 @@ struct phy_port *phy_of_parse_port(struct device_node *dn) return ERR_PTR(-EINVAL); } - mediums |= BIT(medium); - - if (!mediums) - return ERR_PTR(-EINVAL); - port = phy_port_alloc(); if (!port) return ERR_PTR(-ENOMEM); port->pairs = pairs; - port->mediums = mediums; + port->mediums = BIT(medium); return port; } From 4cebb26ac6f02b49b8a7c6974af3b4f23685e1a2 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 5 Feb 2026 10:23:06 +0100 Subject: [PATCH 06/85] net: phy: phy_port: Correctly recompute the port's linkmodes a PHY-driven phy_port contains a 'supported' field containing the linkmodes available on this port. This is populated based on : - The PHY's reported features - The DT representation of the connector - The PHY's attach_mdi() callback As these different attrbution methods work in conjunction, the helper phy_port_update_supported() recomputes the final 'supported' value based on the populated mediums, linkmodes and pairs. However this recompute wasn't correctly implemented, and added more modes than necessary by or'ing the medium-specific modes to the existing support. Let's fix this and properly filter the modes. Fixes: 589e934d2735 ("net: phy: Introduce PHY ports representation") Signed-off-by: Maxime Chevallier Reviewed-by: Christophe Leroy (CS GROUP) Link: https://patch.msgid.link/20260205092317.755906-4-maxime.chevallier@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_port.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/phy_port.c b/drivers/net/phy/phy_port.c index a269e9ea12c4..63d1bb154dc7 100644 --- a/drivers/net/phy/phy_port.c +++ b/drivers/net/phy/phy_port.c @@ -108,16 +108,10 @@ EXPORT_SYMBOL_GPL(phy_of_parse_port); */ void phy_port_update_supported(struct phy_port *port) { - __ETHTOOL_DECLARE_LINK_MODE_MASK(supported) = { 0 }; + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported) = {0}; unsigned long mode; int i; - for_each_set_bit(i, &port->mediums, __ETHTOOL_LINK_MEDIUM_LAST) { - linkmode_zero(supported); - phy_caps_medium_get_supported(supported, i, port->pairs); - linkmode_or(port->supported, port->supported, supported); - } - /* If there's no pairs specified, we grab the default number of * pairs as the max of the default pairs for each linkmode */ @@ -127,6 +121,22 @@ void phy_port_update_supported(struct phy_port *port) port->pairs = max_t(int, port->pairs, ethtool_linkmode_n_pairs(mode)); + for_each_set_bit(i, &port->mediums, __ETHTOOL_LINK_MEDIUM_LAST) { + __ETHTOOL_DECLARE_LINK_MODE_MASK(med_supported) = {0}; + + phy_caps_medium_get_supported(med_supported, i, port->pairs); + linkmode_or(supported, supported, med_supported); + } + + /* If port->supported is already populated, filter it out with the + * medium/pair support. Otherwise, let's just use this medium-based + * support as the port's supported list. + */ + if (linkmode_empty(port->supported)) + linkmode_copy(port->supported, supported); + else + linkmode_and(port->supported, supported, port->supported); + /* Serdes ports supported through SFP may not have any medium set, * as they will output PHY_INTERFACE_MODE_XXX modes. In that case, derive * the supported list based on these interfaces From 7c375811b5117dd26e36fc508cff1fb5cd4121aa Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Thu, 12 Feb 2026 13:50:28 +0800 Subject: [PATCH 07/85] myri10ge: replace comma with semicolons Commit fd24173439c0 ("myri10ge: avoid uninitialized variable use") added commas instead of semicolons Acked-by: Arnd Bergmann Signed-off-by: Chen Ni Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20260212055028.3248491-1-nichen@iscas.ac.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 2f0cdbd4e2ac..d9b5d7999370 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -688,9 +688,9 @@ static int myri10ge_get_firmware_capabilities(struct myri10ge_priv *mgp) /* probe for IPv6 TSO support */ mgp->features = NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_TSO; - cmd.data0 = 0, - cmd.data1 = 0, - cmd.data2 = 0, + cmd.data0 = 0; + cmd.data1 = 0; + cmd.data2 = 0; status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd, 0); if (status == 0) { @@ -821,9 +821,9 @@ static int myri10ge_change_pause(struct myri10ge_priv *mgp, int pause) int status, ctl; ctl = pause ? MXGEFW_ENABLE_FLOW_CONTROL : MXGEFW_DISABLE_FLOW_CONTROL; - cmd.data0 = 0, - cmd.data1 = 0, - cmd.data2 = 0, + cmd.data0 = 0; + cmd.data1 = 0; + cmd.data2 = 0; status = myri10ge_send_cmd(mgp, ctl, &cmd, 0); if (status) { From a6a9bc544b675d8b5180f2718ec985ad267b5cbf Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 9 Feb 2026 15:27:33 +0800 Subject: [PATCH 08/85] net: mctp: ensure our nlmsg responses are initialised Syed Faraz Abrar (@farazsth98) from Zellic, and Pumpkin (@u1f383) from DEVCORE Research Team working with Trend Micro Zero Day Initiative report that a RTM_GETNEIGH will return uninitalised data in the pad bytes of the ndmsg data. Ensure we're initialising the netlink data to zero, in the link, addr and neigh response messages. Fixes: 831119f88781 ("mctp: Add neighbour netlink interface") Fixes: 06d2f4c583a7 ("mctp: Add netlink route management") Fixes: 583be982d934 ("mctp: Add device handling and netlink interface") Signed-off-by: Jeremy Kerr Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260209-dev-mctp-nlmsg-v1-1-f1e30c346a43@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/device.c | 1 + net/mctp/neigh.c | 1 + net/mctp/route.c | 1 + 3 files changed, 3 insertions(+) diff --git a/net/mctp/device.c b/net/mctp/device.c index 4d404edd7446..04c5570bacff 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -70,6 +70,7 @@ static int mctp_fill_addrinfo(struct sk_buff *skb, return -EMSGSIZE; hdr = nlmsg_data(nlh); + memset(hdr, 0, sizeof(*hdr)); hdr->ifa_family = AF_MCTP; hdr->ifa_prefixlen = 0; hdr->ifa_flags = 0; diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c index 05b899f22d90..fc85f0e69301 100644 --- a/net/mctp/neigh.c +++ b/net/mctp/neigh.c @@ -218,6 +218,7 @@ static int mctp_fill_neigh(struct sk_buff *skb, u32 portid, u32 seq, int event, return -EMSGSIZE; hdr = nlmsg_data(nlh); + memset(hdr, 0, sizeof(*hdr)); hdr->ndm_family = AF_MCTP; hdr->ndm_ifindex = dev->ifindex; hdr->ndm_state = 0; // TODO other state bits? diff --git a/net/mctp/route.c b/net/mctp/route.c index 2ac4011a953f..ecbbe4beb213 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -1643,6 +1643,7 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, return -EMSGSIZE; hdr = nlmsg_data(nlh); + memset(hdr, 0, sizeof(*hdr)); hdr->rtm_family = AF_MCTP; /* we use the _len fields as a number of EIDs, rather than From a2646773a005b59fd1dc7ff3ba15df84889ca5d2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 9 Feb 2026 14:53:53 +0100 Subject: [PATCH 09/85] selftests: mlxsw: tc_restrictions: Fix test failure with new iproute2 As explained in [1], iproute2 started rejecting tc-police burst sizes that result in an overflow. This can happen when the burst size is high enough and the rate is low enough. A couple of test cases specify such configurations, resulting in iproute2 errors and test failure. Fix by reducing the burst size so that the test will pass with both new and old iproute2 versions. [1] https://lore.kernel.org/netdev/20250916215731.3431465-1-jay.vosburgh@canonical.com/ Fixes: cb12d1763267 ("selftests: mlxsw: tc_restrictions: Test tc-police restrictions") Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/88b00c6e85188aa6a065dc240206119b328c46e1.1770643998.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh index 0441a18f098b..aac8ef490feb 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -317,7 +317,7 @@ police_limits_test() tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \ flower skip_sw \ - action police rate 0.5kbit burst 1m conform-exceed drop/ok + action police rate 0.5kbit burst 2k conform-exceed drop/ok check_fail $? "Incorrect success to add police action with too low rate" tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \ @@ -327,7 +327,7 @@ police_limits_test() tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \ flower skip_sw \ - action police rate 1.5kbit burst 1m conform-exceed drop/ok + action police rate 1.5kbit burst 2k conform-exceed drop/ok check_err $? "Failed to add police action with low rate" tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower From fe6515f5952c166bdc4efa2090609e68018338a4 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 11 Feb 2026 11:20:21 +0800 Subject: [PATCH 10/85] bng_en: Remove duplicate include Remove duplicate inclusion of . Signed-off-by: Chen Ni Reviewed-by: Vikas Gupta Link: https://patch.msgid.link/20260211032021.2719742-1-nichen@iscas.ac.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnge/bnge_netdev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_netdev.c b/drivers/net/ethernet/broadcom/bnge/bnge_netdev.c index 6ab317f1c16e..b8e258842c61 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_netdev.c +++ b/drivers/net/ethernet/broadcom/bnge/bnge_netdev.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include From 10ec0fc0ccc525abc807b0ca8ad5a26a0bd56361 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 11 Feb 2026 10:21:46 +0800 Subject: [PATCH 11/85] selftests: net: lib: Fix jq parsing error The testcase failed as below: $./vlan_bridge_binding.sh ... + adf_ip_link_set_up d1 + local name=d1 + shift + ip_link_is_up d1 + ip_link_has_flag d1 UP + local name=d1 + shift + local flag=UP + shift ++ ip -j link show d1 ++ jq --arg flag UP 'any(.[].flags.[]; . == $flag)' jq: error: syntax error, unexpected '[', expecting FORMAT or QQSTRING_START (Unix shell quoting issues?) at , line 1: any(.[].flags.[]; . == $flag) jq: 1 compile error Remove the extra dot (.) after flags array to fix this. Fixes: 4baa1d3a5080 ("selftests: net: lib: Add ip_link_has_flag()") Signed-off-by: Yue Haibing Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260211022146.190948-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 0ec131b339bc..b40694573f4c 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -577,7 +577,7 @@ ip_link_has_flag() local flag=$1; shift local state=$(ip -j link show "$name" | - jq --arg flag "$flag" 'any(.[].flags.[]; . == $flag)') + jq --arg flag "$flag" 'any(.[].flags[]; . == $flag)') [[ $state == true ]] } From ed6788c5a7614d00321bc4c88fdb9d83fcba0e02 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 10 Feb 2026 11:31:10 +0200 Subject: [PATCH 12/85] selftests: drv-net: limit RPS test CPUs to supported range The _get_unused_cpus() function can return CPU numbers >= 16, which exceeds RPS_MAX_CPUS in toeplitz.c. When this happens, the test fails with a cryptic message: # Exception| Traceback (most recent call last): # Exception| File "/tmp/cur/linux/tools/testing/selftests/net/lib/py/ksft.py", line 319, in ksft_run # Exception| func(*args) # Exception| File "/tmp/cur/linux/tools/testing/selftests/drivers/net/hw/toeplitz.py", line 189, in test # Exception| with bkg(" ".join(rx_cmd), ksft_ready=True, exit_wait=True) as rx_proc: # Exception| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # Exception| File "/tmp/cur/linux/tools/testing/selftests/net/lib/py/utils.py", line 124, in __init__ # Exception| super().__init__(comm, background=True, # Exception| File "/tmp/cur/linux/tools/testing/selftests/net/lib/py/utils.py", line 77, in __init__ # Exception| raise Exception("Did not receive ready message") # Exception| Exception: Did not receive ready message Rename _get_unused_cpus() to _get_unused_rps_cpus() and cap the CPU search range to RPS_MAX_CPUS. Reviewed-by: Nimrod Oren Signed-off-by: Gal Pressman Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260210093110.1935149-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/hw/toeplitz.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.py b/tools/testing/selftests/drivers/net/hw/toeplitz.py index d288c57894f6..cd7e080e6f84 100755 --- a/tools/testing/selftests/drivers/net/hw/toeplitz.py +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.py @@ -19,6 +19,8 @@ from lib.py import ksft_variants, KsftNamedVariant, KsftSkipEx, KsftFailEx # "define" for the ID of the Toeplitz hash function ETH_RSS_HASH_TOP = 1 +# Must match RPS_MAX_CPUS in toeplitz.c +RPS_MAX_CPUS = 16 def _check_rps_and_rfs_not_configured(cfg): @@ -67,23 +69,24 @@ def _get_irq_cpus(cfg): return cpus -def _get_unused_cpus(cfg, count=2): +def _get_unused_rps_cpus(cfg, count=2): """ - Get CPUs that are not used by Rx queues. - Returns a list of at least 'count' CPU numbers. + Get CPUs that are not used by Rx queues for RPS. + Returns a list of at least 'count' CPU numbers within + the RPS_MAX_CPUS supported range. """ # Get CPUs used by Rx queues rx_cpus = set(_get_irq_cpus(cfg)) - # Get total number of CPUs - num_cpus = os.cpu_count() + # Get total number of CPUs, capped by RPS_MAX_CPUS + num_cpus = min(os.cpu_count(), RPS_MAX_CPUS) # Find unused CPUs unused_cpus = [cpu for cpu in range(num_cpus) if cpu not in rx_cpus] if len(unused_cpus) < count: - raise KsftSkipEx(f"Need at {count} CPUs not used by Rx queues, found {len(unused_cpus)}") + raise KsftSkipEx(f"Need at least {count} CPUs in range 0..{num_cpus - 1} not used by Rx queues, found {len(unused_cpus)}") return unused_cpus[:count] @@ -181,7 +184,7 @@ def test(cfg, proto_flag, ipver, grp): ksft_pr(f"RSS using CPUs: {irq_cpus}") elif grp == "rps": # Get CPUs not used by Rx queues and configure them for RPS - rps_cpus = _get_unused_cpus(cfg, count=2) + rps_cpus = _get_unused_rps_cpus(cfg, count=2) rps_mask = _configure_rps(cfg, rps_cpus) defer(_configure_rps, cfg, []) rx_cmd += ["-r", rps_mask] From d03e094473ecdeb68d853752ba467abe13e1de44 Mon Sep 17 00:00:00 2001 From: Ethan Nelson-Moore Date: Mon, 9 Feb 2026 18:12:34 -0800 Subject: [PATCH 13/85] net: intel: fix PCI device ID conflict between i40e and ipw2200 The ID 8086:104f is matched by both i40e and ipw2200. The same device ID should not be in more than one driver, because in that case, which driver is used is unpredictable. Fix this by taking advantage of the fact that i40e devices use PCI_CLASS_NETWORK_ETHERNET and ipw2200 devices use PCI_CLASS_NETWORK_OTHER to differentiate the devices. Fixes: 2e45d3f4677a ("i40e: Add support for X710 B/P & SFP+ cards") Cc: stable@vger.kernel.org Acked-by: Johannes Berg Signed-off-by: Ethan Nelson-Moore Reviewed-by: Aleksandr Loktionov Link: https://patch.msgid.link/20260210021235.16315-1-enelsonmoore@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 8 +++++++- drivers/net/wireless/intel/ipw2x00/ipw2200.c | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index d3bc3207054f..02de186dcc8f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -75,7 +75,13 @@ static const struct pci_device_id i40e_pci_tbl[] = { {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_BASE_T4), 0}, {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_BASE_T_BC), 0}, {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_SFP), 0}, - {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_B), 0}, + /* + * This ID conflicts with ipw2200, but the devices can be differentiated + * because i40e devices use PCI_CLASS_NETWORK_ETHERNET and ipw2200 + * devices use PCI_CLASS_NETWORK_OTHER. + */ + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, I40E_DEV_ID_10G_B), + PCI_CLASS_NETWORK_ETHERNET << 8, 0xffff00, 0}, {PCI_VDEVICE(INTEL, I40E_DEV_ID_KX_X722), 0}, {PCI_VDEVICE(INTEL, I40E_DEV_ID_QSFP_X722), 0}, {PCI_VDEVICE(INTEL, I40E_DEV_ID_SFP_X722), 0}, diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index 09035a77e775..b0e769da9415 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -11387,7 +11387,13 @@ static const struct pci_device_id card_ids[] = { {PCI_VENDOR_ID_INTEL, 0x1043, 0x8086, 0x2754, 0, 0, 0}, {PCI_VENDOR_ID_INTEL, 0x1043, 0x8086, 0x2761, 0, 0, 0}, {PCI_VENDOR_ID_INTEL, 0x1043, 0x8086, 0x2762, 0, 0, 0}, - {PCI_VDEVICE(INTEL, 0x104f), 0}, + /* + * This ID conflicts with i40e, but the devices can be differentiated + * because i40e devices use PCI_CLASS_NETWORK_ETHERNET and ipw2200 + * devices use PCI_CLASS_NETWORK_OTHER. + */ + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x104f), + PCI_CLASS_NETWORK_OTHER << 8, 0xffff00, 0}, {PCI_VDEVICE(INTEL, 0x4220), 0}, /* BG */ {PCI_VDEVICE(INTEL, 0x4221), 0}, /* BG */ {PCI_VDEVICE(INTEL, 0x4223), 0}, /* ABG */ From babab1b42ed68877ef669a08384becf281ad2582 Mon Sep 17 00:00:00 2001 From: Jie Zhang Date: Mon, 9 Feb 2026 17:50:32 -0500 Subject: [PATCH 14/85] net: stmmac: fix oops when split header is enabled For GMAC4, when split header is enabled, in some rare cases, the hardware does not fill buf2 of the first descriptor with payload. Thus we cannot assume buf2 is always fully filled if it is not the last descriptor. Otherwise, the length of buf2 of the second descriptor will be calculated wrong and cause an oops: Unable to handle kernel paging request at virtual address ffff00019246bfc0 ... x2 : 0000000000000040 x1 : ffff00019246bfc0 x0 : ffff00009246c000 Call trace: dcache_inval_poc+0x28/0x58 (P) dma_direct_sync_single_for_cpu+0x38/0x6c __dma_sync_single_for_cpu+0x34/0x6c stmmac_napi_poll_rx+0x8f0/0xb60 __napi_poll.constprop.0+0x30/0x144 net_rx_action+0x160/0x274 handle_softirqs+0x1b8/0x1fc ... To fix this, the PL bit-field in RDES3 register is used for all descriptors, whether it is the last descriptor or not. Fixes: ec222003bd94 ("net: stmmac: Prepare to add Split Header support") Reviewed-by: Jacob Keller Signed-off-by: Jie Zhang Link: https://patch.msgid.link/20260209225037.589130-1-jie.zhang@analog.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index c63099a77cc0..82375d34ad57 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5040,13 +5040,27 @@ static unsigned int stmmac_rx_buf2_len(struct stmmac_priv *priv, if (!priv->sph_active) return 0; - /* Not last descriptor */ - if (status & rx_not_ls) + /* For GMAC4, when split header is enabled, in some rare cases, the + * hardware does not fill buf2 of the first descriptor with payload. + * Thus we cannot assume buf2 is always fully filled if it is not + * the last descriptor. Otherwise, the length of buf2 of the second + * descriptor will be calculated wrong and cause an oops. + * + * If this is the last descriptor, 'plen' is the length of the + * received packet that was transferred to system memory. + * Otherwise, it is the accumulated number of bytes that have been + * transferred for the current packet. + * + * Thus 'plen - len' always gives the correct length of buf2. + */ + + /* Not GMAC4 and not last descriptor */ + if (priv->plat->core_type != DWMAC_CORE_GMAC4 && (status & rx_not_ls)) return priv->dma_conf.dma_buf_sz; + /* GMAC4 or last descriptor */ plen = stmmac_get_rx_frame_len(priv, p, coe); - /* Last descriptor */ return plen - len; } From 8930878101cd40063888a68af73b1b0f8b6c79bc Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Tue, 10 Feb 2026 17:45:37 +0800 Subject: [PATCH 15/85] atm: fore200e: fix use-after-free in tasklets during device removal When the PCA-200E or SBA-200E adapter is being detached, the fore200e is deallocated. However, the tx_tasklet or rx_tasklet may still be running or pending, leading to use-after-free bug when the already freed fore200e is accessed again in fore200e_tx_tasklet() or fore200e_rx_tasklet(). One of the race conditions can occur as follows: CPU 0 (cleanup) | CPU 1 (tasklet) fore200e_pca_remove_one() | fore200e_interrupt() fore200e_shutdown() | tasklet_schedule() kfree(fore200e) | fore200e_tx_tasklet() | fore200e-> // UAF Fix this by ensuring tx_tasklet or rx_tasklet is properly canceled before the fore200e is released. Add tasklet_kill() in fore200e_shutdown() to synchronize with any pending or running tasklets. Moreover, since fore200e_reset() could prevent further interrupts or data transfers, the tasklet_kill() should be placed after fore200e_reset() to prevent the tasklet from being rescheduled in fore200e_interrupt(). Finally, it only needs to do tasklet_kill() when the fore200e state is greater than or equal to FORE200E_STATE_IRQ, since tasklets are uninitialized in earlier states. In a word, the tasklet_kill() should be placed in the FORE200E_STATE_IRQ branch within the switch...case structure. This bug was identified through static analysis. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@kernel.org Suggested-by: Jijie Shao Signed-off-by: Duoming Zhou Reviewed-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260210094537.9767-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski --- drivers/atm/fore200e.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index f62e38571440..fec081db36dc 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -373,6 +373,10 @@ fore200e_shutdown(struct fore200e* fore200e) fallthrough; case FORE200E_STATE_IRQ: free_irq(fore200e->irq, fore200e->atm_dev); +#ifdef FORE200E_USE_TASKLET + tasklet_kill(&fore200e->tx_tasklet); + tasklet_kill(&fore200e->rx_tasklet); +#endif fallthrough; case FORE200E_STATE_ALLOC_BUF: From 6c28aa8dfdf24f554d4c5d4ff7d723a95360d94a Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Tue, 10 Feb 2026 14:44:01 +0100 Subject: [PATCH 16/85] net: sparx5/lan969x: fix DWRR cost max to match hardware register width DWRR (Deficit Weighted Round Robin) scheduling distributes bandwidth across traffic classes based on per-queue cost values, where lower cost means higher bandwidth share. The SPX5_DWRR_COST_MAX constant is 63 (6 bits) but the hardware register field HSCH_DWRR_ENTRY_DWRR_COST is GENMASK(24, 20), only 5 bits wide (max 31). This causes sparx5_weight_to_hw_cost() to compute cost values that silently overflow via FIELD_PREP, resulting in incorrect scheduling weights. Set SPX5_DWRR_COST_MAX to 31 to match the hardware register width. Fixes: 211225428d65 ("net: microchip: sparx5: add support for offloading ets qdisc") Signed-off-by: Daniel Machon Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260210-sparx5-fix-dwrr-cost-max-v1-1-58fbdbc25652@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_qos.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_qos.h b/drivers/net/ethernet/microchip/sparx5/sparx5_qos.h index 1231a80335d7..04f76f1e23f6 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_qos.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_qos.h @@ -35,7 +35,7 @@ #define SPX5_SE_BURST_UNIT 4096 /* Dwrr */ -#define SPX5_DWRR_COST_MAX 63 +#define SPX5_DWRR_COST_MAX 31 struct sparx5_shaper { u32 mode; From 29372f07f7969a2f0490793226ecf6c8c6bde0fa Mon Sep 17 00:00:00 2001 From: Ziyi Guo Date: Sun, 8 Feb 2026 22:56:00 +0000 Subject: [PATCH 17/85] net: mscc: ocelot: extract ocelot_xmit_timestamp() helper Extract the PTP timestamp handling logic from ocelot_port_xmit() into a separate ocelot_xmit_timestamp() helper function. This is a pure refactor with no behavioral change. The helper returns false if the skb was consumed (freed) due to a timestamp request failure, and true if the caller should continue with frame injection. The rew_op value is returned via pointer. This prepares for splitting ocelot_port_xmit() into separate FDMA and register injection paths in a subsequent patch. Signed-off-by: Ziyi Guo Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20260208225602.1339325-2-n7l8m4@u.northwestern.edu Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_net.c | 36 ++++++++++++++++---------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 469784d3a1a6..ef4a6c768de9 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -551,6 +551,26 @@ static int ocelot_port_stop(struct net_device *dev) return 0; } +static bool ocelot_xmit_timestamp(struct ocelot *ocelot, int port, + struct sk_buff *skb, u32 *rew_op) +{ + if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { + struct sk_buff *clone = NULL; + + if (ocelot_port_txtstamp_request(ocelot, port, skb, &clone)) { + kfree_skb(skb); + return false; + } + + if (clone) + OCELOT_SKB_CB(skb)->clone = clone; + + *rew_op = ocelot_ptp_rew_op(skb); + } + + return true; +} + static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) { struct ocelot_port_private *priv = netdev_priv(dev); @@ -563,20 +583,8 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) !ocelot_can_inject(ocelot, 0)) return NETDEV_TX_BUSY; - /* Check if timestamping is needed */ - if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { - struct sk_buff *clone = NULL; - - if (ocelot_port_txtstamp_request(ocelot, port, skb, &clone)) { - kfree_skb(skb); - return NETDEV_TX_OK; - } - - if (clone) - OCELOT_SKB_CB(skb)->clone = clone; - - rew_op = ocelot_ptp_rew_op(skb); - } + if (!ocelot_xmit_timestamp(ocelot, port, skb, &rew_op)) + return NETDEV_TX_OK; if (static_branch_unlikely(&ocelot_fdma_enabled)) { ocelot_fdma_inject_frame(ocelot, port, rew_op, skb, dev); From 47f79b20e7fb885aa1623b759a68e8e27401ec4d Mon Sep 17 00:00:00 2001 From: Ziyi Guo Date: Sun, 8 Feb 2026 22:56:01 +0000 Subject: [PATCH 18/85] net: mscc: ocelot: split xmit into FDMA and register injection paths Split ocelot_port_xmit() into two separate functions: - ocelot_port_xmit_fdma(): handles the FDMA injection path - ocelot_port_xmit_inj(): handles the register-based injection path The top-level ocelot_port_xmit() now dispatches to the appropriate function based on the ocelot_fdma_enabled static key. This is a pure refactor with no behavioral change. Separating the two code paths makes each one simpler and prepares for adding proper locking to the register injection path without affecting the FDMA path. Signed-off-by: Ziyi Guo Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20260208225602.1339325-3-n7l8m4@u.northwestern.edu Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_net.c | 39 ++++++++++++++++++++------ 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index ef4a6c768de9..a7966c174b2e 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -571,7 +571,8 @@ static bool ocelot_xmit_timestamp(struct ocelot *ocelot, int port, return true; } -static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ocelot_port_xmit_fdma(struct sk_buff *skb, + struct net_device *dev) { struct ocelot_port_private *priv = netdev_priv(dev); struct ocelot_port *ocelot_port = &priv->port; @@ -579,24 +580,44 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) int port = priv->port.index; u32 rew_op = 0; - if (!static_branch_unlikely(&ocelot_fdma_enabled) && - !ocelot_can_inject(ocelot, 0)) + if (!ocelot_xmit_timestamp(ocelot, port, skb, &rew_op)) + return NETDEV_TX_OK; + + ocelot_fdma_inject_frame(ocelot, port, rew_op, skb, dev); + + return NETDEV_TX_OK; +} + +static netdev_tx_t ocelot_port_xmit_inj(struct sk_buff *skb, + struct net_device *dev) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; + int port = priv->port.index; + u32 rew_op = 0; + + if (!ocelot_can_inject(ocelot, 0)) return NETDEV_TX_BUSY; if (!ocelot_xmit_timestamp(ocelot, port, skb, &rew_op)) return NETDEV_TX_OK; - if (static_branch_unlikely(&ocelot_fdma_enabled)) { - ocelot_fdma_inject_frame(ocelot, port, rew_op, skb, dev); - } else { - ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); + ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); - consume_skb(skb); - } + consume_skb(skb); return NETDEV_TX_OK; } +static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) +{ + if (static_branch_unlikely(&ocelot_fdma_enabled)) + return ocelot_port_xmit_fdma(skb, dev); + + return ocelot_port_xmit_inj(skb, dev); +} + enum ocelot_action_type { OCELOT_MACT_LEARN, OCELOT_MACT_FORGET, From 026f6513c5880c2c89e38ad66bbec2868f978605 Mon Sep 17 00:00:00 2001 From: Ziyi Guo Date: Sun, 8 Feb 2026 22:56:02 +0000 Subject: [PATCH 19/85] net: mscc: ocelot: add missing lock protection in ocelot_port_xmit_inj() ocelot_port_xmit_inj() calls ocelot_can_inject() and ocelot_port_inject_frame() without holding the injection group lock. Both functions contain lockdep_assert_held() for the injection lock, and the correct caller felix_port_deferred_xmit() properly acquires the lock using ocelot_lock_inj_grp() before calling these functions. Add ocelot_lock_inj_grp()/ocelot_unlock_inj_grp() around the register injection path to fix the missing lock protection. The FDMA path is not affected as it uses its own locking mechanism. Fixes: c5e12ac3beb0 ("net: mscc: ocelot: serialize access to the injection/extraction groups") Signed-off-by: Ziyi Guo Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20260208225602.1339325-4-n7l8m4@u.northwestern.edu Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_net.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index a7966c174b2e..1b8269320464 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -597,14 +597,22 @@ static netdev_tx_t ocelot_port_xmit_inj(struct sk_buff *skb, int port = priv->port.index; u32 rew_op = 0; - if (!ocelot_can_inject(ocelot, 0)) - return NETDEV_TX_BUSY; + ocelot_lock_inj_grp(ocelot, 0); - if (!ocelot_xmit_timestamp(ocelot, port, skb, &rew_op)) + if (!ocelot_can_inject(ocelot, 0)) { + ocelot_unlock_inj_grp(ocelot, 0); + return NETDEV_TX_BUSY; + } + + if (!ocelot_xmit_timestamp(ocelot, port, skb, &rew_op)) { + ocelot_unlock_inj_grp(ocelot, 0); return NETDEV_TX_OK; + } ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); + ocelot_unlock_inj_grp(ocelot, 0); + consume_skb(skb); return NETDEV_TX_OK; From a68a9bd086c2822d0c629443bd16ad1317afe501 Mon Sep 17 00:00:00 2001 From: Pin-yen Lin Date: Mon, 9 Feb 2026 16:59:36 -0800 Subject: [PATCH 20/85] selftests: netconsole: Increase port listening timeout wait_for_port() can wait up to 2 seconds with the sleep and the polling in wait_local_port_listen() combined. So, in netcons_basic.sh, the socat process could die before the test writes to the netconsole. Increase the timeout to 3 seconds to make netcons_basic.sh pass consistently. Fixes: 3dc6c76391cb ("selftests: net: Add IPv6 support to netconsole basic tests") Signed-off-by: Pin-yen Lin Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260210005939.3230550-1-treapking@google.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index b6093bcf2b06..02dcdeb723be 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -249,8 +249,8 @@ function listen_port_and_save_to() { SOCAT_MODE="UDP6-LISTEN" fi - # Just wait for 2 seconds - timeout 2 ip netns exec "${NAMESPACE}" \ + # Just wait for 3 seconds + timeout 3 ip netns exec "${NAMESPACE}" \ socat "${SOCAT_MODE}":"${PORT}",fork "${OUTPUT}" 2> /dev/null } From 6db8b56eed62baacaf37486e83378a72635c04cc Mon Sep 17 00:00:00 2001 From: Qanux Date: Wed, 11 Feb 2026 12:04:12 +0800 Subject: [PATCH 21/85] ipv6: ioam: fix heap buffer overflow in __ioam6_fill_trace_data() On the receive path, __ioam6_fill_trace_data() uses trace->nodelen to decide how much data to write for each node. It trusts this field as-is from the incoming packet, with no consistency check against trace->type (the 24-bit field that tells which data items are present). A crafted packet can set nodelen=0 while setting type bits 0-21, causing the function to write ~100 bytes past the allocated region (into skb_shared_info), which corrupts adjacent heap memory and leads to a kernel panic. Add a shared helper ioam6_trace_compute_nodelen() in ioam6.c to derive the expected nodelen from the type field, and use it: - in ioam6_iptunnel.c (send path, existing validation) to replace the open-coded computation; - in exthdrs.c (receive path, ipv6_hop_ioam) to drop packets whose nodelen is inconsistent with the type field, before any data is written. Per RFC 9197, bits 12-21 are each short (4-octet) fields, so they are included in IOAM6_MASK_SHORT_FIELDS (changed from 0xff100000 to 0xff1ffc00). Fixes: 9ee11f0fff20 ("ipv6: ioam: Data plane support for Pre-allocated Trace") Cc: stable@vger.kernel.org Signed-off-by: Junxi Qian Reviewed-by: Justin Iurman Link: https://patch.msgid.link/20260211040412.86195-1-qjx1298677004@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ioam6.h | 2 ++ net/ipv6/exthdrs.c | 5 +++++ net/ipv6/ioam6.c | 14 ++++++++++++++ net/ipv6/ioam6_iptunnel.c | 10 +--------- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/include/net/ioam6.h b/include/net/ioam6.h index 2cbbee6e806a..a75912fe247e 100644 --- a/include/net/ioam6.h +++ b/include/net/ioam6.h @@ -60,6 +60,8 @@ void ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_trace_hdr *trace, bool is_input); +u8 ioam6_trace_compute_nodelen(u32 trace_type); + int ioam6_init(void); void ioam6_exit(void); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 209fdf1b1aa9..5e3610a926cf 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -931,6 +931,11 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) goto drop; + /* Inconsistent Pre-allocated Trace header */ + if (trace->nodelen != + ioam6_trace_compute_nodelen(be32_to_cpu(trace->type_be32))) + goto drop; + /* Ignore if the IOAM namespace is unknown */ ns = ioam6_namespace(dev_net(skb->dev), trace->namespace_id); if (!ns) diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 9553a3200081..08b7ac8c99b7 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -690,6 +690,20 @@ struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id) return rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); } +#define IOAM6_MASK_SHORT_FIELDS 0xff1ffc00 +#define IOAM6_MASK_WIDE_FIELDS 0x00e00000 + +u8 ioam6_trace_compute_nodelen(u32 trace_type) +{ + u8 nodelen = hweight32(trace_type & IOAM6_MASK_SHORT_FIELDS) + * (sizeof(__be32) / 4); + + nodelen += hweight32(trace_type & IOAM6_MASK_WIDE_FIELDS) + * (sizeof(__be64) / 4); + + return nodelen; +} + static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_namespace *ns, struct ioam6_trace_hdr *trace, diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 1fe7894f14dd..b9f6d892a566 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -22,9 +22,6 @@ #include #include -#define IOAM6_MASK_SHORT_FIELDS 0xff100000 -#define IOAM6_MASK_WIDE_FIELDS 0xe00000 - struct ioam6_lwt_encap { struct ipv6_hopopt_hdr eh; u8 pad[2]; /* 2-octet padding for 4n-alignment */ @@ -93,13 +90,8 @@ static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) trace->type.bit21 | trace->type.bit23) return false; - trace->nodelen = 0; fields = be32_to_cpu(trace->type_be32); - - trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS) - * (sizeof(__be32) / 4); - trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS) - * (sizeof(__be64) / 4); + trace->nodelen = ioam6_trace_compute_nodelen(fields); return true; } From 8244f959e2c125c849e569f5b23ed49804cce695 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Feb 2026 17:50:21 +0000 Subject: [PATCH 22/85] ipv6: Fix out-of-bound access in fib6_add_rt2node(). syzbot reported out-of-bound read in fib6_add_rt2node(). [0] When IPv6 route is created with RTA_NH_ID, struct fib6_info does not have the trailing struct fib6_nh. The cited commit started to check !iter->fib6_nh->fib_nh_gw_family to ensure that rt6_qualify_for_ecmp() will return false for iter. If iter->nh is not NULL, rt6_qualify_for_ecmp() returns false anyway. Let's check iter->nh before reading iter->fib6_nh and avoid OOB read. [0]: BUG: KASAN: slab-out-of-bounds in fib6_add_rt2node+0x349c/0x3500 net/ipv6/ip6_fib.c:1142 Read of size 1 at addr ffff8880384ba6de by task syz.0.18/5500 CPU: 0 UID: 0 PID: 5500 Comm: syz.0.18 Not tainted syzkaller #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xba/0x230 mm/kasan/report.c:482 kasan_report+0x117/0x150 mm/kasan/report.c:595 fib6_add_rt2node+0x349c/0x3500 net/ipv6/ip6_fib.c:1142 fib6_add_rt2node_nh net/ipv6/ip6_fib.c:1363 [inline] fib6_add+0x910/0x18c0 net/ipv6/ip6_fib.c:1531 __ip6_ins_rt net/ipv6/route.c:1351 [inline] ip6_route_add+0xde/0x1b0 net/ipv6/route.c:3957 inet6_rtm_newroute+0x268/0x19e0 net/ipv6/route.c:5660 rtnetlink_rcv_msg+0x7d5/0xbe0 net/core/rtnetlink.c:6958 netlink_rcv_skb+0x232/0x4b0 net/netlink/af_netlink.c:2550 netlink_unicast_kernel net/netlink/af_netlink.c:1318 [inline] netlink_unicast+0x80f/0x9b0 net/netlink/af_netlink.c:1344 netlink_sendmsg+0x813/0xb40 net/netlink/af_netlink.c:1894 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg net/socket.c:742 [inline] ____sys_sendmsg+0xa68/0xad0 net/socket.c:2592 ___sys_sendmsg+0x2a5/0x360 net/socket.c:2646 __sys_sendmsg net/socket.c:2678 [inline] __do_sys_sendmsg net/socket.c:2683 [inline] __se_sys_sendmsg net/socket.c:2681 [inline] __x64_sys_sendmsg+0x1bd/0x2a0 net/socket.c:2681 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f9316b9aeb9 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd8809b678 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f9316e15fa0 RCX: 00007f9316b9aeb9 RDX: 0000000000000000 RSI: 0000200000004380 RDI: 0000000000000003 RBP: 00007f9316c08c1f R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f9316e15fac R14: 00007f9316e15fa0 R15: 00007f9316e15fa0 Allocated by task 5499: kasan_save_stack mm/kasan/common.c:57 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:78 poison_kmalloc_redzone mm/kasan/common.c:398 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:415 kasan_kmalloc include/linux/kasan.h:263 [inline] __do_kmalloc_node mm/slub.c:5657 [inline] __kmalloc_noprof+0x40c/0x7e0 mm/slub.c:5669 kmalloc_noprof include/linux/slab.h:961 [inline] kzalloc_noprof include/linux/slab.h:1094 [inline] fib6_info_alloc+0x30/0xf0 net/ipv6/ip6_fib.c:155 ip6_route_info_create+0x142/0x860 net/ipv6/route.c:3820 ip6_route_add+0x49/0x1b0 net/ipv6/route.c:3949 inet6_rtm_newroute+0x268/0x19e0 net/ipv6/route.c:5660 rtnetlink_rcv_msg+0x7d5/0xbe0 net/core/rtnetlink.c:6958 netlink_rcv_skb+0x232/0x4b0 net/netlink/af_netlink.c:2550 netlink_unicast_kernel net/netlink/af_netlink.c:1318 [inline] netlink_unicast+0x80f/0x9b0 net/netlink/af_netlink.c:1344 netlink_sendmsg+0x813/0xb40 net/netlink/af_netlink.c:1894 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg net/socket.c:742 [inline] ____sys_sendmsg+0xa68/0xad0 net/socket.c:2592 ___sys_sendmsg+0x2a5/0x360 net/socket.c:2646 __sys_sendmsg net/socket.c:2678 [inline] __do_sys_sendmsg net/socket.c:2683 [inline] __se_sys_sendmsg net/socket.c:2681 [inline] __x64_sys_sendmsg+0x1bd/0x2a0 net/socket.c:2681 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: bbf4a17ad9ff ("ipv6: Fix ECMP sibling count mismatch when clearing RTF_ADDRCONF") Reported-by: syzbot+707d6a5da1ab9e0c6f9d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/698cbfba.050a0220.2eeac1.009d.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Fernando Fernandez Mancera Reviewed-by: Shigeru Yoshida Link: https://patch.msgid.link/20260211175133.3657034-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9880d608392b..56058e6de490 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1139,7 +1139,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_add_gc_list(iter); } if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT)) && - !iter->fib6_nh->fib_nh_gw_family) { + (iter->nh || !iter->fib6_nh->fib_nh_gw_family)) { iter->fib6_flags &= ~RTF_ADDRCONF; iter->fib6_flags &= ~RTF_PREFIX_RT; } From a49d2a2c37a6252c41cbdd505f9d1c58d5a3817a Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Thu, 12 Feb 2026 12:02:30 +0100 Subject: [PATCH 23/85] net: sparx5/lan969x: fix PTP clock max_adj value The max_adj field in ptp_clock_info tells userspace how much the PHC clock frequency can be adjusted. ptp4l reads this and will never request a correction larger than max_adj. On both sparx5 and lan969x the clock offset may never converge because the servo needs a frequency correction larger than the current max_adj of 200000 (200 ppm) allows. The servo rails at the max and the offset stays in the tens of microseconds. The hardware has no inherent max adjustment limit; frequency correction is done by writing a 64-bit clock period increment to CLK_PER_CFG, and the register has plenty of range. The 200000 value was just an overly conservative software limit. The max_adj is shared between sparx5 and lan969x, and the increased value is safe for both. Fix this by increasing max_adj to 10000000 (10000 ppm), giving the servo sufficient headroom. Fixes: 0933bd04047c ("net: sparx5: Add support for ptp clocks") Signed-off-by: Daniel Machon Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20260212-sparx5-ptp-max-adj-v2-v1-1-06b200e50ce3@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c b/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c index 2f168700f63c..8b2e07821a95 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_ptp.c @@ -576,7 +576,7 @@ static int sparx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) static struct ptp_clock_info sparx5_ptp_clock_info = { .owner = THIS_MODULE, .name = "sparx5 ptp", - .max_adj = 200000, + .max_adj = 10000000, .gettime64 = sparx5_ptp_gettime64, .settime64 = sparx5_ptp_settime64, .adjtime = sparx5_ptp_adjtime, From 9dd391493a727464e9a03cfff9356c8e10b8da0b Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 12 Feb 2026 21:59:15 +0100 Subject: [PATCH 24/85] vsock: fix child netns mode initialization When a new network namespace is created, vsock_net_init() correctly initializes the namespace's mode by reading the parent's `child_ns_mode` via vsock_net_child_mode(). However, the `child_ns_mode` of the new namespace was always hardcoded to VSOCK_NET_MODE_GLOBAL, regardless of its own mode. This means that if a parent namespace has `child_ns_mode` set to "local", the child namespace correctly gets mode "local", but its `child_ns_mode` is reset to "global". As a result, further nested namespaces will incorrectly get mode "global" instead of inheriting "local", breaking the expected propagation of the mode through nested namespaces. Fix this by initializing `child_ns_mode` to the namespace's own mode, so the setting propagates correctly through all levels of nesting. Fixes: eafb64f40ca4 ("vsock: add netns to vsock core") Cc: bobbyeshleman@meta.com Signed-off-by: Stefano Garzarella Reviewed-by: Bobby Eshleman Link: https://patch.msgid.link/20260212205916.97533-2-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 20ad2b2dc17b..3b629b4a0359 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -91,7 +91,8 @@ * - /proc/sys/net/vsock/ns_mode (read-only) reports the current namespace's * mode, which is set at namespace creation and immutable thereafter. * - /proc/sys/net/vsock/child_ns_mode (writable) controls what mode future - * child namespaces will inherit when created. The default is "global". + * child namespaces will inherit when created. The initial value matches + * the namespace's own ns_mode. * * Changing child_ns_mode only affects newly created namespaces, not the * current namespace or existing children. At namespace creation, ns_mode @@ -2912,7 +2913,7 @@ static void vsock_net_init(struct net *net) else net->vsock.mode = vsock_net_child_mode(current->nsproxy->net_ns); - net->vsock.child_ns_mode = VSOCK_NET_MODE_GLOBAL; + net->vsock.child_ns_mode = net->vsock.mode; } static __net_init int vsock_sysctl_init_net(struct net *net) From 6a997f38bdf822d4c5cc10b445ff1cb26872580a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 12 Feb 2026 21:59:16 +0100 Subject: [PATCH 25/85] vsock: prevent child netns mode switch from local to global A "local" namespace can change its `child_ns_mode` sysctl to "global", allowing nested namespaces to access global CIDs. This can be exploited by an unprivileged user who gained CAP_NET_ADMIN through a user namespace. Prevent this by rejecting writes that attempt to set `child_ns_mode` to "global" when the current namespace's mode is "local". Fixes: eafb64f40ca4 ("vsock: add netns to vsock core") Cc: bobbyeshleman@meta.com Signed-off-by: Stefano Garzarella Reviewed-by: Bobby Eshleman Link: https://patch.msgid.link/20260212205916.97533-3-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 3b629b4a0359..9880756d9eff 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -95,8 +95,9 @@ * the namespace's own ns_mode. * * Changing child_ns_mode only affects newly created namespaces, not the - * current namespace or existing children. At namespace creation, ns_mode - * is inherited from the parent's child_ns_mode. + * current namespace or existing children. A "local" namespace cannot set + * child_ns_mode to "global". At namespace creation, ns_mode is inherited + * from the parent's child_ns_mode. * * The init_net mode is "global" and cannot be modified. * @@ -2844,8 +2845,16 @@ static int vsock_net_child_mode_string(const struct ctl_table *table, int write, if (ret) return ret; - if (write) + if (write) { + /* Prevent a "local" namespace from escalating to "global", + * which would give nested namespaces access to global CIDs. + */ + if (vsock_net_mode(net) == VSOCK_NET_MODE_LOCAL && + new_mode == VSOCK_NET_MODE_GLOBAL) + return -EPERM; + vsock_net_set_child_mode(net, new_mode); + } return 0; } From ee5492fd88cfc079c19fbeac78e9e53b7f6c04f3 Mon Sep 17 00:00:00 2001 From: Chengfeng Ye Date: Wed, 11 Feb 2026 19:13:29 +0000 Subject: [PATCH 26/85] fbnic: close fw_log race between users and teardown Fixes a theoretical race on fw_log between the teardown path and fw_log write functions. fw_log is written inside fbnic_fw_log_write() and can be reached from the mailbox handler fbnic_fw_msix_intr(), but fw_log is freed before IRQ/MBX teardown during cleanup, resulting in a potential data race of dereferencing a freed/null variable. Possible Interleaving Scenario: CPU0: fbnic_fw_msix_intr() // Entry fbnic_fw_log_write() if (fbnic_fw_log_ready()) // true ... preempt ... CPU1: fbnic_remove() // Entry fbnic_fw_log_free() vfree(log->data_start); log->data_start = NULL; CPU0: continues, walks log->entries or writes to log->data_start The initialization also has an incorrect order problem, as the fw_log is currently allocated after MBX setup during initialization. Fix the problems by adjusting the synchronization order to put initialization in place before the mailbox is enabled, and not cleared until after the mailbox has been disabled. Fixes: ecc53b1b46c89 ("eth: fbnic: Enable firmware logging") Signed-off-by: Chengfeng Ye Link: https://patch.msgid.link/20260211191329.530886-1-dg573847474@gmail.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/meta/fbnic/fbnic_fw_log.c | 3 --- drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 19 ++++++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c index 85a883dba385..d8a9a7d7c237 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c @@ -51,8 +51,6 @@ int fbnic_fw_log_init(struct fbnic_dev *fbd) log->data_start = data; log->data_end = data + FBNIC_FW_LOG_SIZE; - fbnic_fw_log_enable(fbd, true); - return 0; } @@ -63,7 +61,6 @@ void fbnic_fw_log_free(struct fbnic_dev *fbd) if (!fbnic_fw_log_ready(fbd)) return; - fbnic_fw_log_disable(fbd); INIT_LIST_HEAD(&log->entries); log->size = 0; vfree(log->data_start); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 6f9389748a7d..3fa9d1910daa 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -311,11 +311,17 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto free_irqs; } + err = fbnic_fw_log_init(fbd); + if (err) + dev_warn(fbd->dev, + "Unable to initialize firmware log buffer: %d\n", + err); + err = fbnic_fw_request_mbx(fbd); if (err) { dev_err(&pdev->dev, "Firmware mailbox initialization failure\n"); - goto free_irqs; + goto free_fw_log; } /* Send the request to enable the FW logging to host. Note if this @@ -323,11 +329,7 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * possible the FW is just too old to support the logging and needs * to be updated. */ - err = fbnic_fw_log_init(fbd); - if (err) - dev_warn(fbd->dev, - "Unable to initialize firmware log buffer: %d\n", - err); + fbnic_fw_log_enable(fbd, true); fbnic_devlink_register(fbd); fbnic_devlink_otp_check(fbd, "error detected during probe"); @@ -374,6 +376,8 @@ init_failure_mode: * firmware updates for fixes. */ return 0; +free_fw_log: + fbnic_fw_log_free(fbd); free_irqs: fbnic_free_irqs(fbd); err_destroy_health: @@ -408,8 +412,9 @@ static void fbnic_remove(struct pci_dev *pdev) fbnic_hwmon_unregister(fbd); fbnic_dbg_fbd_exit(fbd); fbnic_devlink_unregister(fbd); - fbnic_fw_log_free(fbd); + fbnic_fw_log_disable(fbd); fbnic_fw_free_mbx(fbd); + fbnic_fw_log_free(fbd); fbnic_free_irqs(fbd); fbnic_devlink_health_destroy(fbd); From bbeb3bfbffe0279fa47c041658b037fb38a93965 Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 11 Feb 2026 17:00:41 -0800 Subject: [PATCH 27/85] eth: fbnic: set FBNIC_QUEUE_RDE_CTL0_EN_HDR_SPLIT on RDE_CTL0 Fix EN_HDR_SPLIT configuration by writing the field to RDE_CTL0 instead of RDE_CTL1. Because drop mode configuration and header splitting enablement both use RDE_CTL0, we consolidate these configurations into the single function fbnic_config_drop_mode. Fixes: 2b30fc01a6c7 ("eth: fbnic: Add support for HDS configuration") Signed-off-by: Bobby Eshleman Acked-by: Mohsin Bashir Link: https://patch.msgid.link/20260211-fbnic-tcp-hds-fixes-v1-1-55d050e6f606@meta.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_txrx.c | 25 +++++++++++--------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c index e29959241ff3..4aaa928bf8ab 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c @@ -2591,7 +2591,8 @@ write_ctl: } static void fbnic_config_drop_mode_rcq(struct fbnic_napi_vector *nv, - struct fbnic_ring *rcq, bool tx_pause) + struct fbnic_ring *rcq, bool tx_pause, + bool hdr_split) { struct fbnic_net *fbn = netdev_priv(nv->napi.dev); u32 drop_mode, rcq_ctl; @@ -2604,22 +2605,26 @@ static void fbnic_config_drop_mode_rcq(struct fbnic_napi_vector *nv, /* Specify packet layout */ rcq_ctl = FIELD_PREP(FBNIC_QUEUE_RDE_CTL0_DROP_MODE_MASK, drop_mode) | FIELD_PREP(FBNIC_QUEUE_RDE_CTL0_MIN_HROOM_MASK, FBNIC_RX_HROOM) | - FIELD_PREP(FBNIC_QUEUE_RDE_CTL0_MIN_TROOM_MASK, FBNIC_RX_TROOM); + FIELD_PREP(FBNIC_QUEUE_RDE_CTL0_MIN_TROOM_MASK, FBNIC_RX_TROOM) | + FIELD_PREP(FBNIC_QUEUE_RDE_CTL0_EN_HDR_SPLIT, hdr_split); fbnic_ring_wr32(rcq, FBNIC_QUEUE_RDE_CTL0, rcq_ctl); } -void fbnic_config_drop_mode(struct fbnic_net *fbn, bool tx_pause) +void fbnic_config_drop_mode(struct fbnic_net *fbn, bool txp) { + bool hds; int i, t; + hds = fbn->hds_thresh < FBNIC_HDR_BYTES_MIN; + for (i = 0; i < fbn->num_napi; i++) { struct fbnic_napi_vector *nv = fbn->napi[i]; for (t = 0; t < nv->rxt_count; t++) { struct fbnic_q_triad *qt = &nv->qt[nv->txt_count + t]; - fbnic_config_drop_mode_rcq(nv, &qt->cmpl, tx_pause); + fbnic_config_drop_mode_rcq(nv, &qt->cmpl, txp, hds); } } } @@ -2670,20 +2675,18 @@ static void fbnic_enable_rcq(struct fbnic_napi_vector *nv, { struct fbnic_net *fbn = netdev_priv(nv->napi.dev); u32 log_size = fls(rcq->size_mask); - u32 hds_thresh = fbn->hds_thresh; u32 rcq_ctl = 0; - - fbnic_config_drop_mode_rcq(nv, rcq, fbn->tx_pause); + bool hdr_split; + u32 hds_thresh; /* Force lower bound on MAX_HEADER_BYTES. Below this, all frames should * be split at L4. It would also result in the frames being split at * L2/L3 depending on the frame size. */ - if (fbn->hds_thresh < FBNIC_HDR_BYTES_MIN) { - rcq_ctl = FBNIC_QUEUE_RDE_CTL0_EN_HDR_SPLIT; - hds_thresh = FBNIC_HDR_BYTES_MIN; - } + hdr_split = fbn->hds_thresh < FBNIC_HDR_BYTES_MIN; + fbnic_config_drop_mode_rcq(nv, rcq, fbn->tx_pause, hdr_split); + hds_thresh = max(fbn->hds_thresh, FBNIC_HDR_BYTES_MIN); rcq_ctl |= FIELD_PREP(FBNIC_QUEUE_RDE_CTL1_PADLEN_MASK, FBNIC_RX_PAD) | FIELD_PREP(FBNIC_QUEUE_RDE_CTL1_MAX_HDR_MASK, hds_thresh) | FIELD_PREP(FBNIC_QUEUE_RDE_CTL1_PAYLD_OFF_MASK, From bd254115f38db3c046332bb62e8719e0dc7c2b53 Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 11 Feb 2026 17:00:42 -0800 Subject: [PATCH 28/85] eth: fbnic: increase FBNIC_HDR_BYTES_MIN from 128 to 256 bytes Increase FBNIC_HDR_BYTES_MIN from 128 to 256 bytes. The previous minimum was too small to guarantee that very long L2+L3+L4 headers always fit within the header buffer. When EN_HDR_SPLIT is disabled and a packet exceeds MAX_HEADER_BYTES, splitting occurs at that byte offset instead of the header boundary, resulting in some of the header landing in the payload page. The increased minimum ensures headers always fit with the MAX_HEADER_BYTES cut off and land in the header page. Fixes: 2b30fc01a6c7 ("eth: fbnic: Add support for HDS configuration") Signed-off-by: Bobby Eshleman Acked-by: Mohsin Bashir Link: https://patch.msgid.link/20260211-fbnic-tcp-hds-fixes-v1-2-55d050e6f606@meta.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_txrx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h index b9560103ab86..980965274079 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h @@ -66,7 +66,7 @@ struct fbnic_net; (4096 - FBNIC_RX_HROOM - FBNIC_RX_TROOM - FBNIC_RX_PAD) #define FBNIC_HDS_THRESH_DEFAULT \ (1536 - FBNIC_RX_PAD) -#define FBNIC_HDR_BYTES_MIN 128 +#define FBNIC_HDR_BYTES_MIN 256 struct fbnic_pkt_buff { struct xdp_buff buff; From 0f30a31b55c4179fc55613a75ef41d496687d465 Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 11 Feb 2026 17:00:43 -0800 Subject: [PATCH 29/85] eth: fbnic: set DMA_HINT_L4 for all flows fbnic always advertises ETHTOOL_TCP_DATA_SPLIT_ENABLED via ethtool .get_ringparam. To enable proper splitting for all flow types, even for IP/Ethernet flows, this patch sets DMA_HINT_L4 unconditionally for all RSS and NFC flow steering rules. According to the spec, L4 falls back to L3 if no valid L4 is found, and L3 falls back to L2 if no L3 is found. This makes sure that the correct header boundary is used regardless of traffic type. This is important for zero-copy use cases where we must ensure that all ZC packets are split correctly. Fixes: 2b30fc01a6c7 ("eth: fbnic: Add support for HDS configuration") Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260211-fbnic-tcp-hds-fixes-v1-3-55d050e6f606@meta.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c | 3 +++ drivers/net/ethernet/meta/fbnic/fbnic_rpc.c | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index 11745a2d8a44..401c2196b9ff 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -1145,6 +1145,9 @@ ipv6_flow: return -EINVAL; } + dest |= FIELD_PREP(FBNIC_RPC_ACT_TBL0_DMA_HINT, + FBNIC_RCD_HDR_AL_DMA_HINT_L4); + /* Write action table values */ act_tcam->dest = dest; act_tcam->rss_en_mask = fbnic_flow_hash_2_rss_en_mask(fbn, hash_idx); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c index 7f31e890031c..42a186db43ea 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c @@ -338,9 +338,8 @@ void fbnic_rss_reinit(struct fbnic_dev *fbd, struct fbnic_net *fbn) else if (tstamp_mask & (1u << flow_type)) dest |= FBNIC_RPC_ACT_TBL0_TS_ENA; - if (act1_value[flow_type] & FBNIC_RPC_TCAM_ACT1_L4_VALID) - dest |= FIELD_PREP(FBNIC_RPC_ACT_TBL0_DMA_HINT, - FBNIC_RCD_HDR_AL_DMA_HINT_L4); + dest |= FIELD_PREP(FBNIC_RPC_ACT_TBL0_DMA_HINT, + FBNIC_RCD_HDR_AL_DMA_HINT_L4); rss_en_mask = fbnic_flow_hash_2_rss_en_mask(fbn, flow_type); From e7a3c1adc127f9f91a35169d34f7471d417d72a6 Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 11 Feb 2026 17:00:44 -0800 Subject: [PATCH 30/85] selftests: drv-net: add HDS payload sweep test for devmem TCP Add check_rx_hds test that verifies header/data split works across payload sizes. The test sweeps payload sizes from 1 byte to 8KB, if any data propagates up to userspace as SCM_DEVMEM_LINEAR, then the test fails. This shows that regardless of payload size, ncdevmem's configuration of hds-thresh to 0 is respected. Add -L (--fail-on-linear) flag to ncdevmem that causes the receiver to fail if any SCM_DEVMEM_LINEAR cmsg is received. Use socat option for fixed block sizing and tcp nodelay to disable nagle's algo to avoid buffering. Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260211-fbnic-tcp-hds-fixes-v1-4-55d050e6f606@meta.com Signed-off-by: Paolo Abeni --- .../selftests/drivers/net/hw/devmem.py | 19 ++++++++++++++++++- .../selftests/drivers/net/hw/ncdevmem.c | 11 ++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py index 45c2d49d55b6..ee863e90d1e0 100755 --- a/tools/testing/selftests/drivers/net/hw/devmem.py +++ b/tools/testing/selftests/drivers/net/hw/devmem.py @@ -63,12 +63,29 @@ def check_tx_chunks(cfg) -> None: ksft_eq(socat.stdout.strip(), "hello\nworld") +def check_rx_hds(cfg) -> None: + """Test HDS splitting across payload sizes.""" + require_devmem(cfg) + + for size in [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]: + port = rand_port() + listen_cmd = f"{cfg.bin_local} -L -l -f {cfg.ifname} -s {cfg.addr} -p {port}" + + with bkg(listen_cmd, exit_wait=True) as ncdevmem: + wait_port_listen(port) + cmd(f"dd if=/dev/zero bs={size} count=1 2>/dev/null | " + + f"socat -b {size} -u - TCP{cfg.addr_ipver}:{cfg.baddr}:{port},nodelay", + host=cfg.remote, shell=True) + + ksft_eq(ncdevmem.ret, 0, f"HDS failed for payload size {size}") + + def main() -> None: with NetDrvEpEnv(__file__) as cfg: cfg.bin_local = path.abspath(path.dirname(__file__) + "/ncdevmem") cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) - ksft_run([check_rx, check_tx, check_tx_chunks], + ksft_run([check_rx, check_tx, check_tx_chunks, check_rx_hds], args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 16864c844108..e098d6534c3c 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -98,6 +98,7 @@ static unsigned int ifindex; static unsigned int dmabuf_id; static uint32_t tx_dmabuf_id; static int waittime_ms = 500; +static bool fail_on_linear; /* System state loaded by current_config_load() */ #define MAX_FLOWS 8 @@ -975,6 +976,11 @@ static int do_server(struct memory_buffer *mem) "SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n", dmabuf_cmsg->frag_size); + if (fail_on_linear) { + pr_err("received SCM_DEVMEM_LINEAR but --fail-on-linear (-L) set"); + goto err_close_client; + } + continue; } @@ -1398,8 +1404,11 @@ int main(int argc, char *argv[]) int is_server = 0, opt; int ret, err = 1; - while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:z:")) != -1) { + while ((opt = getopt(argc, argv, "Lls:c:p:v:q:t:f:z:")) != -1) { switch (opt) { + case 'L': + fail_on_linear = true; + break; case 'l': is_server = 1; break; From 94560267d6c41b1ff3fafbab726e3f8a55a6af34 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Thu, 12 Feb 2026 22:31:19 +0100 Subject: [PATCH 31/85] ovpn: tcp - don't deref NULL sk_socket member after tcp_close() When deleting a peer in case of keepalive expiration, the peer is removed from the OpenVPN hashtable and is temporary inserted in a "release list" for further processing. This happens in: ovpn_peer_keepalive_work() unlock_ovpn(release_list) This processing includes detaching from the socket being used to talk to this peer, by restoring its original proto and socket ops/callbacks. In case of TCP it may happen that, while the peer is sitting in the release list, userspace decides to close the socket. This will result in a concurrent execution of: tcp_close(sk) __tcp_close(sk) sock_orphan(sk) sk_set_socket(sk, NULL) The last function call will set sk->sk_socket to NULL. When the releasing routine is resumed, ovpn_tcp_socket_detach() will attempt to dereference sk->sk_socket to restore its original ops member. This operation will crash due to sk->sk_socket being NULL. Fix this race condition by testing-and-accessing sk->sk_socket atomically under sk->sk_callback_lock. Link: https://lore.kernel.org/netdev/176996279620.3109699.15382994681575380467@eldamar.lan/ Link: https://github.com/OpenVPN/ovpn-net-next/issues/29 Signed-off-by: Antonio Quartulli Fixes: 11851cbd60ea ("ovpn: implement TCP transport") Link: https://patch.msgid.link/20260212213130.11497-1-antonio@openvpn.net Signed-off-by: Paolo Abeni --- drivers/net/ovpn/tcp.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c index f0b4e07ba924..ec2bbc28c196 100644 --- a/drivers/net/ovpn/tcp.c +++ b/drivers/net/ovpn/tcp.c @@ -199,7 +199,19 @@ void ovpn_tcp_socket_detach(struct ovpn_socket *ovpn_sock) sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready; sk->sk_write_space = peer->tcp.sk_cb.sk_write_space; sk->sk_prot = peer->tcp.sk_cb.prot; - sk->sk_socket->ops = peer->tcp.sk_cb.ops; + + /* tcp_close() may race this function and could set + * sk->sk_socket to NULL. It does so by invoking + * sock_orphan(), which holds sk_callback_lock before + * doing the assignment. + * + * For this reason we acquire the same lock to avoid + * sk_socket to disappear under our feet + */ + write_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket) + sk->sk_socket->ops = peer->tcp.sk_cb.ops; + write_unlock_bh(&sk->sk_callback_lock); rcu_assign_sk_user_data(sk, NULL); } From 9e7021d2aeae57c323a6f722ed7915686cdcc123 Mon Sep 17 00:00:00 2001 From: Ziyi Guo Date: Thu, 12 Feb 2026 21:41:54 +0000 Subject: [PATCH 32/85] net: usb: catc: enable basic endpoint checking catc_probe() fills three URBs with hardcoded endpoint pipes without verifying the endpoint descriptors: - usb_sndbulkpipe(usbdev, 1) and usb_rcvbulkpipe(usbdev, 1) for TX/RX - usb_rcvintpipe(usbdev, 2) for interrupt status A malformed USB device can present these endpoints with transfer types that differ from what the driver assumes. Add a catc_usb_ep enum for endpoint numbers, replacing magic constants throughout. Add usb_check_bulk_endpoints() and usb_check_int_endpoints() calls after usb_set_interface() to verify endpoint types before use, rejecting devices with mismatched descriptors at probe time. Similar to - commit 90b7f2961798 ("net: usb: rtl8150: enable basic endpoint checking") which fixed the issue in rtl8150. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Simon Horman Signed-off-by: Ziyi Guo Link: https://patch.msgid.link/20260212214154.3609844-1-n7l8m4@u.northwestern.edu Signed-off-by: Paolo Abeni --- drivers/net/usb/catc.c | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/catc.c b/drivers/net/usb/catc.c index 5c7f19cbacf6..96e82f94edcf 100644 --- a/drivers/net/usb/catc.c +++ b/drivers/net/usb/catc.c @@ -58,6 +58,16 @@ static const char driver_name[] = "catc"; #define CTRL_QUEUE 16 /* Max control requests in flight (power of two) */ #define RX_PKT_SZ 1600 /* Max size of receive packet for F5U011 */ +/* + * USB endpoints. + */ + +enum catc_usb_ep { + CATC_USB_EP_CONTROL = 0, + CATC_USB_EP_BULK = 1, + CATC_USB_EP_INT_IN = 2, +}; + /* * Control requests. */ @@ -765,6 +775,13 @@ static int catc_probe(struct usb_interface *intf, const struct usb_device_id *id u8 broadcast[ETH_ALEN]; u8 *macbuf; int pktsz, ret = -ENOMEM; + static const u8 bulk_ep_addr[] = { + CATC_USB_EP_BULK | USB_DIR_OUT, + CATC_USB_EP_BULK | USB_DIR_IN, + 0}; + static const u8 int_ep_addr[] = { + CATC_USB_EP_INT_IN | USB_DIR_IN, + 0}; macbuf = kmalloc(ETH_ALEN, GFP_KERNEL); if (!macbuf) @@ -777,6 +794,14 @@ static int catc_probe(struct usb_interface *intf, const struct usb_device_id *id goto fail_mem; } + /* Verify that all required endpoints are present */ + if (!usb_check_bulk_endpoints(intf, bulk_ep_addr) || + !usb_check_int_endpoints(intf, int_ep_addr)) { + dev_err(dev, "Missing or invalid endpoints\n"); + ret = -ENODEV; + goto fail_mem; + } + netdev = alloc_etherdev(sizeof(struct catc)); if (!netdev) goto fail_mem; @@ -821,14 +846,14 @@ static int catc_probe(struct usb_interface *intf, const struct usb_device_id *id usb_fill_control_urb(catc->ctrl_urb, usbdev, usb_sndctrlpipe(usbdev, 0), NULL, NULL, 0, catc_ctrl_done, catc); - usb_fill_bulk_urb(catc->tx_urb, usbdev, usb_sndbulkpipe(usbdev, 1), - NULL, 0, catc_tx_done, catc); + usb_fill_bulk_urb(catc->tx_urb, usbdev, usb_sndbulkpipe(usbdev, CATC_USB_EP_BULK), + NULL, 0, catc_tx_done, catc); - usb_fill_bulk_urb(catc->rx_urb, usbdev, usb_rcvbulkpipe(usbdev, 1), - catc->rx_buf, pktsz, catc_rx_done, catc); + usb_fill_bulk_urb(catc->rx_urb, usbdev, usb_rcvbulkpipe(usbdev, CATC_USB_EP_BULK), + catc->rx_buf, pktsz, catc_rx_done, catc); - usb_fill_int_urb(catc->irq_urb, usbdev, usb_rcvintpipe(usbdev, 2), - catc->irq_buf, 2, catc_irq_done, catc, 1); + usb_fill_int_urb(catc->irq_urb, usbdev, usb_rcvintpipe(usbdev, CATC_USB_EP_INT_IN), + catc->irq_buf, 2, catc_irq_done, catc, 1); if (!catc->is_f5u011) { u32 *buf; From 6d1dc8014334c7fb25719999bca84d811e60a559 Mon Sep 17 00:00:00 2001 From: Ziyi Guo Date: Thu, 12 Feb 2026 22:40:40 +0000 Subject: [PATCH 33/85] xen-netback: reject zero-queue configuration from guest A malicious or buggy Xen guest can write "0" to the xenbus key "multi-queue-num-queues". The connect() function in the backend only validates the upper bound (requested_num_queues > xenvif_max_queues) but not zero, allowing requested_num_queues=0 to reach vzalloc(array_size(0, sizeof(struct xenvif_queue))), which triggers WARN_ON_ONCE(!size) in __vmalloc_node_range(). On systems with panic_on_warn=1, this allows a guest-to-host denial of service. The Xen network interface specification requires the queue count to be "greater than zero". Add a zero check to match the validation already present in xen-blkback, which has included this guard since its multi-queue support was added. Fixes: 8d3d53b3e433 ("xen-netback: Add support for multiple queues") Signed-off-by: Ziyi Guo Reviewed-by: Juergen Gross Link: https://patch.msgid.link/20260212224040.86674-1-n7l8m4@u.northwestern.edu Signed-off-by: Paolo Abeni --- drivers/net/xen-netback/xenbus.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index a78a25b87240..61b547aab286 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -735,10 +735,11 @@ static void connect(struct backend_info *be) */ requested_num_queues = xenbus_read_unsigned(dev->otherend, "multi-queue-num-queues", 1); - if (requested_num_queues > xenvif_max_queues) { + if (requested_num_queues > xenvif_max_queues || + requested_num_queues == 0) { /* buggy or malicious guest */ xenbus_dev_fatal(dev, -EINVAL, - "guest requested %u queues, exceeding the maximum of %u.", + "guest requested %u queues, but valid range is 1 - %u.", requested_num_queues, xenvif_max_queues); return; } From da29e453dcb3aa7cabead7915f5f945d0add3a52 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Thu, 12 Feb 2026 20:54:09 -0700 Subject: [PATCH 34/85] net/rds: rds_sendmsg should not discard payload_len Commit 3db6e0d172c9 ("rds: use RCU to synchronize work-enqueue with connection teardown") modifies rds_sendmsg to avoid enqueueing work while a tear down is in progress. However, it also changed the return value of rds_sendmsg to that of rds_send_xmit instead of the payload_len. This means the user may incorrectly receive errno values when it should have simply received a payload of 0 while the peer attempts a reconnections. So this patch corrects the teardown handling code to only use the out error path in that case, thus restoring the original payload_len return value. Fixes: 3db6e0d172c9 ("rds: use RCU to synchronize work-enqueue with connection teardown") Reviewed-by: Simon Horman Signed-off-by: Allison Henderson Link: https://patch.msgid.link/20260213035409.1963391-1-achender@kernel.org Signed-off-by: Paolo Abeni --- net/rds/send.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/rds/send.c b/net/rds/send.c index 6e96f108473e..a1039e422a38 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1431,9 +1431,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) else queue_delayed_work(cpath->cp_wq, &cpath->cp_send_w, 1); rcu_read_unlock(); + + if (ret) + goto out; } - if (ret) - goto out; + rds_message_put(rm); for (ind = 0; ind < vct.indx; ind++) From c7d9be66b71af490446127c6ffcb66d6bb71b8b9 Mon Sep 17 00:00:00 2001 From: Ethan Nelson-Moore Date: Thu, 12 Feb 2026 20:55:09 -0800 Subject: [PATCH 35/85] net: arcnet: com20020-pci: fix support for 2.5Mbit cards Commit 8c14f9c70327 ("ARCNET: add com20020 PCI IDs with metadata") converted the com20020-pci driver to use a card info structure instead of a single flag mask in driver_data. However, it failed to take into account that in the original code, driver_data of 0 indicates a card with no special flags, not a card that should not have any card info structure. This introduced a null pointer dereference when cards with no flags were probed. Commit bd6f1fd5d33d ("net: arcnet: com20020: Fix null-ptr-deref in com20020pci_probe()") then papered over this issue by rejecting cards with no driver_data instead of resolving the problem at its source. Fix the original issue by introducing a new card info structure for 2.5Mbit cards that does not set any flags and using it if no driver_data is present. Fixes: 8c14f9c70327 ("ARCNET: add com20020 PCI IDs with metadata") Fixes: bd6f1fd5d33d ("net: arcnet: com20020: Fix null-ptr-deref in com20020pci_probe()") Cc: stable@vger.kernel.org Reviewed-by: Simon Horman Signed-off-by: Ethan Nelson-Moore Link: https://patch.msgid.link/20260213045510.32368-1-enelsonmoore@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/arcnet/com20020-pci.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index 19e411b2e3a7..dbadda08dce2 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -115,6 +115,8 @@ static const struct attribute_group com20020_state_group = { .attrs = com20020_state_attrs, }; +static struct com20020_pci_card_info card_info_2p5mbit; + static void com20020pci_remove(struct pci_dev *pdev); static int com20020pci_probe(struct pci_dev *pdev, @@ -140,7 +142,7 @@ static int com20020pci_probe(struct pci_dev *pdev, ci = (struct com20020_pci_card_info *)id->driver_data; if (!ci) - return -EINVAL; + ci = &card_info_2p5mbit; priv->ci = ci; mm = &ci->misc_map; @@ -347,6 +349,18 @@ static struct com20020_pci_card_info card_info_5mbit = { .flags = ARC_IS_5MBIT, }; +static struct com20020_pci_card_info card_info_2p5mbit = { + .name = "ARC-PCI", + .devcount = 1, + .chan_map_tbl = { + { + .bar = 2, + .offset = 0x00, + .size = 0x08, + }, + }, +}; + static struct com20020_pci_card_info card_info_sohard = { .name = "SOHARD SH ARC-PCI", .devcount = 1, From 8b769e311a86bb9d15c5658ad283b86fc8f080a2 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Feb 2026 09:00:30 +0200 Subject: [PATCH 36/85] net: bridge: mcast: always update mdb_n_entries for vlan contexts syzbot triggered a warning[1] about the number of mdb entries in a context. It turned out that there are multiple ways to trigger that warning today (some got added during the years), the root cause of the problem is that the increase is done conditionally, and over the years these different conditions increased so there were new ways to trigger the warning, that is to do a decrease which wasn't paired with a previous increase. For example one way to trigger it is with flush: $ ip l add br0 up type bridge vlan_filtering 1 mcast_snooping 1 $ ip l add dumdum up master br0 type dummy $ bridge mdb add dev br0 port dumdum grp 239.0.0.1 permanent vid 1 $ ip link set dev br0 down $ ip link set dev br0 type bridge mcast_vlan_snooping 1 ^^^^ this will enable snooping, but will not update mdb_n_entries because in __br_multicast_enable_port_ctx() we check !netif_running $ bridge mdb flush dev br0 ^^^ this will trigger the warning because it will delete the pg which we added above, which will try to decrease mdb_n_entries Fix the problem by removing the conditional increase and always keep the count up-to-date while the vlan exists. In order to do that we have to first initialize it on port-vlan context creation, and then always increase or decrease the value regardless of mcast options. To keep the current behaviour we have to enforce the mdb limit only if the context is port's or if the port-vlan's mcast snooping is enabled. [1] ------------[ cut here ]------------ n == 0 WARNING: net/bridge/br_multicast.c:718 at br_multicast_port_ngroups_dec_one net/bridge/br_multicast.c:718 [inline], CPU#0: syz.4.4607/22043 WARNING: net/bridge/br_multicast.c:718 at br_multicast_port_ngroups_dec net/bridge/br_multicast.c:771 [inline], CPU#0: syz.4.4607/22043 WARNING: net/bridge/br_multicast.c:718 at br_multicast_del_pg+0x1bbe/0x1e20 net/bridge/br_multicast.c:825, CPU#0: syz.4.4607/22043 Modules linked in: CPU: 0 UID: 0 PID: 22043 Comm: syz.4.4607 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/24/2026 RIP: 0010:br_multicast_port_ngroups_dec_one net/bridge/br_multicast.c:718 [inline] RIP: 0010:br_multicast_port_ngroups_dec net/bridge/br_multicast.c:771 [inline] RIP: 0010:br_multicast_del_pg+0x1bbe/0x1e20 net/bridge/br_multicast.c:825 Code: 41 5f 5d e9 04 7a 48 f7 e8 3f 73 5c f7 90 0f 0b 90 e9 cf fd ff ff e8 31 73 5c f7 90 0f 0b 90 e9 16 fd ff ff e8 23 73 5c f7 90 <0f> 0b 90 e9 60 fd ff ff e8 15 73 5c f7 eb 05 e8 0e 73 5c f7 48 8b RSP: 0018:ffffc9000c207220 EFLAGS: 00010293 RAX: ffffffff8a68042d RBX: ffff88807c6f1800 RCX: ffff888066e90000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff888066e90000 R09: 000000000000000c R10: 000000000000000c R11: 0000000000000000 R12: ffff8880303ef800 R13: dffffc0000000000 R14: ffff888050eb11c4 R15: 1ffff1100a1d6238 FS: 00007fa45921b6c0(0000) GS:ffff8881256f5000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa4591f9ff8 CR3: 0000000081df2000 CR4: 00000000003526f0 Call Trace: br_mdb_flush_pgs net/bridge/br_mdb.c:1525 [inline] br_mdb_flush net/bridge/br_mdb.c:1544 [inline] br_mdb_del_bulk+0x5e2/0xb20 net/bridge/br_mdb.c:1561 rtnl_mdb_del+0x48a/0x640 net/core/rtnetlink.c:-1 rtnetlink_rcv_msg+0x77e/0xbe0 net/core/rtnetlink.c:6967 netlink_rcv_skb+0x232/0x4b0 net/netlink/af_netlink.c:2550 netlink_unicast_kernel net/netlink/af_netlink.c:1318 [inline] netlink_unicast+0x80f/0x9b0 net/netlink/af_netlink.c:1344 netlink_sendmsg+0x813/0xb40 net/netlink/af_netlink.c:1894 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg net/socket.c:742 [inline] ____sys_sendmsg+0xa68/0xad0 net/socket.c:2592 ___sys_sendmsg+0x2a5/0x360 net/socket.c:2646 __sys_sendmsg net/socket.c:2678 [inline] __do_sys_sendmsg net/socket.c:2683 [inline] __se_sys_sendmsg net/socket.c:2681 [inline] __x64_sys_sendmsg+0x1bd/0x2a0 net/socket.c:2681 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fa45839aeb9 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fa45921b028 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007fa458615fa0 RCX: 00007fa45839aeb9 RDX: 0000000000000000 RSI: 00002000000000c0 RDI: 0000000000000004 RBP: 00007fa458408c1f R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fa458616038 R14: 00007fa458615fa0 R15: 00007fff0b59fae8 Fixes: b57e8d870d52 ("net: bridge: Maintain number of MDB entries in net_bridge_mcast_port") Reported-by: syzbot+d5d1b7343531d17bd3c5@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/aYrWbRp83MQR1ife@debil/T/#t Reviewed-by: Ido Schimmel Signed-off-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260213070031.1400003-2-nikolay@nvidia.com Signed-off-by: Paolo Abeni --- net/bridge/br_multicast.c | 45 ++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index dccae08b4f4c..b6a5147886ca 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -244,14 +244,11 @@ br_multicast_port_vid_to_port_ctx(struct net_bridge_port *port, u16 vid) lockdep_assert_held_once(&port->br->multicast_lock); - if (!br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) - return NULL; - /* Take RCU to access the vlan. */ rcu_read_lock(); vlan = br_vlan_find(nbp_vlan_group_rcu(port), vid); - if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) + if (vlan) pmctx = &vlan->port_mcast_ctx; rcu_read_unlock(); @@ -701,7 +698,10 @@ br_multicast_port_ngroups_inc_one(struct net_bridge_mcast_port *pmctx, u32 max = READ_ONCE(pmctx->mdb_max_entries); u32 n = READ_ONCE(pmctx->mdb_n_entries); - if (max && n >= max) { + /* enforce the max limit when it's a port pmctx or a port-vlan pmctx + * with snooping enabled + */ + if (!br_multicast_port_ctx_vlan_disabled(pmctx) && max && n >= max) { NL_SET_ERR_MSG_FMT_MOD(extack, "%s is already in %u groups, and mcast_max_groups=%u", what, n, max); return -E2BIG; @@ -736,9 +736,7 @@ static int br_multicast_port_ngroups_inc(struct net_bridge_port *port, return err; } - /* Only count on the VLAN context if VID is given, and if snooping on - * that VLAN is enabled. - */ + /* Only count on the VLAN context if VID is given */ if (!group->vid) return 0; @@ -2011,6 +2009,18 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, timer_setup(&pmctx->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif + /* initialize mdb_n_entries if a new port vlan is being created */ + if (vlan) { + struct net_bridge_port_group *pg; + u32 n = 0; + + spin_lock_bh(&port->br->multicast_lock); + hlist_for_each_entry(pg, &port->mglist, mglist) + if (pg->key.addr.vid == vlan->vid) + n++; + WRITE_ONCE(pmctx->mdb_n_entries, n); + spin_unlock_bh(&port->br->multicast_lock); + } } void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) @@ -2094,25 +2104,6 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) br_ip4_multicast_add_router(brmctx, pmctx); br_ip6_multicast_add_router(brmctx, pmctx); } - - if (br_multicast_port_ctx_is_vlan(pmctx)) { - struct net_bridge_port_group *pg; - u32 n = 0; - - /* The mcast_n_groups counter might be wrong. First, - * BR_VLFLAG_MCAST_ENABLED is toggled before temporary entries - * are flushed, thus mcast_n_groups after the toggle does not - * reflect the true values. And second, permanent entries added - * while BR_VLFLAG_MCAST_ENABLED was disabled, are not reflected - * either. Thus we have to refresh the counter. - */ - - hlist_for_each_entry(pg, &pmctx->port->mglist, mglist) { - if (pg->key.addr.vid == pmctx->vlan->vid) - n++; - } - WRITE_ONCE(pmctx->mdb_n_entries, n); - } } static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) From a8470953b4caf52b32d27e2a23797824b312a325 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Feb 2026 09:00:31 +0200 Subject: [PATCH 37/85] selftests: forwarding: bridge_mdb_max: add tests for mdb_n_entries warning Recently we were able to trigger a warning in the mdb_n_entries counting code. Add tests that exercise different ways which used to trigger that warning. Reviewed-by: Ido Schimmel Signed-off-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260213070031.1400003-3-nikolay@nvidia.com Signed-off-by: Paolo Abeni --- .../net/forwarding/bridge_mdb_max.sh | 90 ++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh index 3da9d93ab36f..625162fd7e8b 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh @@ -28,6 +28,7 @@ ALL_TESTS=" test_8021d test_8021q test_8021qvs + test_mdb_count_warning " NUM_NETIFS=4 @@ -83,8 +84,6 @@ switch_create_8021q() { local br_flags=$1; shift - log_info "802.1q $br_flags${br_flags:+ }tests" - ip link add name br0 type bridge vlan_filtering 1 vlan_default_pvid 0 \ mcast_snooping 1 $br_flags \ mcast_igmp_version 3 mcast_mld_version 2 @@ -106,6 +105,7 @@ switch_create_8021q() switch_create_8021qvs() { + log_info "802.1q mcast_vlan_snooping 1 tests" switch_create_8021q "mcast_vlan_snooping 1" bridge vlan global set dev br0 vid 10 mcast_igmp_version 3 bridge vlan global set dev br0 vid 10 mcast_mld_version 2 @@ -1272,6 +1272,76 @@ test_8021qvs_toggle_vlan_snooping() test_toggle_vlan_snooping_permanent } +mdb_count_check_warn() +{ + local msg=$1; shift + + dmesg | grep -q "WARNING:.*br_multicast_port_ngroups_dec.*" + check_fail $? "$msg" +} + +test_mdb_count_mcast_vlan_snooping_flush() +{ + RET=0 + + # check if we already have a warning + mdb_count_check_warn "Check MDB entries count warning before test" + + bridge mdb add dev br0 port "$swp1" grp 239.0.0.1 permanent vid 10 + ip link set dev br0 down + ip link set dev br0 type bridge mcast_vlan_snooping 1 + bridge mdb flush dev br0 + + mdb_count_check_warn "Check MDB entries count warning after test" + + ip link set dev br0 type bridge mcast_vlan_snooping 0 + ip link set dev br0 up + + log_test "MDB count warning: mcast_vlan_snooping and MDB flush" +} + +test_mdb_count_mcast_snooping_flush() +{ + RET=0 + + # check if we already have a warning + mdb_count_check_warn "Check MDB entries count warning before test" + + bridge mdb add dev br0 port "$swp1" grp 239.0.0.1 permanent vid 10 + ip link set dev br0 type bridge mcast_snooping 0 + ip link set dev br0 type bridge mcast_vlan_snooping 1 + bridge mdb flush dev br0 + + mdb_count_check_warn "Check MDB entries count warning after test" + + ip link set dev br0 type bridge mcast_vlan_snooping 0 + ip link set dev br0 type bridge mcast_snooping 1 + + log_test "MDB count warning: mcast_snooping and MDB flush" +} + +test_mdb_count_vlan_state_flush() +{ + RET=0 + + # check if we already have a warning + mdb_count_check_warn "Check MDB entries count warning before test" + + bridge mdb add dev br0 port "$swp1" grp 239.0.0.1 permanent vid 10 + ip link set dev br0 down + bridge vlan set vid 10 dev "$swp1" state blocking + ip link set dev br0 type bridge mcast_vlan_snooping 1 + ip link set dev br0 up + bridge mdb flush dev br0 + + mdb_count_check_warn "Check MDB entries count warning after test" + + bridge vlan set vid 10 dev "$swp1" state forwarding + ip link set dev br0 type bridge mcast_vlan_snooping 0 + + log_test "MDB count warning: disabled vlan state and MDB flush" +} + # test groups test_8021d() @@ -1297,6 +1367,7 @@ test_8021q() { # Tests for vlan_filtering 1 mcast_vlan_snooping 0. + log_info "802.1q tests" switch_create_8021q setup_wait @@ -1334,6 +1405,21 @@ test_8021qvs() switch_destroy } +test_mdb_count_warning() +{ + # Tests for mdb_n_entries warning + + log_info "MDB count warning tests" + switch_create_8021q + setup_wait + + test_mdb_count_mcast_vlan_snooping_flush + test_mdb_count_mcast_snooping_flush + test_mdb_count_vlan_state_flush + + switch_destroy +} + if ! bridge link help 2>&1 | grep -q "mcast_max_groups"; then echo "SKIP: iproute2 too old, missing bridge \"mcast_max_groups\" support" exit $ksft_skip From 636fd32d401572f6cfe89ba17df4eb251020c10f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Feb 2026 08:44:00 +0100 Subject: [PATCH 38/85] printk: add CONFIG_PRINTK dependency for netconsole The 'select PRINTK_EXECUTION_CTX' line now causes a harmless warning when NETCONSOLE_DYNAMIC is enabled but PRINTK is not: WARNING: unmet direct dependencies detected for PRINTK_EXECUTION_CTX Depends on [n]: PRINTK [=n] Selected by [y]: - NETCONSOLE_DYNAMIC [=y] && NETDEVICES [=y] && NET_CORE [=y] && NETCONSOLE [=y] && SYSFS [=y] && CONFIGFS_FS [=y] && (NETCONSOLE [=y]!=y [=y] || CONFIGFS_FS [=y]!=m [=m]) In that configuration, the netconsole driver is useless anyway, so avoid this with an added dependency that prevents CONFIG_NETCONSOLE to be enabled without CONFIG_PRINTK. Fixes: 60325c27d3cf ("printk: Add execution context (task name/CPU) to printk_info") Signed-off-by: Arnd Bergmann Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260213074431.1729627-1-arnd@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 7e9becad91df..17108c359216 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -333,6 +333,7 @@ config MACSEC config NETCONSOLE tristate "Network console logging support" + depends on PRINTK help If you want to log kernel messages over the network, enable this. See for details. From 26f29b14916964043bc25b55ad1b9f5e9a12ca53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Feb 2026 12:04:27 +0000 Subject: [PATCH 39/85] net: fix backlog_unlock_irq_restore() vs CONFIG_PREEMPT_RT CONFIG_PREEMPT_RT is special, make this clear in backlog_lock_irq_save() and backlog_unlock_irq_restore(). The issue shows up with CONFIG_DEBUG_IRQFLAGS=y raw_local_irq_restore() called with IRQs enabled WARNING: kernel/locking/irqflag-debug.c:10 at warn_bogus_irq_restore+0xc/0x20 kernel/locking/irqflag-debug.c:10, CPU#1: aoe_tx0/1321 Modules linked in: CPU: 1 UID: 0 PID: 1321 Comm: aoe_tx0 Not tainted syzkaller #0 PREEMPT_{RT,(full)} Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/24/2026 RIP: 0010:warn_bogus_irq_restore+0xc/0x20 kernel/locking/irqflag-debug.c:10 Call Trace: backlog_unlock_irq_restore net/core/dev.c:253 [inline] enqueue_to_backlog+0x525/0xcf0 net/core/dev.c:5347 netif_rx_internal+0x120/0x550 net/core/dev.c:5659 __netif_rx+0xa9/0x110 net/core/dev.c:5679 loopback_xmit+0x43a/0x660 drivers/net/loopback.c:90 __netdev_start_xmit include/linux/netdevice.h:5275 [inline] netdev_start_xmit include/linux/netdevice.h:5284 [inline] xmit_one net/core/dev.c:3864 [inline] dev_hard_start_xmit+0x2df/0x830 net/core/dev.c:3880 __dev_queue_xmit+0x16f4/0x3990 net/core/dev.c:4829 dev_queue_xmit include/linux/netdevice.h:3384 [inline] Fixes: 27a01c1969a5 ("net: fully inline backlog_unlock_irq_restore()") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260213120427.2914544-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/core/dev.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index ac6bcb2a0784..96f89eb797e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -231,10 +231,13 @@ static bool use_backlog_threads(void) static inline void backlog_lock_irq_save(struct softnet_data *sd, unsigned long *flags) { - if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); - else + } else { local_irq_save(*flags); + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_lock(&sd->input_pkt_queue.lock); + } } static inline void backlog_lock_irq_disable(struct softnet_data *sd) @@ -248,9 +251,13 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd) static inline void backlog_unlock_irq_restore(struct softnet_data *sd, unsigned long flags) { - if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) - spin_unlock(&sd->input_pkt_queue.lock); - local_irq_restore(flags); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + spin_unlock_irqrestore(&sd->input_pkt_queue.lock, flags); + } else { + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_unlock(&sd->input_pkt_queue.lock); + local_irq_restore(flags); + } } static inline void backlog_unlock_irq_enable(struct softnet_data *sd) From 02cb2e6bacbb08ebf6acb61be816efd11e1f4a21 Mon Sep 17 00:00:00 2001 From: Aleksei Oladko Date: Fri, 13 Feb 2026 13:19:05 +0000 Subject: [PATCH 40/85] selftests: forwarding: vxlan_bridge_1d: fix test failure with br_netfilter enabled The test generates VXLAN traffic using mausezahn, where the encapsulated inner IPv4 packet contains a zero IP header checksum. After VXLAN decapsulation, such packets do not pass sanity checks in br_netfilter and are dropped, which causes the test to fail. Fix this by calculating and setting a valid IPv4 header checksum for the encapsulated packet generated by mausezahn, so that the packet is accepted by br_netfilter. Fixed by using the payload_template_calc_checksum() / payload_template_expand_checksum() helpers that are only available in v6.3 and newer kernels. Fixes: a0b61f3d8ebf ("selftests: forwarding: vxlan_bridge_1d: Add an ECN decap test") Signed-off-by: Aleksei Oladko Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260213131907.43351-2-aleksey.oladko@virtuozzo.com Signed-off-by: Paolo Abeni --- .../net/forwarding/vxlan_bridge_1d.sh | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index b43816dd998c..457f41d5e584 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -567,6 +567,21 @@ vxlan_encapped_ping_do() local inner_tos=$1; shift local outer_tos=$1; shift + local ipv4hdr=$(: + )"45:"$( : IP version + IHL + )"$inner_tos:"$( : IP TOS + )"00:54:"$( : IP total length + )"99:83:"$( : IP identification + )"40:00:"$( : IP flags + frag off + )"40:"$( : IP TTL + )"01:"$( : IP proto + )"CHECKSUM:"$( : IP header csum + )"c0:00:02:03:"$( : IP saddr: 192.0.2.3 + )"c0:00:02:01"$( : IP daddr: 192.0.2.1 + ) + local checksum=$(payload_template_calc_checksum "$ipv4hdr") + ipv4hdr=$(payload_template_expand_checksum "$ipv4hdr" $checksum) + $MZ $dev -c $count -d 100msec -q \ -b $next_hop_mac -B $dest_ip \ -t udp tos=$outer_tos,sp=23456,dp=$VXPORT,p=$(: @@ -577,16 +592,7 @@ vxlan_encapped_ping_do() )"$dest_mac:"$( : ETH daddr )"$(mac_get w2):"$( : ETH saddr )"08:00:"$( : ETH type - )"45:"$( : IP version + IHL - )"$inner_tos:"$( : IP TOS - )"00:54:"$( : IP total length - )"99:83:"$( : IP identification - )"40:00:"$( : IP flags + frag off - )"40:"$( : IP TTL - )"01:"$( : IP proto - )"00:00:"$( : IP header csum - )"c0:00:02:03:"$( : IP saddr: 192.0.2.3 - )"c0:00:02:01:"$( : IP daddr: 192.0.2.1 + )"$ipv4hdr:"$( : IPv4 header )"08:"$( : ICMP type )"00:"$( : ICMP code )"8b:f2:"$( : ICMP csum From ce9f6aec0fb780dafc1dfc5f47c688422aff464a Mon Sep 17 00:00:00 2001 From: Aleksei Oladko Date: Fri, 13 Feb 2026 13:19:06 +0000 Subject: [PATCH 41/85] selftests: forwarding: vxlan_bridge_1d_ipv6: fix test failure with br_netfilter enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test generates VXLAN traffic using mausezahn, where the encapsulated inner IPv6 packet has an incorrect payload length set in the IPv6 header. After VXLAN decapsulation, such packets do not pass sanity checks in br_netfilter and are dropped, which causes the test to fail. Fix this by setting the correct IPv6 payload length for the encapsulated packet generated by mausezahn, so that the packet is accepted by br_netfilter. tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh lines 698-706 )"00:03:"$( : Payload length )"3a:"$( : Next header )"04:"$( : Hop limit )"$saddr:"$( : IP saddr )"$daddr:"$( : IP daddr )"80:"$( : ICMPv6.type )"00:"$( : ICMPv6.code )"00:"$( : ICMPv6.checksum ) Data after IPv6 header: • 80: — 1 byte (ICMPv6 type) • 00: — 1 byte (ICMPv6 code) • 00: — 1 byte (ICMPv6 checksum, truncated) Total: 3 bytes → 00:03 is correct. The old value 00:08 did not match the actual payload size. Fixes: b07e9957f220 ("selftests: forwarding: Add VxLAN tests with a VLAN-unaware bridge for IPv6") Signed-off-by: Aleksei Oladko Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260213131907.43351-3-aleksey.oladko@virtuozzo.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh index a603f7b0a08f..e642feeada0e 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh @@ -695,7 +695,7 @@ vxlan_encapped_ping_do() )"6"$( : IP version )"$inner_tos"$( : Traffic class )"0:00:00:"$( : Flow label - )"00:08:"$( : Payload length + )"00:03:"$( : Payload length )"3a:"$( : Next header )"04:"$( : Hop limit )"$saddr:"$( : IP saddr From a8c198d16c64cdf57f481a4cd3e769502802369e Mon Sep 17 00:00:00 2001 From: Aleksei Oladko Date: Fri, 13 Feb 2026 13:19:07 +0000 Subject: [PATCH 42/85] selftests: forwarding: fix pedit tests failure with br_netfilter enabled The tests use the tc pedit action to modify the IPv4 source address ("pedit ex munge ip src set"), but the IP header checksum is not recalculated after the modification. As a result, the modified packet fails sanity checks in br_netfilter after bridging and is dropped, which causes the test to fail. Fix this by ensuring net.bridge.bridge-nf-call-iptables is set to 0 during the test execution. This prevents the bridge from passing L2 traffic to netfilter, bypassing the checksum validation that causes the test failure. Fixes: 92ad3828944e ("selftests: forwarding: Add a test for pedit munge SIP and DIP") Fixes: 226657ba2389 ("selftests: forwarding: Add a forwarding test for pedit munge dsfield") Signed-off-by: Aleksei Oladko Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260213131907.43351-4-aleksey.oladko@virtuozzo.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/pedit_dsfield.sh | 8 ++++++++ tools/testing/selftests/net/forwarding/pedit_ip.sh | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh index af008fbf2725..eb2d8034de9c 100755 --- a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh +++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh @@ -98,12 +98,20 @@ setup_prepare() h1_create h2_create switch_create + + if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + sysctl_set net.bridge.bridge-nf-call-iptables 0 + fi } cleanup() { pre_cleanup + if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + sysctl_restore net.bridge.bridge-nf-call-iptables + fi + switch_destroy h2_destroy h1_destroy diff --git a/tools/testing/selftests/net/forwarding/pedit_ip.sh b/tools/testing/selftests/net/forwarding/pedit_ip.sh index d14efb2d23b2..9235674627ab 100755 --- a/tools/testing/selftests/net/forwarding/pedit_ip.sh +++ b/tools/testing/selftests/net/forwarding/pedit_ip.sh @@ -91,12 +91,20 @@ setup_prepare() h1_create h2_create switch_create + + if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + sysctl_set net.bridge.bridge-nf-call-iptables 0 + fi } cleanup() { pre_cleanup + if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + sysctl_restore net.bridge.bridge-nf-call-iptables + fi + switch_destroy h2_destroy h1_destroy From 07919126ecfc392102555a70016db3e591abcb3d Mon Sep 17 00:00:00 2001 From: Sun Jian Date: Thu, 5 Feb 2026 20:30:17 +0800 Subject: [PATCH 43/85] netfilter: annotate NAT helper hook pointers with __rcu The NAT helper hook pointers are updated and dereferenced under RCU rules, but lack the proper __rcu annotation. This makes sparse report address space mismatches when the hooks are used with rcu_dereference(). Add the missing __rcu annotations to the global hook pointer declarations and definitions in Amanda, FTP, IRC, SNMP and TFTP. No functional change intended. Suggested-by: Florian Westphal Signed-off-by: Sun Jian Signed-off-by: Florian Westphal --- include/linux/netfilter/nf_conntrack_amanda.h | 2 +- include/linux/netfilter/nf_conntrack_ftp.h | 2 +- include/linux/netfilter/nf_conntrack_irc.h | 2 +- include/linux/netfilter/nf_conntrack_snmp.h | 2 +- include/linux/netfilter/nf_conntrack_tftp.h | 2 +- net/netfilter/nf_conntrack_amanda.c | 14 +++++++------- net/netfilter/nf_conntrack_ftp.c | 14 +++++++------- net/netfilter/nf_conntrack_irc.c | 13 +++++++------ net/netfilter/nf_conntrack_snmp.c | 8 ++++---- net/netfilter/nf_conntrack_tftp.c | 7 ++++--- 10 files changed, 34 insertions(+), 32 deletions(-) diff --git a/include/linux/netfilter/nf_conntrack_amanda.h b/include/linux/netfilter/nf_conntrack_amanda.h index 6f0ac896fcc9..dfe89f38d1f7 100644 --- a/include/linux/netfilter/nf_conntrack_amanda.h +++ b/include/linux/netfilter/nf_conntrack_amanda.h @@ -7,7 +7,7 @@ #include #include -extern unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb, +extern unsigned int (__rcu *nf_nat_amanda_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, unsigned int protoff, unsigned int matchoff, diff --git a/include/linux/netfilter/nf_conntrack_ftp.h b/include/linux/netfilter/nf_conntrack_ftp.h index 0e38302820b9..f31292642035 100644 --- a/include/linux/netfilter/nf_conntrack_ftp.h +++ b/include/linux/netfilter/nf_conntrack_ftp.h @@ -26,7 +26,7 @@ struct nf_ct_ftp_master { /* For NAT to hook in when we find a packet which describes what other * connection we should expect. */ -extern unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb, +extern unsigned int (__rcu *nf_nat_ftp_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, enum nf_ct_ftp_type type, unsigned int protoff, diff --git a/include/linux/netfilter/nf_conntrack_irc.h b/include/linux/netfilter/nf_conntrack_irc.h index d02255f721e1..4f3ca5621998 100644 --- a/include/linux/netfilter/nf_conntrack_irc.h +++ b/include/linux/netfilter/nf_conntrack_irc.h @@ -8,7 +8,7 @@ #define IRC_PORT 6667 -extern unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, +extern unsigned int (__rcu *nf_nat_irc_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, unsigned int protoff, unsigned int matchoff, diff --git a/include/linux/netfilter/nf_conntrack_snmp.h b/include/linux/netfilter/nf_conntrack_snmp.h index 87e4f33eb55f..99107e4f5234 100644 --- a/include/linux/netfilter/nf_conntrack_snmp.h +++ b/include/linux/netfilter/nf_conntrack_snmp.h @@ -5,7 +5,7 @@ #include #include -extern int (*nf_nat_snmp_hook)(struct sk_buff *skb, +extern int (__rcu *nf_nat_snmp_hook)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo); diff --git a/include/linux/netfilter/nf_conntrack_tftp.h b/include/linux/netfilter/nf_conntrack_tftp.h index dc4c1b9beac0..1490b68dd7d1 100644 --- a/include/linux/netfilter/nf_conntrack_tftp.h +++ b/include/linux/netfilter/nf_conntrack_tftp.h @@ -19,7 +19,7 @@ struct tftphdr { #define TFTP_OPCODE_ACK 4 #define TFTP_OPCODE_ERROR 5 -extern unsigned int (*nf_nat_tftp_hook)(struct sk_buff *skb, +extern unsigned int (__rcu *nf_nat_tftp_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, struct nf_conntrack_expect *exp); diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index 7be4c35e4795..c0132559f6af 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -37,13 +37,13 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); module_param(ts_algo, charp, 0400); MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); -unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp) - __read_mostly; +unsigned int (__rcu *nf_nat_amanda_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) + __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_amanda_hook); enum amanda_strings { diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 617f744a2e3a..5e00f9123c38 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -43,13 +43,13 @@ module_param_array(ports, ushort, &ports_c, 0400); static bool loose; module_param(loose, bool, 0600); -unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - enum nf_ct_ftp_type type, - unsigned int protoff, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp); +unsigned int (__rcu *nf_nat_ftp_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + enum nf_ct_ftp_type type, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp); EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 5703846bea3b..b8e6d724acd1 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -30,12 +30,13 @@ static unsigned int dcc_timeout __read_mostly = 300; static char *irc_buffer; static DEFINE_SPINLOCK(irc_buffer_lock); -unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp) __read_mostly; +unsigned int (__rcu *nf_nat_irc_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) + __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_irc_hook); #define HELPER_NAME "irc" diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c index daacf2023fa5..387dd6e58f88 100644 --- a/net/netfilter/nf_conntrack_snmp.c +++ b/net/netfilter/nf_conntrack_snmp.c @@ -25,10 +25,10 @@ static unsigned int timeout __read_mostly = 30; module_param(timeout, uint, 0400); MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); -int (*nf_nat_snmp_hook)(struct sk_buff *skb, - unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo); +int (__rcu *nf_nat_snmp_hook)(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); EXPORT_SYMBOL_GPL(nf_nat_snmp_hook); static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff, diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 80ee53f29f68..89e9914e5d03 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -32,9 +32,10 @@ static unsigned int ports_c; module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "Port numbers of TFTP servers"); -unsigned int (*nf_nat_tftp_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conntrack_expect *exp) __read_mostly; +unsigned int (__rcu *nf_nat_tftp_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_expect *exp) + __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_tftp_hook); static int tftp_help(struct sk_buff *skb, From 779c60a5190c42689534172f4b49e927c9959e4e Mon Sep 17 00:00:00 2001 From: Brian Witte Date: Wed, 4 Feb 2026 14:26:37 -0600 Subject: [PATCH 44/85] netfilter: nft_counter: serialize reset with spinlock Add a global static spinlock to serialize counter fetch+reset operations, preventing concurrent dump-and-reset from underrunning values. The lock is taken before fetching the total so that two parallel resets cannot both read the same counter values and then both subtract them. A global lock is used for simplicity since resets are infrequent. If this becomes a bottleneck, it can be replaced with a per-net lock later. Fixes: bd662c4218f9 ("netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests") Fixes: 3d483faa6663 ("netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests") Fixes: 3cb03edb4de3 ("netfilter: nf_tables: Add locking for NFT_MSG_GETRULE_RESET requests") Suggested-by: Florian Westphal Signed-off-by: Brian Witte Signed-off-by: Florian Westphal --- net/netfilter/nft_counter.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 0d70325280cc..169ae93688bc 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -32,6 +32,9 @@ struct nft_counter_percpu_priv { static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync); +/* control plane only: sync fetch+reset */ +static DEFINE_SPINLOCK(nft_counter_lock); + static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -148,13 +151,25 @@ static void nft_counter_fetch(struct nft_counter_percpu_priv *priv, } } +static void nft_counter_fetch_and_reset(struct nft_counter_percpu_priv *priv, + struct nft_counter_tot *total) +{ + spin_lock(&nft_counter_lock); + nft_counter_fetch(priv, total); + nft_counter_reset(priv, total); + spin_unlock(&nft_counter_lock); +} + static int nft_counter_do_dump(struct sk_buff *skb, struct nft_counter_percpu_priv *priv, bool reset) { struct nft_counter_tot total; - nft_counter_fetch(priv, &total); + if (unlikely(reset)) + nft_counter_fetch_and_reset(priv, &total); + else + nft_counter_fetch(priv, &total); if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), NFTA_COUNTER_PAD) || @@ -162,9 +177,6 @@ static int nft_counter_do_dump(struct sk_buff *skb, NFTA_COUNTER_PAD)) goto nla_put_failure; - if (reset) - nft_counter_reset(priv, &total); - return 0; nla_put_failure: From 30c4d7fb59ac4c8d7fa7937df11eed10b368fa11 Mon Sep 17 00:00:00 2001 From: Brian Witte Date: Wed, 4 Feb 2026 14:26:38 -0600 Subject: [PATCH 45/85] netfilter: nft_quota: use atomic64_xchg for reset Use atomic64_xchg() to atomically read and zero the consumed value on reset, which is simpler than the previous read+sub pattern and doesn't require lock serialization. Fixes: bd662c4218f9 ("netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests") Fixes: 3d483faa6663 ("netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests") Fixes: 3cb03edb4de3 ("netfilter: nf_tables: Add locking for NFT_MSG_GETRULE_RESET requests") Suggested-by: Pablo Neira Ayuso Signed-off-by: Brian Witte Signed-off-by: Florian Westphal --- net/netfilter/nft_quota.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index df0798da2329..cb6c0e04ff67 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -140,11 +140,16 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, u64 consumed, consumed_cap, quota; u32 flags = priv->flags; - /* Since we inconditionally increment consumed quota for each packet + /* Since we unconditionally increment consumed quota for each packet * that we see, don't go over the quota boundary in what we send to * userspace. */ - consumed = atomic64_read(priv->consumed); + if (reset) { + consumed = atomic64_xchg(priv->consumed, 0); + clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags); + } else { + consumed = atomic64_read(priv->consumed); + } quota = atomic64_read(&priv->quota); if (consumed >= quota) { consumed_cap = quota; @@ -160,10 +165,6 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags))) goto nla_put_failure; - if (reset) { - atomic64_sub(consumed, priv->consumed); - clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags); - } return 0; nla_put_failure: From 7f261bb906bf527c4a6e2a646e2d5f3679f2a8bc Mon Sep 17 00:00:00 2001 From: Brian Witte Date: Wed, 4 Feb 2026 14:26:36 -0600 Subject: [PATCH 46/85] netfilter: nf_tables: revert commit_mutex usage in reset path It causes circular lock dependency between commit_mutex, nfnl_subsys_ipset and nlk_cb_mutex when nft reset, ipset list, and iptables-nft with '-m set' rule run at the same time. Previous patches made it safe to run individual reset handlers concurrently so commit_mutex is no longer required to prevent this. Fixes: bd662c4218f9 ("netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests") Fixes: 3d483faa6663 ("netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests") Fixes: 3cb03edb4de3 ("netfilter: nf_tables: Add locking for NFT_MSG_GETRULE_RESET requests") Link: https://lore.kernel.org/all/aUh_3mVRV8OrGsVo@strlen.de/ Reported-by: Closes: https://syzkaller.appspot.com/bug?extid=ff16b505ec9152e5f448 Signed-off-by: Brian Witte Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 248 ++++++---------------------------- 1 file changed, 42 insertions(+), 206 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1ed034a47bd0..819056ea1ce1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3901,23 +3901,6 @@ done: return skb->len; } -static int nf_tables_dumpreset_rules(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); - int ret; - - /* Mutex is held is to prevent that two concurrent dump-and-reset calls - * do not underrun counters and quotas. The commit_mutex is used for - * the lack a better lock, this is not transaction path. - */ - mutex_lock(&nft_net->commit_mutex); - ret = nf_tables_dump_rules(skb, cb); - mutex_unlock(&nft_net->commit_mutex); - - return ret; -} - static int nf_tables_dump_rules_start(struct netlink_callback *cb) { struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; @@ -3937,18 +3920,12 @@ static int nf_tables_dump_rules_start(struct netlink_callback *cb) return -ENOMEM; } } + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + ctx->reset = true; + return 0; } -static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb) -{ - struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; - - ctx->reset = true; - - return nf_tables_dump_rules_start(cb); -} - static int nf_tables_dump_rules_done(struct netlink_callback *cb) { struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; @@ -4012,6 +3989,8 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, u32 portid = NETLINK_CB(skb).portid; struct net *net = info->net; struct sk_buff *skb2; + bool reset = false; + char *buf; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -4025,46 +4004,15 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } - skb2 = nf_tables_getrule_single(portid, info, nla, false); + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + reset = true; + + skb2 = nf_tables_getrule_single(portid, info, nla, reset); if (IS_ERR(skb2)) return PTR_ERR(skb2); - return nfnetlink_unicast(skb2, net, portid); -} - -static int nf_tables_getrule_reset(struct sk_buff *skb, - const struct nfnl_info *info, - const struct nlattr * const nla[]) -{ - struct nftables_pernet *nft_net = nft_pernet(info->net); - u32 portid = NETLINK_CB(skb).portid; - struct net *net = info->net; - struct sk_buff *skb2; - char *buf; - - if (info->nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start= nf_tables_dumpreset_rules_start, - .dump = nf_tables_dumpreset_rules, - .done = nf_tables_dump_rules_done, - .module = THIS_MODULE, - .data = (void *)nla, - }; - - return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); - } - - if (!try_module_get(THIS_MODULE)) - return -EINVAL; - rcu_read_unlock(); - mutex_lock(&nft_net->commit_mutex); - skb2 = nf_tables_getrule_single(portid, info, nla, true); - mutex_unlock(&nft_net->commit_mutex); - rcu_read_lock(); - module_put(THIS_MODULE); - - if (IS_ERR(skb2)) - return PTR_ERR(skb2); + if (!reset) + return nfnetlink_unicast(skb2, net, portid); buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), @@ -6324,6 +6272,10 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) nla_nest_end(skb, nest); nlmsg_end(skb, nlh); + if (dump_ctx->reset && args.iter.count > args.iter.skip) + audit_log_nft_set_reset(table, cb->seq, + args.iter.count - args.iter.skip); + rcu_read_unlock(); if (args.iter.err && args.iter.err != -EMSGSIZE) @@ -6339,26 +6291,6 @@ nla_put_failure: return -ENOSPC; } -static int nf_tables_dumpreset_set(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); - struct nft_set_dump_ctx *dump_ctx = cb->data; - int ret, skip = cb->args[0]; - - mutex_lock(&nft_net->commit_mutex); - - ret = nf_tables_dump_set(skb, cb); - - if (cb->args[0] > skip) - audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq, - cb->args[0] - skip); - - mutex_unlock(&nft_net->commit_mutex); - - return ret; -} - static int nf_tables_dump_set_start(struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; @@ -6602,8 +6534,13 @@ static int nf_tables_getsetelem(struct sk_buff *skb, { struct netlink_ext_ack *extack = info->extack; struct nft_set_dump_ctx dump_ctx; + int rem, err = 0, nelems = 0; + struct net *net = info->net; struct nlattr *attr; - int rem, err = 0; + bool reset = false; + + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -6613,7 +6550,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb, .module = THIS_MODULE, }; - err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, reset); if (err) return err; @@ -6624,75 +6561,21 @@ static int nf_tables_getsetelem(struct sk_buff *skb, if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; - err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, reset); if (err) return err; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false); - if (err < 0) { - NL_SET_BAD_ATTR(extack, attr); - break; - } - } - - return err; -} - -static int nf_tables_getsetelem_reset(struct sk_buff *skb, - const struct nfnl_info *info, - const struct nlattr * const nla[]) -{ - struct nftables_pernet *nft_net = nft_pernet(info->net); - struct netlink_ext_ack *extack = info->extack; - struct nft_set_dump_ctx dump_ctx; - int rem, err = 0, nelems = 0; - struct nlattr *attr; - - if (info->nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start = nf_tables_dump_set_start, - .dump = nf_tables_dumpreset_set, - .done = nf_tables_dump_set_done, - .module = THIS_MODULE, - }; - - err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); - if (err) - return err; - - c.data = &dump_ctx; - return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); - } - - if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) - return -EINVAL; - - if (!try_module_get(THIS_MODULE)) - return -EINVAL; - rcu_read_unlock(); - mutex_lock(&nft_net->commit_mutex); - rcu_read_lock(); - - err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); - if (err) - goto out_unlock; - - nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true); + err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, reset); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; } nelems++; } - audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); - -out_unlock: - rcu_read_unlock(); - mutex_unlock(&nft_net->commit_mutex); - rcu_read_lock(); - module_put(THIS_MODULE); + if (reset) + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(net), + nelems); return err; } @@ -8564,19 +8447,6 @@ cont: return skb->len; } -static int nf_tables_dumpreset_obj(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); - int ret; - - mutex_lock(&nft_net->commit_mutex); - ret = nf_tables_dump_obj(skb, cb); - mutex_unlock(&nft_net->commit_mutex); - - return ret; -} - static int nf_tables_dump_obj_start(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; @@ -8593,18 +8463,12 @@ static int nf_tables_dump_obj_start(struct netlink_callback *cb) if (nla[NFTA_OBJ_TYPE]) ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + ctx->reset = true; + return 0; } -static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb) -{ - struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; - - ctx->reset = true; - - return nf_tables_dump_obj_start(cb); -} - static int nf_tables_dump_obj_done(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; @@ -8665,7 +8529,10 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; struct sk_buff *skb2; + bool reset = false; + char *buf; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -8679,46 +8546,15 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } - skb2 = nf_tables_getobj_single(portid, info, nla, false); + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + reset = true; + + skb2 = nf_tables_getobj_single(portid, info, nla, reset); if (IS_ERR(skb2)) return PTR_ERR(skb2); - return nfnetlink_unicast(skb2, info->net, portid); -} - -static int nf_tables_getobj_reset(struct sk_buff *skb, - const struct nfnl_info *info, - const struct nlattr * const nla[]) -{ - struct nftables_pernet *nft_net = nft_pernet(info->net); - u32 portid = NETLINK_CB(skb).portid; - struct net *net = info->net; - struct sk_buff *skb2; - char *buf; - - if (info->nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start = nf_tables_dumpreset_obj_start, - .dump = nf_tables_dumpreset_obj, - .done = nf_tables_dump_obj_done, - .module = THIS_MODULE, - .data = (void *)nla, - }; - - return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); - } - - if (!try_module_get(THIS_MODULE)) - return -EINVAL; - rcu_read_unlock(); - mutex_lock(&nft_net->commit_mutex); - skb2 = nf_tables_getobj_single(portid, info, nla, true); - mutex_unlock(&nft_net->commit_mutex); - rcu_read_lock(); - module_put(THIS_MODULE); - - if (IS_ERR(skb2)) - return PTR_ERR(skb2); + if (!reset) + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), @@ -10037,7 +9873,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_rule_policy, }, [NFT_MSG_GETRULE_RESET] = { - .call = nf_tables_getrule_reset, + .call = nf_tables_getrule, .type = NFNL_CB_RCU, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, @@ -10091,7 +9927,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM_RESET] = { - .call = nf_tables_getsetelem_reset, + .call = nf_tables_getsetelem, .type = NFNL_CB_RCU, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, @@ -10137,7 +9973,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { - .call = nf_tables_getobj_reset, + .call = nf_tables_getobj, .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, From a6d28eb8efe96b3e35c92efdf1bfacb0cccf541f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Feb 2026 12:53:09 +0100 Subject: [PATCH 47/85] netfilter: nf_conntrack_h323: don't pass uninitialised l3num value Mihail Milev reports: Error: UNINIT (CWE-457): net/netfilter/nf_conntrack_h323_main.c:1189:2: var_decl: Declaring variable "tuple" without initializer. net/netfilter/nf_conntrack_h323_main.c:1197:2: uninit_use_in_call: Using uninitialized value "tuple.src.l3num" when calling "__nf_ct_expect_find". net/netfilter/nf_conntrack_expect.c:142:2: read_value: Reading value "tuple->src.l3num" when calling "nf_ct_expect_dst_hash". 1195| tuple.dst.protonum = IPPROTO_TCP; 1196| 1197|-> exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); 1198| if (exp && exp->master == ct) 1199| return exp; Switch this to a C99 initialiser and set the l3num value. Fixes: f587de0e2feb ("[NETFILTER]: nf_conntrack/nf_nat: add H.323 helper port") Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_h323_main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 17f1f453d481..a2a0e22ccee1 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1187,13 +1187,13 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, { struct net *net = nf_ct_net(ct); struct nf_conntrack_expect *exp; - struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple tuple = { + .src.l3num = nf_ct_l3num(ct), + .dst.protonum = IPPROTO_TCP, + .dst.u.tcp.port = port, + }; - memset(&tuple.src.u3, 0, sizeof(tuple.src.u3)); - tuple.src.u.tcp.port = 0; memcpy(&tuple.dst.u3, addr, sizeof(tuple.dst.u3)); - tuple.dst.u.tcp.port = port; - tuple.dst.protonum = IPPROTO_TCP; exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); if (exp && exp->master == ct) From 4edd4ba71ce0df015303dba75ea9d20d1a217546 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sat, 14 Feb 2026 15:54:06 +0100 Subject: [PATCH 48/85] include: uapi: netfilter_bridge.h: Cover for musl libc Musl defines its own struct ethhdr and thus defines __UAPI_DEF_ETHHDR to zero. To avoid struct redefinition errors, user space is therefore supposed to include netinet/if_ether.h before (or instead of) linux/if_ether.h. To relieve them from this burden, include the libc header here if not building for kernel space. Reported-by: Alyssa Ross Suggested-by: Florian Westphal Signed-off-by: Phil Sutter Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter_bridge.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h index f6e8d1e05c97..758de72b2764 100644 --- a/include/uapi/linux/netfilter_bridge.h +++ b/include/uapi/linux/netfilter_bridge.h @@ -5,6 +5,10 @@ /* bridge-specific defines for netfilter. */ +#ifndef __KERNEL__ +#include /* for __UAPI_DEF_ETHHDR if defined */ +#endif + #include #include #include From 05cfe9863ef049d98141dc2969eefde72fb07625 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 14 Feb 2026 16:58:49 +0200 Subject: [PATCH 49/85] ipvs: skip ipv6 extension headers for csum checks Protocol checksum validation fails for IPv6 if there are extension headers before the protocol header. iph->len already contains its offset, so use it to fix the problem. Fixes: 2906f66a5682 ("ipvs: SCTP Trasport Loadbalancing Support") Fixes: 0bbdd42b7efa ("IPVS: Extend protocol DNAT/SNAT and state handlers") Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 18 ++++++------------ net/netfilter/ipvs/ip_vs_proto_tcp.c | 21 +++++++-------------- net/netfilter/ipvs/ip_vs_proto_udp.c | 20 +++++++------------- 3 files changed, 20 insertions(+), 39 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 83e452916403..63c78a1f3918 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -10,7 +10,8 @@ #include static int -sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int sctphoff); static int sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, @@ -108,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!sctp_csum_check(cp->af, skb, pp)) + if (!sctp_csum_check(cp->af, skb, pp, sctphoff)) return 0; /* Call application helper if needed */ @@ -156,7 +157,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!sctp_csum_check(cp->af, skb, pp)) + if (!sctp_csum_check(cp->af, skb, pp, sctphoff)) return 0; /* Call application helper if needed */ @@ -185,19 +186,12 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, } static int -sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int sctphoff) { - unsigned int sctphoff; struct sctphdr *sh; __le32 cmp, val; -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - sctphoff = sizeof(struct ipv6hdr); - else -#endif - sctphoff = ip_hdrlen(skb); - sh = (struct sctphdr *)(skb->data + sctphoff); cmp = sh->checksum; val = sctp_compute_cksum(skb, sctphoff); diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index f68a1533ee45..8cc0a8ce6241 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -28,7 +28,8 @@ #include static int -tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int tcphoff); static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, @@ -165,7 +166,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!tcp_csum_check(cp->af, skb, pp)) + if (!tcp_csum_check(cp->af, skb, pp, tcphoff)) return 0; /* Call application helper if needed */ @@ -243,7 +244,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!tcp_csum_check(cp->af, skb, pp)) + if (!tcp_csum_check(cp->af, skb, pp, tcphoff)) return 0; /* @@ -300,17 +301,9 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, static int -tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int tcphoff) { - unsigned int tcphoff; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); @@ -321,7 +314,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - tcphoff, - ipv6_hdr(skb)->nexthdr, + IPPROTO_TCP, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 0f0107c80dd2..f9de632e38cd 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -24,7 +24,8 @@ #include static int -udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int udphoff); static int udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, @@ -154,7 +155,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!udp_csum_check(cp->af, skb, pp)) + if (!udp_csum_check(cp->af, skb, pp, udphoff)) return 0; /* @@ -237,7 +238,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!udp_csum_check(cp->af, skb, pp)) + if (!udp_csum_check(cp->af, skb, pp, udphoff)) return 0; /* @@ -296,17 +297,10 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, static int -udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int udphoff) { struct udphdr _udph, *uh; - unsigned int udphoff; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); if (uh == NULL) @@ -324,7 +318,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - udphoff, - ipv6_hdr(skb)->nexthdr, + IPPROTO_UDP, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); From 8fde939b0206afc1d5846217a01a16b9bc8c7896 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 14 Feb 2026 16:58:50 +0200 Subject: [PATCH 50/85] ipvs: do not keep dest_dst if dev is going down There is race between the netdev notifier ip_vs_dst_event() and the code that caches dst with dev that is going down. As the FIB can be notified for the closed device after our handler finishes, it is possible valid route to be returned and cached resuling in a leaked dev reference until the dest is not removed. To prevent new dest_dst to be attached to dest just after the handler dropped the old one, add a netif_running() check to make sure the notifier handler is not currently running for device that is closing. Fixes: 7a4f0761fce3 ("IPVS: init and cleanup restructuring") Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- net/netfilter/ipvs/ip_vs_xmit.c | 46 ++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index f861d116cc33..4389bfe3050d 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -294,6 +294,12 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, return true; } +/* rt has device that is down */ +static bool rt_dev_is_down(const struct net_device *dev) +{ + return dev && !netif_running(dev); +} + /* Get route to destination or remote server */ static int __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, @@ -309,9 +315,11 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); - if (likely(dest_dst)) + if (likely(dest_dst)) { rt = dst_rtable(dest_dst->dst_cache); - else { + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; + } else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); if (!dest_dst) { @@ -327,14 +335,22 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, ip_vs_dest_dst_free(dest_dst); goto err_unreach; } - __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + /* It is forbidden to attach dest->dest_dst if + * device is going down. + */ + if (!rt_dev_is_down(dst_dev_rcu(&rt->dst))) + __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + else + noref = 0; spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", &dest->addr.ip, &dest_dst->dst_saddr.ip, rcuref_read(&rt->dst.__rcuref)); + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; + if (!noref) + ip_vs_dest_dst_free(dest_dst); } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.ip; } else { noref = 0; @@ -471,9 +487,11 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); - if (likely(dest_dst)) + if (likely(dest_dst)) { rt = dst_rt6_info(dest_dst->dst_cache); - else { + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; + } else { u32 cookie; dest_dst = ip_vs_dest_dst_alloc(); @@ -494,14 +512,22 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, } rt = dst_rt6_info(dst); cookie = rt6_get_cookie(rt); - __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + /* It is forbidden to attach dest->dest_dst if + * device is going down. + */ + if (!rt_dev_is_down(dst_dev_rcu(&rt->dst))) + __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + else + noref = 0; spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", &dest->addr.in6, &dest_dst->dst_saddr.in6, rcuref_read(&rt->dst.__rcuref)); + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; + if (!noref) + ip_vs_dest_dst_free(dest_dst); } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.in6; } else { noref = 0; dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, From 008e7a7c293b30bc43e4368dac6ea3808b75a572 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 17 Feb 2026 12:56:39 +0100 Subject: [PATCH 51/85] net: remove WARN_ON_ONCE when accessing forward path array Although unlikely, recent support for IPIP tunnels increases chances of reaching this WARN_ON_ONCE if userspace manages to build a sufficiently long forward path. Remove it. Fixes: ddb94eafab8b ("net: resolve forwarding path from virtual netdevice and HW destination address") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 96f89eb797e8..096b3ff13f6b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -744,7 +744,7 @@ static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) { int k = stack->num_paths++; - if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) + if (k >= NET_DEVICE_PATH_STACK_MAX) return NULL; return &stack->path[k]; From 71e99ee20fc3f662555118cf1159443250647533 Mon Sep 17 00:00:00 2001 From: Inseo An Date: Tue, 17 Feb 2026 21:14:40 +0900 Subject: [PATCH 52/85] netfilter: nf_tables: fix use-after-free in nf_tables_addchain() nf_tables_addchain() publishes the chain to table->chains via list_add_tail_rcu() (in nft_chain_add()) before registering hooks. If nf_tables_register_hook() then fails, the error path calls nft_chain_del() (list_del_rcu()) followed by nf_tables_chain_destroy() with no RCU grace period in between. This creates two use-after-free conditions: 1) Control-plane: nf_tables_dump_chains() traverses table->chains under rcu_read_lock(). A concurrent dump can still be walking the chain when the error path frees it. 2) Packet path: for NFPROTO_INET, nf_register_net_hook() briefly installs the IPv4 hook before IPv6 registration fails. Packets entering nft_do_chain() via the transient IPv4 hook can still be dereferencing chain->blob_gen_X when the error path frees the chain. Add synchronize_rcu() between nft_chain_del() and the chain destroy so that all RCU readers -- both dump threads and in-flight packet evaluation -- have finished before the chain is freed. Fixes: 91c7b38dc9f0 ("netfilter: nf_tables: use new transaction infrastructure to handle chain") Signed-off-by: Inseo An Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 819056ea1ce1..0c5a4855b97d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2823,6 +2823,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy, err_register_hook: nft_chain_del(chain); + synchronize_rcu(); err_chain_add: nft_trans_destroy(trans); err_trans: From 452a3eee22c57a5786ae6db5c97f3b0ec13bb3b7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Feb 2026 10:22:02 +0000 Subject: [PATCH 53/85] ipv6: fix a race in ip6_sock_set_v6only() It is unlikely that this function will be ever called with isk->inet_num being not zero. Perform the check on isk->inet_num inside the locked section for complete safety. Fixes: 9b115749acb24 ("ipv6: add ip6_sock_set_v6only") Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Reviewed-by: Fernando Fernandez Mancera Link: https://patch.msgid.link/20260216102202.3343588-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index cc56e09525d0..53c5056508be 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1213,12 +1213,15 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, static inline int ip6_sock_set_v6only(struct sock *sk) { - if (inet_sk(sk)->inet_num) - return -EINVAL; + int ret = 0; + lock_sock(sk); - sk->sk_ipv6only = true; + if (inet_sk(sk)->inet_num) + ret = -EINVAL; + else + sk->sk_ipv6only = true; release_sock(sk); - return 0; + return ret; } static inline void ip6_sock_set_recverr(struct sock *sk) From 6e980df452169f82674f2e650079c1fe0aee343d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Feb 2026 11:54:54 +0100 Subject: [PATCH 54/85] net: psp: select CONFIG_SKB_EXTENSIONS psp now uses skb extensions, failing to build when that is disabled: In file included from include/net/psp.h:7, from net/psp/psp_sock.c:9: include/net/psp/functions.h: In function '__psp_skb_coalesce_diff': include/net/psp/functions.h:60:13: error: implicit declaration of function 'skb_ext_find'; did you mean 'skb_ext_copy'? [-Wimplicit-function-declaration] 60 | a = skb_ext_find(one, SKB_EXT_PSP); | ^~~~~~~~~~~~ | skb_ext_copy include/net/psp/functions.h:60:31: error: 'SKB_EXT_PSP' undeclared (first use in this function) 60 | a = skb_ext_find(one, SKB_EXT_PSP); | ^~~~~~~~~~~ include/net/psp/functions.h:60:31: note: each undeclared identifier is reported only once for each function it appears in include/net/psp/functions.h: In function '__psp_sk_rx_policy_check': include/net/psp/functions.h:94:53: error: 'SKB_EXT_PSP' undeclared (first use in this function) 94 | struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); | ^~~~~~~~~~~ net/psp/psp_sock.c: In function 'psp_sock_recv_queue_check': net/psp/psp_sock.c:164:41: error: 'SKB_EXT_PSP' undeclared (first use in this function) 164 | pse = skb_ext_find(skb, SKB_EXT_PSP); | ^~~~~~~~~~~ Select the Kconfig symbol as we do from its other users. Fixes: 6b46ca260e22 ("net: psp: add socket security association code") Signed-off-by: Arnd Bergmann Reviewed-by: Simon Horman Reviewed-by: Daniel Zahka Link: https://patch.msgid.link/20260216105500.2382181-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- net/psp/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/psp/Kconfig b/net/psp/Kconfig index 371e8771f3bd..84d6b0f25460 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -6,6 +6,7 @@ config INET_PSP bool "PSP Security Protocol support" depends on INET select SKB_DECRYPTED + select SKB_EXTENSIONS select SOCK_VALIDATE_XMIT help Enable kernel support for the PSP Security Protocol (PSP). From 0943404b1f3b178e1e54386dadcbf4f2729c7762 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Feb 2026 19:36:53 +0000 Subject: [PATCH 55/85] net: do not delay zero-copy skbs in skb_attempt_defer_free() After the blamed commit, TCP tx zero copy notifications could be arbitrarily delayed and cause regressions in applications waiting for them. Signed-off-by: Eric Dumazet Fixes: e20dfbad8aab ("net: fix napi_consume_skb() with alien skbs") Reviewed-by: Jason Xing Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260216193653.627617-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 699c401a5eae..dc47d3efc72e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7266,10 +7266,15 @@ void skb_attempt_defer_free(struct sk_buff *skb) { struct skb_defer_node *sdn; unsigned long defer_count; - int cpu = skb->alloc_cpu; unsigned int defer_max; bool kick; + int cpu; + /* zero copy notifications should not be delayed. */ + if (skb_zcopy(skb)) + goto nodefer; + + cpu = skb->alloc_cpu; if (cpu == raw_smp_processor_id() || WARN_ON_ONCE(cpu >= nr_cpu_ids) || !cpu_online(cpu)) { From a047497f952831e377564b606dcb74a7cb309384 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Mon, 16 Feb 2026 20:40:07 +0100 Subject: [PATCH 56/85] dpll: zl3073x: Fix ref frequency setting The frequency for an input reference is computed as: frequency = freq_base * freq_mult * freq_ratio_m / freq_ratio_n Before commit 5bc02b190a3fb ("dpll: zl3073x: Cache all reference properties in zl3073x_ref"), zl3073x_dpll_input_pin_frequency_set() explicitly wrote 1 to both the REF_RATIO_M and REF_RATIO_N hardware registers whenever a new frequency was set. This ensured the FEC ratio was always reset to 1:1 alongside the new base/multiplier values. The refactoring in that commit introduced zl3073x_ref_freq_set() to update the cached ref state, but this helper only sets freq_base and freq_mult without resetting freq_ratio_m and freq_ratio_n to 1. Because zl3073x_ref_state_set() uses a compare-and-write strategy, unchanged ratio fields are never written to the hardware. If the device previously had non-unity FEC ratio values, they remain in effect after a frequency change, resulting in an incorrect computed frequency. Explicitly set freq_ratio_m and freq_ratio_n to 1 in zl3073x_ref_freq_set() to restore the original behavior. Fixes: 5bc02b190a3fb ("dpll: zl3073x: Cache all reference properties in zl3073x_ref") Signed-off-by: Ivan Vecera Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260216194007.680416-1-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- drivers/dpll/zl3073x/ref.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dpll/zl3073x/ref.h b/drivers/dpll/zl3073x/ref.h index efc7f59cd9f9..0d8618f5ce8d 100644 --- a/drivers/dpll/zl3073x/ref.h +++ b/drivers/dpll/zl3073x/ref.h @@ -91,6 +91,8 @@ zl3073x_ref_freq_set(struct zl3073x_ref *ref, u32 freq) ref->freq_base = base; ref->freq_mult = mult; + ref->freq_ratio_m = 1; + ref->freq_ratio_n = 1; return 0; } From 458c95de5c55c05b7a149da1509f4f0f68d28625 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Feb 2026 11:55:17 +0100 Subject: [PATCH 57/85] net: dsa: MxL862xx: don't force-enable MAXLINEAR_GPHY The newly added dsa driver attempts to enable the corresponding PHY driver, but that one has additional dependencies that may not be available: WARNING: unmet direct dependencies detected for MAXLINEAR_GPHY Depends on [m]: NETDEVICES [=y] && PHYLIB [=y] && (HWMON [=m] || HWMON [=m]=n [=n]) Selected by [y]: - NET_DSA_MXL862 [=y] && NETDEVICES [=y] && NET_DSA [=y] aarch64-linux-ld: drivers/net/phy/mxl-gpy.o: in function `gpy_probe': mxl-gpy.c:(.text.gpy_probe+0x13c): undefined reference to `devm_hwmon_device_register_with_info' aarch64-linux-ld: drivers/net/phy/mxl-gpy.o: in function `gpy_hwmon_read': mxl-gpy.c:(.text.gpy_hwmon_read+0x48): undefined reference to `polynomial_calc' There is actually no compile-time dependency, as DSA correctly uses the PHY abstractions. Remove the 'select' statement to reduce the complexity. Fixes: 23794bec1cb6 ("net: dsa: add basic initial driver for MxL862xx switches") Signed-off-by: Arnd Bergmann Reviewed-by: Daniel Golle Link: https://patch.msgid.link/20260216105522.2382373-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mxl862xx/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/dsa/mxl862xx/Kconfig b/drivers/net/dsa/mxl862xx/Kconfig index 4db7bab21a71..3e772298cc89 100644 --- a/drivers/net/dsa/mxl862xx/Kconfig +++ b/drivers/net/dsa/mxl862xx/Kconfig @@ -2,7 +2,6 @@ config NET_DSA_MXL862 tristate "MaxLinear MxL862xx" depends on NET_DSA - select MAXLINEAR_GPHY select NET_DSA_TAG_MXL_862XX help This enables support for the MaxLinear MxL862xx switch family. From ad5dfde2a5733aaf652ea3e40c8c5e071e935901 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Feb 2026 10:01:49 +0000 Subject: [PATCH 58/85] ping: annotate data-races in ping_lookup() isk->inet_num, isk->inet_rcv_saddr and sk->sk_bound_dev_if are read locklessly in ping_lookup(). Add READ_ONCE()/WRITE_ONCE() annotations. The race on isk->inet_rcv_saddr is probably coming from IPv6 support, but does not deserve a specific backport. Fixes: dbca1596bbb0 ("ping: convert to RCU lookups, get rid of rwlock") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260216100149.3319315-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ping.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index ebfc5a3d3ad6..71d5e17719de 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -148,7 +148,7 @@ void ping_unhash(struct sock *sk) pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); spin_lock(&ping_table.lock); if (sk_del_node_init_rcu(sk)) { - isk->inet_num = 0; + WRITE_ONCE(isk->inet_num, 0); isk->inet_sport = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } @@ -181,31 +181,35 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) } sk_for_each_rcu(sk, hslot) { + int bound_dev_if; + if (!net_eq(sock_net(sk), net)) continue; isk = inet_sk(sk); pr_debug("iterate\n"); - if (isk->inet_num != ident) + if (READ_ONCE(isk->inet_num) != ident) continue; + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (skb->protocol == htons(ETH_P_IP) && sk->sk_family == AF_INET) { - pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk, - (int) isk->inet_num, &isk->inet_rcv_saddr, - sk->sk_bound_dev_if); + __be32 rcv_saddr = READ_ONCE(isk->inet_rcv_saddr); - if (isk->inet_rcv_saddr && - isk->inet_rcv_saddr != ip_hdr(skb)->daddr) + pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk, + ident, &rcv_saddr, + bound_dev_if); + + if (rcv_saddr && rcv_saddr != ip_hdr(skb)->daddr) continue; #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6) && sk->sk_family == AF_INET6) { pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk, - (int) isk->inet_num, + ident, &sk->sk_v6_rcv_saddr, - sk->sk_bound_dev_if); + bound_dev_if); if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, @@ -216,8 +220,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) continue; } - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif) + if (bound_dev_if && bound_dev_if != dif && + bound_dev_if != sdif) continue; goto exit; @@ -392,7 +396,9 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr_unsized *saddr) if (saddr->sa_family == AF_INET) { struct inet_sock *isk = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) saddr; - isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; + + isk->inet_saddr = addr->sin_addr.s_addr; + WRITE_ONCE(isk->inet_rcv_saddr, addr->sin_addr.s_addr); #if IS_ENABLED(CONFIG_IPV6) } else if (saddr->sa_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr; @@ -850,7 +856,8 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct sk_buff *skb; int copied, err; - pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); + pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, + READ_ONCE(isk->inet_num)); err = -EOPNOTSUPP; if (flags & MSG_OOB) From 9e371b0ba7f5344b071ff813b7dddade8ba7f8c4 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Sat, 14 Feb 2026 18:25:43 +0100 Subject: [PATCH 59/85] ipv6: addrconf: reduce default temp_valid_lft to 2 days This is a recommendation from RFC 8981 and it was intended to be changed by commit 969c54646af0 ("ipv6: Implement draft-ietf-6man-rfc4941bis") but it only changed the sysctl documentation. Signed-off-by: Fernando Fernandez Mancera Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260214172543.5783-1-fmancera@suse.de Signed-off-by: Jakub Kicinski --- include/net/addrconf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 78e8b877fb25..9e96776945e5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -8,7 +8,8 @@ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ -#define TEMP_VALID_LIFETIME (7*86400) /* 1 week */ +/* TEMP_VALID_LIFETIME default value as specified in RFC 8981 3.8 */ +#define TEMP_VALID_LIFETIME (2*86400) /* 2 days */ #define TEMP_PREFERRED_LIFETIME (86400) /* 24 hours */ #define REGEN_MIN_ADVANCE (2) /* 2 seconds */ #define REGEN_MAX_RETRY (3) From 32b70e62034aa72f8414ad4e9122cce7ad418c48 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Feb 2026 19:51:59 -0800 Subject: [PATCH 60/85] selftests: tc_actions: don't dump 2MB of \0 to stdout Since we started running selftests in NIPA we have been seeing tc_actions.sh generate a soft lockup warning on ~20% of the runs. On the pre-netdev foundation setup it was actually a missed irq splat from the console. Now it's either that or a lockup. I initially suspected a socket locking issue since the test is exercising local loopback with act_mirred. After hours of staring at this I noticed in strace that ncat when -o $file is specified _both_ saves the output to the file and still prints it to stdout. Because the file being sent is constructed with: dd conv=sparse status=none if=/dev/zero bs=1M count=2 of=$mirred ^^^^^^^^^ the data printed is all \0. Most terminals don't display nul characters (and neither does vng output capture save them). But QEMU's serial console still has to poke them thru which is very slow and causes the lockup (if the file is >600kB). Replace the '-o $file' with '> $file'. This speeds the test up from 2m20s to 18s on debug kernels, and prevents the warnings. Fixes: ca22da2fbd69 ("act_mirred: use the backlog for nested calls to mirred ingress") Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260214035159.2119699-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/tc_actions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index ea89e558672d..86edbc7e2489 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -223,7 +223,7 @@ mirred_egress_to_ingress_tcp_test() ip_proto icmp \ action drop - ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2 & + ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 > $mirred_e2i_tf2 & local rpid=$! ip vrf exec v$h1 ncat -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1 wait -n $rpid From e3f000f0dee1bfab52e2e61ca6a3835d9e187e35 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Feb 2026 14:25:57 +0000 Subject: [PATCH 61/85] macvlan: observe an RCU grace period in macvlan_common_newlink() error path valis reported that a race condition still happens after my prior patch. macvlan_common_newlink() might have made @dev visible before detecting an error, and its caller will directly call free_netdev(dev). We must respect an RCU period, either in macvlan or the core networking stack. After adding a temporary mdelay(1000) in macvlan_forward_source_one() to open the race window, valis repro was: ip link add p1 type veth peer p2 ip link set address 00:00:00:00:00:20 dev p1 ip link set up dev p1 ip link set up dev p2 ip link add mv0 link p2 type macvlan mode source (ip link add invalid% link p2 type macvlan mode source macaddr add 00:00:00:00:00:20 &) ; sleep 0.5 ; ping -c1 -I p1 1.2.3.4 PING 1.2.3.4 (1.2.3.4): 56 data bytes RTNETLINK answers: Invalid argument BUG: KASAN: slab-use-after-free in macvlan_forward_source (drivers/net/macvlan.c:408 drivers/net/macvlan.c:444) Read of size 8 at addr ffff888016bb89c0 by task e/175 CPU: 1 UID: 1000 PID: 175 Comm: e Not tainted 6.19.0-rc8+ #33 NONE Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:123) print_report (mm/kasan/report.c:379 mm/kasan/report.c:482) ? macvlan_forward_source (drivers/net/macvlan.c:408 drivers/net/macvlan.c:444) kasan_report (mm/kasan/report.c:597) ? macvlan_forward_source (drivers/net/macvlan.c:408 drivers/net/macvlan.c:444) macvlan_forward_source (drivers/net/macvlan.c:408 drivers/net/macvlan.c:444) ? tasklet_init (kernel/softirq.c:983) macvlan_handle_frame (drivers/net/macvlan.c:501) Allocated by task 169: kasan_save_stack (mm/kasan/common.c:58) kasan_save_track (./arch/x86/include/asm/current.h:25 mm/kasan/common.c:70 mm/kasan/common.c:79) __kasan_kmalloc (mm/kasan/common.c:419) __kvmalloc_node_noprof (./include/linux/kasan.h:263 mm/slub.c:5657 mm/slub.c:7140) alloc_netdev_mqs (net/core/dev.c:12012) rtnl_create_link (net/core/rtnetlink.c:3648) rtnl_newlink (net/core/rtnetlink.c:3830 net/core/rtnetlink.c:3957 net/core/rtnetlink.c:4072) rtnetlink_rcv_msg (net/core/rtnetlink.c:6958) netlink_rcv_skb (net/netlink/af_netlink.c:2550) netlink_unicast (net/netlink/af_netlink.c:1319 net/netlink/af_netlink.c:1344) netlink_sendmsg (net/netlink/af_netlink.c:1894) __sys_sendto (net/socket.c:727 net/socket.c:742 net/socket.c:2206) __x64_sys_sendto (net/socket.c:2209) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:131) Freed by task 169: kasan_save_stack (mm/kasan/common.c:58) kasan_save_track (./arch/x86/include/asm/current.h:25 mm/kasan/common.c:70 mm/kasan/common.c:79) kasan_save_free_info (mm/kasan/generic.c:587) __kasan_slab_free (mm/kasan/common.c:287) kfree (mm/slub.c:6674 mm/slub.c:6882) rtnl_newlink (net/core/rtnetlink.c:3845 net/core/rtnetlink.c:3957 net/core/rtnetlink.c:4072) rtnetlink_rcv_msg (net/core/rtnetlink.c:6958) netlink_rcv_skb (net/netlink/af_netlink.c:2550) netlink_unicast (net/netlink/af_netlink.c:1319 net/netlink/af_netlink.c:1344) netlink_sendmsg (net/netlink/af_netlink.c:1894) __sys_sendto (net/socket.c:727 net/socket.c:742 net/socket.c:2206) __x64_sys_sendto (net/socket.c:2209) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:131) Fixes: f8db6475a836 ("macvlan: fix error recovery in macvlan_common_newlink()") Signed-off-by: Eric Dumazet Reported-by: valis Link: https://patch.msgid.link/20260213142557.3059043-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/macvlan.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index c509228be84d..4433b8e95b6a 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1572,6 +1572,11 @@ destroy_macvlan_port: if (create) macvlan_port_destroy(port->dev); } + /* @dev might have been made visible before an error was detected. + * Make sure to observe an RCU grace period before our caller + * (rtnl_newlink()) frees it. + */ + synchronize_net(); return err; } EXPORT_SYMBOL_GPL(macvlan_common_newlink); From ffe68c3766997d82e9ccaf1cdbd47eba269c4aa2 Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Fri, 13 Feb 2026 17:43:39 +0100 Subject: [PATCH 62/85] net: ethernet: ec_bhf: Fix dma_free_coherent() dma handle dma_free_coherent() in error path takes priv->rx_buf.alloc_len as the dma handle. This would lead to improper unmapping of the buffer. Change the dma handle to priv->rx_buf.alloc_phys. Fixes: 6af55ff52b02 ("Driver for Beckhoff CX5020 EtherCAT master module.") Cc: Signed-off-by: Thomas Fourier Link: https://patch.msgid.link/20260213164340.77272-2-fourier.thomas@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ec_bhf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ec_bhf.c b/drivers/net/ethernet/ec_bhf.c index 67275aa4f65b..0c86cbb0313c 100644 --- a/drivers/net/ethernet/ec_bhf.c +++ b/drivers/net/ethernet/ec_bhf.c @@ -423,7 +423,7 @@ static int ec_bhf_open(struct net_device *net_dev) error_rx_free: dma_free_coherent(dev, priv->rx_buf.alloc_len, priv->rx_buf.alloc, - priv->rx_buf.alloc_len); + priv->rx_buf.alloc_phys); out: return err; } From a07c33c6f2fc693bf9c67514fcc15d9d417f390d Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 16 Feb 2026 17:31:47 +0100 Subject: [PATCH 63/85] vsock: document namespace mode sysctls Add documentation for the vsock per-namespace sysctls (`ns_mode` and `child_ns_mode`) to Documentation/admin-guide/sysctl/net.rst. These sysctls were introduced by commit eafb64f40ca4 ("vsock: add netns to vsock core"). Document the two namespace modes (`global` and `local`), the inheritance behavior of `child_ns_mode`, and the restriction preventing local namespaces from setting `child_ns_mode` to `global`. Signed-off-by: Stefano Garzarella Tested-by: Randy Dunlap Acked-by: Randy Dunlap Link: https://patch.msgid.link/20260216163147.236844-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/admin-guide/sysctl/net.rst | 52 +++++++++++++++++++++++- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 19408da2390b..c10530624f1e 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -40,8 +40,8 @@ Table : Subdirectories in /proc/sys/net bridge Bridging rose X.25 PLP layer core General parameter tipc TIPC ethernet Ethernet protocol unix Unix domain sockets - ipv4 IP version 4 x25 X.25 protocol - ipv6 IP version 6 + ipv4 IP version 4 vsock VSOCK sockets + ipv6 IP version 6 x25 X.25 protocol ========= =================== = ========== =================== 1. /proc/sys/net/core - Network core options @@ -551,3 +551,51 @@ originally may have been issued in the correct sequential order. If named_timeout is nonzero, failed topology updates will be placed on a defer queue until another event arrives that clears the error, or until the timeout expires. Value is in milliseconds. + +6. /proc/sys/net/vsock - VSOCK sockets +-------------------------------------- + +VSOCK sockets (AF_VSOCK) provide communication between virtual machines and +their hosts. The behavior of VSOCK sockets in a network namespace is determined +by the namespace's mode (``global`` or ``local``), which controls how CIDs +(Context IDs) are allocated and how sockets interact across namespaces. + +ns_mode +------- + +Read-only. Reports the current namespace's mode, set at namespace creation +and immutable thereafter. + +Values: + + - ``global`` - the namespace shares system-wide CID allocation and + its sockets can reach any VM or socket in any global namespace. + Sockets in this namespace cannot reach sockets in local + namespaces. + - ``local`` - the namespace has private CID allocation and its + sockets can only connect to VMs or sockets within the same + namespace. + +The init_net mode is always ``global``. + +child_ns_mode +------------- + +Controls what mode newly created child namespaces will inherit. At namespace +creation, ``ns_mode`` is inherited from the parent's ``child_ns_mode``. The +initial value matches the namespace's own ``ns_mode``. + +Values: + + - ``global`` - child namespaces will share system-wide CID allocation + and their sockets will be able to reach any VM or socket in any + global namespace. + - ``local`` - child namespaces will have private CID allocation and + their sockets will only be able to connect within their own + namespace. + +Changing ``child_ns_mode`` only affects namespaces created after the change; +it does not modify the current namespace or any existing children. + +A namespace with ``ns_mode`` set to ``local`` cannot change +``child_ns_mode`` to ``global`` (returns ``-EPERM``). From be054cc66f739a9ba615dba9012a07fab8e7dd6f Mon Sep 17 00:00:00 2001 From: Ruitong Liu Date: Sat, 14 Feb 2026 01:59:48 +0800 Subject: [PATCH 64/85] net/sched: act_skbedit: fix divide-by-zero in tcf_skbedit_hash() Commit 38a6f0865796 ("net: sched: support hash selecting tx queue") added SKBEDIT_F_TXQ_SKBHASH support. The inclusive range size is computed as: mapping_mod = queue_mapping_max - queue_mapping + 1; The range size can be 65536 when the requested range covers all possible u16 queue IDs (e.g. queue_mapping=0 and queue_mapping_max=U16_MAX). That value cannot be represented in a u16 and previously wrapped to 0, so tcf_skbedit_hash() could trigger a divide-by-zero: queue_mapping += skb_get_hash(skb) % params->mapping_mod; Compute mapping_mod in a wider type and reject ranges larger than U16_MAX to prevent params->mapping_mod from becoming 0 and avoid the crash. Fixes: 38a6f0865796 ("net: sched: support hash selecting tx queue") Cc: stable@vger.kernel.org # 6.12+ Signed-off-by: Ruitong Liu Link: https://patch.msgid.link/20260213175948.1505257-1-cnitlrt@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/act_skbedit.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 8c1d1554f657..5450c1293eb5 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -126,7 +126,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL; u16 *queue_mapping = NULL, *ptype = NULL; - u16 mapping_mod = 1; + u32 mapping_mod = 1; bool exists = false; int ret = 0, err; u32 index; @@ -194,6 +194,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, } mapping_mod = *queue_mapping_max - *queue_mapping + 1; + if (mapping_mod > U16_MAX) { + NL_SET_ERR_MSG_MOD(extack, "The range of queue_mapping is invalid."); + return -EINVAL; + } flags |= SKBEDIT_F_TXQ_SKBHASH; } if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) From ccd8e87748ad083047d6c8544c5809b7f96cc8df Mon Sep 17 00:00:00 2001 From: Dimitri Daskalakis Date: Sat, 14 Feb 2026 09:19:49 -0800 Subject: [PATCH 65/85] eth: fbnic: Add validation for MTU changes Increasing the MTU beyond the HDS threshold causes the hardware to fragment packets across multiple buffers. If a single-buffer XDP program is attached, the driver will drop all multi-frag frames. While we can't prevent a remote sender from sending non-TCP packets larger than the MTU, this will prevent users from inadvertently breaking new TCP streams. Traditionally, drivers supported XDP with MTU less than 4Kb (packet per page). Fbnic currently prevents attaching XDP when MTU is too high. But it does not prevent increasing MTU after XDP is attached. Fixes: 1b0a3950dbd4 ("eth: fbnic: Add XDP pass, drop, abort support") Signed-off-by: Jakub Kicinski Signed-off-by: Dimitri Daskalakis Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/meta/fbnic/fbnic_netdev.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index 81c9d5c9a4b2..e3ca5fcfabef 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -262,6 +262,23 @@ static int fbnic_set_mac(struct net_device *netdev, void *p) return 0; } +static int fbnic_change_mtu(struct net_device *dev, int new_mtu) +{ + struct fbnic_net *fbn = netdev_priv(dev); + + if (fbnic_check_split_frames(fbn->xdp_prog, new_mtu, fbn->hds_thresh)) { + dev_err(&dev->dev, + "MTU %d is larger than HDS threshold %d in XDP mode\n", + new_mtu, fbn->hds_thresh); + + return -EINVAL; + } + + WRITE_ONCE(dev->mtu, new_mtu); + + return 0; +} + void fbnic_clear_rx_mode(struct fbnic_dev *fbd) { struct net_device *netdev = fbd->netdev; @@ -533,6 +550,7 @@ static const struct net_device_ops fbnic_netdev_ops = { .ndo_start_xmit = fbnic_xmit_frame, .ndo_features_check = fbnic_features_check, .ndo_set_mac_address = fbnic_set_mac, + .ndo_change_mtu = fbnic_change_mtu, .ndo_set_rx_mode = fbnic_set_rx_mode, .ndo_get_stats64 = fbnic_get_stats64, .ndo_bpf = fbnic_bpf, From 570e4549f63293ca4973ef367ff02554f3e3dfc2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Feb 2026 14:29:24 +0000 Subject: [PATCH 66/85] selftests/net: packetdrill: add ipv4-mapped-ipv6 tests Add ipv4-mapped-ipv6 case to ksft_runner.sh before an upcoming TCP fix in this area. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260217142924.1853498-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/packetdrill/ksft_runner.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh index b34e5cf0112e..0a97d5ae3469 100755 --- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -13,6 +13,15 @@ declare -A ip_args=( -D TFO_COOKIE_ZERO=b7c12350a90dc8f5 -D CMSG_LEVEL_IP=SOL_IP -D CMSG_TYPE_RECVERR=IP_RECVERR" + [ipv4-mapped-ipv6]="--ip_version=ipv4-mapped-ipv6 + --local_ip=192.168.0.1 + --gateway_ip=192.168.0.1 + --netmask_ip=255.255.0.0 + --remote_ip=192.0.2.1 + -D TFO_COOKIE=3021b9d889017eeb + -D TFO_COOKIE_ZERO=b7c12350a90dc8f5 + -D CMSG_LEVEL_IP=SOL_IPV6 + -D CMSG_TYPE_RECVERR=IPV6_RECVERR" [ipv6]="--ip_version=ipv6 --mtu=1520 --local_ip=fd3d:0a0b:17d6::1 @@ -45,7 +54,7 @@ fi ip_versions=$(grep -E '^--ip_version=' $script | cut -d '=' -f 2) if [[ -z $ip_versions ]]; then - ip_versions="ipv4 ipv6" + ip_versions="ipv4 ipv6 ipv4-mapped-ipv6" elif [[ ! "$ip_versions" =~ ^ipv[46]$ ]]; then ktap_exit_fail_msg "Too many or unsupported --ip_version: $ip_versions" exit "$KSFT_FAIL" From 034bbd806298e9ba4197dd1587b0348ee30996ea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Feb 2026 14:28:28 +0000 Subject: [PATCH 67/85] icmp: prevent possible overflow in icmp_global_allow() Following expression can overflow if sysctl_icmp_msgs_per_sec is big enough. sysctl_icmp_msgs_per_sec * delta / HZ; Fixes: 4cdf507d5452 ("icmp: add a global rate limitation") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260216142832.3834174-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e216b6df6331..eff8487c0aba 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -250,7 +250,8 @@ bool icmp_global_allow(struct net *net) if (delta < HZ / 50) return false; - incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ; + incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec); + incr = div_u64((u64)incr * delta, HZ); if (!incr) return false; From 87b08913a9ae82082e276d237ece08fc8ee24380 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Feb 2026 14:28:29 +0000 Subject: [PATCH 68/85] inet: move icmp_global_{credit,stamp} to a separate cache line icmp_global_credit was meant to be changed ~1000 times per second, but if an admin sets net.ipv4.icmp_msgs_per_sec to a very high value, icmp_global_credit changes can inflict false sharing to surrounding fields that are read mostly. Move icmp_global_credit and icmp_global_stamp to a separate cacheline aligned group. Fixes: b056b4cd9178 ("icmp: move icmp_global.credit and icmp_global.stamp to per netns storage") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260216142832.3834174-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2dbd46fc4734..8e971c7bf164 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -88,6 +88,12 @@ struct netns_ipv4 { int sysctl_tcp_rcvbuf_low_rtt; __cacheline_group_end(netns_ipv4_read_rx); + /* ICMP rate limiter hot cache line. */ + __cacheline_group_begin_aligned(icmp); + atomic_t icmp_global_credit; + u32 icmp_global_stamp; + __cacheline_group_end_aligned(icmp); + struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; @@ -141,8 +147,7 @@ struct netns_ipv4 { int sysctl_icmp_ratemask; int sysctl_icmp_msgs_per_sec; int sysctl_icmp_msgs_burst; - atomic_t icmp_global_credit; - u32 icmp_global_stamp; + u32 ip_rt_min_pmtu; int ip_rt_mtu_expires; int ip_rt_min_advmss; From 0201eedb69b24a6be9b7c1716287a89c4dde2320 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Feb 2026 14:28:30 +0000 Subject: [PATCH 69/85] ipv6: icmp: remove obsolete code in icmpv6_xrlim_allow() Following part was needed before the blamed commit, because inet_getpeer_v6() second argument was the prefix. /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); Now inet_getpeer_v6() retrieves hosts, we need to remove @tmo adjustement or wider prefixes likes /24 allow 8x more ICMP to be sent for a given ratelimit. As we had this issue for a while, this patch changes net.ipv6.icmp.ratelimit default value from 1000ms to 100ms to avoid potential regressions. Also add a READ_ONCE() when reading net->ipv6.sysctl.icmpv6_time. Fixes: fd0273d7939f ("ipv6: Remove external dependency on rt6i_dst and rt6i_src") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Cc: Martin KaFai Lau Link: https://patch.msgid.link/20260216142832.3834174-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 7 ++++--- net/ipv6/af_inet6.c | 2 +- net/ipv6/icmp.c | 7 +------ 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 28c7e4f5ecf9..6921d8594b84 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -3234,12 +3234,13 @@ enhanced_dad - BOOLEAN =========== ratelimit - INTEGER - Limit the maximal rates for sending ICMPv6 messages. + Limit the maximal rates for sending ICMPv6 messages to a particular + peer. 0 to disable any limiting, - otherwise the minimal space between responses in milliseconds. + otherwise the space between responses in milliseconds. - Default: 1000 + Default: 100 ratemask - list of comma separated ranges For ICMPv6 message types matching the ranges in the ratemask, limit diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 31ba677d0442..69be0a67a140 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -952,7 +952,7 @@ static int __net_init inet6_net_init(struct net *net) int err = 0; net->ipv6.sysctl.bindv6only = 0; - net->ipv6.sysctl.icmpv6_time = 1*HZ; + net->ipv6.sysctl.icmpv6_time = HZ / 10; net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 375ecd779fda..0f41ca6f3d83 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -217,14 +217,9 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else if (dev && (dev->flags & IFF_LOOPBACK)) { res = true; } else { - struct rt6_info *rt = dst_rt6_info(dst); - int tmo = net->ipv6.sysctl.icmpv6_time; + int tmo = READ_ONCE(net->ipv6.sysctl.icmpv6_time); struct inet_peer *peer; - /* Give more bandwidth to wider prefixes. */ - if (rt->rt6i_dst.plen < 128) - tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); } From d8d9ef29886733428470655f2f99bc7493589fcb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Feb 2026 14:28:31 +0000 Subject: [PATCH 70/85] ipv4: icmp: icmpv4_xrlim_allow() optimization if net.ipv4.icmp_ratelimit is zero If net.ipv4.icmp_ratelimit is zero, we do not have to call inet_getpeer_v4() and inet_peer_xrlim_allow(). Both can be very expensive under DDOS. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260216142832.3834174-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index eff8487c0aba..a62b4c4033cc 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -316,23 +316,29 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct dst_entry *dst = &rt->dst; struct inet_peer *peer; struct net_device *dev; + int peer_timeout; bool rc = true; if (!apply_ratelimit) return true; + peer_timeout = READ_ONCE(net->ipv4.sysctl_icmp_ratelimit); + if (!peer_timeout) + goto out; + /* No rate limit on loopback */ rcu_read_lock(); dev = dst_dev_rcu(dst); if (dev && (dev->flags & IFF_LOOPBACK)) - goto out; + goto out_unlock; peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, l3mdev_master_ifindex_rcu(dev)); - rc = inet_peer_xrlim_allow(peer, - READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); -out: + rc = inet_peer_xrlim_allow(peer, peer_timeout); + +out_unlock: rcu_read_unlock(); +out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); else From 9395b1bb1f14ae3fa1e4e2f7988f029cb1c009ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Feb 2026 14:28:32 +0000 Subject: [PATCH 71/85] ipv6: icmp: icmpv6_xrlim_allow() optimization if net.ipv6.icmp.ratelimit is zero If net.ipv6.icmp.ratelimit is zero we do not have to call inet_getpeer_v6() and inet_peer_xrlim_allow(). Both can be very expensive under DDOS. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260216142832.3834174-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/icmp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 0f41ca6f3d83..813d2e9edb8b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -220,8 +220,12 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, int tmo = READ_ONCE(net->ipv6.sysctl.icmpv6_time); struct inet_peer *peer; - peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); - res = inet_peer_xrlim_allow(peer, tmo); + if (!tmo) { + res = true; + } else { + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); + res = inet_peer_xrlim_allow(peer, tmo); + } } rcu_read_unlock(); if (!res) From 0da1dba72616b178a64a762a235a24e9899e495d Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 17 Feb 2026 09:45:25 +0200 Subject: [PATCH 72/85] net/mlx5e: XSK, Fix unintended ICOSQ change XSK wakeup must use the async ICOSQ (with proper locking), as it is not guaranteed to run on the same CPU as the channel. The commit that converted the NAPI trigger path to use the sync ICOSQ incorrectly applied the same change to XSK, causing XSK wakeups to use the sync ICOSQ as well. Revert XSK flows to use the async ICOSQ. XDP program attach/detach triggers channel reopen, while XSK pool enable/disable can happen on-the-fly via NDOs without reopening channels. As a result, xsk_pool state cannot be reliably used at mlx5e_open_channel() time to decide whether an async ICOSQ is needed. Update the async_icosq_needed logic to depend on the presence of an XDP program rather than the xsk_pool, ensuring the async ICOSQ is available when XSK wakeups are enabled. This fixes multiple issues: 1. Illegal synchronize_rcu() in an RCU read- side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq() -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). 2. Hitting a NULL pointer dereference in mlx5e_xsk_wakeup(): [] BUG: kernel NULL pointer dereference, address: 0000000000000240 [] #PF: supervisor read access in kernel mode [] #PF: error_code(0x0000) - not-present page [] PGD 0 P4D 0 [] Oops: Oops: 0000 [#1] SMP [] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none) [] Hardware name: [...] [] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] Reported-by: Daniel Borkmann Closes: https://lore.kernel.org/all/20260123223916.361295-1-daniel@iogearbox.net/ Fixes: 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") Tested-by: Daniel Borkmann Signed-off-by: Tariq Toukan Reviewed-by: Dragos Tatulea Acked-by: Alice Mikityanska Link: https://patch.msgid.link/20260217074525.1761454-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../ethernet/mellanox/mlx5/core/en/xsk/pool.c | 4 ++-- .../ethernet/mellanox/mlx5/core/en/xsk/tx.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 24 +++++++++++++------ 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index a7de3a3efc49..19fce51117c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1103,6 +1103,7 @@ int mlx5e_open_locked(struct net_device *netdev); int mlx5e_close_locked(struct net_device *netdev); void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c); +void mlx5e_trigger_napi_async_icosq(struct mlx5e_channel *c); void mlx5e_trigger_napi_sched(struct napi_struct *napi); int mlx5e_open_channels(struct mlx5e_priv *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c index db776e515b6a..5c5360a25c64 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c @@ -127,7 +127,7 @@ static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv, goto err_remove_pool; mlx5e_activate_xsk(c); - mlx5e_trigger_napi_icosq(c); + mlx5e_trigger_napi_async_icosq(c); /* Don't wait for WQEs, because the newer xdpsock sample doesn't provide * any Fill Ring entries at the setup stage. @@ -179,7 +179,7 @@ static int mlx5e_xsk_disable_locked(struct mlx5e_priv *priv, u16 ix) c = priv->channels.c[ix]; mlx5e_activate_rq(&c->rq); - mlx5e_trigger_napi_icosq(c); + mlx5e_trigger_napi_async_icosq(c); mlx5e_wait_for_min_rx_wqes(&c->rq, MLX5E_RQ_WQES_TIMEOUT); mlx5e_rx_res_xsk_update(priv->rx_res, &priv->channels, ix, false); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c index 9e33156fac8a..8aeab4b21035 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c @@ -34,7 +34,7 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) &c->async_icosq->state)) return 0; - mlx5e_trigger_napi_icosq(c); + mlx5e_trigger_napi_async_icosq(c); } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 4b8084420816..6a7ca4571c19 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2744,16 +2744,26 @@ static int mlx5e_channel_stats_alloc(struct mlx5e_priv *priv, int ix, int cpu) void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c) { + struct mlx5e_icosq *sq = &c->icosq; bool locked; - if (!test_and_set_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state)) - synchronize_net(); + set_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &sq->state); + synchronize_net(); - locked = mlx5e_icosq_sync_lock(&c->icosq); - mlx5e_trigger_irq(&c->icosq); - mlx5e_icosq_sync_unlock(&c->icosq, locked); + locked = mlx5e_icosq_sync_lock(sq); + mlx5e_trigger_irq(sq); + mlx5e_icosq_sync_unlock(sq, locked); - clear_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state); + clear_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &sq->state); +} + +void mlx5e_trigger_napi_async_icosq(struct mlx5e_channel *c) +{ + struct mlx5e_icosq *sq = c->async_icosq; + + spin_lock_bh(&sq->lock); + mlx5e_trigger_irq(sq); + spin_unlock_bh(&sq->lock); } void mlx5e_trigger_napi_sched(struct napi_struct *napi) @@ -2836,7 +2846,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, netif_napi_add_config_locked(netdev, &c->napi, mlx5e_napi_poll, ix); netif_napi_set_irq_locked(&c->napi, irq); - async_icosq_needed = !!xsk_pool || priv->ktls_rx_was_enabled; + async_icosq_needed = !!params->xdp_prog || priv->ktls_rx_was_enabled; err = mlx5e_open_queues(c, params, cparam, async_icosq_needed); if (unlikely(err)) goto err_napi_del; From 45be47bf5d7db0f762a93e9c0ede6cb3c91edf3b Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Mon, 16 Feb 2026 14:33:38 +0530 Subject: [PATCH 73/85] octeontx2-af: Fix default entries mcam entry action As per design, AF should update the default MCAM action only when mcam_index is -1. A bug in the previous patch caused default entries to be changed even when the request was not for them. Fixes: 570ba37898ec ("octeontx2-af: Update RSS algorithm index") Signed-off-by: Hariprasad Kelam Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260216090338.1318976-1-hkelam@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/rvu_npc.c | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index c7c70429eb6c..8658cb2143df 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -1042,32 +1042,35 @@ void rvu_npc_update_flowkey_alg_idx(struct rvu *rvu, u16 pcifunc, int nixlf, rvu_write64(rvu, blkaddr, NPC_AF_MCAMEX_BANKX_ACTION(index, bank), *(u64 *)&action); - /* update the VF flow rule action with the VF default entry action */ - if (mcam_index < 0) - npc_update_vf_flow_entry(rvu, mcam, blkaddr, pcifunc, - *(u64 *)&action); - /* update the action change in default rule */ pfvf = rvu_get_pfvf(rvu, pcifunc); if (pfvf->def_ucast_rule) pfvf->def_ucast_rule->rx_action = action; - index = npc_get_nixlf_mcam_index(mcam, pcifunc, - nixlf, NIXLF_PROMISC_ENTRY); + if (mcam_index < 0) { + /* update the VF flow rule action with the VF default + * entry action + */ + npc_update_vf_flow_entry(rvu, mcam, blkaddr, pcifunc, + *(u64 *)&action); - /* If PF's promiscuous entry is enabled, - * Set RSS action for that entry as well - */ - npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, blkaddr, - alg_idx); + index = npc_get_nixlf_mcam_index(mcam, pcifunc, + nixlf, NIXLF_PROMISC_ENTRY); - index = npc_get_nixlf_mcam_index(mcam, pcifunc, - nixlf, NIXLF_ALLMULTI_ENTRY); - /* If PF's allmulti entry is enabled, - * Set RSS action for that entry as well - */ - npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, blkaddr, - alg_idx); + /* If PF's promiscuous entry is enabled, + * Set RSS action for that entry as well + */ + npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, + blkaddr, alg_idx); + + index = npc_get_nixlf_mcam_index(mcam, pcifunc, + nixlf, NIXLF_ALLMULTI_ENTRY); + /* If PF's allmulti entry is enabled, + * Set RSS action for that entry as well + */ + npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, + blkaddr, alg_idx); + } } void npc_enadis_default_mce_entry(struct rvu *rvu, u16 pcifunc, From 6bf45704a92a128a8bfb2dd4d550c61b257c8f9a Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Mon, 16 Feb 2026 15:26:43 -0700 Subject: [PATCH 74/85] net/rds: Fix NULL pointer dereference in rds_tcp_accept_one Save a local pointer to new_sock->sk and hold a reference before installing callbacks in rds_tcp_accept_one. After rds_tcp_set_callbacks() or rds_tcp_reset_callbacks(), tc->t_sock is set to new_sock which may race with the shutdown path. A concurrent rds_tcp_conn_path_shutdown() may call sock_release(), which sets new_sock->sk = NULL and may eventually free sk when the refcount reaches zero. Subsequent accesses to new_sock->sk->sk_state would dereference NULL, causing the crash. The fix saves a local sk pointer before callbacks are installed so that sk_state can be accessed safely even after new_sock->sk is nulled, and uses sock_hold()/sock_put() to ensure sk itself remains valid for the duration. Fixes: 826c1004d4ae ("net/rds: rds_tcp_conn_path_shutdown must not discard messages") Reported-by: syzbot+96046021045ffe6d7709@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=96046021045ffe6d7709 Signed-off-by: Allison Henderson Link: https://patch.msgid.link/20260216222643.2391390-1-achender@kernel.org Signed-off-by: Paolo Abeni --- net/rds/tcp_listen.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 6fb5c928b8fd..b4ab68a1da6d 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -177,6 +177,7 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn) struct rds_tcp_connection *rs_tcp = NULL; int conn_state; struct rds_conn_path *cp; + struct sock *sk; struct in6_addr *my_addr, *peer_addr; #if !IS_ENABLED(CONFIG_IPV6) struct in6_addr saddr, daddr; @@ -298,6 +299,17 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn) rds_conn_path_drop(cp, 0); goto rst_nsk; } + /* Save a local pointer to sk and hold a reference before setting + * callbacks. Once callbacks are set, a concurrent + * rds_tcp_conn_path_shutdown() may call sock_release(), which + * sets new_sock->sk to NULL and drops a reference on sk. + * The local pointer lets us safely access sk_state below even + * if new_sock->sk has been nulled, and sock_hold() keeps sk + * itself valid until we are done. + */ + sk = new_sock->sk; + sock_hold(sk); + if (rs_tcp->t_sock) { /* Duelling SYN has been handled in rds_tcp_accept_one() */ rds_tcp_reset_callbacks(new_sock, cp); @@ -316,13 +328,15 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn) * knowing that "rds_tcp_conn_path_shutdown" will * dequeue pending messages. */ - if (new_sock->sk->sk_state == TCP_CLOSE_WAIT || - new_sock->sk->sk_state == TCP_LAST_ACK || - new_sock->sk->sk_state == TCP_CLOSE) + if (READ_ONCE(sk->sk_state) == TCP_CLOSE_WAIT || + READ_ONCE(sk->sk_state) == TCP_LAST_ACK || + READ_ONCE(sk->sk_state) == TCP_CLOSE) rds_conn_path_drop(cp, 0); else queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0); + sock_put(sk); + new_sock = NULL; ret = 0; if (conn->c_npaths == 0) From 7b821da55b3f88c1703ff2c2074d182295a84f6b Mon Sep 17 00:00:00 2001 From: Tabrez Ahmed Date: Tue, 17 Feb 2026 19:23:49 +0530 Subject: [PATCH 75/85] rds: tcp: fix uninit-value in __inet_bind KMSAN reported an uninit-value access in __inet_bind() when binding an RDS TCP socket. The uninitialized memory originates from rds_tcp_conn_alloc(), which uses kmem_cache_alloc() to allocate the rds_tcp_connection structure. Specifically, the field 't_client_port_group' is incremented in rds_tcp_conn_path_connect() without being initialized first: if (++tc->t_client_port_group >= port_groups) Since kmem_cache_alloc() does not zero the memory, this field contains garbage, leading to the KMSAN report. Fix this by using kmem_cache_zalloc() to ensure the structure is zero-initialized upon allocation. Reported-by: syzbot+aae646f09192f72a68dc@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=aae646f09192f72a68dc Tested-by: syzbot+aae646f09192f72a68dc@syzkaller.appspotmail.com Fixes: a20a6992558f ("net/rds: Encode cp_index in TCP source port") Signed-off-by: Tabrez Ahmed Reviewed-by: Charalampos Mitrodimas Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20260217135350.33641-1-tabreztalks@gmail.com Signed-off-by: Paolo Abeni --- net/rds/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 45484a93d75f..04f310255692 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -373,7 +373,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) int ret = 0; for (i = 0; i < RDS_MPATH_WORKERS; i++) { - tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); + tc = kmem_cache_zalloc(rds_tcp_conn_slab, gfp); if (!tc) { ret = -ENOMEM; goto fail; From e977fcb3a318b53b47f23b44ac237fceb1b731fe Mon Sep 17 00:00:00 2001 From: Dimitri Daskalakis Date: Tue, 17 Feb 2026 19:06:20 -0800 Subject: [PATCH 76/85] eth: fbnic: Advertise supported XDP features. Drivers are supposed to advertise the XDP features they support. This was missed while adding XDP support. Before: $ ynl --family netdev --dump dev-get ... {'ifindex': 3, 'xdp-features': set(), 'xdp-rx-metadata-features': set(), 'xsk-features': set()}, ... After: $ ynl --family netdev --dump dev-get ... {'ifindex': 3, 'xdp-features': {'basic', 'rx-sg'}, 'xdp-rx-metadata-features': set(), 'xsk-features': set()}, ... Fixes: 168deb7b31b2 ("eth: fbnic: Add support for XDP_TX action") Signed-off-by: Jakub Kicinski Signed-off-by: Dimitri Daskalakis Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260218030620.3329608-1-dimitri.daskalakis1@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_netdev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index e3ca5fcfabef..b4b396ca9bce 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -805,6 +805,8 @@ struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd) netdev->hw_enc_features |= netdev->features; netdev->features |= NETIF_F_NTUPLE; + netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_RX_SG; + netdev->min_mtu = IPV6_MIN_MTU; netdev->max_mtu = FBNIC_MAX_JUMBO_FRAME_SIZE - ETH_HLEN; From 604530085b2ef484843c723a105b6fd3218b4710 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 18 Feb 2026 10:57:55 +0530 Subject: [PATCH 77/85] bnge: fix reserving resources from FW HWRM_FUNC_CFG is used to reserve resources, whereas HWRM_FUNC_QCFG is intended for querying resource information from the firmware. Since __bnge_hwrm_reserve_pf_rings() reserves resources for a specific PF, the command type should be HWRM_FUNC_CFG. Fixes: 627c67f038d2 ("bng_en: Add resource management support") Signed-off-by: Vikas Gupta Reviewed-by: Bhargava Chenna Marreddy Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260218052755.4097468-1-vikas.gupta@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnge/bnge_hwrm_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm_lib.c b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm_lib.c index 84c90a957719..91a4ef9e3150 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm_lib.c +++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm_lib.c @@ -442,7 +442,7 @@ __bnge_hwrm_reserve_pf_rings(struct bnge_dev *bd, struct bnge_hw_rings *hwr) struct hwrm_func_cfg_input *req; u32 enables = 0; - if (bnge_hwrm_req_init(bd, req, HWRM_FUNC_QCFG)) + if (bnge_hwrm_req_init(bd, req, HWRM_FUNC_CFG)) return NULL; req->fid = cpu_to_le16(0xffff); From e6834a4c474697df23ab9948fd3577b26bf48656 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 18 Feb 2026 06:09:19 +0000 Subject: [PATCH 78/85] bonding: alb: fix UAF in rlb_arp_recv during bond up/down The ALB RX path may access rx_hashtbl concurrently with bond teardown. During rapid bond up/down cycles, rlb_deinitialize() frees rx_hashtbl while RX handlers are still running, leading to a null pointer dereference detected by KASAN. However, the root cause is that rlb_arp_recv() can still be accessed after setting recv_probe to NULL, which is actually a use-after-free (UAF) issue. That is the reason for using the referenced commit in the Fixes tag. [ 214.174138] Oops: general protection fault, probably for non-canonical address 0xdffffc000000001d: 0000 [#1] SMP KASAN PTI [ 214.186478] KASAN: null-ptr-deref in range [0x00000000000000e8-0x00000000000000ef] [ 214.194933] CPU: 30 UID: 0 PID: 2375 Comm: ping Kdump: loaded Not tainted 6.19.0-rc8+ #2 PREEMPT(voluntary) [ 214.205907] Hardware name: Dell Inc. PowerEdge R730/0WCJNT, BIOS 2.14.0 01/14/2022 [ 214.214357] RIP: 0010:rlb_arp_recv+0x505/0xab0 [bonding] [ 214.220320] Code: 0f 85 2b 05 00 00 48 b8 00 00 00 00 00 fc ff df 40 0f b6 ed 48 c1 e5 06 49 03 ad 78 01 00 00 48 8d 7d 28 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 06 0f 8e 12 05 00 00 80 7d 28 00 0f 84 8c 00 [ 214.241280] RSP: 0018:ffffc900073d8870 EFLAGS: 00010206 [ 214.247116] RAX: dffffc0000000000 RBX: ffff888168556822 RCX: ffff88816855681e [ 214.255082] RDX: 000000000000001d RSI: dffffc0000000000 RDI: 00000000000000e8 [ 214.263048] RBP: 00000000000000c0 R08: 0000000000000002 R09: ffffed11192021c8 [ 214.271013] R10: ffff8888c9010e43 R11: 0000000000000001 R12: 1ffff92000e7b119 [ 214.278978] R13: ffff8888c9010e00 R14: ffff888168556822 R15: ffff888168556810 [ 214.286943] FS: 00007f85d2d9cb80(0000) GS:ffff88886ccb3000(0000) knlGS:0000000000000000 [ 214.295966] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 214.302380] CR2: 00007f0d047b5e34 CR3: 00000008a1c2e002 CR4: 00000000001726f0 [ 214.310347] Call Trace: [ 214.313070] [ 214.315318] ? __pfx_rlb_arp_recv+0x10/0x10 [bonding] [ 214.320975] bond_handle_frame+0x166/0xb60 [bonding] [ 214.326537] ? __pfx_bond_handle_frame+0x10/0x10 [bonding] [ 214.332680] __netif_receive_skb_core.constprop.0+0x576/0x2710 [ 214.339199] ? __pfx_arp_process+0x10/0x10 [ 214.343775] ? sched_balance_find_src_group+0x98/0x630 [ 214.349513] ? __pfx___netif_receive_skb_core.constprop.0+0x10/0x10 [ 214.356513] ? arp_rcv+0x307/0x690 [ 214.360311] ? __pfx_arp_rcv+0x10/0x10 [ 214.364499] ? __lock_acquire+0x58c/0xbd0 [ 214.368975] __netif_receive_skb_one_core+0xae/0x1b0 [ 214.374518] ? __pfx___netif_receive_skb_one_core+0x10/0x10 [ 214.380743] ? lock_acquire+0x10b/0x140 [ 214.385026] process_backlog+0x3f1/0x13a0 [ 214.389502] ? process_backlog+0x3aa/0x13a0 [ 214.394174] __napi_poll.constprop.0+0x9f/0x370 [ 214.399233] net_rx_action+0x8c1/0xe60 [ 214.403423] ? __pfx_net_rx_action+0x10/0x10 [ 214.408193] ? lock_acquire.part.0+0xbd/0x260 [ 214.413058] ? sched_clock_cpu+0x6c/0x540 [ 214.417540] ? mark_held_locks+0x40/0x70 [ 214.421920] handle_softirqs+0x1fd/0x860 [ 214.426302] ? __pfx_handle_softirqs+0x10/0x10 [ 214.431264] ? __neigh_event_send+0x2d6/0xf50 [ 214.436131] do_softirq+0xb1/0xf0 [ 214.439830] The issue is reproducible by repeatedly running ip link set bond0 up/down while receiving ARP messages, where rlb_arp_recv() can race with rlb_deinitialize() and dereference a freed rx_hashtbl entry. Fix this by setting recv_probe to NULL and then calling synchronize_net() to wait for any concurrent RX processing to finish. This ensures that no RX handler can access rx_hashtbl after it is freed in bond_alb_deinitialize(). Reported-by: Liang Li Fixes: 3aba891dde38 ("bonding: move processing of recv handlers into handle_frame()") Reviewed-by: Nikolay Aleksandrov Acked-by: Jay Vosburgh Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20260218060919.101574-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 78cff904cdc3..55a960da42b5 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4343,9 +4343,13 @@ static int bond_close(struct net_device *bond_dev) bond_work_cancel_all(bond); bond->send_peer_notif = 0; + WRITE_ONCE(bond->recv_probe, NULL); + + /* Wait for any in-flight RX handlers */ + synchronize_net(); + if (bond_is_lb(bond)) bond_alb_deinitialize(bond); - bond->recv_probe = NULL; if (BOND_MODE(bond) == BOND_MODE_8023AD && bond->params.broadcast_neighbor) From 47bf2e813817159f4d195be83a9b5a640ee6baec Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 18 Feb 2026 09:28:59 +0200 Subject: [PATCH 79/85] net/mlx5: Fix multiport device check over light SFs Driver is using num_vhca_ports capability to distinguish between multiport master device and multiport slave device. num_vhca_ports is a capability the driver sets according to the MAX num_vhca_ports capability reported by FW. On the other hand, light SFs doesn't set the above capbility. This leads to wrong results whenever light SFs is checking whether he is a multiport master or slave. Therefore, use the MAX capability to distinguish between master and slave devices. Fixes: e71383fb9cd1 ("net/mlx5: Light probe local SFs") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20260218072904.1764634-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e2d067b1e67b..04dcd09f7517 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1282,12 +1282,12 @@ static inline bool mlx5_rl_is_supported(struct mlx5_core_dev *dev) static inline int mlx5_core_is_mp_slave(struct mlx5_core_dev *dev) { return MLX5_CAP_GEN(dev, affiliate_nic_vport_criteria) && - MLX5_CAP_GEN(dev, num_vhca_ports) <= 1; + MLX5_CAP_GEN_MAX(dev, num_vhca_ports) <= 1; } static inline int mlx5_core_is_mp_master(struct mlx5_core_dev *dev) { - return MLX5_CAP_GEN(dev, num_vhca_ports) > 1; + return MLX5_CAP_GEN_MAX(dev, num_vhca_ports) > 1; } static inline int mlx5_core_mp_enabled(struct mlx5_core_dev *dev) From ae3cb71e6c4dbda0c0b7c10475b744377813c7bd Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 18 Feb 2026 09:29:00 +0200 Subject: [PATCH 80/85] net/mlx5e: Fix misidentification of ASO CQE during poll loop The ASO completion poll loop uses usleep_range() which can sleep much longer than requested due to scheduler latency. Under load, we witnessed a 20ms+ delay until the process was rescheduled, causing the jiffies based timeout to expire while the thread is sleeping. The original do-while loop structure (poll, sleep, check timeout) would exit without a final poll when waking after timeout, missing a CQE that arrived during sleep. Instead of the open-coded while loop, use the kernel's read_poll_timeout() which always performs an additional check after the sleep expiration, and is less error-prone. Note: read_poll_timeout() doesn't accept a sleep range, by passing 10 sleep_us the sleep range effectively changes from 2-10 to 3-10 usecs. Fixes: 739cfa34518e ("net/mlx5: Make ASO poll CQ usable in atomic context") Fixes: 7e3fce82d945 ("net/mlx5e: Overcome slow response for first macsec ASO WQE") Signed-off-by: Gal Pressman Reviewed-by: Jianbo Liu Signed-off-by: Tariq Toukan Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20260218072904.1764634-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c | 10 +++------- .../net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 10 +++------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c index 7819fb297280..d5d9146efca6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB // Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#include #include #include "lib/aso.h" #include "en/tc/post_act.h" @@ -115,7 +116,6 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev, struct mlx5e_flow_meters *flow_meters; u8 cir_man, cir_exp, cbs_man, cbs_exp; struct mlx5_aso_wqe *aso_wqe; - unsigned long expires; struct mlx5_aso *aso; u64 rate, burst; u8 ds_cnt; @@ -187,12 +187,8 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev, mlx5_aso_post_wqe(aso, true, &aso_wqe->ctrl); /* With newer FW, the wait for the first ASO WQE is more than 2us, put the wait 10ms. */ - expires = jiffies + msecs_to_jiffies(10); - do { - err = mlx5_aso_poll_cq(aso, true); - if (err) - usleep_range(2, 10); - } while (err && time_is_after_jiffies(expires)); + read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC, + false, aso, true); mutex_unlock(&flow_meters->aso_lock); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 528b04d4de41..641cd3a2cdfa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "en.h" #include "lib/aso.h" @@ -1397,7 +1398,6 @@ static int macsec_aso_query(struct mlx5_core_dev *mdev, struct mlx5e_macsec *mac struct mlx5e_macsec_aso *aso; struct mlx5_aso_wqe *aso_wqe; struct mlx5_aso *maso; - unsigned long expires; int err; aso = &macsec->aso; @@ -1411,12 +1411,8 @@ static int macsec_aso_query(struct mlx5_core_dev *mdev, struct mlx5e_macsec *mac macsec_aso_build_wqe_ctrl_seg(aso, &aso_wqe->aso_ctrl, NULL); mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl); - expires = jiffies + msecs_to_jiffies(10); - do { - err = mlx5_aso_poll_cq(maso, false); - if (err) - usleep_range(2, 10); - } while (err && time_is_after_jiffies(expires)); + read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC, + false, maso, false); if (err) goto err_out; From d451994ebc7d4392610bd4b2ab339b255deb4143 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 18 Feb 2026 09:29:01 +0200 Subject: [PATCH 81/85] net/mlx5: Fix misidentification of write combining CQE during poll loop The write combining completion poll loop uses usleep_range() which can sleep much longer than requested due to scheduler latency. Under load, we witnessed a 20ms+ delay until the process was rescheduled, causing the jiffies based timeout to expire while the thread is sleeping. The original do-while loop structure (poll, sleep, check timeout) would exit without a final poll when waking after timeout, missing a CQE that arrived during sleep. Instead of the open-coded while loop, use the kernel's poll_timeout_us() which always performs an additional check after the sleep expiration, and is less error-prone. Note: poll_timeout_us() doesn't accept a sleep range, by passing 10 sleep_us the sleep range effectively changes from 2-10 to 3-10 usecs. Fixes: d98995b4bf98 ("net/mlx5: Reimplement write combining test") Signed-off-by: Gal Pressman Reviewed-by: Jianbo Liu Signed-off-by: Tariq Toukan Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20260218072904.1764634-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/wc.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wc.c b/drivers/net/ethernet/mellanox/mlx5/core/wc.c index 815a7c97d6b0..04d03be1bb77 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wc.c @@ -2,6 +2,7 @@ // Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. #include +#include #include #include "lib/clock.h" #include "mlx5_core.h" @@ -15,7 +16,7 @@ #define TEST_WC_NUM_WQES 255 #define TEST_WC_LOG_CQ_SZ (order_base_2(TEST_WC_NUM_WQES)) #define TEST_WC_SQ_LOG_WQ_SZ TEST_WC_LOG_CQ_SZ -#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100) +#define TEST_WC_POLLING_MAX_TIME_USEC (100 * USEC_PER_MSEC) struct mlx5_wc_cq { /* data path - accessed per cqe */ @@ -359,7 +360,6 @@ static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq) static void mlx5_core_test_wc(struct mlx5_core_dev *mdev) { unsigned int offset = 0; - unsigned long expires; struct mlx5_wc_sq *sq; int i, err; @@ -389,13 +389,9 @@ static void mlx5_core_test_wc(struct mlx5_core_dev *mdev) mlx5_wc_post_nop(sq, &offset, true); - expires = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES; - do { - err = mlx5_wc_poll_cq(sq); - if (err) - usleep_range(2, 10); - } while (mdev->wc_state == MLX5_WC_STATE_UNINITIALIZED && - time_is_after_jiffies(expires)); + poll_timeout_us(mlx5_wc_poll_cq(sq), + mdev->wc_state != MLX5_WC_STATE_UNINITIALIZED, 10, + TEST_WC_POLLING_MAX_TIME_USEC, false); mlx5_wc_destroy_sq(sq); From 9854b243ce4225328d0b32fdc998d35b6952d3f7 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 18 Feb 2026 09:29:02 +0200 Subject: [PATCH 82/85] net/mlx5e: MACsec, add ASO poll loop in macsec_aso_set_arm_event The macsec_aso_set_arm_event function calls mlx5_aso_poll_cq once without a retry loop. If the CQE is not immediately available after posting the WQE, the function fails unnecessarily. Use read_poll_timeout() to poll 3-10 usecs for CQE, consistent with other ASO polling code paths in the driver. Fixes: 739cfa34518e ("net/mlx5: Make ASO poll CQ usable in atomic context") Signed-off-by: Gal Pressman Reviewed-by: Jianbo Liu Signed-off-by: Tariq Toukan Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20260218072904.1764634-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 641cd3a2cdfa..90b3bc5f9166 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -1386,7 +1386,8 @@ static int macsec_aso_set_arm_event(struct mlx5_core_dev *mdev, struct mlx5e_mac MLX5_ACCESS_ASO_OPC_MOD_MACSEC); macsec_aso_build_ctrl(aso, &aso_wqe->aso_ctrl, in); mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl); - err = mlx5_aso_poll_cq(maso, false); + read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC, + false, maso, false); mutex_unlock(&aso->aso_lock); return err; From 83ac0304a2d77519dae1e54c9713cbe1aedf19c9 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Wed, 18 Feb 2026 09:29:03 +0200 Subject: [PATCH 83/85] net/mlx5e: Fix deadlocks between devlink and netdev instance locks In the mentioned "Fixes" commit, various work tasks triggering devlink health reporter recovery were switched to use netdev_trylock to protect against concurrent tear down of the channels being recovered. But this had the side effect of introducing potential deadlocks because of incorrect lock ordering. The correct lock order is described by the init flow: probe_one -> mlx5_init_one (acquires devlink lock) -> mlx5_init_one_devl_locked -> mlx5_register_device -> mlx5_rescan_drivers_locked -...-> mlx5e_probe -> _mlx5e_probe -> register_netdev (acquires rtnl lock) -> register_netdevice (acquires netdev lock) => devlink lock -> rtnl lock -> netdev lock. But in the current recovery flow, the order is wrong: mlx5e_tx_err_cqe_work (acquires netdev lock) -> mlx5e_reporter_tx_err_cqe -> mlx5e_health_report -> devlink_health_report (acquires devlink lock => boom!) -> devlink_health_reporter_recover -> mlx5e_tx_reporter_recover -> mlx5e_tx_reporter_recover_from_ctx -> mlx5e_tx_reporter_err_cqe_recover The same pattern exists in: mlx5e_reporter_rx_timeout mlx5e_reporter_tx_ptpsq_unhealthy mlx5e_reporter_tx_timeout Fix these by moving the netdev_trylock calls from the work handlers lower in the call stack, in the respective recovery functions, where they are actually necessary. Fixes: 8f7b00307bf1 ("net/mlx5e: Convert mlx5 netdevs to instance locking") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20260218072904.1764634-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en/ptp.c | 14 ----- .../mellanox/mlx5/core/en/reporter_rx.c | 13 +++++ .../mellanox/mlx5/core/en/reporter_tx.c | 52 +++++++++++++++++-- .../net/ethernet/mellanox/mlx5/core/en_main.c | 40 -------------- 4 files changed, 61 insertions(+), 58 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 424f8a2728a3..74660e7fe674 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -457,22 +457,8 @@ static void mlx5e_ptpsq_unhealthy_work(struct work_struct *work) { struct mlx5e_ptpsq *ptpsq = container_of(work, struct mlx5e_ptpsq, report_unhealthy_work); - struct mlx5e_txqsq *sq = &ptpsq->txqsq; - - /* Recovering the PTP SQ means re-enabling NAPI, which requires the - * netdev instance lock. However, SQ closing has to wait for this work - * task to finish while also holding the same lock. So either get the - * lock or find that the SQ is no longer enabled and thus this work is - * not relevant anymore. - */ - while (!netdev_trylock(sq->netdev)) { - if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)) - return; - msleep(20); - } mlx5e_reporter_tx_ptpsq_unhealthy(ptpsq); - netdev_unlock(sq->netdev); } static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index 0686fbdd5a05..6efb626b5506 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Mellanox Technologies. +#include + #include "health.h" #include "params.h" #include "txrx.h" @@ -177,6 +179,16 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx) rq = ctx; priv = rq->priv; + /* Acquire netdev instance lock to synchronize with channel close and + * reopen flows. Either successfully obtain the lock, or detect that + * channels are closing for another reason, making this work no longer + * necessary. + */ + while (!netdev_trylock(rq->netdev)) { + if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state)) + return 0; + msleep(20); + } mutex_lock(&priv->state_lock); eq = rq->cq.mcq.eq; @@ -186,6 +198,7 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx) clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state); mutex_unlock(&priv->state_lock); + netdev_unlock(rq->netdev); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 4adc1adf9897..60ba840e00fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2019 Mellanox Technologies. */ +#include + #include "health.h" #include "en/ptp.h" #include "en/devlink.h" @@ -79,6 +81,18 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) return 0; + /* Recovering queues means re-enabling NAPI, which requires the netdev + * instance lock. However, SQ closing flows have to wait for work tasks + * to finish while also holding the netdev instance lock. So either get + * the lock or find that the SQ is no longer enabled and thus this work + * is not relevant anymore. + */ + while (!netdev_trylock(dev)) { + if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)) + return 0; + msleep(20); + } + err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); if (err) { netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", @@ -114,9 +128,11 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) else mlx5e_trigger_napi_sched(sq->cq.napi); + netdev_unlock(dev); return 0; out: clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); + netdev_unlock(dev); return err; } @@ -137,10 +153,24 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx) sq = to_ctx->sq; eq = sq->cq.mcq.eq; priv = sq->priv; + + /* Recovering the TX queues implies re-enabling NAPI, which requires + * the netdev instance lock. + * However, channel closing flows have to wait for this work to finish + * while holding the same lock. So either get the lock or find that + * channels are being closed for other reason and this work is not + * relevant anymore. + */ + while (!netdev_trylock(sq->netdev)) { + if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state)) + return 0; + msleep(20); + } + err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats); if (!err) { to_ctx->status = 0; /* this sq recovered */ - return err; + goto out; } mutex_lock(&priv->state_lock); @@ -148,7 +178,7 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx) mutex_unlock(&priv->state_lock); if (!err) { to_ctx->status = 1; /* all channels recovered */ - return err; + goto out; } to_ctx->status = err; @@ -156,7 +186,8 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx) netdev_err(priv->netdev, "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n", err); - +out: + netdev_unlock(sq->netdev); return err; } @@ -173,10 +204,22 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) return 0; priv = ptpsq->txqsq.priv; + netdev = priv->netdev; + + /* Recovering the PTP SQ means re-enabling NAPI, which requires the + * netdev instance lock. However, SQ closing has to wait for this work + * task to finish while also holding the same lock. So either get the + * lock or find that the SQ is no longer enabled and thus this work is + * not relevant anymore. + */ + while (!netdev_trylock(netdev)) { + if (!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state)) + return 0; + msleep(20); + } mutex_lock(&priv->state_lock); chs = &priv->channels; - netdev = priv->netdev; carrier_ok = netif_carrier_ok(netdev); netif_carrier_off(netdev); @@ -193,6 +236,7 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) netif_carrier_on(netdev); mutex_unlock(&priv->state_lock); + netdev_unlock(netdev); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 6a7ca4571c19..7eb691c2a1bd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -631,19 +631,7 @@ static void mlx5e_rq_timeout_work(struct work_struct *timeout_work) struct mlx5e_rq, rx_timeout_work); - /* Acquire netdev instance lock to synchronize with channel close and - * reopen flows. Either successfully obtain the lock, or detect that - * channels are closing for another reason, making this work no longer - * necessary. - */ - while (!netdev_trylock(rq->netdev)) { - if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state)) - return; - msleep(20); - } - mlx5e_reporter_rx_timeout(rq); - netdev_unlock(rq->netdev); } static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq) @@ -1952,20 +1940,7 @@ void mlx5e_tx_err_cqe_work(struct work_struct *recover_work) struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq, recover_work); - /* Recovering queues means re-enabling NAPI, which requires the netdev - * instance lock. However, SQ closing flows have to wait for work tasks - * to finish while also holding the netdev instance lock. So either get - * the lock or find that the SQ is no longer enabled and thus this work - * is not relevant anymore. - */ - while (!netdev_trylock(sq->netdev)) { - if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)) - return; - msleep(20); - } - mlx5e_reporter_tx_err_cqe(sq); - netdev_unlock(sq->netdev); } static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode) @@ -5115,19 +5090,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work) struct net_device *netdev = priv->netdev; int i; - /* Recovering the TX queues implies re-enabling NAPI, which requires - * the netdev instance lock. - * However, channel closing flows have to wait for this work to finish - * while holding the same lock. So either get the lock or find that - * channels are being closed for other reason and this work is not - * relevant anymore. - */ - while (!netdev_trylock(netdev)) { - if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state)) - return; - msleep(20); - } - for (i = 0; i < netdev->real_num_tx_queues; i++) { struct netdev_queue *dev_queue = netdev_get_tx_queue(netdev, i); @@ -5140,8 +5102,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work) /* break if tried to reopened channels */ break; } - - netdev_unlock(netdev); } static void mlx5e_tx_timeout(struct net_device *dev, unsigned int txqueue) From 57a94d4b22b0c6cc5d601e6b6238d78fb923d991 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Wed, 18 Feb 2026 09:29:04 +0200 Subject: [PATCH 84/85] net/mlx5e: Use unsigned for mlx5e_get_max_num_channels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The max number of channels is always an unsigned int, use the correct type to fix compilation errors done with strict type checking, e.g.: error: call to ‘__compiletime_assert_1110’ declared with attribute error: min(mlx5e_get_devlink_param_num_doorbells(mdev), mlx5e_get_max_num_channels(mdev)) signedness error Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20260218072904.1764634-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 19fce51117c9..ea2cd1f5d1d0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -180,7 +180,8 @@ static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size) } /* Use this function to get max num channels (rxqs/txqs) only to create netdev */ -static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) +static inline unsigned int +mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) { return is_kdump_kernel() ? MLX5E_MIN_NUM_CHANNELS : From 571dcbeb8e635182bb825ae758399831805693c2 Mon Sep 17 00:00:00 2001 From: Michael Thalmeier Date: Wed, 18 Feb 2026 09:30:00 +0100 Subject: [PATCH 85/85] net: nfc: nci: Fix parameter validation for packet data Since commit 9c328f54741b ("net: nfc: nci: Add parameter validation for packet data") communication with nci nfc chips is not working any more. The mentioned commit tries to fix access of uninitialized data, but failed to understand that in some cases the data packet is of variable length and can therefore not be compared to the maximum packet length given by the sizeof(struct). Fixes: 9c328f54741b ("net: nfc: nci: Add parameter validation for packet data") Cc: stable@vger.kernel.org Signed-off-by: Michael Thalmeier Reported-by: syzbot+740e04c2a93467a0f8c8@syzkaller.appspotmail.com Link: https://patch.msgid.link/20260218083000.301354-1-michael.thalmeier@hale.at Signed-off-by: Jakub Kicinski --- net/nfc/nci/ntf.c | 159 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 141 insertions(+), 18 deletions(-) diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 418b84e2b260..c96512bb8653 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -58,7 +58,7 @@ static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, struct nci_conn_info *conn_info; int i; - if (skb->len < sizeof(struct nci_core_conn_credit_ntf)) + if (skb->len < offsetofend(struct nci_core_conn_credit_ntf, num_entries)) return -EINVAL; ntf = (struct nci_core_conn_credit_ntf *)skb->data; @@ -68,6 +68,10 @@ static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, if (ntf->num_entries > NCI_MAX_NUM_CONN) ntf->num_entries = NCI_MAX_NUM_CONN; + if (skb->len < offsetofend(struct nci_core_conn_credit_ntf, num_entries) + + ntf->num_entries * sizeof(struct conn_credit_entry)) + return -EINVAL; + /* update the credits */ for (i = 0; i < ntf->num_entries; i++) { ntf->conn_entries[i].conn_id = @@ -138,23 +142,48 @@ static int nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, static const __u8 * nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfca_poll *nfca_poll, - const __u8 *data) + const __u8 *data, ssize_t data_len) { + /* Check if we have enough data for sens_res (2 bytes) */ + if (data_len < 2) + return ERR_PTR(-EINVAL); + nfca_poll->sens_res = __le16_to_cpu(*((__le16 *)data)); data += 2; + data_len -= 2; + + /* Check if we have enough data for nfcid1_len (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); nfca_poll->nfcid1_len = min_t(__u8, *data++, NFC_NFCID1_MAXSIZE); + data_len--; pr_debug("sens_res 0x%x, nfcid1_len %d\n", nfca_poll->sens_res, nfca_poll->nfcid1_len); + /* Check if we have enough data for nfcid1 */ + if (data_len < nfca_poll->nfcid1_len) + return ERR_PTR(-EINVAL); + memcpy(nfca_poll->nfcid1, data, nfca_poll->nfcid1_len); data += nfca_poll->nfcid1_len; + data_len -= nfca_poll->nfcid1_len; + + /* Check if we have enough data for sel_res_len (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); nfca_poll->sel_res_len = *data++; + data_len--; + + if (nfca_poll->sel_res_len != 0) { + /* Check if we have enough data for sel_res (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); - if (nfca_poll->sel_res_len != 0) nfca_poll->sel_res = *data++; + } pr_debug("sel_res_len %d, sel_res 0x%x\n", nfca_poll->sel_res_len, @@ -166,12 +195,21 @@ nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, static const __u8 * nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfcb_poll *nfcb_poll, - const __u8 *data) + const __u8 *data, ssize_t data_len) { + /* Check if we have enough data for sensb_res_len (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + nfcb_poll->sensb_res_len = min_t(__u8, *data++, NFC_SENSB_RES_MAXSIZE); + data_len--; pr_debug("sensb_res_len %d\n", nfcb_poll->sensb_res_len); + /* Check if we have enough data for sensb_res */ + if (data_len < nfcb_poll->sensb_res_len) + return ERR_PTR(-EINVAL); + memcpy(nfcb_poll->sensb_res, data, nfcb_poll->sensb_res_len); data += nfcb_poll->sensb_res_len; @@ -181,14 +219,29 @@ nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, static const __u8 * nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfcf_poll *nfcf_poll, - const __u8 *data) + const __u8 *data, ssize_t data_len) { + /* Check if we have enough data for bit_rate (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + nfcf_poll->bit_rate = *data++; + data_len--; + + /* Check if we have enough data for sensf_res_len (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + nfcf_poll->sensf_res_len = min_t(__u8, *data++, NFC_SENSF_RES_MAXSIZE); + data_len--; pr_debug("bit_rate %d, sensf_res_len %d\n", nfcf_poll->bit_rate, nfcf_poll->sensf_res_len); + /* Check if we have enough data for sensf_res */ + if (data_len < nfcf_poll->sensf_res_len) + return ERR_PTR(-EINVAL); + memcpy(nfcf_poll->sensf_res, data, nfcf_poll->sensf_res_len); data += nfcf_poll->sensf_res_len; @@ -198,22 +251,49 @@ nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, static const __u8 * nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfcv_poll *nfcv_poll, - const __u8 *data) + const __u8 *data, ssize_t data_len) { + /* Skip 1 byte (reserved) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + ++data; + data_len--; + + /* Check if we have enough data for dsfid (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + nfcv_poll->dsfid = *data++; + data_len--; + + /* Check if we have enough data for uid (8 bytes) */ + if (data_len < NFC_ISO15693_UID_MAXSIZE) + return ERR_PTR(-EINVAL); + memcpy(nfcv_poll->uid, data, NFC_ISO15693_UID_MAXSIZE); data += NFC_ISO15693_UID_MAXSIZE; + return data; } static const __u8 * nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev, struct rf_tech_specific_params_nfcf_listen *nfcf_listen, - const __u8 *data) + const __u8 *data, ssize_t data_len) { + /* Check if we have enough data for local_nfcid2_len (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + nfcf_listen->local_nfcid2_len = min_t(__u8, *data++, NFC_NFCID2_MAXSIZE); + data_len--; + + /* Check if we have enough data for local_nfcid2 */ + if (data_len < nfcf_listen->local_nfcid2_len) + return ERR_PTR(-EINVAL); + memcpy(nfcf_listen->local_nfcid2, data, nfcf_listen->local_nfcid2_len); data += nfcf_listen->local_nfcid2_len; @@ -364,7 +444,7 @@ static int nci_rf_discover_ntf_packet(struct nci_dev *ndev, const __u8 *data; bool add_target = true; - if (skb->len < sizeof(struct nci_rf_discover_ntf)) + if (skb->len < offsetofend(struct nci_rf_discover_ntf, rf_tech_specific_params_len)) return -EINVAL; data = skb->data; @@ -380,26 +460,42 @@ static int nci_rf_discover_ntf_packet(struct nci_dev *ndev, pr_debug("rf_tech_specific_params_len %d\n", ntf.rf_tech_specific_params_len); + if (skb->len < (data - skb->data) + + ntf.rf_tech_specific_params_len + sizeof(ntf.ntf_type)) + return -EINVAL; + if (ntf.rf_tech_specific_params_len > 0) { switch (ntf.rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfca_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfca_poll), data); + &(ntf.rf_tech_specific_params.nfca_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return PTR_ERR(data); break; case NCI_NFC_B_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcb_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcb_poll), data); + &(ntf.rf_tech_specific_params.nfcb_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return PTR_ERR(data); break; case NCI_NFC_F_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcf_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcf_poll), data); + &(ntf.rf_tech_specific_params.nfcf_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return PTR_ERR(data); break; case NCI_NFC_V_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcv_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcv_poll), data); + &(ntf.rf_tech_specific_params.nfcv_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return PTR_ERR(data); break; default: @@ -596,7 +692,7 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, const __u8 *data; int err = NCI_STATUS_OK; - if (skb->len < sizeof(struct nci_rf_intf_activated_ntf)) + if (skb->len < offsetofend(struct nci_rf_intf_activated_ntf, rf_tech_specific_params_len)) return -EINVAL; data = skb->data; @@ -628,26 +724,41 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, if (ntf.rf_interface == NCI_RF_INTERFACE_NFCEE_DIRECT) goto listen; + if (skb->len < (data - skb->data) + ntf.rf_tech_specific_params_len) + return -EINVAL; + if (ntf.rf_tech_specific_params_len > 0) { switch (ntf.activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfca_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfca_poll), data); + &(ntf.rf_tech_specific_params.nfca_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return -EINVAL; break; case NCI_NFC_B_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcb_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcb_poll), data); + &(ntf.rf_tech_specific_params.nfcb_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return -EINVAL; break; case NCI_NFC_F_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcf_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcf_poll), data); + &(ntf.rf_tech_specific_params.nfcf_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return -EINVAL; break; case NCI_NFC_V_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcv_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcv_poll), data); + &(ntf.rf_tech_specific_params.nfcv_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return -EINVAL; break; case NCI_NFC_A_PASSIVE_LISTEN_MODE: @@ -657,7 +768,9 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, case NCI_NFC_F_PASSIVE_LISTEN_MODE: data = nci_extract_rf_params_nfcf_passive_listen(ndev, &(ntf.rf_tech_specific_params.nfcf_listen), - data); + data, ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return -EINVAL; break; default: @@ -668,6 +781,13 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, } } + if (skb->len < (data - skb->data) + + sizeof(ntf.data_exch_rf_tech_and_mode) + + sizeof(ntf.data_exch_tx_bit_rate) + + sizeof(ntf.data_exch_rx_bit_rate) + + sizeof(ntf.activation_params_len)) + return -EINVAL; + ntf.data_exch_rf_tech_and_mode = *data++; ntf.data_exch_tx_bit_rate = *data++; ntf.data_exch_rx_bit_rate = *data++; @@ -679,6 +799,9 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, pr_debug("data_exch_rx_bit_rate 0x%x\n", ntf.data_exch_rx_bit_rate); pr_debug("activation_params_len %d\n", ntf.activation_params_len); + if (skb->len < (data - skb->data) + ntf.activation_params_len) + return -EINVAL; + if (ntf.activation_params_len > 0) { switch (ntf.rf_interface) { case NCI_RF_INTERFACE_ISO_DEP: