linux/net/core/link_watch.c
Jiayuan Chen 83b67cc9be linkwatch: use __dev_put() in callers to prevent UAF
After linkwatch_do_dev() calls __dev_put() to release the linkwatch
reference, the device refcount may drop to 1. At this point,
netdev_run_todo() can proceed (since linkwatch_sync_dev() sees an
empty list and returns without blocking), wait for the refcount to
become 1 via netdev_wait_allrefs_any(), and then free the device
via kobject_put().

This creates a use-after-free when __linkwatch_run_queue() tries to
call netdev_unlock_ops() on the already-freed device.

Note that adding netdev_lock_ops()/netdev_unlock_ops() pair in
netdev_run_todo() before kobject_put() would not work, because
netdev_lock_ops() is conditional - it only locks when
netdev_need_ops_lock() returns true. If the device doesn't require
ops_lock, linkwatch won't hold any lock, and netdev_run_todo()
acquiring the lock won't provide synchronization.

Fix this by moving __dev_put() from linkwatch_do_dev() to its
callers. The device reference logically pairs with de-listing the
device, so it's reasonable for the caller that did the de-listing
to release it. This allows placing __dev_put() after all device
accesses are complete, preventing UAF.

The bug can be reproduced by adding mdelay(2000) after
linkwatch_do_dev() in __linkwatch_run_queue(), then running:

  ip tuntap add mode tun name tun_test
  ip link set tun_test up
  ip link set tun_test carrier off
  ip link set tun_test carrier on
  sleep 0.5
  ip tuntap del mode tun name tun_test

KASAN report:

 ==================================================================
 BUG: KASAN: use-after-free in netdev_need_ops_lock include/net/netdev_lock.h:33 [inline]
 BUG: KASAN: use-after-free in netdev_unlock_ops include/net/netdev_lock.h:47 [inline]
 BUG: KASAN: use-after-free in __linkwatch_run_queue+0x865/0x8a0 net/core/link_watch.c:245
 Read of size 8 at addr ffff88804de5c008 by task kworker/u32:10/8123

 CPU: 0 UID: 0 PID: 8123 Comm: kworker/u32:10 Not tainted syzkaller #0 PREEMPT(full)
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
 Workqueue: events_unbound linkwatch_event
 Call Trace:
  <TASK>
  __dump_stack lib/dump_stack.c:94 [inline]
  dump_stack_lvl+0x100/0x190 lib/dump_stack.c:120
  print_address_description mm/kasan/report.c:378 [inline]
  print_report+0x156/0x4c9 mm/kasan/report.c:482
  kasan_report+0xdf/0x1a0 mm/kasan/report.c:595
  netdev_need_ops_lock include/net/netdev_lock.h:33 [inline]
  netdev_unlock_ops include/net/netdev_lock.h:47 [inline]
  __linkwatch_run_queue+0x865/0x8a0 net/core/link_watch.c:245
  linkwatch_event+0x8f/0xc0 net/core/link_watch.c:304
  process_one_work+0x9c2/0x1840 kernel/workqueue.c:3257
  process_scheduled_works kernel/workqueue.c:3340 [inline]
  worker_thread+0x5da/0xe40 kernel/workqueue.c:3421
  kthread+0x3b3/0x730 kernel/kthread.c:463
  ret_from_fork+0x754/0xaf0 arch/x86/kernel/process.c:158
  ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:246
  </TASK>
 ==================================================================

Fixes: 04efcee6ef ("net: hold instance lock during NETDEV_CHANGE")
Reported-by: syzbot+1ec2f6a450f0b54af8c8@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/6824d064.a70a0220.3e9d8.001a.GAE@google.com/T/
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20260201135915.393451-1-jiayuan.chen@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2026-02-02 16:59:18 -08:00

330 lines
7.5 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux network device link state notification
*
* Author:
* Stefan Rompf <sux@loplof.de>
*/
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include "dev.h"
enum lw_bits {
LW_URGENT = 0,
};
static unsigned long linkwatch_flags;
static unsigned long linkwatch_nextevent;
static void linkwatch_event(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
static LIST_HEAD(lweventlist);
static DEFINE_SPINLOCK(lweventlist_lock);
static unsigned int default_operstate(const struct net_device *dev)
{
if (netif_testing(dev))
return IF_OPER_TESTING;
/* Some uppers (DSA) have additional sources for being down, so
* first check whether lower is indeed the source of its down state.
*/
if (!netif_carrier_ok(dev)) {
struct net_device *peer;
int iflink;
/* If called from netdev_run_todo()/linkwatch_sync_dev(),
* dev_net(dev) can be already freed, and RTNL is not held.
*/
if (dev->reg_state <= NETREG_REGISTERED)
iflink = dev_get_iflink(dev);
else
iflink = dev->ifindex;
if (iflink == dev->ifindex)
return IF_OPER_DOWN;
ASSERT_RTNL();
peer = __dev_get_by_index(dev_net(dev), iflink);
if (!peer)
return IF_OPER_DOWN;
return netif_carrier_ok(peer) ? IF_OPER_DOWN :
IF_OPER_LOWERLAYERDOWN;
}
if (netif_dormant(dev))
return IF_OPER_DORMANT;
return IF_OPER_UP;
}
static void rfc2863_policy(struct net_device *dev)
{
unsigned int operstate = default_operstate(dev);
if (operstate == READ_ONCE(dev->operstate))
return;
switch(dev->link_mode) {
case IF_LINK_MODE_TESTING:
if (operstate == IF_OPER_UP)
operstate = IF_OPER_TESTING;
break;
case IF_LINK_MODE_DORMANT:
if (operstate == IF_OPER_UP)
operstate = IF_OPER_DORMANT;
break;
case IF_LINK_MODE_DEFAULT:
default:
break;
}
WRITE_ONCE(dev->operstate, operstate);
}
void linkwatch_init_dev(struct net_device *dev)
{
/* Handle pre-registration link state changes */
if (!netif_carrier_ok(dev) || netif_dormant(dev) ||
netif_testing(dev))
rfc2863_policy(dev);
}
static bool linkwatch_urgent_event(struct net_device *dev)
{
if (!netif_running(dev))
return false;
if (dev->ifindex != dev_get_iflink(dev))
return true;
if (netif_is_lag_port(dev) || netif_is_lag_master(dev))
return true;
return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
}
static void linkwatch_add_event(struct net_device *dev)
{
unsigned long flags;
spin_lock_irqsave(&lweventlist_lock, flags);
if (list_empty(&dev->link_watch_list)) {
list_add_tail(&dev->link_watch_list, &lweventlist);
netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC);
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
}
static void linkwatch_schedule_work(int urgent)
{
unsigned long delay = linkwatch_nextevent - jiffies;
if (test_bit(LW_URGENT, &linkwatch_flags))
return;
/* Minimise down-time: drop delay for up event. */
if (urgent) {
if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
return;
delay = 0;
}
/* If we wrap around we'll delay it by at most HZ. */
if (delay > HZ)
delay = 0;
/*
* If urgent, schedule immediate execution; otherwise, don't
* override the existing timer.
*/
if (test_bit(LW_URGENT, &linkwatch_flags))
mod_delayed_work(system_dfl_wq, &linkwatch_work, 0);
else
queue_delayed_work(system_dfl_wq, &linkwatch_work, delay);
}
static void linkwatch_do_dev(struct net_device *dev)
{
/*
* Make sure the above read is complete since it can be
* rewritten as soon as we clear the bit below.
*/
smp_mb__before_atomic();
/* We are about to handle this device,
* so new events can be accepted
*/
clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
rfc2863_policy(dev);
if (dev->flags & IFF_UP) {
if (netif_carrier_ok(dev))
dev_activate(dev);
else
dev_deactivate(dev);
netif_state_change(dev);
}
}
static void __linkwatch_run_queue(int urgent_only)
{
#define MAX_DO_DEV_PER_LOOP 100
int do_dev = MAX_DO_DEV_PER_LOOP;
/* Use a local list here since we add non-urgent
* events back to the global one when called with
* urgent_only=1.
*/
LIST_HEAD(wrk);
/* Give urgent case more budget */
if (urgent_only)
do_dev += MAX_DO_DEV_PER_LOOP;
/*
* Limit the number of linkwatch events to one
* per second so that a runaway driver does not
* cause a storm of messages on the netlink
* socket. This limit does not apply to up events
* while the device qdisc is down.
*/
if (!urgent_only)
linkwatch_nextevent = jiffies + HZ;
/* Limit wrap-around effect on delay. */
else if (time_after(linkwatch_nextevent, jiffies + HZ))
linkwatch_nextevent = jiffies;
clear_bit(LW_URGENT, &linkwatch_flags);
spin_lock_irq(&lweventlist_lock);
list_splice_init(&lweventlist, &wrk);
while (!list_empty(&wrk) && do_dev > 0) {
struct net_device *dev;
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
list_del_init(&dev->link_watch_list);
if (!netif_device_present(dev) ||
(urgent_only && !linkwatch_urgent_event(dev))) {
list_add_tail(&dev->link_watch_list, &lweventlist);
continue;
}
/* We must free netdev tracker under
* the spinlock protection.
*/
netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
spin_unlock_irq(&lweventlist_lock);
netdev_lock_ops(dev);
linkwatch_do_dev(dev);
netdev_unlock_ops(dev);
/* Use __dev_put() because netdev_tracker_free() was already
* called above. Must be after netdev_unlock_ops() to prevent
* netdev_run_todo() from freeing the device while still in use.
*/
__dev_put(dev);
do_dev--;
spin_lock_irq(&lweventlist_lock);
}
/* Add the remaining work back to lweventlist */
list_splice_init(&wrk, &lweventlist);
if (!list_empty(&lweventlist))
linkwatch_schedule_work(0);
spin_unlock_irq(&lweventlist_lock);
}
static bool linkwatch_clean_dev(struct net_device *dev)
{
unsigned long flags;
bool clean = false;
spin_lock_irqsave(&lweventlist_lock, flags);
if (!list_empty(&dev->link_watch_list)) {
list_del_init(&dev->link_watch_list);
clean = true;
/* We must release netdev tracker under
* the spinlock protection.
*/
netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
return clean;
}
void __linkwatch_sync_dev(struct net_device *dev)
{
netdev_ops_assert_locked(dev);
if (linkwatch_clean_dev(dev)) {
linkwatch_do_dev(dev);
/* Use __dev_put() because netdev_tracker_free() was already
* called inside linkwatch_clean_dev().
*/
__dev_put(dev);
}
}
void linkwatch_sync_dev(struct net_device *dev)
{
if (linkwatch_clean_dev(dev)) {
netdev_lock_ops(dev);
linkwatch_do_dev(dev);
netdev_unlock_ops(dev);
/* Use __dev_put() because netdev_tracker_free() was already
* called inside linkwatch_clean_dev().
*/
__dev_put(dev);
}
}
/* Must be called with the rtnl semaphore held */
void linkwatch_run_queue(void)
{
__linkwatch_run_queue(0);
}
static void linkwatch_event(struct work_struct *dummy)
{
rtnl_lock();
__linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
rtnl_unlock();
}
void linkwatch_fire_event(struct net_device *dev)
{
bool urgent = linkwatch_urgent_event(dev);
if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
linkwatch_add_event(dev);
} else if (!urgent)
return;
linkwatch_schedule_work(urgent);
}
EXPORT_SYMBOL(linkwatch_fire_event);