io_uring-futex-2023-10-30

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmVAUXUQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpuGsEADEs0/4uXb8kLUF/y0B0bY9jmwiw5id14g5
 TkAH9lbceV0Yv0E1tPeWYIz7Y7s83UOduFVZo4hRH8EysH3IYFZCI/ny3v2nJ1av
 lN7F7YegVOu6qx77e/CwLo7on14awHkSo8pUdCOm6tYLunLg42miRf+xTpSAL0Mg
 ONnt0WxWDOgdNvTaGwBPaVE78FAWK8nc2ACzonQGfzCl2VXOsSy9JaJJMv8eyXOf
 VVZCNcSvHh/zVznlC1YPoZh/bgS2UUJmIGL/XMQnM5qzbK1IPpzlN0cu8rje3s9b
 TUKBKqr6xhC9nyAS1qAjgZ98RfjVnzcbMX+aWEb/Z0y9XFJVSSQQdW+f9A/0KLZm
 jAejHJpNuqwEdB9MplHTXdeSDTkJH3YNbXvtwA6cc/KpZ1FVQXlhSJPp/mbOa7qe
 IIeg6SYt84uZ2HxflTtm+I1uVE9QMcsesy3FIK4kxhA8jSximQw+hPZ3xrv4AHLd
 cTkRAzfXPUFsJJQCgpv289QXobV/vsFhCFTHFxv63H+EGpJ7e1EaW6Eq0pAHG0Ai
 8kk5Ns29jzTVer1W3sMMeDaZ7S8hGRAyRC+Zb/0QxtGsmvxikB0qY1GpdRGPFueQ
 gOawhLZdhkigIsq0U1UGMpHKY0G1Sl9wvHuH2qzUKeWk+vFRv5RwR6zQuVJr2Jo/
 j3HgyYDs7Q==
 =Z0L0
 -----END PGP SIGNATURE-----

Merge tag 'io_uring-futex-2023-10-30' of git://git.kernel.dk/linux

Pull io_uring futex support from Jens Axboe:
 "This adds support for using futexes through io_uring - first futex
  wake and wait, and then the vectored variant of waiting, futex waitv.

  For both wait/wake/waitv, we support the bitset variant, as the
  'normal' variants can be easily implemented on top of that.

  PI and requeue are not supported through io_uring, just the above
  mentioned parts. This may change in the future, but in the spirit of
  keeping this small (and based on what people have been asking for),
  this is what we currently have.

  Wake support is pretty straight forward, most of the thought has gone
  into the wait side to avoid needing to offload wait operations to a
  blocking context. Instead, we rely on the usual callbacks to retry and
  post a completion event, when appropriate.

  As far as I can recall, the first request for futex support with
  io_uring came from Andres Freund, working on postgres. His aio rework
  of postgres was one of the early adopters of io_uring, and futex
  support was a natural extension for that. This is relevant from both a
  usability point of view, as well as for effiency and performance. In
  Andres's words, for the former:

     Futex wait support in io_uring makes it a lot easier to avoid
     deadlocks in concurrent programs that have their own buffer pool:
     Obviously pages in the application buffer pool have to be locked
     during IO. If the initiator of IO A needs to wait for a held lock
     B, the holder of lock B might wait for the IO A to complete. The
     ability to wait for a lock and IO completions at the same time
     provides an efficient way to avoid such deadlocks

  and in terms of effiency, even without unlocking the full potential
  yet, Andres says:

     Futex wake support in io_uring is useful because it allows for more
     efficient directed wakeups. For some "locks" postgres has queues
     implemented in userspace, with wakeup logic that cannot easily be
     implemented with FUTEX_WAKE_BITSET on a single "futex word"
     (imagine waiting for journal flushes to have completed up to a
     certain point).

     Thus a "lock release" sometimes need to wake up many processes in a
     row. A quick-and-dirty conversion to doing these wakeups via
     io_uring lead to a 3% throughput increase, with 12% fewer context
     switches, albeit in a fairly extreme workload"

* tag 'io_uring-futex-2023-10-30' of git://git.kernel.dk/linux:
  io_uring: add support for vectored futex waits
  futex: make the vectored futex operations available
  futex: make futex_parse_waitv() available as a helper
  futex: add wake_data to struct futex_q
  io_uring: add support for futex wake and wait
  futex: abstract out a __futex_wake_mark() helper
  futex: factor out the futex wake handling
  futex: move FUTEX2_VALID_MASK to futex.h
This commit is contained in:
Linus Torvalds 2023-11-01 11:25:08 -10:00
commit 4de520f1fc
13 changed files with 546 additions and 28 deletions

View file

@ -52,6 +52,8 @@ static inline unsigned int futex_to_flags(unsigned int op)
return flags;
}
#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
/* FUTEX2_ to FLAGS_ */
static inline unsigned int futex2_to_flags(unsigned int flags2)
{
@ -137,11 +139,16 @@ struct futex_pi_state {
union futex_key key;
} __randomize_layout;
struct futex_q;
typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
* @list: priority-sorted list of tasks waiting on this futex
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
* @wake: the wake handler for this queue
* @wake_data: data associated with the wake handler
* @key: the key the futex is hashed on
* @pi_state: optional priority inheritance state
* @rt_waiter: rt_waiter storage for use with requeue_pi
@ -166,6 +173,8 @@ struct futex_q {
struct task_struct *task;
spinlock_t *lock_ptr;
futex_wake_fn *wake;
void *wake_data;
union futex_key key;
struct futex_pi_state *pi_state;
struct rt_mutex_waiter *rt_waiter;
@ -212,6 +221,7 @@ extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb);
extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
struct hrtimer_sleeper *timeout);
extern bool __futex_wake_mark(struct futex_q *q);
extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
extern int fault_in_user_writeable(u32 __user *uaddr);
@ -351,6 +361,16 @@ struct futex_vector {
struct futex_q q;
};
extern int futex_parse_waitv(struct futex_vector *futexv,
struct futex_waitv __user *uwaitv,
unsigned int nr_futexes, futex_wake_fn *wake,
void *wake_data);
extern int futex_wait_multiple_setup(struct futex_vector *vs, int count,
int *woken);
extern int futex_unqueue_multiple(struct futex_vector *v, int count);
extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
struct hrtimer_sleeper *to);

View file

@ -58,6 +58,7 @@ enum {
const struct futex_q futex_q_init = {
/* list gets initialized in futex_queue()*/
.wake = futex_wake_mark,
.key = FUTEX_KEY_INIT,
.bitset = FUTEX_BITSET_MATCH_ANY,
.requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
@ -593,7 +594,7 @@ retry_private:
/* Plain futexes just wake or requeue and are done */
if (!requeue_pi) {
if (++task_count <= nr_wake)
futex_wake_mark(&wake_q, this);
this->wake(&wake_q, this);
else
requeue_futex(this, hb1, hb2, &key2);
continue;

View file

@ -179,19 +179,20 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
/**
* futex_parse_waitv - Parse a waitv array from userspace
* @futexv: Kernel side list of waiters to be filled
* @uwaitv: Userspace list to be parsed
* @nr_futexes: Length of futexv
* @wake: Wake to call when futex is woken
* @wake_data: Data for the wake handler
*
* Return: Error code on failure, 0 on success
*/
static int futex_parse_waitv(struct futex_vector *futexv,
struct futex_waitv __user *uwaitv,
unsigned int nr_futexes)
int futex_parse_waitv(struct futex_vector *futexv,
struct futex_waitv __user *uwaitv,
unsigned int nr_futexes, futex_wake_fn *wake,
void *wake_data)
{
struct futex_waitv aux;
unsigned int i;
@ -216,6 +217,8 @@ static int futex_parse_waitv(struct futex_vector *futexv,
futexv[i].w.val = aux.val;
futexv[i].w.uaddr = aux.uaddr;
futexv[i].q = futex_q_init;
futexv[i].q.wake = wake;
futexv[i].q.wake_data = wake_data;
}
return 0;
@ -308,7 +311,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
goto destroy_timer;
}
ret = futex_parse_waitv(futexv, waiters, nr_futexes);
ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
NULL);
if (!ret)
ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
@ -423,7 +427,7 @@ SYSCALL_DEFINE4(futex_requeue,
if (!waiters)
return -EINVAL;
ret = futex_parse_waitv(futexes, waiters, 2);
ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
if (ret)
return ret;

View file

@ -106,6 +106,24 @@
* double_lock_hb() and double_unlock_hb(), respectively.
*/
bool __futex_wake_mark(struct futex_q *q)
{
if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
return false;
__futex_unqueue(q);
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
* is written, without taking any locks. This is possible in the event
* of a spurious wakeup, for example. A memory barrier is required here
* to prevent the following store to lock_ptr from getting ahead of the
* plist_del in __futex_unqueue().
*/
smp_store_release(&q->lock_ptr, NULL);
return true;
}
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed. Callers
@ -116,19 +134,12 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
{
struct task_struct *p = q->task;
if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
return;
get_task_struct(p);
__futex_unqueue(q);
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
* is written, without taking any locks. This is possible in the event
* of a spurious wakeup, for example. A memory barrier is required here
* to prevent the following store to lock_ptr from getting ahead of the
* plist_del in __futex_unqueue().
*/
smp_store_release(&q->lock_ptr, NULL);
if (!__futex_wake_mark(q)) {
put_task_struct(p);
return;
}
/*
* Queue the task for later wakeup for after we've released
@ -177,7 +188,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
futex_wake_mark(&wake_q, this);
this->wake(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@ -292,7 +303,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
futex_wake_mark(&wake_q, this);
this->wake(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@ -306,7 +317,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
futex_wake_mark(&wake_q, this);
this->wake(&wake_q, this);
if (++op_ret >= nr_wake2)
break;
}
@ -361,7 +372,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
}
/**
* unqueue_multiple - Remove various futexes from their hash bucket
* futex_unqueue_multiple - Remove various futexes from their hash bucket
* @v: The list of futexes to unqueue
* @count: Number of futexes in the list
*
@ -371,7 +382,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* - >=0 - Index of the last futex that was awoken;
* - -1 - No futex was awoken
*/
static int unqueue_multiple(struct futex_vector *v, int count)
int futex_unqueue_multiple(struct futex_vector *v, int count)
{
int ret = -1, i;
@ -399,7 +410,7 @@ static int unqueue_multiple(struct futex_vector *v, int count)
* - 0 - Success
* - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
*/
static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
struct futex_hash_bucket *hb;
bool retry = false;
@ -461,7 +472,7 @@ retry:
* was woken, we don't return error and return this index to
* userspace
*/
*woken = unqueue_multiple(vs, i);
*woken = futex_unqueue_multiple(vs, i);
if (*woken >= 0)
return 1;
@ -546,7 +557,7 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
__set_current_state(TASK_RUNNING);
ret = unqueue_multiple(vs, count);
ret = futex_unqueue_multiple(vs, count);
if (ret >= 0)
return ret;