mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 03:24:45 +01:00
bpf-next-7.0
-----BEGIN PGP SIGNATURE-----
iQIzBAABCAAdFiEE+soXsSLHKoYyzcli6rmadz2vbToFAmmGmrgACgkQ6rmadz2v
bTq6NxAAkCHosxzGn9GYYBV8xhrBJoJJDCyEbQ4nR0XNY+zaWnuykmiPP9w1aOAM
zm/po3mQB2pZjetvlrPrgG5RLgBCAUHzqVGy0r+phUvD3vbohKlmSlMm2kiXOb9N
T01BgLWsyqN2ZcNFvORdSsftqIJUHcXxU6RdupGD60sO5XM9ty5cwyewLX8GBOas
UN2bOhbK2DpqYWUvtv+3Q3ykxoStMSkXZvDRurwLKl4RHeLjXZXPo8NjnfBlk/F2
vdFo/F4NO4TmhOave6UPXvKb4yo9IlBRmiPAl0RmNKBxenY8j9XuV/xZxU6YgzDn
+SQfDK+CKQ4IYIygE+fqd4e5CaQrnjmPPcIw12AB2CF0LimY9Xxyyk6FSAhMN7wm
GTVh5K2C3Dk3OiRQk4G58EvQ5QcxzX98IeeCpcckMUkPsFWHRvF402WMUcv9SWpD
DsxxPkfENY/6N67EvH0qcSe/ikdUorQKFl4QjXKwsMCd5WhToeP4Z7Ck1gVSNkAh
9CX++mLzg333Lpsc4SSIuk9bEPpFa5cUIKUY7GCsCiuOXciPeMDP3cGSd5LioqxN
qWljs4Z88QDM2LJpAh8g4m3sA7bMhES3nPmdlI5CfgBcVyLW8D8CqQq4GEZ1McwL
Ky084+lEosugoVjRejrdMMEOsqAfcbkTr2b8jpuAZdwJKm6p/bw=
=cBdK
-----END PGP SIGNATURE-----
Merge tag 'bpf-next-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf updates from Alexei Starovoitov:
- Support associating BPF program with struct_ops (Amery Hung)
- Switch BPF local storage to rqspinlock and remove recursion detection
counters which were causing false positives (Amery Hung)
- Fix live registers marking for indirect jumps (Anton Protopopov)
- Introduce execution context detection BPF helpers (Changwoo Min)
- Improve verifier precision for 32bit sign extension pattern
(Cupertino Miranda)
- Optimize BTF type lookup by sorting vmlinux BTF and doing binary
search (Donglin Peng)
- Allow states pruning for misc/invalid slots in iterator loops (Eduard
Zingerman)
- In preparation for ASAN support in BPF arenas teach libbpf to move
global BPF variables to the end of the region and enable arena kfuncs
while holding locks (Emil Tsalapatis)
- Introduce support for implicit arguments in kfuncs and migrate a
number of them to new API. This is a prerequisite for cgroup
sub-schedulers in sched-ext (Ihor Solodrai)
- Fix incorrect copied_seq calculation in sockmap (Jiayuan Chen)
- Fix ORC stack unwind from kprobe_multi (Jiri Olsa)
- Speed up fentry attach by using single ftrace direct ops in BPF
trampolines (Jiri Olsa)
- Require frozen map for calculating map hash (KP Singh)
- Fix lock entry creation in TAS fallback in rqspinlock (Kumar
Kartikeya Dwivedi)
- Allow user space to select cpu in lookup/update operations on per-cpu
array and hash maps (Leon Hwang)
- Make kfuncs return trusted pointers by default (Matt Bobrowski)
- Introduce "fsession" support where single BPF program is executed
upon entry and exit from traced kernel function (Menglong Dong)
- Allow bpf_timer and bpf_wq use in all programs types (Mykyta
Yatsenko, Andrii Nakryiko, Kumar Kartikeya Dwivedi, Alexei
Starovoitov)
- Make KF_TRUSTED_ARGS the default for all kfuncs and clean up their
definition across the tree (Puranjay Mohan)
- Allow BPF arena calls from non-sleepable context (Puranjay Mohan)
- Improve register id comparison logic in the verifier and extend
linked registers with negative offsets (Puranjay Mohan)
- In preparation for BPF-OOM introduce kfuncs to access memcg events
(Roman Gushchin)
- Use CFI compatible destructor kfunc type (Sami Tolvanen)
- Add bitwise tracking for BPF_END in the verifier (Tianci Cao)
- Add range tracking for BPF_DIV and BPF_MOD in the verifier (Yazhou
Tang)
- Make BPF selftests work with 64k page size (Yonghong Song)
* tag 'bpf-next-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (268 commits)
selftests/bpf: Fix outdated test on storage->smap
selftests/bpf: Choose another percpu variable in bpf for btf_dump test
selftests/bpf: Remove test_task_storage_map_stress_lookup
selftests/bpf: Update task_local_storage/task_storage_nodeadlock test
selftests/bpf: Update task_local_storage/recursion test
selftests/bpf: Update sk_storage_omem_uncharge test
bpf: Switch to bpf_selem_unlink_nofail in bpf_local_storage_{map_free, destroy}
bpf: Support lockless unlink when freeing map or local storage
bpf: Prepare for bpf_selem_unlink_nofail()
bpf: Remove unused percpu counter from bpf_local_storage_map_free
bpf: Remove cgroup local storage percpu counter
bpf: Remove task local storage percpu counter
bpf: Change local_storage->lock and b->lock to rqspinlock
bpf: Convert bpf_selem_unlink to failable
bpf: Convert bpf_selem_link_map to failable
bpf: Convert bpf_selem_unlink_map to failable
bpf: Select bpf_local_storage_map_bucket based on bpf_local_storage
selftests/xsk: fix number of Tx frags in invalid packet
selftests/xsk: properly handle batch ending in the middle of a packet
bpf: Prevent reentrance into call_rcu_tasks_trace()
...
This commit is contained in:
commit
f17b474e36
248 changed files with 13425 additions and 3066 deletions
|
|
@ -34,11 +34,12 @@ following types:
|
|||
- ``BPF_PROG_TYPE_LWT_IN``
|
||||
- ``BPF_PROG_TYPE_LWT_OUT``
|
||||
- ``BPF_PROG_TYPE_LWT_XMIT``
|
||||
- ``BPF_PROG_TYPE_LWT_SEG6LOCAL``
|
||||
- ``BPF_PROG_TYPE_FLOW_DISSECTOR``
|
||||
- ``BPF_PROG_TYPE_STRUCT_OPS``
|
||||
- ``BPF_PROG_TYPE_RAW_TRACEPOINT``
|
||||
- ``BPF_PROG_TYPE_SYSCALL``
|
||||
- ``BPF_PROG_TYPE_TRACING``
|
||||
- ``BPF_PROG_TYPE_NETFILTER``
|
||||
|
||||
When using the ``BPF_PROG_RUN`` command, userspace supplies an input context
|
||||
object and (for program types operating on network packets) a buffer containing
|
||||
|
|
|
|||
|
|
@ -50,216 +50,21 @@ A wrapper kfunc is often needed when we need to annotate parameters of the
|
|||
kfunc. Otherwise one may directly make the kfunc visible to the BPF program by
|
||||
registering it with the BPF subsystem. See :ref:`BPF_kfunc_nodef`.
|
||||
|
||||
2.2 Annotating kfunc parameters
|
||||
-------------------------------
|
||||
|
||||
Similar to BPF helpers, there is sometimes a need for additional context required
|
||||
by the verifier to make the usage of kernel functions safer and more useful.
|
||||
Hence, we can annotate a parameter by suffixing the name of the argument of the
|
||||
kfunc with a __tag, where tag may be one of the supported annotations.
|
||||
|
||||
2.2.1 __sz Annotation
|
||||
---------------------
|
||||
|
||||
This annotation is used to indicate a memory and size pair in the argument list.
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc void bpf_memzero(void *mem, int mem__sz)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
Here, the verifier will treat first argument as a PTR_TO_MEM, and second
|
||||
argument as its size. By default, without __sz annotation, the size of the type
|
||||
of the pointer is used. Without __sz annotation, a kfunc cannot accept a void
|
||||
pointer.
|
||||
|
||||
2.2.2 __k Annotation
|
||||
2.2 kfunc Parameters
|
||||
--------------------
|
||||
|
||||
This annotation is only understood for scalar arguments, where it indicates that
|
||||
the verifier must check the scalar argument to be a known constant, which does
|
||||
not indicate a size parameter, and the value of the constant is relevant to the
|
||||
safety of the program.
|
||||
All kfuncs now require trusted arguments by default. This means that all
|
||||
pointer arguments must be valid, and all pointers to BTF objects must be
|
||||
passed in their unmodified form (at a zero offset, and without having been
|
||||
obtained from walking another pointer, with exceptions described below).
|
||||
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc void *bpf_obj_new(u32 local_type_id__k, ...)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
Here, bpf_obj_new uses local_type_id argument to find out the size of that type
|
||||
ID in program's BTF and return a sized pointer to it. Each type ID will have a
|
||||
distinct size, hence it is crucial to treat each such call as distinct when
|
||||
values don't match during verifier state pruning checks.
|
||||
|
||||
Hence, whenever a constant scalar argument is accepted by a kfunc which is not a
|
||||
size parameter, and the value of the constant matters for program safety, __k
|
||||
suffix should be used.
|
||||
|
||||
2.2.3 __uninit Annotation
|
||||
-------------------------
|
||||
|
||||
This annotation is used to indicate that the argument will be treated as
|
||||
uninitialized.
|
||||
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc int bpf_dynptr_from_skb(..., struct bpf_dynptr_kern *ptr__uninit)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
Here, the dynptr will be treated as an uninitialized dynptr. Without this
|
||||
annotation, the verifier will reject the program if the dynptr passed in is
|
||||
not initialized.
|
||||
|
||||
2.2.4 __opt Annotation
|
||||
-------------------------
|
||||
|
||||
This annotation is used to indicate that the buffer associated with an __sz or __szk
|
||||
argument may be null. If the function is passed a nullptr in place of the buffer,
|
||||
the verifier will not check that length is appropriate for the buffer. The kfunc is
|
||||
responsible for checking if this buffer is null before using it.
|
||||
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__opt, u32 buffer__szk)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
Here, the buffer may be null. If buffer is not null, it is at least of size buffer_szk.
|
||||
Either way, the returned buffer is either NULL, or of size buffer_szk. Without this
|
||||
annotation, the verifier will reject the program if a null pointer is passed in with
|
||||
a nonzero size.
|
||||
|
||||
2.2.5 __str Annotation
|
||||
----------------------------
|
||||
This annotation is used to indicate that the argument is a constant string.
|
||||
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc bpf_get_file_xattr(..., const char *name__str, ...)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
In this case, ``bpf_get_file_xattr()`` can be called as::
|
||||
|
||||
bpf_get_file_xattr(..., "xattr_name", ...);
|
||||
|
||||
Or::
|
||||
|
||||
const char name[] = "xattr_name"; /* This needs to be global */
|
||||
int BPF_PROG(...)
|
||||
{
|
||||
...
|
||||
bpf_get_file_xattr(..., name, ...);
|
||||
...
|
||||
}
|
||||
|
||||
2.2.6 __prog Annotation
|
||||
---------------------------
|
||||
This annotation is used to indicate that the argument needs to be fixed up to
|
||||
the bpf_prog_aux of the caller BPF program. Any value passed into this argument
|
||||
is ignored, and rewritten by the verifier.
|
||||
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
|
||||
int (callback_fn)(void *map, int *key, void *value),
|
||||
unsigned int flags,
|
||||
void *aux__prog)
|
||||
{
|
||||
struct bpf_prog_aux *aux = aux__prog;
|
||||
...
|
||||
}
|
||||
|
||||
.. _BPF_kfunc_nodef:
|
||||
|
||||
2.3 Using an existing kernel function
|
||||
-------------------------------------
|
||||
|
||||
When an existing function in the kernel is fit for consumption by BPF programs,
|
||||
it can be directly registered with the BPF subsystem. However, care must still
|
||||
be taken to review the context in which it will be invoked by the BPF program
|
||||
and whether it is safe to do so.
|
||||
|
||||
2.4 Annotating kfuncs
|
||||
---------------------
|
||||
|
||||
In addition to kfuncs' arguments, verifier may need more information about the
|
||||
type of kfunc(s) being registered with the BPF subsystem. To do so, we define
|
||||
flags on a set of kfuncs as follows::
|
||||
|
||||
BTF_KFUNCS_START(bpf_task_set)
|
||||
BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL)
|
||||
BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE)
|
||||
BTF_KFUNCS_END(bpf_task_set)
|
||||
|
||||
This set encodes the BTF ID of each kfunc listed above, and encodes the flags
|
||||
along with it. Of course, it is also allowed to specify no flags.
|
||||
|
||||
kfunc definitions should also always be annotated with the ``__bpf_kfunc``
|
||||
macro. This prevents issues such as the compiler inlining the kfunc if it's a
|
||||
static kernel function, or the function being elided in an LTO build as it's
|
||||
not used in the rest of the kernel. Developers should not manually add
|
||||
annotations to their kfunc to prevent these issues. If an annotation is
|
||||
required to prevent such an issue with your kfunc, it is a bug and should be
|
||||
added to the definition of the macro so that other kfuncs are similarly
|
||||
protected. An example is given below::
|
||||
|
||||
__bpf_kfunc struct task_struct *bpf_get_task_pid(s32 pid)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
2.4.1 KF_ACQUIRE flag
|
||||
---------------------
|
||||
|
||||
The KF_ACQUIRE flag is used to indicate that the kfunc returns a pointer to a
|
||||
refcounted object. The verifier will then ensure that the pointer to the object
|
||||
is eventually released using a release kfunc, or transferred to a map using a
|
||||
referenced kptr (by invoking bpf_kptr_xchg). If not, the verifier fails the
|
||||
loading of the BPF program until no lingering references remain in all possible
|
||||
explored states of the program.
|
||||
|
||||
2.4.2 KF_RET_NULL flag
|
||||
----------------------
|
||||
|
||||
The KF_RET_NULL flag is used to indicate that the pointer returned by the kfunc
|
||||
may be NULL. Hence, it forces the user to do a NULL check on the pointer
|
||||
returned from the kfunc before making use of it (dereferencing or passing to
|
||||
another helper). This flag is often used in pairing with KF_ACQUIRE flag, but
|
||||
both are orthogonal to each other.
|
||||
|
||||
2.4.3 KF_RELEASE flag
|
||||
---------------------
|
||||
|
||||
The KF_RELEASE flag is used to indicate that the kfunc releases the pointer
|
||||
passed in to it. There can be only one referenced pointer that can be passed
|
||||
in. All copies of the pointer being released are invalidated as a result of
|
||||
invoking kfunc with this flag. KF_RELEASE kfuncs automatically receive the
|
||||
protection afforded by the KF_TRUSTED_ARGS flag described below.
|
||||
|
||||
2.4.4 KF_TRUSTED_ARGS flag
|
||||
--------------------------
|
||||
|
||||
The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It
|
||||
indicates that all pointer arguments are valid, and that all pointers to
|
||||
BTF objects have been passed in their unmodified form (that is, at a zero
|
||||
offset, and without having been obtained from walking another pointer, with one
|
||||
exception described below).
|
||||
|
||||
There are two types of pointers to kernel objects which are considered "valid":
|
||||
There are two types of pointers to kernel objects which are considered "trusted":
|
||||
|
||||
1. Pointers which are passed as tracepoint or struct_ops callback arguments.
|
||||
2. Pointers which were returned from a KF_ACQUIRE kfunc.
|
||||
|
||||
Pointers to non-BTF objects (e.g. scalar pointers) may also be passed to
|
||||
KF_TRUSTED_ARGS kfuncs, and may have a non-zero offset.
|
||||
kfuncs, and may have a non-zero offset.
|
||||
|
||||
The definition of "valid" pointers is subject to change at any time, and has
|
||||
absolutely no ABI stability guarantees.
|
||||
|
|
@ -308,14 +113,198 @@ is emitted in the ``type_is_trusted()`` function as follows:
|
|||
|
||||
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
|
||||
|
||||
2.3 Annotating kfunc parameters
|
||||
-------------------------------
|
||||
|
||||
2.4.5 KF_SLEEPABLE flag
|
||||
Similar to BPF helpers, there is sometimes a need for additional context required
|
||||
by the verifier to make the usage of kernel functions safer and more useful.
|
||||
Hence, we can annotate a parameter by suffixing the name of the argument of the
|
||||
kfunc with a __tag, where tag may be one of the supported annotations.
|
||||
|
||||
2.3.1 __sz Annotation
|
||||
---------------------
|
||||
|
||||
This annotation is used to indicate a memory and size pair in the argument list.
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc void bpf_memzero(void *mem, int mem__sz)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
Here, the verifier will treat first argument as a PTR_TO_MEM, and second
|
||||
argument as its size. By default, without __sz annotation, the size of the type
|
||||
of the pointer is used. Without __sz annotation, a kfunc cannot accept a void
|
||||
pointer.
|
||||
|
||||
2.3.2 __k Annotation
|
||||
--------------------
|
||||
|
||||
This annotation is only understood for scalar arguments, where it indicates that
|
||||
the verifier must check the scalar argument to be a known constant, which does
|
||||
not indicate a size parameter, and the value of the constant is relevant to the
|
||||
safety of the program.
|
||||
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc void *bpf_obj_new(u32 local_type_id__k, ...)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
Here, bpf_obj_new uses local_type_id argument to find out the size of that type
|
||||
ID in program's BTF and return a sized pointer to it. Each type ID will have a
|
||||
distinct size, hence it is crucial to treat each such call as distinct when
|
||||
values don't match during verifier state pruning checks.
|
||||
|
||||
Hence, whenever a constant scalar argument is accepted by a kfunc which is not a
|
||||
size parameter, and the value of the constant matters for program safety, __k
|
||||
suffix should be used.
|
||||
|
||||
2.3.3 __uninit Annotation
|
||||
-------------------------
|
||||
|
||||
This annotation is used to indicate that the argument will be treated as
|
||||
uninitialized.
|
||||
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc int bpf_dynptr_from_skb(..., struct bpf_dynptr_kern *ptr__uninit)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
Here, the dynptr will be treated as an uninitialized dynptr. Without this
|
||||
annotation, the verifier will reject the program if the dynptr passed in is
|
||||
not initialized.
|
||||
|
||||
2.3.4 __nullable Annotation
|
||||
---------------------------
|
||||
|
||||
This annotation is used to indicate that the pointer argument may be NULL.
|
||||
The verifier will allow passing NULL for such arguments.
|
||||
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc void bpf_task_release(struct task_struct *task__nullable)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
Here, the task pointer may be NULL. The kfunc is responsible for checking if
|
||||
the pointer is NULL before dereferencing it.
|
||||
|
||||
The __nullable annotation can be combined with other annotations. For example,
|
||||
when used with __sz or __szk annotations for memory and size pairs, the
|
||||
verifier will skip size validation when a NULL pointer is passed, but will
|
||||
still process the size argument to extract constant size information when
|
||||
needed::
|
||||
|
||||
__bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__nullable,
|
||||
u32 buffer__szk)
|
||||
|
||||
Here, the buffer may be NULL. If the buffer is not NULL, it must be at least
|
||||
buffer__szk bytes in size. The kfunc is responsible for checking if the buffer
|
||||
is NULL before using it.
|
||||
|
||||
2.3.5 __str Annotation
|
||||
----------------------------
|
||||
This annotation is used to indicate that the argument is a constant string.
|
||||
|
||||
An example is given below::
|
||||
|
||||
__bpf_kfunc bpf_get_file_xattr(..., const char *name__str, ...)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
In this case, ``bpf_get_file_xattr()`` can be called as::
|
||||
|
||||
bpf_get_file_xattr(..., "xattr_name", ...);
|
||||
|
||||
Or::
|
||||
|
||||
const char name[] = "xattr_name"; /* This needs to be global */
|
||||
int BPF_PROG(...)
|
||||
{
|
||||
...
|
||||
bpf_get_file_xattr(..., name, ...);
|
||||
...
|
||||
}
|
||||
|
||||
.. _BPF_kfunc_nodef:
|
||||
|
||||
2.4 Using an existing kernel function
|
||||
-------------------------------------
|
||||
|
||||
When an existing function in the kernel is fit for consumption by BPF programs,
|
||||
it can be directly registered with the BPF subsystem. However, care must still
|
||||
be taken to review the context in which it will be invoked by the BPF program
|
||||
and whether it is safe to do so.
|
||||
|
||||
2.5 Annotating kfuncs
|
||||
---------------------
|
||||
|
||||
In addition to kfuncs' arguments, verifier may need more information about the
|
||||
type of kfunc(s) being registered with the BPF subsystem. To do so, we define
|
||||
flags on a set of kfuncs as follows::
|
||||
|
||||
BTF_KFUNCS_START(bpf_task_set)
|
||||
BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL)
|
||||
BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE)
|
||||
BTF_KFUNCS_END(bpf_task_set)
|
||||
|
||||
This set encodes the BTF ID of each kfunc listed above, and encodes the flags
|
||||
along with it. Of course, it is also allowed to specify no flags.
|
||||
|
||||
kfunc definitions should also always be annotated with the ``__bpf_kfunc``
|
||||
macro. This prevents issues such as the compiler inlining the kfunc if it's a
|
||||
static kernel function, or the function being elided in an LTO build as it's
|
||||
not used in the rest of the kernel. Developers should not manually add
|
||||
annotations to their kfunc to prevent these issues. If an annotation is
|
||||
required to prevent such an issue with your kfunc, it is a bug and should be
|
||||
added to the definition of the macro so that other kfuncs are similarly
|
||||
protected. An example is given below::
|
||||
|
||||
__bpf_kfunc struct task_struct *bpf_get_task_pid(s32 pid)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
2.5.1 KF_ACQUIRE flag
|
||||
---------------------
|
||||
|
||||
The KF_ACQUIRE flag is used to indicate that the kfunc returns a pointer to a
|
||||
refcounted object. The verifier will then ensure that the pointer to the object
|
||||
is eventually released using a release kfunc, or transferred to a map using a
|
||||
referenced kptr (by invoking bpf_kptr_xchg). If not, the verifier fails the
|
||||
loading of the BPF program until no lingering references remain in all possible
|
||||
explored states of the program.
|
||||
|
||||
2.5.2 KF_RET_NULL flag
|
||||
----------------------
|
||||
|
||||
The KF_RET_NULL flag is used to indicate that the pointer returned by the kfunc
|
||||
may be NULL. Hence, it forces the user to do a NULL check on the pointer
|
||||
returned from the kfunc before making use of it (dereferencing or passing to
|
||||
another helper). This flag is often used in pairing with KF_ACQUIRE flag, but
|
||||
both are orthogonal to each other.
|
||||
|
||||
2.5.3 KF_RELEASE flag
|
||||
---------------------
|
||||
|
||||
The KF_RELEASE flag is used to indicate that the kfunc releases the pointer
|
||||
passed in to it. There can be only one referenced pointer that can be passed
|
||||
in. All copies of the pointer being released are invalidated as a result of
|
||||
invoking kfunc with this flag.
|
||||
|
||||
2.5.4 KF_SLEEPABLE flag
|
||||
-----------------------
|
||||
|
||||
The KF_SLEEPABLE flag is used for kfuncs that may sleep. Such kfuncs can only
|
||||
be called by sleepable BPF programs (BPF_F_SLEEPABLE).
|
||||
|
||||
2.4.6 KF_DESTRUCTIVE flag
|
||||
2.5.5 KF_DESTRUCTIVE flag
|
||||
--------------------------
|
||||
|
||||
The KF_DESTRUCTIVE flag is used to indicate functions calling which is
|
||||
|
|
@ -324,18 +313,19 @@ rebooting or panicking. Due to this additional restrictions apply to these
|
|||
calls. At the moment they only require CAP_SYS_BOOT capability, but more can be
|
||||
added later.
|
||||
|
||||
2.4.7 KF_RCU flag
|
||||
2.5.6 KF_RCU flag
|
||||
-----------------
|
||||
|
||||
The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with
|
||||
KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees
|
||||
that the objects are valid and there is no use-after-free. The pointers are not
|
||||
NULL, but the object's refcount could have reached zero. The kfuncs need to
|
||||
consider doing refcnt != 0 check, especially when returning a KF_ACQUIRE
|
||||
pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should very likely
|
||||
also be KF_RET_NULL.
|
||||
The KF_RCU flag allows kfuncs to opt out of the default trusted args
|
||||
requirement and accept RCU pointers with weaker guarantees. The kfuncs marked
|
||||
with KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier
|
||||
guarantees that the objects are valid and there is no use-after-free. The
|
||||
pointers are not NULL, but the object's refcount could have reached zero. The
|
||||
kfuncs need to consider doing refcnt != 0 check, especially when returning a
|
||||
KF_ACQUIRE pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should
|
||||
very likely also be KF_RET_NULL.
|
||||
|
||||
2.4.8 KF_RCU_PROTECTED flag
|
||||
2.5.7 KF_RCU_PROTECTED flag
|
||||
---------------------------
|
||||
|
||||
The KF_RCU_PROTECTED flag is used to indicate that the kfunc must be invoked in
|
||||
|
|
@ -354,7 +344,7 @@ RCU protection but do not take RCU protected arguments.
|
|||
|
||||
.. _KF_deprecated_flag:
|
||||
|
||||
2.4.9 KF_DEPRECATED flag
|
||||
2.5.8 KF_DEPRECATED flag
|
||||
------------------------
|
||||
|
||||
The KF_DEPRECATED flag is used for kfuncs which are scheduled to be
|
||||
|
|
@ -374,7 +364,39 @@ encouraged to make their use-cases known as early as possible, and participate
|
|||
in upstream discussions regarding whether to keep, change, deprecate, or remove
|
||||
those kfuncs if and when such discussions occur.
|
||||
|
||||
2.5 Registering the kfuncs
|
||||
2.5.9 KF_IMPLICIT_ARGS flag
|
||||
------------------------------------
|
||||
|
||||
The KF_IMPLICIT_ARGS flag is used to indicate that the BPF signature
|
||||
of the kfunc is different from its kernel signature, and the values
|
||||
for implicit arguments are provided at load time by the verifier.
|
||||
|
||||
Only arguments of specific types are implicit.
|
||||
Currently only ``struct bpf_prog_aux *`` type is supported.
|
||||
|
||||
A kfunc with KF_IMPLICIT_ARGS flag therefore has two types in BTF: one
|
||||
function matching the kernel declaration (with _impl suffix in the
|
||||
name by convention), and another matching the intended BPF API.
|
||||
|
||||
The verifier only allows calls to the non-_impl version of a kfunc, which
|
||||
uses a signature without the implicit arguments.
|
||||
|
||||
Example declaration:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
|
||||
void *map__map, bpf_task_work_callback_t callback,
|
||||
struct bpf_prog_aux *aux) { ... }
|
||||
|
||||
Example usage in BPF program:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
/* note that the last argument is omitted */
|
||||
bpf_task_work_schedule_signal(task, &work->tw, &arrmap, task_work_callback);
|
||||
|
||||
2.6 Registering the kfuncs
|
||||
--------------------------
|
||||
|
||||
Once the kfunc is prepared for use, the final step to making it visible is
|
||||
|
|
@ -397,7 +419,7 @@ type. An example is shown below::
|
|||
}
|
||||
late_initcall(init_subsystem);
|
||||
|
||||
2.6 Specifying no-cast aliases with ___init
|
||||
2.7 Specifying no-cast aliases with ___init
|
||||
--------------------------------------------
|
||||
|
||||
The verifier will always enforce that the BTF type of a pointer passed to a
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ bash 4.2 bash --version
|
|||
binutils 2.30 ld -v
|
||||
flex 2.5.35 flex --version
|
||||
bison 2.0 bison --version
|
||||
pahole 1.16 pahole --version
|
||||
pahole 1.22 pahole --version
|
||||
util-linux 2.10o mount --version
|
||||
kmod 13 depmod -V
|
||||
e2fsprogs 1.41.4 e2fsck -V
|
||||
|
|
@ -143,7 +143,7 @@ pahole
|
|||
|
||||
Since Linux 5.2, if CONFIG_DEBUG_INFO_BTF is selected, the build system
|
||||
generates BTF (BPF Type Format) from DWARF in vmlinux, a bit later from kernel
|
||||
modules as well. This requires pahole v1.16 or later.
|
||||
modules as well. This requires pahole v1.22 or later.
|
||||
|
||||
It is found in the 'dwarves' or 'pahole' distro packages or from
|
||||
https://fedorapeople.org/~acme/dwarves/.
|
||||
|
|
|
|||
|
|
@ -43,7 +43,6 @@ options should be enabled to use sched_ext:
|
|||
CONFIG_DEBUG_INFO_BTF=y
|
||||
CONFIG_BPF_JIT_ALWAYS_ON=y
|
||||
CONFIG_BPF_JIT_DEFAULT_ON=y
|
||||
CONFIG_PAHOLE_HAS_SPLIT_BTF=y
|
||||
CONFIG_PAHOLE_HAS_BTF_TAG=y
|
||||
|
||||
sched_ext is used only when the BPF scheduler is loaded and running.
|
||||
|
|
|
|||
11
MAINTAINERS
11
MAINTAINERS
|
|
@ -4772,6 +4772,7 @@ F: net/sched/act_bpf.c
|
|||
F: net/sched/cls_bpf.c
|
||||
F: samples/bpf/
|
||||
F: scripts/bpf_doc.py
|
||||
F: scripts/gen-btf.sh
|
||||
F: scripts/Makefile.btf
|
||||
F: scripts/pahole-version.sh
|
||||
F: tools/bpf/
|
||||
|
|
@ -4804,6 +4805,15 @@ L: bpf@vger.kernel.org
|
|||
S: Maintained
|
||||
F: tools/lib/bpf/
|
||||
|
||||
BPF [MEMORY MANAGEMENT EXTENSIONS]
|
||||
M: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
M: JP Kobryn <inwardvessel@gmail.com>
|
||||
M: Shakeel Butt <shakeel.butt@linux.dev>
|
||||
L: bpf@vger.kernel.org
|
||||
L: linux-mm@kvack.org
|
||||
S: Maintained
|
||||
F: mm/bpf_memcontrol.c
|
||||
|
||||
BPF [MISC]
|
||||
L: bpf@vger.kernel.org
|
||||
S: Odd Fixes
|
||||
|
|
@ -4853,6 +4863,7 @@ S: Maintained
|
|||
F: Documentation/bpf/prog_lsm.rst
|
||||
F: include/linux/bpf_lsm.h
|
||||
F: kernel/bpf/bpf_lsm.c
|
||||
F: kernel/bpf/bpf_lsm_proto.c
|
||||
F: kernel/trace/bpf_trace.c
|
||||
F: security/bpf/
|
||||
|
||||
|
|
|
|||
15
Makefile
15
Makefile
|
|
@ -708,11 +708,12 @@ endif
|
|||
|
||||
# The expansion should be delayed until arch/$(SRCARCH)/Makefile is included.
|
||||
# Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile.
|
||||
# CC_VERSION_TEXT and RUSTC_VERSION_TEXT are referenced from Kconfig (so they
|
||||
# need export), and from include/config/auto.conf.cmd to detect the compiler
|
||||
# upgrade.
|
||||
# CC_VERSION_TEXT, RUSTC_VERSION_TEXT and PAHOLE_VERSION are referenced from
|
||||
# Kconfig (so they need export), and from include/config/auto.conf.cmd to
|
||||
# detect the version changes between builds.
|
||||
CC_VERSION_TEXT = $(subst $(pound),,$(shell LC_ALL=C $(CC) --version 2>/dev/null | head -n 1))
|
||||
RUSTC_VERSION_TEXT = $(subst $(pound),,$(shell $(RUSTC) --version 2>/dev/null))
|
||||
PAHOLE_VERSION = $(shell $(srctree)/scripts/pahole-version.sh $(PAHOLE))
|
||||
|
||||
ifneq ($(findstring clang,$(CC_VERSION_TEXT)),)
|
||||
include $(srctree)/scripts/Makefile.clang
|
||||
|
|
@ -733,7 +734,7 @@ ifdef config-build
|
|||
# KBUILD_DEFCONFIG may point out an alternative default configuration
|
||||
# used for 'make defconfig'
|
||||
include $(srctree)/arch/$(SRCARCH)/Makefile
|
||||
export KBUILD_DEFCONFIG KBUILD_KCONFIG CC_VERSION_TEXT RUSTC_VERSION_TEXT
|
||||
export KBUILD_DEFCONFIG KBUILD_KCONFIG CC_VERSION_TEXT RUSTC_VERSION_TEXT PAHOLE_VERSION
|
||||
|
||||
config: outputmakefile scripts_basic FORCE
|
||||
$(Q)$(MAKE) $(build)=scripts/kconfig $@
|
||||
|
|
@ -1928,12 +1929,18 @@ clean: private rm-files := Module.symvers modules.nsdeps compile_commands.json
|
|||
PHONY += prepare
|
||||
# now expand this into a simple variable to reduce the cost of shell evaluations
|
||||
prepare: CC_VERSION_TEXT := $(CC_VERSION_TEXT)
|
||||
prepare: PAHOLE_VERSION := $(PAHOLE_VERSION)
|
||||
prepare:
|
||||
@if [ "$(CC_VERSION_TEXT)" != "$(CONFIG_CC_VERSION_TEXT)" ]; then \
|
||||
echo >&2 "warning: the compiler differs from the one used to build the kernel"; \
|
||||
echo >&2 " The kernel was built by: $(CONFIG_CC_VERSION_TEXT)"; \
|
||||
echo >&2 " You are using: $(CC_VERSION_TEXT)"; \
|
||||
fi
|
||||
@if [ "$(PAHOLE_VERSION)" != "$(CONFIG_PAHOLE_VERSION)" ]; then \
|
||||
echo >&2 "warning: pahole version differs from the one used to build the kernel"; \
|
||||
echo >&2 " The kernel was built with: $(CONFIG_PAHOLE_VERSION)"; \
|
||||
echo >&2 " You are using: $(PAHOLE_VERSION)"; \
|
||||
fi
|
||||
|
||||
PHONY += help
|
||||
help:
|
||||
|
|
|
|||
|
|
@ -118,7 +118,7 @@ static inline void emit(const u32 insn, struct jit_ctx *ctx)
|
|||
static inline void emit_u32_data(const u32 data, struct jit_ctx *ctx)
|
||||
{
|
||||
if (ctx->image != NULL && ctx->write)
|
||||
ctx->image[ctx->idx] = data;
|
||||
ctx->image[ctx->idx] = (__force __le32)data;
|
||||
|
||||
ctx->idx++;
|
||||
}
|
||||
|
|
@ -2503,6 +2503,12 @@ static bool is_struct_ops_tramp(const struct bpf_tramp_links *fentry_links)
|
|||
fentry_links->links[0]->link.type == BPF_LINK_TYPE_STRUCT_OPS;
|
||||
}
|
||||
|
||||
/*
 * Emit instructions that write the 64-bit @func_meta value into the
 * trampoline stack slot at SP + @func_meta_off.
 *
 * A64_R(10) is used as the scratch register and is overwritten.
 */
static void store_func_meta(struct jit_ctx *ctx, u64 func_meta, int func_meta_off)
|
||||
{
|
||||
/* materialize the immediate in the scratch register ... */
emit_a64_mov_i64(A64_R(10), func_meta, ctx);
|
||||
/* ... then store it as a 64-bit value relative to SP */
emit(A64_STR64I(A64_R(10), A64_SP, func_meta_off), ctx);
|
||||
}
|
||||
|
||||
/* Based on the x86's implementation of arch_prepare_bpf_trampoline().
|
||||
*
|
||||
* bpf prog and function entry before bpf trampoline hooked:
|
||||
|
|
@ -2526,7 +2532,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
|
|||
int regs_off;
|
||||
int retval_off;
|
||||
int bargs_off;
|
||||
int nfuncargs_off;
|
||||
int func_meta_off;
|
||||
int ip_off;
|
||||
int run_ctx_off;
|
||||
int oargs_off;
|
||||
|
|
@ -2537,6 +2543,9 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
|
|||
bool save_ret;
|
||||
__le32 **branches = NULL;
|
||||
bool is_struct_ops = is_struct_ops_tramp(fentry);
|
||||
int cookie_off, cookie_cnt, cookie_bargs_off;
|
||||
int fsession_cnt = bpf_fsession_cnt(tlinks);
|
||||
u64 func_meta;
|
||||
|
||||
/* trampoline stack layout:
|
||||
* [ parent ip ]
|
||||
|
|
@ -2555,10 +2564,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
|
|||
* [ ... ]
|
||||
* SP + bargs_off [ arg reg 1 ] for bpf
|
||||
*
|
||||
* SP + nfuncargs_off [ arg regs count ]
|
||||
* SP + func_meta_off [ regs count, etc ]
|
||||
*
|
||||
* SP + ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
|
||||
*
|
||||
* [ stack cookie N ]
|
||||
* [ ... ]
|
||||
* SP + cookie_off [ stack cookie 1 ]
|
||||
*
|
||||
* SP + run_ctx_off [ bpf_tramp_run_ctx ]
|
||||
*
|
||||
* [ stack arg N ]
|
||||
|
|
@ -2575,13 +2588,18 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
|
|||
/* room for bpf_tramp_run_ctx */
|
||||
stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8);
|
||||
|
||||
cookie_off = stack_size;
|
||||
/* room for session cookies */
|
||||
cookie_cnt = bpf_fsession_cookie_cnt(tlinks);
|
||||
stack_size += cookie_cnt * 8;
|
||||
|
||||
ip_off = stack_size;
|
||||
/* room for IP address argument */
|
||||
if (flags & BPF_TRAMP_F_IP_ARG)
|
||||
stack_size += 8;
|
||||
|
||||
nfuncargs_off = stack_size;
|
||||
/* room for args count */
|
||||
func_meta_off = stack_size;
|
||||
/* room for function metadata, such as regs count */
|
||||
stack_size += 8;
|
||||
|
||||
bargs_off = stack_size;
|
||||
|
|
@ -2639,9 +2657,9 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
|
|||
emit(A64_STR64I(A64_R(10), A64_SP, ip_off), ctx);
|
||||
}
|
||||
|
||||
/* save arg regs count*/
|
||||
emit(A64_MOVZ(1, A64_R(10), nfuncargs, 0), ctx);
|
||||
emit(A64_STR64I(A64_R(10), A64_SP, nfuncargs_off), ctx);
|
||||
/* save function metadata */
|
||||
func_meta = nfuncargs;
|
||||
store_func_meta(ctx, func_meta, func_meta_off);
|
||||
|
||||
/* save args for bpf */
|
||||
save_args(ctx, bargs_off, oargs_off, m, a, false);
|
||||
|
|
@ -2659,10 +2677,27 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
|
|||
emit_call((const u64)__bpf_tramp_enter, ctx);
|
||||
}
|
||||
|
||||
for (i = 0; i < fentry->nr_links; i++)
|
||||
if (fsession_cnt) {
|
||||
/* clear all the session cookies' value */
|
||||
emit(A64_MOVZ(1, A64_R(10), 0, 0), ctx);
|
||||
for (int i = 0; i < cookie_cnt; i++)
|
||||
emit(A64_STR64I(A64_R(10), A64_SP, cookie_off + 8 * i), ctx);
|
||||
/* clear the return value to make sure fentry always gets 0 */
|
||||
emit(A64_STR64I(A64_R(10), A64_SP, retval_off), ctx);
|
||||
}
|
||||
|
||||
cookie_bargs_off = (bargs_off - cookie_off) / 8;
|
||||
for (i = 0; i < fentry->nr_links; i++) {
|
||||
if (bpf_prog_calls_session_cookie(fentry->links[i])) {
|
||||
u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT);
|
||||
|
||||
store_func_meta(ctx, meta, func_meta_off);
|
||||
cookie_bargs_off--;
|
||||
}
|
||||
invoke_bpf_prog(ctx, fentry->links[i], bargs_off,
|
||||
retval_off, run_ctx_off,
|
||||
flags & BPF_TRAMP_F_RET_FENTRY_RET);
|
||||
}
|
||||
|
||||
if (fmod_ret->nr_links) {
|
||||
branches = kcalloc(fmod_ret->nr_links, sizeof(__le32 *),
|
||||
|
|
@ -2694,9 +2729,22 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
|
|||
*branches[i] = cpu_to_le32(A64_CBNZ(1, A64_R(10), offset));
|
||||
}
|
||||
|
||||
for (i = 0; i < fexit->nr_links; i++)
|
||||
/* set the "is_return" flag for fsession */
|
||||
func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT);
|
||||
if (fsession_cnt)
|
||||
store_func_meta(ctx, func_meta, func_meta_off);
|
||||
|
||||
cookie_bargs_off = (bargs_off - cookie_off) / 8;
|
||||
for (i = 0; i < fexit->nr_links; i++) {
|
||||
if (bpf_prog_calls_session_cookie(fexit->links[i])) {
|
||||
u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT);
|
||||
|
||||
store_func_meta(ctx, meta, func_meta_off);
|
||||
cookie_bargs_off--;
|
||||
}
|
||||
invoke_bpf_prog(ctx, fexit->links[i], bargs_off, retval_off,
|
||||
run_ctx_off, false);
|
||||
}
|
||||
|
||||
if (flags & BPF_TRAMP_F_CALL_ORIG) {
|
||||
im->ip_epilogue = ctx->ro_image + ctx->idx;
|
||||
|
|
@ -2746,6 +2794,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
|
|||
return ctx->idx;
|
||||
}
|
||||
|
||||
bool bpf_jit_supports_fsession(void)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
|
||||
struct bpf_tramp_links *tlinks, void *func_addr)
|
||||
{
|
||||
|
|
@ -3132,7 +3185,7 @@ void bpf_jit_free(struct bpf_prog *prog)
|
|||
bpf_jit_binary_pack_finalize(jit_data->ro_header, jit_data->header);
|
||||
kfree(jit_data);
|
||||
}
|
||||
prog->bpf_func -= cfi_get_offset();
|
||||
prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
|
||||
hdr = bpf_jit_binary_pack_hdr(prog);
|
||||
bpf_jit_binary_pack_free(hdr, NULL);
|
||||
priv_stack_ptr = prog->aux->priv_stack_ptr;
|
||||
|
|
|
|||
|
|
@ -336,6 +336,7 @@ config X86
|
|||
select SCHED_SMT if SMP
|
||||
select ARCH_SUPPORTS_SCHED_CLUSTER if SMP
|
||||
select ARCH_SUPPORTS_SCHED_MC if SMP
|
||||
select HAVE_SINGLE_FTRACE_DIRECT_OPS if X86_64 && DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
||||
|
||||
config INSTRUCTION_DECODER
|
||||
def_bool y
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs)
|
|||
}
|
||||
|
||||
#define arch_ftrace_partial_regs(regs) do { \
|
||||
regs->flags &= ~X86_EFLAGS_FIXED; \
|
||||
regs->flags |= X86_EFLAGS_FIXED; \
|
||||
regs->cs = __KERNEL_CS; \
|
||||
} while (0)
|
||||
|
||||
|
|
|
|||
|
|
@ -364,6 +364,9 @@ SYM_CODE_START(return_to_handler)
|
|||
UNWIND_HINT_UNDEFINED
|
||||
ANNOTATE_NOENDBR
|
||||
|
||||
/* Store original rsp for pt_regs.sp value. */
|
||||
movq %rsp, %rdi
|
||||
|
||||
/* Restore return_to_handler value that got eaten by previous ret instruction. */
|
||||
subq $8, %rsp
|
||||
UNWIND_HINT_FUNC
|
||||
|
|
@ -374,7 +377,7 @@ SYM_CODE_START(return_to_handler)
|
|||
movq %rax, RAX(%rsp)
|
||||
movq %rdx, RDX(%rsp)
|
||||
movq %rbp, RBP(%rsp)
|
||||
movq %rsp, RSP(%rsp)
|
||||
movq %rdi, RSP(%rsp)
|
||||
movq %rsp, %rdi
|
||||
|
||||
call ftrace_return_to_handler
|
||||
|
|
|
|||
|
|
@ -1300,12 +1300,23 @@ static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm)
|
|||
emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm);
|
||||
}
|
||||
|
||||
/*
 * Emit code that loads the 64-bit immediate @imm64 into @reg and then stores
 * @reg as a 64-bit (BPF_DW) value at BPF_REG_FP + @stack_off.
 *
 * @reg is overwritten. The callers visible in this file pass a negated
 * offset (e.g. -func_meta_off) as @stack_off.
 */
static void emit_store_stack_imm64(u8 **pprog, int reg, int stack_off, u64 imm64)
|
||||
{
|
||||
/*
|
||||
* mov reg, imm64
|
||||
* mov QWORD PTR [rbp + stack_off], reg
|
||||
*/
|
||||
/* emit_mov_imm64() takes the upper and lower 32-bit halves separately */
emit_mov_imm64(pprog, reg, imm64 >> 32, (u32) imm64);
|
||||
emit_stx(pprog, BPF_DW, BPF_REG_FP, reg, stack_off);
|
||||
}
|
||||
|
||||
static int emit_atomic_rmw(u8 **pprog, u32 atomic_op,
|
||||
u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
|
||||
{
|
||||
u8 *prog = *pprog;
|
||||
|
||||
EMIT1(0xF0); /* lock prefix */
|
||||
if (atomic_op != BPF_XCHG)
|
||||
EMIT1(0xF0); /* lock prefix */
|
||||
|
||||
maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW);
|
||||
|
||||
|
|
@ -1347,7 +1358,9 @@ static int emit_atomic_rmw_index(u8 **pprog, u32 atomic_op, u32 size,
|
|||
{
|
||||
u8 *prog = *pprog;
|
||||
|
||||
EMIT1(0xF0); /* lock prefix */
|
||||
if (atomic_op != BPF_XCHG)
|
||||
EMIT1(0xF0); /* lock prefix */
|
||||
|
||||
switch (size) {
|
||||
case BPF_W:
|
||||
EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg));
|
||||
|
|
@ -3081,13 +3094,19 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
|
|||
|
||||
static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
|
||||
struct bpf_tramp_links *tl, int stack_size,
|
||||
int run_ctx_off, bool save_ret,
|
||||
void *image, void *rw_image)
|
||||
int run_ctx_off, int func_meta_off, bool save_ret,
|
||||
void *image, void *rw_image, u64 func_meta,
|
||||
int cookie_off)
|
||||
{
|
||||
int i;
|
||||
int i, cur_cookie = (cookie_off - stack_size) / 8;
|
||||
u8 *prog = *pprog;
|
||||
|
||||
for (i = 0; i < tl->nr_links; i++) {
|
||||
if (tl->links[i]->link.prog->call_session_cookie) {
|
||||
emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off,
|
||||
func_meta | (cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT));
|
||||
cur_cookie--;
|
||||
}
|
||||
if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
|
||||
run_ctx_off, save_ret, image, rw_image))
|
||||
return -EINVAL;
|
||||
|
|
@ -3205,12 +3224,14 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
|
|||
void *func_addr)
|
||||
{
|
||||
int i, ret, nr_regs = m->nr_args, stack_size = 0;
|
||||
int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
|
||||
int regs_off, func_meta_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
|
||||
struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
|
||||
struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
|
||||
struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
|
||||
void *orig_call = func_addr;
|
||||
int cookie_off, cookie_cnt;
|
||||
u8 **branches = NULL;
|
||||
u64 func_meta;
|
||||
u8 *prog;
|
||||
bool save_ret;
|
||||
|
||||
|
|
@ -3246,7 +3267,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
|
|||
* [ ... ]
|
||||
* RBP - regs_off [ reg_arg1 ] program's ctx pointer
|
||||
*
|
||||
* RBP - nregs_off [ regs count ] always
|
||||
* RBP - func_meta_off [ regs count, etc ] always
|
||||
*
|
||||
* RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
|
||||
*
|
||||
|
|
@ -3269,15 +3290,20 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
|
|||
stack_size += nr_regs * 8;
|
||||
regs_off = stack_size;
|
||||
|
||||
/* regs count */
|
||||
/* function metadata, such as regs count */
|
||||
stack_size += 8;
|
||||
nregs_off = stack_size;
|
||||
func_meta_off = stack_size;
|
||||
|
||||
if (flags & BPF_TRAMP_F_IP_ARG)
|
||||
stack_size += 8; /* room for IP address argument */
|
||||
|
||||
ip_off = stack_size;
|
||||
|
||||
cookie_cnt = bpf_fsession_cookie_cnt(tlinks);
|
||||
/* room for session cookies */
|
||||
stack_size += cookie_cnt * 8;
|
||||
cookie_off = stack_size;
|
||||
|
||||
stack_size += 8;
|
||||
rbx_off = stack_size;
|
||||
|
||||
|
|
@ -3345,20 +3371,13 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
|
|||
/* mov QWORD PTR [rbp - rbx_off], rbx */
|
||||
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off);
|
||||
|
||||
/* Store number of argument registers of the traced function:
|
||||
* mov rax, nr_regs
|
||||
* mov QWORD PTR [rbp - nregs_off], rax
|
||||
*/
|
||||
emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_regs);
|
||||
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -nregs_off);
|
||||
func_meta = nr_regs;
|
||||
/* Store number of argument registers of the traced function */
|
||||
emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta);
|
||||
|
||||
if (flags & BPF_TRAMP_F_IP_ARG) {
|
||||
/* Store IP address of the traced function:
|
||||
* movabsq rax, func_addr
|
||||
* mov QWORD PTR [rbp - ip_off], rax
|
||||
*/
|
||||
emit_mov_imm64(&prog, BPF_REG_0, (long) func_addr >> 32, (u32) (long) func_addr);
|
||||
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
|
||||
/* Store IP address of the traced function */
|
||||
emit_store_stack_imm64(&prog, BPF_REG_0, -ip_off, (long)func_addr);
|
||||
}
|
||||
|
||||
save_args(m, &prog, regs_off, false, flags);
|
||||
|
|
@ -3373,9 +3392,18 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
|
|||
}
|
||||
}
|
||||
|
||||
if (bpf_fsession_cnt(tlinks)) {
|
||||
/* clear all the session cookies' value */
|
||||
for (int i = 0; i < cookie_cnt; i++)
|
||||
emit_store_stack_imm64(&prog, BPF_REG_0, -cookie_off + 8 * i, 0);
|
||||
/* clear the return value to make sure fentry always gets 0 */
|
||||
emit_store_stack_imm64(&prog, BPF_REG_0, -8, 0);
|
||||
}
|
||||
|
||||
if (fentry->nr_links) {
|
||||
if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
|
||||
flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image))
|
||||
if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, func_meta_off,
|
||||
flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image,
|
||||
func_meta, cookie_off))
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
|
@ -3435,9 +3463,14 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
|
|||
}
|
||||
}
|
||||
|
||||
/* set the "is_return" flag for fsession */
|
||||
func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT);
|
||||
if (bpf_fsession_cnt(tlinks))
|
||||
emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta);
|
||||
|
||||
if (fexit->nr_links) {
|
||||
if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off,
|
||||
false, image, rw_image)) {
|
||||
if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, func_meta_off,
|
||||
false, image, rw_image, func_meta, cookie_off)) {
|
||||
ret = -EINVAL;
|
||||
goto cleanup;
|
||||
}
|
||||
|
|
@ -4079,3 +4112,8 @@ bool bpf_jit_supports_timed_may_goto(void)
|
|||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
bool bpf_jit_supports_fsession(void)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -295,9 +295,6 @@ hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t rdwr
|
|||
{
|
||||
struct hid_bpf_ctx_kern *ctx_kern;
|
||||
|
||||
if (!ctx)
|
||||
return NULL;
|
||||
|
||||
ctx_kern = container_of(ctx, struct hid_bpf_ctx_kern, ctx);
|
||||
|
||||
if (rdwr_buf_size + offset > ctx->allocated_size)
|
||||
|
|
@ -364,7 +361,7 @@ __hid_bpf_hw_check_params(struct hid_bpf_ctx *ctx, __u8 *buf, size_t *buf__sz,
|
|||
u32 report_len;
|
||||
|
||||
/* check arguments */
|
||||
if (!ctx || !hid_ops || !buf)
|
||||
if (!hid_ops)
|
||||
return -EINVAL;
|
||||
|
||||
switch (rtype) {
|
||||
|
|
|
|||
|
|
@ -33,11 +33,9 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx,
|
|||
/* bpf_wq implementation */
|
||||
extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym;
|
||||
extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym;
|
||||
extern int bpf_wq_set_callback_impl(struct bpf_wq *wq,
|
||||
int (callback_fn)(void *map, int *key, void *value),
|
||||
unsigned int flags__k, void *aux__ign) __ksym;
|
||||
#define bpf_wq_set_callback(wq, cb, flags) \
|
||||
bpf_wq_set_callback_impl(wq, cb, flags, NULL)
|
||||
extern int bpf_wq_set_callback(struct bpf_wq *wq,
|
||||
int (*callback_fn)(void *, int *, void *),
|
||||
unsigned int flags) __weak __ksym;
|
||||
|
||||
#define HID_MAX_DESCRIPTOR_SIZE 4096
|
||||
#define HID_IGNORE_EVENT -1
|
||||
|
|
|
|||
|
|
@ -68,10 +68,7 @@ __bpf_kfunc void bpf_put_file(struct file *file)
|
|||
*
|
||||
* Resolve the pathname for the supplied *path* and store it in *buf*. This BPF
|
||||
* kfunc is the safer variant of the legacy bpf_d_path() helper and should be
|
||||
* used in place of bpf_d_path() whenever possible. It enforces KF_TRUSTED_ARGS
|
||||
* semantics, meaning that the supplied *path* must itself hold a valid
|
||||
* reference, or else the BPF program will be outright rejected by the BPF
|
||||
* verifier.
|
||||
* used in place of bpf_d_path() whenever possible.
|
||||
*
|
||||
* This BPF kfunc may only be called from BPF LSM programs.
|
||||
*
|
||||
|
|
@ -359,14 +356,13 @@ __bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__s
|
|||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(bpf_fs_kfunc_set_ids)
|
||||
BTF_ID_FLAGS(func, bpf_get_task_exe_file,
|
||||
KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
|
||||
BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_RET_NULL)
|
||||
BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE)
|
||||
BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_path_d_path)
|
||||
BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE)
|
||||
BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE)
|
||||
BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE)
|
||||
BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE)
|
||||
BTF_KFUNCS_END(bpf_fs_kfunc_set_ids)
|
||||
|
||||
static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id)
|
||||
|
|
@ -377,9 +373,8 @@ static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id)
|
|||
return -EACCES;
|
||||
}
|
||||
|
||||
/* bpf_[set|remove]_dentry_xattr.* hooks have KF_TRUSTED_ARGS and
|
||||
* KF_SLEEPABLE, so they are only available to sleepable hooks with
|
||||
* dentry arguments.
|
||||
/* bpf_[set|remove]_dentry_xattr.* hooks have KF_SLEEPABLE, so they are only
|
||||
* available to sleepable hooks with dentry arguments.
|
||||
*
|
||||
* Setting and removing xattr requires exclusive lock on dentry->d_inode.
|
||||
* Some hooks already locked d_inode, while some hooks have not locked
|
||||
|
|
|
|||
|
|
@ -162,7 +162,7 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *di
|
|||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(fsverity_set_ids)
|
||||
BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_get_fsverity_digest)
|
||||
BTF_KFUNCS_END(fsverity_set_ids)
|
||||
|
||||
static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id)
|
||||
|
|
|
|||
|
|
@ -191,7 +191,7 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock)
|
|||
|
||||
#else
|
||||
|
||||
#define res_spin_lock(lock) resilient_tas_spin_lock(lock)
|
||||
#define res_spin_lock(lock) ({ grab_held_lock_entry(lock); resilient_tas_spin_lock(lock); })
|
||||
|
||||
#endif /* CONFIG_QUEUED_SPINLOCKS */
|
||||
|
||||
|
|
|
|||
|
|
@ -172,7 +172,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
|
|||
void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
|
||||
int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map);
|
||||
|
||||
int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
|
||||
int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value, u64 flags);
|
||||
int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
|
||||
void *value, u64 flags);
|
||||
|
||||
|
|
@ -470,7 +470,7 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
|
|||
static inline void bpf_cgroup_storage_free(
|
||||
struct bpf_cgroup_storage *storage) {}
|
||||
static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
|
||||
void *value) {
|
||||
void *value, u64 flags) {
|
||||
return 0;
|
||||
}
|
||||
static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
|
||||
|
|
|
|||
|
|
@ -287,6 +287,7 @@ struct bpf_map_owner {
|
|||
enum bpf_prog_type type;
|
||||
bool jited;
|
||||
bool xdp_has_frags;
|
||||
bool sleepable;
|
||||
u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE];
|
||||
const struct btf_type *attach_func_proto;
|
||||
enum bpf_attach_type expected_attach_type;
|
||||
|
|
@ -673,6 +674,22 @@ void bpf_map_free_internal_structs(struct bpf_map *map, void *obj);
|
|||
int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
|
||||
struct bpf_dynptr *ptr__uninit);
|
||||
|
||||
#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
|
||||
void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
|
||||
u64 flags);
|
||||
void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt);
|
||||
#else
|
||||
static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
|
||||
int node_id, u64 flags)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
extern const struct bpf_map_ops bpf_map_offload_ops;
|
||||
|
||||
/* bpf_type_flag contains a set of flags that are applicable to the values of
|
||||
|
|
@ -737,7 +754,7 @@ enum bpf_type_flag {
|
|||
MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS),
|
||||
|
||||
/* PTR was passed from the kernel in a trusted context, and may be
|
||||
* passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions.
|
||||
* passed to kfuncs or BPF helper functions.
|
||||
* Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above.
|
||||
* PTR_UNTRUSTED refers to a kptr that was read directly from a map
|
||||
* without invoking bpf_kptr_xchg(). What we really need to know is
|
||||
|
|
@ -1213,6 +1230,9 @@ enum {
|
|||
#endif
|
||||
};
|
||||
|
||||
#define BPF_TRAMP_COOKIE_INDEX_SHIFT 8
|
||||
#define BPF_TRAMP_IS_RETURN_SHIFT 63
|
||||
|
||||
struct bpf_tramp_links {
|
||||
struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS];
|
||||
int nr_links;
|
||||
|
|
@ -1293,6 +1313,7 @@ enum bpf_tramp_prog_type {
|
|||
BPF_TRAMP_MODIFY_RETURN,
|
||||
BPF_TRAMP_MAX,
|
||||
BPF_TRAMP_REPLACE, /* more than MAX */
|
||||
BPF_TRAMP_FSESSION,
|
||||
};
|
||||
|
||||
struct bpf_tramp_image {
|
||||
|
|
@ -1309,14 +1330,17 @@ struct bpf_tramp_image {
|
|||
};
|
||||
|
||||
struct bpf_trampoline {
|
||||
/* hlist for trampoline_table */
|
||||
struct hlist_node hlist;
|
||||
/* hlist for trampoline_key_table */
|
||||
struct hlist_node hlist_key;
|
||||
/* hlist for trampoline_ip_table */
|
||||
struct hlist_node hlist_ip;
|
||||
struct ftrace_ops *fops;
|
||||
/* serializes access to fields of this trampoline */
|
||||
struct mutex mutex;
|
||||
refcount_t refcnt;
|
||||
u32 flags;
|
||||
u64 key;
|
||||
unsigned long ip;
|
||||
struct {
|
||||
struct btf_func_model model;
|
||||
void *addr;
|
||||
|
|
@ -1418,7 +1442,7 @@ bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr);
|
|||
int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset,
|
||||
void *src, u64 len, u64 flags);
|
||||
void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
|
||||
void *buffer__opt, u64 buffer__szk);
|
||||
void *buffer__nullable, u64 buffer__szk);
|
||||
|
||||
static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len)
|
||||
{
|
||||
|
|
@ -1742,8 +1766,12 @@ struct bpf_prog_aux {
|
|||
struct rcu_head rcu;
|
||||
};
|
||||
struct bpf_stream stream[2];
|
||||
struct mutex st_ops_assoc_mutex;
|
||||
struct bpf_map __rcu *st_ops_assoc;
|
||||
};
|
||||
|
||||
#define BPF_NR_CONTEXTS 4 /* normal, softirq, hardirq, NMI */
|
||||
|
||||
struct bpf_prog {
|
||||
u16 pages; /* Number of allocated pages */
|
||||
u16 jited:1, /* Is our filter JIT'ed? */
|
||||
|
|
@ -1759,6 +1787,7 @@ struct bpf_prog {
|
|||
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
|
||||
call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
|
||||
call_get_func_ip:1, /* Do we call get_func_ip() */
|
||||
call_session_cookie:1, /* Do we call bpf_session_cookie() */
|
||||
tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
|
||||
sleepable:1; /* BPF program is sleepable */
|
||||
enum bpf_prog_type type; /* Type of BPF program */
|
||||
|
|
@ -1770,7 +1799,7 @@ struct bpf_prog {
|
|||
u8 tag[BPF_TAG_SIZE];
|
||||
};
|
||||
struct bpf_prog_stats __percpu *stats;
|
||||
int __percpu *active;
|
||||
u8 __percpu *active; /* u8[BPF_NR_CONTEXTS] for recursion protection */
|
||||
unsigned int (*bpf_func)(const void *ctx,
|
||||
const struct bpf_insn *insn);
|
||||
struct bpf_prog_aux *aux; /* Auxiliary fields */
|
||||
|
|
@ -1855,6 +1884,11 @@ struct bpf_tracing_link {
|
|||
struct bpf_prog *tgt_prog;
|
||||
};
|
||||
|
||||
struct bpf_fsession_link {
|
||||
struct bpf_tracing_link link;
|
||||
struct bpf_tramp_link fexit;
|
||||
};
|
||||
|
||||
struct bpf_raw_tp_link {
|
||||
struct bpf_link link;
|
||||
struct bpf_raw_event_map *btp;
|
||||
|
|
@ -2002,6 +2036,40 @@ struct bpf_struct_ops_common_value {
|
|||
enum bpf_struct_ops_state state;
|
||||
};
|
||||
|
||||
/*
 * Mark @prog as entered on the current CPU and report whether this is the
 * only active invocation there.
 *
 * Returns true when no other invocation of @prog is active on this CPU and
 * false otherwise. The per-CPU counter is incremented in both cases.
 */
static inline bool bpf_prog_get_recursion_context(struct bpf_prog *prog)
|
||||
{
|
||||
#ifdef CONFIG_ARM64
|
||||
/* arm64: prog->active is used as an array of u8 counters indexed by rctx */
u8 rctx = interrupt_context_level();
|
||||
/*
 * NOTE(review): the per-CPU pointer is taken before preempt_disable();
 * presumably callers already run with migration disabled - confirm.
 */
u8 *active = this_cpu_ptr(prog->active);
|
||||
u32 val;
|
||||
|
||||
preempt_disable();
|
||||
active[rctx]++;
|
||||
/* read the four per-level counters at once as a little-endian 32-bit word */
val = le32_to_cpu(*(__le32 *)active);
|
||||
preempt_enable();
|
||||
/*
 * Only "this level's counter == 1 and every other counter == 0" counts
 * as a non-recursive entry.
 */
if (val != BIT(rctx * 8))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
#else
|
||||
/* other architectures: prog->active is treated as a single per-CPU int */
return this_cpu_inc_return(*(int __percpu *)(prog->active)) == 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Decrement the per-CPU recursion counter of @prog that
 * bpf_prog_get_recursion_context() incremented.
 */
static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog)
|
||||
{
|
||||
#ifdef CONFIG_ARM64
|
||||
/* arm64: drop the u8 counter belonging to the current interrupt context level */
u8 rctx = interrupt_context_level();
|
||||
/*
 * NOTE(review): the per-CPU pointer is taken before preempt_disable();
 * presumably callers already run with migration disabled - confirm.
 */
u8 *active = this_cpu_ptr(prog->active);
|
||||
|
||||
preempt_disable();
|
||||
active[rctx]--;
|
||||
preempt_enable();
|
||||
#else
|
||||
/* other architectures: prog->active is treated as a single per-CPU int */
this_cpu_dec(*(int __percpu *)(prog->active));
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
|
||||
/* This macro helps developer to register a struct_ops type and generate
|
||||
* type information correctly. Developers should use this macro to register
|
||||
|
|
@ -2044,6 +2112,9 @@ static inline void bpf_module_put(const void *data, struct module *owner)
|
|||
module_put(owner);
|
||||
}
|
||||
int bpf_struct_ops_link_create(union bpf_attr *attr);
|
||||
int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map);
|
||||
void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog);
|
||||
void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux);
|
||||
u32 bpf_struct_ops_id(const void *kdata);
|
||||
|
||||
#ifdef CONFIG_NET
|
||||
|
|
@ -2091,6 +2162,17 @@ static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
|
|||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
static inline void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog)
|
||||
{
|
||||
}
|
||||
static inline void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
|
||||
{
|
||||
}
|
||||
|
|
@ -2101,6 +2183,37 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op
|
|||
|
||||
#endif
|
||||
|
||||
static inline int bpf_fsession_cnt(struct bpf_tramp_links *links)
|
||||
{
|
||||
struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY];
|
||||
int cnt = 0;
|
||||
|
||||
for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) {
|
||||
if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION)
|
||||
cnt++;
|
||||
}
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
 * Return the call_session_cookie flag of the program attached through @link,
 * i.e. whether that program calls bpf_session_cookie().
 */
static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_link *link)
|
||||
{
|
||||
return link->link.prog->call_session_cookie;
|
||||
}
|
||||
|
||||
static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links)
|
||||
{
|
||||
struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY];
|
||||
int cnt = 0;
|
||||
|
||||
for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) {
|
||||
if (bpf_prog_calls_session_cookie(fentries.links[i]))
|
||||
cnt++;
|
||||
}
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog,
|
||||
const struct bpf_ctx_arg_aux *info, u32 cnt);
|
||||
|
||||
|
|
@ -2540,6 +2653,10 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
|
|||
int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
|
||||
unsigned long nr_pages, struct page **page_array);
|
||||
#ifdef CONFIG_MEMCG
|
||||
void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg,
|
||||
struct mem_cgroup **new_memcg);
|
||||
void bpf_map_memcg_exit(struct mem_cgroup *old_memcg,
|
||||
struct mem_cgroup *memcg);
|
||||
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
|
||||
int node);
|
||||
void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
|
||||
|
|
@ -2564,6 +2681,17 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
|
|||
kvcalloc(_n, _size, _flags)
|
||||
#define bpf_map_alloc_percpu(_map, _size, _align, _flags) \
|
||||
__alloc_percpu_gfp(_size, _align, _flags)
|
||||
static inline void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg,
|
||||
struct mem_cgroup **new_memcg)
|
||||
{
|
||||
*new_memcg = NULL;
|
||||
*old_memcg = NULL;
|
||||
}
|
||||
|
||||
static inline void bpf_map_memcg_exit(struct mem_cgroup *old_memcg,
|
||||
struct mem_cgroup *memcg)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int
|
||||
|
|
@ -2764,8 +2892,8 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env,
|
|||
struct bpf_func_state *caller,
|
||||
struct bpf_func_state *callee);
|
||||
|
||||
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
|
||||
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
|
||||
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 flags);
|
||||
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags);
|
||||
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
|
||||
u64 flags);
|
||||
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
|
||||
|
|
@ -3243,6 +3371,11 @@ static inline void bpf_prog_report_arena_violation(bool write, unsigned long add
|
|||
}
|
||||
#endif /* CONFIG_BPF_SYSCALL */
|
||||
|
||||
/*
 * Return true when capable() grants either CAP_NET_ADMIN or CAP_SYS_ADMIN.
 * CAP_NET_ADMIN is checked first; CAP_SYS_ADMIN is only consulted if that
 * check fails.
 */
static inline bool bpf_net_capable(void)
|
||||
{
|
||||
return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
|
||||
}
|
||||
|
||||
static __always_inline int
|
||||
bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
|
||||
{
|
||||
|
|
@ -3832,14 +3965,43 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
|
|||
}
|
||||
#endif
|
||||
|
||||
static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type)
|
||||
{
|
||||
switch (map_type) {
|
||||
case BPF_MAP_TYPE_PERCPU_ARRAY:
|
||||
case BPF_MAP_TYPE_PERCPU_HASH:
|
||||
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
|
||||
case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags)
|
||||
{
|
||||
if (flags & ~allowed_flags)
|
||||
u32 cpu;
|
||||
|
||||
if ((u32)flags & ~allowed_flags)
|
||||
return -EINVAL;
|
||||
|
||||
if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))
|
||||
return -EINVAL;
|
||||
|
||||
if (!(flags & BPF_F_CPU) && flags >> 32)
|
||||
return -EINVAL;
|
||||
|
||||
if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) {
|
||||
if (!bpf_map_supports_cpu_flags(map->map_type))
|
||||
return -EINVAL;
|
||||
if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS))
|
||||
return -EINVAL;
|
||||
|
||||
cpu = flags >> 32;
|
||||
if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus())
|
||||
return -ERANGE;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -15,12 +15,13 @@
|
|||
#include <linux/types.h>
|
||||
#include <linux/bpf_mem_alloc.h>
|
||||
#include <uapi/linux/btf.h>
|
||||
#include <asm/rqspinlock.h>
|
||||
|
||||
#define BPF_LOCAL_STORAGE_CACHE_SIZE 16
|
||||
|
||||
struct bpf_local_storage_map_bucket {
|
||||
struct hlist_head list;
|
||||
raw_spinlock_t lock;
|
||||
rqspinlock_t lock;
|
||||
};
|
||||
|
||||
/* The map is not the primary owner of a bpf_local_storage_elem.
|
||||
|
|
@ -67,6 +68,11 @@ struct bpf_local_storage_data {
|
|||
u8 data[] __aligned(8);
|
||||
};
|
||||
|
||||
#define SELEM_MAP_UNLINKED (1 << 0)
|
||||
#define SELEM_STORAGE_UNLINKED (1 << 1)
|
||||
#define SELEM_UNLINKED (SELEM_MAP_UNLINKED | SELEM_STORAGE_UNLINKED)
|
||||
#define SELEM_TOFREE (1 << 2)
|
||||
|
||||
/* Linked to bpf_local_storage and bpf_local_storage_map */
|
||||
struct bpf_local_storage_elem {
|
||||
struct hlist_node map_node; /* Linked to bpf_local_storage_map */
|
||||
|
|
@ -79,7 +85,9 @@ struct bpf_local_storage_elem {
|
|||
* after raw_spin_unlock
|
||||
*/
|
||||
};
|
||||
/* 8 bytes hole */
|
||||
atomic_t state;
|
||||
bool use_kmalloc_nolock;
|
||||
/* 3 bytes hole */
|
||||
/* The data is stored in another cacheline to minimize
|
||||
* the number of cachelines access during a cache hit.
|
||||
*/
|
||||
|
|
@ -88,13 +96,14 @@ struct bpf_local_storage_elem {
|
|||
|
||||
struct bpf_local_storage {
|
||||
struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
|
||||
struct bpf_local_storage_map __rcu *smap;
|
||||
struct hlist_head list; /* List of bpf_local_storage_elem */
|
||||
void *owner; /* The object that owns the above "list" of
|
||||
* bpf_local_storage_elem.
|
||||
*/
|
||||
struct rcu_head rcu;
|
||||
raw_spinlock_t lock; /* Protect adding/removing from the "list" */
|
||||
rqspinlock_t lock; /* Protect adding/removing from the "list" */
|
||||
u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */
|
||||
refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */
|
||||
bool use_kmalloc_nolock;
|
||||
};
|
||||
|
||||
|
|
@ -162,11 +171,10 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
|
|||
return SDATA(selem);
|
||||
}
|
||||
|
||||
void bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
|
||||
u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
|
||||
|
||||
void bpf_local_storage_map_free(struct bpf_map *map,
|
||||
struct bpf_local_storage_cache *cache,
|
||||
int __percpu *busy_counter);
|
||||
struct bpf_local_storage_cache *cache);
|
||||
|
||||
int bpf_local_storage_map_check_btf(const struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
|
|
@ -176,10 +184,11 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
|
|||
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
|
||||
struct bpf_local_storage_elem *selem);
|
||||
|
||||
void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now);
|
||||
int bpf_selem_unlink(struct bpf_local_storage_elem *selem);
|
||||
|
||||
void bpf_selem_link_map(struct bpf_local_storage_map *smap,
|
||||
struct bpf_local_storage_elem *selem);
|
||||
int bpf_selem_link_map(struct bpf_local_storage_map *smap,
|
||||
struct bpf_local_storage *local_storage,
|
||||
struct bpf_local_storage_elem *selem);
|
||||
|
||||
struct bpf_local_storage_elem *
|
||||
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
|
||||
|
|
|
|||
|
|
@ -340,4 +340,14 @@ static inline bool bpf_mprog_supported(enum bpf_prog_type type)
|
|||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool bpf_mprog_detach_empty(enum bpf_prog_type type)
|
||||
{
|
||||
switch (type) {
|
||||
case BPF_PROG_TYPE_SCHED_CLS:
|
||||
return bpf_net_capable();
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#endif /* __BPF_MPROG_H */
|
||||
|
|
|
|||
|
|
@ -147,8 +147,12 @@ struct bpf_reg_state {
|
|||
* registers. Example:
|
||||
* r1 = r2; both will have r1->id == r2->id == N
|
||||
* r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10
|
||||
* r3 = r2; both will have r3->id == r2->id == N
|
||||
* w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->off == 10
|
||||
*/
|
||||
#define BPF_ADD_CONST (1U << 31)
|
||||
#define BPF_ADD_CONST64 (1U << 31)
|
||||
#define BPF_ADD_CONST32 (1U << 30)
|
||||
#define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32)
|
||||
u32 id;
|
||||
/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
|
||||
* from a pointer-cast helper, bpf_sk_fullsock() and
|
||||
|
|
@ -692,12 +696,16 @@ struct bpf_id_pair {
|
|||
|
||||
struct bpf_idmap {
|
||||
u32 tmp_id_gen;
|
||||
u32 cnt;
|
||||
struct bpf_id_pair map[BPF_ID_MAP_SIZE];
|
||||
};
|
||||
|
||||
struct bpf_idset {
|
||||
u32 count;
|
||||
u32 ids[BPF_ID_MAP_SIZE];
|
||||
u32 num_ids;
|
||||
struct {
|
||||
u32 id;
|
||||
u32 cnt;
|
||||
} entries[BPF_ID_MAP_SIZE];
|
||||
};
|
||||
|
||||
/* see verifier.c:compute_scc_callchain() */
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@
|
|||
*
|
||||
* And the following kfunc:
|
||||
*
|
||||
* BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
|
||||
* BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE)
|
||||
*
|
||||
* All invocations to the kfunc must pass the unmodified, unwalked task:
|
||||
*
|
||||
|
|
@ -66,7 +66,6 @@
|
|||
* return 0;
|
||||
* }
|
||||
*/
|
||||
#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
|
||||
#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */
|
||||
#define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */
|
||||
#define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
|
||||
|
|
@ -79,6 +78,7 @@
|
|||
#define KF_ARENA_RET (1 << 13) /* kfunc returns an arena pointer */
|
||||
#define KF_ARENA_ARG1 (1 << 14) /* kfunc takes an arena pointer as its first argument */
|
||||
#define KF_ARENA_ARG2 (1 << 15) /* kfunc takes an arena pointer as its second argument */
|
||||
#define KF_IMPLICIT_ARGS (1 << 16) /* kfunc has implicit arguments supplied by the verifier */
|
||||
|
||||
/*
|
||||
* Tag marking a kernel function as a kfunc. This is meant to minimize the
|
||||
|
|
@ -220,6 +220,7 @@ bool btf_is_module(const struct btf *btf);
|
|||
bool btf_is_vmlinux(const struct btf *btf);
|
||||
struct module *btf_try_get_module(const struct btf *btf);
|
||||
u32 btf_nr_types(const struct btf *btf);
|
||||
u32 btf_named_start_id(const struct btf *btf, bool own);
|
||||
struct btf *btf_base_btf(const struct btf *btf);
|
||||
bool btf_type_is_i32(const struct btf_type *t);
|
||||
bool btf_type_is_i64(const struct btf_type *t);
|
||||
|
|
@ -575,8 +576,8 @@ const char *btf_name_by_offset(const struct btf *btf, u32 offset);
|
|||
const char *btf_str_by_offset(const struct btf *btf, u32 offset);
|
||||
struct btf *btf_parse_vmlinux(void);
|
||||
struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog);
|
||||
u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id,
|
||||
const struct bpf_prog *prog);
|
||||
u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog);
|
||||
bool btf_kfunc_is_allowed(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog);
|
||||
u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
|
||||
const struct bpf_prog *prog);
|
||||
int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
|
||||
|
|
|
|||
|
|
@ -1167,6 +1167,7 @@ bool bpf_jit_supports_arena(void);
|
|||
bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
|
||||
bool bpf_jit_supports_private_stack(void);
|
||||
bool bpf_jit_supports_timed_may_goto(void);
|
||||
bool bpf_jit_supports_fsession(void);
|
||||
u64 bpf_arch_uaddress_limit(void);
|
||||
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
|
||||
u64 arch_bpf_timed_may_goto(void);
|
||||
|
|
|
|||
|
|
@ -82,6 +82,7 @@ static inline void early_trace_init(void) { }
|
|||
|
||||
struct module;
|
||||
struct ftrace_hash;
|
||||
struct ftrace_func_entry;
|
||||
|
||||
#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \
|
||||
defined(CONFIG_DYNAMIC_FTRACE)
|
||||
|
|
@ -359,7 +360,6 @@ enum {
|
|||
FTRACE_OPS_FL_DIRECT = BIT(17),
|
||||
FTRACE_OPS_FL_SUBOP = BIT(18),
|
||||
FTRACE_OPS_FL_GRAPH = BIT(19),
|
||||
FTRACE_OPS_FL_JMP = BIT(20),
|
||||
};
|
||||
|
||||
#ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
|
||||
|
|
@ -403,9 +403,17 @@ enum ftrace_ops_cmd {
|
|||
* Negative on failure. The return value is dependent on the
|
||||
* callback.
|
||||
*/
|
||||
typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd);
|
||||
typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, unsigned long ip, enum ftrace_ops_cmd cmd);
|
||||
|
||||
#ifdef CONFIG_DYNAMIC_FTRACE
|
||||
|
||||
#define FTRACE_HASH_DEFAULT_BITS 10
|
||||
|
||||
struct ftrace_hash *alloc_ftrace_hash(int size_bits);
|
||||
void free_ftrace_hash(struct ftrace_hash *hash);
|
||||
struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash,
|
||||
unsigned long ip, unsigned long direct);
|
||||
|
||||
/* The hash used to know what functions callbacks trace */
|
||||
struct ftrace_ops_hash {
|
||||
struct ftrace_hash __rcu *notrace_hash;
|
||||
|
|
@ -535,6 +543,10 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
|
|||
int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr);
|
||||
int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr);
|
||||
|
||||
int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash);
|
||||
int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash);
|
||||
int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock);
|
||||
|
||||
void ftrace_stub_direct_tramp(void);
|
||||
|
||||
#else
|
||||
|
|
@ -561,6 +573,21 @@ static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned l
|
|||
return -ENODEV;
|
||||
}
|
||||
|
||||
static inline int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash)
|
||||
{
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
static inline int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash)
|
||||
{
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
static inline int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock)
|
||||
{
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
/*
|
||||
* This must be implemented by the architecture.
|
||||
* It is the way the ftrace direct_ops helper, when called
|
||||
|
|
|
|||
|
|
@ -33,6 +33,31 @@ struct ftrace_regs;
|
|||
#define ftrace_regs_get_frame_pointer(fregs) \
|
||||
frame_pointer(&arch_ftrace_regs(fregs)->regs)
|
||||
|
||||
static __always_inline void
|
||||
ftrace_partial_regs_update(struct ftrace_regs *fregs, struct pt_regs *regs) { }
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
* ftrace_partial_regs_update - update the original ftrace_regs from regs
|
||||
* @fregs: The ftrace_regs to update from @regs
|
||||
* @regs: The partial regs from ftrace_partial_regs() that was updated
|
||||
*
|
||||
* Some architectures have the partial regs living in the ftrace_regs
|
||||
* structure, whereas other architectures need to make a different copy
|
||||
* of the @regs. If a partial @regs is retrieved by ftrace_partial_regs() and
|
||||
* if the code using @regs updates a field (like the instruction pointer or
|
||||
* stack pointer) it may need to propagate that change to the original @fregs
|
||||
* it retrieved the partial @regs from. Use this function to guarantee that
|
||||
* update happens.
|
||||
*/
|
||||
static __always_inline void
|
||||
ftrace_partial_regs_update(struct ftrace_regs *fregs, struct pt_regs *regs)
|
||||
{
|
||||
ftrace_regs_set_instruction_pointer(fregs, instruction_pointer(regs));
|
||||
ftrace_regs_set_return_value(fregs, regs_return_value(regs));
|
||||
}
|
||||
|
||||
#endif /* HAVE_ARCH_FTRACE_REGS */
|
||||
|
||||
/* This can be overridden by the architectures */
|
||||
|
|
|
|||
|
|
@ -949,7 +949,11 @@ static inline void mod_memcg_page_state(struct page *page,
|
|||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
unsigned long memcg_events(struct mem_cgroup *memcg, int event);
|
||||
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
|
||||
unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
|
||||
bool memcg_stat_item_valid(int idx);
|
||||
bool memcg_vm_event_item_valid(enum vm_event_item idx);
|
||||
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
|
||||
unsigned long lruvec_page_state_local(struct lruvec *lruvec,
|
||||
enum node_stat_item idx);
|
||||
|
|
@ -1375,6 +1379,21 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool memcg_stat_item_valid(int idx)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool memcg_vm_event_item_valid(enum vm_event_item idx)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
|
||||
enum node_stat_item idx)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -97,6 +97,8 @@ struct sk_psock {
|
|||
struct sk_buff_head ingress_skb;
|
||||
struct list_head ingress_msg;
|
||||
spinlock_t ingress_lock;
|
||||
/** @msg_tot_len: Total bytes queued in ingress_msg list. */
|
||||
u32 msg_tot_len;
|
||||
unsigned long state;
|
||||
struct list_head link;
|
||||
spinlock_t link_lock;
|
||||
|
|
@ -141,6 +143,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
|
|||
struct sk_msg *msg, u32 bytes);
|
||||
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
int len, int flags);
|
||||
int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
int len, int flags, int *copied_from_self);
|
||||
bool sk_msg_is_readable(struct sock *sk);
|
||||
|
||||
static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
|
||||
|
|
@ -319,6 +323,27 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
|
|||
kfree_skb(skb);
|
||||
}
|
||||
|
||||
static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock)
|
||||
{
|
||||
/* Used by ioctl to read msg_tot_len only; lock-free for performance */
|
||||
return READ_ONCE(psock->msg_tot_len);
|
||||
}
|
||||
|
||||
static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff)
|
||||
{
|
||||
/* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock().
|
||||
* ingress_lock should be held to prevent concurrent updates to msg_tot_len
|
||||
*/
|
||||
WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff);
|
||||
}
|
||||
|
||||
static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff)
|
||||
{
|
||||
spin_lock_bh(&psock->ingress_lock);
|
||||
sk_psock_msg_len_add_locked(psock, diff);
|
||||
spin_unlock_bh(&psock->ingress_lock);
|
||||
}
|
||||
|
||||
static inline bool sk_psock_queue_msg(struct sk_psock *psock,
|
||||
struct sk_msg *msg)
|
||||
{
|
||||
|
|
@ -327,6 +352,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
|
|||
spin_lock_bh(&psock->ingress_lock);
|
||||
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
|
||||
list_add_tail(&msg->list, &psock->ingress_msg);
|
||||
sk_psock_msg_len_add_locked(psock, msg->sg.size);
|
||||
ret = true;
|
||||
} else {
|
||||
sk_msg_free(psock->sk, msg);
|
||||
|
|
@ -343,18 +369,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
|
|||
|
||||
spin_lock_bh(&psock->ingress_lock);
|
||||
msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
|
||||
if (msg)
|
||||
if (msg) {
|
||||
list_del(&msg->list);
|
||||
sk_psock_msg_len_add_locked(psock, -msg->sg.size);
|
||||
}
|
||||
spin_unlock_bh(&psock->ingress_lock);
|
||||
return msg;
|
||||
}
|
||||
|
||||
static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock)
|
||||
{
|
||||
return list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
|
||||
}
|
||||
|
||||
static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock)
|
||||
{
|
||||
struct sk_msg *msg;
|
||||
|
||||
spin_lock_bh(&psock->ingress_lock);
|
||||
msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
|
||||
msg = sk_psock_peek_msg_locked(psock);
|
||||
spin_unlock_bh(&psock->ingress_lock);
|
||||
return msg;
|
||||
}
|
||||
|
|
@ -521,6 +554,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
|
|||
return !!psock->saved_data_ready;
|
||||
}
|
||||
|
||||
/* for tcp only, sk is locked */
|
||||
static inline ssize_t sk_psock_msg_inq(struct sock *sk)
|
||||
{
|
||||
struct sk_psock *psock;
|
||||
ssize_t inq = 0;
|
||||
|
||||
psock = sk_psock_get(sk);
|
||||
if (likely(psock)) {
|
||||
inq = sk_psock_get_msg_len_nolock(psock);
|
||||
sk_psock_put(sk, psock);
|
||||
}
|
||||
return inq;
|
||||
}
|
||||
|
||||
/* for udp only, sk is not locked */
|
||||
static inline ssize_t sk_msg_first_len(struct sock *sk)
|
||||
{
|
||||
struct sk_psock *psock;
|
||||
struct sk_msg *msg;
|
||||
ssize_t inq = 0;
|
||||
|
||||
psock = sk_psock_get(sk);
|
||||
if (likely(psock)) {
|
||||
spin_lock_bh(&psock->ingress_lock);
|
||||
msg = sk_psock_peek_msg_locked(psock);
|
||||
if (msg)
|
||||
inq = msg->sg.size;
|
||||
spin_unlock_bh(&psock->ingress_lock);
|
||||
sk_psock_put(sk, psock);
|
||||
}
|
||||
return inq;
|
||||
}
|
||||
|
||||
#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
|
||||
|
||||
#define BPF_F_STRPARSER (1UL << 1)
|
||||
|
|
|
|||
|
|
@ -63,6 +63,11 @@ struct tnum tnum_union(struct tnum t1, struct tnum t2);
|
|||
/* Return @a with all but the lowest @size bytes cleared */
|
||||
struct tnum tnum_cast(struct tnum a, u8 size);
|
||||
|
||||
/* Swap the bytes of a tnum */
|
||||
struct tnum tnum_bswap16(struct tnum a);
|
||||
struct tnum tnum_bswap32(struct tnum a);
|
||||
struct tnum tnum_bswap64(struct tnum a);
|
||||
|
||||
/* Returns true if @a is a known constant */
|
||||
static inline bool tnum_is_const(struct tnum a)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order {
|
|||
BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */
|
||||
BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */
|
||||
BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */
|
||||
/*
|
||||
* Walks the immediate children of the specified parent
|
||||
* cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE,
|
||||
* BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP
|
||||
* the iterator does not include the specified parent as one of the
|
||||
* returned iterator elements.
|
||||
*/
|
||||
BPF_CGROUP_ITER_CHILDREN,
|
||||
};
|
||||
|
||||
union bpf_iter_link_info {
|
||||
|
|
@ -918,6 +926,16 @@ union bpf_iter_link_info {
|
|||
* Number of bytes read from the stream on success, or -1 if an
|
||||
* error occurred (in which case, *errno* is set appropriately).
|
||||
*
|
||||
* BPF_PROG_ASSOC_STRUCT_OPS
|
||||
* Description
|
||||
* Associate a BPF program with a struct_ops map. The struct_ops
|
||||
* map is identified by *map_fd* and the BPF program is
|
||||
* identified by *prog_fd*.
|
||||
*
|
||||
* Return
|
||||
* 0 on success or -1 if an error occurred (in which case,
|
||||
* *errno* is set appropriately).
|
||||
*
|
||||
* NOTES
|
||||
* eBPF objects (maps and programs) can be shared between processes.
|
||||
*
|
||||
|
|
@ -974,6 +992,7 @@ enum bpf_cmd {
|
|||
BPF_PROG_BIND_MAP,
|
||||
BPF_TOKEN_CREATE,
|
||||
BPF_PROG_STREAM_READ_BY_FD,
|
||||
BPF_PROG_ASSOC_STRUCT_OPS,
|
||||
__MAX_BPF_CMD,
|
||||
};
|
||||
|
||||
|
|
@ -1134,6 +1153,7 @@ enum bpf_attach_type {
|
|||
BPF_NETKIT_PEER,
|
||||
BPF_TRACE_KPROBE_SESSION,
|
||||
BPF_TRACE_UPROBE_SESSION,
|
||||
BPF_TRACE_FSESSION,
|
||||
__MAX_BPF_ATTACH_TYPE
|
||||
};
|
||||
|
||||
|
|
@ -1373,6 +1393,8 @@ enum {
|
|||
BPF_NOEXIST = 1, /* create new element if it didn't exist */
|
||||
BPF_EXIST = 2, /* update existing element */
|
||||
BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */
|
||||
BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */
|
||||
BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */
|
||||
};
|
||||
|
||||
/* flags for BPF_MAP_CREATE command */
|
||||
|
|
@ -1894,6 +1916,12 @@ union bpf_attr {
|
|||
__u32 prog_fd;
|
||||
} prog_stream_read;
|
||||
|
||||
struct {
|
||||
__u32 map_fd;
|
||||
__u32 prog_fd;
|
||||
__u32 flags;
|
||||
} prog_assoc_struct_ops;
|
||||
|
||||
} __attribute__((aligned(8)));
|
||||
|
||||
/* The description below is an attempt at providing documentation to eBPF
|
||||
|
|
|
|||
|
|
@ -178,7 +178,7 @@ config RUSTC_HAS_FILE_AS_C_STR
|
|||
|
||||
config PAHOLE_VERSION
|
||||
int
|
||||
default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE))
|
||||
default "$(PAHOLE_VERSION)"
|
||||
|
||||
config CONSTRUCTORS
|
||||
bool
|
||||
|
|
|
|||
|
|
@ -42,7 +42,17 @@ endif
|
|||
ifeq ($(CONFIG_BPF_JIT),y)
|
||||
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
|
||||
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
|
||||
# bpf_lsm_proto.o must precede bpf_lsm.o. The current pahole logic
|
||||
# deduplicates function prototypes within
|
||||
# btf_encoder__add_saved_func() by keeping the first instance seen. We
|
||||
# need the function prototype(s) in bpf_lsm_proto.o to take precedence
|
||||
# over those within bpf_lsm.o. Having bpf_lsm_proto.o precede
|
||||
# bpf_lsm.o ensures its DWARF CU is processed early, forcing the
|
||||
# generated BTF to contain the overrides.
|
||||
#
|
||||
# Notably, this is a temporary workaround whilst the deduplication
|
||||
# semantics within pahole are revisited accordingly.
|
||||
obj-${CONFIG_BPF_LSM} += bpf_lsm_proto.o bpf_lsm.o
|
||||
endif
|
||||
ifneq ($(CONFIG_CRYPTO),)
|
||||
obj-$(CONFIG_BPF_SYSCALL) += crypto.o
|
||||
|
|
|
|||
|
|
@ -2,11 +2,15 @@
|
|||
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/btf.h>
|
||||
#include <linux/cacheflush.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/irq_work.h>
|
||||
#include "linux/filter.h"
|
||||
#include <linux/llist.h>
|
||||
#include <linux/btf_ids.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include "range_tree.h"
|
||||
|
||||
/*
|
||||
|
|
@ -42,14 +46,31 @@
|
|||
#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
|
||||
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
|
||||
|
||||
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable);
|
||||
|
||||
struct bpf_arena {
|
||||
struct bpf_map map;
|
||||
u64 user_vm_start;
|
||||
u64 user_vm_end;
|
||||
struct vm_struct *kern_vm;
|
||||
struct range_tree rt;
|
||||
/* protects rt */
|
||||
rqspinlock_t spinlock;
|
||||
struct list_head vma_list;
|
||||
/* protects vma_list */
|
||||
struct mutex lock;
|
||||
struct irq_work free_irq;
|
||||
struct work_struct free_work;
|
||||
struct llist_head free_spans;
|
||||
};
|
||||
|
||||
static void arena_free_worker(struct work_struct *work);
|
||||
static void arena_free_irq(struct irq_work *iw);
|
||||
|
||||
struct arena_free_span {
|
||||
struct llist_node node;
|
||||
unsigned long uaddr;
|
||||
u32 page_cnt;
|
||||
};
|
||||
|
||||
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
|
||||
|
|
@ -92,6 +113,66 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
|
|||
return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
struct apply_range_data {
|
||||
struct page **pages;
|
||||
int i;
|
||||
};
|
||||
|
||||
static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
|
||||
{
|
||||
struct apply_range_data *d = data;
|
||||
struct page *page;
|
||||
|
||||
if (!data)
|
||||
return 0;
|
||||
/* sanity check */
|
||||
if (unlikely(!pte_none(ptep_get(pte))))
|
||||
return -EBUSY;
|
||||
|
||||
page = d->pages[d->i];
|
||||
/* paranoia, similar to vmap_pages_pte_range() */
|
||||
if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
|
||||
return -EINVAL;
|
||||
|
||||
set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
|
||||
d->i++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void flush_vmap_cache(unsigned long start, unsigned long size)
|
||||
{
|
||||
flush_cache_vmap(start, start + size);
|
||||
}
|
||||
|
||||
static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
|
||||
{
|
||||
pte_t old_pte;
|
||||
struct page *page;
|
||||
|
||||
/* sanity check */
|
||||
old_pte = ptep_get(pte);
|
||||
if (pte_none(old_pte) || !pte_present(old_pte))
|
||||
return 0; /* nothing to do */
|
||||
|
||||
page = pte_page(old_pte);
|
||||
if (WARN_ON_ONCE(!page))
|
||||
return -EINVAL;
|
||||
|
||||
pte_clear(&init_mm, addr, pte);
|
||||
|
||||
/* Add page to the list so it is freed later */
|
||||
if (free_pages)
|
||||
__llist_add(&page->pcp_llist, free_pages);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int populate_pgtable_except_pte(struct bpf_arena *arena)
|
||||
{
|
||||
return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
|
||||
KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
|
||||
}
|
||||
|
||||
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
|
||||
{
|
||||
struct vm_struct *kern_vm;
|
||||
|
|
@ -136,6 +217,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
|
|||
arena->user_vm_end = arena->user_vm_start + vm_range;
|
||||
|
||||
INIT_LIST_HEAD(&arena->vma_list);
|
||||
init_llist_head(&arena->free_spans);
|
||||
init_irq_work(&arena->free_irq, arena_free_irq);
|
||||
INIT_WORK(&arena->free_work, arena_free_worker);
|
||||
bpf_map_init_from_attr(&arena->map, attr);
|
||||
range_tree_init(&arena->rt);
|
||||
err = range_tree_set(&arena->rt, 0, attr->max_entries);
|
||||
|
|
@ -144,6 +228,13 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
|
|||
goto err;
|
||||
}
|
||||
mutex_init(&arena->lock);
|
||||
raw_res_spin_lock_init(&arena->spinlock);
|
||||
err = populate_pgtable_except_pte(arena);
|
||||
if (err) {
|
||||
range_tree_destroy(&arena->rt);
|
||||
bpf_map_area_free(arena);
|
||||
goto err;
|
||||
}
|
||||
|
||||
return &arena->map;
|
||||
err:
|
||||
|
|
@ -184,6 +275,10 @@ static void arena_map_free(struct bpf_map *map)
|
|||
if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
|
||||
return;
|
||||
|
||||
/* Ensure no pending deferred frees */
|
||||
irq_work_sync(&arena->free_irq);
|
||||
flush_work(&arena->free_work);
|
||||
|
||||
/*
|
||||
* free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
|
||||
* It unmaps everything from vmalloc area and clears pgtables.
|
||||
|
|
@ -265,44 +360,59 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
|
|||
{
|
||||
struct bpf_map *map = vmf->vma->vm_file->private_data;
|
||||
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
|
||||
struct mem_cgroup *new_memcg, *old_memcg;
|
||||
struct page *page;
|
||||
long kbase, kaddr;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
kbase = bpf_arena_get_kern_vm_start(arena);
|
||||
kaddr = kbase + (u32)(vmf->address);
|
||||
|
||||
guard(mutex)(&arena->lock);
|
||||
if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
|
||||
/* Make a reasonable effort to address impossible case */
|
||||
return VM_FAULT_RETRY;
|
||||
|
||||
page = vmalloc_to_page((void *)kaddr);
|
||||
if (page)
|
||||
/* already have a page vmap-ed */
|
||||
goto out;
|
||||
|
||||
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
|
||||
|
||||
if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
|
||||
/* User space requested to segfault when page is not allocated by bpf prog */
|
||||
return VM_FAULT_SIGSEGV;
|
||||
goto out_unlock_sigsegv;
|
||||
|
||||
ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
|
||||
if (ret)
|
||||
return VM_FAULT_SIGSEGV;
|
||||
goto out_unlock_sigsegv;
|
||||
|
||||
struct apply_range_data data = { .pages = &page, .i = 0 };
|
||||
/* Account into memcg of the process that created bpf_arena */
|
||||
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
|
||||
if (ret) {
|
||||
range_tree_set(&arena->rt, vmf->pgoff, 1);
|
||||
return VM_FAULT_SIGSEGV;
|
||||
goto out_unlock_sigsegv;
|
||||
}
|
||||
|
||||
ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
|
||||
ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
|
||||
if (ret) {
|
||||
range_tree_set(&arena->rt, vmf->pgoff, 1);
|
||||
__free_page(page);
|
||||
return VM_FAULT_SIGSEGV;
|
||||
free_pages_nolock(page, 0);
|
||||
goto out_unlock_sigsegv;
|
||||
}
|
||||
flush_vmap_cache(kaddr, PAGE_SIZE);
|
||||
bpf_map_memcg_exit(old_memcg, new_memcg);
|
||||
out:
|
||||
page_ref_add(page, 1);
|
||||
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
|
||||
vmf->page = page;
|
||||
return 0;
|
||||
out_unlock_sigsegv:
|
||||
bpf_map_memcg_exit(old_memcg, new_memcg);
|
||||
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
|
||||
return VM_FAULT_SIGSEGV;
|
||||
}
|
||||
|
||||
static const struct vm_operations_struct arena_vm_ops = {
|
||||
|
|
@ -423,12 +533,18 @@ static u64 clear_lo32(u64 val)
|
|||
* Allocate pages and vmap them into kernel vmalloc area.
|
||||
* Later the pages will be mmaped into user space vma.
|
||||
*/
|
||||
static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
|
||||
static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id,
|
||||
bool sleepable)
|
||||
{
|
||||
/* user_vm_end/start are fixed before bpf prog runs */
|
||||
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
|
||||
u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
|
||||
struct page **pages;
|
||||
struct mem_cgroup *new_memcg, *old_memcg;
|
||||
struct apply_range_data data;
|
||||
struct page **pages = NULL;
|
||||
long remaining, mapped = 0;
|
||||
long alloc_pages;
|
||||
unsigned long flags;
|
||||
long pgoff = 0;
|
||||
u32 uaddr32;
|
||||
int ret, i;
|
||||
|
|
@ -445,17 +561,23 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
|
||||
pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
|
||||
if (!pages)
|
||||
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
|
||||
/* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */
|
||||
alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
|
||||
pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE);
|
||||
if (!pages) {
|
||||
bpf_map_memcg_exit(old_memcg, new_memcg);
|
||||
return 0;
|
||||
}
|
||||
data.pages = pages;
|
||||
|
||||
guard(mutex)(&arena->lock);
|
||||
if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
|
||||
goto out_free_pages;
|
||||
|
||||
if (uaddr) {
|
||||
ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
|
||||
if (ret)
|
||||
goto out_free_pages;
|
||||
goto out_unlock_free_pages;
|
||||
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
|
||||
} else {
|
||||
ret = pgoff = range_tree_find(&arena->rt, page_cnt);
|
||||
|
|
@ -463,33 +585,62 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
|
|||
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
|
||||
}
|
||||
if (ret)
|
||||
goto out_free_pages;
|
||||
|
||||
ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
|
||||
if (ret)
|
||||
goto out;
|
||||
goto out_unlock_free_pages;
|
||||
|
||||
remaining = page_cnt;
|
||||
uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
|
||||
/* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
|
||||
* will not overflow 32-bit. Lower 32-bit need to represent
|
||||
* contiguous user address range.
|
||||
* Map these pages at kern_vm_start base.
|
||||
* kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
|
||||
* lower 32-bit and it's ok.
|
||||
*/
|
||||
ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
|
||||
kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
|
||||
if (ret) {
|
||||
for (i = 0; i < page_cnt; i++)
|
||||
__free_page(pages[i]);
|
||||
goto out;
|
||||
|
||||
while (remaining) {
|
||||
long this_batch = min(remaining, alloc_pages);
|
||||
|
||||
/* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
|
||||
memset(pages, 0, this_batch * sizeof(struct page *));
|
||||
|
||||
ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
|
||||
* will not overflow 32-bit. Lower 32-bit need to represent
|
||||
* contiguous user address range.
|
||||
* Map these pages at kern_vm_start base.
|
||||
* kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
|
||||
* lower 32-bit and it's ok.
|
||||
*/
|
||||
data.i = 0;
|
||||
ret = apply_to_page_range(&init_mm,
|
||||
kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT),
|
||||
this_batch << PAGE_SHIFT, apply_range_set_cb, &data);
|
||||
if (ret) {
|
||||
/* data.i pages were mapped, account them and free the remaining */
|
||||
mapped += data.i;
|
||||
for (i = data.i; i < this_batch; i++)
|
||||
free_pages_nolock(pages[i], 0);
|
||||
goto out;
|
||||
}
|
||||
|
||||
mapped += this_batch;
|
||||
remaining -= this_batch;
|
||||
}
|
||||
kvfree(pages);
|
||||
flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
|
||||
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
|
||||
kfree_nolock(pages);
|
||||
bpf_map_memcg_exit(old_memcg, new_memcg);
|
||||
return clear_lo32(arena->user_vm_start) + uaddr32;
|
||||
out:
|
||||
range_tree_set(&arena->rt, pgoff, page_cnt);
|
||||
range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped);
|
||||
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
|
||||
if (mapped) {
|
||||
flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
|
||||
arena_free_pages(arena, uaddr32, mapped, sleepable);
|
||||
}
|
||||
goto out_free_pages;
|
||||
out_unlock_free_pages:
|
||||
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
|
||||
out_free_pages:
|
||||
kvfree(pages);
|
||||
kfree_nolock(pages);
|
||||
bpf_map_memcg_exit(old_memcg, new_memcg);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -502,42 +653,66 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
|
|||
{
|
||||
struct vma_list *vml;
|
||||
|
||||
guard(mutex)(&arena->lock);
|
||||
/* iterate link list under lock */
|
||||
list_for_each_entry(vml, &arena->vma_list, head)
|
||||
zap_page_range_single(vml->vma, uaddr,
|
||||
PAGE_SIZE * page_cnt, NULL);
|
||||
}
|
||||
|
||||
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
|
||||
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
|
||||
{
|
||||
struct mem_cgroup *new_memcg, *old_memcg;
|
||||
u64 full_uaddr, uaddr_end;
|
||||
long kaddr, pgoff, i;
|
||||
long kaddr, pgoff;
|
||||
struct page *page;
|
||||
struct llist_head free_pages;
|
||||
struct llist_node *pos, *t;
|
||||
struct arena_free_span *s;
|
||||
unsigned long flags;
|
||||
int ret = 0;
|
||||
|
||||
/* only aligned lower 32-bit are relevant */
|
||||
uaddr = (u32)uaddr;
|
||||
uaddr &= PAGE_MASK;
|
||||
kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
|
||||
full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
|
||||
uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
|
||||
if (full_uaddr >= uaddr_end)
|
||||
return;
|
||||
|
||||
page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
|
||||
|
||||
guard(mutex)(&arena->lock);
|
||||
|
||||
pgoff = compute_pgoff(arena, uaddr);
|
||||
/* clear range */
|
||||
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
|
||||
|
||||
if (!sleepable)
|
||||
goto defer;
|
||||
|
||||
ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags);
|
||||
|
||||
/* Can't proceed without holding the spinlock so defer the free */
|
||||
if (ret)
|
||||
goto defer;
|
||||
|
||||
range_tree_set(&arena->rt, pgoff, page_cnt);
|
||||
|
||||
init_llist_head(&free_pages);
|
||||
/* clear ptes and collect struct pages */
|
||||
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
|
||||
apply_range_clear_cb, &free_pages);
|
||||
|
||||
/* drop the lock to do the tlb flush and zap pages */
|
||||
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
|
||||
|
||||
/* ensure no stale TLB entries */
|
||||
flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE));
|
||||
|
||||
if (page_cnt > 1)
|
||||
/* bulk zap if multiple pages being freed */
|
||||
zap_pages(arena, full_uaddr, page_cnt);
|
||||
|
||||
kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
|
||||
for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
|
||||
page = vmalloc_to_page((void *)kaddr);
|
||||
if (!page)
|
||||
continue;
|
||||
llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
|
||||
page = llist_entry(pos, struct page, pcp_llist);
|
||||
if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
|
||||
/* Optimization for the common case of page_cnt==1:
|
||||
* If page wasn't mapped into some user vma there
|
||||
|
|
@ -545,9 +720,27 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
|
|||
* page_cnt is big it's faster to do the batched zap.
|
||||
*/
|
||||
zap_pages(arena, full_uaddr, 1);
|
||||
vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
|
||||
__free_page(page);
|
||||
}
|
||||
bpf_map_memcg_exit(old_memcg, new_memcg);
|
||||
|
||||
return;
|
||||
|
||||
defer:
|
||||
s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1);
|
||||
bpf_map_memcg_exit(old_memcg, new_memcg);
|
||||
if (!s)
|
||||
/*
|
||||
* If allocation fails in non-sleepable context, pages are intentionally left
|
||||
* inaccessible (leaked) until the arena is destroyed. Cleanup or retries are not
|
||||
* possible here, so we intentionally omit them for safety.
|
||||
*/
|
||||
return;
|
||||
|
||||
s->page_cnt = page_cnt;
|
||||
s->uaddr = uaddr;
|
||||
llist_add(&s->node, &arena->free_spans);
|
||||
irq_work_queue(&arena->free_irq);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -557,6 +750,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
|
|||
static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
|
||||
{
|
||||
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
|
||||
struct mem_cgroup *new_memcg, *old_memcg;
|
||||
unsigned long flags;
|
||||
long pgoff;
|
||||
int ret;
|
||||
|
||||
|
|
@ -567,15 +762,94 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt
|
|||
if (pgoff + page_cnt > page_cnt_max)
|
||||
return -EINVAL;
|
||||
|
||||
guard(mutex)(&arena->lock);
|
||||
if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
|
||||
return -EBUSY;
|
||||
|
||||
/* Cannot guard already allocated pages. */
|
||||
ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
|
||||
if (ret)
|
||||
return -EBUSY;
|
||||
if (ret) {
|
||||
ret = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* "Allocate" the region to prevent it from being allocated. */
|
||||
return range_tree_clear(&arena->rt, pgoff, page_cnt);
|
||||
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
|
||||
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
|
||||
bpf_map_memcg_exit(old_memcg, new_memcg);
|
||||
out:
|
||||
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void arena_free_worker(struct work_struct *work)
|
||||
{
|
||||
struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work);
|
||||
struct mem_cgroup *new_memcg, *old_memcg;
|
||||
struct llist_node *list, *pos, *t;
|
||||
struct arena_free_span *s;
|
||||
u64 arena_vm_start, user_vm_start;
|
||||
struct llist_head free_pages;
|
||||
struct page *page;
|
||||
unsigned long full_uaddr;
|
||||
long kaddr, page_cnt, pgoff;
|
||||
unsigned long flags;
|
||||
|
||||
if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) {
|
||||
schedule_work(work);
|
||||
return;
|
||||
}
|
||||
|
||||
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
|
||||
|
||||
init_llist_head(&free_pages);
|
||||
arena_vm_start = bpf_arena_get_kern_vm_start(arena);
|
||||
user_vm_start = bpf_arena_get_user_vm_start(arena);
|
||||
|
||||
list = llist_del_all(&arena->free_spans);
|
||||
llist_for_each(pos, list) {
|
||||
s = llist_entry(pos, struct arena_free_span, node);
|
||||
page_cnt = s->page_cnt;
|
||||
kaddr = arena_vm_start + s->uaddr;
|
||||
pgoff = compute_pgoff(arena, s->uaddr);
|
||||
|
||||
/* clear ptes and collect pages in free_pages llist */
|
||||
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
|
||||
apply_range_clear_cb, &free_pages);
|
||||
|
||||
range_tree_set(&arena->rt, pgoff, page_cnt);
|
||||
}
|
||||
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
|
||||
|
||||
/* Iterate the list again without holding spinlock to do the tlb flush and zap_pages */
|
||||
llist_for_each_safe(pos, t, list) {
|
||||
s = llist_entry(pos, struct arena_free_span, node);
|
||||
page_cnt = s->page_cnt;
|
||||
full_uaddr = clear_lo32(user_vm_start) + s->uaddr;
|
||||
kaddr = arena_vm_start + s->uaddr;
|
||||
|
||||
/* ensure no stale TLB entries */
|
||||
flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE));
|
||||
|
||||
/* remove pages from user vmas */
|
||||
zap_pages(arena, full_uaddr, page_cnt);
|
||||
|
||||
kfree_nolock(s);
|
||||
}
|
||||
|
||||
/* free all pages collected by apply_to_existing_page_range() in the first loop */
|
||||
llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
|
||||
page = llist_entry(pos, struct page, pcp_llist);
|
||||
__free_page(page);
|
||||
}
|
||||
|
||||
bpf_map_memcg_exit(old_memcg, new_memcg);
|
||||
}
|
||||
|
||||
static void arena_free_irq(struct irq_work *iw)
|
||||
{
|
||||
struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq);
|
||||
|
||||
schedule_work(&arena->free_work);
|
||||
}
|
||||
|
||||
__bpf_kfunc_start_defs();
|
||||
|
|
@ -589,9 +863,20 @@ __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_
|
|||
if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
|
||||
return NULL;
|
||||
|
||||
return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
|
||||
return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
|
||||
}
|
||||
|
||||
void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
|
||||
int node_id, u64 flags)
|
||||
{
|
||||
struct bpf_map *map = p__map;
|
||||
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
|
||||
|
||||
if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
|
||||
return NULL;
|
||||
|
||||
return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
|
||||
}
|
||||
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
|
||||
{
|
||||
struct bpf_map *map = p__map;
|
||||
|
|
@ -599,7 +884,17 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt
|
|||
|
||||
if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
|
||||
return;
|
||||
arena_free_pages(arena, (long)ptr__ign, page_cnt);
|
||||
arena_free_pages(arena, (long)ptr__ign, page_cnt, true);
|
||||
}
|
||||
|
||||
void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
|
||||
{
|
||||
struct bpf_map *map = p__map;
|
||||
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
|
||||
|
||||
if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
|
||||
return;
|
||||
arena_free_pages(arena, (long)ptr__ign, page_cnt, false);
|
||||
}
|
||||
|
||||
__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
|
||||
|
|
@ -618,9 +913,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c
|
|||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(arena_kfuncs)
|
||||
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
|
||||
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
|
||||
BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
|
||||
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2)
|
||||
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2)
|
||||
BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2)
|
||||
BTF_KFUNCS_END(arena_kfuncs)
|
||||
|
||||
static const struct btf_kfunc_id_set common_kfunc_set = {
|
||||
|
|
|
|||
|
|
@ -307,7 +307,7 @@ static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key,
|
|||
return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu);
|
||||
}
|
||||
|
||||
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
|
||||
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
|
||||
{
|
||||
struct bpf_array *array = container_of(map, struct bpf_array, map);
|
||||
u32 index = *(u32 *)key;
|
||||
|
|
@ -325,11 +325,18 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
|
|||
size = array->elem_size;
|
||||
rcu_read_lock();
|
||||
pptr = array->pptrs[index & array->index_mask];
|
||||
if (map_flags & BPF_F_CPU) {
|
||||
cpu = map_flags >> 32;
|
||||
copy_map_value(map, value, per_cpu_ptr(pptr, cpu));
|
||||
check_and_init_map_value(map, value);
|
||||
goto unlock;
|
||||
}
|
||||
for_each_possible_cpu(cpu) {
|
||||
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
|
||||
check_and_init_map_value(map, value + off);
|
||||
off += size;
|
||||
}
|
||||
unlock:
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -398,10 +405,11 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
|
|||
struct bpf_array *array = container_of(map, struct bpf_array, map);
|
||||
u32 index = *(u32 *)key;
|
||||
void __percpu *pptr;
|
||||
int cpu, off = 0;
|
||||
void *ptr, *val;
|
||||
u32 size;
|
||||
int cpu;
|
||||
|
||||
if (unlikely(map_flags > BPF_EXIST))
|
||||
if (unlikely((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS))
|
||||
/* unknown flags */
|
||||
return -EINVAL;
|
||||
|
||||
|
|
@ -422,11 +430,20 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
|
|||
size = array->elem_size;
|
||||
rcu_read_lock();
|
||||
pptr = array->pptrs[index & array->index_mask];
|
||||
for_each_possible_cpu(cpu) {
|
||||
copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
|
||||
bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
|
||||
off += size;
|
||||
if (map_flags & BPF_F_CPU) {
|
||||
cpu = map_flags >> 32;
|
||||
ptr = per_cpu_ptr(pptr, cpu);
|
||||
copy_map_value(map, ptr, value);
|
||||
bpf_obj_free_fields(array->map.record, ptr);
|
||||
goto unlock;
|
||||
}
|
||||
for_each_possible_cpu(cpu) {
|
||||
ptr = per_cpu_ptr(pptr, cpu);
|
||||
val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
|
||||
copy_map_value(map, ptr, val);
|
||||
bpf_obj_free_fields(array->map.record, ptr);
|
||||
}
|
||||
unlock:
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,29 +11,6 @@
|
|||
|
||||
DEFINE_BPF_STORAGE_CACHE(cgroup_cache);
|
||||
|
||||
static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
|
||||
|
||||
static void bpf_cgrp_storage_lock(void)
|
||||
{
|
||||
cant_migrate();
|
||||
this_cpu_inc(bpf_cgrp_storage_busy);
|
||||
}
|
||||
|
||||
static void bpf_cgrp_storage_unlock(void)
|
||||
{
|
||||
this_cpu_dec(bpf_cgrp_storage_busy);
|
||||
}
|
||||
|
||||
static bool bpf_cgrp_storage_trylock(void)
|
||||
{
|
||||
cant_migrate();
|
||||
if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
|
||||
this_cpu_dec(bpf_cgrp_storage_busy);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
|
||||
{
|
||||
struct cgroup *cg = owner;
|
||||
|
|
@ -45,16 +22,14 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
|
|||
{
|
||||
struct bpf_local_storage *local_storage;
|
||||
|
||||
rcu_read_lock_dont_migrate();
|
||||
rcu_read_lock();
|
||||
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
|
||||
if (!local_storage)
|
||||
goto out;
|
||||
|
||||
bpf_cgrp_storage_lock();
|
||||
bpf_local_storage_destroy(local_storage);
|
||||
bpf_cgrp_storage_unlock();
|
||||
out:
|
||||
rcu_read_unlock_migrate();
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static struct bpf_local_storage_data *
|
||||
|
|
@ -83,9 +58,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
|
|||
if (IS_ERR(cgroup))
|
||||
return ERR_CAST(cgroup);
|
||||
|
||||
bpf_cgrp_storage_lock();
|
||||
sdata = cgroup_storage_lookup(cgroup, map, true);
|
||||
bpf_cgrp_storage_unlock();
|
||||
cgroup_put(cgroup);
|
||||
return sdata ? sdata->data : NULL;
|
||||
}
|
||||
|
|
@ -102,10 +75,8 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
|
|||
if (IS_ERR(cgroup))
|
||||
return PTR_ERR(cgroup);
|
||||
|
||||
bpf_cgrp_storage_lock();
|
||||
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
|
||||
value, map_flags, false, GFP_ATOMIC);
|
||||
bpf_cgrp_storage_unlock();
|
||||
cgroup_put(cgroup);
|
||||
return PTR_ERR_OR_ZERO(sdata);
|
||||
}
|
||||
|
|
@ -118,8 +89,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
|
|||
if (!sdata)
|
||||
return -ENOENT;
|
||||
|
||||
bpf_selem_unlink(SELEM(sdata), false);
|
||||
return 0;
|
||||
return bpf_selem_unlink(SELEM(sdata));
|
||||
}
|
||||
|
||||
static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
|
||||
|
|
@ -132,9 +102,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
|
|||
if (IS_ERR(cgroup))
|
||||
return PTR_ERR(cgroup);
|
||||
|
||||
bpf_cgrp_storage_lock();
|
||||
err = cgroup_storage_delete(cgroup, map);
|
||||
bpf_cgrp_storage_unlock();
|
||||
cgroup_put(cgroup);
|
||||
return err;
|
||||
}
|
||||
|
|
@ -151,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
|
|||
|
||||
static void cgroup_storage_map_free(struct bpf_map *map)
|
||||
{
|
||||
bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy);
|
||||
bpf_local_storage_map_free(map, &cgroup_cache);
|
||||
}
|
||||
|
||||
/* *gfp_flags* is a hidden argument provided by the verifier */
|
||||
|
|
@ -159,7 +127,6 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
|
|||
void *, value, u64, flags, gfp_t, gfp_flags)
|
||||
{
|
||||
struct bpf_local_storage_data *sdata;
|
||||
bool nobusy;
|
||||
|
||||
WARN_ON_ONCE(!bpf_rcu_lock_held());
|
||||
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
|
||||
|
|
@ -168,38 +135,27 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
|
|||
if (!cgroup)
|
||||
return (unsigned long)NULL;
|
||||
|
||||
nobusy = bpf_cgrp_storage_trylock();
|
||||
|
||||
sdata = cgroup_storage_lookup(cgroup, map, nobusy);
|
||||
sdata = cgroup_storage_lookup(cgroup, map, true);
|
||||
if (sdata)
|
||||
goto unlock;
|
||||
goto out;
|
||||
|
||||
/* only allocate new storage, when the cgroup is refcounted */
|
||||
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
|
||||
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy)
|
||||
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
|
||||
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
|
||||
value, BPF_NOEXIST, false, gfp_flags);
|
||||
|
||||
unlock:
|
||||
if (nobusy)
|
||||
bpf_cgrp_storage_unlock();
|
||||
out:
|
||||
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
|
||||
}
|
||||
|
||||
BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup)
|
||||
{
|
||||
int ret;
|
||||
|
||||
WARN_ON_ONCE(!bpf_rcu_lock_held());
|
||||
if (!cgroup)
|
||||
return -EINVAL;
|
||||
|
||||
if (!bpf_cgrp_storage_trylock())
|
||||
return -EBUSY;
|
||||
|
||||
ret = cgroup_storage_delete(cgroup, map);
|
||||
bpf_cgrp_storage_unlock();
|
||||
return ret;
|
||||
return cgroup_storage_delete(cgroup, map);
|
||||
}
|
||||
|
||||
const struct bpf_map_ops cgrp_storage_map_ops = {
|
||||
|
|
|
|||
|
|
@ -110,9 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
|
|||
if (!sdata)
|
||||
return -ENOENT;
|
||||
|
||||
bpf_selem_unlink(SELEM(sdata), false);
|
||||
|
||||
return 0;
|
||||
return bpf_selem_unlink(SELEM(sdata));
|
||||
}
|
||||
|
||||
static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
|
||||
|
|
@ -186,7 +184,7 @@ static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
|
|||
|
||||
static void inode_storage_map_free(struct bpf_map *map)
|
||||
{
|
||||
bpf_local_storage_map_free(map, &inode_cache, NULL);
|
||||
bpf_local_storage_map_free(map, &inode_cache);
|
||||
}
|
||||
|
||||
const struct bpf_map_ops inode_storage_map_ops = {
|
||||
|
|
|
|||
|
|
@ -123,10 +123,10 @@ static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
|
|||
|
||||
if ((off % sizeof(long)) != 0 ||
|
||||
(off / sizeof(long)) >= map->max_entries)
|
||||
return -EINVAL;
|
||||
return -EACCES;
|
||||
|
||||
/* from BPF's point of view, this map is a jump table */
|
||||
*imm = (unsigned long)insn_array->ips + off;
|
||||
*imm = (unsigned long)insn_array->ips;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -86,7 +86,7 @@ static bool bpf_iter_support_resched(struct seq_file *seq)
|
|||
|
||||
/* bpf_seq_read, a customized and simpler version for bpf iterator.
|
||||
* The following are differences from seq_read():
|
||||
* . fixed buffer size (PAGE_SIZE)
|
||||
* . fixed buffer size (PAGE_SIZE << 3)
|
||||
* . assuming NULL ->llseek()
|
||||
* . stop() may call bpf program, handling potential overflow there
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -19,9 +19,9 @@
|
|||
|
||||
static struct bpf_local_storage_map_bucket *
|
||||
select_bucket(struct bpf_local_storage_map *smap,
|
||||
struct bpf_local_storage_elem *selem)
|
||||
struct bpf_local_storage *local_storage)
|
||||
{
|
||||
return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
|
||||
return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)];
|
||||
}
|
||||
|
||||
static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
|
||||
|
|
@ -61,11 +61,6 @@ static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
|
|||
return !hlist_unhashed(&selem->snode);
|
||||
}
|
||||
|
||||
static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
return !hlist_unhashed_lockless(&selem->map_node);
|
||||
}
|
||||
|
||||
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
return !hlist_unhashed(&selem->map_node);
|
||||
|
|
@ -90,6 +85,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
|
|||
|
||||
if (selem) {
|
||||
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
|
||||
atomic_set(&selem->state, 0);
|
||||
selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
|
||||
|
||||
if (value) {
|
||||
/* No need to call check_and_init_map_value as memory is zero init */
|
||||
|
|
@ -198,9 +195,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
|
|||
/* The bpf_local_storage_map_free will wait for rcu_barrier */
|
||||
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
|
||||
|
||||
migrate_disable();
|
||||
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
|
||||
migrate_enable();
|
||||
if (smap) {
|
||||
migrate_disable();
|
||||
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
|
||||
migrate_enable();
|
||||
}
|
||||
kfree_nolock(selem);
|
||||
}
|
||||
|
||||
|
|
@ -219,13 +218,14 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
|
|||
|
||||
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
|
||||
|
||||
if (!smap->use_kmalloc_nolock) {
|
||||
if (!selem->use_kmalloc_nolock) {
|
||||
/*
|
||||
* No uptr will be unpin even when reuse_now == false since uptr
|
||||
* is only supported in task local storage, where
|
||||
* smap->use_kmalloc_nolock == true.
|
||||
*/
|
||||
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
|
||||
if (smap)
|
||||
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
|
||||
__bpf_selem_free(selem, reuse_now);
|
||||
return;
|
||||
}
|
||||
|
|
@ -256,6 +256,36 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
|
|||
bpf_selem_free(selem, reuse_now);
|
||||
}
|
||||
|
||||
static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
|
||||
struct bpf_local_storage_map *smap,
|
||||
struct bpf_local_storage *local_storage,
|
||||
bool free_local_storage, bool pin_owner)
|
||||
{
|
||||
void *owner = local_storage->owner;
|
||||
u32 uncharge = smap->elem_size;
|
||||
|
||||
if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
|
||||
SDATA(selem))
|
||||
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
|
||||
|
||||
if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt))
|
||||
return;
|
||||
|
||||
uncharge += free_local_storage ? sizeof(*local_storage) : 0;
|
||||
mem_uncharge(smap, local_storage->owner, uncharge);
|
||||
local_storage->mem_charge -= uncharge;
|
||||
|
||||
if (free_local_storage) {
|
||||
local_storage->owner = NULL;
|
||||
|
||||
/* After this RCU_INIT, owner may be freed and cannot be used */
|
||||
RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
|
||||
}
|
||||
|
||||
if (pin_owner)
|
||||
refcount_dec(&local_storage->owner_refcnt);
|
||||
}
|
||||
|
||||
/* local_storage->lock must be held and selem->local_storage == local_storage.
|
||||
* The caller must ensure selem->smap is still valid to be
|
||||
* dereferenced for its smap->elem_size and smap->cache_idx.
|
||||
|
|
@ -266,124 +296,219 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
|
|||
{
|
||||
struct bpf_local_storage_map *smap;
|
||||
bool free_local_storage;
|
||||
void *owner;
|
||||
|
||||
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
|
||||
owner = local_storage->owner;
|
||||
|
||||
/* All uncharging on the owner must be done first.
|
||||
* The owner may be freed once the last selem is unlinked
|
||||
* from local_storage.
|
||||
*/
|
||||
mem_uncharge(smap, owner, smap->elem_size);
|
||||
|
||||
free_local_storage = hlist_is_singular_node(&selem->snode,
|
||||
&local_storage->list);
|
||||
if (free_local_storage) {
|
||||
mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
|
||||
local_storage->owner = NULL;
|
||||
|
||||
/* After this RCU_INIT, owner may be freed and cannot be used */
|
||||
RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
|
||||
bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage,
|
||||
free_local_storage, false);
|
||||
|
||||
/* local_storage is not freed now. local_storage->lock is
|
||||
* still held and raw_spin_unlock_bh(&local_storage->lock)
|
||||
* will be done by the caller.
|
||||
*
|
||||
* Although the unlock will be done under
|
||||
* rcu_read_lock(), it is more intuitive to
|
||||
* read if the freeing of the storage is done
|
||||
* after the raw_spin_unlock_bh(&local_storage->lock).
|
||||
*
|
||||
* Hence, a "bool free_local_storage" is returned
|
||||
* to the caller which then calls then frees the storage after
|
||||
* all the RCU grace periods have expired.
|
||||
*/
|
||||
}
|
||||
hlist_del_init_rcu(&selem->snode);
|
||||
if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
|
||||
SDATA(selem))
|
||||
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
|
||||
|
||||
hlist_add_head(&selem->free_node, free_selem_list);
|
||||
|
||||
if (rcu_access_pointer(local_storage->smap) == smap)
|
||||
RCU_INIT_POINTER(local_storage->smap, NULL);
|
||||
|
||||
return free_local_storage;
|
||||
}
|
||||
|
||||
static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
|
||||
bool reuse_now)
|
||||
{
|
||||
struct bpf_local_storage *local_storage;
|
||||
bool free_local_storage = false;
|
||||
HLIST_HEAD(selem_free_list);
|
||||
unsigned long flags;
|
||||
|
||||
if (unlikely(!selem_linked_to_storage_lockless(selem)))
|
||||
/* selem has already been unlinked from sk */
|
||||
return;
|
||||
|
||||
local_storage = rcu_dereference_check(selem->local_storage,
|
||||
bpf_rcu_lock_held());
|
||||
|
||||
raw_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
if (likely(selem_linked_to_storage(selem)))
|
||||
free_local_storage = bpf_selem_unlink_storage_nolock(
|
||||
local_storage, selem, &selem_free_list);
|
||||
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
|
||||
bpf_selem_free_list(&selem_free_list, reuse_now);
|
||||
|
||||
if (free_local_storage)
|
||||
bpf_local_storage_free(local_storage, reuse_now);
|
||||
}
|
||||
|
||||
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
|
||||
struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
struct bpf_local_storage_map *smap;
|
||||
|
||||
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
|
||||
local_storage->mem_charge += smap->elem_size;
|
||||
|
||||
RCU_INIT_POINTER(selem->local_storage, local_storage);
|
||||
hlist_add_head_rcu(&selem->snode, &local_storage->list);
|
||||
}
|
||||
|
||||
static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
|
||||
static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
struct bpf_local_storage *local_storage;
|
||||
struct bpf_local_storage_map *smap;
|
||||
struct bpf_local_storage_map_bucket *b;
|
||||
unsigned long flags;
|
||||
int err;
|
||||
|
||||
if (unlikely(!selem_linked_to_map_lockless(selem)))
|
||||
/* selem has already been unlinked from smap */
|
||||
return;
|
||||
|
||||
local_storage = rcu_dereference_check(selem->local_storage,
|
||||
bpf_rcu_lock_held());
|
||||
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
|
||||
b = select_bucket(smap, selem);
|
||||
raw_spin_lock_irqsave(&b->lock, flags);
|
||||
if (likely(selem_linked_to_map(selem)))
|
||||
hlist_del_init_rcu(&selem->map_node);
|
||||
raw_spin_unlock_irqrestore(&b->lock, flags);
|
||||
b = select_bucket(smap, local_storage);
|
||||
err = raw_res_spin_lock_irqsave(&b->lock, flags);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
hlist_del_init_rcu(&selem->map_node);
|
||||
raw_res_spin_unlock_irqrestore(&b->lock, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bpf_selem_link_map(struct bpf_local_storage_map *smap,
|
||||
struct bpf_local_storage_elem *selem)
|
||||
static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
|
||||
hlist_del_init_rcu(&selem->map_node);
|
||||
}
|
||||
|
||||
int bpf_selem_link_map(struct bpf_local_storage_map *smap,
|
||||
struct bpf_local_storage *local_storage,
|
||||
struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
struct bpf_local_storage_map_bucket *b;
|
||||
unsigned long flags;
|
||||
int err;
|
||||
|
||||
b = select_bucket(smap, local_storage);
|
||||
|
||||
err = raw_res_spin_lock_irqsave(&b->lock, flags);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
raw_spin_lock_irqsave(&b->lock, flags);
|
||||
hlist_add_head_rcu(&selem->map_node, &b->list);
|
||||
raw_spin_unlock_irqrestore(&b->lock, flags);
|
||||
raw_res_spin_unlock_irqrestore(&b->lock, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
|
||||
static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
|
||||
struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
/* Always unlink from map before unlinking from local_storage
|
||||
* because selem will be freed after successfully unlinked from
|
||||
* the local_storage.
|
||||
hlist_add_head_rcu(&selem->map_node, &b->list);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlink an selem from map and local storage with lock held.
|
||||
* This is the common path used by local storages to delete an selem.
|
||||
*/
|
||||
int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
struct bpf_local_storage *local_storage;
|
||||
bool free_local_storage = false;
|
||||
HLIST_HEAD(selem_free_list);
|
||||
unsigned long flags;
|
||||
int err;
|
||||
|
||||
if (unlikely(!selem_linked_to_storage_lockless(selem)))
|
||||
/* selem has already been unlinked from sk */
|
||||
return 0;
|
||||
|
||||
local_storage = rcu_dereference_check(selem->local_storage,
|
||||
bpf_rcu_lock_held());
|
||||
|
||||
err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (likely(selem_linked_to_storage(selem))) {
|
||||
/* Always unlink from map before unlinking from local_storage
|
||||
* because selem will be freed after successfully unlinked from
|
||||
* the local_storage.
|
||||
*/
|
||||
err = bpf_selem_unlink_map(selem);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
free_local_storage = bpf_selem_unlink_storage_nolock(
|
||||
local_storage, selem, &selem_free_list);
|
||||
}
|
||||
out:
|
||||
raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
|
||||
bpf_selem_free_list(&selem_free_list, false);
|
||||
|
||||
if (free_local_storage)
|
||||
bpf_local_storage_free(local_storage, false);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlink an selem from map and local storage with lockless fallback if callers
|
||||
* are racing or rqspinlock returns error. It should only be called by
|
||||
* bpf_local_storage_destroy() or bpf_local_storage_map_free().
|
||||
*/
|
||||
static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
|
||||
struct bpf_local_storage_map_bucket *b)
|
||||
{
|
||||
bool in_map_free = !!b, free_storage = false;
|
||||
struct bpf_local_storage *local_storage;
|
||||
struct bpf_local_storage_map *smap;
|
||||
unsigned long flags;
|
||||
int err, unlink = 0;
|
||||
|
||||
local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
|
||||
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
|
||||
|
||||
if (smap) {
|
||||
b = b ? : select_bucket(smap, local_storage);
|
||||
err = raw_res_spin_lock_irqsave(&b->lock, flags);
|
||||
if (!err) {
|
||||
/*
|
||||
* Call bpf_obj_free_fields() under b->lock to make sure it is done
|
||||
* exactly once for an selem. Safe to free special fields immediately
|
||||
* as no BPF program should be referencing the selem.
|
||||
*/
|
||||
if (likely(selem_linked_to_map(selem))) {
|
||||
hlist_del_init_rcu(&selem->map_node);
|
||||
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
|
||||
unlink++;
|
||||
}
|
||||
raw_res_spin_unlock_irqrestore(&b->lock, flags);
|
||||
}
|
||||
/*
|
||||
* Highly unlikely scenario: resource leak
|
||||
*
|
||||
* When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
|
||||
* and both selem belong to the same bucket, if destroy(selem2) acquired
|
||||
* b->lock and blocked for too long, neither map_free(selem1) nor
|
||||
* destroy(selem1) will be able to free the special field associated
|
||||
* with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
|
||||
*/
|
||||
WARN_ON_ONCE(err && in_map_free);
|
||||
if (!err || in_map_free)
|
||||
RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
|
||||
}
|
||||
|
||||
if (local_storage) {
|
||||
err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
if (!err) {
|
||||
if (likely(selem_linked_to_storage(selem))) {
|
||||
free_storage = hlist_is_singular_node(&selem->snode,
|
||||
&local_storage->list);
|
||||
/*
|
||||
* Okay to skip clearing owner_storage and storage->owner in
|
||||
* destroy() since the owner is going away. No user or bpf
|
||||
* programs should be able to reference it.
|
||||
*/
|
||||
if (smap && in_map_free)
|
||||
bpf_selem_unlink_storage_nolock_misc(
|
||||
selem, smap, local_storage,
|
||||
free_storage, true);
|
||||
hlist_del_init_rcu(&selem->snode);
|
||||
unlink++;
|
||||
}
|
||||
raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
}
|
||||
if (!err || !in_map_free)
|
||||
RCU_INIT_POINTER(selem->local_storage, NULL);
|
||||
}
|
||||
|
||||
if (unlink != 2)
|
||||
atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state);
|
||||
|
||||
/*
|
||||
* Normally, an selem can be unlinked under local_storage->lock and b->lock, and
|
||||
* then freed after an RCU grace period. However, if destroy() and map_free() are
|
||||
* racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
|
||||
* the selem only after both map_free() and destroy() see the selem.
|
||||
*/
|
||||
bpf_selem_unlink_map(selem);
|
||||
bpf_selem_unlink_storage(selem, reuse_now);
|
||||
if (unlink == 2 ||
|
||||
atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED)
|
||||
bpf_selem_free(selem, true);
|
||||
|
||||
if (free_storage)
|
||||
bpf_local_storage_free(local_storage, true);
|
||||
}
|
||||
|
||||
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
|
||||
|
|
@ -391,16 +516,20 @@ void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
|
|||
struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
unsigned long flags;
|
||||
int err;
|
||||
|
||||
/* spinlock is needed to avoid racing with the
|
||||
* parallel delete. Otherwise, publishing an already
|
||||
* deleted sdata to the cache will become a use-after-free
|
||||
* problem in the next bpf_local_storage_lookup().
|
||||
*/
|
||||
raw_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
if (err)
|
||||
return;
|
||||
|
||||
if (selem_linked_to_storage(selem))
|
||||
rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
|
||||
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
}
|
||||
|
||||
static int check_flags(const struct bpf_local_storage_data *old_sdata,
|
||||
|
|
@ -424,6 +553,8 @@ int bpf_local_storage_alloc(void *owner,
|
|||
{
|
||||
struct bpf_local_storage *prev_storage, *storage;
|
||||
struct bpf_local_storage **owner_storage_ptr;
|
||||
struct bpf_local_storage_map_bucket *b;
|
||||
unsigned long flags;
|
||||
int err;
|
||||
|
||||
err = mem_charge(smap, owner, sizeof(*storage));
|
||||
|
|
@ -441,14 +572,21 @@ int bpf_local_storage_alloc(void *owner,
|
|||
goto uncharge;
|
||||
}
|
||||
|
||||
RCU_INIT_POINTER(storage->smap, smap);
|
||||
INIT_HLIST_HEAD(&storage->list);
|
||||
raw_spin_lock_init(&storage->lock);
|
||||
raw_res_spin_lock_init(&storage->lock);
|
||||
storage->owner = owner;
|
||||
storage->mem_charge = sizeof(*storage);
|
||||
storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
|
||||
refcount_set(&storage->owner_refcnt, 1);
|
||||
|
||||
bpf_selem_link_storage_nolock(storage, first_selem);
|
||||
bpf_selem_link_map(smap, first_selem);
|
||||
|
||||
b = select_bucket(smap, storage);
|
||||
err = raw_res_spin_lock_irqsave(&b->lock, flags);
|
||||
if (err)
|
||||
goto uncharge;
|
||||
|
||||
bpf_selem_link_map_nolock(b, first_selem);
|
||||
|
||||
owner_storage_ptr =
|
||||
(struct bpf_local_storage **)owner_storage(smap, owner);
|
||||
|
|
@ -464,10 +602,12 @@ int bpf_local_storage_alloc(void *owner,
|
|||
*/
|
||||
prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
|
||||
if (unlikely(prev_storage)) {
|
||||
bpf_selem_unlink_map(first_selem);
|
||||
bpf_selem_unlink_map_nolock(first_selem);
|
||||
raw_res_spin_unlock_irqrestore(&b->lock, flags);
|
||||
err = -EAGAIN;
|
||||
goto uncharge;
|
||||
}
|
||||
raw_res_spin_unlock_irqrestore(&b->lock, flags);
|
||||
|
||||
return 0;
|
||||
|
||||
|
|
@ -489,8 +629,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
|
|||
struct bpf_local_storage_data *old_sdata = NULL;
|
||||
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
|
||||
struct bpf_local_storage *local_storage;
|
||||
struct bpf_local_storage_map_bucket *b;
|
||||
HLIST_HEAD(old_selem_free_list);
|
||||
unsigned long flags;
|
||||
unsigned long flags, b_flags;
|
||||
int err;
|
||||
|
||||
/* BPF_EXIST and BPF_NOEXIST cannot be both set */
|
||||
|
|
@ -549,7 +690,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
|
|||
if (!alloc_selem)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
raw_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
if (err)
|
||||
goto free_selem;
|
||||
|
||||
/* Recheck local_storage->list under local_storage->lock */
|
||||
if (unlikely(hlist_empty(&local_storage->list))) {
|
||||
|
|
@ -574,22 +717,30 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
|
|||
goto unlock;
|
||||
}
|
||||
|
||||
b = select_bucket(smap, local_storage);
|
||||
|
||||
err = raw_res_spin_lock_irqsave(&b->lock, b_flags);
|
||||
if (err)
|
||||
goto unlock;
|
||||
|
||||
alloc_selem = NULL;
|
||||
/* First, link the new selem to the map */
|
||||
bpf_selem_link_map(smap, selem);
|
||||
bpf_selem_link_map_nolock(b, selem);
|
||||
|
||||
/* Second, link (and publish) the new selem to local_storage */
|
||||
bpf_selem_link_storage_nolock(local_storage, selem);
|
||||
|
||||
/* Third, remove old selem, SELEM(old_sdata) */
|
||||
if (old_sdata) {
|
||||
bpf_selem_unlink_map(SELEM(old_sdata));
|
||||
bpf_selem_unlink_map_nolock(SELEM(old_sdata));
|
||||
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
|
||||
&old_selem_free_list);
|
||||
}
|
||||
|
||||
raw_res_spin_unlock_irqrestore(&b->lock, b_flags);
|
||||
unlock:
|
||||
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
free_selem:
|
||||
bpf_selem_free_list(&old_selem_free_list, false);
|
||||
if (alloc_selem) {
|
||||
mem_uncharge(smap, owner, smap->elem_size);
|
||||
|
|
@ -657,13 +808,13 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
|
|||
return 0;
|
||||
}
|
||||
|
||||
void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
|
||||
/*
|
||||
* Destroy local storage when the owner is going away. Caller must uncharge memory
|
||||
* if memory charging is used.
|
||||
*/
|
||||
u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
|
||||
{
|
||||
struct bpf_local_storage_elem *selem;
|
||||
bool free_storage = false;
|
||||
HLIST_HEAD(free_selem_list);
|
||||
struct hlist_node *n;
|
||||
unsigned long flags;
|
||||
|
||||
/* Neither the bpf_prog nor the bpf_map's syscall
|
||||
* could be modifying the local_storage->list now.
|
||||
|
|
@ -674,27 +825,20 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
|
|||
* when unlinking elem from the local_storage->list and
|
||||
* the map's bucket->list.
|
||||
*/
|
||||
raw_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
|
||||
/* Always unlink from map before unlinking from
|
||||
* local_storage.
|
||||
hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
|
||||
bpf_selem_unlink_nofail(selem, NULL);
|
||||
|
||||
if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
|
||||
while (refcount_read(&local_storage->owner_refcnt))
|
||||
cpu_relax();
|
||||
/*
|
||||
* Paired with refcount_dec() in bpf_selem_unlink_nofail()
|
||||
* to make sure destroy() sees the correct local_storage->mem_charge.
|
||||
*/
|
||||
bpf_selem_unlink_map(selem);
|
||||
/* If local_storage list has only one element, the
|
||||
* bpf_selem_unlink_storage_nolock() will return true.
|
||||
* Otherwise, it will return false. The current loop iteration
|
||||
* intends to remove all local storage. So the last iteration
|
||||
* of the loop will set free_storage to true.
|
||||
*/
|
||||
free_storage = bpf_selem_unlink_storage_nolock(
|
||||
local_storage, selem, &free_selem_list);
|
||||
smp_mb();
|
||||
}
|
||||
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
|
||||
bpf_selem_free_list(&free_selem_list, true);
|
||||
|
||||
if (free_storage)
|
||||
bpf_local_storage_free(local_storage, true);
|
||||
return local_storage->mem_charge;
|
||||
}
|
||||
|
||||
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
|
||||
|
|
@ -736,7 +880,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
|
|||
|
||||
for (i = 0; i < nbuckets; i++) {
|
||||
INIT_HLIST_HEAD(&smap->buckets[i].list);
|
||||
raw_spin_lock_init(&smap->buckets[i].lock);
|
||||
raw_res_spin_lock_init(&smap->buckets[i].lock);
|
||||
}
|
||||
|
||||
smap->elem_size = offsetof(struct bpf_local_storage_elem,
|
||||
|
|
@ -758,8 +902,7 @@ free_smap:
|
|||
}
|
||||
|
||||
void bpf_local_storage_map_free(struct bpf_map *map,
|
||||
struct bpf_local_storage_cache *cache,
|
||||
int __percpu *busy_counter)
|
||||
struct bpf_local_storage_cache *cache)
|
||||
{
|
||||
struct bpf_local_storage_map_bucket *b;
|
||||
struct bpf_local_storage_elem *selem;
|
||||
|
|
@ -789,15 +932,14 @@ void bpf_local_storage_map_free(struct bpf_map *map,
|
|||
|
||||
rcu_read_lock();
|
||||
/* No one is adding to b->list now */
|
||||
while ((selem = hlist_entry_safe(
|
||||
rcu_dereference_raw(hlist_first_rcu(&b->list)),
|
||||
struct bpf_local_storage_elem, map_node))) {
|
||||
if (busy_counter)
|
||||
this_cpu_inc(*busy_counter);
|
||||
bpf_selem_unlink(selem, true);
|
||||
if (busy_counter)
|
||||
this_cpu_dec(*busy_counter);
|
||||
cond_resched_rcu();
|
||||
restart:
|
||||
hlist_for_each_entry_rcu(selem, &b->list, map_node) {
|
||||
bpf_selem_unlink_nofail(selem, b);
|
||||
|
||||
if (need_resched()) {
|
||||
cond_resched_rcu();
|
||||
goto restart;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,10 +18,11 @@
|
|||
#include <linux/bpf-cgroup.h>
|
||||
|
||||
/* For every LSM hook that allows attachment of BPF programs, declare a nop
|
||||
* function where a BPF program can be attached.
|
||||
* function where a BPF program can be attached. Notably, we qualify each with
|
||||
* weak linkage such that strong overrides can be implemented if need be.
|
||||
*/
|
||||
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
|
||||
noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
|
||||
__weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
|
||||
{ \
|
||||
return DEFAULT; \
|
||||
}
|
||||
|
|
|
|||
19
kernel/bpf/bpf_lsm_proto.c
Normal file
19
kernel/bpf/bpf_lsm_proto.c
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright 2025 Google LLC.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/bpf_lsm.h>
|
||||
|
||||
/*
|
||||
* Strong definition of the mmap_file() BPF LSM hook. The __nullable suffix on
|
||||
* the struct file pointer parameter name marks it as PTR_MAYBE_NULL. This
|
||||
* explicitly enforces that BPF LSM programs check for NULL before attempting to
|
||||
* dereference it.
|
||||
*/
|
||||
int bpf_lsm_mmap_file(struct file *file__nullable, unsigned long reqprot,
|
||||
unsigned long prot, unsigned long flags)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -533,6 +533,17 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
|
|||
}
|
||||
}
|
||||
|
||||
static void bpf_struct_ops_map_dissoc_progs(struct bpf_struct_ops_map *st_map)
|
||||
{
|
||||
u32 i;
|
||||
|
||||
for (i = 0; i < st_map->funcs_cnt; i++) {
|
||||
if (!st_map->links[i])
|
||||
break;
|
||||
bpf_prog_disassoc_struct_ops(st_map->links[i]->prog);
|
||||
}
|
||||
}
|
||||
|
||||
static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
|
||||
{
|
||||
int i;
|
||||
|
|
@ -801,6 +812,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
|
|||
goto reset_unlock;
|
||||
}
|
||||
|
||||
/* Poison pointer on error instead of return for backward compatibility */
|
||||
bpf_prog_assoc_struct_ops(prog, &st_map->map);
|
||||
|
||||
link = kzalloc(sizeof(*link), GFP_USER);
|
||||
if (!link) {
|
||||
bpf_prog_put(prog);
|
||||
|
|
@ -980,6 +994,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
|
|||
if (btf_is_module(st_map->btf))
|
||||
module_put(st_map->st_ops_desc->st_ops->owner);
|
||||
|
||||
bpf_struct_ops_map_dissoc_progs(st_map);
|
||||
|
||||
bpf_struct_ops_map_del_ksyms(st_map);
|
||||
|
||||
/* The struct_ops's function may switch to another struct_ops.
|
||||
|
|
@ -1396,6 +1412,78 @@ err_out:
|
|||
return err;
|
||||
}
|
||||
|
||||
int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map)
|
||||
{
|
||||
struct bpf_map *st_ops_assoc;
|
||||
|
||||
guard(mutex)(&prog->aux->st_ops_assoc_mutex);
|
||||
|
||||
st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc,
|
||||
lockdep_is_held(&prog->aux->st_ops_assoc_mutex));
|
||||
if (st_ops_assoc && st_ops_assoc == map)
|
||||
return 0;
|
||||
|
||||
if (st_ops_assoc) {
|
||||
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
|
||||
return -EBUSY;
|
||||
|
||||
rcu_assign_pointer(prog->aux->st_ops_assoc, BPF_PTR_POISON);
|
||||
} else {
|
||||
/*
|
||||
* struct_ops map does not track associated non-struct_ops programs.
|
||||
* Bump the refcount to make sure st_ops_assoc is always valid.
|
||||
*/
|
||||
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
|
||||
bpf_map_inc(map);
|
||||
|
||||
rcu_assign_pointer(prog->aux->st_ops_assoc, map);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog)
|
||||
{
|
||||
struct bpf_map *st_ops_assoc;
|
||||
|
||||
guard(mutex)(&prog->aux->st_ops_assoc_mutex);
|
||||
|
||||
st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc,
|
||||
lockdep_is_held(&prog->aux->st_ops_assoc_mutex));
|
||||
if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON)
|
||||
return;
|
||||
|
||||
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
|
||||
bpf_map_put(st_ops_assoc);
|
||||
|
||||
RCU_INIT_POINTER(prog->aux->st_ops_assoc, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get a reference to the struct_ops struct (i.e., kdata) associated with a
|
||||
* program. Should only be called in BPF program context (e.g., in a kfunc).
|
||||
*
|
||||
* If the returned pointer is not NULL, it must point to a valid struct_ops.
|
||||
* The struct_ops map is not guaranteed to be initialized nor attached.
|
||||
* Kernel struct_ops implementers are responsible for tracking and checking
|
||||
* the state of the struct_ops if the use case requires an initialized or
|
||||
* attached struct_ops.
|
||||
*/
|
||||
void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux)
|
||||
{
|
||||
struct bpf_struct_ops_map *st_map;
|
||||
struct bpf_map *st_ops_assoc;
|
||||
|
||||
st_ops_assoc = rcu_dereference_check(aux->st_ops_assoc, bpf_rcu_lock_held());
|
||||
if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON)
|
||||
return NULL;
|
||||
|
||||
st_map = (struct bpf_struct_ops_map *)st_ops_assoc;
|
||||
|
||||
return &st_map->kvalue.data;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bpf_prog_get_assoc_struct_ops);
|
||||
|
||||
void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
|
||||
{
|
||||
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
|
||||
|
|
|
|||
|
|
@ -20,29 +20,6 @@
|
|||
|
||||
DEFINE_BPF_STORAGE_CACHE(task_cache);
|
||||
|
||||
static DEFINE_PER_CPU(int, bpf_task_storage_busy);
|
||||
|
||||
static void bpf_task_storage_lock(void)
|
||||
{
|
||||
cant_migrate();
|
||||
this_cpu_inc(bpf_task_storage_busy);
|
||||
}
|
||||
|
||||
static void bpf_task_storage_unlock(void)
|
||||
{
|
||||
this_cpu_dec(bpf_task_storage_busy);
|
||||
}
|
||||
|
||||
static bool bpf_task_storage_trylock(void)
|
||||
{
|
||||
cant_migrate();
|
||||
if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
|
||||
this_cpu_dec(bpf_task_storage_busy);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
|
||||
{
|
||||
struct task_struct *task = owner;
|
||||
|
|
@ -70,17 +47,15 @@ void bpf_task_storage_free(struct task_struct *task)
|
|||
{
|
||||
struct bpf_local_storage *local_storage;
|
||||
|
||||
rcu_read_lock_dont_migrate();
|
||||
rcu_read_lock();
|
||||
|
||||
local_storage = rcu_dereference(task->bpf_storage);
|
||||
if (!local_storage)
|
||||
goto out;
|
||||
|
||||
bpf_task_storage_lock();
|
||||
bpf_local_storage_destroy(local_storage);
|
||||
bpf_task_storage_unlock();
|
||||
out:
|
||||
rcu_read_unlock_migrate();
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
|
||||
|
|
@ -106,9 +81,7 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
|
|||
goto out;
|
||||
}
|
||||
|
||||
bpf_task_storage_lock();
|
||||
sdata = task_storage_lookup(task, map, true);
|
||||
bpf_task_storage_unlock();
|
||||
put_pid(pid);
|
||||
return sdata ? sdata->data : NULL;
|
||||
out:
|
||||
|
|
@ -143,11 +116,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
|
|||
goto out;
|
||||
}
|
||||
|
||||
bpf_task_storage_lock();
|
||||
sdata = bpf_local_storage_update(
|
||||
task, (struct bpf_local_storage_map *)map, value, map_flags,
|
||||
true, GFP_ATOMIC);
|
||||
bpf_task_storage_unlock();
|
||||
|
||||
err = PTR_ERR_OR_ZERO(sdata);
|
||||
out:
|
||||
|
|
@ -155,8 +126,7 @@ out:
|
|||
return err;
|
||||
}
|
||||
|
||||
static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
|
||||
bool nobusy)
|
||||
static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
|
||||
{
|
||||
struct bpf_local_storage_data *sdata;
|
||||
|
||||
|
|
@ -164,12 +134,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
|
|||
if (!sdata)
|
||||
return -ENOENT;
|
||||
|
||||
if (!nobusy)
|
||||
return -EBUSY;
|
||||
|
||||
bpf_selem_unlink(SELEM(sdata), false);
|
||||
|
||||
return 0;
|
||||
return bpf_selem_unlink(SELEM(sdata));
|
||||
}
|
||||
|
||||
static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
|
||||
|
|
@ -194,111 +159,50 @@ static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
|
|||
goto out;
|
||||
}
|
||||
|
||||
bpf_task_storage_lock();
|
||||
err = task_storage_delete(task, map, true);
|
||||
bpf_task_storage_unlock();
|
||||
err = task_storage_delete(task, map);
|
||||
out:
|
||||
put_pid(pid);
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Called by bpf_task_storage_get*() helpers */
|
||||
static void *__bpf_task_storage_get(struct bpf_map *map,
|
||||
struct task_struct *task, void *value,
|
||||
u64 flags, gfp_t gfp_flags, bool nobusy)
|
||||
{
|
||||
struct bpf_local_storage_data *sdata;
|
||||
|
||||
sdata = task_storage_lookup(task, map, nobusy);
|
||||
if (sdata)
|
||||
return sdata->data;
|
||||
|
||||
/* only allocate new storage, when the task is refcounted */
|
||||
if (refcount_read(&task->usage) &&
|
||||
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
|
||||
sdata = bpf_local_storage_update(
|
||||
task, (struct bpf_local_storage_map *)map, value,
|
||||
BPF_NOEXIST, false, gfp_flags);
|
||||
return IS_ERR(sdata) ? NULL : sdata->data;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* *gfp_flags* is a hidden argument provided by the verifier */
|
||||
BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *,
|
||||
task, void *, value, u64, flags, gfp_t, gfp_flags)
|
||||
{
|
||||
bool nobusy;
|
||||
void *data;
|
||||
|
||||
WARN_ON_ONCE(!bpf_rcu_lock_held());
|
||||
if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
|
||||
return (unsigned long)NULL;
|
||||
|
||||
nobusy = bpf_task_storage_trylock();
|
||||
data = __bpf_task_storage_get(map, task, value, flags,
|
||||
gfp_flags, nobusy);
|
||||
if (nobusy)
|
||||
bpf_task_storage_unlock();
|
||||
return (unsigned long)data;
|
||||
}
|
||||
|
||||
/* *gfp_flags* is a hidden argument provided by the verifier */
|
||||
BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
|
||||
task, void *, value, u64, flags, gfp_t, gfp_flags)
|
||||
{
|
||||
void *data;
|
||||
struct bpf_local_storage_data *sdata;
|
||||
|
||||
WARN_ON_ONCE(!bpf_rcu_lock_held());
|
||||
if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
|
||||
return (unsigned long)NULL;
|
||||
|
||||
bpf_task_storage_lock();
|
||||
data = __bpf_task_storage_get(map, task, value, flags,
|
||||
gfp_flags, true);
|
||||
bpf_task_storage_unlock();
|
||||
return (unsigned long)data;
|
||||
}
|
||||
sdata = task_storage_lookup(task, map, true);
|
||||
if (sdata)
|
||||
return (unsigned long)sdata->data;
|
||||
|
||||
BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *,
|
||||
task)
|
||||
{
|
||||
bool nobusy;
|
||||
int ret;
|
||||
/* only allocate new storage, when the task is refcounted */
|
||||
if (refcount_read(&task->usage) &&
|
||||
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) {
|
||||
sdata = bpf_local_storage_update(
|
||||
task, (struct bpf_local_storage_map *)map, value,
|
||||
BPF_NOEXIST, false, gfp_flags);
|
||||
return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(!bpf_rcu_lock_held());
|
||||
if (!task)
|
||||
return -EINVAL;
|
||||
|
||||
nobusy = bpf_task_storage_trylock();
|
||||
/* This helper must only be called from places where the lifetime of the task
|
||||
* is guaranteed. Either by being refcounted or by being protected
|
||||
* by an RCU read-side critical section.
|
||||
*/
|
||||
ret = task_storage_delete(task, map, nobusy);
|
||||
if (nobusy)
|
||||
bpf_task_storage_unlock();
|
||||
return ret;
|
||||
return (unsigned long)NULL;
|
||||
}
|
||||
|
||||
BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
|
||||
task)
|
||||
{
|
||||
int ret;
|
||||
|
||||
WARN_ON_ONCE(!bpf_rcu_lock_held());
|
||||
if (!task)
|
||||
return -EINVAL;
|
||||
|
||||
bpf_task_storage_lock();
|
||||
/* This helper must only be called from places where the lifetime of the task
|
||||
* is guaranteed. Either by being refcounted or by being protected
|
||||
* by an RCU read-side critical section.
|
||||
*/
|
||||
ret = task_storage_delete(task, map, true);
|
||||
bpf_task_storage_unlock();
|
||||
return ret;
|
||||
return task_storage_delete(task, map);
|
||||
}
|
||||
|
||||
static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
|
||||
|
|
@ -313,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
|
|||
|
||||
static void task_storage_map_free(struct bpf_map *map)
|
||||
{
|
||||
bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
|
||||
bpf_local_storage_map_free(map, &task_cache);
|
||||
}
|
||||
|
||||
BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
|
||||
|
|
@ -332,17 +236,6 @@ const struct bpf_map_ops task_storage_map_ops = {
|
|||
.map_owner_storage_ptr = task_storage_ptr,
|
||||
};
|
||||
|
||||
const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
|
||||
.func = bpf_task_storage_get_recur,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
|
||||
.arg1_type = ARG_CONST_MAP_PTR,
|
||||
.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
|
||||
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
|
||||
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
const struct bpf_func_proto bpf_task_storage_get_proto = {
|
||||
.func = bpf_task_storage_get,
|
||||
.gpl_only = false,
|
||||
|
|
@ -354,15 +247,6 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
|
|||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
|
||||
.func = bpf_task_storage_delete_recur,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_CONST_MAP_PTR,
|
||||
.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
|
||||
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
|
||||
};
|
||||
|
||||
const struct bpf_func_proto bpf_task_storage_delete_proto = {
|
||||
.func = bpf_task_storage_delete,
|
||||
.gpl_only = false,
|
||||
|
|
|
|||
230
kernel/bpf/btf.c
230
kernel/bpf/btf.c
|
|
@ -25,6 +25,7 @@
|
|||
#include <linux/perf_event.h>
|
||||
#include <linux/bsearch.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include <linux/overflow.h>
|
||||
|
||||
|
|
@ -259,6 +260,7 @@ struct btf {
|
|||
void *nohdr_data;
|
||||
struct btf_header hdr;
|
||||
u32 nr_types; /* includes VOID for base BTF */
|
||||
u32 named_start_id;
|
||||
u32 types_size;
|
||||
u32 data_size;
|
||||
refcount_t refcnt;
|
||||
|
|
@ -494,6 +496,11 @@ static bool btf_type_is_modifier(const struct btf_type *t)
|
|||
return false;
|
||||
}
|
||||
|
||||
static int btf_start_id(const struct btf *btf)
|
||||
{
|
||||
return btf->start_id + (btf->base_btf ? 0 : 1);
|
||||
}
|
||||
|
||||
bool btf_type_is_void(const struct btf_type *t)
|
||||
{
|
||||
return t == &btf_void;
|
||||
|
|
@ -544,21 +551,125 @@ u32 btf_nr_types(const struct btf *btf)
|
|||
return total;
|
||||
}
|
||||
|
||||
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
|
||||
/*
|
||||
* Note that vmlinux and kernel module BTFs are always sorted
|
||||
* during the building phase.
|
||||
*/
|
||||
static void btf_check_sorted(struct btf *btf)
|
||||
{
|
||||
u32 i, n, named_start_id = 0;
|
||||
|
||||
n = btf_nr_types(btf);
|
||||
if (btf_is_vmlinux(btf)) {
|
||||
for (i = btf_start_id(btf); i < n; i++) {
|
||||
const struct btf_type *t = btf_type_by_id(btf, i);
|
||||
const char *n = btf_name_by_offset(btf, t->name_off);
|
||||
|
||||
if (n[0] != '\0') {
|
||||
btf->named_start_id = i;
|
||||
return;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = btf_start_id(btf) + 1; i < n; i++) {
|
||||
const struct btf_type *ta = btf_type_by_id(btf, i - 1);
|
||||
const struct btf_type *tb = btf_type_by_id(btf, i);
|
||||
const char *na = btf_name_by_offset(btf, ta->name_off);
|
||||
const char *nb = btf_name_by_offset(btf, tb->name_off);
|
||||
|
||||
if (strcmp(na, nb) > 0)
|
||||
return;
|
||||
|
||||
if (named_start_id == 0 && na[0] != '\0')
|
||||
named_start_id = i - 1;
|
||||
if (named_start_id == 0 && nb[0] != '\0')
|
||||
named_start_id = i;
|
||||
}
|
||||
|
||||
if (named_start_id)
|
||||
btf->named_start_id = named_start_id;
|
||||
}
|
||||
|
||||
/*
|
||||
* btf_named_start_id - Get the named starting ID for the BTF
|
||||
* @btf: Pointer to the target BTF object
|
||||
* @own: Flag indicating whether to query only the current BTF (true = current BTF only,
|
||||
* false = recursively traverse the base BTF chain)
|
||||
*
|
||||
* Return value rules:
|
||||
* 1. For a sorted btf, return its named_start_id
|
||||
* 2. Else for a split BTF, return its start_id
|
||||
* 3. Else for a base BTF, return 1
|
||||
*/
|
||||
u32 btf_named_start_id(const struct btf *btf, bool own)
|
||||
{
|
||||
const struct btf *base_btf = btf;
|
||||
|
||||
while (!own && base_btf->base_btf)
|
||||
base_btf = base_btf->base_btf;
|
||||
|
||||
return base_btf->named_start_id ?: (base_btf->start_id ?: 1);
|
||||
}
|
||||
|
||||
static s32 btf_find_by_name_kind_bsearch(const struct btf *btf, const char *name)
|
||||
{
|
||||
const struct btf_type *t;
|
||||
const char *tname;
|
||||
u32 i, total;
|
||||
s32 l, r, m;
|
||||
|
||||
l = btf_named_start_id(btf, true);
|
||||
r = btf_nr_types(btf) - 1;
|
||||
while (l <= r) {
|
||||
m = l + (r - l) / 2;
|
||||
t = btf_type_by_id(btf, m);
|
||||
tname = btf_name_by_offset(btf, t->name_off);
|
||||
if (strcmp(tname, name) >= 0) {
|
||||
if (l == r)
|
||||
return r;
|
||||
r = m;
|
||||
} else {
|
||||
l = m + 1;
|
||||
}
|
||||
}
|
||||
|
||||
return btf_nr_types(btf);
|
||||
}
|
||||
|
||||
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
|
||||
{
|
||||
const struct btf *base_btf = btf_base_btf(btf);
|
||||
const struct btf_type *t;
|
||||
const char *tname;
|
||||
s32 id, total;
|
||||
|
||||
if (base_btf) {
|
||||
id = btf_find_by_name_kind(base_btf, name, kind);
|
||||
if (id > 0)
|
||||
return id;
|
||||
}
|
||||
|
||||
total = btf_nr_types(btf);
|
||||
for (i = 1; i < total; i++) {
|
||||
t = btf_type_by_id(btf, i);
|
||||
if (BTF_INFO_KIND(t->info) != kind)
|
||||
continue;
|
||||
|
||||
tname = btf_name_by_offset(btf, t->name_off);
|
||||
if (!strcmp(tname, name))
|
||||
return i;
|
||||
if (btf->named_start_id > 0 && name[0]) {
|
||||
id = btf_find_by_name_kind_bsearch(btf, name);
|
||||
for (; id < total; id++) {
|
||||
t = btf_type_by_id(btf, id);
|
||||
tname = btf_name_by_offset(btf, t->name_off);
|
||||
if (strcmp(tname, name) != 0)
|
||||
return -ENOENT;
|
||||
if (BTF_INFO_KIND(t->info) == kind)
|
||||
return id;
|
||||
}
|
||||
} else {
|
||||
for (id = btf_start_id(btf); id < total; id++) {
|
||||
t = btf_type_by_id(btf, id);
|
||||
if (BTF_INFO_KIND(t->info) != kind)
|
||||
continue;
|
||||
tname = btf_name_by_offset(btf, t->name_off);
|
||||
if (strcmp(tname, name) == 0)
|
||||
return id;
|
||||
}
|
||||
}
|
||||
|
||||
return -ENOENT;
|
||||
|
|
@ -3424,7 +3535,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type
|
|||
const struct btf_type *t;
|
||||
int len, id;
|
||||
|
||||
id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0);
|
||||
id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key,
|
||||
btf_named_start_id(btf, false) - 1);
|
||||
if (id < 0)
|
||||
return ERR_PTR(id);
|
||||
|
||||
|
|
@ -5791,6 +5903,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
|
|||
goto errout;
|
||||
}
|
||||
env->btf = btf;
|
||||
btf->named_start_id = 0;
|
||||
|
||||
data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
|
||||
if (!data) {
|
||||
|
|
@ -6107,6 +6220,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct
|
|||
case BPF_TRACE_FENTRY:
|
||||
case BPF_TRACE_FEXIT:
|
||||
case BPF_MODIFY_RETURN:
|
||||
case BPF_TRACE_FSESSION:
|
||||
/* allow u64* as ctx */
|
||||
if (btf_is_int(t) && t->size == 8)
|
||||
return 0;
|
||||
|
|
@ -6210,7 +6324,8 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name
|
|||
btf->data = data;
|
||||
btf->data_size = data_size;
|
||||
btf->kernel_btf = true;
|
||||
snprintf(btf->name, sizeof(btf->name), "%s", name);
|
||||
btf->named_start_id = 0;
|
||||
strscpy(btf->name, name);
|
||||
|
||||
err = btf_parse_hdr(env);
|
||||
if (err)
|
||||
|
|
@ -6230,6 +6345,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name
|
|||
if (err)
|
||||
goto errout;
|
||||
|
||||
btf_check_sorted(btf);
|
||||
refcount_set(&btf->refcnt, 1);
|
||||
|
||||
return btf;
|
||||
|
|
@ -6327,7 +6443,8 @@ static struct btf *btf_parse_module(const char *module_name, const void *data,
|
|||
btf->start_id = base_btf->nr_types;
|
||||
btf->start_str_off = base_btf->hdr.str_len;
|
||||
btf->kernel_btf = true;
|
||||
snprintf(btf->name, sizeof(btf->name), "%s", module_name);
|
||||
btf->named_start_id = 0;
|
||||
strscpy(btf->name, module_name);
|
||||
|
||||
btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN);
|
||||
if (!btf->data) {
|
||||
|
|
@ -6363,6 +6480,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data,
|
|||
}
|
||||
|
||||
btf_verifier_env_free(env);
|
||||
btf_check_sorted(btf);
|
||||
refcount_set(&btf->refcnt, 1);
|
||||
return btf;
|
||||
|
||||
|
|
@ -6704,6 +6822,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
|
|||
fallthrough;
|
||||
case BPF_LSM_CGROUP:
|
||||
case BPF_TRACE_FEXIT:
|
||||
case BPF_TRACE_FSESSION:
|
||||
/* When LSM programs are attached to void LSM hooks
|
||||
* they use FEXIT trampolines and when attached to
|
||||
* int LSM hooks, they use MODIFY_RETURN trampolines.
|
||||
|
|
@ -7729,12 +7848,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
|
|||
tname);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Convert BTF function arguments into verifier types.
|
||||
* Only PTR_TO_CTX and SCALAR are supported atm.
|
||||
*/
|
||||
for (i = 0; i < nargs; i++) {
|
||||
u32 tags = 0;
|
||||
int id = 0;
|
||||
int id = btf_named_start_id(btf, false) - 1;
|
||||
|
||||
/* 'arg:<tag>' decl_tag takes precedence over derivation of
|
||||
* register type from BTF type itself
|
||||
|
|
@ -8640,24 +8760,17 @@ end:
|
|||
return ret;
|
||||
}
|
||||
|
||||
static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
|
||||
enum btf_kfunc_hook hook,
|
||||
u32 kfunc_btf_id,
|
||||
const struct bpf_prog *prog)
|
||||
static u32 *btf_kfunc_id_set_contains(const struct btf *btf,
|
||||
enum btf_kfunc_hook hook,
|
||||
u32 kfunc_btf_id)
|
||||
{
|
||||
struct btf_kfunc_hook_filter *hook_filter;
|
||||
struct btf_id_set8 *set;
|
||||
u32 *id, i;
|
||||
u32 *id;
|
||||
|
||||
if (hook >= BTF_KFUNC_HOOK_MAX)
|
||||
return NULL;
|
||||
if (!btf->kfunc_set_tab)
|
||||
return NULL;
|
||||
hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
|
||||
for (i = 0; i < hook_filter->nr_filters; i++) {
|
||||
if (hook_filter->filters[i](prog, kfunc_btf_id))
|
||||
return NULL;
|
||||
}
|
||||
set = btf->kfunc_set_tab->sets[hook];
|
||||
if (!set)
|
||||
return NULL;
|
||||
|
|
@ -8668,6 +8781,28 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
|
|||
return id + 1;
|
||||
}
|
||||
|
||||
static bool __btf_kfunc_is_allowed(const struct btf *btf,
|
||||
enum btf_kfunc_hook hook,
|
||||
u32 kfunc_btf_id,
|
||||
const struct bpf_prog *prog)
|
||||
{
|
||||
struct btf_kfunc_hook_filter *hook_filter;
|
||||
int i;
|
||||
|
||||
if (hook >= BTF_KFUNC_HOOK_MAX)
|
||||
return false;
|
||||
if (!btf->kfunc_set_tab)
|
||||
return false;
|
||||
|
||||
hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
|
||||
for (i = 0; i < hook_filter->nr_filters; i++) {
|
||||
if (hook_filter->filters[i](prog, kfunc_btf_id))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
|
||||
{
|
||||
switch (prog_type) {
|
||||
|
|
@ -8681,6 +8816,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
|
|||
return BTF_KFUNC_HOOK_STRUCT_OPS;
|
||||
case BPF_PROG_TYPE_TRACING:
|
||||
case BPF_PROG_TYPE_TRACEPOINT:
|
||||
case BPF_PROG_TYPE_RAW_TRACEPOINT:
|
||||
case BPF_PROG_TYPE_PERF_EVENT:
|
||||
case BPF_PROG_TYPE_LSM:
|
||||
return BTF_KFUNC_HOOK_TRACING;
|
||||
|
|
@ -8714,6 +8850,26 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
|
|||
}
|
||||
}
|
||||
|
||||
bool btf_kfunc_is_allowed(const struct btf *btf,
|
||||
u32 kfunc_btf_id,
|
||||
const struct bpf_prog *prog)
|
||||
{
|
||||
enum bpf_prog_type prog_type = resolve_prog_type(prog);
|
||||
enum btf_kfunc_hook hook;
|
||||
u32 *kfunc_flags;
|
||||
|
||||
kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
|
||||
if (kfunc_flags && __btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog))
|
||||
return true;
|
||||
|
||||
hook = bpf_prog_type_to_kfunc_hook(prog_type);
|
||||
kfunc_flags = btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
|
||||
if (kfunc_flags && __btf_kfunc_is_allowed(btf, hook, kfunc_btf_id, prog))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Caution:
|
||||
* Reference to the module (obtained using btf_try_get_module) corresponding to
|
||||
* the struct btf *MUST* be held when calling this function from verifier
|
||||
|
|
@ -8721,26 +8877,27 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
|
|||
* keeping the reference for the duration of the call provides the necessary
|
||||
* protection for looking up a well-formed btf->kfunc_set_tab.
|
||||
*/
|
||||
u32 *btf_kfunc_id_set_contains(const struct btf *btf,
|
||||
u32 kfunc_btf_id,
|
||||
const struct bpf_prog *prog)
|
||||
u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog)
|
||||
{
|
||||
enum bpf_prog_type prog_type = resolve_prog_type(prog);
|
||||
enum btf_kfunc_hook hook;
|
||||
u32 *kfunc_flags;
|
||||
|
||||
kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog);
|
||||
kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
|
||||
if (kfunc_flags)
|
||||
return kfunc_flags;
|
||||
|
||||
hook = bpf_prog_type_to_kfunc_hook(prog_type);
|
||||
return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog);
|
||||
return btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
|
||||
}
|
||||
|
||||
/*
 * Look up @kfunc_btf_id in the BTF_KFUNC_HOOK_FMODRET set of @btf.
 *
 * Returns NULL when the FMODRET hook filters reject the kfunc for @prog;
 * otherwise returns whatever the set lookup yields for the FMODRET hook.
 */
u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
				const struct bpf_prog *prog)
{
	/* Filters first: a rejected kfunc must not be reported as present. */
	if (!__btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog))
		return NULL;

	return btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
}
|
||||
|
||||
static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
|
||||
|
|
@ -8845,6 +9002,13 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc
|
|||
*/
|
||||
if (!t || !btf_type_is_ptr(t))
|
||||
return -EINVAL;
|
||||
|
||||
if (IS_ENABLED(CONFIG_CFI_CLANG)) {
|
||||
/* Ensure the destructor kfunc type matches btf_dtor_kfunc_t */
|
||||
t = btf_type_by_id(btf, t->type);
|
||||
if (!btf_type_is_void(t))
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -9215,7 +9379,7 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
|
|||
}
|
||||
|
||||
/* Attempt to find target candidates in vmlinux BTF first */
|
||||
cands = bpf_core_add_cands(cands, main_btf, 1);
|
||||
cands = bpf_core_add_cands(cands, main_btf, btf_named_start_id(main_btf, true));
|
||||
if (IS_ERR(cands))
|
||||
return ERR_CAST(cands);
|
||||
|
||||
|
|
@ -9247,7 +9411,7 @@ check_modules:
|
|||
*/
|
||||
btf_get(mod_btf);
|
||||
spin_unlock_bh(&btf_idr_lock);
|
||||
cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
|
||||
cands = bpf_core_add_cands(cands, mod_btf, btf_named_start_id(mod_btf, true));
|
||||
btf_put(mod_btf);
|
||||
if (IS_ERR(cands))
|
||||
return ERR_CAST(cands);
|
||||
|
|
|
|||
|
|
@ -1680,11 +1680,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
|
|||
struct cgroup *cgrp;
|
||||
int ret;
|
||||
|
||||
/* Check socket family since not all sockets represent network
|
||||
* endpoint (e.g. AF_UNIX).
|
||||
*/
|
||||
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
|
||||
sk->sk_family != AF_UNIX)
|
||||
if (!sk_is_inet(sk) && !sk_is_unix(sk))
|
||||
return 0;
|
||||
|
||||
if (!ctx.uaddr) {
|
||||
|
|
|
|||
|
|
@ -8,12 +8,13 @@
|
|||
|
||||
#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
|
||||
|
||||
/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
|
||||
/* cgroup_iter provides five modes of traversal to the cgroup hierarchy.
|
||||
*
|
||||
* 1. Walk the descendants of a cgroup in pre-order.
|
||||
* 2. Walk the descendants of a cgroup in post-order.
|
||||
* 3. Walk the ancestors of a cgroup.
|
||||
* 4. Show the given cgroup only.
|
||||
* 5. Walk the children of a given parent cgroup.
|
||||
*
|
||||
* For walking descendants, cgroup_iter can walk in either pre-order or
|
||||
* post-order. For walking ancestors, the iter walks up from a cgroup to
|
||||
|
|
@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
|
|||
return css_next_descendant_pre(NULL, p->start_css);
|
||||
else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
|
||||
return css_next_descendant_post(NULL, p->start_css);
|
||||
else if (p->order == BPF_CGROUP_ITER_CHILDREN)
|
||||
return css_next_child(NULL, p->start_css);
|
||||
else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
|
||||
return p->start_css;
|
||||
}
|
||||
|
|
@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
|||
return css_next_descendant_post(curr, p->start_css);
|
||||
else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
|
||||
return curr->parent;
|
||||
else if (p->order == BPF_CGROUP_ITER_CHILDREN)
|
||||
return css_next_child(curr, p->start_css);
|
||||
else /* BPF_CGROUP_ITER_SELF_ONLY */
|
||||
return NULL;
|
||||
}
|
||||
|
|
@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
|
|||
int order = linfo->cgroup.order;
|
||||
struct cgroup *cgrp;
|
||||
|
||||
if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
|
||||
order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
|
||||
order != BPF_CGROUP_ITER_ANCESTORS_UP &&
|
||||
order != BPF_CGROUP_ITER_SELF_ONLY)
|
||||
switch (order) {
|
||||
case BPF_CGROUP_ITER_DESCENDANTS_PRE:
|
||||
case BPF_CGROUP_ITER_DESCENDANTS_POST:
|
||||
case BPF_CGROUP_ITER_ANCESTORS_UP:
|
||||
case BPF_CGROUP_ITER_SELF_ONLY:
|
||||
case BPF_CGROUP_ITER_CHILDREN:
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (fd && id)
|
||||
return -EINVAL;
|
||||
|
|
@ -257,6 +267,8 @@ show_order:
|
|||
seq_puts(seq, "order: descendants_post\n");
|
||||
else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
|
||||
seq_puts(seq, "order: ancestors_up\n");
|
||||
else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN)
|
||||
seq_puts(seq, "order: children\n");
|
||||
else /* BPF_CGROUP_ITER_SELF_ONLY */
|
||||
seq_puts(seq, "order: self_only\n");
|
||||
}
|
||||
|
|
@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
|
|||
case BPF_CGROUP_ITER_DESCENDANTS_PRE:
|
||||
case BPF_CGROUP_ITER_DESCENDANTS_POST:
|
||||
case BPF_CGROUP_ITER_ANCESTORS_UP:
|
||||
case BPF_CGROUP_ITER_CHILDREN:
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
|
|
@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i
|
|||
case BPF_CGROUP_ITER_DESCENDANTS_POST:
|
||||
kit->pos = css_next_descendant_post(kit->pos, kit->start);
|
||||
break;
|
||||
case BPF_CGROUP_ITER_CHILDREN:
|
||||
kit->pos = css_next_child(kit->pos, kit->start);
|
||||
break;
|
||||
case BPF_CGROUP_ITER_ANCESTORS_UP:
|
||||
kit->pos = kit->pos ? kit->pos->parent : kit->start;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -112,7 +112,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
|
|||
vfree(fp);
|
||||
return NULL;
|
||||
}
|
||||
fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
|
||||
fp->active = __alloc_percpu_gfp(sizeof(u8[BPF_NR_CONTEXTS]), 4,
|
||||
bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
|
||||
if (!fp->active) {
|
||||
vfree(fp);
|
||||
kfree(aux);
|
||||
|
|
@ -136,6 +137,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
|
|||
mutex_init(&fp->aux->used_maps_mutex);
|
||||
mutex_init(&fp->aux->ext_mutex);
|
||||
mutex_init(&fp->aux->dst_mutex);
|
||||
mutex_init(&fp->aux->st_ops_assoc_mutex);
|
||||
|
||||
#ifdef CONFIG_BPF_SYSCALL
|
||||
bpf_prog_stream_init(fp);
|
||||
|
|
@ -286,6 +288,7 @@ void __bpf_prog_free(struct bpf_prog *fp)
|
|||
if (fp->aux) {
|
||||
mutex_destroy(&fp->aux->used_maps_mutex);
|
||||
mutex_destroy(&fp->aux->dst_mutex);
|
||||
mutex_destroy(&fp->aux->st_ops_assoc_mutex);
|
||||
kfree(fp->aux->poke_tab);
|
||||
kfree(fp->aux);
|
||||
}
|
||||
|
|
@ -2398,6 +2401,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
|
|||
map->owner->type = prog_type;
|
||||
map->owner->jited = fp->jited;
|
||||
map->owner->xdp_has_frags = aux->xdp_has_frags;
|
||||
map->owner->sleepable = fp->sleepable;
|
||||
map->owner->expected_attach_type = fp->expected_attach_type;
|
||||
map->owner->attach_func_proto = aux->attach_func_proto;
|
||||
for_each_cgroup_storage_type(i) {
|
||||
|
|
@ -2409,7 +2413,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
|
|||
} else {
|
||||
ret = map->owner->type == prog_type &&
|
||||
map->owner->jited == fp->jited &&
|
||||
map->owner->xdp_has_frags == aux->xdp_has_frags;
|
||||
map->owner->xdp_has_frags == aux->xdp_has_frags &&
|
||||
map->owner->sleepable == fp->sleepable;
|
||||
if (ret &&
|
||||
map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
|
||||
map->owner->expected_attach_type != fp->expected_attach_type)
|
||||
|
|
@ -2912,6 +2917,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
|
|||
#endif
|
||||
bpf_free_used_maps(aux);
|
||||
bpf_free_used_btfs(aux);
|
||||
bpf_prog_disassoc_struct_ops(aux->prog);
|
||||
if (bpf_prog_is_dev_bound(aux))
|
||||
bpf_prog_dev_bound_destroy(aux->prog);
|
||||
#ifdef CONFIG_PERF_EVENTS
|
||||
|
|
@ -3138,6 +3144,11 @@ bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
 * Weak default: reports that fsession is not supported by the JIT.
 * NOTE(review): presumably overridden by architectures whose JIT does
 * support it; confirm against the arch code.
 */
bool __weak bpf_jit_supports_fsession(void)
{
	return false;
}
|
||||
|
||||
u64 __weak bpf_arch_uaddress_limit(void)
|
||||
{
|
||||
#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
|
||||
|
|
|
|||
|
|
@ -430,7 +430,7 @@ static struct bpf_cpu_map_entry *
|
|||
__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
|
||||
u32 cpu)
|
||||
{
|
||||
int numa, err, i, fd = value->bpf_prog.fd;
|
||||
int numa, err = -ENOMEM, i, fd = value->bpf_prog.fd;
|
||||
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
|
||||
struct bpf_cpu_map_entry *rcpu;
|
||||
struct xdp_bulk_queue *bq;
|
||||
|
|
@ -440,7 +440,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
|
|||
|
||||
rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
|
||||
if (!rcpu)
|
||||
return NULL;
|
||||
return ERR_PTR(err);
|
||||
|
||||
/* Alloc percpu bulkq */
|
||||
rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
|
||||
|
|
@ -468,16 +468,21 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
|
|||
rcpu->value.qsize = value->qsize;
|
||||
gro_init(&rcpu->gro);
|
||||
|
||||
if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
|
||||
goto free_ptr_ring;
|
||||
if (fd > 0) {
|
||||
err = __cpu_map_load_bpf_program(rcpu, map, fd);
|
||||
if (err)
|
||||
goto free_ptr_ring;
|
||||
}
|
||||
|
||||
/* Setup kthread */
|
||||
init_completion(&rcpu->kthread_running);
|
||||
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
|
||||
"cpumap/%d/map:%d", cpu,
|
||||
map->id);
|
||||
if (IS_ERR(rcpu->kthread))
|
||||
if (IS_ERR(rcpu->kthread)) {
|
||||
err = PTR_ERR(rcpu->kthread);
|
||||
goto free_prog;
|
||||
}
|
||||
|
||||
/* Make sure kthread runs on a single CPU */
|
||||
kthread_bind(rcpu->kthread, cpu);
|
||||
|
|
@ -503,7 +508,7 @@ free_bulkq:
|
|||
free_percpu(rcpu->bulkq);
|
||||
free_rcu:
|
||||
kfree(rcpu);
|
||||
return NULL;
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
static void __cpu_map_entry_free(struct work_struct *work)
|
||||
|
|
@ -596,8 +601,8 @@ static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
|
|||
} else {
|
||||
/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
|
||||
rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
|
||||
if (!rcpu)
|
||||
return -ENOMEM;
|
||||
if (IS_ERR(rcpu))
|
||||
return PTR_ERR(rcpu);
|
||||
}
|
||||
rcu_read_lock();
|
||||
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
|
||||
|
|
|
|||
|
|
@ -477,7 +477,7 @@ __bpf_kfunc_end_defs();
|
|||
BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
|
||||
BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
|
||||
BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
|
||||
BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE)
|
||||
BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
|
||||
BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
|
||||
BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU)
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ struct bpf_crypto_ctx {
|
|||
int bpf_crypto_register_type(const struct bpf_crypto_type *type)
|
||||
{
|
||||
struct bpf_crypto_type_list *node;
|
||||
int err = -EEXIST;
|
||||
int err = -EBUSY;
|
||||
|
||||
down_write(&bpf_crypto_types_sem);
|
||||
list_for_each_entry(node, &bpf_crypto_types, list) {
|
||||
|
|
@ -261,6 +261,12 @@ __bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx)
|
|||
call_rcu(&ctx->rcu, crypto_free_cb);
|
||||
}
|
||||
|
||||
/*
 * Destructor-shaped wrapper around bpf_crypto_ctx_release(): takes the
 * context as a void pointer and forwards it unchanged.
 */
__bpf_kfunc void bpf_crypto_ctx_release_dtor(void *ctx)
{
	struct bpf_crypto_ctx *crypto_ctx = ctx;

	bpf_crypto_ctx_release(crypto_ctx);
}
CFI_NOSEAL(bpf_crypto_ctx_release_dtor);
|
||||
|
||||
static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
|
||||
const struct bpf_dynptr_kern *src,
|
||||
const struct bpf_dynptr_kern *dst,
|
||||
|
|
@ -368,7 +374,7 @@ static const struct btf_kfunc_id_set crypt_kfunc_set = {
|
|||
|
||||
BTF_ID_LIST(bpf_crypto_dtor_ids)
|
||||
BTF_ID(struct, bpf_crypto_ctx)
|
||||
BTF_ID(func, bpf_crypto_ctx_release)
|
||||
BTF_ID(func, bpf_crypto_ctx_release_dtor)
|
||||
|
||||
static int __init crypto_kfunc_init(void)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -82,9 +82,6 @@ struct bucket {
|
|||
rqspinlock_t raw_lock;
|
||||
};
|
||||
|
||||
#define HASHTAB_MAP_LOCK_COUNT 8
|
||||
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
|
||||
|
||||
struct bpf_htab {
|
||||
struct bpf_map map;
|
||||
struct bpf_mem_alloc ma;
|
||||
|
|
@ -932,7 +929,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
|
|||
}
|
||||
|
||||
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
|
||||
void *value, bool onallcpus)
|
||||
void *value, bool onallcpus, u64 map_flags)
|
||||
{
|
||||
void *ptr;
|
||||
|
||||
|
|
@ -943,19 +940,28 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
|
|||
bpf_obj_free_fields(htab->map.record, ptr);
|
||||
} else {
|
||||
u32 size = round_up(htab->map.value_size, 8);
|
||||
int off = 0, cpu;
|
||||
void *val;
|
||||
int cpu;
|
||||
|
||||
if (map_flags & BPF_F_CPU) {
|
||||
cpu = map_flags >> 32;
|
||||
ptr = per_cpu_ptr(pptr, cpu);
|
||||
copy_map_value(&htab->map, ptr, value);
|
||||
bpf_obj_free_fields(htab->map.record, ptr);
|
||||
return;
|
||||
}
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
ptr = per_cpu_ptr(pptr, cpu);
|
||||
copy_map_value_long(&htab->map, ptr, value + off);
|
||||
val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
|
||||
copy_map_value(&htab->map, ptr, val);
|
||||
bpf_obj_free_fields(htab->map.record, ptr);
|
||||
off += size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
|
||||
void *value, bool onallcpus)
|
||||
void *value, bool onallcpus, u64 map_flags)
|
||||
{
|
||||
/* When not setting the initial value on all cpus, zero-fill element
|
||||
* values for other cpus. Otherwise, bpf program has no way to ensure
|
||||
|
|
@ -973,7 +979,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
|
|||
zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
|
||||
}
|
||||
} else {
|
||||
pcpu_copy_value(htab, pptr, value, onallcpus);
|
||||
pcpu_copy_value(htab, pptr, value, onallcpus, map_flags);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -985,7 +991,7 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
|
|||
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
|
||||
void *value, u32 key_size, u32 hash,
|
||||
bool percpu, bool onallcpus,
|
||||
struct htab_elem *old_elem)
|
||||
struct htab_elem *old_elem, u64 map_flags)
|
||||
{
|
||||
u32 size = htab->map.value_size;
|
||||
bool prealloc = htab_is_prealloc(htab);
|
||||
|
|
@ -1043,7 +1049,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
|
|||
pptr = *(void __percpu **)ptr;
|
||||
}
|
||||
|
||||
pcpu_init_value(htab, pptr, value, onallcpus);
|
||||
pcpu_init_value(htab, pptr, value, onallcpus, map_flags);
|
||||
|
||||
if (!prealloc)
|
||||
htab_elem_set_ptr(l_new, key_size, pptr);
|
||||
|
|
@ -1147,7 +1153,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
|
|||
}
|
||||
|
||||
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
|
||||
l_old);
|
||||
l_old, map_flags);
|
||||
if (IS_ERR(l_new)) {
|
||||
/* all pre-allocated elements are in use or memory exhausted */
|
||||
ret = PTR_ERR(l_new);
|
||||
|
|
@ -1249,6 +1255,15 @@ err_lock_bucket:
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Validate the flags of a hash map update.
 *
 * @onallcpus: selects which rule set applies
 * @map_flags: flags passed to the update
 *
 * When @onallcpus is false, any value above BPF_EXIST is rejected.
 * When @onallcpus is true, BPF_F_LOCK is rejected, and so is any value
 * whose low 32 bits exceed BPF_F_ALL_CPUS (the upper 32 bits are not
 * examined here).
 *
 * Returns 0 when the flags are acceptable, -EINVAL otherwise.
 */
static int htab_map_check_update_flags(bool onallcpus, u64 map_flags)
{
	bool invalid;

	if (onallcpus)
		invalid = (map_flags & BPF_F_LOCK) ||
			  (u32)map_flags > BPF_F_ALL_CPUS;
	else
		invalid = map_flags > BPF_EXIST;

	return unlikely(invalid) ? -EINVAL : 0;
}
|
||||
|
||||
static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
|
||||
void *value, u64 map_flags,
|
||||
bool percpu, bool onallcpus)
|
||||
|
|
@ -1262,9 +1277,9 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
|
|||
u32 key_size, hash;
|
||||
int ret;
|
||||
|
||||
if (unlikely(map_flags > BPF_EXIST))
|
||||
/* unknown flags */
|
||||
return -EINVAL;
|
||||
ret = htab_map_check_update_flags(onallcpus, map_flags);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
WARN_ON_ONCE(!bpf_rcu_lock_held());
|
||||
|
||||
|
|
@ -1289,7 +1304,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
|
|||
/* Update value in-place */
|
||||
if (percpu) {
|
||||
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
|
||||
value, onallcpus);
|
||||
value, onallcpus, map_flags);
|
||||
} else {
|
||||
void **inner_map_pptr = htab_elem_value(l_old, key_size);
|
||||
|
||||
|
|
@ -1298,7 +1313,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
|
|||
}
|
||||
} else {
|
||||
l_new = alloc_htab_elem(htab, key, value, key_size,
|
||||
hash, percpu, onallcpus, NULL);
|
||||
hash, percpu, onallcpus, NULL, map_flags);
|
||||
if (IS_ERR(l_new)) {
|
||||
ret = PTR_ERR(l_new);
|
||||
goto err;
|
||||
|
|
@ -1324,9 +1339,9 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
|
|||
u32 key_size, hash;
|
||||
int ret;
|
||||
|
||||
if (unlikely(map_flags > BPF_EXIST))
|
||||
/* unknown flags */
|
||||
return -EINVAL;
|
||||
ret = htab_map_check_update_flags(onallcpus, map_flags);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
WARN_ON_ONCE(!bpf_rcu_lock_held());
|
||||
|
||||
|
|
@ -1363,10 +1378,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
|
|||
|
||||
/* per-cpu hash map can update value in-place */
|
||||
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
|
||||
value, onallcpus);
|
||||
value, onallcpus, map_flags);
|
||||
} else {
|
||||
pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
|
||||
value, onallcpus);
|
||||
value, onallcpus, map_flags);
|
||||
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
|
||||
l_new = NULL;
|
||||
}
|
||||
|
|
@ -1678,9 +1693,9 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
|
|||
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
|
||||
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
|
||||
u32 batch, max_count, size, bucket_size, map_id;
|
||||
u64 elem_map_flags, map_flags, allowed_flags;
|
||||
u32 bucket_cnt, total, key_size, value_size;
|
||||
struct htab_elem *node_to_free = NULL;
|
||||
u64 elem_map_flags, map_flags;
|
||||
struct hlist_nulls_head *head;
|
||||
struct hlist_nulls_node *n;
|
||||
unsigned long flags = 0;
|
||||
|
|
@ -1690,9 +1705,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
|
|||
int ret = 0;
|
||||
|
||||
elem_map_flags = attr->batch.elem_flags;
|
||||
if ((elem_map_flags & ~BPF_F_LOCK) ||
|
||||
((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
|
||||
return -EINVAL;
|
||||
allowed_flags = BPF_F_LOCK;
|
||||
if (!do_delete && is_percpu)
|
||||
allowed_flags |= BPF_F_CPU;
|
||||
ret = bpf_map_check_op_flags(map, elem_map_flags, allowed_flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
map_flags = attr->batch.flags;
|
||||
if (map_flags)
|
||||
|
|
@ -1715,7 +1733,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
|
|||
key_size = htab->map.key_size;
|
||||
value_size = htab->map.value_size;
|
||||
size = round_up(value_size, 8);
|
||||
if (is_percpu)
|
||||
if (is_percpu && !(elem_map_flags & BPF_F_CPU))
|
||||
value_size = size * num_possible_cpus();
|
||||
total = 0;
|
||||
/* while experimenting with hash tables with sizes ranging from 10 to
|
||||
|
|
@ -1798,10 +1816,17 @@ again_nocopy:
|
|||
void __percpu *pptr;
|
||||
|
||||
pptr = htab_elem_get_ptr(l, map->key_size);
|
||||
for_each_possible_cpu(cpu) {
|
||||
copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
|
||||
check_and_init_map_value(&htab->map, dst_val + off);
|
||||
off += size;
|
||||
if (elem_map_flags & BPF_F_CPU) {
|
||||
cpu = elem_map_flags >> 32;
|
||||
copy_map_value(&htab->map, dst_val, per_cpu_ptr(pptr, cpu));
|
||||
check_and_init_map_value(&htab->map, dst_val);
|
||||
} else {
|
||||
for_each_possible_cpu(cpu) {
|
||||
copy_map_value_long(&htab->map, dst_val + off,
|
||||
per_cpu_ptr(pptr, cpu));
|
||||
check_and_init_map_value(&htab->map, dst_val + off);
|
||||
off += size;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
value = htab_elem_value(l, key_size);
|
||||
|
|
@ -2209,11 +2234,11 @@ static u64 htab_map_mem_usage(const struct bpf_map *map)
|
|||
bool prealloc = htab_is_prealloc(htab);
|
||||
bool percpu = htab_is_percpu(htab);
|
||||
bool lru = htab_is_lru(htab);
|
||||
u64 num_entries;
|
||||
u64 usage = sizeof(struct bpf_htab);
|
||||
u64 num_entries, usage;
|
||||
|
||||
usage = sizeof(struct bpf_htab) +
|
||||
sizeof(struct bucket) * htab->n_buckets;
|
||||
|
||||
usage += sizeof(struct bucket) * htab->n_buckets;
|
||||
usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
|
||||
if (prealloc) {
|
||||
num_entries = map->max_entries;
|
||||
if (htab_has_extra_elems(htab))
|
||||
|
|
@ -2357,7 +2382,7 @@ static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *k
|
|||
return NULL;
|
||||
}
|
||||
|
||||
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
|
||||
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
|
||||
{
|
||||
struct htab_elem *l;
|
||||
void __percpu *pptr;
|
||||
|
|
@ -2374,16 +2399,22 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
|
|||
l = __htab_map_lookup_elem(map, key);
|
||||
if (!l)
|
||||
goto out;
|
||||
ret = 0;
|
||||
/* We do not mark LRU map element here in order to not mess up
|
||||
* eviction heuristics when user space does a map walk.
|
||||
*/
|
||||
pptr = htab_elem_get_ptr(l, map->key_size);
|
||||
if (map_flags & BPF_F_CPU) {
|
||||
cpu = map_flags >> 32;
|
||||
copy_map_value(map, value, per_cpu_ptr(pptr, cpu));
|
||||
check_and_init_map_value(map, value);
|
||||
goto out;
|
||||
}
|
||||
for_each_possible_cpu(cpu) {
|
||||
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
|
||||
check_and_init_map_value(map, value + off);
|
||||
off += size;
|
||||
}
|
||||
ret = 0;
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -600,10 +600,17 @@ struct bpffs_btf_enums {
|
|||
|
||||
static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
|
||||
{
|
||||
struct {
|
||||
const struct btf_type **type;
|
||||
const char *name;
|
||||
} btf_enums[] = {
|
||||
{&info->cmd_t, "bpf_cmd"},
|
||||
{&info->map_t, "bpf_map_type"},
|
||||
{&info->prog_t, "bpf_prog_type"},
|
||||
{&info->attach_t, "bpf_attach_type"},
|
||||
};
|
||||
const struct btf *btf;
|
||||
const struct btf_type *t;
|
||||
const char *name;
|
||||
int i, n;
|
||||
int i, id;
|
||||
|
||||
memset(info, 0, sizeof(*info));
|
||||
|
||||
|
|
@ -615,31 +622,16 @@ static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
|
|||
|
||||
info->btf = btf;
|
||||
|
||||
for (i = 1, n = btf_nr_types(btf); i < n; i++) {
|
||||
t = btf_type_by_id(btf, i);
|
||||
if (!btf_type_is_enum(t))
|
||||
continue;
|
||||
for (i = 0; i < ARRAY_SIZE(btf_enums); i++) {
|
||||
id = btf_find_by_name_kind(btf, btf_enums[i].name,
|
||||
BTF_KIND_ENUM);
|
||||
if (id < 0)
|
||||
return -ESRCH;
|
||||
|
||||
name = btf_name_by_offset(btf, t->name_off);
|
||||
if (!name)
|
||||
continue;
|
||||
|
||||
if (strcmp(name, "bpf_cmd") == 0)
|
||||
info->cmd_t = t;
|
||||
else if (strcmp(name, "bpf_map_type") == 0)
|
||||
info->map_t = t;
|
||||
else if (strcmp(name, "bpf_prog_type") == 0)
|
||||
info->prog_t = t;
|
||||
else if (strcmp(name, "bpf_attach_type") == 0)
|
||||
info->attach_t = t;
|
||||
else
|
||||
continue;
|
||||
|
||||
if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
|
||||
return 0;
|
||||
*btf_enums[i].type = btf_type_by_id(btf, id);
|
||||
}
|
||||
|
||||
return -ESRCH;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
|
||||
|
|
|
|||
|
|
@ -180,7 +180,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
|
|||
}
|
||||
|
||||
int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
|
||||
void *value)
|
||||
void *value, u64 map_flags)
|
||||
{
|
||||
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
|
||||
struct bpf_cgroup_storage *storage;
|
||||
|
|
@ -198,12 +198,17 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
|
|||
* access 'value_size' of them, so copying rounded areas
|
||||
* will not leak any kernel data
|
||||
*/
|
||||
if (map_flags & BPF_F_CPU) {
|
||||
cpu = map_flags >> 32;
|
||||
copy_map_value(_map, value, per_cpu_ptr(storage->percpu_buf, cpu));
|
||||
goto unlock;
|
||||
}
|
||||
size = round_up(_map->value_size, 8);
|
||||
for_each_possible_cpu(cpu) {
|
||||
bpf_long_memcpy(value + off,
|
||||
per_cpu_ptr(storage->percpu_buf, cpu), size);
|
||||
copy_map_value_long(_map, value + off, per_cpu_ptr(storage->percpu_buf, cpu));
|
||||
off += size;
|
||||
}
|
||||
unlock:
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -213,10 +218,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
|
|||
{
|
||||
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
|
||||
struct bpf_cgroup_storage *storage;
|
||||
int cpu, off = 0;
|
||||
void *val;
|
||||
u32 size;
|
||||
int cpu;
|
||||
|
||||
if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
|
||||
if ((u32)map_flags & ~(BPF_ANY | BPF_EXIST | BPF_F_CPU | BPF_F_ALL_CPUS))
|
||||
return -EINVAL;
|
||||
|
||||
rcu_read_lock();
|
||||
|
|
@ -232,12 +238,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
|
|||
* returned or zeros which were zero-filled by percpu_alloc,
|
||||
* so no kernel data leaks possible
|
||||
*/
|
||||
if (map_flags & BPF_F_CPU) {
|
||||
cpu = map_flags >> 32;
|
||||
copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), value);
|
||||
goto unlock;
|
||||
}
|
||||
size = round_up(_map->value_size, 8);
|
||||
for_each_possible_cpu(cpu) {
|
||||
bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
|
||||
value + off, size);
|
||||
off += size;
|
||||
val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
|
||||
copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), val);
|
||||
}
|
||||
unlock:
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -214,7 +214,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
|
|||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(bpf_map_iter_kfunc_ids)
|
||||
BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_map_sum_elem_count)
|
||||
BTF_KFUNCS_END(bpf_map_iter_kfunc_ids)
|
||||
|
||||
static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
|
||||
|
|
|
|||
|
|
@ -1,16 +1,6 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (C) 2017-2018 Netronome Systems, Inc.
|
||||
*
|
||||
* This software is licensed under the GNU General License Version 2,
|
||||
* June 1991 as shown in the file COPYING in the top-level directory of this
|
||||
* source tree.
|
||||
*
|
||||
* THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
|
||||
* WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
|
||||
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
|
||||
* OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
|
||||
* THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
*/
|
||||
|
||||
#include <linux/bpf.h>
|
||||
|
|
|
|||
|
|
@ -149,7 +149,8 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
|
|||
range_it_insert(rn, rt);
|
||||
|
||||
/* Add a range */
|
||||
new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
|
||||
new_rn = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT,
|
||||
NUMA_NO_NODE);
|
||||
if (!new_rn)
|
||||
return -ENOMEM;
|
||||
new_rn->rn_start = last + 1;
|
||||
|
|
@ -234,7 +235,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
|
|||
right->rn_start = start;
|
||||
range_it_insert(right, rt);
|
||||
} else {
|
||||
left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
|
||||
left = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, NUMA_NO_NODE);
|
||||
if (!left)
|
||||
return -ENOMEM;
|
||||
left->rn_start = start;
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/btf.h>
|
||||
#include <linux/err.h>
|
||||
|
|
|
|||
|
|
@ -265,10 +265,11 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
|
|||
|
||||
RES_INIT_TIMEOUT(ts);
|
||||
/*
|
||||
* The fast path is not invoked for the TAS fallback, so we must grab
|
||||
* the deadlock detection entry here.
|
||||
* We are either called directly from res_spin_lock after grabbing the
|
||||
* deadlock detection entry when queued spinlocks are disabled, or from
|
||||
* resilient_queued_spin_lock_slowpath after grabbing the deadlock
|
||||
* detection entry. No need to obtain it here.
|
||||
*/
|
||||
grab_held_lock_entry(lock);
|
||||
|
||||
/*
|
||||
* Since the waiting loop's time is dependent on the amount of
|
||||
|
|
|
|||
|
|
@ -212,14 +212,13 @@ __bpf_kfunc_start_defs();
|
|||
* Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
|
||||
* enum in headers.
|
||||
*/
|
||||
__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
|
||||
u32 len__sz, void *aux__prog)
|
||||
__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args,
|
||||
u32 len__sz, struct bpf_prog_aux *aux)
|
||||
{
|
||||
struct bpf_bprintf_data data = {
|
||||
.get_bin_args = true,
|
||||
.get_buf = true,
|
||||
};
|
||||
struct bpf_prog_aux *aux = aux__prog;
|
||||
u32 fmt_size = strlen(fmt__str) + 1;
|
||||
struct bpf_stream *stream;
|
||||
u32 data_len = len__sz;
|
||||
|
|
@ -246,6 +245,25 @@ __bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, con
|
|||
return ret;
|
||||
}
|
||||
|
||||
/* Directly trigger a stack dump from the program. */
|
||||
__bpf_kfunc int bpf_stream_print_stack(int stream_id, struct bpf_prog_aux *aux)
|
||||
{
|
||||
struct bpf_stream_stage ss;
|
||||
struct bpf_prog *prog;
|
||||
|
||||
/* Make sure the stream ID is valid. */
|
||||
if (!bpf_stream_get(stream_id, aux))
|
||||
return -ENOENT;
|
||||
|
||||
prog = aux->main_prog_aux->prog;
|
||||
|
||||
bpf_stream_stage(ss, prog, stream_id, ({
|
||||
bpf_stream_dump_stack(ss);
|
||||
}));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
__bpf_kfunc_end_defs();
|
||||
|
||||
/* Added kfunc to common_btf_ids */
|
||||
|
|
|
|||
|
|
@ -133,12 +133,14 @@ bool bpf_map_write_active(const struct bpf_map *map)
|
|||
return atomic64_read(&map->writecnt) != 0;
|
||||
}
|
||||
|
||||
static u32 bpf_map_value_size(const struct bpf_map *map)
|
||||
static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags)
|
||||
{
|
||||
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
|
||||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
|
||||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
|
||||
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
|
||||
if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS))
|
||||
return map->value_size;
|
||||
else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
|
||||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
|
||||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
|
||||
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
|
||||
return round_up(map->value_size, 8) * num_possible_cpus();
|
||||
else if (IS_FD_MAP(map))
|
||||
return sizeof(u32);
|
||||
|
|
@ -314,11 +316,11 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
|
|||
bpf_disable_instrumentation();
|
||||
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
|
||||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
|
||||
err = bpf_percpu_hash_copy(map, key, value);
|
||||
err = bpf_percpu_hash_copy(map, key, value, flags);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
|
||||
err = bpf_percpu_array_copy(map, key, value);
|
||||
err = bpf_percpu_array_copy(map, key, value, flags);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
|
||||
err = bpf_percpu_cgroup_storage_copy(map, key, value);
|
||||
err = bpf_percpu_cgroup_storage_copy(map, key, value, flags);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
|
||||
err = bpf_stackmap_extract(map, key, value, false);
|
||||
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
|
||||
|
|
@ -505,17 +507,29 @@ static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
|
|||
return root_mem_cgroup;
|
||||
}
|
||||
|
||||
/*
 * bpf_map_memcg_enter - make the map's memcg the active one.
 * @map: map whose memcg is looked up with bpf_map_get_memcg().
 * @old_memcg: out; receives the value returned by set_active_memcg(),
 *             to be handed back to bpf_map_memcg_exit().
 * @new_memcg: out; receives the memcg returned by bpf_map_get_memcg().
 */
void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg,
			 struct mem_cgroup **new_memcg)
{
	struct mem_cgroup *memcg = bpf_map_get_memcg(map);

	/* Publish the new memcg first, then the one it replaced. */
	*new_memcg = memcg;
	*old_memcg = set_active_memcg(memcg);
}
|
||||
|
||||
/*
 * bpf_map_memcg_exit - undo bpf_map_memcg_enter().
 * @old_memcg: memcg to make active again.
 * @new_memcg: memcg obtained on enter; released with mem_cgroup_put().
 */
void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, struct mem_cgroup *new_memcg)
{
	/* Restore the previous active memcg before dropping the new one. */
	set_active_memcg(old_memcg);
	mem_cgroup_put(new_memcg);
}
|
||||
|
||||
/*
 * bpf_map_kmalloc_node - kmalloc_node() with the map's memcg active.
 * @map: map whose memcg the allocation is made under.
 * @size: number of bytes to allocate.
 * @flags: GFP flags; __GFP_ACCOUNT is always OR-ed in.
 * @node: NUMA node passed through to kmalloc_node().
 *
 * Returns whatever kmalloc_node() returns.
 *
 * The memcg is entered and left exactly once through the
 * bpf_map_memcg_enter()/bpf_map_memcg_exit() pair; the open-coded
 * bpf_map_get_memcg()/set_active_memcg()/mem_cgroup_put() sequence must
 * not be kept alongside it, or the memcg would be acquired twice and
 * old_memcg overwritten.
 */
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
			   int node)
{
	struct mem_cgroup *memcg, *old_memcg;
	void *ptr;

	bpf_map_memcg_enter(map, &old_memcg, &memcg);
	ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
	bpf_map_memcg_exit(old_memcg, memcg);

	return ptr;
}
|
||||
|
|
@ -526,11 +540,9 @@ void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags
|
|||
struct mem_cgroup *memcg, *old_memcg;
|
||||
void *ptr;
|
||||
|
||||
memcg = bpf_map_get_memcg(map);
|
||||
old_memcg = set_active_memcg(memcg);
|
||||
bpf_map_memcg_enter(map, &old_memcg, &memcg);
|
||||
ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
|
||||
set_active_memcg(old_memcg);
|
||||
mem_cgroup_put(memcg);
|
||||
bpf_map_memcg_exit(old_memcg, memcg);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
|
@ -540,11 +552,9 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
|
|||
struct mem_cgroup *memcg, *old_memcg;
|
||||
void *ptr;
|
||||
|
||||
memcg = bpf_map_get_memcg(map);
|
||||
old_memcg = set_active_memcg(memcg);
|
||||
bpf_map_memcg_enter(map, &old_memcg, &memcg);
|
||||
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
|
||||
set_active_memcg(old_memcg);
|
||||
mem_cgroup_put(memcg);
|
||||
bpf_map_memcg_exit(old_memcg, memcg);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
|
@ -555,11 +565,9 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
|
|||
struct mem_cgroup *memcg, *old_memcg;
|
||||
void *ptr;
|
||||
|
||||
memcg = bpf_map_get_memcg(map);
|
||||
old_memcg = set_active_memcg(memcg);
|
||||
bpf_map_memcg_enter(map, &old_memcg, &memcg);
|
||||
ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
|
||||
set_active_memcg(old_memcg);
|
||||
mem_cgroup_put(memcg);
|
||||
bpf_map_memcg_exit(old_memcg, memcg);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
|
@ -570,11 +578,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
|
|||
struct mem_cgroup *memcg, *old_memcg;
|
||||
void __percpu *ptr;
|
||||
|
||||
memcg = bpf_map_get_memcg(map);
|
||||
old_memcg = set_active_memcg(memcg);
|
||||
bpf_map_memcg_enter(map, &old_memcg, &memcg);
|
||||
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
|
||||
set_active_memcg(old_memcg);
|
||||
mem_cgroup_put(memcg);
|
||||
bpf_map_memcg_exit(old_memcg, memcg);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
|
@ -612,12 +618,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
|
|||
unsigned long i, j;
|
||||
struct page *pg;
|
||||
int ret = 0;
|
||||
#ifdef CONFIG_MEMCG
|
||||
struct mem_cgroup *memcg, *old_memcg;
|
||||
|
||||
memcg = bpf_map_get_memcg(map);
|
||||
old_memcg = set_active_memcg(memcg);
|
||||
#endif
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
pg = __bpf_alloc_page(nid);
|
||||
|
||||
|
|
@ -631,10 +632,6 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
|
|||
break;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
set_active_memcg(old_memcg);
|
||||
mem_cgroup_put(memcg);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
@ -1366,11 +1363,6 @@ free_map_tab:
|
|||
return ret;
|
||||
}
|
||||
|
||||
static bool bpf_net_capable(void)
|
||||
{
|
||||
return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
|
||||
}
|
||||
|
||||
#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
|
||||
/* called via syscall */
|
||||
static int map_create(union bpf_attr *attr, bpfptr_t uattr)
|
||||
|
|
@ -1734,7 +1726,7 @@ static int map_lookup_elem(union bpf_attr *attr)
|
|||
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
|
||||
return -EPERM;
|
||||
|
||||
err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
|
||||
err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
|
|
@ -1742,7 +1734,7 @@ static int map_lookup_elem(union bpf_attr *attr)
|
|||
if (IS_ERR(key))
|
||||
return PTR_ERR(key);
|
||||
|
||||
value_size = bpf_map_value_size(map);
|
||||
value_size = bpf_map_value_size(map, attr->flags);
|
||||
|
||||
err = -ENOMEM;
|
||||
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
|
||||
|
|
@ -1809,7 +1801,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
|
|||
goto err_put;
|
||||
}
|
||||
|
||||
value_size = bpf_map_value_size(map);
|
||||
value_size = bpf_map_value_size(map, attr->flags);
|
||||
value = kvmemdup_bpfptr(uvalue, value_size);
|
||||
if (IS_ERR(value)) {
|
||||
err = PTR_ERR(value);
|
||||
|
|
@ -2005,11 +1997,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
|
|||
void *key, *value;
|
||||
int err = 0;
|
||||
|
||||
err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
|
||||
err = bpf_map_check_op_flags(map, attr->batch.elem_flags,
|
||||
BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
value_size = bpf_map_value_size(map);
|
||||
value_size = bpf_map_value_size(map, attr->batch.elem_flags);
|
||||
|
||||
max_count = attr->batch.count;
|
||||
if (!max_count)
|
||||
|
|
@ -2064,11 +2057,11 @@ int generic_map_lookup_batch(struct bpf_map *map,
|
|||
u32 value_size, cp, max_count;
|
||||
int err;
|
||||
|
||||
err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
|
||||
err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
value_size = bpf_map_value_size(map);
|
||||
value_size = bpf_map_value_size(map, attr->batch.elem_flags);
|
||||
|
||||
max_count = attr->batch.count;
|
||||
if (!max_count)
|
||||
|
|
@ -2190,7 +2183,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
|
|||
goto err_put;
|
||||
}
|
||||
|
||||
value_size = bpf_map_value_size(map);
|
||||
value_size = bpf_map_value_size(map, 0);
|
||||
|
||||
err = -ENOMEM;
|
||||
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
|
||||
|
|
@ -2820,6 +2813,13 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
|
|||
void *sig;
|
||||
int err = 0;
|
||||
|
||||
/*
|
||||
* Don't attempt to use kmalloc_large or vmalloc for signatures.
|
||||
* Practical signature for BPF program should be below this limit.
|
||||
*/
|
||||
if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
if (system_keyring_id_check(attr->keyring_id) == 0)
|
||||
key = bpf_lookup_system_key(attr->keyring_id);
|
||||
else
|
||||
|
|
@ -3579,6 +3579,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
|
|||
case BPF_PROG_TYPE_TRACING:
|
||||
if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
|
||||
prog->expected_attach_type != BPF_TRACE_FEXIT &&
|
||||
prog->expected_attach_type != BPF_TRACE_FSESSION &&
|
||||
prog->expected_attach_type != BPF_MODIFY_RETURN) {
|
||||
err = -EINVAL;
|
||||
goto out_put_prog;
|
||||
|
|
@ -3628,7 +3629,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
|
|||
key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
|
||||
}
|
||||
|
||||
link = kzalloc(sizeof(*link), GFP_USER);
|
||||
if (prog->expected_attach_type == BPF_TRACE_FSESSION) {
|
||||
struct bpf_fsession_link *fslink;
|
||||
|
||||
fslink = kzalloc(sizeof(*fslink), GFP_USER);
|
||||
if (fslink) {
|
||||
bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING,
|
||||
&bpf_tracing_link_lops, prog, attach_type);
|
||||
fslink->fexit.cookie = bpf_cookie;
|
||||
link = &fslink->link;
|
||||
} else {
|
||||
link = NULL;
|
||||
}
|
||||
} else {
|
||||
link = kzalloc(sizeof(*link), GFP_USER);
|
||||
}
|
||||
if (!link) {
|
||||
err = -ENOMEM;
|
||||
goto out_put_prog;
|
||||
|
|
@ -4352,6 +4367,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
|
|||
case BPF_TRACE_RAW_TP:
|
||||
case BPF_TRACE_FENTRY:
|
||||
case BPF_TRACE_FEXIT:
|
||||
case BPF_TRACE_FSESSION:
|
||||
case BPF_MODIFY_RETURN:
|
||||
return BPF_PROG_TYPE_TRACING;
|
||||
case BPF_LSM_MAC:
|
||||
|
|
@ -4565,6 +4581,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
|
|||
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
|
||||
if (IS_ERR(prog))
|
||||
return PTR_ERR(prog);
|
||||
} else if (!bpf_mprog_detach_empty(ptype)) {
|
||||
return -EPERM;
|
||||
}
|
||||
} else if (is_cgroup_prog_type(ptype, 0, false)) {
|
||||
if (attr->attach_flags || attr->relative_fd)
|
||||
|
|
@ -5310,6 +5328,9 @@ static int bpf_map_get_info_by_fd(struct file *file,
|
|||
if (info.hash_size != SHA256_DIGEST_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
if (!READ_ONCE(map->frozen))
|
||||
return -EPERM;
|
||||
|
||||
err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
|
||||
if (err != 0)
|
||||
return err;
|
||||
|
|
@ -6122,6 +6143,49 @@ static int prog_stream_read(union bpf_attr *attr)
|
|||
return ret;
|
||||
}
|
||||
|
||||
#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd
|
||||
|
||||
static int prog_assoc_struct_ops(union bpf_attr *attr)
|
||||
{
|
||||
struct bpf_prog *prog;
|
||||
struct bpf_map *map;
|
||||
int ret;
|
||||
|
||||
if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS))
|
||||
return -EINVAL;
|
||||
|
||||
if (attr->prog_assoc_struct_ops.flags)
|
||||
return -EINVAL;
|
||||
|
||||
prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd);
|
||||
if (IS_ERR(prog))
|
||||
return PTR_ERR(prog);
|
||||
|
||||
if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
|
||||
ret = -EINVAL;
|
||||
goto put_prog;
|
||||
}
|
||||
|
||||
map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd);
|
||||
if (IS_ERR(map)) {
|
||||
ret = PTR_ERR(map);
|
||||
goto put_prog;
|
||||
}
|
||||
|
||||
if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
|
||||
ret = -EINVAL;
|
||||
goto put_map;
|
||||
}
|
||||
|
||||
ret = bpf_prog_assoc_struct_ops(prog, map);
|
||||
|
||||
put_map:
|
||||
bpf_map_put(map);
|
||||
put_prog:
|
||||
bpf_prog_put(prog);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
|
||||
{
|
||||
union bpf_attr attr;
|
||||
|
|
@ -6261,6 +6325,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
|
|||
case BPF_PROG_STREAM_READ_BY_FD:
|
||||
err = prog_stream_read(&attr);
|
||||
break;
|
||||
case BPF_PROG_ASSOC_STRUCT_OPS:
|
||||
err = prog_assoc_struct_ops(&attr);
|
||||
break;
|
||||
default:
|
||||
err = -EINVAL;
|
||||
break;
|
||||
|
|
@ -6407,7 +6474,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
|
|||
.func = bpf_kallsyms_lookup_name,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM,
|
||||
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@
|
|||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/tnum.h>
|
||||
#include <linux/swab.h>
|
||||
|
||||
#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}
|
||||
/* A completely unknown value */
|
||||
|
|
@ -253,3 +254,18 @@ struct tnum tnum_const_subreg(struct tnum a, u32 value)
|
|||
{
|
||||
return tnum_with_subreg(a, tnum_const(value));
|
||||
}
|
||||
|
||||
struct tnum tnum_bswap16(struct tnum a)
|
||||
{
|
||||
return TNUM(swab16(a.value & 0xFFFF), swab16(a.mask & 0xFFFF));
|
||||
}
|
||||
|
||||
struct tnum tnum_bswap32(struct tnum a)
|
||||
{
|
||||
return TNUM(swab32(a.value & 0xFFFFFFFF), swab32(a.mask & 0xFFFFFFFF));
|
||||
}
|
||||
|
||||
struct tnum tnum_bswap64(struct tnum a)
|
||||
{
|
||||
return TNUM(swab64(a.value), swab64(a.mask));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/file.h>
|
||||
|
|
|
|||
|
|
@ -24,19 +24,49 @@ const struct bpf_prog_ops bpf_extension_prog_ops = {
|
|||
#define TRAMPOLINE_HASH_BITS 10
|
||||
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
|
||||
|
||||
static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
|
||||
static struct hlist_head trampoline_key_table[TRAMPOLINE_TABLE_SIZE];
|
||||
static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE];
|
||||
|
||||
/* serializes access to trampoline_table */
|
||||
/* serializes access to trampoline tables */
|
||||
static DEFINE_MUTEX(trampoline_mutex);
|
||||
|
||||
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
||||
static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
|
||||
|
||||
static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
|
||||
#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
|
||||
static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip)
|
||||
{
|
||||
struct bpf_trampoline *tr = ops->private;
|
||||
struct hlist_head *head_ip;
|
||||
struct bpf_trampoline *tr;
|
||||
|
||||
mutex_lock(&trampoline_mutex);
|
||||
head_ip = &trampoline_ip_table[hash_64(ip, TRAMPOLINE_HASH_BITS)];
|
||||
hlist_for_each_entry(tr, head_ip, hlist_ip) {
|
||||
if (tr->ip == ip)
|
||||
goto out;
|
||||
}
|
||||
tr = NULL;
|
||||
out:
|
||||
mutex_unlock(&trampoline_mutex);
|
||||
return tr;
|
||||
}
|
||||
#else
|
||||
static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip)
|
||||
{
|
||||
return ops->private;
|
||||
}
|
||||
#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */
|
||||
|
||||
static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
|
||||
enum ftrace_ops_cmd cmd)
|
||||
{
|
||||
struct bpf_trampoline *tr;
|
||||
int ret = 0;
|
||||
|
||||
tr = direct_ops_ip_lookup(ops, ip);
|
||||
if (!tr)
|
||||
return -EINVAL;
|
||||
|
||||
if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
|
||||
/* This is called inside register_ftrace_direct_multi(), so
|
||||
* tr->mutex is already locked.
|
||||
|
|
@ -109,10 +139,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
|
|||
enum bpf_attach_type eatype = prog->expected_attach_type;
|
||||
enum bpf_prog_type ptype = prog->type;
|
||||
|
||||
return (ptype == BPF_PROG_TYPE_TRACING &&
|
||||
(eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
|
||||
eatype == BPF_MODIFY_RETURN)) ||
|
||||
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
|
||||
switch (ptype) {
|
||||
case BPF_PROG_TYPE_TRACING:
|
||||
if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
|
||||
eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION)
|
||||
return true;
|
||||
return false;
|
||||
case BPF_PROG_TYPE_LSM:
|
||||
return eatype == BPF_LSM_MAC;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
|
||||
|
|
@ -135,15 +172,171 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)
|
|||
PAGE_SIZE, true, ksym->name);
|
||||
}
|
||||
|
||||
static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
|
||||
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
||||
#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
|
||||
/*
|
||||
* We have only a single direct_ops, which contains all the direct call
|
||||
* sites and is the only global ftrace_ops for all trampolines.
|
||||
*
|
||||
* We use 'update_ftrace_direct_*' api for attachment.
|
||||
*/
|
||||
struct ftrace_ops direct_ops = {
|
||||
.ops_func = bpf_tramp_ftrace_ops_func,
|
||||
};
|
||||
|
||||
static int direct_ops_alloc(struct bpf_trampoline *tr)
|
||||
{
|
||||
tr->fops = &direct_ops;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Single-ops variant: intentionally empty, nothing is released here. */
static void direct_ops_free(struct bpf_trampoline *tr)
{
}
|
||||
|
||||
static struct ftrace_hash *hash_from_ip(struct bpf_trampoline *tr, void *ptr)
|
||||
{
|
||||
unsigned long ip, addr = (unsigned long) ptr;
|
||||
struct ftrace_hash *hash;
|
||||
|
||||
ip = ftrace_location(tr->ip);
|
||||
if (!ip)
|
||||
return NULL;
|
||||
hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
|
||||
if (!hash)
|
||||
return NULL;
|
||||
if (bpf_trampoline_use_jmp(tr->flags))
|
||||
addr = ftrace_jmp_set(addr);
|
||||
if (!add_ftrace_hash_entry_direct(hash, ip, addr)) {
|
||||
free_ftrace_hash(hash);
|
||||
return NULL;
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
static int direct_ops_add(struct bpf_trampoline *tr, void *addr)
|
||||
{
|
||||
struct ftrace_hash *hash = hash_from_ip(tr, addr);
|
||||
int err;
|
||||
|
||||
if (!hash)
|
||||
return -ENOMEM;
|
||||
err = update_ftrace_direct_add(tr->fops, hash);
|
||||
free_ftrace_hash(hash);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int direct_ops_del(struct bpf_trampoline *tr, void *addr)
|
||||
{
|
||||
struct ftrace_hash *hash = hash_from_ip(tr, addr);
|
||||
int err;
|
||||
|
||||
if (!hash)
|
||||
return -ENOMEM;
|
||||
err = update_ftrace_direct_del(tr->fops, hash);
|
||||
free_ftrace_hash(hash);
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * Single-ops variant: retarget @tr to @addr via update_ftrace_direct_mod().
 * @lock_direct_mutex is forwarded unchanged to update_ftrace_direct_mod().
 * Returns -ENOMEM when hash_from_ip() yields no hash, otherwise the
 * result of update_ftrace_direct_mod(). The temporary hash is always freed.
 */
static int direct_ops_mod(struct bpf_trampoline *tr, void *addr, bool lock_direct_mutex)
{
	struct ftrace_hash *hash;
	int ret;

	hash = hash_from_ip(tr, addr);
	if (hash == NULL)
		return -ENOMEM;

	ret = update_ftrace_direct_mod(tr->fops, hash, lock_direct_mutex);
	free_ftrace_hash(hash);

	return ret;
}
|
||||
#else
|
||||
/*
|
||||
* We allocate an ftrace_ops object for each trampoline and it contains
|
||||
* the call site specific to that trampoline.
|
||||
*
|
||||
* We use *_ftrace_direct api for attachment.
|
||||
*/
|
||||
static int direct_ops_alloc(struct bpf_trampoline *tr)
|
||||
{
|
||||
tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
|
||||
if (!tr->fops)
|
||||
return -ENOMEM;
|
||||
tr->fops->private = tr;
|
||||
tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void direct_ops_free(struct bpf_trampoline *tr)
|
||||
{
|
||||
if (!tr->fops)
|
||||
return;
|
||||
ftrace_free_filter(tr->fops);
|
||||
kfree(tr->fops);
|
||||
}
|
||||
|
||||
static int direct_ops_add(struct bpf_trampoline *tr, void *ptr)
|
||||
{
|
||||
unsigned long addr = (unsigned long) ptr;
|
||||
struct ftrace_ops *ops = tr->fops;
|
||||
int ret;
|
||||
|
||||
if (bpf_trampoline_use_jmp(tr->flags))
|
||||
addr = ftrace_jmp_set(addr);
|
||||
|
||||
ret = ftrace_set_filter_ip(ops, tr->ip, 0, 1);
|
||||
if (ret)
|
||||
return ret;
|
||||
return register_ftrace_direct(ops, addr);
|
||||
}
|
||||
|
||||
static int direct_ops_del(struct bpf_trampoline *tr, void *addr)
|
||||
{
|
||||
return unregister_ftrace_direct(tr->fops, (long)addr, false);
|
||||
}
|
||||
|
||||
/*
 * Point the trampoline's own ftrace_ops at the new direct caller @ptr.
 *
 * The address is passed through ftrace_jmp_set() first when
 * bpf_trampoline_use_jmp() is true for tr->flags. @lock_direct_mutex
 * selects between modify_ftrace_direct() (true) and
 * modify_ftrace_direct_nolock() (false).
 *
 * Returns the result of the chosen modify call.
 */
static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex)
{
	unsigned long target = (unsigned long)ptr;

	if (bpf_trampoline_use_jmp(tr->flags))
		target = ftrace_jmp_set(target);

	if (!lock_direct_mutex)
		return modify_ftrace_direct_nolock(tr->fops, target);

	return modify_ftrace_direct(tr->fops, target);
}
|
||||
#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */
|
||||
#else
|
||||
/* Fallback stub: this configuration has no ftrace_ops to release. */
static void direct_ops_free(struct bpf_trampoline *tr)
{
}
|
||||
|
||||
/* Fallback stub: nothing is allocated for @tr, so this always succeeds. */
static int direct_ops_alloc(struct bpf_trampoline *tr)
{
	return 0;
}
|
||||
|
||||
/* Fallback stub: attaching a direct caller is unavailable, always -ENODEV. */
static int direct_ops_add(struct bpf_trampoline *tr, void *addr)
{
	return -ENODEV;
}
|
||||
|
||||
/* Fallback stub: detaching a direct caller is unavailable, always -ENODEV. */
static int direct_ops_del(struct bpf_trampoline *tr, void *addr)
{
	return -ENODEV;
}
|
||||
|
||||
/* Fallback stub: modifying a direct caller is unavailable, always -ENODEV. */
static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex)
{
	return -ENODEV;
}
|
||||
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
|
||||
|
||||
static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip)
|
||||
{
|
||||
struct bpf_trampoline *tr;
|
||||
struct hlist_head *head;
|
||||
int i;
|
||||
|
||||
mutex_lock(&trampoline_mutex);
|
||||
head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
|
||||
hlist_for_each_entry(tr, head, hlist) {
|
||||
head = &trampoline_key_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
|
||||
hlist_for_each_entry(tr, head, hlist_key) {
|
||||
if (tr->key == key) {
|
||||
refcount_inc(&tr->refcnt);
|
||||
goto out;
|
||||
|
|
@ -152,20 +345,19 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
|
|||
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
|
||||
if (!tr)
|
||||
goto out;
|
||||
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
||||
tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
|
||||
if (!tr->fops) {
|
||||
if (direct_ops_alloc(tr)) {
|
||||
kfree(tr);
|
||||
tr = NULL;
|
||||
goto out;
|
||||
}
|
||||
tr->fops->private = tr;
|
||||
tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
|
||||
#endif
|
||||
|
||||
tr->key = key;
|
||||
INIT_HLIST_NODE(&tr->hlist);
|
||||
hlist_add_head(&tr->hlist, head);
|
||||
tr->ip = ftrace_location(ip);
|
||||
INIT_HLIST_NODE(&tr->hlist_key);
|
||||
INIT_HLIST_NODE(&tr->hlist_ip);
|
||||
hlist_add_head(&tr->hlist_key, head);
|
||||
head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)];
|
||||
hlist_add_head(&tr->hlist_ip, head);
|
||||
refcount_set(&tr->refcnt, 1);
|
||||
mutex_init(&tr->mutex);
|
||||
for (i = 0; i < BPF_TRAMP_MAX; i++)
|
||||
|
|
@ -200,7 +392,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
|
|||
int ret;
|
||||
|
||||
if (tr->func.ftrace_managed)
|
||||
ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
|
||||
ret = direct_ops_del(tr, old_addr);
|
||||
else
|
||||
ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
|
||||
|
||||
|
|
@ -214,10 +406,7 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
|
|||
int ret;
|
||||
|
||||
if (tr->func.ftrace_managed) {
|
||||
if (lock_direct_mutex)
|
||||
ret = modify_ftrace_direct(tr->fops, (long)new_addr);
|
||||
else
|
||||
ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
|
||||
ret = direct_ops_mod(tr, new_addr, lock_direct_mutex);
|
||||
} else {
|
||||
ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
|
||||
new_addr);
|
||||
|
|
@ -240,10 +429,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
|
|||
}
|
||||
|
||||
if (tr->func.ftrace_managed) {
|
||||
ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
|
||||
if (ret)
|
||||
return ret;
|
||||
ret = register_ftrace_direct(tr->fops, (long)new_addr);
|
||||
ret = direct_ops_add(tr, new_addr);
|
||||
} else {
|
||||
ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
|
||||
}
|
||||
|
|
@ -499,13 +685,6 @@ again:
|
|||
if (err)
|
||||
goto out_free;
|
||||
|
||||
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
|
||||
if (bpf_trampoline_use_jmp(tr->flags))
|
||||
tr->fops->flags |= FTRACE_OPS_FL_JMP;
|
||||
else
|
||||
tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
|
||||
#endif
|
||||
|
||||
WARN_ON(tr->cur_image && total == 0);
|
||||
if (tr->cur_image)
|
||||
/* progs already running at this address */
|
||||
|
|
@ -533,15 +712,8 @@ again:
|
|||
tr->cur_image = im;
|
||||
out:
|
||||
/* If any error happens, restore previous flags */
|
||||
if (err) {
|
||||
if (err)
|
||||
tr->flags = orig_flags;
|
||||
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
|
||||
if (bpf_trampoline_use_jmp(tr->flags))
|
||||
tr->fops->flags |= FTRACE_OPS_FL_JMP;
|
||||
else
|
||||
tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
|
||||
#endif
|
||||
}
|
||||
kfree(tlinks);
|
||||
return err;
|
||||
|
||||
|
|
@ -559,6 +731,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
|
|||
return BPF_TRAMP_MODIFY_RETURN;
|
||||
case BPF_TRACE_FEXIT:
|
||||
return BPF_TRAMP_FEXIT;
|
||||
case BPF_TRACE_FSESSION:
|
||||
return BPF_TRAMP_FSESSION;
|
||||
case BPF_LSM_MAC:
|
||||
if (!prog->aux->attach_func_proto->type)
|
||||
/* The function returns void, we cannot modify its
|
||||
|
|
@ -594,8 +768,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
|
|||
struct bpf_trampoline *tr,
|
||||
struct bpf_prog *tgt_prog)
|
||||
{
|
||||
struct bpf_fsession_link *fslink = NULL;
|
||||
enum bpf_tramp_prog_type kind;
|
||||
struct bpf_tramp_link *link_exiting;
|
||||
struct hlist_head *prog_list;
|
||||
int err = 0;
|
||||
int cnt = 0, i;
|
||||
|
||||
|
|
@ -621,24 +797,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
|
|||
BPF_MOD_JUMP, NULL,
|
||||
link->link.prog->bpf_func);
|
||||
}
|
||||
if (kind == BPF_TRAMP_FSESSION) {
|
||||
prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY];
|
||||
cnt++;
|
||||
} else {
|
||||
prog_list = &tr->progs_hlist[kind];
|
||||
}
|
||||
if (cnt >= BPF_MAX_TRAMP_LINKS)
|
||||
return -E2BIG;
|
||||
if (!hlist_unhashed(&link->tramp_hlist))
|
||||
/* prog already linked */
|
||||
return -EBUSY;
|
||||
hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
|
||||
hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) {
|
||||
if (link_exiting->link.prog != link->link.prog)
|
||||
continue;
|
||||
/* prog already linked */
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
|
||||
tr->progs_cnt[kind]++;
|
||||
hlist_add_head(&link->tramp_hlist, prog_list);
|
||||
if (kind == BPF_TRAMP_FSESSION) {
|
||||
tr->progs_cnt[BPF_TRAMP_FENTRY]++;
|
||||
fslink = container_of(link, struct bpf_fsession_link, link.link);
|
||||
hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]);
|
||||
tr->progs_cnt[BPF_TRAMP_FEXIT]++;
|
||||
} else {
|
||||
tr->progs_cnt[kind]++;
|
||||
}
|
||||
err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
|
||||
if (err) {
|
||||
hlist_del_init(&link->tramp_hlist);
|
||||
tr->progs_cnt[kind]--;
|
||||
if (kind == BPF_TRAMP_FSESSION) {
|
||||
tr->progs_cnt[BPF_TRAMP_FENTRY]--;
|
||||
hlist_del_init(&fslink->fexit.tramp_hlist);
|
||||
tr->progs_cnt[BPF_TRAMP_FEXIT]--;
|
||||
} else {
|
||||
tr->progs_cnt[kind]--;
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
|
@ -672,6 +867,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
|
|||
guard(mutex)(&tgt_prog->aux->ext_mutex);
|
||||
tgt_prog->aux->is_extended = false;
|
||||
return err;
|
||||
} else if (kind == BPF_TRAMP_FSESSION) {
|
||||
struct bpf_fsession_link *fslink =
|
||||
container_of(link, struct bpf_fsession_link, link.link);
|
||||
|
||||
hlist_del_init(&fslink->fexit.tramp_hlist);
|
||||
tr->progs_cnt[BPF_TRAMP_FEXIT]--;
|
||||
kind = BPF_TRAMP_FENTRY;
|
||||
}
|
||||
hlist_del_init(&link->tramp_hlist);
|
||||
tr->progs_cnt[kind]--;
|
||||
|
|
@ -850,7 +1052,7 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
|
|||
prog->aux->attach_btf_id);
|
||||
|
||||
bpf_lsm_find_cgroup_shim(prog, &bpf_func);
|
||||
tr = bpf_trampoline_lookup(key);
|
||||
tr = bpf_trampoline_lookup(key, 0);
|
||||
if (WARN_ON_ONCE(!tr))
|
||||
return;
|
||||
|
||||
|
|
@ -870,7 +1072,7 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key,
|
|||
{
|
||||
struct bpf_trampoline *tr;
|
||||
|
||||
tr = bpf_trampoline_lookup(key);
|
||||
tr = bpf_trampoline_lookup(key, tgt_info->tgt_addr);
|
||||
if (!tr)
|
||||
return NULL;
|
||||
|
||||
|
|
@ -906,11 +1108,9 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
|
|||
* fexit progs. The fentry-only trampoline will be freed via
|
||||
* multiple rcu callbacks.
|
||||
*/
|
||||
hlist_del(&tr->hlist);
|
||||
if (tr->fops) {
|
||||
ftrace_free_filter(tr->fops);
|
||||
kfree(tr->fops);
|
||||
}
|
||||
hlist_del(&tr->hlist_key);
|
||||
hlist_del(&tr->hlist_ip);
|
||||
direct_ops_free(tr);
|
||||
kfree(tr);
|
||||
out:
|
||||
mutex_unlock(&trampoline_mutex);
|
||||
|
|
@ -949,7 +1149,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
|
|||
|
||||
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
|
||||
|
||||
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
|
||||
if (unlikely(!bpf_prog_get_recursion_context(prog))) {
|
||||
bpf_prog_inc_misses_counter(prog);
|
||||
if (prog->aux->recursion_detected)
|
||||
prog->aux->recursion_detected(prog);
|
||||
|
|
@ -993,7 +1193,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
|
|||
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
|
||||
|
||||
update_prog_stats(prog, start);
|
||||
this_cpu_dec(*(prog->active));
|
||||
bpf_prog_put_recursion_context(prog);
|
||||
rcu_read_unlock_migrate();
|
||||
}
|
||||
|
||||
|
|
@ -1029,7 +1229,7 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
|
|||
|
||||
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
|
||||
|
||||
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
|
||||
if (unlikely(!bpf_prog_get_recursion_context(prog))) {
|
||||
bpf_prog_inc_misses_counter(prog);
|
||||
if (prog->aux->recursion_detected)
|
||||
prog->aux->recursion_detected(prog);
|
||||
|
|
@ -1044,7 +1244,7 @@ void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
|
|||
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
|
||||
|
||||
update_prog_stats(prog, start);
|
||||
this_cpu_dec(*(prog->active));
|
||||
bpf_prog_put_recursion_context(prog);
|
||||
migrate_enable();
|
||||
rcu_read_unlock_trace();
|
||||
}
|
||||
|
|
@ -1179,7 +1379,9 @@ static int __init init_trampolines(void)
|
|||
int i;
|
||||
|
||||
for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
|
||||
INIT_HLIST_HEAD(&trampoline_table[i]);
|
||||
INIT_HLIST_HEAD(&trampoline_key_table[i]);
|
||||
for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
|
||||
INIT_HLIST_HEAD(&trampoline_ip_table[i]);
|
||||
return 0;
|
||||
}
|
||||
late_initcall(init_trampolines);
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -7275,9 +7275,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
|
|||
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
|
||||
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
|
||||
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
|
||||
BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, scx_bpf_exit_bstr)
|
||||
BTF_ID_FLAGS(func, scx_bpf_error_bstr)
|
||||
BTF_ID_FLAGS(func, scx_bpf_dump_bstr)
|
||||
BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
|
||||
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
|
||||
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
|
||||
|
|
@ -7296,7 +7296,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
|
|||
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
|
||||
#endif
|
||||
BTF_ID_FLAGS(func, scx_bpf_now)
|
||||
BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, scx_bpf_events)
|
||||
BTF_KFUNCS_END(scx_kfunc_ids_any)
|
||||
|
||||
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
|
||||
|
|
|
|||
|
|
@ -50,6 +50,9 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
|
|||
config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
||||
bool
|
||||
|
||||
config HAVE_SINGLE_FTRACE_DIRECT_OPS
|
||||
bool
|
||||
|
||||
config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
|
||||
bool
|
||||
|
||||
|
|
|
|||
|
|
@ -830,7 +830,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc
|
|||
info.si_code = SI_KERNEL;
|
||||
info.si_pid = 0;
|
||||
info.si_uid = 0;
|
||||
info.si_value.sival_ptr = (void *)(unsigned long)value;
|
||||
info.si_value.sival_ptr = (void __user __force *)(unsigned long)value;
|
||||
siginfo = &info;
|
||||
}
|
||||
|
||||
|
|
@ -1022,7 +1022,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
|
|||
.func = bpf_snprintf_btf,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM,
|
||||
.arg1_type = ARG_PTR_TO_MEM | MEM_WRITE,
|
||||
.arg2_type = ARG_CONST_SIZE,
|
||||
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg4_type = ARG_CONST_SIZE,
|
||||
|
|
@ -1194,7 +1194,7 @@ const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
|
|||
BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
|
||||
{
|
||||
/* This helper call is inlined by verifier. */
|
||||
u64 nr_args = ((u64 *)ctx)[-1];
|
||||
u64 nr_args = ((u64 *)ctx)[-1] & 0xFF;
|
||||
|
||||
if ((u64) n >= nr_args)
|
||||
return -EINVAL;
|
||||
|
|
@ -1214,7 +1214,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = {
|
|||
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
|
||||
{
|
||||
/* This helper call is inlined by verifier. */
|
||||
u64 nr_args = ((u64 *)ctx)[-1];
|
||||
u64 nr_args = ((u64 *)ctx)[-1] & 0xFF;
|
||||
|
||||
*value = ((u64 *)ctx)[nr_args];
|
||||
return 0;
|
||||
|
|
@ -1231,7 +1231,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = {
|
|||
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
|
||||
{
|
||||
/* This helper call is inlined by verifier. */
|
||||
return ((u64 *)ctx)[-1];
|
||||
return ((u64 *)ctx)[-1] & 0xFF;
|
||||
}
|
||||
|
||||
static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
|
||||
|
|
@ -1286,7 +1286,8 @@ static bool is_kprobe_multi(const struct bpf_prog *prog)
|
|||
|
||||
static inline bool is_kprobe_session(const struct bpf_prog *prog)
|
||||
{
|
||||
return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
|
||||
return prog->type == BPF_PROG_TYPE_KPROBE &&
|
||||
prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
|
||||
}
|
||||
|
||||
static inline bool is_uprobe_multi(const struct bpf_prog *prog)
|
||||
|
|
@ -1297,7 +1298,14 @@ static inline bool is_uprobe_multi(const struct bpf_prog *prog)
|
|||
|
||||
static inline bool is_uprobe_session(const struct bpf_prog *prog)
|
||||
{
|
||||
return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
|
||||
return prog->type == BPF_PROG_TYPE_KPROBE &&
|
||||
prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
|
||||
}
|
||||
|
||||
static inline bool is_trace_fsession(const struct bpf_prog *prog)
|
||||
{
|
||||
return prog->type == BPF_PROG_TYPE_TRACING &&
|
||||
prog->expected_attach_type == BPF_TRACE_FSESSION;
|
||||
}
|
||||
|
||||
static const struct bpf_func_proto *
|
||||
|
|
@ -1526,7 +1534,7 @@ static const struct bpf_func_proto bpf_read_branch_records_proto = {
|
|||
.gpl_only = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM_OR_NULL,
|
||||
.arg2_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
|
@ -1661,7 +1669,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
|
|||
.gpl_only = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
|
@ -1734,11 +1742,17 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
|||
case BPF_FUNC_d_path:
|
||||
return &bpf_d_path_proto;
|
||||
case BPF_FUNC_get_func_arg:
|
||||
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
|
||||
if (bpf_prog_has_trampoline(prog) ||
|
||||
prog->expected_attach_type == BPF_TRACE_RAW_TP)
|
||||
return &bpf_get_func_arg_proto;
|
||||
return NULL;
|
||||
case BPF_FUNC_get_func_ret:
|
||||
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
|
||||
case BPF_FUNC_get_func_arg_cnt:
|
||||
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
|
||||
if (bpf_prog_has_trampoline(prog) ||
|
||||
prog->expected_attach_type == BPF_TRACE_RAW_TP)
|
||||
return &bpf_get_func_arg_cnt_proto;
|
||||
return NULL;
|
||||
case BPF_FUNC_get_attach_cookie:
|
||||
if (prog->type == BPF_PROG_TYPE_TRACING &&
|
||||
prog->expected_attach_type == BPF_TRACE_RAW_TP)
|
||||
|
|
@ -2063,7 +2077,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
|
|||
struct bpf_trace_run_ctx run_ctx;
|
||||
|
||||
cant_sleep();
|
||||
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
|
||||
if (unlikely(!bpf_prog_get_recursion_context(prog))) {
|
||||
bpf_prog_inc_misses_counter(prog);
|
||||
goto out;
|
||||
}
|
||||
|
|
@ -2077,7 +2091,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
|
|||
|
||||
bpf_reset_run_ctx(old_run_ctx);
|
||||
out:
|
||||
this_cpu_dec(*(prog->active));
|
||||
bpf_prog_put_recursion_context(prog);
|
||||
}
|
||||
|
||||
#define UNPACK(...) __VA_ARGS__
|
||||
|
|
@ -2564,6 +2578,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
|
|||
old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
|
||||
err = bpf_prog_run(link->link.prog, regs);
|
||||
bpf_reset_run_ctx(old_run_ctx);
|
||||
ftrace_partial_regs_update(fregs, bpf_kprobe_multi_pt_regs_ptr());
|
||||
rcu_read_unlock();
|
||||
|
||||
out:
|
||||
|
|
@ -3316,7 +3331,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
|
|||
|
||||
__bpf_kfunc_start_defs();
|
||||
|
||||
__bpf_kfunc bool bpf_session_is_return(void)
|
||||
__bpf_kfunc bool bpf_session_is_return(void *ctx)
|
||||
{
|
||||
struct bpf_session_run_ctx *session_ctx;
|
||||
|
||||
|
|
@ -3324,7 +3339,7 @@ __bpf_kfunc bool bpf_session_is_return(void)
|
|||
return session_ctx->is_return;
|
||||
}
|
||||
|
||||
__bpf_kfunc __u64 *bpf_session_cookie(void)
|
||||
__bpf_kfunc __u64 *bpf_session_cookie(void *ctx)
|
||||
{
|
||||
struct bpf_session_run_ctx *session_ctx;
|
||||
|
||||
|
|
@ -3334,34 +3349,39 @@ __bpf_kfunc __u64 *bpf_session_cookie(void)
|
|||
|
||||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids)
|
||||
BTF_KFUNCS_START(session_kfunc_set_ids)
|
||||
BTF_ID_FLAGS(func, bpf_session_is_return)
|
||||
BTF_ID_FLAGS(func, bpf_session_cookie)
|
||||
BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids)
|
||||
BTF_KFUNCS_END(session_kfunc_set_ids)
|
||||
|
||||
static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
|
||||
static int bpf_session_filter(const struct bpf_prog *prog, u32 kfunc_id)
|
||||
{
|
||||
if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
|
||||
if (!btf_id_set8_contains(&session_kfunc_set_ids, kfunc_id))
|
||||
return 0;
|
||||
|
||||
if (!is_kprobe_session(prog) && !is_uprobe_session(prog))
|
||||
if (!is_kprobe_session(prog) && !is_uprobe_session(prog) && !is_trace_fsession(prog))
|
||||
return -EACCES;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = {
|
||||
static const struct btf_kfunc_id_set bpf_session_kfunc_set = {
|
||||
.owner = THIS_MODULE,
|
||||
.set = &kprobe_multi_kfunc_set_ids,
|
||||
.filter = bpf_kprobe_multi_filter,
|
||||
.set = &session_kfunc_set_ids,
|
||||
.filter = bpf_session_filter,
|
||||
};
|
||||
|
||||
static int __init bpf_kprobe_multi_kfuncs_init(void)
|
||||
static int __init bpf_trace_kfuncs_init(void)
|
||||
{
|
||||
return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set);
|
||||
int err = 0;
|
||||
|
||||
err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_session_kfunc_set);
|
||||
err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_session_kfunc_set);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
late_initcall(bpf_kprobe_multi_kfuncs_init);
|
||||
late_initcall(bpf_trace_kfuncs_init);
|
||||
|
||||
typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk);
|
||||
|
||||
|
|
@ -3517,7 +3537,7 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
|
|||
__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
|
||||
u64 size, const void __user *unsafe_ptr__ign)
|
||||
{
|
||||
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
|
||||
return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
|
||||
copy_user_data_nofault, NULL);
|
||||
}
|
||||
|
||||
|
|
@ -3531,7 +3551,7 @@ __bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
|
|||
__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
|
||||
u64 size, const void __user *unsafe_ptr__ign)
|
||||
{
|
||||
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
|
||||
return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
|
||||
copy_user_str_nofault, NULL);
|
||||
}
|
||||
|
||||
|
|
@ -3545,14 +3565,14 @@ __bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 of
|
|||
__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
|
||||
u64 size, const void __user *unsafe_ptr__ign)
|
||||
{
|
||||
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
|
||||
return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
|
||||
copy_user_data_sleepable, NULL);
|
||||
}
|
||||
|
||||
__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
|
||||
u64 size, const void __user *unsafe_ptr__ign)
|
||||
{
|
||||
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
|
||||
return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
|
||||
copy_user_str_sleepable, NULL);
|
||||
}
|
||||
|
||||
|
|
@ -3560,7 +3580,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
|
|||
u64 size, const void __user *unsafe_ptr__ign,
|
||||
struct task_struct *tsk)
|
||||
{
|
||||
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
|
||||
return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
|
||||
copy_user_data_sleepable, tsk);
|
||||
}
|
||||
|
||||
|
|
@ -3568,7 +3588,7 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64
|
|||
u64 size, const void __user *unsafe_ptr__ign,
|
||||
struct task_struct *tsk)
|
||||
{
|
||||
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
|
||||
return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
|
||||
copy_user_str_sleepable, tsk);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -68,7 +68,6 @@
|
|||
})
|
||||
|
||||
/* hash bits for specific function selection */
|
||||
#define FTRACE_HASH_DEFAULT_BITS 10
|
||||
#define FTRACE_HASH_MAX_BITS 12
|
||||
|
||||
#ifdef CONFIG_DYNAMIC_FTRACE
|
||||
|
|
@ -1210,8 +1209,8 @@ static void __add_hash_entry(struct ftrace_hash *hash,
|
|||
hash->count++;
|
||||
}
|
||||
|
||||
static struct ftrace_func_entry *
|
||||
add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
|
||||
struct ftrace_func_entry *
|
||||
add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct)
|
||||
{
|
||||
struct ftrace_func_entry *entry;
|
||||
|
||||
|
|
@ -1220,11 +1219,18 @@ add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
|
|||
return NULL;
|
||||
|
||||
entry->ip = ip;
|
||||
entry->direct = direct;
|
||||
__add_hash_entry(hash, entry);
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
/*
 * Add @ip to @hash with its direct address set to 0.
 *
 * Thin wrapper around add_ftrace_hash_entry_direct(); returns whatever
 * that call returns.
 */
static struct ftrace_func_entry *
add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
{
	const unsigned long no_direct = 0;

	return add_ftrace_hash_entry_direct(hash, ip, no_direct);
}
|
||||
|
||||
static void
|
||||
free_hash_entry(struct ftrace_hash *hash,
|
||||
struct ftrace_func_entry *entry)
|
||||
|
|
@ -1283,7 +1289,7 @@ static void clear_ftrace_mod_list(struct list_head *head)
|
|||
mutex_unlock(&ftrace_lock);
|
||||
}
|
||||
|
||||
static void free_ftrace_hash(struct ftrace_hash *hash)
|
||||
void free_ftrace_hash(struct ftrace_hash *hash)
|
||||
{
|
||||
if (!hash || hash == EMPTY_HASH)
|
||||
return;
|
||||
|
|
@ -1323,7 +1329,7 @@ void ftrace_free_filter(struct ftrace_ops *ops)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(ftrace_free_filter);
|
||||
|
||||
static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
|
||||
struct ftrace_hash *alloc_ftrace_hash(int size_bits)
|
||||
{
|
||||
struct ftrace_hash *hash;
|
||||
int size;
|
||||
|
|
@ -1397,7 +1403,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
|
|||
size = 1 << hash->size_bits;
|
||||
for (i = 0; i < size; i++) {
|
||||
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
|
||||
if (add_hash_entry(new_hash, entry->ip) == NULL)
|
||||
if (add_ftrace_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL)
|
||||
goto free_hash;
|
||||
}
|
||||
}
|
||||
|
|
@ -2068,7 +2074,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
|
|||
*/
|
||||
if (!ops->ops_func)
|
||||
return -EBUSY;
|
||||
ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
|
||||
ret = ops->ops_func(ops, rec->ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
|
||||
if (ret)
|
||||
return ret;
|
||||
} else if (is_ipmodify) {
|
||||
|
|
@ -2624,8 +2630,13 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
|
|||
static void call_direct_funcs(unsigned long ip, unsigned long pip,
|
||||
struct ftrace_ops *ops, struct ftrace_regs *fregs)
|
||||
{
|
||||
unsigned long addr = READ_ONCE(ops->direct_call);
|
||||
unsigned long addr;
|
||||
|
||||
#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
|
||||
addr = ftrace_find_rec_direct(ip);
|
||||
#else
|
||||
addr = READ_ONCE(ops->direct_call);
|
||||
#endif
|
||||
if (!addr)
|
||||
return;
|
||||
|
||||
|
|
@ -6049,15 +6060,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
|
|||
if (ftrace_hash_empty(hash))
|
||||
return -EINVAL;
|
||||
|
||||
/* This is a "raw" address, and this should never happen. */
|
||||
if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&direct_mutex);
|
||||
|
||||
if (ops->flags & FTRACE_OPS_FL_JMP)
|
||||
addr = ftrace_jmp_set(addr);
|
||||
|
||||
/* Make sure requested entries are not already registered.. */
|
||||
size = 1 << hash->size_bits;
|
||||
for (i = 0; i < size; i++) {
|
||||
|
|
@ -6178,13 +6182,6 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
|
|||
|
||||
lockdep_assert_held_once(&direct_mutex);
|
||||
|
||||
/* This is a "raw" address, and this should never happen. */
|
||||
if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
|
||||
return -EINVAL;
|
||||
|
||||
if (ops->flags & FTRACE_OPS_FL_JMP)
|
||||
addr = ftrace_jmp_set(addr);
|
||||
|
||||
/* Enable the tmp_ops to have the same functions as the direct ops */
|
||||
ftrace_ops_init(&tmp_ops);
|
||||
tmp_ops.func_hash = ops->func_hash;
|
||||
|
|
@ -6289,6 +6286,368 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
|
|||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
|
||||
|
||||
static unsigned long hash_count(struct ftrace_hash *hash)
|
||||
{
|
||||
return hash ? hash->count : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* hash_add - adds two struct ftrace_hash and returns the result
|
||||
* @a: struct ftrace_hash object
|
||||
* @b: struct ftrace_hash object
|
||||
*
|
||||
* Returns struct ftrace_hash object on success, NULL on error.
|
||||
*/
|
||||
static struct ftrace_hash *hash_add(struct ftrace_hash *a, struct ftrace_hash *b)
|
||||
{
|
||||
struct ftrace_func_entry *entry;
|
||||
struct ftrace_hash *add;
|
||||
int size;
|
||||
|
||||
size = hash_count(a) + hash_count(b);
|
||||
if (size > 32)
|
||||
size = 32;
|
||||
|
||||
add = alloc_and_copy_ftrace_hash(fls(size), a);
|
||||
if (!add)
|
||||
return NULL;
|
||||
|
||||
size = 1 << b->size_bits;
|
||||
for (int i = 0; i < size; i++) {
|
||||
hlist_for_each_entry(entry, &b->buckets[i], hlist) {
|
||||
if (add_ftrace_hash_entry_direct(add, entry->ip, entry->direct) == NULL) {
|
||||
free_ftrace_hash(add);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
return add;
|
||||
}
|
||||
|
||||
/**
|
||||
* update_ftrace_direct_add - Updates @ops by adding direct
|
||||
* callers provided in @hash
|
||||
* @ops: The address of the struct ftrace_ops object
|
||||
* @hash: The address of the struct ftrace_hash object
|
||||
*
|
||||
* This is used to add custom direct callers (ip -> addr) to @ops,
|
||||
* specified in @hash. The @ops will be either registered or updated.
|
||||
*
|
||||
* Returns: zero on success. Non zero on error, which includes:
|
||||
* -EINVAL - The @hash is empty
|
||||
*/
|
||||
int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash)
|
||||
{
|
||||
struct ftrace_hash *old_direct_functions = NULL;
|
||||
struct ftrace_hash *new_direct_functions;
|
||||
struct ftrace_hash *old_filter_hash;
|
||||
struct ftrace_hash *new_filter_hash = NULL;
|
||||
struct ftrace_func_entry *entry;
|
||||
int err = -EINVAL;
|
||||
int size;
|
||||
bool reg;
|
||||
|
||||
if (!hash_count(hash))
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&direct_mutex);
|
||||
|
||||
/* Make sure requested entries are not already registered. */
|
||||
size = 1 << hash->size_bits;
|
||||
for (int i = 0; i < size; i++) {
|
||||
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
|
||||
if (__ftrace_lookup_ip(direct_functions, entry->ip))
|
||||
goto out_unlock;
|
||||
}
|
||||
}
|
||||
|
||||
old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
|
||||
|
||||
/* If there's nothing in filter_hash we need to register the ops. */
|
||||
reg = hash_count(old_filter_hash) == 0;
|
||||
if (reg) {
|
||||
if (ops->func || ops->trampoline)
|
||||
goto out_unlock;
|
||||
if (ops->flags & FTRACE_OPS_FL_ENABLED)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
err = -ENOMEM;
|
||||
new_filter_hash = hash_add(old_filter_hash, hash);
|
||||
if (!new_filter_hash)
|
||||
goto out_unlock;
|
||||
|
||||
new_direct_functions = hash_add(direct_functions, hash);
|
||||
if (!new_direct_functions)
|
||||
goto out_unlock;
|
||||
|
||||
old_direct_functions = direct_functions;
|
||||
rcu_assign_pointer(direct_functions, new_direct_functions);
|
||||
|
||||
if (reg) {
|
||||
ops->func = call_direct_funcs;
|
||||
ops->flags |= MULTI_FLAGS;
|
||||
ops->trampoline = FTRACE_REGS_ADDR;
|
||||
ops->local_hash.filter_hash = new_filter_hash;
|
||||
|
||||
err = register_ftrace_function_nolock(ops);
|
||||
if (err) {
|
||||
/* restore old filter on error */
|
||||
ops->local_hash.filter_hash = old_filter_hash;
|
||||
|
||||
/* cleanup for possible another register call */
|
||||
ops->func = NULL;
|
||||
ops->trampoline = 0;
|
||||
} else {
|
||||
new_filter_hash = old_filter_hash;
|
||||
}
|
||||
} else {
|
||||
err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH);
|
||||
/*
|
||||
* new_filter_hash is dup-ed, so we need to release it anyway,
|
||||
* old_filter_hash either stays on error or is already released
|
||||
*/
|
||||
}
|
||||
|
||||
if (err) {
|
||||
/* reset direct_functions and free the new one */
|
||||
rcu_assign_pointer(direct_functions, old_direct_functions);
|
||||
old_direct_functions = new_direct_functions;
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&direct_mutex);
|
||||
|
||||
if (old_direct_functions && old_direct_functions != EMPTY_HASH)
|
||||
call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb);
|
||||
free_ftrace_hash(new_filter_hash);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
|
||||
* hash_sub - substracts @b from @a and returns the result
|
||||
* @a: struct ftrace_hash object
|
||||
* @b: struct ftrace_hash object
|
||||
*
|
||||
* Returns struct ftrace_hash object on success, NULL on error.
|
||||
*/
|
||||
static struct ftrace_hash *hash_sub(struct ftrace_hash *a, struct ftrace_hash *b)
|
||||
{
|
||||
struct ftrace_func_entry *entry, *del;
|
||||
struct ftrace_hash *sub;
|
||||
int size;
|
||||
|
||||
sub = alloc_and_copy_ftrace_hash(a->size_bits, a);
|
||||
if (!sub)
|
||||
return NULL;
|
||||
|
||||
size = 1 << b->size_bits;
|
||||
for (int i = 0; i < size; i++) {
|
||||
hlist_for_each_entry(entry, &b->buckets[i], hlist) {
|
||||
del = __ftrace_lookup_ip(sub, entry->ip);
|
||||
if (WARN_ON_ONCE(!del)) {
|
||||
free_ftrace_hash(sub);
|
||||
return NULL;
|
||||
}
|
||||
remove_hash_entry(sub, del);
|
||||
kfree(del);
|
||||
}
|
||||
}
|
||||
return sub;
|
||||
}
|
||||
|
||||
/**
|
||||
* update_ftrace_direct_del - Updates @ops by removing its direct
|
||||
* callers provided in @hash
|
||||
* @ops: The address of the struct ftrace_ops object
|
||||
* @hash: The address of the struct ftrace_hash object
|
||||
*
|
||||
* This is used to delete custom direct callers (ip -> addr) in
|
||||
* @ops specified via @hash. The @ops will be either unregistered
|
||||
* updated.
|
||||
*
|
||||
* Returns: zero on success. Non zero on error, which includes:
|
||||
* -EINVAL - The @hash is empty
|
||||
* -EINVAL - The @ops is not registered
|
||||
*/
|
||||
int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash)
|
||||
{
|
||||
struct ftrace_hash *old_direct_functions = NULL;
|
||||
struct ftrace_hash *new_direct_functions;
|
||||
struct ftrace_hash *new_filter_hash = NULL;
|
||||
struct ftrace_hash *old_filter_hash;
|
||||
struct ftrace_func_entry *entry;
|
||||
struct ftrace_func_entry *del;
|
||||
unsigned long size;
|
||||
int err = -EINVAL;
|
||||
|
||||
if (!hash_count(hash))
|
||||
return -EINVAL;
|
||||
if (check_direct_multi(ops))
|
||||
return -EINVAL;
|
||||
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
|
||||
return -EINVAL;
|
||||
if (direct_functions == EMPTY_HASH)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&direct_mutex);
|
||||
|
||||
old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
|
||||
|
||||
if (!hash_count(old_filter_hash))
|
||||
goto out_unlock;
|
||||
|
||||
/* Make sure requested entries are already registered. */
|
||||
size = 1 << hash->size_bits;
|
||||
for (int i = 0; i < size; i++) {
|
||||
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
|
||||
del = __ftrace_lookup_ip(direct_functions, entry->ip);
|
||||
if (!del || del->direct != entry->direct)
|
||||
goto out_unlock;
|
||||
}
|
||||
}
|
||||
|
||||
err = -ENOMEM;
|
||||
new_filter_hash = hash_sub(old_filter_hash, hash);
|
||||
if (!new_filter_hash)
|
||||
goto out_unlock;
|
||||
|
||||
new_direct_functions = hash_sub(direct_functions, hash);
|
||||
if (!new_direct_functions)
|
||||
goto out_unlock;
|
||||
|
||||
/* If there's nothing left, we need to unregister the ops. */
|
||||
if (ftrace_hash_empty(new_filter_hash)) {
|
||||
err = unregister_ftrace_function(ops);
|
||||
if (!err) {
|
||||
/* cleanup for possible another register call */
|
||||
ops->func = NULL;
|
||||
ops->trampoline = 0;
|
||||
ftrace_free_filter(ops);
|
||||
ops->func_hash->filter_hash = NULL;
|
||||
}
|
||||
} else {
|
||||
err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH);
|
||||
/*
|
||||
* new_filter_hash is dup-ed, so we need to release it anyway,
|
||||
* old_filter_hash either stays on error or is already released
|
||||
*/
|
||||
}
|
||||
|
||||
if (err) {
|
||||
/* free the new_direct_functions */
|
||||
old_direct_functions = new_direct_functions;
|
||||
} else {
|
||||
old_direct_functions = direct_functions;
|
||||
rcu_assign_pointer(direct_functions, new_direct_functions);
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&direct_mutex);
|
||||
|
||||
if (old_direct_functions && old_direct_functions != EMPTY_HASH)
|
||||
call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb);
|
||||
free_ftrace_hash(new_filter_hash);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
|
||||
* update_ftrace_direct_mod - Updates @ops by modifing its direct
|
||||
* callers provided in @hash
|
||||
* @ops: The address of the struct ftrace_ops object
|
||||
* @hash: The address of the struct ftrace_hash object
|
||||
* @do_direct_lock: If true lock the direct_mutex
|
||||
*
|
||||
* This is used to modify custom direct callers (ip -> addr) in
|
||||
* @ops specified via @hash.
|
||||
*
|
||||
* This can be called from within ftrace ops_func callback with
|
||||
* direct_mutex already locked, in which case @do_direct_lock
|
||||
* needs to be false.
|
||||
*
|
||||
* Returns: zero on success. Non zero on error, which includes:
|
||||
* -EINVAL - The @hash is empty
|
||||
* -EINVAL - The @ops is not registered
|
||||
*/
|
||||
int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock)
|
||||
{
|
||||
struct ftrace_func_entry *entry, *tmp;
|
||||
static struct ftrace_ops tmp_ops = {
|
||||
.func = ftrace_stub,
|
||||
.flags = FTRACE_OPS_FL_STUB,
|
||||
};
|
||||
struct ftrace_hash *orig_hash;
|
||||
unsigned long size, i;
|
||||
int err = -EINVAL;
|
||||
|
||||
if (!hash_count(hash))
|
||||
return -EINVAL;
|
||||
if (check_direct_multi(ops))
|
||||
return -EINVAL;
|
||||
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
|
||||
return -EINVAL;
|
||||
if (direct_functions == EMPTY_HASH)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* We can be called from within ops_func callback with direct_mutex
|
||||
* already taken.
|
||||
*/
|
||||
if (do_direct_lock)
|
||||
mutex_lock(&direct_mutex);
|
||||
|
||||
orig_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
|
||||
if (!orig_hash)
|
||||
goto unlock;
|
||||
|
||||
/* Enable the tmp_ops to have the same functions as the direct ops */
|
||||
ftrace_ops_init(&tmp_ops);
|
||||
tmp_ops.func_hash = ops->func_hash;
|
||||
|
||||
err = register_ftrace_function_nolock(&tmp_ops);
|
||||
if (err)
|
||||
goto unlock;
|
||||
|
||||
/*
|
||||
* Call __ftrace_hash_update_ipmodify() here, so that we can call
|
||||
* ops->ops_func for the ops. This is needed because the above
|
||||
* register_ftrace_function_nolock() worked on tmp_ops.
|
||||
*/
|
||||
err = __ftrace_hash_update_ipmodify(ops, orig_hash, orig_hash, true);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Now the ftrace_ops_list_func() is called to do the direct callers.
|
||||
* We can safely change the direct functions attached to each entry.
|
||||
*/
|
||||
mutex_lock(&ftrace_lock);
|
||||
|
||||
size = 1 << hash->size_bits;
|
||||
for (i = 0; i < size; i++) {
|
||||
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
|
||||
tmp = __ftrace_lookup_ip(direct_functions, entry->ip);
|
||||
if (!tmp)
|
||||
continue;
|
||||
tmp->direct = entry->direct;
|
||||
}
|
||||
}
|
||||
|
||||
mutex_unlock(&ftrace_lock);
|
||||
|
||||
out:
|
||||
/* Removing the tmp_ops will add the updated direct callers to the functions */
|
||||
unregister_ftrace_function(&tmp_ops);
|
||||
|
||||
unlock:
|
||||
if (do_direct_lock)
|
||||
mutex_unlock(&direct_mutex);
|
||||
return err;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
|
||||
|
||||
/**
|
||||
|
|
@ -8709,7 +9068,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
|
|||
if (!op->ops_func)
|
||||
return -EBUSY;
|
||||
|
||||
ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
|
||||
ret = op->ops_func(op, ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -8756,7 +9115,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
|
|||
|
||||
/* The cleanup is optional, ignore any errors */
|
||||
if (found_op && op->ops_func)
|
||||
op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
|
||||
op->ops_func(op, ip, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
|
||||
}
|
||||
}
|
||||
mutex_unlock(&direct_mutex);
|
||||
|
|
|
|||
|
|
@ -388,18 +388,13 @@ config DEBUG_INFO_BTF
|
|||
depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED
|
||||
depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST
|
||||
depends on BPF_SYSCALL
|
||||
depends on PAHOLE_VERSION >= 116
|
||||
depends on DEBUG_INFO_DWARF4 || PAHOLE_VERSION >= 121
|
||||
depends on PAHOLE_VERSION >= 122
|
||||
# pahole uses elfutils, which does not have support for Hexagon relocations
|
||||
depends on !HEXAGON
|
||||
help
|
||||
Generate deduplicated BTF type information from DWARF debug info.
|
||||
Turning this on requires pahole v1.16 or later (v1.21 or later to
|
||||
support DWARF 5), which will convert DWARF type info into equivalent
|
||||
deduplicated BTF type info.
|
||||
|
||||
config PAHOLE_HAS_SPLIT_BTF
|
||||
def_bool PAHOLE_VERSION >= 119
|
||||
Turning this on requires pahole v1.22 or later, which will convert
|
||||
DWARF type info into equivalent deduplicated BTF type info.
|
||||
|
||||
config PAHOLE_HAS_BTF_TAG
|
||||
def_bool PAHOLE_VERSION >= 123
|
||||
|
|
@ -421,7 +416,7 @@ config PAHOLE_HAS_LANG_EXCLUDE
|
|||
config DEBUG_INFO_BTF_MODULES
|
||||
bool "Generate BTF type information for kernel modules"
|
||||
default y
|
||||
depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF
|
||||
depends on DEBUG_INFO_BTF && MODULES
|
||||
help
|
||||
Generate compact split BTF type information for kernel modules.
|
||||
|
||||
|
|
|
|||
|
|
@ -106,6 +106,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
|
|||
ifdef CONFIG_SWAP
|
||||
obj-$(CONFIG_MEMCG) += swap_cgroup.o
|
||||
endif
|
||||
ifdef CONFIG_BPF_SYSCALL
|
||||
obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
|
||||
endif
|
||||
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
|
||||
obj-$(CONFIG_GUP_TEST) += gup_test.o
|
||||
obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
|
||||
|
|
|
|||
193
mm/bpf_memcontrol.c
Normal file
193
mm/bpf_memcontrol.c
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/*
|
||||
* Memory Controller-related BPF kfuncs and auxiliary code
|
||||
*
|
||||
* Author: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
*/
|
||||
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/bpf.h>
|
||||
|
||||
__bpf_kfunc_start_defs();
|
||||
|
||||
/**
|
||||
* bpf_get_root_mem_cgroup - Returns a pointer to the root memory cgroup
|
||||
*
|
||||
* The function has KF_ACQUIRE semantics, even though the root memory
|
||||
* cgroup is never destroyed after being created and doesn't require
|
||||
* reference counting. And it's perfectly safe to pass it to
|
||||
* bpf_put_mem_cgroup()
|
||||
*
|
||||
* Return: A pointer to the root memory cgroup.
|
||||
*/
|
||||
__bpf_kfunc struct mem_cgroup *bpf_get_root_mem_cgroup(void)
|
||||
{
|
||||
if (mem_cgroup_disabled())
|
||||
return NULL;
|
||||
|
||||
/* css_get() is not needed */
|
||||
return root_mem_cgroup;
|
||||
}
|
||||
|
||||
/**
|
||||
* bpf_get_mem_cgroup - Get a reference to a memory cgroup
|
||||
* @css: pointer to the css structure
|
||||
*
|
||||
* It's fine to pass a css which belongs to any cgroup controller,
|
||||
* e.g. unified hierarchy's main css.
|
||||
*
|
||||
* Implements KF_ACQUIRE semantics.
|
||||
*
|
||||
* Return: A pointer to a mem_cgroup structure after bumping
|
||||
* the corresponding css's reference counter.
|
||||
*/
|
||||
__bpf_kfunc struct mem_cgroup *
|
||||
bpf_get_mem_cgroup(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct mem_cgroup *memcg = NULL;
|
||||
bool rcu_unlock = false;
|
||||
|
||||
if (mem_cgroup_disabled() || !root_mem_cgroup)
|
||||
return NULL;
|
||||
|
||||
if (root_mem_cgroup->css.ss != css->ss) {
|
||||
struct cgroup *cgroup = css->cgroup;
|
||||
int ssid = root_mem_cgroup->css.ss->id;
|
||||
|
||||
rcu_read_lock();
|
||||
rcu_unlock = true;
|
||||
css = rcu_dereference_raw(cgroup->subsys[ssid]);
|
||||
}
|
||||
|
||||
if (css && css_tryget(css))
|
||||
memcg = container_of(css, struct mem_cgroup, css);
|
||||
|
||||
if (rcu_unlock)
|
||||
rcu_read_unlock();
|
||||
|
||||
return memcg;
|
||||
}
|
||||
|
||||
/**
|
||||
* bpf_put_mem_cgroup - Put a reference to a memory cgroup
|
||||
* @memcg: memory cgroup to release
|
||||
*
|
||||
* Releases a previously acquired memcg reference.
|
||||
* Implements KF_RELEASE semantics.
|
||||
*/
|
||||
__bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg)
|
||||
{
|
||||
css_put(&memcg->css);
|
||||
}
|
||||
|
||||
/**
|
||||
* bpf_mem_cgroup_vm_events - Read memory cgroup's vm event counter
|
||||
* @memcg: memory cgroup
|
||||
* @event: event id
|
||||
*
|
||||
* Allows to read memory cgroup event counters.
|
||||
*
|
||||
* Return: The current value of the corresponding events counter.
|
||||
*/
|
||||
__bpf_kfunc unsigned long bpf_mem_cgroup_vm_events(struct mem_cgroup *memcg,
|
||||
enum vm_event_item event)
|
||||
{
|
||||
if (unlikely(!memcg_vm_event_item_valid(event)))
|
||||
return (unsigned long)-1;
|
||||
|
||||
return memcg_events(memcg, event);
|
||||
}
|
||||
|
||||
/**
|
||||
* bpf_mem_cgroup_usage - Read memory cgroup's usage
|
||||
* @memcg: memory cgroup
|
||||
*
|
||||
* Please, note that the root memory cgroup it special and is exempt
|
||||
* from the memory accounting. The returned value is a sum of sub-cgroup's
|
||||
* usages and it not reflecting the size of the root memory cgroup itself.
|
||||
* If you need to get an approximation, you can use root level statistics:
|
||||
* e.g. NR_FILE_PAGES + NR_ANON_MAPPED.
|
||||
*
|
||||
* Return: The current memory cgroup size in bytes.
|
||||
*/
|
||||
__bpf_kfunc unsigned long bpf_mem_cgroup_usage(struct mem_cgroup *memcg)
|
||||
{
|
||||
return page_counter_read(&memcg->memory) * PAGE_SIZE;
|
||||
}
|
||||
|
||||
/**
|
||||
* bpf_mem_cgroup_memory_events - Read memory cgroup's memory event value
|
||||
* @memcg: memory cgroup
|
||||
* @event: memory event id
|
||||
*
|
||||
* Return: The current value of the memory event counter.
|
||||
*/
|
||||
__bpf_kfunc unsigned long bpf_mem_cgroup_memory_events(struct mem_cgroup *memcg,
|
||||
enum memcg_memory_event event)
|
||||
{
|
||||
if (unlikely(event >= MEMCG_NR_MEMORY_EVENTS))
|
||||
return (unsigned long)-1;
|
||||
|
||||
return atomic_long_read(&memcg->memory_events[event]);
|
||||
}
|
||||
|
||||
/**
|
||||
* bpf_mem_cgroup_page_state - Read memory cgroup's page state counter
|
||||
* @memcg: memory cgroup
|
||||
* @idx: counter idx
|
||||
*
|
||||
* Allows to read memory cgroup statistics. The output is in bytes.
|
||||
*
|
||||
* Return: The value of the page state counter in bytes.
|
||||
*/
|
||||
__bpf_kfunc unsigned long bpf_mem_cgroup_page_state(struct mem_cgroup *memcg, int idx)
|
||||
{
|
||||
if (unlikely(!memcg_stat_item_valid(idx)))
|
||||
return (unsigned long)-1;
|
||||
|
||||
return memcg_page_state_output(memcg, idx);
|
||||
}
|
||||
|
||||
/**
|
||||
* bpf_mem_cgroup_flush_stats - Flush memory cgroup's statistics
|
||||
* @memcg: memory cgroup
|
||||
*
|
||||
* Propagate memory cgroup's statistics up the cgroup tree.
|
||||
*/
|
||||
__bpf_kfunc void bpf_mem_cgroup_flush_stats(struct mem_cgroup *memcg)
|
||||
{
|
||||
mem_cgroup_flush_stats(memcg);
|
||||
}
|
||||
|
||||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(bpf_memcontrol_kfuncs)
|
||||
BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL)
|
||||
BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
|
||||
BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE)
|
||||
|
||||
BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events)
|
||||
BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events)
|
||||
BTF_ID_FLAGS(func, bpf_mem_cgroup_usage)
|
||||
BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state)
|
||||
BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_SLEEPABLE)
|
||||
|
||||
BTF_KFUNCS_END(bpf_memcontrol_kfuncs)
|
||||
|
||||
static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = {
|
||||
.owner = THIS_MODULE,
|
||||
.set = &bpf_memcontrol_kfuncs,
|
||||
};
|
||||
|
||||
/*
 * Register the memcontrol kfunc id set for BPF_PROG_TYPE_UNSPEC.
 *
 * A registration failure is logged with pr_warn() and the error code is
 * returned to the initcall machinery. Returns 0 on success.
 */
static int __init bpf_memcontrol_init(void)
{
	int err;

	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC,
					&bpf_memcontrol_kfunc_set);
	if (err)
		/* Terminate the message with '\n' so it forms a complete log line. */
		pr_warn("error while registering bpf memcontrol kfuncs: %d\n", err);

	return err;
}
|
||||
late_initcall(bpf_memcontrol_init);
|
||||
|
|
@ -27,7 +27,6 @@ unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
|
|||
void drain_all_stock(struct mem_cgroup *root_memcg);
|
||||
|
||||
unsigned long memcg_events(struct mem_cgroup *memcg, int event);
|
||||
unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
|
||||
int memory_stat_show(struct seq_file *m, void *v);
|
||||
|
||||
void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n);
|
||||
|
|
|
|||
|
|
@ -665,6 +665,14 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
|
|||
return x;
|
||||
}
|
||||
|
||||
bool memcg_stat_item_valid(int idx)
|
||||
{
|
||||
if ((u32)idx >= MEMCG_NR_STAT)
|
||||
return false;
|
||||
|
||||
return !BAD_STAT_IDX(memcg_stats_index(idx));
|
||||
}
|
||||
|
||||
static int memcg_page_state_unit(int item);
|
||||
|
||||
/*
|
||||
|
|
@ -862,6 +870,14 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event)
|
|||
return READ_ONCE(memcg->vmstats->events[i]);
|
||||
}
|
||||
|
||||
bool memcg_vm_event_item_valid(enum vm_event_item idx)
|
||||
{
|
||||
if (idx >= NR_VM_EVENT_ITEMS)
|
||||
return false;
|
||||
|
||||
return !BAD_STAT_IDX(memcg_events_index(idx));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_V1
|
||||
unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
|
|||
switch (prog->expected_attach_type) {
|
||||
case BPF_TRACE_FENTRY:
|
||||
case BPF_TRACE_FEXIT:
|
||||
case BPF_TRACE_FSESSION:
|
||||
if (bpf_fentry_test1(1) != 2 ||
|
||||
bpf_fentry_test2(2, 3) != 5 ||
|
||||
bpf_fentry_test3(4, 5, 6) != 15 ||
|
||||
|
|
|
|||
|
|
@ -40,29 +40,30 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
|
|||
if (!sdata)
|
||||
return -ENOENT;
|
||||
|
||||
bpf_selem_unlink(SELEM(sdata), false);
|
||||
|
||||
return 0;
|
||||
return bpf_selem_unlink(SELEM(sdata));
|
||||
}
|
||||
|
||||
/* Called by __sk_destruct() & bpf_sk_storage_clone() */
|
||||
void bpf_sk_storage_free(struct sock *sk)
|
||||
{
|
||||
struct bpf_local_storage *sk_storage;
|
||||
u32 uncharge;
|
||||
|
||||
rcu_read_lock_dont_migrate();
|
||||
sk_storage = rcu_dereference(sk->sk_bpf_storage);
|
||||
if (!sk_storage)
|
||||
goto out;
|
||||
|
||||
bpf_local_storage_destroy(sk_storage);
|
||||
uncharge = bpf_local_storage_destroy(sk_storage);
|
||||
if (uncharge)
|
||||
atomic_sub(uncharge, &sk->sk_omem_alloc);
|
||||
out:
|
||||
rcu_read_unlock_migrate();
|
||||
}
|
||||
|
||||
static void bpf_sk_storage_map_free(struct bpf_map *map)
|
||||
{
|
||||
bpf_local_storage_map_free(map, &sk_cache, NULL);
|
||||
bpf_local_storage_map_free(map, &sk_cache);
|
||||
}
|
||||
|
||||
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
|
||||
|
|
@ -191,7 +192,14 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
|
|||
}
|
||||
|
||||
if (new_sk_storage) {
|
||||
bpf_selem_link_map(smap, copy_selem);
|
||||
ret = bpf_selem_link_map(smap, new_sk_storage, copy_selem);
|
||||
if (ret) {
|
||||
bpf_selem_free(copy_selem, true);
|
||||
atomic_sub(smap->elem_size,
|
||||
&newsk->sk_omem_alloc);
|
||||
bpf_map_put(map);
|
||||
goto out;
|
||||
}
|
||||
bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
|
||||
} else {
|
||||
ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
|
||||
|
|
@ -365,6 +373,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
|
|||
return true;
|
||||
case BPF_TRACE_FENTRY:
|
||||
case BPF_TRACE_FEXIT:
|
||||
case BPF_TRACE_FSESSION:
|
||||
return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
|
||||
strlen("bpf_sk_storage"));
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -4137,7 +4137,7 @@ static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
|
|||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
.arg3_type = ARG_PTR_TO_UNINIT_MEM,
|
||||
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg4_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
|
|
@ -6401,7 +6401,7 @@ static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
|
|||
.gpl_only = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
|
@ -6456,7 +6456,7 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
|
|||
.gpl_only = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
|
@ -8010,9 +8010,9 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
|
|||
.gpl_only = true, /* __cookie_v4_init_sequence() is GPL */
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
|
||||
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY,
|
||||
.arg1_size = sizeof(struct iphdr),
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
|
|
@ -8042,9 +8042,9 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
|
|||
.gpl_only = true, /* __cookie_v6_init_sequence() is GPL */
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
|
||||
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY,
|
||||
.arg1_size = sizeof(struct ipv6hdr),
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
|
|
@ -8062,9 +8062,9 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
|
|||
.gpl_only = true, /* __cookie_v4_check is GPL */
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
|
||||
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY,
|
||||
.arg1_size = sizeof(struct iphdr),
|
||||
.arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
|
||||
.arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY,
|
||||
.arg2_size = sizeof(struct tcphdr),
|
||||
};
|
||||
|
||||
|
|
@ -8086,9 +8086,9 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
|
|||
.gpl_only = true, /* __cookie_v6_check is GPL */
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
|
||||
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY,
|
||||
.arg1_size = sizeof(struct ipv6hdr),
|
||||
.arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
|
||||
.arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY,
|
||||
.arg2_size = sizeof(struct tcphdr),
|
||||
};
|
||||
#endif /* CONFIG_SYN_COOKIES */
|
||||
|
|
@ -12023,7 +12023,7 @@ BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk)
|
|||
* trigger an explicit type generation here.
|
||||
*/
|
||||
BTF_TYPE_EMIT(struct unix_sock);
|
||||
if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX)
|
||||
if (sk && sk_is_unix(sk))
|
||||
return (unsigned long)sk;
|
||||
|
||||
return (unsigned long)NULL;
|
||||
|
|
@ -12440,11 +12440,11 @@ int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
|
|||
}
|
||||
|
||||
BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
|
||||
BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
|
||||
BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
|
||||
|
||||
BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta)
|
||||
BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta)
|
||||
BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta)
|
||||
|
||||
BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
|
||||
|
|
@ -12457,11 +12457,11 @@ BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
|
|||
BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr)
|
||||
|
||||
BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
|
||||
BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk)
|
||||
BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)
|
||||
|
||||
BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
|
||||
BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp)
|
||||
BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)
|
||||
|
||||
static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
|
||||
|
|
@ -12556,7 +12556,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
|
|||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids)
|
||||
BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_sock_destroy)
|
||||
BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids)
|
||||
|
||||
static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
|
||||
|
|
|
|||
|
|
@ -409,22 +409,26 @@ out:
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
|
||||
|
||||
/* Receive sk_msg from psock->ingress_msg to @msg. */
|
||||
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
int len, int flags)
|
||||
int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
int len, int flags, int *copied_from_self)
|
||||
{
|
||||
struct iov_iter *iter = &msg->msg_iter;
|
||||
int peek = flags & MSG_PEEK;
|
||||
struct sk_msg *msg_rx;
|
||||
int i, copied = 0;
|
||||
bool from_self;
|
||||
|
||||
msg_rx = sk_psock_peek_msg(psock);
|
||||
if (copied_from_self)
|
||||
*copied_from_self = 0;
|
||||
|
||||
while (copied != len) {
|
||||
struct scatterlist *sge;
|
||||
|
||||
if (unlikely(!msg_rx))
|
||||
break;
|
||||
|
||||
from_self = msg_rx->sk == sk;
|
||||
i = msg_rx->sg.start;
|
||||
do {
|
||||
struct page *page;
|
||||
|
|
@ -443,6 +447,9 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
|||
}
|
||||
|
||||
copied += copy;
|
||||
if (from_self && copied_from_self)
|
||||
*copied_from_self += copy;
|
||||
|
||||
if (likely(!peek)) {
|
||||
sge->offset += copy;
|
||||
sge->length -= copy;
|
||||
|
|
@ -451,6 +458,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
|||
atomic_sub(copy, &sk->sk_rmem_alloc);
|
||||
}
|
||||
msg_rx->sg.size -= copy;
|
||||
sk_psock_msg_len_add(psock, -copy);
|
||||
|
||||
if (!sge->length) {
|
||||
sk_msg_iter_var_next(i);
|
||||
|
|
@ -487,6 +495,13 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
|||
out:
|
||||
return copied;
|
||||
}
|
||||
|
||||
/* Receive sk_msg from psock->ingress_msg to @msg. */
|
||||
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
int len, int flags)
|
||||
{
|
||||
return __sk_msg_recvmsg(sk, psock, msg, len, flags, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
|
||||
|
||||
bool sk_msg_is_readable(struct sock *sk)
|
||||
|
|
@ -616,6 +631,12 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
|
|||
if (unlikely(!msg))
|
||||
return -EAGAIN;
|
||||
skb_set_owner_r(skb, sk);
|
||||
|
||||
/* This is used in tcp_bpf_recvmsg_parser() to determine whether the
|
||||
* data originates from the socket's own protocol stack. No need to
|
||||
* refcount sk because msg's lifetime is bound to sk via the ingress_msg.
|
||||
*/
|
||||
msg->sk = sk;
|
||||
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
|
||||
if (err < 0)
|
||||
kfree(msg);
|
||||
|
|
@ -801,9 +822,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
|
|||
list_del(&msg->list);
|
||||
if (!msg->skb)
|
||||
atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
|
||||
sk_psock_msg_len_add(psock, -msg->sg.size);
|
||||
sk_msg_free(psock->sk, msg);
|
||||
kfree(msg);
|
||||
}
|
||||
WARN_ON_ONCE(psock->msg_tot_len);
|
||||
}
|
||||
|
||||
static void __sk_psock_zap_ingress(struct sk_psock *psock)
|
||||
|
|
@ -909,6 +932,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
|
|||
sk_msg_compute_data_pointers(msg);
|
||||
msg->sk = sk;
|
||||
ret = bpf_prog_run_pin_on_cpu(prog, msg);
|
||||
msg->sk = NULL;
|
||||
ret = sk_psock_map_verd(ret, msg->sk_redir);
|
||||
psock->apply_bytes = msg->apply_bytes;
|
||||
if (ret == __SK_REDIRECT) {
|
||||
|
|
|
|||
|
|
@ -964,7 +964,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
|
|||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(xdp_metadata_kfunc_ids)
|
||||
#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
|
||||
#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name)
|
||||
XDP_METADATA_KFUNC_xxx
|
||||
#undef XDP_METADATA_KFUNC
|
||||
BTF_KFUNCS_END(xdp_metadata_kfunc_ids)
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
|
||||
#include <net/inet_common.h>
|
||||
#include <net/tls.h>
|
||||
#include <asm/ioctls.h>
|
||||
|
||||
void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
|
|
@ -226,6 +227,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
|
|||
int peek = flags & MSG_PEEK;
|
||||
struct sk_psock *psock;
|
||||
struct tcp_sock *tcp;
|
||||
int copied_from_self = 0;
|
||||
int copied = 0;
|
||||
u32 seq;
|
||||
|
||||
|
|
@ -262,7 +264,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
|
|||
}
|
||||
|
||||
msg_bytes_ready:
|
||||
copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
|
||||
copied = __sk_msg_recvmsg(sk, psock, msg, len, flags, &copied_from_self);
|
||||
/* The typical case for EFAULT is the socket was gracefully
|
||||
* shutdown with a FIN pkt. So check here the other case is
|
||||
* some error on copy_page_to_iter which would be unexpected.
|
||||
|
|
@ -277,7 +279,7 @@ msg_bytes_ready:
|
|||
goto out;
|
||||
}
|
||||
}
|
||||
seq += copied;
|
||||
seq += copied_from_self;
|
||||
if (!copied) {
|
||||
long timeo;
|
||||
int data;
|
||||
|
|
@ -331,6 +333,24 @@ unlock:
|
|||
return copied;
|
||||
}
|
||||
|
||||
static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
|
||||
{
|
||||
bool slow;
|
||||
|
||||
if (cmd != SIOCINQ)
|
||||
return tcp_ioctl(sk, cmd, karg);
|
||||
|
||||
/* works similar as tcp_ioctl */
|
||||
if (sk->sk_state == TCP_LISTEN)
|
||||
return -EINVAL;
|
||||
|
||||
slow = lock_sock_fast(sk);
|
||||
*karg = sk_psock_msg_inq(sk);
|
||||
unlock_sock_fast(sk, slow);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
|
||||
int flags, int *addr_len)
|
||||
{
|
||||
|
|
@ -609,6 +629,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
|
|||
prot[TCP_BPF_BASE].close = sock_map_close;
|
||||
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
|
||||
prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;
|
||||
prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl;
|
||||
|
||||
prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
|
||||
prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
#include <net/sock.h>
|
||||
#include <net/udp.h>
|
||||
#include <net/inet_common.h>
|
||||
#include <asm/ioctls.h>
|
||||
|
||||
#include "udp_impl.h"
|
||||
|
||||
|
|
@ -111,12 +112,26 @@ enum {
|
|||
static DEFINE_SPINLOCK(udpv6_prot_lock);
|
||||
static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
|
||||
|
||||
static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
|
||||
{
|
||||
if (cmd != SIOCINQ)
|
||||
return udp_ioctl(sk, cmd, karg);
|
||||
|
||||
/* Since we don't hold a lock, sk_receive_queue may contain data.
|
||||
* BPF might only be processing this data at the moment. We only
|
||||
* care about the data in the ingress_msg here.
|
||||
*/
|
||||
*karg = sk_msg_first_len(sk);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
|
||||
{
|
||||
*prot = *base;
|
||||
prot->close = sock_map_close;
|
||||
prot->recvmsg = udp_bpf_recvmsg;
|
||||
prot->sock_is_readable = sk_msg_is_readable;
|
||||
*prot = *base;
|
||||
prot->close = sock_map_close;
|
||||
prot->recvmsg = udp_bpf_recvmsg;
|
||||
prot->sock_is_readable = sk_msg_is_readable;
|
||||
prot->ioctl = udp_bpf_ioctl;
|
||||
}
|
||||
|
||||
static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
|
||||
|
|
|
|||
|
|
@ -114,8 +114,6 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
|
|||
struct nf_conn *ct;
|
||||
int err;
|
||||
|
||||
if (!opts || !bpf_tuple)
|
||||
return ERR_PTR(-EINVAL);
|
||||
if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
|
||||
return ERR_PTR(-EINVAL);
|
||||
if (opts_len == NF_BPF_CT_OPTS_SZ) {
|
||||
|
|
@ -299,8 +297,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
|
|||
nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
|
||||
opts, opts__sz, 10);
|
||||
if (IS_ERR(nfct)) {
|
||||
if (opts)
|
||||
opts->error = PTR_ERR(nfct);
|
||||
opts->error = PTR_ERR(nfct);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
@ -334,8 +331,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
|
|||
caller_net = dev_net(ctx->rxq->dev);
|
||||
nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
|
||||
if (IS_ERR(nfct)) {
|
||||
if (opts)
|
||||
opts->error = PTR_ERR(nfct);
|
||||
opts->error = PTR_ERR(nfct);
|
||||
return NULL;
|
||||
}
|
||||
return nfct;
|
||||
|
|
@ -367,8 +363,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
|
|||
net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
|
||||
nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
|
||||
if (IS_ERR(nfct)) {
|
||||
if (opts)
|
||||
opts->error = PTR_ERR(nfct);
|
||||
opts->error = PTR_ERR(nfct);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
@ -402,8 +397,7 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
|
|||
caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
|
||||
nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
|
||||
if (IS_ERR(nfct)) {
|
||||
if (opts)
|
||||
opts->error = PTR_ERR(nfct);
|
||||
opts->error = PTR_ERR(nfct);
|
||||
return NULL;
|
||||
}
|
||||
return nfct;
|
||||
|
|
@ -516,10 +510,10 @@ BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
|
|||
BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
|
||||
BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
|
||||
BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
|
||||
BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_ct_set_timeout)
|
||||
BTF_ID_FLAGS(func, bpf_ct_change_timeout)
|
||||
BTF_ID_FLAGS(func, bpf_ct_set_status)
|
||||
BTF_ID_FLAGS(func, bpf_ct_change_status)
|
||||
BTF_KFUNCS_END(nf_ct_kfunc_set)
|
||||
|
||||
static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
|
||||
|
|
|
|||
|
|
@ -105,7 +105,7 @@ __diag_pop()
|
|||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(nf_ft_kfunc_set)
|
||||
BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL)
|
||||
BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_RET_NULL)
|
||||
BTF_KFUNCS_END(nf_ft_kfunc_set)
|
||||
|
||||
static const struct btf_kfunc_id_set nf_flow_kfunc_set = {
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
|
|||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(nf_nat_kfunc_set)
|
||||
BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_ct_set_nat_info)
|
||||
BTF_KFUNCS_END(nf_nat_kfunc_set)
|
||||
|
||||
static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = {
|
||||
|
|
|
|||
|
|
@ -202,6 +202,12 @@ __bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb)
|
|||
kfree_skb(skb);
|
||||
}
|
||||
|
||||
__bpf_kfunc void bpf_kfree_skb_dtor(void *skb)
|
||||
{
|
||||
bpf_kfree_skb(skb);
|
||||
}
|
||||
CFI_NOSEAL(bpf_kfree_skb_dtor);
|
||||
|
||||
/* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list.
|
||||
* @skb: The skb whose reference to be released and dropped.
|
||||
* @to_free_list: The list of skbs to be dropped.
|
||||
|
|
@ -271,14 +277,14 @@ __bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff
|
|||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(qdisc_kfunc_ids)
|
||||
BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_skb_get_hash)
|
||||
BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE)
|
||||
BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE)
|
||||
BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS)
|
||||
BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
|
||||
BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule)
|
||||
BTF_ID_FLAGS(func, bpf_qdisc_init_prologue)
|
||||
BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue)
|
||||
BTF_ID_FLAGS(func, bpf_qdisc_bstats_update)
|
||||
BTF_KFUNCS_END(qdisc_kfunc_ids)
|
||||
|
||||
BTF_SET_START(qdisc_common_kfunc_set)
|
||||
|
|
@ -449,7 +455,7 @@ static struct bpf_struct_ops bpf_Qdisc_ops = {
|
|||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb)
|
||||
BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb_dtor)
|
||||
|
||||
static int __init bpf_qdisc_kfunc_init(void)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@ bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32
|
|||
struct net *net = dev_net(xdp->rxq->dev);
|
||||
struct xfrm_state *x;
|
||||
|
||||
if (!opts || opts__sz < sizeof(opts->error))
|
||||
if (opts__sz < sizeof(opts->error))
|
||||
return NULL;
|
||||
|
||||
if (opts__sz != BPF_XFRM_STATE_OPTS_SZ) {
|
||||
|
|
|
|||
|
|
@ -7,14 +7,7 @@ JOBS := $(patsubst -j%,%,$(filter -j%,$(MAKEFLAGS)))
|
|||
|
||||
ifeq ($(call test-le, $(pahole-ver), 125),y)
|
||||
|
||||
# pahole 1.18 through 1.21 can't handle zero-sized per-CPU vars
|
||||
ifeq ($(call test-le, $(pahole-ver), 121),y)
|
||||
pahole-flags-$(call test-ge, $(pahole-ver), 118) += --skip_encoding_btf_vars
|
||||
endif
|
||||
|
||||
pahole-flags-$(call test-ge, $(pahole-ver), 121) += --btf_gen_floats
|
||||
|
||||
pahole-flags-$(call test-ge, $(pahole-ver), 122) += -j$(JOBS)
|
||||
pahole-flags-y += --btf_gen_floats -j$(JOBS)
|
||||
|
||||
pahole-flags-$(call test-ge, $(pahole-ver), 125) += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized
|
||||
|
||||
|
|
@ -25,13 +18,15 @@ pahole-flags-$(call test-ge, $(pahole-ver), 126) = -j$(JOBS) --btf_features=enc
|
|||
|
||||
pahole-flags-$(call test-ge, $(pahole-ver), 130) += --btf_features=attributes
|
||||
|
||||
ifneq ($(KBUILD_EXTMOD),)
|
||||
module-pahole-flags-$(call test-ge, $(pahole-ver), 128) += --btf_features=distilled_base
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
pahole-flags-$(CONFIG_PAHOLE_HAS_LANG_EXCLUDE) += --lang_exclude=rust
|
||||
|
||||
export PAHOLE_FLAGS := $(pahole-flags-y)
|
||||
export MODULE_PAHOLE_FLAGS := $(module-pahole-flags-y)
|
||||
|
||||
resolve-btfids-flags-y :=
|
||||
resolve-btfids-flags-$(CONFIG_WERROR) += --fatal_warnings
|
||||
resolve-btfids-flags-$(if $(KBUILD_EXTMOD),y) += --distill_base
|
||||
resolve-btfids-flags-$(if $(KBUILD_VERBOSE),y) += --verbose
|
||||
|
||||
export RESOLVE_BTFIDS_FLAGS := $(resolve-btfids-flags-y)
|
||||
|
|
|
|||
|
|
@ -42,9 +42,8 @@ quiet_cmd_btf_ko = BTF [M] $@
|
|||
cmd_btf_ko = \
|
||||
if [ ! -f $(objtree)/vmlinux ]; then \
|
||||
printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \
|
||||
else \
|
||||
LLVM_OBJCOPY="$(OBJCOPY)" $(PAHOLE) -J $(PAHOLE_FLAGS) $(MODULE_PAHOLE_FLAGS) --btf_base $(objtree)/vmlinux $@; \
|
||||
$(RESOLVE_BTFIDS) -b $(objtree)/vmlinux $@; \
|
||||
else \
|
||||
$(CONFIG_SHELL) $(srctree)/scripts/gen-btf.sh --btf_base $(objtree)/vmlinux $@; \
|
||||
fi;
|
||||
|
||||
# Same as newer-prereqs, but allows to exclude specified extra dependencies
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ targets += vmlinux.unstripped .vmlinux.export.o
|
|||
vmlinux.unstripped: scripts/link-vmlinux.sh vmlinux.o .vmlinux.export.o $(KBUILD_LDS) FORCE
|
||||
+$(call if_changed_dep,link_vmlinux)
|
||||
ifdef CONFIG_DEBUG_INFO_BTF
|
||||
vmlinux.unstripped: $(RESOLVE_BTFIDS)
|
||||
vmlinux.unstripped: $(RESOLVE_BTFIDS) $(srctree)/scripts/gen-btf.sh
|
||||
endif
|
||||
|
||||
ifdef CONFIG_BUILDTIME_TABLE_SORT
|
||||
|
|
|
|||
147
scripts/gen-btf.sh
Executable file
147
scripts/gen-btf.sh
Executable file
|
|
@ -0,0 +1,147 @@
|
|||
#!/bin/sh
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
# Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
|
||||
#
|
||||
# This script generates BTF data for the provided ELF file.
|
||||
#
|
||||
# Kernel BTF generation involves these conceptual steps:
|
||||
# 1. pahole generates BTF from DWARF data
|
||||
# 2. resolve_btfids applies kernel-specific btf2btf
|
||||
# transformations and computes data for .BTF_ids section
|
||||
# 3. the result gets linked/objcopied into the target binary
|
||||
#
|
||||
# How step (3) should be done differs between vmlinux, and
|
||||
# kernel modules, which is the primary reason for the existence
|
||||
# of this script.
|
||||
#
|
||||
# For modules the script expects vmlinux passed in as --btf_base.
|
||||
# Generated .BTF, .BTF.base and .BTF_ids sections become embedded
|
||||
# into the input ELF file with objcopy.
|
||||
#
|
||||
# For vmlinux the input file remains unchanged and two files are produced:
|
||||
# - ${1}.btf.o ready for linking into vmlinux
|
||||
# - ${1}.BTF_ids with .BTF_ids data blob
|
||||
# This output is consumed by scripts/link-vmlinux.sh
|
||||
|
||||
set -e
|
||||
|
||||
usage()
|
||||
{
|
||||
echo "Usage: $0 [--btf_base <file>] <target ELF file>"
|
||||
exit 1
|
||||
}
|
||||
|
||||
BTF_BASE=""
|
||||
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--btf_base)
|
||||
BTF_BASE="$2"
|
||||
shift 2
|
||||
;;
|
||||
-*)
|
||||
echo "Unknown option: $1" >&2
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
break
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ $# -ne 1 ]; then
|
||||
usage
|
||||
fi
|
||||
|
||||
ELF_FILE="$1"
|
||||
shift
|
||||
|
||||
is_enabled() {
|
||||
grep -q "^$1=y" ${objtree}/include/config/auto.conf
|
||||
}
|
||||
|
||||
case "${KBUILD_VERBOSE}" in
|
||||
*1*)
|
||||
set -x
|
||||
;;
|
||||
esac
|
||||
|
||||
gen_btf_data()
|
||||
{
|
||||
btf1="${ELF_FILE}.BTF.1"
|
||||
${PAHOLE} -J ${PAHOLE_FLAGS} \
|
||||
${BTF_BASE:+--btf_base ${BTF_BASE}} \
|
||||
--btf_encode_detached=${btf1} \
|
||||
"${ELF_FILE}"
|
||||
|
||||
${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_FLAGS} \
|
||||
${BTF_BASE:+--btf_base ${BTF_BASE}} \
|
||||
--btf ${btf1} "${ELF_FILE}"
|
||||
}
|
||||
|
||||
gen_btf_o()
|
||||
{
|
||||
btf_data=${ELF_FILE}.btf.o
|
||||
|
||||
# Create ${btf_data} which contains just .BTF section but no symbols. Add
|
||||
# SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all
|
||||
# deletes all symbols including __start_BTF and __stop_BTF, which will
|
||||
# be redefined in the linker script.
|
||||
echo "" | ${CC} ${CLANG_FLAGS} ${KBUILD_CPPFLAGS} ${KBUILD_CFLAGS} -fno-lto -c -x c -o ${btf_data} -
|
||||
${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF \
|
||||
--set-section-flags .BTF=alloc,readonly ${btf_data}
|
||||
${OBJCOPY} --only-section=.BTF --strip-all ${btf_data}
|
||||
|
||||
# Change e_type to ET_REL so that it can be used to link final vmlinux.
|
||||
# GNU ld 2.35+ and lld do not allow an ET_EXEC input.
|
||||
if is_enabled CONFIG_CPU_BIG_ENDIAN; then
|
||||
et_rel='\0\1'
|
||||
else
|
||||
et_rel='\1\0'
|
||||
fi
|
||||
printf "${et_rel}" | dd of="${btf_data}" conv=notrunc bs=1 seek=16 status=none
|
||||
}
|
||||
|
||||
embed_btf_data()
|
||||
{
|
||||
${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF ${ELF_FILE}
|
||||
|
||||
# a module might not have a .BTF_ids or .BTF.base section
|
||||
btf_base="${ELF_FILE}.BTF.base"
|
||||
if [ -f "${btf_base}" ]; then
|
||||
${OBJCOPY} --add-section .BTF.base=${btf_base} ${ELF_FILE}
|
||||
fi
|
||||
btf_ids="${ELF_FILE}.BTF_ids"
|
||||
if [ -f "${btf_ids}" ]; then
|
||||
${RESOLVE_BTFIDS} --patch_btfids ${btf_ids} ${ELF_FILE}
|
||||
fi
|
||||
}
|
||||
|
||||
cleanup()
|
||||
{
|
||||
rm -f "${ELF_FILE}.BTF.1"
|
||||
rm -f "${ELF_FILE}.BTF"
|
||||
if [ "${BTFGEN_MODE}" = "module" ]; then
|
||||
rm -f "${ELF_FILE}.BTF.base"
|
||||
rm -f "${ELF_FILE}.BTF_ids"
|
||||
fi
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
BTFGEN_MODE="vmlinux"
|
||||
if [ -n "${BTF_BASE}" ]; then
|
||||
BTFGEN_MODE="module"
|
||||
fi
|
||||
|
||||
gen_btf_data
|
||||
|
||||
case "${BTFGEN_MODE}" in
|
||||
vmlinux)
|
||||
gen_btf_o
|
||||
;;
|
||||
module)
|
||||
embed_btf_data
|
||||
;;
|
||||
esac
|
||||
|
||||
exit 0
|
||||
|
|
@ -106,34 +106,6 @@ vmlinux_link()
|
|||
${kallsymso} ${btf_vmlinux_bin_o} ${arch_vmlinux_o} ${ldlibs}
|
||||
}
|
||||
|
||||
# generate .BTF typeinfo from DWARF debuginfo
|
||||
# ${1} - vmlinux image
|
||||
gen_btf()
|
||||
{
|
||||
local btf_data=${1}.btf.o
|
||||
|
||||
info BTF "${btf_data}"
|
||||
LLVM_OBJCOPY="${OBJCOPY}" ${PAHOLE} -J ${PAHOLE_FLAGS} ${1}
|
||||
|
||||
# Create ${btf_data} which contains just .BTF section but no symbols. Add
|
||||
# SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all
|
||||
# deletes all symbols including __start_BTF and __stop_BTF, which will
|
||||
# be redefined in the linker script. Add 2>/dev/null to suppress GNU
|
||||
# objcopy warnings: "empty loadable segment detected at ..."
|
||||
${OBJCOPY} --only-section=.BTF --set-section-flags .BTF=alloc,readonly \
|
||||
--strip-all ${1} "${btf_data}" 2>/dev/null
|
||||
# Change e_type to ET_REL so that it can be used to link final vmlinux.
|
||||
# GNU ld 2.35+ and lld do not allow an ET_EXEC input.
|
||||
if is_enabled CONFIG_CPU_BIG_ENDIAN; then
|
||||
et_rel='\0\1'
|
||||
else
|
||||
et_rel='\1\0'
|
||||
fi
|
||||
printf "${et_rel}" | dd of="${btf_data}" conv=notrunc bs=1 seek=16 status=none
|
||||
|
||||
btf_vmlinux_bin_o=${btf_data}
|
||||
}
|
||||
|
||||
# Create ${2}.o file with all symbols from the ${1} object file
|
||||
kallsyms()
|
||||
{
|
||||
|
|
@ -205,6 +177,7 @@ if is_enabled CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX; then
|
|||
fi
|
||||
|
||||
btf_vmlinux_bin_o=
|
||||
btfids_vmlinux=
|
||||
kallsymso=
|
||||
strip_debug=
|
||||
generate_map=
|
||||
|
|
@ -232,11 +205,14 @@ if is_enabled CONFIG_KALLSYMS || is_enabled CONFIG_DEBUG_INFO_BTF; then
|
|||
fi
|
||||
|
||||
if is_enabled CONFIG_DEBUG_INFO_BTF; then
|
||||
if ! gen_btf .tmp_vmlinux1; then
|
||||
info BTF .tmp_vmlinux1
|
||||
if ! ${CONFIG_SHELL} ${srctree}/scripts/gen-btf.sh .tmp_vmlinux1; then
|
||||
echo >&2 "Failed to generate BTF for vmlinux"
|
||||
echo >&2 "Try to disable CONFIG_DEBUG_INFO_BTF"
|
||||
exit 1
|
||||
fi
|
||||
btf_vmlinux_bin_o=.tmp_vmlinux1.btf.o
|
||||
btfids_vmlinux=.tmp_vmlinux1.BTF_ids
|
||||
fi
|
||||
|
||||
if is_enabled CONFIG_KALLSYMS; then
|
||||
|
|
@ -289,14 +265,9 @@ fi
|
|||
|
||||
vmlinux_link "${VMLINUX}"
|
||||
|
||||
# fill in BTF IDs
|
||||
if is_enabled CONFIG_DEBUG_INFO_BTF; then
|
||||
info BTFIDS "${VMLINUX}"
|
||||
RESOLVE_BTFIDS_ARGS=""
|
||||
if is_enabled CONFIG_WERROR; then
|
||||
RESOLVE_BTFIDS_ARGS=" --fatal_warnings "
|
||||
fi
|
||||
${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_ARGS} "${VMLINUX}"
|
||||
info BTFIDS ${VMLINUX}
|
||||
${RESOLVE_BTFIDS} --patch_btfids ${btfids_vmlinux} ${VMLINUX}
|
||||
fi
|
||||
|
||||
mksysmap "${VMLINUX}" System.map
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ NET COMMANDS
|
|||
============
|
||||
|
||||
| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ]
|
||||
| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
|
||||
| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** | **prepend** ]
|
||||
| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
|
||||
| **bpftool** **net help**
|
||||
|
|
||||
|
|
@ -58,11 +58,9 @@ bpftool net { show | list } [ dev *NAME* ]
|
|||
then all bpf programs attached to non clsact qdiscs, and finally all bpf
|
||||
programs attached to root and clsact qdisc.
|
||||
|
||||
bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ]
|
||||
bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite | prepend ]
|
||||
Attach bpf program *PROG* to network interface *NAME* with type specified
|
||||
by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the
|
||||
command used with **overwrite** option. Currently, only XDP-related modes
|
||||
are supported for *ATTACH_TYPE*.
|
||||
by *ATTACH_TYPE*.
|
||||
|
||||
*ATTACH_TYPE* can be of:
|
||||
**xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it;
|
||||
|
|
@ -72,11 +70,18 @@ bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ]
|
|||
**tcx_ingress** - Ingress TCX. runs on ingress net traffic;
|
||||
**tcx_egress** - Egress TCX. runs on egress net traffic;
|
||||
|
||||
For XDP-related attach types (**xdp**, **xdpgeneric**, **xdpdrv**,
|
||||
**xdpoffload**), the **overwrite** option can be used to replace a
|
||||
previously attached bpf program.
|
||||
|
||||
For **tcx_ingress** and **tcx_egress** attach types, the **prepend** option
|
||||
can be used to attach the program at the beginning of the chain instead of
|
||||
at the end.
|
||||
|
||||
bpftool net detach *ATTACH_TYPE* dev *NAME*
|
||||
Detach bpf program attached to network interface *NAME* with type specified
|
||||
by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used
|
||||
for attach must be specified. Currently, only XDP-related modes are
|
||||
supported for *ATTACH_TYPE*.
|
||||
for attach must be specified.
|
||||
|
||||
bpftool net help
|
||||
Print short help message.
|
||||
|
|
@ -191,6 +196,17 @@ EXAMPLES
|
|||
tc:
|
||||
lo(1) tcx/ingress tc_prog prog_id 29
|
||||
|
||||
|
|
||||
| **# bpftool net attach tcx_ingress name tc_prog2 dev lo prepend**
|
||||
| **# bpftool net**
|
||||
|
|
||||
|
||||
::
|
||||
|
||||
tc:
|
||||
lo(1) tcx/ingress tc_prog2 prog_id 30
|
||||
lo(1) tcx/ingress tc_prog prog_id 29
|
||||
|
||||
|
|
||||
| **# bpftool net attach tcx_ingress name tc_prog dev lo**
|
||||
| **# bpftool net detach tcx_ingress dev lo**
|
||||
|
|
|
|||
|
|
@ -130,8 +130,8 @@ include $(FEATURES_DUMP)
|
|||
endif
|
||||
endif
|
||||
|
||||
LIBS = $(LIBBPF) -lelf -lz -lcrypto
|
||||
LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz -lcrypto
|
||||
LIBS = $(LIBBPF) -lelf -lcrypto -lz
|
||||
LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lcrypto -lz
|
||||
|
||||
ifeq ($(feature-libelf-zstd),1)
|
||||
LIBS += -lzstd
|
||||
|
|
|
|||
|
|
@ -1142,7 +1142,14 @@ _bpftool()
|
|||
return 0
|
||||
;;
|
||||
8)
|
||||
_bpftool_once_attr 'overwrite'
|
||||
case ${words[3]} in
|
||||
tcx_ingress|tcx_egress)
|
||||
_bpftool_once_attr 'prepend'
|
||||
;;
|
||||
*)
|
||||
_bpftool_once_attr 'overwrite'
|
||||
;;
|
||||
esac
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
|
|
|||
|
|
@ -1191,6 +1191,7 @@ const char *bpf_attach_type_input_str(enum bpf_attach_type t)
|
|||
case BPF_TRACE_FENTRY: return "fentry";
|
||||
case BPF_TRACE_FEXIT: return "fexit";
|
||||
case BPF_MODIFY_RETURN: return "mod_ret";
|
||||
case BPF_TRACE_FSESSION: return "fsession";
|
||||
case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select";
|
||||
case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate";
|
||||
default: return libbpf_bpf_attach_type_str(t);
|
||||
|
|
|
|||
|
|
@ -731,10 +731,10 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
|
|||
{ \n\
|
||||
struct %1$s *skel; \n\
|
||||
\n\
|
||||
skel = skel_alloc(sizeof(*skel)); \n\
|
||||
skel = (struct %1$s *)skel_alloc(sizeof(*skel)); \n\
|
||||
if (!skel) \n\
|
||||
goto cleanup; \n\
|
||||
skel->ctx.sz = (void *)&skel->links - (void *)skel; \n\
|
||||
skel->ctx.sz = (char *)&skel->links - (char *)skel; \n\
|
||||
",
|
||||
obj_name, opts.data_sz);
|
||||
bpf_object__for_each_map(map, obj) {
|
||||
|
|
@ -755,7 +755,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
|
|||
\n\
|
||||
\"; \n\
|
||||
\n\
|
||||
skel->%1$s = skel_prep_map_data((void *)data, %2$zd,\n\
|
||||
skel->%1$s = (__typeof__(skel->%1$s))skel_prep_map_data((void *)data, %2$zd,\n\
|
||||
sizeof(data) - 1);\n\
|
||||
if (!skel->%1$s) \n\
|
||||
goto cleanup; \n\
|
||||
|
|
@ -857,7 +857,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
|
|||
|
||||
codegen("\
|
||||
\n\
|
||||
skel->%1$s = skel_finalize_map_data(&skel->maps.%1$s.initial_value, \n\
|
||||
skel->%1$s = (__typeof__(skel->%1$s))skel_finalize_map_data(&skel->maps.%1$s.initial_value,\n\
|
||||
%2$zd, %3$s, skel->maps.%1$s.map_fd);\n\
|
||||
if (!skel->%1$s) \n\
|
||||
return -ENOMEM; \n\
|
||||
|
|
|
|||
|
|
@ -666,10 +666,16 @@ static int get_tcx_type(enum net_attach_type attach_type)
|
|||
}
|
||||
}
|
||||
|
||||
static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex)
|
||||
static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex, bool prepend)
|
||||
{
|
||||
int type = get_tcx_type(attach_type);
|
||||
|
||||
if (prepend) {
|
||||
LIBBPF_OPTS(bpf_prog_attach_opts, opts,
|
||||
.flags = BPF_F_BEFORE
|
||||
);
|
||||
return bpf_prog_attach_opts(progfd, ifindex, type, &opts);
|
||||
}
|
||||
return bpf_prog_attach(progfd, ifindex, type, 0);
|
||||
}
|
||||
|
||||
|
|
@ -685,6 +691,7 @@ static int do_attach(int argc, char **argv)
|
|||
enum net_attach_type attach_type;
|
||||
int progfd, ifindex, err = 0;
|
||||
bool overwrite = false;
|
||||
bool prepend = false;
|
||||
|
||||
/* parse attach args */
|
||||
if (!REQ_ARGS(5))
|
||||
|
|
@ -709,9 +716,25 @@ static int do_attach(int argc, char **argv)
|
|||
|
||||
if (argc) {
|
||||
if (is_prefix(*argv, "overwrite")) {
|
||||
if (attach_type != NET_ATTACH_TYPE_XDP &&
|
||||
attach_type != NET_ATTACH_TYPE_XDP_GENERIC &&
|
||||
attach_type != NET_ATTACH_TYPE_XDP_DRIVER &&
|
||||
attach_type != NET_ATTACH_TYPE_XDP_OFFLOAD) {
|
||||
p_err("'overwrite' is only supported for xdp types");
|
||||
err = -EINVAL;
|
||||
goto cleanup;
|
||||
}
|
||||
overwrite = true;
|
||||
} else if (is_prefix(*argv, "prepend")) {
|
||||
if (attach_type != NET_ATTACH_TYPE_TCX_INGRESS &&
|
||||
attach_type != NET_ATTACH_TYPE_TCX_EGRESS) {
|
||||
p_err("'prepend' is only supported for tcx_ingress/tcx_egress");
|
||||
err = -EINVAL;
|
||||
goto cleanup;
|
||||
}
|
||||
prepend = true;
|
||||
} else {
|
||||
p_err("expected 'overwrite', got: '%s'?", *argv);
|
||||
p_err("expected 'overwrite' or 'prepend', got: '%s'?", *argv);
|
||||
err = -EINVAL;
|
||||
goto cleanup;
|
||||
}
|
||||
|
|
@ -728,7 +751,7 @@ static int do_attach(int argc, char **argv)
|
|||
/* attach tcx prog */
|
||||
case NET_ATTACH_TYPE_TCX_INGRESS:
|
||||
case NET_ATTACH_TYPE_TCX_EGRESS:
|
||||
err = do_attach_tcx(progfd, attach_type, ifindex);
|
||||
err = do_attach_tcx(progfd, attach_type, ifindex, prepend);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
|
@ -985,7 +1008,7 @@ static int do_help(int argc, char **argv)
|
|||
|
||||
fprintf(stderr,
|
||||
"Usage: %1$s %2$s { show | list } [dev <devname>]\n"
|
||||
" %1$s %2$s attach ATTACH_TYPE PROG dev <devname> [ overwrite ]\n"
|
||||
" %1$s %2$s attach ATTACH_TYPE PROG dev <devname> [ overwrite | prepend ]\n"
|
||||
" %1$s %2$s detach ATTACH_TYPE dev <devname>\n"
|
||||
" %1$s %2$s help\n"
|
||||
"\n"
|
||||
|
|
|
|||
|
|
@ -70,7 +70,8 @@ HOSTCFLAGS_resolve_btfids += -g \
|
|||
-I$(srctree)/tools/include/uapi \
|
||||
-I$(LIBBPF_INCLUDE) \
|
||||
-I$(SUBCMD_INCLUDE) \
|
||||
$(LIBELF_FLAGS)
|
||||
$(LIBELF_FLAGS) \
|
||||
-Wall -Werror
|
||||
|
||||
LIBS = $(LIBELF_LIBS) -lz
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order {
|
|||
BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */
|
||||
BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */
|
||||
BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */
|
||||
/*
|
||||
* Walks the immediate children of the specified parent
|
||||
* cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE,
|
||||
* BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP
|
||||
* the iterator does not include the specified parent as one of the
|
||||
* returned iterator elements.
|
||||
*/
|
||||
BPF_CGROUP_ITER_CHILDREN,
|
||||
};
|
||||
|
||||
union bpf_iter_link_info {
|
||||
|
|
@ -918,6 +926,16 @@ union bpf_iter_link_info {
|
|||
* Number of bytes read from the stream on success, or -1 if an
|
||||
* error occurred (in which case, *errno* is set appropriately).
|
||||
*
|
||||
* BPF_PROG_ASSOC_STRUCT_OPS
|
||||
* Description
|
||||
* Associate a BPF program with a struct_ops map. The struct_ops
|
||||
* map is identified by *map_fd* and the BPF program is
|
||||
* identified by *prog_fd*.
|
||||
*
|
||||
* Return
|
||||
* 0 on success or -1 if an error occurred (in which case,
|
||||
* *errno* is set appropriately).
|
||||
*
|
||||
* NOTES
|
||||
* eBPF objects (maps and programs) can be shared between processes.
|
||||
*
|
||||
|
|
@ -974,6 +992,7 @@ enum bpf_cmd {
|
|||
BPF_PROG_BIND_MAP,
|
||||
BPF_TOKEN_CREATE,
|
||||
BPF_PROG_STREAM_READ_BY_FD,
|
||||
BPF_PROG_ASSOC_STRUCT_OPS,
|
||||
__MAX_BPF_CMD,
|
||||
};
|
||||
|
||||
|
|
@ -1134,6 +1153,7 @@ enum bpf_attach_type {
|
|||
BPF_NETKIT_PEER,
|
||||
BPF_TRACE_KPROBE_SESSION,
|
||||
BPF_TRACE_UPROBE_SESSION,
|
||||
BPF_TRACE_FSESSION,
|
||||
__MAX_BPF_ATTACH_TYPE
|
||||
};
|
||||
|
||||
|
|
@ -1373,6 +1393,8 @@ enum {
|
|||
BPF_NOEXIST = 1, /* create new element if it didn't exist */
|
||||
BPF_EXIST = 2, /* update existing element */
|
||||
BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */
|
||||
BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */
|
||||
BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */
|
||||
};
|
||||
|
||||
/* flags for BPF_MAP_CREATE command */
|
||||
|
|
@ -1894,6 +1916,12 @@ union bpf_attr {
|
|||
__u32 prog_fd;
|
||||
} prog_stream_read;
|
||||
|
||||
struct {
|
||||
__u32 map_fd;
|
||||
__u32 prog_fd;
|
||||
__u32 flags;
|
||||
} prog_assoc_struct_ops;
|
||||
|
||||
} __attribute__((aligned(8)));
|
||||
|
||||
/* The description below is an attempt at providing documentation to eBPF
|
||||
|
|
|
|||
|
|
@ -794,6 +794,7 @@ int bpf_link_create(int prog_fd, int target_fd,
|
|||
case BPF_TRACE_FENTRY:
|
||||
case BPF_TRACE_FEXIT:
|
||||
case BPF_MODIFY_RETURN:
|
||||
case BPF_TRACE_FSESSION:
|
||||
case BPF_LSM_MAC:
|
||||
attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0);
|
||||
if (!OPTS_ZEROED(opts, tracing))
|
||||
|
|
@ -1397,3 +1398,22 @@ int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len,
|
|||
err = sys_bpf(BPF_PROG_STREAM_READ_BY_FD, &attr, attr_sz);
|
||||
return libbpf_err_errno(err);
|
||||
}
|
||||
|
||||
int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd,
|
||||
struct bpf_prog_assoc_struct_ops_opts *opts)
|
||||
{
|
||||
const size_t attr_sz = offsetofend(union bpf_attr, prog_assoc_struct_ops);
|
||||
union bpf_attr attr;
|
||||
int err;
|
||||
|
||||
if (!OPTS_VALID(opts, bpf_prog_assoc_struct_ops_opts))
|
||||
return libbpf_err(-EINVAL);
|
||||
|
||||
memset(&attr, 0, attr_sz);
|
||||
attr.prog_assoc_struct_ops.map_fd = map_fd;
|
||||
attr.prog_assoc_struct_ops.prog_fd = prog_fd;
|
||||
attr.prog_assoc_struct_ops.flags = OPTS_GET(opts, flags, 0);
|
||||
|
||||
err = sys_bpf(BPF_PROG_ASSOC_STRUCT_OPS, &attr, attr_sz);
|
||||
return libbpf_err_errno(err);
|
||||
}
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue