linux/include/trace/events/kmem.h
Kalesh Singh 079c24d569 mm/tracing: rss_stat: ensure curr is false from kthread context
The rss_stat trace event allows userspace tools, like Perfetto [1], to
inspect per-process RSS metric changes over time.

The curr field was introduced to rss_stat in commit e4dcad204d
("rss_stat: add support to detect RSS updates of external mm").  Its
intent is to indicate whether the RSS update is for the mm_struct of the
current execution context; and is set to false when operating on a remote
mm_struct (e.g., via kswapd or a direct reclaimer).

However, an issue arises when a kernel thread temporarily adopts a user
process's mm_struct.  Kernel threads do not have their own mm_struct and
normally have current->mm set to NULL.  To operate on user memory, they
can "borrow" a memory context using kthread_use_mm(), which sets
current->mm to the user process's mm.

This can be observed, for example, in the USB Function Filesystem (FFS)
driver.  The ffs_user_copy_worker() handles AIO completions and uses
kthread_use_mm() to copy data to a user-space buffer.  If a page fault
occurs during this copy, the fault handler executes in the kthread's
context.

At this point, current is the kthread, but current->mm points to the user
process's mm.  Since the rss_stat event (from the page fault) is for that
same mm, the condition current->mm == mm becomes true, causing curr to be
incorrectly set to true when the trace event is emitted.

This is misleading because it suggests the mm belongs to the kthread,
confusing userspace tools that track per-process RSS changes and
corrupting their mm_id-to-process association.

Fix this by ensuring curr is always false when the trace event is emitted
from a kthread context by checking for the PF_KTHREAD flag.

Link: https://lkml.kernel.org/r/20260219233708.1971199-1-kaleshsingh@google.com
Link: https://perfetto.dev/ [1]
Fixes: e4dcad204d ("rss_stat: add support to detect RSS updates of external mm")
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Acked-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Cc: "David Hildenbrand (Arm)" <david@kernel.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>	[5.10+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2026-02-24 11:13:27 -08:00

464 lines
11 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kmem
#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KMEM_H
#include <linux/types.h>
#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>
/*
 * kmem_cache_alloc trace event.
 *
 * Records the call site, the returned object pointer, the cache name
 * (s->name), the object size (s->object_size) as bytes_req, the slab
 * slot size (s->size) as bytes_alloc, the gfp flags and the node.
 *
 * "accounted" is evaluated at record time: it is true only when
 * CONFIG_MEMCG is enabled and either __GFP_ACCOUNT is set in gfp_flags
 * or SLAB_ACCOUNT is set in s->flags.
 */
TRACE_EVENT(kmem_cache_alloc,
TP_PROTO(unsigned long call_site,
const void *ptr,
struct kmem_cache *s,
gfp_t gfp_flags,
int node),
TP_ARGS(call_site, ptr, s, gfp_flags, node),
TP_STRUCT__entry(
__field( unsigned long, call_site )
__field( const void *, ptr )
__string( name, s->name )
__field( size_t, bytes_req )
__field( size_t, bytes_alloc )
__field( unsigned long, gfp_flags )
__field( int, node )
__field( bool, accounted )
),
TP_fast_assign(
__entry->call_site = call_site;
__entry->ptr = ptr;
__assign_str(name);
__entry->bytes_req = s->object_size;
__entry->bytes_alloc = s->size;
__entry->gfp_flags = (__force unsigned long)gfp_flags;
__entry->node = node;
/* Without CONFIG_MEMCG the allocation is never reported as accounted. */
__entry->accounted = IS_ENABLED(CONFIG_MEMCG) ?
((gfp_flags & __GFP_ACCOUNT) ||
(s->flags & SLAB_ACCOUNT)) : false;
),
TP_printk("call_site=%pS ptr=%p name=%s bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
(void *)__entry->call_site,
__entry->ptr,
__get_str(name),
__entry->bytes_req,
__entry->bytes_alloc,
show_gfp_flags(__entry->gfp_flags),
__entry->node,
__entry->accounted ? "true" : "false")
);
/*
 * kmalloc trace event.
 *
 * Records the call site, the returned pointer, the requested and
 * allocated byte counts, the gfp flags and the node, all as passed in
 * by the caller.
 *
 * Unlike kmem_cache_alloc, "accounted" is not a stored field: it is
 * derived at print time from the stored gfp_flags (__GFP_ACCOUNT) and
 * CONFIG_MEMCG.
 */
TRACE_EVENT(kmalloc,
TP_PROTO(unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
gfp_t gfp_flags,
int node),
TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
TP_STRUCT__entry(
__field( unsigned long, call_site )
__field( const void *, ptr )
__field( size_t, bytes_req )
__field( size_t, bytes_alloc )
__field( unsigned long, gfp_flags )
__field( int, node )
),
TP_fast_assign(
__entry->call_site = call_site;
__entry->ptr = ptr;
__entry->bytes_req = bytes_req;
__entry->bytes_alloc = bytes_alloc;
__entry->gfp_flags = (__force unsigned long)gfp_flags;
__entry->node = node;
),
TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
(void *)__entry->call_site,
__entry->ptr,
__entry->bytes_req,
__entry->bytes_alloc,
show_gfp_flags(__entry->gfp_flags),
__entry->node,
/* Computed from the recorded gfp_flags; there is no "accounted" field. */
(IS_ENABLED(CONFIG_MEMCG) &&
(__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false")
);
/*
 * kfree trace event.
 *
 * Records the call site and the pointer handed to it; the call site is
 * printed symbolically (%pS).
 */
TRACE_EVENT(kfree,
TP_PROTO(unsigned long call_site, const void *ptr),
TP_ARGS(call_site, ptr),
TP_STRUCT__entry(
__field( unsigned long, call_site )
__field( const void *, ptr )
),
TP_fast_assign(
__entry->call_site = call_site;
__entry->ptr = ptr;
),
TP_printk("call_site=%pS ptr=%p",
(void *)__entry->call_site, __entry->ptr)
);
/*
 * kmem_cache_free trace event.
 *
 * Records the call site, the object pointer and the name of the cache
 * (s->name) the object is associated with.
 */
TRACE_EVENT(kmem_cache_free,
TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s),
TP_ARGS(call_site, ptr, s),
TP_STRUCT__entry(
__field( unsigned long, call_site )
__field( const void *, ptr )
__string( name, s->name )
),
TP_fast_assign(
__entry->call_site = call_site;
__entry->ptr = ptr;
__assign_str(name);
),
TP_printk("call_site=%pS ptr=%p name=%s",
(void *)__entry->call_site, __entry->ptr, __get_str(name))
);
/*
 * mm_page_free trace event.
 *
 * Stores the page's pfn (not the struct page pointer) together with the
 * order. The page pointer shown in the output is recomputed from the
 * stored pfn with pfn_to_page() at print time.
 */
TRACE_EVENT(mm_page_free,
TP_PROTO(struct page *page, unsigned int order),
TP_ARGS(page, order),
TP_STRUCT__entry(
__field( unsigned long, pfn )
__field( unsigned int, order )
),
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
__entry->order = order;
),
TP_printk("page=%p pfn=0x%lx order=%d",
pfn_to_page(__entry->pfn),
__entry->pfn,
__entry->order)
);
/*
 * mm_page_free_batched trace event.
 *
 * Stores only the page's pfn. The output always reports "order=0": the
 * order is a literal in the format string, not a recorded field.
 */
TRACE_EVENT(mm_page_free_batched,
TP_PROTO(struct page *page),
TP_ARGS(page),
TP_STRUCT__entry(
__field( unsigned long, pfn )
),
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
),
TP_printk("page=%p pfn=0x%lx order=0",
pfn_to_page(__entry->pfn),
__entry->pfn)
);
/*
 * mm_page_alloc trace event.
 *
 * Records the pfn, order, gfp flags and migratetype. A NULL page is
 * tolerated: it is stored as the sentinel pfn -1UL, and at print time
 * that sentinel is turned into a NULL page pointer and a pfn of 0.
 */
TRACE_EVENT(mm_page_alloc,
TP_PROTO(struct page *page, unsigned int order,
gfp_t gfp_flags, int migratetype),
TP_ARGS(page, order, gfp_flags, migratetype),
TP_STRUCT__entry(
__field( unsigned long, pfn )
__field( unsigned int, order )
__field( unsigned long, gfp_flags )
__field( int, migratetype )
),
TP_fast_assign(
/* -1UL marks "no page" so the print side can detect it. */
__entry->pfn = page ? page_to_pfn(page) : -1UL;
__entry->order = order;
__entry->gfp_flags = (__force unsigned long)gfp_flags;
__entry->migratetype = migratetype;
),
TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s",
__entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
__entry->pfn != -1UL ? __entry->pfn : 0,
__entry->order,
__entry->migratetype,
show_gfp_flags(__entry->gfp_flags))
);
/*
 * mm_page event class.
 *
 * Shared layout for events that report a page together with its order,
 * migratetype and a percpu_refill value. A NULL page is stored as the
 * sentinel pfn -1UL and printed as a NULL page pointer with pfn 0.
 */
DECLARE_EVENT_CLASS(mm_page,
TP_PROTO(struct page *page, unsigned int order, int migratetype,
int percpu_refill),
TP_ARGS(page, order, migratetype, percpu_refill),
TP_STRUCT__entry(
__field( unsigned long, pfn )
__field( unsigned int, order )
__field( int, migratetype )
__field( int, percpu_refill )
),
TP_fast_assign(
/* -1UL marks "no page" so the print side can detect it. */
__entry->pfn = page ? page_to_pfn(page) : -1UL;
__entry->order = order;
__entry->migratetype = migratetype;
__entry->percpu_refill = percpu_refill;
),
TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d",
__entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
__entry->pfn != -1UL ? __entry->pfn : 0,
__entry->order,
__entry->migratetype,
__entry->percpu_refill)
);
/*
 * mm_page_alloc_zone_locked: an event instance of the mm_page class; it
 * reuses that class's fields, assignment and output format unchanged.
 */
DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,
TP_PROTO(struct page *page, unsigned int order, int migratetype,
int percpu_refill),
TP_ARGS(page, order, migratetype, percpu_refill)
);
/*
 * mm_page_pcpu_drain trace event.
 *
 * Records the pfn, order and migratetype of the page passed in. The pfn
 * is stored instead of the page pointer; a NULL page is stored as the
 * sentinel pfn -1UL.
 *
 * At print time the sentinel is mapped to a NULL page pointer and a pfn
 * of 0, exactly as the mm_page_alloc event and the mm_page event class
 * do, instead of being passed to pfn_to_page() and printed raw.
 */
TRACE_EVENT(mm_page_pcpu_drain,

	TP_PROTO(struct page *page, unsigned int order, int migratetype),

	TP_ARGS(page, order, migratetype),

	TP_STRUCT__entry(
		__field(	unsigned long,	pfn		)
		__field(	unsigned int,	order		)
		__field(	int,		migratetype	)
	),

	TP_fast_assign(
		/* -1UL marks "no page" so the print side can detect it. */
		__entry->pfn		= page ? page_to_pfn(page) : -1UL;
		__entry->order		= order;
		__entry->migratetype	= migratetype;
	),

	TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d",
		__entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
		__entry->pfn != -1UL ? __entry->pfn : 0,
		__entry->order, __entry->migratetype)
);
/*
 * mm_page_alloc_extfrag trace event.
 *
 * Records the pfn plus the requested and fallback order/migratetype.
 *
 * change_ownership is evaluated at record time: it is 1 when
 * alloc_migratetype equals get_pageblock_migratetype(page).
 *
 * pageblock_order and "fragmenting" (fallback_order < pageblock_order)
 * are not stored; both are evaluated when the event is printed.
 */
TRACE_EVENT(mm_page_alloc_extfrag,
TP_PROTO(struct page *page,
int alloc_order, int fallback_order,
int alloc_migratetype, int fallback_migratetype),
TP_ARGS(page,
alloc_order, fallback_order,
alloc_migratetype, fallback_migratetype),
TP_STRUCT__entry(
__field( unsigned long, pfn )
__field( int, alloc_order )
__field( int, fallback_order )
__field( int, alloc_migratetype )
__field( int, fallback_migratetype )
__field( int, change_ownership )
),
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
__entry->alloc_order = alloc_order;
__entry->fallback_order = fallback_order;
__entry->alloc_migratetype = alloc_migratetype;
__entry->fallback_migratetype = fallback_migratetype;
__entry->change_ownership = (alloc_migratetype ==
get_pageblock_migratetype(page));
),
TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
pfn_to_page(__entry->pfn),
__entry->pfn,
__entry->alloc_order,
__entry->fallback_order,
pageblock_order,
__entry->alloc_migratetype,
__entry->fallback_migratetype,
__entry->fallback_order < pageblock_order,
__entry->change_ownership)
);
/*
 * mm_setup_per_zone_wmarks trace event.
 *
 * Snapshots a zone's node id (zone->zone_pgdat->node_id), its name and
 * the four entries of zone->_watermark[] indexed by WMARK_MIN,
 * WMARK_LOW, WMARK_HIGH and WMARK_PROMO.
 */
TRACE_EVENT(mm_setup_per_zone_wmarks,
TP_PROTO(struct zone *zone),
TP_ARGS(zone),
TP_STRUCT__entry(
__field(int, node_id)
__string(name, zone->name)
__field(unsigned long, watermark_min)
__field(unsigned long, watermark_low)
__field(unsigned long, watermark_high)
__field(unsigned long, watermark_promo)
),
TP_fast_assign(
__entry->node_id = zone->zone_pgdat->node_id;
__assign_str(name);
__entry->watermark_min = zone->_watermark[WMARK_MIN];
__entry->watermark_low = zone->_watermark[WMARK_LOW];
__entry->watermark_high = zone->_watermark[WMARK_HIGH];
__entry->watermark_promo = zone->_watermark[WMARK_PROMO];
),
TP_printk("node_id=%d zone name=%s watermark min=%lu low=%lu high=%lu promo=%lu",
__entry->node_id,
__get_str(name),
__entry->watermark_min,
__entry->watermark_low,
__entry->watermark_high,
__entry->watermark_promo)
);
/*
 * mm_setup_per_zone_lowmem_reserve trace event.
 *
 * Records the node id and name of @zone, the name of @upper_zone and
 * the lowmem_reserve value supplied by the caller (printed as
 * "lowmem_reserve_pages").
 */
TRACE_EVENT(mm_setup_per_zone_lowmem_reserve,
TP_PROTO(struct zone *zone, struct zone *upper_zone, long lowmem_reserve),
TP_ARGS(zone, upper_zone, lowmem_reserve),
TP_STRUCT__entry(
__field(int, node_id)
__string(name, zone->name)
__string(upper_name, upper_zone->name)
__field(long, lowmem_reserve)
),
TP_fast_assign(
__entry->node_id = zone->zone_pgdat->node_id;
__assign_str(name);
__assign_str(upper_name);
__entry->lowmem_reserve = lowmem_reserve;
),
TP_printk("node_id=%d zone name=%s upper_zone name=%s lowmem_reserve_pages=%ld",
__entry->node_id,
__get_str(name),
__get_str(upper_name),
__entry->lowmem_reserve)
);
/*
 * mm_calculate_totalreserve_pages trace event.
 *
 * Records the single totalreserve_pages value supplied by the caller.
 */
TRACE_EVENT(mm_calculate_totalreserve_pages,
TP_PROTO(unsigned long totalreserve_pages),
TP_ARGS(totalreserve_pages),
TP_STRUCT__entry(
__field(unsigned long, totalreserve_pages)
),
TP_fast_assign(
__entry->totalreserve_pages = totalreserve_pages;
),
TP_printk("totalreserve_pages=%lu", __entry->totalreserve_pages)
);
/*
* Required for uniquely and securely identifying mm in rss_stat tracepoint.
*/
#ifndef __PTR_TO_HASHVAL
/*
 * Map a pointer to the hashed identifier produced by ptr_to_hashval().
 *
 * Returns 0 when ptr_to_hashval() reports failure (non-zero return);
 * otherwise returns the hash value converted to unsigned int.
 */
static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
{
	unsigned long hashed;

	if (ptr_to_hashval(ptr, &hashed) != 0)
		return 0;

	/* Only 32 bits of the hash are meaningful, so narrowing is fine. */
	return (unsigned int)hashed;
}
#define __PTR_TO_HASHVAL
#endif
/*
 * List of the rss_stat member enum values, written with the EM()/EMe()
 * placeholder macros so the same list can be expanded in two ways below.
 * EMe() marks the final entry.
 */
#define TRACE_MM_PAGES \
EM(MM_FILEPAGES) \
EM(MM_ANONPAGES) \
EM(MM_SWAPENTS) \
EMe(MM_SHMEMPAGES)
/*
 * First expansion: pass every listed enum value to TRACE_DEFINE_ENUM().
 */
#undef EM
#undef EMe
#define EM(a) TRACE_DEFINE_ENUM(a);
#define EMe(a) TRACE_DEFINE_ENUM(a);
TRACE_MM_PAGES
/*
 * Second definition: each entry becomes a { value, "name" } pair, so that
 * TRACE_MM_PAGES can be used as the table argument of __print_symbolic().
 * The last entry (EMe) carries no trailing comma.
 */
#undef EM
#undef EMe
#define EM(a) { a, #a },
#define EMe(a) { a, #a }
/*
 * rss_stat trace event.
 *
 * Fields:
 *   mm_id  - hashed identifier of @mm, from mm_ptr_to_hash().
 *   curr   - non-zero only when @mm is current->mm and current is not a
 *            kthread (see the comment in TP_fast_assign).
 *   member - index into mm->rss_stat[]; printed symbolically through the
 *            TRACE_MM_PAGES table.
 *   size   - percpu_counter_sum_positive() of mm->rss_stat[member],
 *            shifted left by PAGE_SHIFT; printed with a "B" suffix.
 */
TRACE_EVENT(rss_stat,
TP_PROTO(struct mm_struct *mm,
int member),
TP_ARGS(mm, member),
TP_STRUCT__entry(
__field(unsigned int, mm_id)
__field(unsigned int, curr)
__field(int, member)
__field(long, size)
),
TP_fast_assign(
__entry->mm_id = mm_ptr_to_hash(mm);
/*
* curr is true if the mm matches the current task's mm_struct.
* Since kthreads (PF_KTHREAD) have no mm_struct of their own
* but can borrow one via kthread_use_mm(), we must filter them
* out to avoid incorrectly attributing the RSS update to them.
*/
__entry->curr = current->mm == mm && !(current->flags & PF_KTHREAD);
__entry->member = member;
/* Counter value shifted by PAGE_SHIFT before being stored. */
__entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
<< PAGE_SHIFT);
),
TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
__entry->mm_id,
__entry->curr,
__print_symbolic(__entry->member, TRACE_MM_PAGES),
__entry->size)
);
#endif /* _TRACE_KMEM_H */
/* This part must be outside protection */
#include <trace/define_trace.h>