diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8881198e85c6..3e51190a55e4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1534,16 +1534,27 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
 #include <linux/memory_hotplug.h>
 
 void build_all_zonelists(pg_data_t *pgdat);
-void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
-		   enum zone_type highest_zoneidx);
-void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
-				unsigned int order, int highest_zoneidx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 			 int highest_zoneidx, unsigned int alloc_flags,
 			 long free_pages);
 bool zone_watermark_ok(struct zone *z, unsigned int order,
 		       unsigned long mark, int highest_zoneidx,
 		       unsigned int alloc_flags);
+
+enum kswapd_clear_hopeless_reason {
+	KSWAPD_CLEAR_HOPELESS_OTHER = 0,
+	KSWAPD_CLEAR_HOPELESS_KSWAPD,
+	KSWAPD_CLEAR_HOPELESS_DIRECT,
+	KSWAPD_CLEAR_HOPELESS_PCP,
+};
+
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
+		   enum zone_type highest_zoneidx);
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+				unsigned int order, int highest_zoneidx);
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason);
+bool kswapd_test_hopeless(pg_data_t *pgdat);
+
 /*
  * Memory initialization context, use to differentiate memory added by
  * the platform statically or via memory hotplug interface.
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 490958fa10de..ea58e4656abf 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -40,6 +40,16 @@
 		{_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \
 		) : "VMSCAN_THROTTLE_NONE"
 
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_OTHER);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_KSWAPD);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_DIRECT);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_PCP);
+
+#define kswapd_clear_hopeless_reason_ops \
+	{KSWAPD_CLEAR_HOPELESS_KSWAPD, "KSWAPD"}, \
+	{KSWAPD_CLEAR_HOPELESS_DIRECT, "DIRECT"}, \
+	{KSWAPD_CLEAR_HOPELESS_PCP, "PCP"}, \
+	{KSWAPD_CLEAR_HOPELESS_OTHER, "OTHER"}
 
 #define trace_reclaim_flags(file) ( \
 	(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
@@ -535,6 +545,47 @@ TRACE_EVENT(mm_vmscan_throttled,
 		__entry->usec_delayed,
 		show_throttle_flags(__entry->reason))
 );
+
+TRACE_EVENT(mm_vmscan_kswapd_reclaim_fail,
+
+	TP_PROTO(int nid, int failures),
+
+	TP_ARGS(nid, failures),
+
+	TP_STRUCT__entry(
+		__field(int, nid)
+		__field(int, failures)
+	),
+
+	TP_fast_assign(
+		__entry->nid = nid;
+		__entry->failures = failures;
+	),
+
+	TP_printk("nid=%d failures=%d",
+		__entry->nid, __entry->failures)
+);
+
+TRACE_EVENT(mm_vmscan_kswapd_clear_hopeless,
+
+	TP_PROTO(int nid, int reason),
+
+	TP_ARGS(nid, reason),
+
+	TP_STRUCT__entry(
+		__field(int, nid)
+		__field(int, reason)
+	),
+
+	TP_fast_assign(
+		__entry->nid = nid;
+		__entry->reason = reason;
+	),
+
+	TP_printk("nid=%d reason=%s",
+		__entry->nid,
+		__print_symbolic(__entry->reason, kswapd_clear_hopeless_reason_ops))
+);
 #endif /* _TRACE_VMSCAN_H */
 
 /* This part must be outside protection */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 7ec442776574..0ae8bec86346 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -955,7 +955,7 @@ static ssize_t demotion_enabled_store(struct kobject *kobj,
 		struct pglist_data *pgdat;
 
 		for_each_online_pgdat(pgdat)
-			atomic_set(&pgdat->kswapd_failures, 0);
+			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER);
 	}
 
 	return count;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e779b18168de..2c70ba9d5cc6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2945,9 +2945,9 @@ static bool free_frozen_page_commit(struct zone *zone,
		 * 'hopeless node' to stay in that state for a while. Let
		 * kswapd work again by resetting kswapd_failures.
		 */
-		if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
+		if (kswapd_test_hopeless(pgdat) &&
		    next_memory_node(pgdat->node_id) < MAX_NUMNODES)
-			atomic_set(&pgdat->kswapd_failures, 0);
+			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP);
 	}
 	return ret;
 }
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 3a4b5207635d..24078ac3e6bc 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -278,8 +278,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 #endif
			K(node_page_state(pgdat, NR_PAGETABLE)),
			K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
-			str_yes_no(atomic_read(&pgdat->kswapd_failures) >=
-				   MAX_RECLAIM_RETRIES),
+			str_yes_no(kswapd_test_hopeless(pgdat)),
			K(node_page_state(pgdat, NR_BALLOON_PAGES)));
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5d9b1bce6f01..1d281174164e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -506,7 +506,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
	 * If kswapd is disabled, reschedule if necessary but do not
	 * throttle as the system is likely near OOM.
	 */
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+	if (kswapd_test_hopeless(pgdat))
		return true;
 
	/*
@@ -6437,7 +6437,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
	int i;
	bool wmark_ok;
 
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+	if (kswapd_test_hopeless(pgdat))
		return true;
 
	for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
@@ -6846,7 +6846,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
		wake_up_all(&pgdat->pfmemalloc_wait);
 
	/* Hopeless node, leave it to direct reclaim */
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+	if (kswapd_test_hopeless(pgdat))
		return true;
 
	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
@@ -7111,8 +7111,11 @@ restart:
	 * watermark_high at this point. We need to avoid increasing the
	 * failure count to prevent the kswapd thread from stopping.
	 */
-	if (!sc.nr_reclaimed && !boosted)
-		atomic_inc(&pgdat->kswapd_failures);
+	if (!sc.nr_reclaimed && !boosted) {
+		int fail_cnt = atomic_inc_return(&pgdat->kswapd_failures);
+		/* kswapd context, low overhead to trace every failure */
+		trace_mm_vmscan_kswapd_reclaim_fail(pgdat->node_id, fail_cnt);
+	}
 
 out:
	clear_reclaim_active(pgdat, highest_zoneidx);
@@ -7371,7 +7374,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
		return;
 
	/* Hopeless node, leave it to direct reclaim if possible */
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES ||
+	if (kswapd_test_hopeless(pgdat) ||
	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
		/*
@@ -7391,9 +7394,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
-static void kswapd_clear_hopeless(pg_data_t *pgdat)
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason)
 {
-	atomic_set(&pgdat->kswapd_failures, 0);
+	/* Only trace actual resets, not redundant zero-to-zero */
+	if (atomic_xchg(&pgdat->kswapd_failures, 0))
+		trace_mm_vmscan_kswapd_clear_hopeless(pgdat->node_id, reason);
 }
 
 /*
@@ -7406,7 +7411,13 @@ void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
				unsigned int order, int highest_zoneidx)
 {
	if (pgdat_balanced(pgdat, order, highest_zoneidx))
-		kswapd_clear_hopeless(pgdat);
+		kswapd_clear_hopeless(pgdat, current_is_kswapd() ?
+			KSWAPD_CLEAR_HOPELESS_KSWAPD : KSWAPD_CLEAR_HOPELESS_DIRECT);
+}
+
+bool kswapd_test_hopeless(pg_data_t *pgdat)
+{
+	return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES;
 }
 
 #ifdef CONFIG_HIBERNATION
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0f64c898f79f..23e176e1d09d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1840,7 +1840,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
		   "\n  start_pfn:           %lu"
		   "\n  reserved_highatomic: %lu"
		   "\n  free_highatomic:     %lu",
-		   atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
+		   kswapd_test_hopeless(pgdat),
		   zone->zone_start_pfn,
		   zone->nr_reserved_highatomic,
		   zone->nr_free_highatomic);