mirror of
https://github.com/torvalds/linux.git
synced 2026-03-07 23:04:33 +01:00
Now, the swap cache is managed by the swap table. All swap cache users are checking the swap table directly to check the swap cache state. SWAP_HAS_CACHE is now just a temporary pin before the first increase from 0 to 1 of a slot's swap count (swap_dup_entries) after swap allocation (folio_alloc_swap), or before the final free of slots pinned by folio in swap cache (put_swap_folio). Drop these two usages. For the first dup, SWAP_HAS_CACHE pinning was hard to kill because it used to have multiple meanings, more than just "a slot is cached". We have just simplified that and defined that the first dup is always done with folio locked in swap cache (folio_dup_swap), so stop checking the SWAP_HAS_CACHE bit and just check the swap cache (swap table) directly, and add a WARN if a swap entry's count is being increased for the first time while the folio is not in swap cache. As for freeing, just let the swap cache free all swap entries of a folio that have a swap count of zero directly upon folio removal. We have also just cleaned up batch freeing to check the swap cache usage using the swap table: a slot with swap cache in the swap table will not be freed until its cache is gone, and no SWAP_HAS_CACHE bit is involved anymore. And besides, the removal of a folio and freeing of the slots are being done in the same critical section now, which should improve the performance. After these two changes, SWAP_HAS_CACHE no longer has any users. Swap cache synchronization is also done by the swap table directly, so using SWAP_HAS_CACHE to pin a slot before adding the cache is also no longer needed. Remove all related logic and helpers. swap_map is now only used for tracking the count, so all swap_map users can just read it directly, ignoring the swap_count helper, which was previously used to filter out the SWAP_HAS_CACHE bit. 
The idea of dropping SWAP_HAS_CACHE and using the swap table directly was initially from Chris's idea of merging all the metadata usage of all swaps into one place. Link: https://lkml.kernel.org/r/20251220-swap-table-p2-v5-18-8862a265a033@tencent.com Signed-off-by: Kairui Song <kasong@tencent.com> Suggested-by: Chris Li <chrisl@kernel.org> Reviewed-by: Baoquan He <bhe@redhat.com> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Barry Song <baohua@kernel.org> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Rafael J. Wysocki (Intel) <rafael@kernel.org> Cc: Yosry Ahmed <yosry.ahmed@linux.dev> Cc: Deepanshu Kartikey <kartikey406@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kairui Song <ryncsn@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
494 lines
14 KiB
C
494 lines
14 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _MM_SWAP_H
|
|
#define _MM_SWAP_H
|
|
|
|
#include <linux/atomic.h> /* for atomic_long_t */
|
|
struct mempolicy;
|
|
struct swap_iocb;
|
|
|
|
extern int page_cluster;
|
|
|
|
#ifdef CONFIG_THP_SWAP
/* With THP swap, a swap cluster serves one PMD-sized huge page. */
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR
/* Large folio swap allocations keep their folio order. */
#define swap_entry_order(order)	(order)
#else
#define SWAPFILE_CLUSTER	256
/* Without THP swap, every swap allocation is order 0. */
#define swap_entry_order(order)	0
#endif
|
|
|
|
extern struct swap_info_struct *swap_info[];
|
|
|
|
/*
 * We use this to track usage of a cluster. A cluster is a block of swap disk
 * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
 * free clusters are organized into a list. We fetch an entry from the list to
 * get a free cluster.
 *
 * The flags field determines if a cluster is free. This is
 * protected by cluster lock.
 */
struct swap_cluster_info {
	spinlock_t lock;	/*
				 * Protect swap_cluster_info fields
				 * other than list, and swap_info_struct->swap_map
				 * elements corresponding to the swap cluster.
				 */
	u16 count;		/* usage count of the cluster's slots */
	u8 flags;		/* enum swap_cluster_flags; protected by lock */
	u8 order;		/* allocation order this cluster serves */
	atomic_long_t __rcu *table;	/* Swap table entries, see mm/swap_table.h */
	struct list_head list;	/* linkage on the per-flag cluster lists */
};
|
|
|
|
/* All on-list cluster must have a non-zero flag. */
enum swap_cluster_flags {
	CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */
	CLUSTER_FLAG_FREE,	/* fully free, on the free list */
	CLUSTER_FLAG_NONFULL,	/* has free slots, on the nonfull list */
	CLUSTER_FLAG_FRAG,	/* fragmented, on the frag list */
	/* Clusters with flags above are allocatable */
	CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG,
	CLUSTER_FLAG_FULL,	/* no free slots left */
	CLUSTER_FLAG_DISCARD,	/* queued for discard */
	CLUSTER_FLAG_MAX,
};
|
|
|
|
#ifdef CONFIG_SWAP
|
|
#include <linux/swapops.h> /* for swp_offset */
|
|
#include <linux/blk_types.h> /* for bio_end_io_t */
|
|
|
|
/* Return the offset of @entry within its swap cluster. */
static inline unsigned int swp_cluster_offset(swp_entry_t entry)
{
	unsigned long off = swp_offset(entry);

	return off % SWAPFILE_CLUSTER;
}
|
|
|
|
/*
 * Callers of all helpers below must ensure the entry, type, or offset is
 * valid, and protect the swap device with reference count or locks.
 */
static inline struct swap_info_struct *__swap_type_to_info(int type)
{
	struct swap_info_struct *si;

	si = READ_ONCE(swap_info[type]); /* rcu_dereference() */
	VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
	return si;
}

/* Map a swap entry to the swap_info_struct of the device it belongs to. */
static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry)
{
	return __swap_type_to_info(swp_type(entry));
}
|
|
|
|
/* Map a slot offset on @si to the swap_cluster_info covering it. */
static inline struct swap_cluster_info *__swap_offset_to_cluster(
		struct swap_info_struct *si, pgoff_t offset)
{
	VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
	VM_WARN_ON_ONCE(offset >= si->max);
	return &si->cluster_info[offset / SWAPFILE_CLUSTER];
}

/* Map a swap entry to the cluster that contains it. */
static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry)
{
	return __swap_offset_to_cluster(__swap_entry_to_info(entry),
					swp_offset(entry));
}
|
|
|
|
/*
 * Common implementation of swap_cluster_lock() and the
 * swap_cluster_get_and_lock*() helpers: look up the cluster covering
 * @offset on @si and take its spinlock, disabling IRQs when @irq is set.
 */
static __always_inline struct swap_cluster_info *__swap_cluster_lock(
		struct swap_info_struct *si, unsigned long offset, bool irq)
{
	struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset);

	/*
	 * Nothing modifies swap cache in an IRQ context. All access to
	 * swap cache is wrapped by swap_cache_* helpers, and swap cache
	 * writeback is handled outside of IRQs. Swapin or swapout never
	 * occurs in IRQ, and neither does in-place split or replace.
	 *
	 * Besides, modifying swap cache requires synchronization with
	 * swap_map, which was never IRQ safe.
	 */
	VM_WARN_ON_ONCE(!in_task());
	VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
	if (irq)
		spin_lock_irq(&ci->lock);
	else
		spin_lock(&ci->lock);
	return ci;
}
|
|
|
|
/**
 * swap_cluster_lock - Lock and return the swap cluster of given offset.
 * @si: swap device the cluster belongs to.
 * @offset: the swap entry offset, pointing to a valid slot.
 *
 * Context: The caller must ensure the offset is in the valid range and
 * protect the swap device with reference count or locks.
 */
static inline struct swap_cluster_info *swap_cluster_lock(
		struct swap_info_struct *si, unsigned long offset)
{
	return __swap_cluster_lock(si, offset, false);
}
|
|
|
|
/*
 * Lock the cluster holding @folio's swap entries. The folio must be
 * locked and in the swap cache so folio->swap is stable and the cluster
 * cannot be freed under us.
 */
static inline struct swap_cluster_info *__swap_cluster_get_and_lock(
		const struct folio *folio, bool irq)
{
	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
	return __swap_cluster_lock(__swap_entry_to_info(folio->swap),
				   swp_offset(folio->swap), irq);
}
|
|
|
|
/**
 * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries.
 * @folio: The folio.
 *
 * This locks and returns the swap cluster that contains a folio's swap
 * entries. The swap entries of a folio are always in one single cluster.
 * The folio has to be locked so its swap entries won't change and the
 * cluster won't be freed.
 *
 * Context: Caller must ensure the folio is locked and in the swap cache.
 * Return: Pointer to the swap cluster.
 */
static inline struct swap_cluster_info *swap_cluster_get_and_lock(
		const struct folio *folio)
{
	return __swap_cluster_get_and_lock(folio, false);
}
|
|
|
|
/**
 * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries.
 * @folio: The folio.
 *
 * Same as swap_cluster_get_and_lock but also disable IRQ.
 *
 * Context: Caller must ensure the folio is locked and in the swap cache.
 * Return: Pointer to the swap cluster.
 */
static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq(
		const struct folio *folio)
{
	return __swap_cluster_get_and_lock(folio, true);
}
|
|
|
|
/* Release a cluster lock taken without disabling IRQs. */
static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
{
	spin_unlock(&ci->lock);
}

/* Counterpart of swap_cluster_get_and_lock_irq(): unlock and re-enable IRQs. */
static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
{
	spin_unlock_irq(&ci->lock);
}
|
|
|
|
/*
 * Below are the core routines for doing swap for a folio.
 * All helpers requires the folio to be locked, and a locked folio
 * in the swap cache pins the swap entries / slots allocated to the
 * folio, swap relies heavily on the swap cache and folio lock for
 * synchronization.
 *
 * folio_alloc_swap(): the entry point for a folio to be swapped
 * out. It allocates swap slots and pins the slots with swap cache.
 * The slots start with a swap count of zero.
 *
 * folio_dup_swap(): increases the swap count of a folio, usually
 * during it gets unmapped and a swap entry is installed to replace
 * it (e.g., swap entry in page table). A swap slot with swap
 * count == 0 should only be increased by this helper.
 *
 * folio_put_swap(): does the opposite thing of folio_dup_swap().
 */
int folio_alloc_swap(struct folio *folio);
int folio_dup_swap(struct folio *folio, struct page *subpage);
void folio_put_swap(struct folio *folio, struct page *subpage);

/* For internal use */
extern void swap_entries_free(struct swap_info_struct *si,
			      struct swap_cluster_info *ci,
			      unsigned long offset, unsigned int nr_pages);
|
|
|
|
/* linux/mm/page_io.c */
int sio_pool_init(void);
struct swap_iocb;
/* Read @folio from its swap slot; @plug batches the IO when non-NULL. */
void swap_read_folio(struct folio *folio, struct swap_iocb **plug);
/* Submit a pending batched read; use swap_read_unplug() unless plug is known non-NULL. */
void __swap_read_unplug(struct swap_iocb *plug);
|
|
/* Submit a batched swap read, if one was started. No-op for NULL @plug. */
static inline void swap_read_unplug(struct swap_iocb *plug)
{
	if (likely(!plug))
		return;
	__swap_read_unplug(plug);
}
|
|
void swap_write_unplug(struct swap_iocb *sio);
int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);

/* linux/mm/swap_state.c */
extern struct address_space swap_space __read_mostly;
/* All swap entries share the single global swap address_space; @entry is unused. */
static inline struct address_space *swap_address_space(swp_entry_t entry)
{
	return &swap_space;
}
|
|
|
|
/*
 * Return the swap device position of the swap entry.
 */
static inline loff_t swap_dev_pos(swp_entry_t entry)
{
	/* Each slot covers one page, so the byte position is offset << PAGE_SHIFT. */
	return ((loff_t)swp_offset(entry)) << PAGE_SHIFT;
}
|
|
|
|
/**
 * folio_matches_swap_entry - Check if a folio matches a given swap entry.
 * @folio: The folio.
 * @entry: The swap entry to check against.
 *
 * Context: The caller should have the folio locked to ensure it's stable
 * and nothing will move it in or out of the swap cache.
 * Return: true or false.
 */
static inline bool folio_matches_swap_entry(const struct folio *folio,
					    swp_entry_t entry)
{
	swp_entry_t folio_entry = folio->swap;
	long nr_pages = folio_nr_pages(folio);

	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	if (!folio_test_swapcache(folio))
		return false;
	/* A folio's swap entries are naturally aligned to the folio size. */
	VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio_entry.val, nr_pages), folio);
	/* Match if @entry falls anywhere within the folio's entry range. */
	return folio_entry.val == round_down(entry.val, nr_pages);
}
|
|
|
|
/*
 * All swap cache helpers below require the caller to ensure the swap entries
 * used are valid and stabilize the device by any of the following ways:
 * - Hold a reference by get_swap_device(): this ensures a single entry is
 *   valid and increases the swap device's refcount.
 * - Locking a folio in the swap cache: this ensures the folio's swap entries
 *   are valid and pinned, also implies reference to the device.
 * - Locking anything referencing the swap entry: e.g. PTL that protects
 *   swap entries in the page table, similar to locking swap cache folio.
 * - See the comment of get_swap_device() for more complex usage.
 */
bool swap_cache_has_folio(swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry);
void *swap_cache_get_shadow(swp_entry_t entry);
void swap_cache_del_folio(struct folio *folio);
struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags,
		struct mempolicy *mpol, pgoff_t ilx,
		bool *alloced);
/* Below helpers require the caller to lock and pass in the swap cluster. */
void __swap_cache_add_folio(struct swap_cluster_info *ci,
			    struct folio *folio, swp_entry_t entry);
void __swap_cache_del_folio(struct swap_cluster_info *ci,
			    struct folio *folio, swp_entry_t entry, void *shadow);
void __swap_cache_replace_folio(struct swap_cluster_info *ci,
				struct folio *old, struct folio *new);
void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents);

void show_swap_cache_info(void);
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr,
		struct swap_iocb **plug);
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
		struct mempolicy *mpol, pgoff_t ilx);
struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
		struct vm_fault *vmf);
struct folio *swapin_folio(swp_entry_t entry, struct folio *folio);
void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
			   unsigned long addr);
|
|
|
|
/* Return the flags field of the swap device backing @folio's swap entry. */
static inline unsigned int folio_swap_flags(struct folio *folio)
{
	return __swap_entry_to_info(folio->swap)->flags;
}
|
|
|
|
/*
|
|
* Return the count of contiguous swap entries that share the same
|
|
* zeromap status as the starting entry. If is_zeromap is not NULL,
|
|
* it will return the zeromap status of the starting entry.
|
|
*/
|
|
static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
|
|
bool *is_zeromap)
|
|
{
|
|
struct swap_info_struct *sis = __swap_entry_to_info(entry);
|
|
unsigned long start = swp_offset(entry);
|
|
unsigned long end = start + max_nr;
|
|
bool first_bit;
|
|
|
|
first_bit = test_bit(start, sis->zeromap);
|
|
if (is_zeromap)
|
|
*is_zeromap = first_bit;
|
|
|
|
if (max_nr <= 1)
|
|
return max_nr;
|
|
if (first_bit)
|
|
return find_next_zero_bit(sis->zeromap, end, start) - start;
|
|
else
|
|
return find_next_bit(sis->zeromap, end, start) - start;
|
|
}
|
|
|
|
/* Count leading entries starting at @entry that are NOT in the swap cache. */
static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
{
	int nr = 0;

	/*
	 * While allocating a large folio and doing mTHP swapin, we need to
	 * ensure all entries are not cached, otherwise, the mTHP folio will
	 * be in conflict with the folio in swap cache.
	 */
	while (nr < max_nr && !swap_cache_has_folio(entry)) {
		entry.val++;
		nr++;
	}

	return nr;
}
|
|
|
|
#else /* CONFIG_SWAP */
|
|
struct swap_iocb;
|
|
static inline struct swap_cluster_info *swap_cluster_lock(
|
|
struct swap_info_struct *si, pgoff_t offset, bool irq)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
static inline struct swap_cluster_info *swap_cluster_get_and_lock(
|
|
struct folio *folio)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq(
|
|
struct folio *folio)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * CONFIG_SWAP=n stubs: no-ops and empty/NULL/false/0 results so callers
 * compile unchanged when swap support is not built in.
 */
static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
{
}

static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
{
}

static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry)
{
	return NULL;
}

static inline int folio_alloc_swap(struct folio *folio)
{
	return -EINVAL;
}

static inline int folio_dup_swap(struct folio *folio, struct page *page)
{
	return -EINVAL;
}

static inline void folio_put_swap(struct folio *folio, struct page *page)
{
}

static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
{
}

static inline void swap_write_unplug(struct swap_iocb *sio)
{
}

static inline struct address_space *swap_address_space(swp_entry_t entry)
{
	return NULL;
}

static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry)
{
	return false;
}

static inline void show_swap_cache_info(void)
{
}

static inline struct folio *swap_cluster_readahead(swp_entry_t entry,
			gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx)
{
	return NULL;
}

static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
			struct vm_fault *vmf)
{
	return NULL;
}

static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
{
	return NULL;
}

static inline void swap_update_readahead(struct folio *folio,
		struct vm_area_struct *vma, unsigned long addr)
{
}

static inline int swap_writeout(struct folio *folio,
		struct swap_iocb **swap_plug)
{
	return 0;
}

static inline bool swap_cache_has_folio(swp_entry_t entry)
{
	return false;
}

static inline struct folio *swap_cache_get_folio(swp_entry_t entry)
{
	return NULL;
}

static inline void *swap_cache_get_shadow(swp_entry_t entry)
{
	return NULL;
}

static inline void swap_cache_del_folio(struct folio *folio)
{
}

static inline void __swap_cache_del_folio(struct swap_cluster_info *ci,
		struct folio *folio, swp_entry_t entry, void *shadow)
{
}

static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci,
		struct folio *old, struct folio *new)
{
}

static inline unsigned int folio_swap_flags(struct folio *folio)
{
	return 0;
}

static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
		bool *has_zeromap)
{
	return 0;
}

static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
{
	return 0;
}
|
#endif /* CONFIG_SWAP */
|
|
#endif /* _MM_SWAP_H */
|