mm: change dup_mmap() recovery

When dup_mmap() fails during the vma duplication or setup, don't write
an XA_ZERO entry into the vma tree.  Instead, destroy the tree and free
the new resources, leaving an empty vma tree.
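
Condensed, the new error path looks like this (names and call order are
taken from the dup_mmap() hunk below; this is a sketch, not the
verbatim code):

	flush_cache_mm(mm);
	unmap_region(&vmi.mas, tmp, 0, end, end, NULL, NULL);
	charge = tear_down_vmas(mm, &vmi, tmp, end);
	vm_unacct_memory(charge);
	__mt_destroy(&mm->mm_mt);	/* leave behind an empty vma tree */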

Using XA_ZERO introduced races where the entry could be found in the
window between dup_mmap() dropping all locks and exit_mmap() taking
them.  The race can occur because the mm can be reached through other
trees via the successfully copied vmas, and by other means such as the
swapoff code.
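
An illustrative interleaving (simplified; the concurrent walker could
be the swapoff code or anything else that can reach the mm):

	dup_mmap() [failed]			concurrent walker
	-------------------			-----------------
	mas_store(..., XA_ZERO_ENTRY)
	<drop all locks>
						<reach mm via copied vmas>
						<find XA_ZERO entry while
						 iterating the vma tree>
	exit_mmap()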

The XA_ZERO entry marked where vma removal and page table freeing
should stop.  The newly added arguments to unmap_vmas() and
free_pgtables() now serve that purpose.
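
For example, iteration in free_pgtables() is now bounded by the caller
(a minimal sketch assuming the vma_end argument visible in the diff
below; surrounding details elided):

	next = mas_find(mas, vma_end - 1);	/* never walks past vma_end */
	while (next) {
		/* ... unlink and free the page tables for this vma ... */
		next = mas_find(mas, vma_end - 1);
	}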

Replacing the XA_ZERO entry with the new argument list also means the
xa_is_zero() checks are no longer necessary, so they are removed as
well.

Note that dup_mmap() now also cleans up in the case where ALL vmas were
successfully copied but some other aspect of the duplication failed to
be completely set up.
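
The failure point is encoded as an end address for the cleanup helpers
(condensed from the dup_mmap() hunk below):

	if (!mm->map_count)
		end = 0;		/* no vmas made it into the new tree */
	else if (mpnt)
		end = mpnt->vm_start;	/* partial failure: stop at mpnt */
	else
		end = ULONG_MAX;	/* all vmas copied; later setup failed */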

Link: https://lkml.kernel.org/r/20260121164946.2093480-8-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -411,8 +411,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		struct vm_area_struct *next;
 
 		next = mas_find(mas, vma_end - 1);
-		if (unlikely(xa_is_zero(next)))
-			next = NULL;
 
 		/*
 		 * Hide vma from rmap and truncate_pagecache before freeing
@@ -431,8 +429,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
 			next = mas_find(mas, vma_end - 1);
-			if (unlikely(xa_is_zero(next)))
-				next = NULL;
 			if (mm_wr_locked)
 				vma_start_write(vma);
 			unlink_anon_vmas(vma);
@@ -2186,7 +2182,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		unmap_single_vma(tlb, vma, start, end, &details);
 		hugetlb_zap_end(vma, &details);
 		vma = mas_find(mas, tree_end - 1);
-	} while (vma && likely(!xa_is_zero(vma)));
+	} while (vma);
 	mmu_notifier_invalidate_range_end(&range);
 }

diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1285,7 +1285,7 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);
 
 	vma = vma_next(&vmi);
-	if (!vma || unlikely(xa_is_zero(vma))) {
+	if (!vma) {
 		/* Can happen if dup_mmap() received an OOM */
 		mmap_read_unlock(mm);
 		mmap_write_lock(mm);
@@ -1851,20 +1851,40 @@ loop_out:
 		ksm_fork(mm, oldmm);
 		khugepaged_fork(mm, oldmm);
 	} else {
+		unsigned long end;
+
 		/*
-		 * The entire maple tree has already been duplicated. If the
-		 * mmap duplication fails, mark the failure point with
-		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
-		 * stop releasing VMAs that have not been duplicated after this
-		 * point.
+		 * The entire maple tree has already been duplicated, but
+		 * replacing the vmas failed at mpnt (which could be NULL if
+		 * all were allocated but the last vma was not fully set up).
+		 * Use the start address of the failure point to clean up the
+		 * partially initialized tree.
 		 */
-		if (mpnt) {
-			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
-			mas_store(&vmi.mas, XA_ZERO_ENTRY);
-		}
-		/* Avoid OOM iterating a broken tree */
-		mm_flags_set(MMF_OOM_SKIP, mm);
+		if (!mm->map_count) {
+			/* zero vmas were written to the new tree. */
+			end = 0;
+		} else if (mpnt) {
+			/* partial tree failure */
+			end = mpnt->vm_start;
+		} else {
+			/* All vmas were written to the new tree */
+			end = ULONG_MAX;
+		}
+
+		/* Hide mm from oom killer because the memory is being freed */
+		mm_flags_set(MMF_OOM_SKIP, mm);
+		if (end) {
+			vma_iter_set(&vmi, 0);
+			tmp = vma_next(&vmi);
+			flush_cache_mm(mm);
+			unmap_region(&vmi.mas, /* vma = */ tmp,
+				     /* vma_start = */ 0, /* vma_end = */ end,
+				     /* pg_end = */ end, /* prev = */ NULL,
+				     /* next = */ NULL);
+			charge = tear_down_vmas(mm, &vmi, tmp, end);
+			vm_unacct_memory(charge);
+		}
+		__mt_destroy(&mm->mm_mt);
 		/*
 		 * The mm_struct is going to exit, but the locks will be dropped
 		 * first.  Set the mm_struct as unstable is advisable as it is