Merge tag 'kvm-x86-gmem-6.20' of https://github.com/kvm-x86/linux into HEAD

KVM guest_memfd changes for 6.20

 - Remove kvm_gmem_populate()'s preparation tracking and half-baked hugepage
   handling, and instead rely on SNP (the only user of the tracking) to do its
   own tracking via the RMP.

 - Retroactively document and enforce (for SNP) that KVM_SEV_SNP_LAUNCH_UPDATE
   and KVM_TDX_INIT_MEM_REGION require the source page to be 4KiB aligned, to
   avoid non-trivial complexity for a non-existent use case (and because
   in-place conversion simply can't support unaligned sources).  See the
   userspace sketch below.

 - When populating guest_memfd memory, GUP the source page in common code and
   pass the refcounted page to the vendor callback, instead of letting vendor
   code do the heavy lifting.  Doing so avoids a looming deadlock bug with
   in-place conversion due to an AB-BA conflict between mmap_lock and
   guest_memfd's filemap invalidate lock.  See the lock-ordering sketch below.
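
(A minimal userspace sketch of the alignment rule above, assuming a VM fd with
an SNP launch context already created.  The snp_load_page() helper and its
vm_fd/sev_fd parameters are hypothetical illustration; KVM_MEMORY_ENCRYPT_OP,
struct kvm_sev_cmd, and struct kvm_sev_snp_launch_update are existing uAPI.)

    /* Hypothetical userspace helper; the 4KiB source alignment is the new hard rule. */
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int snp_load_page(int vm_fd, int sev_fd, uint64_t gfn, const void *data)
    {
            struct kvm_sev_snp_launch_update update = { 0 };
            struct kvm_sev_cmd cmd = { 0 };
            void *src;
            int ret;

            /* Unaligned source buffers now fail with -EINVAL. */
            if (posix_memalign(&src, 4096, 4096))
                    return -1;
            memcpy(src, data, 4096);

            update.gfn_start = gfn;
            update.uaddr = (uintptr_t)src;
            update.len = 4096;
            update.type = KVM_SEV_SNP_PAGE_TYPE_NORMAL;

            cmd.id = KVM_SEV_SNP_LAUNCH_UPDATE;
            cmd.data = (uintptr_t)&update;
            cmd.sev_fd = sev_fd;

            ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
            free(src);
            return ret;
    }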
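(Sketch of the AB-BA hazard the last bullet refers to; the lock orderings are
illustrative and abbreviated, not verbatim kernel call chains.)

    /*
     * Old populate path (GUP in the          Fault on a mmap()ed guest_memfd
     * vendor callback, under gmem's          (the in-place conversion case):
     * filemap invalidate lock):
     *
     *   filemap_invalidate_lock(gmem)          mmap_lock
     *   get_user_pages_fast(src)               gmem filemap invalidate lock
     *     -> slow path takes mmap_lock
     *
     * A->B in one thread, B->A in the other.  With kvm_gmem_populate() now
     * doing the GUP before __kvm_gmem_populate() takes the invalidate lock,
     * mmap_lock is never acquired while the invalidate lock is held.
     */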
Paolo Bonzini 2026-02-09 19:08:17 +01:00
commit 9123c5f956
6 changed files with 134 additions and 145 deletions

Documentation/virt/kvm/x86/amd-memory-encryption.rst

@@ -523,7 +523,7 @@ Returns: 0 on success, < 0 on error, -EAGAIN if caller should retry
         struct kvm_sev_snp_launch_update {
                 __u64 gfn_start;        /* Guest page number to load/encrypt data into. */
-                __u64 uaddr;            /* Userspace address of data to be loaded/encrypted. */
+                __u64 uaddr;            /* 4k-aligned address of data to be loaded/encrypted. */
                 __u64 len;              /* 4k-aligned length in bytes to copy into guest memory.*/
                 __u8 type;              /* The type of the guest pages being initialized. */
                 __u8 pad0;

Documentation/virt/kvm/x86/intel-tdx.rst

@@ -156,7 +156,7 @@ KVM_TDX_INIT_MEM_REGION
 :Returns: 0 on success, <0 on error
 
 Initialize @nr_pages TDX guest private memory starting from @gpa with userspace
-provided data from @source_addr.
+provided data from @source_addr. @source_addr must be PAGE_SIZE-aligned.
 
 Note, before calling this sub command, memory attribute of the range
 [gpa, gpa + nr_pages] needs to be private. Userspace can use

arch/x86/kvm/svm/sev.c

@@ -2277,66 +2277,52 @@ struct sev_gmem_populate_args {
         int fw_error;
 };
 
-static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
-                                  void __user *src, int order, void *opaque)
+static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+                                  struct page *src_page, void *opaque)
 {
         struct sev_gmem_populate_args *sev_populate_args = opaque;
+        struct sev_data_snp_launch_update fw_args = {0};
         struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
-        int n_private = 0, ret, i;
-        int npages = (1 << order);
-        gfn_t gfn;
+        bool assigned = false;
+        int level;
+        int ret;
 
-        if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
+        if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page))
                 return -EINVAL;
 
-        for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
-                struct sev_data_snp_launch_update fw_args = {0};
-                bool assigned = false;
-                int level;
-
-                ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
-                if (ret || assigned) {
-                        pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
-                                 __func__, gfn, ret, assigned);
-                        ret = ret ? -EINVAL : -EEXIST;
-                        goto err;
-                }
-
-                if (src) {
-                        void *vaddr = kmap_local_pfn(pfn + i);
-
-                        if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
-                                ret = -EFAULT;
-                                goto err;
-                        }
-                        kunmap_local(vaddr);
-                }
-
-                ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
-                                       sev_get_asid(kvm), true);
-                if (ret)
-                        goto err;
-
-                n_private++;
-
-                fw_args.gctx_paddr = __psp_pa(sev->snp_context);
-                fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
-                fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
-                fw_args.page_type = sev_populate_args->type;
-
-                ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
-                                      &fw_args, &sev_populate_args->fw_error);
-                if (ret)
-                        goto fw_err;
+        ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level);
+        if (ret || assigned) {
+                pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
+                         __func__, gfn, ret, assigned);
+                ret = ret ? -EINVAL : -EEXIST;
+                goto out;
         }
 
-        return 0;
+        if (src_page) {
+                void *src_vaddr = kmap_local_page(src_page);
+                void *dst_vaddr = kmap_local_pfn(pfn);
 
-fw_err:
+                memcpy(dst_vaddr, src_vaddr, PAGE_SIZE);
+                kunmap_local(src_vaddr);
+                kunmap_local(dst_vaddr);
+        }
+
+        ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K,
+                               sev_get_asid(kvm), true);
+        if (ret)
+                goto out;
+
+        fw_args.gctx_paddr = __psp_pa(sev->snp_context);
+        fw_args.address = __sme_set(pfn_to_hpa(pfn));
+        fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
+        fw_args.page_type = sev_populate_args->type;
+
+        ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+                              &fw_args, &sev_populate_args->fw_error);
         /*
          * If the firmware command failed handle the reclaim and cleanup of that
-         * PFN specially vs. prior pages which can be cleaned up below without
-         * needing to reclaim in advance.
+         * PFN before reporting an error.
          *
          * Additionally, when invalid CPUID function entries are detected,
         * firmware writes the expected values into the page and leaves it
@@ -2346,26 +2332,22 @@ fw_err:
          * information to provide information on which CPUID leaves/fields
          * failed CPUID validation.
          */
-        if (!snp_page_reclaim(kvm, pfn + i) &&
+        if (ret && !snp_page_reclaim(kvm, pfn) &&
             sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
             sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
-                void *vaddr = kmap_local_pfn(pfn + i);
+                void *src_vaddr = kmap_local_page(src_page);
+                void *dst_vaddr = kmap_local_pfn(pfn);
 
-                if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
-                        pr_debug("Failed to write CPUID page back to userspace\n");
+                memcpy(src_vaddr, dst_vaddr, PAGE_SIZE);
 
-                kunmap_local(vaddr);
+                kunmap_local(src_vaddr);
+                kunmap_local(dst_vaddr);
         }
 
-        /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
-        n_private--;
-
-err:
-        pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
-                 __func__, ret, sev_populate_args->fw_error, n_private);
-        for (i = 0; i < n_private; i++)
-                kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
+out:
+        if (ret)
+                pr_debug("%s: error updating GFN %llx, return code %d (fw_error %d)\n",
+                         __func__, gfn, ret, sev_populate_args->fw_error);
 
         return ret;
 }
@@ -2396,6 +2378,11 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
             params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
                 return -EINVAL;
 
+        src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
+        if (!PAGE_ALIGNED(src))
+                return -EINVAL;
+
         npages = params.len / PAGE_SIZE;
 
         /*
@@ -2427,7 +2414,6 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
         sev_populate_args.sev_fd = argp->sev_fd;
         sev_populate_args.type = params.type;
-        src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
 
         count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
                                   sev_gmem_post_populate, &sev_populate_args);
arch/x86/kvm/vmx/tdx.c

@@ -3118,34 +3118,24 @@ struct tdx_gmem_post_populate_arg {
 };
 
 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
-                                  void __user *src, int order, void *_arg)
+                                  struct page *src_page, void *_arg)
 {
         struct tdx_gmem_post_populate_arg *arg = _arg;
         struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
         u64 err, entry, level_state;
         gpa_t gpa = gfn_to_gpa(gfn);
-        struct page *src_page;
         int ret, i;
 
         if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
                 return -EIO;
 
-        /*
-         * Get the source page if it has been faulted in. Return failure if the
-         * source page has been swapped out or unmapped in primary memory.
-         */
-        ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
-        if (ret < 0)
-                return ret;
-        if (ret != 1)
-                return -ENOMEM;
+        if (!src_page)
+                return -EOPNOTSUPP;
 
         kvm_tdx->page_add_src = src_page;
         ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
         kvm_tdx->page_add_src = NULL;
 
-        put_page(src_page);
-
         if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
                 return ret;

include/linux/kvm_host.h

@@ -2566,7 +2566,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
  * @gfn: starting GFN to be populated
  * @src: userspace-provided buffer containing data to copy into GFN range
  *       (passed to @post_populate, and incremented on each iteration
- *        if not NULL)
+ *        if not NULL). Must be page-aligned.
  * @npages: number of pages to copy from userspace-buffer
  * @post_populate: callback to issue for each gmem page that backs the GPA
  *                 range
@@ -2581,7 +2581,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
  * Returns the number of pages that were populated.
  */
 typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
-                                    void __user *src, int order, void *opaque);
+                                    struct page *page, void *opaque);
 
 long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
                        kvm_gmem_populate_cb post_populate, void *opaque);

virt/kvm/guest_memfd.c

@@ -76,11 +76,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
         return 0;
 }
 
-static inline void kvm_gmem_mark_prepared(struct folio *folio)
-{
-        folio_mark_uptodate(folio);
-}
-
 /*
  * Process @folio, which contains @gfn, so that the guest can use it.
  * The folio must be locked and the gfn must be contained in @slot.
@@ -90,13 +85,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
 static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                   gfn_t gfn, struct folio *folio)
 {
-        unsigned long nr_pages, i;
         pgoff_t index;
-        int r;
-
-        nr_pages = folio_nr_pages(folio);
-        for (i = 0; i < nr_pages; i++)
-                clear_highpage(folio_page(folio, i));
 
         /*
          * Preparing huge folios should always be safe, since it should
@@ -114,11 +103,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
         WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
         index = kvm_gmem_get_index(slot, gfn);
         index = ALIGN_DOWN(index, folio_nr_pages(folio));
-        r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
-        if (!r)
-                kvm_gmem_mark_prepared(folio);
-
-        return r;
+        return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
 }
 
 /*
@@ -151,6 +137,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
                                             mapping_gfp_mask(inode->i_mapping), policy);
         mpol_cond_put(policy);
 
+        /*
+         * External interfaces like kvm_gmem_get_pfn() support dealing
+         * with hugepages to a degree, but internally, guest_memfd currently
+         * assumes that all folios are order-0 and handling would need
+         * to be updated for anything otherwise (e.g. page-clearing
+         * operations).
+         */
+        WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));
+
         return folio;
 }
@@ -420,7 +415,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
         if (!folio_test_uptodate(folio)) {
                 clear_highpage(folio_page(folio, 0));
-                kvm_gmem_mark_prepared(folio);
+                folio_mark_uptodate(folio);
         }
 
         vmf->page = folio_file_page(folio, vmf->pgoff);
@@ -757,7 +752,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
 static struct folio *__kvm_gmem_get_pfn(struct file *file,
                                         struct kvm_memory_slot *slot,
                                         pgoff_t index, kvm_pfn_t *pfn,
-                                        bool *is_prepared, int *max_order)
+                                        int *max_order)
 {
         struct file *slot_file = READ_ONCE(slot->gmem.file);
         struct gmem_file *f = file->private_data;
@@ -787,7 +782,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
         if (max_order)
                 *max_order = 0;
 
-        *is_prepared = folio_test_uptodate(folio);
         return folio;
 }
@@ -797,19 +791,22 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 {
         pgoff_t index = kvm_gmem_get_index(slot, gfn);
         struct folio *folio;
-        bool is_prepared = false;
         int r = 0;
 
         CLASS(gmem_get_file, file)(slot);
         if (!file)
                 return -EFAULT;
 
-        folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
+        folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
         if (IS_ERR(folio))
                 return PTR_ERR(folio);
 
-        if (!is_prepared)
-                r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
+        if (!folio_test_uptodate(folio)) {
+                clear_highpage(folio_page(folio, 0));
+                folio_mark_uptodate(folio);
+        }
+
+        r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
 
         folio_unlock(folio);
@@ -823,13 +820,49 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
 
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
+static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
+                                struct file *file, gfn_t gfn, struct page *src_page,
+                                kvm_gmem_populate_cb post_populate, void *opaque)
+{
+        pgoff_t index = kvm_gmem_get_index(slot, gfn);
+        struct folio *folio;
+        kvm_pfn_t pfn;
+        int ret;
+
+        filemap_invalidate_lock(file->f_mapping);
+
+        folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
+        if (IS_ERR(folio)) {
+                ret = PTR_ERR(folio);
+                goto out_unlock;
+        }
+
+        folio_unlock(folio);
+
+        if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
+                                             KVM_MEMORY_ATTRIBUTE_PRIVATE,
+                                             KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+                ret = -EINVAL;
+                goto out_put_folio;
+        }
+
+        ret = post_populate(kvm, gfn, pfn, src_page, opaque);
+        if (!ret)
+                folio_mark_uptodate(folio);
+
+out_put_folio:
+        folio_put(folio);
+out_unlock:
+        filemap_invalidate_unlock(file->f_mapping);
+        return ret;
+}
+
 long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
                        kvm_gmem_populate_cb post_populate, void *opaque)
 {
         struct kvm_memory_slot *slot;
-        void __user *p;
-
-        int ret = 0, max_order;
+        int ret = 0;
         long i;
 
         lockdep_assert_held(&kvm->slots_lock);
@@ -837,6 +870,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
         if (WARN_ON_ONCE(npages <= 0))
                 return -EINVAL;
 
+        if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
+                return -EINVAL;
+
         slot = gfn_to_memslot(kvm, start_gfn);
         if (!kvm_slot_has_gmem(slot))
                 return -EINVAL;
@@ -845,60 +881,37 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
         if (!file)
                 return -EFAULT;
 
-        filemap_invalidate_lock(file->f_mapping);
-
         npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
 
-        for (i = 0; i < npages; i += (1 << max_order)) {
-                struct folio *folio;
-                gfn_t gfn = start_gfn + i;
-                pgoff_t index = kvm_gmem_get_index(slot, gfn);
-                bool is_prepared = false;
-                kvm_pfn_t pfn;
+        for (i = 0; i < npages; i++) {
+                struct page *src_page = NULL;
 
                 if (signal_pending(current)) {
                         ret = -EINTR;
                         break;
                 }
 
-                folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
-                if (IS_ERR(folio)) {
-                        ret = PTR_ERR(folio);
-                        break;
+                if (src) {
+                        unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;
+
+                        ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
+                        if (ret < 0)
+                                break;
+                        if (ret != 1) {
+                                ret = -ENOMEM;
+                                break;
+                        }
                 }
 
-                if (is_prepared) {
-                        folio_unlock(folio);
-                        folio_put(folio);
-                        ret = -EEXIST;
-                        break;
-                }
+                ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
+                                          post_populate, opaque);
 
-                folio_unlock(folio);
-                WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
-                        (npages - i) < (1 << max_order));
+                if (src_page)
+                        put_page(src_page);
 
-                ret = -EINVAL;
-                while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
-                                                        KVM_MEMORY_ATTRIBUTE_PRIVATE,
-                                                        KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
-                        if (!max_order)
-                                goto put_folio_and_exit;
-                        max_order--;
-                }
-
-                p = src ? src + i * PAGE_SIZE : NULL;
-                ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
-                if (!ret)
-                        kvm_gmem_mark_prepared(folio);
-
-put_folio_and_exit:
-                folio_put(folio);
                 if (ret)
                         break;
         }
 
-        filemap_invalidate_unlock(file->f_mapping);
-
         return ret && !i ? ret : i;
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
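
(For reference, the shape of a vendor callback under the new
kvm_gmem_populate_cb signature; example_post_populate() is a hypothetical
illustration distilled from the SEV/TDX callbacks above, not kernel code.)

    static int example_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
                                     struct page *src_page, void *opaque)
    {
            /* The source page, if any, arrives already GUP'd by common code. */
            if (src_page) {
                    void *src = kmap_local_page(src_page);
                    void *dst = kmap_local_pfn(pfn);

                    memcpy(dst, src, PAGE_SIZE);
                    kunmap_local(dst);
                    kunmap_local(src);
            }

            /* Vendor-specific conversion/measurement of @pfn would go here. */
            return 0;
    }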