This should save some time by avoiding double pinning and make the pinning
information clear.

> > But we may still have to support
> > the user-level memory pinning API as a legacy case. Instead of duplicating
> > the storage, can we change the region list to the list of memslot->pfns
> > instead (or use the struct **pages in the memslot instead of pfns)? This
> > way the APIs could still work, but we don't need the extra storage burden.
>
> Right, patches 6 and 9 will achieve this using memslot->pfns when the MMU
> is active. We still need to maintain this enc_region if the user tries to
> pin memory before the MMU is active (i.e., before a vCPU is online). In my
> testing I haven't seen enc_region being used, but I added it to make sure
> we do not break any userspace.
>
> > Anyway, I think it is essentially necessary to unify them;
> > otherwise, not only is the metadata size quite large, but it is also
> > confusing to have parallel data structures doing the same thing.
> >>
> >>  	return pages;
> >>
> >> @@ -441,14 +468,43 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
> >>  	return ERR_PTR(ret);
> >>  }
> >>
> >> -static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
> >> -			     unsigned long npages)
> >> +static void __sev_unpin_memory(struct kvm *kvm, struct page **pages,
> >> +			       unsigned long npages)
> >>  {
> >> 	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >>
> >> 	unpin_user_pages(pages, npages);
> >> 	kvfree(pages);
> >> -	sev->pages_locked -= npages;
> >> +	sev->pages_to_lock -= npages;
> >> +}
> >> +
> >> +static struct pinned_region *find_pinned_region(struct kvm *kvm,
> >> +						struct page **pages,
> >> +						unsigned long n)
> >> +{
> >> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> +	struct list_head *head = &sev->pinned_regions_list;
> >> +	struct pinned_region *i;
> >> +
> >> +	list_for_each_entry(i, head, list) {
> >> +		if (i->pages == pages && i->npages == n)
> >> +			return i;
> >> +	}
> >> +
> >> +	return NULL;
> >> +}
> >> +
> >> +static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
> >> +			     unsigned long npages)
> >> +{
> >> +	struct pinned_region *region;
> >> +
> >> +	region = find_pinned_region(kvm, pages, npages);
> >> +	__sev_unpin_memory(kvm, pages, npages);
> >> +	if (region) {
> >> +		list_del(&region->list);
> >> +		kfree(region);
> >> +	}
> >>  }
> >>
> >>  static void sev_clflush_pages(struct page *pages[], unsigned long npages)
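
For anyone reading this excerpt without the full series: the pinned_region
definition is not visible in the quoted hunks. Judging from
find_pinned_region() and sev_unpin_memory() above, it presumably looks
roughly like this (a sketch, not the actual patch; there may be further
fields such as uaddr/size):

struct pinned_region {
	struct list_head list;	/* node in sev->pinned_regions_list */
	struct page **pages;	/* array returned by sev_pin_memory() */
	unsigned long npages;	/* number of pinned pages */
};

Note that find_pinned_region() matches on the (pages, npages) pair rather
than on a user address, so sev_unpin_memory() can drop the bookkeeping entry
regardless of which path originally pinned the pages.
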
> >> @@ -551,8 +607,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >> 		set_page_dirty_lock(inpages[i]);
> >> 		mark_page_accessed(inpages[i]);
> >> 	}
> >> -	/* unlock the user pages */
> >> -	sev_unpin_memory(kvm, inpages, npages);
> >> +	/* unlock the user pages on error */
> >> +	if (ret)
> >> +		sev_unpin_memory(kvm, inpages, npages);
> >> 	return ret;
> >>  }
> >>
> >> @@ -1059,7 +1116,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >> 		set_page_dirty_lock(pages[i]);
> >> 		mark_page_accessed(pages[i]);
> >> 	}
> >> -	sev_unpin_memory(kvm, pages, n);
> >> +	if (ret)
> >> +		sev_unpin_memory(kvm, pages, n);
> >> 	return ret;
> >>  }
> >>
> >> @@ -1338,7 +1396,8 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >>  e_free_hdr:
> >> 	kfree(hdr);
> >>  e_unpin:
> >> -	sev_unpin_memory(kvm, guest_page, n);
> >> +	if (ret)
> >> +		sev_unpin_memory(kvm, guest_page, n);
> >>
> >> 	return ret;
> >>  }
> >>
> >> @@ -1508,7 +1567,8 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >> 	ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
> >> 			    &argp->error);
> >>
> >> -	sev_unpin_memory(kvm, guest_page, n);
> >> +	if (ret)
> >> +		sev_unpin_memory(kvm, guest_page, n);
> >>
> >>  e_free_trans:
> >> 	kfree(trans);
> >>
> >> @@ -1629,16 +1689,17 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
> >> 	dst->active = true;
> >> 	dst->asid = src->asid;
> >> 	dst->handle = src->handle;
> >> -	dst->pages_locked = src->pages_locked;
> >> +	dst->pages_to_lock = src->pages_to_lock;
> >> 	dst->enc_context_owner = src->enc_context_owner;
> >>
> >> 	src->asid = 0;
> >> 	src->active = false;
> >> 	src->handle = 0;
> >> -	src->pages_locked = 0;
> >> +	src->pages_to_lock = 0;
> >> 	src->enc_context_owner = NULL;
> >>
> >> -	list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
> >> +	list_cut_before(&dst->pinned_regions_list, &src->pinned_regions_list,
> >> +			&src->pinned_regions_list);
> >>  }
> >>
> >>  static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
> >> @@ -1862,8 +1923,7 @@ int svm_register_enc_region(struct kvm *kvm,
> >> 			    struct kvm_enc_region *range)
> >>  {
> >> 	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> -	struct enc_region *region;
> >> -	int ret = 0;
> >> +	unsigned long npages;
> >>
> >> 	if (!sev_guest(kvm))
> >> 		return -ENOTTY;
> >>
> >> @@ -1875,101 +1935,35 @@ int svm_register_enc_region(struct kvm *kvm,
> >> 	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
> >> 		return -EINVAL;
> >>
> >> -	region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
> >> -	if (!region)
> >> +	npages = get_npages(range->addr, range->size);
> >> +	if (rlimit_memlock_exceeds(sev->pages_to_lock, npages)) {
> >> +		pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n",
> >> +		       sev->pages_to_lock + npages,
> >> +		       (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT));
> >> 		return -ENOMEM;
> >> -
> >> -	mutex_lock(&kvm->lock);
> >> -	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
> >> -	if (IS_ERR(region->pages)) {
> >> -		ret = PTR_ERR(region->pages);
> >> -		mutex_unlock(&kvm->lock);
> >> -		goto e_free;
> >> 	}
> >> +	sev->pages_to_lock += npages;
> >>
> >> -	region->uaddr = range->addr;
> >> -	region->size = range->size;
> >> -
> >> -	list_add_tail(&region->list, &sev->regions_list);
> >> -	mutex_unlock(&kvm->lock);
> >> -
> >> -	/*
> >> -	 * The guest may change the memory encryption attribute from C=0 -> C=1
> >> -	 * or vice versa for this memory range. Lets make sure caches are
> >> -	 * flushed to ensure that guest data gets written into memory with
> >> -	 * correct C-bit.
> >> -	 */
> >> -	sev_clflush_pages(region->pages, region->npages);
> >> -
> >> -	return ret;
> >> -
> >> -e_free:
> >> -	kfree(region);
> >> -	return ret;
> >> -}
> >> -
> >> -static struct enc_region *
> >> -find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
> >> -{
> >> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> -	struct list_head *head = &sev->regions_list;
> >> -	struct enc_region *i;
> >> -
> >> -	list_for_each_entry(i, head, list) {
> >> -		if (i->uaddr == range->addr &&
> >> -		    i->size == range->size)
> >> -			return i;
> >> -	}
> >> -
> >> -	return NULL;
> >> -}
> >> -
> >> -static void __unregister_enc_region_locked(struct kvm *kvm,
> >> -					   struct enc_region *region)
> >> -{
> >> -	sev_unpin_memory(kvm, region->pages, region->npages);
> >> -	list_del(&region->list);
> >> -	kfree(region);
> >> +	return 0;
> >>  }
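
get_npages() and rlimit_memlock_exceeds() are introduced earlier in the
series and are not quoted here. For context, a minimal sketch of what they
presumably compute (hypothetical bodies, following the first/last page
arithmetic that sev_pin_memory() has always used):

static unsigned long get_npages(unsigned long uaddr, unsigned long ulen)
{
	unsigned long first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
	unsigned long last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;

	return last - first + 1;
}

static bool rlimit_memlock_exceeds(unsigned long locked, unsigned long npages)
{
	unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	return locked + npages > lock_limit;
}

So after this change, KVM_MEMORY_ENCRYPT_REG_REGION only charges the pages
against the MEMLOCK limit; the pinning itself is deferred until the pages
are actually faulted in.
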
> >>
> >>  int svm_unregister_enc_region(struct kvm *kvm,
> >> 			      struct kvm_enc_region *range)
> >>  {
> >> -	struct enc_region *region;
> >> -	int ret;
> >> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> +	unsigned long npages;
> >>
> >> 	/* If kvm is mirroring encryption context it isn't responsible for it */
> >> 	if (is_mirroring_enc_context(kvm))
> >> 		return -EINVAL;
> >>
> >> -	mutex_lock(&kvm->lock);
> >> -
> >> -	if (!sev_guest(kvm)) {
> >> -		ret = -ENOTTY;
> >> -		goto failed;
> >> -	}
> >> -
> >> -	region = find_enc_region(kvm, range);
> >> -	if (!region) {
> >> -		ret = -EINVAL;
> >> -		goto failed;
> >> -	}
> >> -
> >> -	/*
> >> -	 * Ensure that all guest tagged cache entries are flushed before
> >> -	 * releasing the pages back to the system for use. CLFLUSH will
> >> -	 * not do this, so issue a WBINVD.
> >> -	 */
> >> -	wbinvd_on_all_cpus();
> >> +	if (!sev_guest(kvm))
> >> +		return -ENOTTY;
> >>
> >> -	__unregister_enc_region_locked(kvm, region);
> >> +	npages = get_npages(range->addr, range->size);
> >> +	sev->pages_to_lock -= npages;
> >>
> >> -	mutex_unlock(&kvm->lock);
> >> 	return 0;
> >> -
> >> -failed:
> >> -	mutex_unlock(&kvm->lock);
> >> -	return ret;
> >>  }
> >>
> >>  int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> >> @@ -2018,7 +2012,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> >> 	mirror_sev->fd = source_sev->fd;
> >> 	mirror_sev->es_active = source_sev->es_active;
> >> 	mirror_sev->handle = source_sev->handle;
> >> -	INIT_LIST_HEAD(&mirror_sev->regions_list);
> >> +	INIT_LIST_HEAD(&mirror_sev->pinned_regions_list);
> >> 	ret = 0;
> >>
> >> 	/*
> >> @@ -2038,8 +2032,9 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> >>  void sev_vm_destroy(struct kvm *kvm)
> >>  {
> >> 	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> -	struct list_head *head = &sev->regions_list;
> >> +	struct list_head *head = &sev->pinned_regions_list;
> >> 	struct list_head *pos, *q;
> >> +	struct pinned_region *region;
> >>
> >> 	WARN_ON(sev->num_mirrored_vms);
> >>
> >> @@ -2072,8 +2067,14 @@ void sev_vm_destroy(struct kvm *kvm)
> >> 	 */
> >> 	if (!list_empty(head)) {
> >> 		list_for_each_safe(pos, q, head) {
> >> -			__unregister_enc_region_locked(kvm,
> >> -				list_entry(pos, struct enc_region, list));
> >> +			/*
> >> +			 * Unpin the memory that was pinned outside of MMU
> >> +			 * demand pinning
> >> +			 */
> >> +			region = list_entry(pos, struct pinned_region, list);
> >> +			__sev_unpin_memory(kvm, region->pages, region->npages);
> >> +			list_del(&region->list);
> >> +			kfree(region);
> >> 			cond_resched();
> >> 		}
> >> 	}
>
> > So the guest memory has already been unpinned in sev_free_memslot().
> > Why do it again here?
>
> Guest memory that was demand pinned is freed via sev_free_memslot().
> Regions that were statically pinned, e.g. for SEV_LAUNCH_UPDATE_DATA, need
> to be freed here. This too will be demand pinned in patch 9.
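
To make that split concrete, the legacy flow that still lands on
pinned_regions_list is a VMM encrypting guest memory before any vCPU has
run. A rough userspace illustration (assumes vm_fd/sev_fd and the guest
buffer come from the usual SEV launch sequence; not taken from the series):

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int encrypt_before_vcpus_run(int vm_fd, int sev_fd,
				    void *mem, __u32 size)
{
	struct kvm_enc_region region = {
		.addr = (__u64)(unsigned long)mem,
		.size = size,
	};
	struct kvm_sev_launch_update_data update = {
		.uaddr = (__u64)(unsigned long)mem,
		.len = size,
	};
	struct kvm_sev_cmd cmd = {
		.id = KVM_SEV_LAUNCH_UPDATE_DATA,
		.data = (__u64)(unsigned long)&update,
		.sev_fd = (__u32)sev_fd,
	};

	/* With this series: rlimit accounting only, nothing pinned yet. */
	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_REG_REGION, &region))
		return -1;

	/* Pins the pages and tracks them on sev->pinned_regions_list. */
	return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
}
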
>
> >> @@ -2951,13 +2952,96 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
> >> 	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
> >>  }
> >>
> >> +bool sev_pin_pfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
> >> +		 kvm_pfn_t pfn, hva_t hva, bool write, enum pg_level level)
> >> +{
> >> +	unsigned int npages = KVM_PAGES_PER_HPAGE(level);
> >> +	unsigned int flags = FOLL_LONGTERM, npinned;
> >> +	struct kvm_arch_memory_slot *aslot;
> >> +	struct kvm *kvm = vcpu->kvm;
> >> +	gfn_t gfn_start, rel_gfn;
> >> +	struct page *page[1];
> >> +	kvm_pfn_t old_pfn;
> >> +
> >> +	if (!sev_guest(kvm))
> >> +		return true;
> >> +
> >> +	if (WARN_ON_ONCE(!memslot->arch.pfns))
> >> +		return false;
> >> +
> >> +	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
> >> +		return false;
> >> +
> >> +	hva = ALIGN_DOWN(hva, npages << PAGE_SHIFT);
> >> +	flags |= write ? FOLL_WRITE : 0;
> >> +
> >> +	mutex_lock(&kvm->slots_arch_lock);
> >> +	gfn_start = hva_to_gfn_memslot(hva, memslot);
> >> +	rel_gfn = gfn_start - memslot->base_gfn;
> >> +	aslot = &memslot->arch;
> >> +	if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
> >> +		old_pfn = aslot->pfns[rel_gfn];
> >> +		if (old_pfn == pfn)
> >> +			goto out;
> >> +
> >> +		/* Flush the cache before releasing the page to the system */
> >> +		sev_flush_guest_memory(to_svm(vcpu), __va(old_pfn),
> >> +				       npages * PAGE_SIZE);
> >> +		unpin_user_page(pfn_to_page(old_pfn));
> >> +	}
> >> +	/* Pin the page, KVM doesn't yet support page migration. */
> >> +	npinned = pin_user_pages_fast(hva, 1, flags, page);
> >> +	KVM_BUG(npinned != 1, kvm, "SEV: Pinning failed\n");
> >> +	KVM_BUG(pfn != page_to_pfn(page[0]), kvm, "SEV: pfn mismatch\n");
> >> +
> >> +	if (!this_cpu_has(X86_FEATURE_SME_COHERENT))
> >> +		clflush_cache_range(__va(pfn << PAGE_SHIFT), npages * PAGE_SIZE);
> >> +
> >> +	WARN_ON(rel_gfn >= memslot->npages);
> >> +	aslot->pfns[rel_gfn] = pfn;
> >> +	set_bit(rel_gfn, aslot->pinned_bitmap);
> >> +
> >> +out:
> >> +	mutex_unlock(&kvm->slots_arch_lock);
> >> +	return true;
> >> +}
> >> +
> >>  void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
> >>  {
> >> 	struct kvm_arch_memory_slot *aslot = &slot->arch;
> >> +	kvm_pfn_t *pfns;
> >> +	gfn_t gfn;
> >> +	int i;
> >>
> >> 	if (!sev_guest(kvm))
> >> 		return;
> >>
> >> +	if (!aslot->pinned_bitmap || !slot->arch.pfns)
> >> +		goto out;
> >> +
> >> +	pfns = aslot->pfns;
> >> +
> >> +	/*
> >> +	 * Ensure that all guest tagged cache entries are flushed before
> >> +	 * releasing the pages back to the system for use. CLFLUSH will
> >> +	 * not do this, so issue a WBINVD.
> >> +	 */
> >> +	wbinvd_on_all_cpus();
>
> > This is a heavy-weight operation and is essentially only needed at most
> > once per VM shutdown. So, better to use a smaller hammer in the following
> > code. Or, alternatively, maybe pass a boolean from the caller to avoid
> > flushing if true.
>
> Or maybe I can do this in the sev_vm_destroy() patch?
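
That could work too. For the boolean variant suggested above, the shape
could be something like this (a sketch with a hypothetical signature change,
not part of the posted series):

/*
 * Let the caller indicate that guest-tagged cache entries were already
 * flushed, so VM teardown pays for a single WBINVD in sev_vm_destroy()
 * instead of one per memslot.
 */
void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
		      bool already_flushed)
{
	struct kvm_arch_memory_slot *aslot = &slot->arch;

	if (!sev_guest(kvm))
		return;

	if (aslot->pinned_bitmap && aslot->pfns && !already_flushed)
		wbinvd_on_all_cpus();

	/* ... unpin and free exactly as in the hunk below ... */
}
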
> >> + */ > >> + for (i = 0, gfn = slot->base_gfn; i < slot->npages; i++, gfn++) { > >> + if (test_and_clear_bit(i, aslot->pinned_bitmap)) { > >> + if (WARN_ON(!pfns[i])) > >> + continue; > >> + > >> + unpin_user_page(pfn_to_page(pfns[i])); > >> + } > >> + } > >> + > >> +out: > >> if (aslot->pinned_bitmap) { > >> kvfree(aslot->pinned_bitmap); > >> aslot->pinned_bitmap = NULL; > >> @@ -2992,6 +3076,8 @@ int sev_alloc_memslot_metadata(struct kvm *kvm, > >> return -ENOMEM; > >> > >> aslot->pinned_bitmap = kvzalloc(pinned_bytes, GFP_KERNEL_ACCOUNT); > >> + new->flags |= KVM_MEMSLOT_ENCRYPTED; > >> + > >> if (!aslot->pinned_bitmap) { > >> kvfree(aslot->pfns); > >> aslot->pfns = NULL; > >> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > >> index ec06421cb532..463a90ed6f83 100644 > >> --- a/arch/x86/kvm/svm/svm.c > >> +++ b/arch/x86/kvm/svm/svm.c > >> @@ -4661,6 +4661,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { > >> > >> .alloc_memslot_metadata = sev_alloc_memslot_metadata, > >> .free_memslot = sev_free_memslot, > >> + .pin_pfn = sev_pin_pfn, > >> }; > >> > >> /* > >> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h > >> index f00364020d7e..2f38e793ead0 100644 > >> --- a/arch/x86/kvm/svm/svm.h > >> +++ b/arch/x86/kvm/svm/svm.h > >> @@ -75,8 +75,8 @@ struct kvm_sev_info { > >> unsigned int asid; /* ASID used for this guest */ > >> unsigned int handle; /* SEV firmware handle */ > >> int fd; /* SEV device fd */ > >> - unsigned long pages_locked; /* Number of pages locked */ > >> - struct list_head regions_list; /* List of registered regions */ > >> + unsigned long pages_to_lock; /* Number of page that can be locked */ > >> + struct list_head pinned_regions_list; /* List of pinned regions */ > >> u64 ap_jump_table; /* SEV-ES AP Jump Table address */ > >> struct kvm *enc_context_owner; /* Owner of copied encryption context */ > >> unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */ > >> @@ -621,5 +621,7 @@ int sev_alloc_memslot_metadata(struct kvm *kvm, > >> struct kvm_memory_slot *new); > >> void sev_free_memslot(struct kvm *kvm, > >> struct kvm_memory_slot *slot); > >> +bool sev_pin_pfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, > >> + kvm_pfn_t pfn, hva_t hva, bool write, enum pg_level level); > >> > >> #endif > >> -- > >> 2.32.0 > >> > > Thanks for the review. > > Regards > Nikunj