Hi Reiji,

On Mon, Feb 14, 2022 at 11:46:38PM -0800, Reiji Watanabe wrote:
> Hi Alex,
>
> On Wed, Nov 17, 2021 at 7:37 AM Alexandru Elisei
> <alexandru.elisei@xxxxxxx> wrote:
> >
> > Pin memory in the process address space and map it in the stage 2 tables as
> > a result of userspace enabling the KVM_CAP_ARM_LOCK_USER_MEMORY_REGION
> > capability; and unpin it from the process address space when the capability
> > is used with the KVM_ARM_LOCK_USER_MEMORY_REGION_FLAGS_UNLOCK flag.
> >
> > The current implementation has two drawbacks which will be fixed in future
> > patches:
> >
> > - The dcache maintenance is done when the memslot is locked, which means
> >   that it is possible that memory changes made by userspace after the ioctl
> >   completes won't be visible to a guest running with the MMU off.
> >
> > - Tag scrubbing is done when the memslot is locked. If the MTE capability
> >   is enabled after the ioctl, the guest will be able to access unsanitised
> >   pages. This is prevented by forbidding userspace to enable the MTE
> >   capability if any memslots are locked.
> >
> > Only PAGE_SIZE mappings are supported at stage 2.
> >
> > Signed-off-by: Alexandru Elisei <alexandru.elisei@xxxxxxx>
> > ---
> >  Documentation/virt/kvm/api.rst    |   4 +-
> >  arch/arm64/include/asm/kvm_host.h |  11 ++
> >  arch/arm64/kvm/arm.c              |  22 +++-
> >  arch/arm64/kvm/mmu.c              | 204 ++++++++++++++++++++++++++++--
> >  4 files changed, 226 insertions(+), 15 deletions(-)
> >
> > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > index 16aa59eae3d9..0ac12a730013 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -6979,8 +6979,8 @@ write permissions are specified for a memslot which logs dirty pages.
> >
> >  Enabling this capability causes the memory pinned when locking the memslot
> >  specified in args[0] to be unpinned, or, optionally, all memslots to be
> > -unlocked. The IPA range is not unmapped from stage 2.
> > ->>>>>>> 56641eee289e (KVM: arm64: Add lock/unlock memslot user API)
> > +unlocked. The IPA range is not unmapped from stage 2. It is considered an error
> > +to attempt to unlock a memslot which is not locked.
> >
> >  8. Other capabilities.
> >  ======================
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index 733621e41900..7fd70ad90c16 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -99,7 +99,18 @@ struct kvm_s2_mmu {
> >          struct kvm_arch *arch;
> >  };
> >
> > +#define KVM_MEMSLOT_LOCK_READ   (1 << 0)
> > +#define KVM_MEMSLOT_LOCK_WRITE  (1 << 1)
> > +#define KVM_MEMSLOT_LOCK_MASK   0x3
> > +
> > +struct kvm_memory_slot_page {
> > +        struct list_head list;
> > +        struct page *page;
> > +};
> > +
> >  struct kvm_arch_memory_slot {
> > +        struct kvm_memory_slot_page pages;
> > +        u32 flags;
> >  };
> >
> >  struct kvm_arch {
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index d49905d18cee..b9b8b43835e3 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -106,6 +106,25 @@ static int kvm_lock_user_memory_region_ioctl(struct kvm *kvm,
> >          }
> >  }
> >
> > +static bool kvm_arm_has_locked_memslots(struct kvm *kvm)
> > +{
> > +        struct kvm_memslots *slots = kvm_memslots(kvm);
> > +        struct kvm_memory_slot *memslot;
> > +        bool has_locked_memslots = false;
> > +        int idx;
> > +
> > +        idx = srcu_read_lock(&kvm->srcu);
> > +        kvm_for_each_memslot(memslot, slots) {
> > +                if (memslot->arch.flags & KVM_MEMSLOT_LOCK_MASK) {
> > +                        has_locked_memslots = true;
> > +                        break;
> > +                }
> > +        }
> > +        srcu_read_unlock(&kvm->srcu, idx);
> > +
> > +        return has_locked_memslots;
> > +}
> > +
> >  int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> >                              struct kvm_enable_cap *cap)
> >  {
> > @@ -120,7 +139,8 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> >                  break;
> >          case KVM_CAP_ARM_MTE:
> >                  mutex_lock(&kvm->lock);
> > -                if (!system_supports_mte() || kvm->created_vcpus) {
> > +                if (!system_supports_mte() || kvm->created_vcpus ||
> > +                    (kvm_arm_lock_memslot_supported() && kvm_arm_has_locked_memslots(kvm))) {
> >                          r = -EINVAL;
> >                  } else {
> >                          r = 0;
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index f65bcbc9ae69..b0a8e61315e4 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -72,6 +72,11 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> >          return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
> >  }
> >
> > +static bool memslot_is_locked(struct kvm_memory_slot *memslot)
> > +{
> > +        return memslot->arch.flags & KVM_MEMSLOT_LOCK_MASK;
> > +}
> > +
> >  /**
> >   * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
> >   * @kvm: pointer to kvm structure.
> > @@ -769,6 +774,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
> >          if (map_size == PAGE_SIZE)
> >                  return true;
> >
> > +        /* Allow only PAGE_SIZE mappings for locked memslots */
> > +        if (memslot_is_locked(memslot))
> > +                return false;
> > +
> >          size = memslot->npages * PAGE_SIZE;
> >
> >          gpa_start = memslot->base_gfn << PAGE_SHIFT;
> > @@ -1296,6 +1305,159 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> >          return ret;
> >  }
> >
> > +static int try_rlimit_memlock(unsigned long npages)
> > +{
> > +        unsigned long lock_limit;
> > +        bool has_lock_cap;
> > +        int ret = 0;
> > +
> > +        has_lock_cap = capable(CAP_IPC_LOCK);
> > +        if (has_lock_cap)
> > +                goto out;
> > +
> > +        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +
> > +        mmap_read_lock(current->mm);
> > +        if (npages + current->mm->locked_vm > lock_limit)
> > +                ret = -ENOMEM;
> > +        mmap_read_unlock(current->mm);
> > +
> > +out:
> > +        return ret;
> > +}
> > +
> > +static void unpin_memslot_pages(struct kvm_memory_slot *memslot, bool writable)
> > +{
> > +        struct kvm_memory_slot_page *entry, *tmp;
> > +
> > +        list_for_each_entry_safe(entry, tmp, &memslot->arch.pages.list, list) {
> > +                if (writable)
> > +                        set_page_dirty_lock(entry->page);
> > +                unpin_user_page(entry->page);
> > +                kfree(entry);
> > +        }
> > +}
>
> Shouldn't this be done when the memslot is deleted ?
> (Or should the locked memslot be prevented from deleting ?)

I add code to prevent changes to a locked memslot in patch #9 ("KVM: arm64:
Deny changes to locked memslots").

>
> > +
> > +static int lock_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot,
> > +                        u64 flags)
> > +{
> > +        struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
> > +        struct kvm_memory_slot_page *page_entry;
> > +        bool writable = flags & KVM_ARM_LOCK_MEM_WRITE;
> > +        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> > +        struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
> > +        struct vm_area_struct *vma;
> > +        unsigned long npages = memslot->npages;
> > +        unsigned int pin_flags = FOLL_LONGTERM;
> > +        unsigned long i, hva, ipa, mmu_seq;
> > +        int ret;
> > +
> > +        ret = try_rlimit_memlock(npages);
>
> Even if the memory for the hva described by the memslot is already
> 'locked' by mlock or etc, is this checking needed ?

I believe it is: mlock uses a different mechanism to pin the pages; it sets
the VM_LOCKED VMA flag. And even if a VMA is mlocked, that doesn't mean that
the host's stage 1 is populated, because of the MLOCK_ONFAULT mlock() flag
(see the small userspace sketch further down).

If userspace wants to lock the same memory twice, then it's free to do so,
and suffer any possible consequences (like running into a size limit).

>
>
> > +        if (ret)
> > +                return -ENOMEM;
> > +
> > +        INIT_LIST_HEAD(&memslot->arch.pages.list);
> > +
> > +        if (writable) {
> > +                prot |= KVM_PGTABLE_PROT_W;
> > +                pin_flags |= FOLL_WRITE;
>
> The lock flag is just for stage 2 mapping, correct ?
> I wonder if it is appropriate for KVM to set 'pin_flags', which is
> passed to pin_user_pages(), based on the lock flag.

I don't see why not; KVM is the consumer of the GUP API.
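
Coming back to the mlock question above, here is a small userspace sketch of
why an mlocked VMA is not necessarily populated, let alone pinned. This is
only an illustration, not part of this series, and it assumes a libc that
exposes mlock2() and MLOCK_ONFAULT:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 16 * 4096;
        /* Anonymous mapping; no pages are populated yet. */
        char *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (mem == MAP_FAILED)
                return EXIT_FAILURE;

        /*
         * The VMA gets VM_LOCKED and the range counts against
         * RLIMIT_MEMLOCK, but with MLOCK_ONFAULT nothing is faulted in
         * until the memory is touched, and nothing is pinned at all.
         */
        if (mlock2(mem, len, MLOCK_ONFAULT))
                perror("mlock2");

        mem[0] = 1;     /* only now is the first page populated */

        munmap(mem, len);
        return EXIT_SUCCESS;
}

This is why KVM cannot rely on a prior mlock() and still has to pin the
pages and check the limit itself when the memslot is locked.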
>
> > +        }
> > +
> > +        hva = memslot->userspace_addr;
> > +        ipa = memslot->base_gfn << PAGE_SHIFT;
> > +
> > +        mmu_seq = kvm->mmu_notifier_seq;
> > +        smp_rmb();
> > +
> > +        for (i = 0; i < npages; i++) {
> > +                page_entry = kzalloc(sizeof(*page_entry), GFP_KERNEL);
> > +                if (!page_entry) {
> > +                        unpin_memslot_pages(memslot, writable);
> > +                        ret = -ENOMEM;
> > +                        goto out_err;
>
> Nit: It seems we can call unpin_memslot_pages() from 'out_err'
> instead of calling it from each of the error cases.

I'll see if I can remove the repetition.

>
> > +                }
> > +
> > +                mmap_read_lock(current->mm);
> > +                ret = pin_user_pages(hva, 1, pin_flags, &page_entry->page, &vma);
> > +                if (ret != 1) {
> > +                        mmap_read_unlock(current->mm);
> > +                        unpin_memslot_pages(memslot, writable);
> > +                        ret = -ENOMEM;
> > +                        goto out_err;
> > +                }
> > +                if (kvm_has_mte(kvm)) {
> > +                        if (vma->vm_flags & VM_SHARED) {
> > +                                ret = -EFAULT;
> > +                        } else {
> > +                                ret = sanitise_mte_tags(kvm,
> > +                                                page_to_pfn(page_entry->page),
> > +                                                PAGE_SIZE);
> > +                        }
> > +                        if (ret) {
> > +                                mmap_read_unlock(current->mm);
> > +                                goto out_err;
> > +                        }
> > +                }
> > +                mmap_read_unlock(current->mm);
> > +
> > +                ret = kvm_mmu_topup_memory_cache(&cache, kvm_mmu_cache_min_pages(kvm));
> > +                if (ret) {
> > +                        unpin_memslot_pages(memslot, writable);
> > +                        goto out_err;
> > +                }
> > +
> > +                spin_lock(&kvm->mmu_lock);
> > +                if (mmu_notifier_retry(kvm, mmu_seq)) {
> > +                        spin_unlock(&kvm->mmu_lock);
> > +                        unpin_memslot_pages(memslot, writable);
> > +                        ret = -EAGAIN;
> > +                        goto out_err;
> > +                }
> > +
> > +                ret = kvm_pgtable_stage2_map(pgt, ipa, PAGE_SIZE,
> > +                                             page_to_phys(page_entry->page),
> > +                                             prot, &cache);
> > +                spin_unlock(&kvm->mmu_lock);
> > +
> > +                if (ret) {
> > +                        kvm_pgtable_stage2_unmap(pgt, memslot->base_gfn << PAGE_SHIFT,
> > +                                                 i << PAGE_SHIFT);
> > +                        unpin_memslot_pages(memslot, writable);
> > +                        goto out_err;
> > +                }
> > +                list_add(&page_entry->list, &memslot->arch.pages.list);
> > +
> > +                hva += PAGE_SIZE;
> > +                ipa += PAGE_SIZE;
> > +        }
> > +
> > +
> > +        /*
> > +         * Even though we've checked the limit at the start, we can still exceed
> > +         * it if userspace locked other pages in the meantime or if the
> > +         * CAP_IPC_LOCK capability has been revoked.
> > +         */
> > +        ret = account_locked_vm(current->mm, npages, true);
> > +        if (ret) {
> > +                kvm_pgtable_stage2_unmap(pgt, memslot->base_gfn << PAGE_SHIFT,
> > +                                         npages << PAGE_SHIFT);
> > +                unpin_memslot_pages(memslot, writable);
> > +                goto out_err;
> > +        }
> > +
> > +        memslot->arch.flags = KVM_MEMSLOT_LOCK_READ;
> > +        if (writable)
> > +                memslot->arch.flags |= KVM_MEMSLOT_LOCK_WRITE;
> > +
> > +        kvm_mmu_free_memory_cache(&cache);
> > +
> > +        return 0;
> > +
> > +out_err:
> > +        kvm_mmu_free_memory_cache(&cache);
> > +        return ret;
> > +}
> > +
> >  int kvm_mmu_lock_memslot(struct kvm *kvm, u64 slot, u64 flags)
> >  {
> >          struct kvm_memory_slot *memslot;
> > @@ -1325,7 +1487,12 @@ int kvm_mmu_lock_memslot(struct kvm *kvm, u64 slot, u64 flags)
> >                  goto out_unlock_slots;
> >          }
> >
> > -        ret = -EINVAL;
> > +        if (memslot_is_locked(memslot)) {
> > +                ret = -EBUSY;
> > +                goto out_unlock_slots;
> > +        }
> > +
> > +        ret = lock_memslot(kvm, memslot, flags);
> >
> >  out_unlock_slots:
> >          mutex_unlock(&kvm->slots_lock);
> > @@ -1335,11 +1502,22 @@ int kvm_mmu_lock_memslot(struct kvm *kvm, u64 slot, u64 flags)
> >          return ret;
> >  }
> >
> > +static void unlock_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
> > +{
> > +        bool writable = memslot->arch.flags & KVM_MEMSLOT_LOCK_WRITE;
> > +        unsigned long npages = memslot->npages;
> > +
> > +        unpin_memslot_pages(memslot, writable);
> > +        account_locked_vm(current->mm, npages, false);
> > +
> > +        memslot->arch.flags &= ~KVM_MEMSLOT_LOCK_MASK;
> > +}
>
> What if the memslot was locked with read only but the memslot
> has read/write permission set ? Shouldn't the stage 2 mapping
> updated if KVM allows for the scenario ?

If the memslot is locked with the read flag, then the stage 2 entries are
mapped read-only, and subsequent stage 2 data aborts will relax the
permissions if needed. Userspace clearly wants the memory to be mapped at
stage 2 with read-only permissions, otherwise it would have specified both
read and write permissions when locking the memslot; I don't see why KVM
should do more than what was requested of it.

If you find this awkward, there is already a case in KVM where userspace
wants the stage 2 entries to be read-only so that the guest will cause write
faults: userspace does this when it wants to migrate the VM, by setting the
KVM_MEM_LOG_DIRTY_PAGES memslot flag.
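
To make the read-only case concrete, this is roughly how I picture the
userspace side. It is only a sketch: KVM_CAP_ARM_LOCK_USER_MEMORY_REGION is
the capability proposed by this series (not an upstream ABI), and the args
layout (slot in args[0], lock flags in args[1]) and the KVM_ARM_LOCK_MEM_READ
flag name are my assumptions, not something this patch spells out:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/*
 * Sketch only: lock memslot 'slot' for reading, so its stage 2 entries
 * start out read-only and guest writes still take stage 2 faults, which
 * then relax the permissions (as described above).
 */
static int lock_memslot_read_only(int vm_fd, __u64 slot)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_ARM_LOCK_USER_MEMORY_REGION, /* proposed by this series */
                .args = { slot, KVM_ARM_LOCK_MEM_READ },    /* assumed flag name */
        };

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

That is the same pattern dirty logging already relies on: start with
write-protected stage 2 entries and let the first write fault do the rest.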
Thanks,
Alex

>
> Thanks,
> Reiji
>
> >
> > +
> >  int kvm_mmu_unlock_memslot(struct kvm *kvm, u64 slot, u64 flags)
> >  {
> >          bool unlock_all = flags & KVM_ARM_UNLOCK_MEM_ALL;
> >          struct kvm_memory_slot *memslot;
> > -        int ret;
> > +        int ret = 0;
> >
> >          if (!unlock_all && slot >= KVM_MEM_SLOTS_NUM)
> >                  return -EINVAL;
> > @@ -1347,18 +1525,20 @@ int kvm_mmu_unlock_memslot(struct kvm *kvm, u64 slot, u64 flags)
> >          mutex_lock(&kvm->slots_lock);
> >
> >          if (unlock_all) {
> > -                ret = -EINVAL;
> > -                goto out_unlock_slots;
> > -        }
> > -
> > -        memslot = id_to_memslot(kvm_memslots(kvm), slot);
> > -        if (!memslot) {
> > -                ret = -EINVAL;
> > -                goto out_unlock_slots;
> > +                kvm_for_each_memslot(memslot, kvm_memslots(kvm)) {
> > +                        if (!memslot_is_locked(memslot))
> > +                                continue;
> > +                        unlock_memslot(kvm, memslot);
> > +                }
> > +        } else {
> > +                memslot = id_to_memslot(kvm_memslots(kvm), slot);
> > +                if (!memslot || !memslot_is_locked(memslot)) {
> > +                        ret = -EINVAL;
> > +                        goto out_unlock_slots;
> > +                }
> > +                unlock_memslot(kvm, memslot);
> >          }
> >
> > -        ret = -EINVAL;
> > -
> >  out_unlock_slots:
> >          mutex_unlock(&kvm->slots_lock);
> >          return ret;
> > --
> > 2.33.1
> >
> > _______________________________________________
> > kvmarm mailing list
> > kvmarm@xxxxxxxxxxxxxxxxxxxxx
> > https://lists.cs.columbia.edu/mailman/listinfo/kvmarm