On Tue, 2020-10-20 at 09:18 +0300, Kirill A. Shutemov wrote:
> We cannot access protected pages directly. Use ioremap() to
> create a temporary mapping of the page. The mapping is destroyed
> on __kvm_unmap_gfn().
>
> The new interface gfn_to_pfn_memslot_protected() is used to detect if
> the page is protected.
>
> ioremap_cache_force() is a hack to bypass IORES_MAP_SYSTEM_RAM check in
> the x86 ioremap code. We need a better solution.
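
To make sure I am reading this right, the protected case added below
boils down to roughly the following (condensed from the hunks in this
patch; error handling and the pfn cache left out):

        /* __kvm_map_gfn(), protected page */
        pfn = gfn_to_pfn_memslot_protected(slot, gfn, &protected);
        if (protected) {
                if (atomic)
                        return -EAGAIN;
                /* WB mapping that skips the IORES_MAP_SYSTEM_RAM check */
                hva = ioremap_cache_force(pfn_to_hpa(pfn), PAGE_SIZE);
        }

        /* ... and __kvm_unmap_gfn() tears it down again */
        if (map->protected)
                iounmap(map->hva);
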
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> ---
>  arch/powerpc/kvm/book3s_64_mmu_hv.c    |  2 +-
>  arch/powerpc/kvm/book3s_64_mmu_radix.c |  2 +-
>  arch/x86/include/asm/io.h              |  2 +
>  arch/x86/include/asm/pgtable_types.h   |  1 +
>  arch/x86/kvm/mmu/mmu.c                 |  6 ++-
>  arch/x86/mm/ioremap.c                  | 16 ++++++--
>  include/linux/kvm_host.h               |  3 +-
>  include/linux/kvm_types.h              |  1 +
>  virt/kvm/kvm_main.c                    | 52 +++++++++++++++++++-------
>  9 files changed, 63 insertions(+), 22 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 38ea396a23d6..8e06cd3f759c 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
>  	} else {
>  		/* Call KVM generic code to do the slow-path check */
>  		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
> -					   writing, &write_ok);
> +					   writing, &write_ok, NULL);
>  		if (is_error_noslot_pfn(pfn))
>  			return -EFAULT;
>  		page = NULL;
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 22a677b18695..6fd4e3f9b66a 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
>
>  	/* Call KVM generic code to do the slow-path check */
>  	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
> -				   writing, upgrade_p);
> +				   writing, upgrade_p, NULL);
>  	if (is_error_noslot_pfn(pfn))
>  		return -EFAULT;
>  	page = NULL;
> diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
> index c58d52fd7bf2..a3e1bfad1026 100644
> --- a/arch/x86/include/asm/io.h
> +++ b/arch/x86/include/asm/io.h
> @@ -184,6 +184,8 @@ extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
>  #define ioremap_uc ioremap_uc
>  extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
>  #define ioremap_cache ioremap_cache
> +extern void __iomem *ioremap_cache_force(resource_size_t offset, unsigned long size);
> +#define ioremap_cache_force ioremap_cache_force
>  extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
>  #define ioremap_prot ioremap_prot
>  extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 816b31c68550..4c16a9583786 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -147,6 +147,7 @@ enum page_cache_mode {
>  	_PAGE_CACHE_MODE_UC       = 3,
>  	_PAGE_CACHE_MODE_WT       = 4,
>  	_PAGE_CACHE_MODE_WP       = 5,
> +	_PAGE_CACHE_MODE_WB_FORCE = 6,
>
>  	_PAGE_CACHE_MODE_NUM      = 8
>  };
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 71aa3da2a0b7..162cb285b87b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4058,7 +4058,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>  	}
>
>  	async = false;
> -	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
> +	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable,
> +				    NULL);
>  	if (!async)
>  		return false; /* *pfn has correct page already */
>
> @@ -4072,7 +4073,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>  		return true;
>  	}
>
> -	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
> +	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable,
> +				    NULL);
>  	return false;
>  }
>
> diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
> index 9e5ccc56f8e0..4409785e294c 100644
> --- a/arch/x86/mm/ioremap.c
> +++ b/arch/x86/mm/ioremap.c
> @@ -202,9 +202,12 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size,
>  	__ioremap_check_mem(phys_addr, size, &io_desc);
>
>  	/*
> -	 * Don't allow anybody to remap normal RAM that we're using..
> +	 * Don't allow anybody to remap normal RAM that we're using, unless
> +	 * _PAGE_CACHE_MODE_WB_FORCE is used.
>  	 */
> -	if (io_desc.flags & IORES_MAP_SYSTEM_RAM) {
> +	if (pcm == _PAGE_CACHE_MODE_WB_FORCE) {
> +		pcm = _PAGE_CACHE_MODE_WB;
> +	} else if (io_desc.flags & IORES_MAP_SYSTEM_RAM) {
>  		WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
>  			  &phys_addr, &last_addr);
>  		return NULL;
> @@ -419,6 +422,13 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
>  }
>  EXPORT_SYMBOL(ioremap_cache);
>
> +void __iomem *ioremap_cache_force(resource_size_t phys_addr, unsigned long size)
> +{
> +	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB_FORCE,
> +				__builtin_return_address(0), false);
> +}
> +EXPORT_SYMBOL(ioremap_cache_force);
> +
>  void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
>  			   unsigned long prot_val)
>  {
> @@ -467,7 +477,7 @@ void iounmap(volatile void __iomem *addr)
>  	p = find_vm_area((void __force *)addr);
>
>  	if (!p) {
> -		printk(KERN_ERR "iounmap: bad address %p\n", addr);
> +		printk(KERN_ERR "iounmap: bad address %px\n", addr);

Unintentional?  This switches the message from the hashed %p output to
printing the raw kernel address.

>  		dump_stack();
>  		return;
>  	}
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 6655e8da4555..0d5f3885747b 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -238,6 +238,7 @@ struct kvm_host_map {
>  	void *hva;
>  	kvm_pfn_t pfn;
>  	kvm_pfn_t gfn;
> +	bool protected;
>  };
>
>  /*
> @@ -725,7 +726,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
>  kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
>  kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
>  			       bool atomic, bool *async, bool write_fault,
> -			       bool *writable);
> +			       bool *writable, bool *protected);
>
>  void kvm_release_pfn_clean(kvm_pfn_t pfn);
>  void kvm_release_pfn_dirty(kvm_pfn_t pfn);
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index a7580f69dda0..0a8c6426b4f4 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -58,6 +58,7 @@ struct gfn_to_pfn_cache {
>  	gfn_t gfn;
>  	kvm_pfn_t pfn;
>  	bool dirty;
> +	bool protected;
>  };
>
>  #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 9b569b78874a..7c2c764c28c5 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1852,9 +1852,10 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
>   * 1 indicates success, -errno is returned if error is detected.
>   */
>  static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> -			   bool *writable, kvm_pfn_t *pfn)
> +			   bool *writable, bool *protected, kvm_pfn_t *pfn)
>  {
>  	unsigned int flags = FOLL_HWPOISON | FOLL_KVM;
> +	struct vm_area_struct *vma;
>  	struct page *page;
>  	int npages = 0;
>
> @@ -1868,9 +1869,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>  	if (async)
>  		flags |= FOLL_NOWAIT;
>
> -	npages = get_user_pages_unlocked(addr, 1, &page, flags);
> -	if (npages != 1)
> +	mmap_read_lock(current->mm);
> +	npages = get_user_pages(addr, 1, flags, &page, &vma);
> +	if (npages != 1) {
> +		mmap_read_unlock(current->mm);
>  		return npages;
> +	}
> +	if (protected)
> +		*protected = vma_is_kvm_protected(vma);
> +	mmap_read_unlock(current->mm);
>
>  	/* map read fault as writable if possible */
>  	if (unlikely(!write_fault) && writable) {
> @@ -1961,7 +1968,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
>   * whether the mapping is writable.
>   */
>  static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
> -			    bool write_fault, bool *writable)
> +			    bool write_fault, bool *writable, bool *protected)
>  {
>  	struct vm_area_struct *vma;
>  	kvm_pfn_t pfn = 0;
> @@ -1976,7 +1983,8 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
>  	if (atomic)
>  		return KVM_PFN_ERR_FAULT;
>
> -	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
> +	npages = hva_to_pfn_slow(addr, async, write_fault, writable, protected,
> +				 &pfn);
>  	if (npages == 1)
>  		return pfn;
>
> @@ -2010,7 +2018,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
>
>  kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
>  			       bool atomic, bool *async, bool write_fault,
> -			       bool *writable)
> +			       bool *writable, bool *protected)
>  {
>  	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
>
> @@ -2033,7 +2041,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
>  	}
>
>  	return hva_to_pfn(addr, atomic, async, write_fault,
> -			  writable);
> +			  writable, protected);
>  }
>  EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
>
> @@ -2041,19 +2049,26 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
>  			  bool *writable)
>  {
>  	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
> -				    write_fault, writable);
> +				    write_fault, writable, NULL);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
>
>  kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
>  {
> -	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
> +	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
>
> +static kvm_pfn_t gfn_to_pfn_memslot_protected(struct kvm_memory_slot *slot,
> +					       gfn_t gfn, bool *protected)
> +{
> +	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL,
> +				    protected);
> +}
> +
>  kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
>  {
> -	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
> +	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
>
> @@ -2134,7 +2149,7 @@ static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
>  {
>  	kvm_release_pfn(cache->pfn, cache->dirty, cache);
>
> -	cache->pfn = gfn_to_pfn_memslot(slot, gfn);
> +	cache->pfn = gfn_to_pfn_memslot_protected(slot, gfn, &cache->protected);
>  	cache->gfn = gfn;
>  	cache->dirty = false;
>  	cache->generation = gen;
> @@ -2149,6 +2164,7 @@ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
>  	void *hva = NULL;
>  	struct page *page = KVM_UNMAPPED_PAGE;
>  	struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
> +	bool protected;
>  	u64 gen = slots->generation;
>
>  	if (!map)
> @@ -2162,15 +2178,20 @@ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
>  			kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
>  		}
>  		pfn = cache->pfn;
> +		protected = cache->protected;
>  	} else {
>  		if (atomic)
>  			return -EAGAIN;
> -		pfn = gfn_to_pfn_memslot(slot, gfn);
> +		pfn = gfn_to_pfn_memslot_protected(slot, gfn, &protected);
>  	}
>  	if (is_error_noslot_pfn(pfn))
>  		return -EINVAL;
>
> -	if (pfn_valid(pfn)) {
> +	if (protected) {
> +		if (atomic)
> +			return -EAGAIN;
> +		hva = ioremap_cache_force(pfn_to_hpa(pfn), PAGE_SIZE);
> +	} else if (pfn_valid(pfn)) {
>  		page = pfn_to_page(pfn);
>  		if (atomic)
>  			hva = kmap_atomic(page);

I think the page could have been unmapped since the gup, via the
hypercall on another CPU.  That could be an avenue for the guest to
crash the host.
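
Roughly this window, if I read the series right (purely illustrative,
the exact hypercall does not matter here):

        vCPU A (KVM, __kvm_map_gfn)            another vCPU (guest)

        hva_to_pfn_slow()
          gup pins the page,
          *protected = vma_is_kvm_protected()
                                               hypercall unmaps/converts
                                               the page under us
        hva = ioremap_cache_force(pfn_to_hpa(pfn), PAGE_SIZE)
        KVM touches the page through hva  <--  host access to a page it
                                               may no longer touch
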
> @@ -2191,6 +2212,7 @@ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
>  	map->hva = hva;
>  	map->pfn = pfn;
>  	map->gfn = gfn;
> +	map->protected = protected;
>
>  	return 0;
>  }
> @@ -2221,7 +2243,9 @@ static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
>  	if (!map->hva)
>  		return;
>
> -	if (map->page != KVM_UNMAPPED_PAGE) {
> +	if (map->protected) {
> +		iounmap(map->hva);
> +	} else if (map->page != KVM_UNMAPPED_PAGE) {
>  		if (atomic)
>  			kunmap_atomic(map->hva);
>  		else