On Fri, Oct 04, 2013 at 09:50:35AM +0100, Marc Zyngier wrote:
> Hi Christoffer,
> 
> On 03/10/13 21:36, Christoffer Dall wrote:
> > Support huge pages in KVM/ARM and KVM/ARM64. The pud_huge checking on
> > the unmap path may feel a bit silly as the pud_huge check is always
> > defined to false, but the compiler should be smart about this.
> >
> > Note: This deals only with VMAs marked as huge which are allocated by
> > users through hugetlbfs only. Transparent huge pages can only be
> > detected by looking at the underlying pages (or the page tables
> > themselves) and this patch so far simply maps these on a page-by-page
> > level in the Stage-2 page tables.
> >
> > Signed-off-by: Christoffer Dall <christoffer.dall@xxxxxxxxxx>
> >
> > ---
> > Changelog[v2]:
> >  - Invalidate TLB on clearing stage-2 huge page entries
> >  - Restructure unmap_range to share more code
> >  - Share stage_2 pmd allocation
> >  - Factor out THP support to separate patch
> >  - Fix incorrect gfn_to_pfn_prot and mmu_seq ordering
> > ---
> >  arch/arm/include/asm/kvm_mmu.h   |  17 +++-
> >  arch/arm/kvm/mmu.c               | 172 +++++++++++++++++++++++++++++---------
> >  arch/arm64/include/asm/kvm_mmu.h |  12 ++-
> >  3 files changed, 157 insertions(+), 44 deletions(-)
> >
> > diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> > index 9b28c41..d629fb3 100644
> > --- a/arch/arm/include/asm/kvm_mmu.h
> > +++ b/arch/arm/include/asm/kvm_mmu.h
> > @@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void);
> >  int kvm_mmu_init(void);
> >  void kvm_clear_hyp_idmap(void);
> >
> > +static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
> > +{
> > +        *pmd = new_pmd;
> > +        flush_pmd_entry(pmd);
> > +}
> > +
> >  static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
> >  {
> >          *pte = new_pte;
> > @@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
> >          pte_val(*pte) |= L_PTE_S2_RDWR;
> >  }
> >
> > +static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
> > +{
> > +        pmd_val(*pmd) |= L_PTE_S2_RDWR;
> 
> It feels a bit weird to use a PTE specific flag to mess with a PMD. Can
> you define a new flag for this?
> 

yup

> > +}
> > +
> >  struct kvm;
> >
> > -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
> > +static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
> > +                                              unsigned long size)
> >  {
> >          /*
> >           * If we are going to insert an instruction page and the icache is
> > @@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
> >           * need any kind of flushing (DDI 0406C.b - Page B3-1392).
> >           */
> >          if (icache_is_pipt()) {
> > -                unsigned long hva = gfn_to_hva(kvm, gfn);
> > -                __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
> > +                __cpuc_coherent_user_range(hva, hva + size);
> >          } else if (!icache_is_vivt_asid_tagged()) {
> >                  /* any kind of VIPT cache */
> >                  __flush_icache_all();
> > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> > index b0de86b..cab031b 100644
> > --- a/arch/arm/kvm/mmu.c
> > +++ b/arch/arm/kvm/mmu.c
> > @@ -19,6 +19,7 @@
> >  #include <linux/mman.h>
> >  #include <linux/kvm_host.h>
> >  #include <linux/io.h>
> > +#include <linux/hugetlb.h>
> >  #include <trace/events/kvm.h>
> >  #include <asm/pgalloc.h>
> >  #include <asm/cacheflush.h>
> > @@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start;
> >  static unsigned long hyp_idmap_end;
> >  static phys_addr_t hyp_idmap_vector;
> >
> > +#define kvm_pmd_huge(_x)        (pmd_huge(_x))
> > +
> >  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
> >  {
> >          /*
> > @@ -93,19 +96,29 @@ static bool page_empty(void *ptr)
> >
> >  static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
> >  {
> > -        pmd_t *pmd_table = pmd_offset(pud, 0);
> > -        pud_clear(pud);
> > -        kvm_tlb_flush_vmid_ipa(kvm, addr);
> > -        pmd_free(NULL, pmd_table);
> > +        if (pud_huge(*pud)) {
> > +                pud_clear(pud);
> > +                kvm_tlb_flush_vmid_ipa(kvm, addr);
> > +        } else {
> > +                pmd_t *pmd_table = pmd_offset(pud, 0);
> > +                pud_clear(pud);
> > +                kvm_tlb_flush_vmid_ipa(kvm, addr);
> > +                pmd_free(NULL, pmd_table);
> > +        }
> >          put_page(virt_to_page(pud));
> >  }
> >
> >  static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
> >  {
> > -        pte_t *pte_table = pte_offset_kernel(pmd, 0);
> > -        pmd_clear(pmd);
> > -        kvm_tlb_flush_vmid_ipa(kvm, addr);
> > -        pte_free_kernel(NULL, pte_table);
> > +        if (kvm_pmd_huge(*pmd)) {
> > +                pmd_clear(pmd);
> > +                kvm_tlb_flush_vmid_ipa(kvm, addr);
> > +        } else {
> > +                pte_t *pte_table = pte_offset_kernel(pmd, 0);
> > +                pmd_clear(pmd);
> > +                kvm_tlb_flush_vmid_ipa(kvm, addr);
> > +                pte_free_kernel(NULL, pte_table);
> > +        }
> >          put_page(virt_to_page(pmd));
> >  }
> >
> > @@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
> >                          continue;
> >                  }
> >
> > +                if (pud_huge(*pud)) {
> > +                        /*
> > +                         * If we are dealing with a huge pud, just clear it and
> > +                         * move on.
> > +                         */
> > +                        clear_pud_entry(kvm, pud, addr);
> > +                        addr = pud_addr_end(addr, end);
> > +                        continue;
> > +                }
> > +
> >                  pmd = pmd_offset(pud, addr);
> >                  if (pmd_none(*pmd)) {
> >                          addr = pmd_addr_end(addr, end);
> >                          continue;
> >                  }
> >
> > -                pte = pte_offset_kernel(pmd, addr);
> > -                clear_pte_entry(kvm, pte, addr);
> > -                next = addr + PAGE_SIZE;
> > +                if (!kvm_pmd_huge(*pmd)) {
> > +                        pte = pte_offset_kernel(pmd, addr);
> > +                        clear_pte_entry(kvm, pte, addr);
> > +                        next = addr + PAGE_SIZE;
> > +                }
> >
> > -                /* If we emptied the pte, walk back up the ladder */
> > -                if (page_empty(pte)) {
> > +                /*
> > +                 * If the pmd entry is to be cleared, walk back up the ladder
> > +                 */
> > +                if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
> >                          clear_pmd_entry(kvm, pmd, addr);
> >                          next = pmd_addr_end(addr, end);
> >                          if (page_empty(pmd) && !page_empty(pud)) {
> > @@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
> >          kvm->arch.pgd = NULL;
> >  }
> >
> > -
> > -static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> > -                          phys_addr_t addr, const pte_t *new_pte, bool iomap)
> > +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> > +                             phys_addr_t addr)
> >  {
> >          pgd_t *pgd;
> >          pud_t *pud;
> >          pmd_t *pmd;
> > -        pte_t *pte, old_pte;
> >
> > -        /* Create 2nd stage page table mapping - Level 1 */
> >          pgd = kvm->arch.pgd + pgd_index(addr);
> >          pud = pud_offset(pgd, addr);
> >          if (pud_none(*pud)) {
> >                  if (!cache)
> > -                        return 0; /* ignore calls from kvm_set_spte_hva */
> > +                        return NULL;
> >                  pmd = mmu_memory_cache_alloc(cache);
> >                  pud_populate(NULL, pud, pmd);
> >                  get_page(virt_to_page(pud));
> >          }
> >
> > -        pmd = pmd_offset(pud, addr);
> > +        return pmd_offset(pud, addr);
> > +}
> > +
> > +static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
> > +                               *cache, phys_addr_t addr, const pmd_t *new_pmd)
> > +{
> > +        pmd_t *pmd, old_pmd;
> > +
> > +        pmd = stage2_get_pmd(kvm, cache, addr);
> > +        VM_BUG_ON(!pmd);
> >
> > -        /* Create 2nd stage page table mapping - Level 2 */
> > +        /*
> > +         * Mapping in huge pages should only happen through a fault. If a
> > +         * page is merged into a transparent huge page, the individual
> > +         * subpages of that huge page should be unmapped through MMU
> > +         * notifiers before we get here.
> > +         *
> > +         * Merging of CompoundPages is not supported; they should be split
> > +         * first, unmapped, merged, and mapped back in on-demand.
> > +         */
> > +        VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
> > +
> > +        old_pmd = *pmd;
> > +        kvm_set_pmd(pmd, *new_pmd);
> > +        if (pmd_present(old_pmd))
> > +                kvm_tlb_flush_vmid_ipa(kvm, addr);
> > +        else
> > +                get_page(virt_to_page(pmd));
> > +        return 0;
> > +}
> > +
> > +static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> > +                          phys_addr_t addr, const pte_t *new_pte, bool iomap)
> > +{
> > +        pmd_t *pmd;
> > +        pte_t *pte, old_pte;
> > +
> > +        /* Create stage-2 page table mapping - Level 1 */
> > +        pmd = stage2_get_pmd(kvm, cache, addr);
> > +        if (!pmd) {
> > +                /*
> > +                 * Ignore calls from kvm_set_spte_hva for unallocated
> > +                 * address ranges.
> > +                 */
> > +                return 0;
> > +        }
> > +
> > +        /* Create stage-2 page mappings - Level 2 */
> >          if (pmd_none(*pmd)) {
> >                  if (!cache)
> >                          return 0; /* ignore calls from kvm_set_spte_hva */
> > @@ -508,15 +577,18 @@ out:
> >  }
> >
> >  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > -                          gfn_t gfn, struct kvm_memory_slot *memslot,
> > +                          struct kvm_memory_slot *memslot,
> >                            unsigned long fault_status)
> >  {
> > -        pte_t new_pte;
> > -        pfn_t pfn;
> >          int ret;
> > -        bool write_fault, writable;
> > +        bool write_fault, writable, hugetlb = false, force_pte = false;
> >          unsigned long mmu_seq;
> > +        gfn_t gfn = fault_ipa >> PAGE_SHIFT;
> > +        unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
> > +        struct kvm *kvm = vcpu->kvm;
> >          struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> > +        struct vm_area_struct *vma;
> > +        pfn_t pfn;
> >
> >          write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
> >          if (fault_status == FSC_PERM && !write_fault) {
> > @@ -524,6 +596,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                  return -EFAULT;
> >          }
> >
> > +        /* Let's check if we will get back a huge page backed by hugetlbfs */
> > +        down_read(&current->mm->mmap_sem);
> > +        vma = find_vma_intersection(current->mm, hva, hva + 1);
> > +        if (is_vm_hugetlb_page(vma)) {
> > +                hugetlb = true;
> > +                gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
> > +        } else {
> > +                if (vma->vm_start & ~PMD_MASK)
> > +                        force_pte = true;
> 
> What is this test for exactly? I have the feeling this is actually for
> THP (since nothing in this patch is actually using force_pte), but I
> don't really get it. It probably deserves a comment.
> 

Yes, it's for THP; sorry, it got into the wrong patch. If your VMA is not
2M aligned, then there is no correlation between a THP in the process page
tables being mapped at the PMD level and the same set of pages being mapped
in the stage-2 page tables, so we simply bail out of using THP and map each
page of the THP with individual page mappings. (Note that this doesn't
break anything; we just don't get the benefit of huge pages in this case.)
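As an illustration of the alignment argument (a sketch, not code from this
patch; the helper name is invented, and PMD_MASK is assumed to mask a 2M
block, i.e. 4K pages with LPAE), the requirement boils down to the userspace
address and the guest IPA sharing the same offset within a 2M block:

#include <linux/types.h>
#include <linux/mm.h>        /* for PMD_MASK */

/*
 * Sketch only: a 2M stage-2 block mapping covers the same set of pages
 * as a huge page in the process page tables only when the userspace
 * address and the guest IPA have identical offsets inside a 2M block.
 * An unaligned VMA start breaks that correlation, so each page has to
 * be mapped individually at stage-2 instead.
 */
static inline bool hva_ipa_pmd_congruent(unsigned long hva, phys_addr_t ipa)
{
        return (hva & ~PMD_MASK) == (ipa & ~PMD_MASK);
}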
> > +        }
> > +        up_read(&current->mm->mmap_sem);
> > +
> >          /* We need minimum second+third level pages */
> >          ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
> >          if (ret)
> > @@ -541,26 +625,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >           */
> >          smp_rmb();
> >
> > -        pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
> > +        pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
> >          if (is_error_pfn(pfn))
> >                  return -EFAULT;
> >
> > -        new_pte = pfn_pte(pfn, PAGE_S2);
> > -        coherent_icache_guest_page(vcpu->kvm, gfn);
> > -
> > -        spin_lock(&vcpu->kvm->mmu_lock);
> > -        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
> > +        spin_lock(&kvm->mmu_lock);
> > +        if (mmu_notifier_retry(kvm, mmu_seq))
> >                  goto out_unlock;
> > -        if (writable) {
> > -                kvm_set_s2pte_writable(&new_pte);
> > -                kvm_set_pfn_dirty(pfn);
> > +
> > +        if (hugetlb) {
> > +                pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
> > +                new_pmd = pmd_mkhuge(new_pmd);
> > +                if (writable) {
> > +                        kvm_set_s2pmd_writable(&new_pmd);
> > +                        kvm_set_pfn_dirty(pfn);
> > +                }
> > +                coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
> > +                ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
> > +        } else {
> > +                pte_t new_pte = pfn_pte(pfn, PAGE_S2);
> > +                if (writable) {
> > +                        kvm_set_s2pte_writable(&new_pte);
> > +                        kvm_set_pfn_dirty(pfn);
> > +                }
> > +                coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
> > +                ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
> >          }
> > -        stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);
> > +
> >
> >  out_unlock:
> > -        spin_unlock(&vcpu->kvm->mmu_lock);
> > +        spin_unlock(&kvm->mmu_lock);
> >          kvm_release_pfn_clean(pfn);
> > -        return 0;
> > +        return ret;
> >  }
> >
> >  /**
> > @@ -629,7 +725,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
> >
> >          memslot = gfn_to_memslot(vcpu->kvm, gfn);
> >
> > -        ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status);
> > +        ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
> >          if (ret == 0)
> >                  ret = 1;
> >  out_unlock:
> > diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> > index efe609c..e7f3852 100644
> > --- a/arch/arm64/include/asm/kvm_mmu.h
> > +++ b/arch/arm64/include/asm/kvm_mmu.h
> > @@ -91,6 +91,7 @@ int kvm_mmu_init(void);
> >  void kvm_clear_hyp_idmap(void);
> >
> >  #define kvm_set_pte(ptep, pte)        set_pte(ptep, pte)
> > +#define kvm_set_pmd(pmdp, pmd)        set_pmd(pmdp, pmd)
> >
> >  static inline bool kvm_is_write_fault(unsigned long esr)
> >  {
> > @@ -116,13 +117,18 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
> >          pte_val(*pte) |= PTE_S2_RDWR;
> >  }
> >
> > +static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
> > +{
> > +        pmd_val(*pmd) |= PTE_S2_RDWR;
> > +}
> 
> Same comment as above.
> 

yup

-Christoffer
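For reference, the dedicated flag Marc asks for might look roughly like the
sketch below. The names, values, and file placement are assumptions rather
than part of this patch; the stage-2 HAP[2:1] permission bits are taken to
sit at bits [7:6] of the descriptor, matching the existing L_PTE_S2_RDWR and
PTE_S2_RDWR definitions:

/* arch/arm/include/asm/pgtable-3level.h (sketch) */
#define L_PMD_S2_RDWR        (_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */

/* arch/arm/include/asm/kvm_mmu.h (sketch) */
static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
{
        pmd_val(*pmd) |= L_PMD_S2_RDWR;
}

/* arch/arm64/include/asm/pgtable-hwdef.h (sketch) */
#define PMD_S2_RDWR          (_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */

/* arch/arm64/include/asm/kvm_mmu.h (sketch) */
static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
{
        pmd_val(*pmd) |= PMD_S2_RDWR;
}

Keeping the PMD permissions behind their own names avoids mixing PTE and PMD
encodings, even though the bit positions are the same in the LPAE/ARMv8
stage-2 descriptors.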