> -----Original Message-----
> From: Anup Patel [mailto:anup@xxxxxxxxxxxxxx]
> Sent: Monday, November 30, 2020 6:22 PM
> To: Jiangyifei <jiangyifei@xxxxxxxxxx>
> Cc: Anup Patel <anup.patel@xxxxxxx>; Palmer Dabbelt <palmer@xxxxxxxxxxx>;
> Palmer Dabbelt <palmerdabbelt@xxxxxxxxxx>; Paul Walmsley
> <paul.walmsley@xxxxxxxxxx>; Albert Ou <aou@xxxxxxxxxxxxxxxxx>;
> Paolo Bonzini <pbonzini@xxxxxxxxxx>; Alexander Graf <graf@xxxxxxxxxx>;
> Atish Patra <atish.patra@xxxxxxx>; Alistair Francis <Alistair.Francis@xxxxxxx>;
> Damien Le Moal <damien.lemoal@xxxxxxx>; kvm@xxxxxxxxxxxxxxx;
> kvm-riscv@xxxxxxxxxxxxxxxxxxx; linux-riscv@xxxxxxxxxxxxxxxxxxx;
> linux-kernel@xxxxxxxxxxxxxxx; Zhangxiaofeng (F)
> <victor.zhangxiaofeng@xxxxxxxxxx>; Wubin (H) <wu.wubin@xxxxxxxxxx>;
> dengkai (A) <dengkai1@xxxxxxxxxx>; yinyipeng <yinyipeng1@xxxxxxxxxx>
> Subject: Re: [PATCH v15 10/17] RISC-V: KVM: Implement stage2 page table
> programming
>
> On Tue, Nov 24, 2020 at 2:56 PM Anup Patel <anup@xxxxxxxxxxxxxx> wrote:
> >
> > On Mon, Nov 16, 2020 at 2:59 PM Jiangyifei <jiangyifei@xxxxxxxxxx> wrote:
> > >
> > >
> > > > -----Original Message-----
> > > > From: Anup Patel [mailto:anup.patel@xxxxxxx]
> > > > Sent: Monday, November 9, 2020 7:33 PM
> > > > To: Palmer Dabbelt <palmer@xxxxxxxxxxx>; Palmer Dabbelt
> > > > <palmerdabbelt@xxxxxxxxxx>; Paul Walmsley <paul.walmsley@xxxxxxxxxx>;
> > > > Albert Ou <aou@xxxxxxxxxxxxxxxxx>; Paolo Bonzini <pbonzini@xxxxxxxxxx>
> > > > Cc: Alexander Graf <graf@xxxxxxxxxx>; Atish Patra <atish.patra@xxxxxxx>;
> > > > Alistair Francis <Alistair.Francis@xxxxxxx>; Damien Le Moal
> > > > <damien.lemoal@xxxxxxx>; Anup Patel <anup@xxxxxxxxxxxxxx>;
> > > > kvm@xxxxxxxxxxxxxxx; kvm-riscv@xxxxxxxxxxxxxxxxxxx;
> > > > linux-riscv@xxxxxxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx;
> > > > Anup Patel <anup.patel@xxxxxxx>; Jiangyifei <jiangyifei@xxxxxxxxxx>
> > > > Subject: [PATCH v15 10/17] RISC-V: KVM: Implement stage2 page table
> > > > programming
> > > >
> > > > This patch implements all required functions for programming the
> > > > stage2 page table for each Guest/VM.
> > > >
> > > > At a high level, the flow of the stage2-related functions is similar
> > > > to the KVM ARM/ARM64 implementation, but the stage2 page table
> > > > format is quite different for KVM RISC-V.
> > > >
> > > > [jiangyifei: stage2 dirty log support]
> > > > Signed-off-by: Yifei Jiang <jiangyifei@xxxxxxxxxx>
> > > > Signed-off-by: Anup Patel <anup.patel@xxxxxxx>
> > > > Acked-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> > > > Reviewed-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> > > > ---
> > > >  arch/riscv/include/asm/kvm_host.h     |  12 +
> > > >  arch/riscv/include/asm/pgtable-bits.h |   1 +
> > > >  arch/riscv/kvm/Kconfig                |   1 +
> > > >  arch/riscv/kvm/main.c                 |  19 +
> > > >  arch/riscv/kvm/mmu.c                  | 649 +++++++++++++++++++++++++-
> > > >  arch/riscv/kvm/vm.c                   |   6 -
> > > >  6 files changed, 672 insertions(+), 16 deletions(-)
> > > >
> > >
> > > ......
> > >
> > > >  int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
> > > > @@ -69,27 +562,163 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
> > > >  			 gpa_t gpa, unsigned long hva,
> > > >  			 bool writeable, bool is_write)
> > > >  {
> > > > -	/* TODO: */
> > > > -	return 0;
> > > > +	int ret;
> > > > +	kvm_pfn_t hfn;
> > > > +	short vma_pageshift;
> > > > +	gfn_t gfn = gpa >> PAGE_SHIFT;
> > > > +	struct vm_area_struct *vma;
> > > > +	struct kvm *kvm = vcpu->kvm;
> > > > +	struct kvm_mmu_page_cache *pcache = &vcpu->arch.mmu_page_cache;
> > > > +	bool logging = (memslot->dirty_bitmap &&
> > > > +			!(memslot->flags & KVM_MEM_READONLY)) ? true : false;
> > > > +	unsigned long vma_pagesize;
> > > > +
> > > > +	mmap_read_lock(current->mm);
> > > > +
> > > > +	vma = find_vma_intersection(current->mm, hva, hva + 1);
> > > > +	if (unlikely(!vma)) {
> > > > +		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > > > +		mmap_read_unlock(current->mm);
> > > > +		return -EFAULT;
> > > > +	}
> > > > +
> > > > +	if (is_vm_hugetlb_page(vma))
> > > > +		vma_pageshift = huge_page_shift(hstate_vma(vma));
> > > > +	else
> > > > +		vma_pageshift = PAGE_SHIFT;
> > > > +	vma_pagesize = 1ULL << vma_pageshift;
> > > > +	if (logging || (vma->vm_flags & VM_PFNMAP))
> > > > +		vma_pagesize = PAGE_SIZE;
> > > > +
> > > > +	if (vma_pagesize == PMD_SIZE || vma_pagesize == PGDIR_SIZE)
> > > > +		gfn = (gpa & huge_page_mask(hstate_vma(vma))) >>
> > > > +			PAGE_SHIFT;
> > > > +
> > > > +	mmap_read_unlock(current->mm);
> > > > +
> > > > +	if (vma_pagesize != PGDIR_SIZE &&
> > > > +	    vma_pagesize != PMD_SIZE &&
> > > > +	    vma_pagesize != PAGE_SIZE) {
> > > > +		kvm_err("Invalid VMA page size 0x%lx\n", vma_pagesize);
> > > > +		return -EFAULT;
> > > > +	}
> > > > +
> > > > +	/* We need minimum second+third level pages */
> > > > +	ret = stage2_cache_topup(pcache, stage2_pgd_levels,
> > > > +				 KVM_MMU_PAGE_CACHE_NR_OBJS);
> > > > +	if (ret) {
> > > > +		kvm_err("Failed to topup stage2 cache\n");
> > > > +		return ret;
> > > > +	}
> > > > +
> > > > +	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, NULL);
> > > > +	if (hfn == KVM_PFN_ERR_HWPOISON) {
> > > > +		send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
> > > > +				vma_pageshift, current);
> > > > +		return 0;
> > > > +	}
> > > > +	if (is_error_noslot_pfn(hfn))
> > > > +		return -EFAULT;
> > > > +
> > > > +	/*
> > > > +	 * If logging is active then we allow writable pages only
> > > > +	 * for write faults.
> > > > +	 */
> > > > +	if (logging && !is_write)
> > > > +		writeable = false;
> > > > +
> > > > +	spin_lock(&kvm->mmu_lock);
> > > > +
> > > > +	if (writeable) {
> > >
> > > Hi Anup,
> > >
> > > What is the purpose of "writable = !memslot_is_readonly(slot)" in this series?
> >
> > Where? I don't see this line in any of the patches.
> >
> > >
> > > When mapping the HVA to HPA above, it doesn't know that the stage2
> > > PTE writability is "!memslot_is_readonly(slot)".
> > > This may cause a difference between the writability of HVA->HPA and
> > > GPA->HPA. For example, GPA->HPA is writeable, but HVA->HPA is not
> > > writeable.
> >
> > Yes, this is possible, particularly when the Host kernel is updating the
> > writability of HVA->HPA mappings for swapping pages in/out.
> >
> > >
> > > Is it better that the writability of HVA->HPA is also determined by
> > > whether the memslot is readonly in this change?
> > > Like this:
> > > -	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, NULL);
> > > +	hfn = gfn_to_pfn_prot(kvm, gfn, writeable, NULL);
> >
> > gfn_to_pfn_prot() needs to know what type of fault we got (i.e. a read
> > or write fault). The rest of the information (such as whether the slot
> > is writable or not) is already available to gfn_to_pfn_prot().
> >
> > The question here is whether we should pass "&writeable" or NULL as the
> > last parameter to gfn_to_pfn_prot(). The recent JUMP label support in
> > Linux RISC-V causes problems on HW where the PTE 'A' and 'D' bits are
> > not updated by HW, so I had to change the last parameter of
> > gfn_to_pfn_prot() from "&writeable" to NULL.
> >
> > I am still investigating this.
>
> This turned out to be a bug in Spike, which is now fixed.
>
> I will include the following change in the v16 patch series:
>
> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> index 241030956d47..dc2666b4180b 100644
> --- a/arch/riscv/include/asm/kvm_host.h
> +++ b/arch/riscv/include/asm/kvm_host.h
> @@ -232,8 +232,7 @@ void __kvm_riscv_hfence_gvma_all(void);
>
>  int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
>  			 struct kvm_memory_slot *memslot,
> -			 gpa_t gpa, unsigned long hva,
> -			 bool writeable, bool is_write);
> +			 gpa_t gpa, unsigned long hva, bool is_write);
>  void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu);
>  int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm);
>  void kvm_riscv_stage2_free_pgd(struct kvm *kvm);
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index fcaeadc9b34d..56fda9ef70fd 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -689,11 +689,11 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
>
>  int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
>  			 struct kvm_memory_slot *memslot,
> -			 gpa_t gpa, unsigned long hva,
> -			 bool writeable, bool is_write)
> +			 gpa_t gpa, unsigned long hva, bool is_write)
>  {
>  	int ret;
>  	kvm_pfn_t hfn;
> +	bool writeable;
>  	short vma_pageshift;
>  	gfn_t gfn = gpa >> PAGE_SHIFT;
>  	struct vm_area_struct *vma;
> @@ -742,7 +742,7 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
>
>  	mmu_seq = kvm->mmu_notifier_seq;
>
> -	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, NULL);
> +	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writeable);
>  	if (hfn == KVM_PFN_ERR_HWPOISON) {
>  		send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
>  				vma_pageshift, current);
> diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
> index f054406792a6..058cfa168abe 100644
> --- a/arch/riscv/kvm/vcpu_exit.c
> +++ b/arch/riscv/kvm/vcpu_exit.c
> @@ -445,7 +445,7 @@ static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
>  		};
>  	}
>
> -	ret = kvm_riscv_stage2_map(vcpu, memslot, fault_addr, hva, writeable,
> +	ret = kvm_riscv_stage2_map(vcpu, memslot, fault_addr, hva,
>  		(trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false);
>  	if (ret < 0)
>  		return ret;
>
> Regards,
> Anup

This change looks good.

Yifei
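
For readers following the writability discussion above: the agreed v16 change
keeps the fault type (is_write) as the third argument to gfn_to_pfn_prot()
and uses the fourth argument to receive, in 'writeable', whether the backing
HVA->HPA mapping currently permits writes. Below is a minimal sketch of that
pattern, not the literal v16 code; stage2_set_pte_writable() and
stage2_set_pte_readonly() are hypothetical stand-ins for the real stage2
mapping routines in arch/riscv/kvm/mmu.c.

#include <linux/kvm_host.h>

/* Hypothetical stand-ins for the real stage2 PTE update helpers. */
static int stage2_set_pte_writable(struct kvm *kvm, gpa_t gpa, kvm_pfn_t hfn)
{
	return 0;	/* the real helper would install a writable PTE */
}

static int stage2_set_pte_readonly(struct kvm *kvm, gpa_t gpa, kvm_pfn_t hfn)
{
	return 0;	/* the real helper would install a read-only PTE */
}

static int stage2_map_sketch(struct kvm_vcpu *vcpu, gpa_t gpa, bool is_write)
{
	struct kvm *kvm = vcpu->kvm;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	bool writeable;
	kvm_pfn_t hfn;

	/*
	 * Pass the fault type (is_write) so a write fault into a
	 * read-only memslot is refused, and let KVM report through
	 * 'writeable' whether the HVA->HPA mapping allows writes.
	 */
	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writeable);
	if (is_error_noslot_pfn(hfn))
		return -EFAULT;

	/*
	 * Map GPA->HPA writable in stage2 only when the host mapping is
	 * writable too, so stage2 writability never exceeds HVA->HPA
	 * writability -- the mismatch raised earlier in the thread.
	 */
	if (writeable)
		return stage2_set_pte_writable(kvm, gpa, hfn);

	return stage2_set_pte_readonly(kvm, gpa, hfn);
}

In other words, the fault semantics and the host mapping's writability travel
through two separate parameters of gfn_to_pfn_prot(), which is what resolves
the GPA->HPA vs. HVA->HPA writability concern raised in the question above.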