On Thu, Aug 22, 2019 at 5:58 PM Alexander Graf <graf@xxxxxxxxxx> wrote:
>
> On 22.08.19 10:45, Anup Patel wrote:
> > This patch implements all required functions for programming
> > the stage2 page table for each Guest/VM.
> >
> > At high-level, the flow of stage2 related functions is similar
> > from KVM ARM/ARM64 implementation but the stage2 page table
> > format is quite different for KVM RISC-V.
> >
> > Signed-off-by: Anup Patel <anup.patel@xxxxxxx>
> > Acked-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> > Reviewed-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> > ---
> >  arch/riscv/include/asm/kvm_host.h     |  10 +
> >  arch/riscv/include/asm/pgtable-bits.h |   1 +
> >  arch/riscv/kvm/mmu.c                  | 637 +++++++++++++++++++++++++-
> >  3 files changed, 638 insertions(+), 10 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> > index 3b09158f80f2..a37775c92586 100644
> > --- a/arch/riscv/include/asm/kvm_host.h
> > +++ b/arch/riscv/include/asm/kvm_host.h
> > @@ -72,6 +72,13 @@ struct kvm_mmio_decode {
> >  	int shift;
> >  };
> >
> > +#define KVM_MMU_PAGE_CACHE_NR_OBJS	32
> > +
> > +struct kvm_mmu_page_cache {
> > +	int nobjs;
> > +	void *objects[KVM_MMU_PAGE_CACHE_NR_OBJS];
> > +};
> > +
> >  struct kvm_cpu_context {
> >  	unsigned long zero;
> >  	unsigned long ra;
> > @@ -163,6 +170,9 @@ struct kvm_vcpu_arch {
> >  	/* MMIO instruction details */
> >  	struct kvm_mmio_decode mmio_decode;
> >
> > +	/* Cache pages needed to program page tables with spinlock held */
> > +	struct kvm_mmu_page_cache mmu_page_cache;
> > +
> >  	/* VCPU power-off state */
> >  	bool power_off;
> >
> > diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
> > index bbaeb5d35842..be49d62fcc2b 100644
> > --- a/arch/riscv/include/asm/pgtable-bits.h
> > +++ b/arch/riscv/include/asm/pgtable-bits.h
> > @@ -26,6 +26,7 @@
> >
> >  #define _PAGE_SPECIAL   _PAGE_SOFT
> >  #define _PAGE_TABLE     _PAGE_PRESENT
> > +#define _PAGE_LEAF      (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
> >
> >  /*
> >   * _PAGE_PROT_NONE is set on not-present pages (and ignored by the hardware) to
> > diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> > index 2b965f9aac07..9e95ab6769f6 100644
> > --- a/arch/riscv/kvm/mmu.c
> > +++ b/arch/riscv/kvm/mmu.c
> > @@ -18,6 +18,432 @@
> >  #include <asm/page.h>
> >  #include <asm/pgtable.h>
> >
> > +#ifdef CONFIG_64BIT
> > +#define stage2_have_pmd		true
> > +#define stage2_gpa_size		((phys_addr_t)(1ULL << 39))
> > +#define stage2_cache_min_pages	2
> > +#else
> > +#define pmd_index(x)		0
> > +#define pfn_pmd(x, y)		({ pmd_t __x = { 0 }; __x; })
> > +#define stage2_have_pmd		false
> > +#define stage2_gpa_size		((phys_addr_t)(1ULL << 32))
> > +#define stage2_cache_min_pages	1
> > +#endif
> > +
> > +static int stage2_cache_topup(struct kvm_mmu_page_cache *pcache,
> > +			      int min, int max)
> > +{
> > +	void *page;
> > +
> > +	BUG_ON(max > KVM_MMU_PAGE_CACHE_NR_OBJS);
> > +	if (pcache->nobjs >= min)
> > +		return 0;
> > +	while (pcache->nobjs < max) {
> > +		page = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
> > +		if (!page)
> > +			return -ENOMEM;
> > +		pcache->objects[pcache->nobjs++] = page;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static void stage2_cache_flush(struct kvm_mmu_page_cache *pcache)
> > +{
> > +	while (pcache && pcache->nobjs)
> > +		free_page((unsigned long)pcache->objects[--pcache->nobjs]);
> > +}
> > +
> > +static void *stage2_cache_alloc(struct kvm_mmu_page_cache *pcache)
> > +{
> > +	void *p;
> > +
> > +	if (!pcache)
> > +		return NULL;
> > +
> > +	BUG_ON(!pcache->nobjs);
> > +	p = pcache->objects[--pcache->nobjs];
> > +
> > +	return p;
> > +}
> > +
> > +struct local_guest_tlb_info {
> > +	struct kvm_vmid *vmid;
> > +	gpa_t addr;
> > +};
> > +
> > +static void local_guest_tlb_flush_vmid_gpa(void *info)
> > +{
> > +	struct local_guest_tlb_info *infop = info;
> > +
> > +	__kvm_riscv_hfence_gvma_vmid_gpa(READ_ONCE(infop->vmid->vmid_version),
> > +					 infop->addr);
> > +}
> > +
> > +static void stage2_remote_tlb_flush(struct kvm *kvm, gpa_t addr)
> > +{
> > +	struct local_guest_tlb_info info;
> > +	struct kvm_vmid *vmid = &kvm->arch.vmid;
> > +
> > +	/* TODO: This should be SBI call */
> > +	info.vmid = vmid;
> > +	info.addr = addr;
> > +	preempt_disable();
> > +	smp_call_function_many(cpu_all_mask, local_guest_tlb_flush_vmid_gpa,
> > +			       &info, true);
>
> This is all nice and dandy on the toy 4 core systems we have today, but
> it will become a bottleneck further down the road.
>
> How many VMIDs do you have? Could you just allocate a new one every time
> you switch host CPUs? Then you know exactly which CPUs to flush by
> looking at all your vcpu structs and a local field that tells you which
> pCPU they're on at this moment.
>
> Either way, it's nothing that should block inclusion.

For today, we're fine. We are not happy about this either.

The other two options we have are:
1. Have SBI calls for remote HFENCEs
2. Propose a RISC-V ISA extension for remote FENCEs

Option 1 mostly means extending the SBI spec and implementing it in
the runtime firmware.

Option 2 is the ideal solution, but it requires consensus among a wider
audience in the RISC-V Foundation.

At this point, we are fine with a simple solution.

Regards,
Anup

>
>
> Alex
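
Alexander's targeted-flush idea could look roughly like the sketch below.
This is a hypothetical variant, not code from the series: it assumes
vcpu->cpu holds the physical CPU a vCPU is currently loaded on (and -1
otherwise), the function name is made up, and it only covers the "flush
the pCPUs the vCPUs are on" half of the suggestion; vCPUs migrating
concurrently would still need something like the VMID reallocation he
mentions.

static void stage2_remote_tlb_flush_targeted(struct kvm *kvm, gpa_t addr)
{
	struct local_guest_tlb_info info;
	struct kvm_vcpu *vcpu;
	cpumask_t cpus;		/* on-stack mask; fine for modest NR_CPUS */
	int i, cpu;

	info.vmid = &kvm->arch.vmid;
	info.addr = addr;

	cpumask_clear(&cpus);
	preempt_disable();

	/* Collect the pCPUs that currently have one of our vCPUs loaded. */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu >= 0)
			cpumask_set_cpu(cpu, &cpus);
	}

	/* smp_call_function_many() never runs the function locally ... */
	smp_call_function_many(&cpus, local_guest_tlb_flush_vmid_gpa,
			       &info, true);

	/* ... so flush the local hart directly if it is in the mask. */
	if (cpumask_test_cpu(smp_processor_id(), &cpus))
		local_guest_tlb_flush_vmid_gpa(&info);

	preempt_enable();
}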
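
For completeness, here is a minimal, hypothetical sketch of how the page
cache above is meant to be used. Only the stage2_cache_* helpers,
stage2_cache_min_pages, KVM_MMU_PAGE_CACHE_NR_OBJS and mmu_page_cache come
from the patch; the rest, including the function name, is illustrative.
The point is that the sleeping GFP_KERNEL allocations happen before
kvm->mmu_lock is taken, and stage2_cache_alloc() can then hand out
pre-zeroed pages without sleeping under the spinlock.

static int stage2_map_page_example(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	struct kvm_mmu_page_cache *pcache = &vcpu->arch.mmu_page_cache;
	pte_t *new_table;
	int ret;

	/* May sleep, so refill the cache before taking the spinlock. */
	ret = stage2_cache_topup(pcache, stage2_cache_min_pages,
				 KVM_MMU_PAGE_CACHE_NR_OBJS);
	if (ret)
		return ret;

	spin_lock(&vcpu->kvm->mmu_lock);

	/* Never sleeps: just pops one of the pre-allocated, zeroed pages. */
	new_table = stage2_cache_alloc(pcache);

	/* ... install new_table as an intermediate stage2 table level ... */

	spin_unlock(&vcpu->kvm->mmu_lock);

	return 0;
}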