Re: [PATCH v5 13/20] RISC-V: KVM: Implement stage2 page table programming

Alexander Graf <graf@xxxxxxxxxx> · Thu, 22 Aug 2019 15:27:12 +0200

On 22.08.19 14:38, Anup Patel wrote:
On Thu, Aug 22, 2019 at 5:58 PM Alexander Graf <graf@xxxxxxxxxx> wrote:

On 22.08.19 10:45, Anup Patel wrote:
This patch implements all required functions for programming
the stage2 page table for each Guest/VM.

At high-level, the flow of stage2 related functions is similar
from KVM ARM/ARM64 implementation but the stage2 page table
format is quite different for KVM RISC-V.

Signed-off-by: Anup Patel <anup.patel@xxxxxxx>
Acked-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Reviewed-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
---
   arch/riscv/include/asm/kvm_host.h     |  10 +
   arch/riscv/include/asm/pgtable-bits.h |   1 +
   arch/riscv/kvm/mmu.c                  | 637 +++++++++++++++++++++++++-
   3 files changed, 638 insertions(+), 10 deletions(-)

diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 3b09158f80f2..a37775c92586 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -72,6 +72,13 @@ struct kvm_mmio_decode {
       int shift;
   };

+#define KVM_MMU_PAGE_CACHE_NR_OBJS   32
+
+struct kvm_mmu_page_cache {
+     int nobjs;
+     void *objects[KVM_MMU_PAGE_CACHE_NR_OBJS];
+};
+
   struct kvm_cpu_context {
       unsigned long zero;
       unsigned long ra;
@@ -163,6 +170,9 @@ struct kvm_vcpu_arch {
       /* MMIO instruction details */
       struct kvm_mmio_decode mmio_decode;

+     /* Cache pages needed to program page tables with spinlock held */
+     struct kvm_mmu_page_cache mmu_page_cache;
+
       /* VCPU power-off state */
       bool power_off;

diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
index bbaeb5d35842..be49d62fcc2b 100644
--- a/arch/riscv/include/asm/pgtable-bits.h
+++ b/arch/riscv/include/asm/pgtable-bits.h
@@ -26,6 +26,7 @@

   #define _PAGE_SPECIAL   _PAGE_SOFT
   #define _PAGE_TABLE     _PAGE_PRESENT
+#define _PAGE_LEAF      (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)

   /*
    * _PAGE_PROT_NONE is set on not-present pages (and ignored by the hardware) to
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 2b965f9aac07..9e95ab6769f6 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -18,6 +18,432 @@
   #include <asm/page.h>
   #include <asm/pgtable.h>

+#ifdef CONFIG_64BIT
+#define stage2_have_pmd              true
+#define stage2_gpa_size              ((phys_addr_t)(1ULL << 39))
+#define stage2_cache_min_pages       2
+#else
+#define pmd_index(x)         0
+#define pfn_pmd(x, y)                ({ pmd_t __x = { 0 }; __x; })
+#define stage2_have_pmd              false
+#define stage2_gpa_size              ((phys_addr_t)(1ULL << 32))
+#define stage2_cache_min_pages       1
+#endif
+
+static int stage2_cache_topup(struct kvm_mmu_page_cache *pcache,
+                           int min, int max)
+{
+     void *page;
+
+     BUG_ON(max > KVM_MMU_PAGE_CACHE_NR_OBJS);
+     if (pcache->nobjs >= min)
+             return 0;
+     while (pcache->nobjs < max) {
+             page = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+             if (!page)
+                     return -ENOMEM;
+             pcache->objects[pcache->nobjs++] = page;
+     }
+
+     return 0;
+}
+
+static void stage2_cache_flush(struct kvm_mmu_page_cache *pcache)
+{
+     while (pcache && pcache->nobjs)
+             free_page((unsigned long)pcache->objects[--pcache->nobjs]);
+}
+
+static void *stage2_cache_alloc(struct kvm_mmu_page_cache *pcache)
+{
+     void *p;
+
+     if (!pcache)
+             return NULL;
+
+     BUG_ON(!pcache->nobjs);
+     p = pcache->objects[--pcache->nobjs];
+
+     return p;
+}
+
+struct local_guest_tlb_info {
+     struct kvm_vmid *vmid;
+     gpa_t addr;
+};
+
+static void local_guest_tlb_flush_vmid_gpa(void *info)
+{
+     struct local_guest_tlb_info *infop = info;
+
+     __kvm_riscv_hfence_gvma_vmid_gpa(READ_ONCE(infop->vmid->vmid_version),
+                                      infop->addr);
+}
+
+static void stage2_remote_tlb_flush(struct kvm *kvm, gpa_t addr)
+{
+     struct local_guest_tlb_info info;
+     struct kvm_vmid *vmid = &kvm->arch.vmid;
+
+     /* TODO: This should be SBI call */
+     info.vmid = vmid;
+     info.addr = addr;
+     preempt_disable();
+     smp_call_function_many(cpu_all_mask, local_guest_tlb_flush_vmid_gpa,
+                            &info, true);

This is all nice and dandy on the toy 4 core systems we have today, but
it will become a bottleneck further down the road.

How many VMIDs do you have? Could you just allocate a new one every time
you switch host CPUs? Then you know exactly which CPUs to flush by
looking at all your vcpu structs and a local field that tells you which
pCPU they're on at this moment.

Either way, it's nothing that should block inclusion. For today, we're fine.

We are not happy about this either.

Other two options, we have are:
1. Have SBI calls for remote HFENCEs
2. Propose RISC-V ISA extension for remote FENCEs

Option1 is mostly extending SBI spec and implementing it in runtime
firmware.

Option2 is ideal solution but requires consensus among wider audience
in RISC-V foundation.

At this point, we are fine with a simple solution.

It's fine to explicitly IPI other CPUs to flush their TLBs. What is not 
fine is to IPI *all* CPUs to flush their TLBs.


Alex