[...] > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h > index 9f339dffbc1a..2d6b5058f7d3 100644 > --- a/arch/arm64/include/asm/kvm_pgtable.h > +++ b/arch/arm64/include/asm/kvm_pgtable.h > @@ -288,6 +288,14 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); > */ > u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); > > +/* /** ? > + * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD > + * @vtcr: Content of the VTCR register. > + * > + * Return: the size (in bytes) of the stage-2 PGD > + */ > +size_t kvm_pgtable_stage2_pgd_size(u64 vtcr); > + > /** > * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. > * @pgt: Uninitialised page-table structure to initialise. > diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h > index 8f7b8a2314bb..11526e89fe5c 100644 > --- a/arch/arm64/include/asm/kvm_pkvm.h > +++ b/arch/arm64/include/asm/kvm_pkvm.h > @@ -9,6 +9,9 @@ > #include <linux/memblock.h> > #include <asm/kvm_pgtable.h> > > +/* Maximum number of protected VMs that can be created. 
*/ > +#define KVM_MAX_PVMS 255 > + > #define HYP_MEMBLOCK_REGIONS 128 > > extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; > @@ -40,6 +43,11 @@ static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size) > return res >> PAGE_SHIFT; > } > > +static inline unsigned long hyp_shadow_table_pages(void) > +{ > + return PAGE_ALIGN(KVM_MAX_PVMS * sizeof(void *)) >> PAGE_SHIFT; > +} > + > static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) > { > unsigned long total = 0, i; > diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h > index 3bea816296dc..3a0817b5c739 100644 > --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h > +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h > @@ -11,6 +11,7 @@ > #include <asm/kvm_mmu.h> > #include <asm/kvm_pgtable.h> > #include <asm/virt.h> > +#include <nvhe/pkvm.h> > #include <nvhe/spinlock.h> > > /* > @@ -68,10 +69,12 @@ bool addr_is_memory(phys_addr_t phys); > int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); > int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); > int kvm_host_prepare_stage2(void *pgt_pool_base); > +int kvm_guest_prepare_stage2(struct kvm_shadow_vm *vm, void *pgd); > void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); > > int hyp_pin_shared_mem(void *from, void *to); > void hyp_unpin_shared_mem(void *from, void *to); > +void reclaim_guest_pages(struct kvm_shadow_vm *vm); > > static __always_inline void __load_host_stage2(void) > { > diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h > new file mode 100644 > index 000000000000..1d0a33f70879 > --- /dev/null > +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h > @@ -0,0 +1,60 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (C) 2021 Google LLC > + * Author: Fuad Tabba <tabba@xxxxxxxxxx> > + */ > + > +#ifndef __ARM64_KVM_NVHE_PKVM_H__ > +#define 
__ARM64_KVM_NVHE_PKVM_H__ > + > +#include <asm/kvm_pkvm.h> > + > +/* > + * Holds the relevant data for maintaining the vcpu state completely at hyp. > + */ > +struct kvm_shadow_vcpu_state { > + /* The data for the shadow vcpu. */ > + struct kvm_vcpu shadow_vcpu; > + > + /* A pointer to the host's vcpu. */ > + struct kvm_vcpu *host_vcpu; > + > + /* A pointer to the shadow vm. */ > + struct kvm_shadow_vm *shadow_vm; IMHO, those declarations are already self-explanatory. The comments above don't bring much. > +}; > + > +/* > + * Holds the relevant data for running a protected vm. > + */ > +struct kvm_shadow_vm { > + /* The data for the shadow kvm. */ > + struct kvm kvm; > + > + /* The host's kvm structure. */ > + struct kvm *host_kvm; > + > + /* The total size of the donated shadow area. */ > + size_t shadow_area_size; > + > + struct kvm_pgtable pgt; > + > + /* Array of the shadow state per vcpu. */ > + struct kvm_shadow_vcpu_state shadow_vcpu_states[0]; > +}; > + > +static inline struct kvm_shadow_vcpu_state *get_shadow_state(struct kvm_vcpu *shadow_vcpu) > +{ > + return container_of(shadow_vcpu, struct kvm_shadow_vcpu_state, shadow_vcpu); > +} > + > +static inline struct kvm_shadow_vm *get_shadow_vm(struct kvm_vcpu *shadow_vcpu) > +{ > + return get_shadow_state(shadow_vcpu)->shadow_vm; > +} > + > +void hyp_shadow_table_init(void *tbl); > +int __pkvm_init_shadow(struct kvm *kvm, unsigned long shadow_hva, > + size_t shadow_size, unsigned long pgd_hva); > +int __pkvm_teardown_shadow(unsigned int shadow_handle); > + > +#endif /* __ARM64_KVM_NVHE_PKVM_H__ */ > diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c > index 3cea4b6ac23e..a1fbd11c8041 100644 > --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c > +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c > @@ -15,6 +15,7 @@ > > #include <nvhe/mem_protect.h> > #include <nvhe/mm.h> > +#include <nvhe/pkvm.h> > #include <nvhe/trap_handler.h> > > DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); > 
@@ -191,6 +192,24 @@ static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) > __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); > } > > +static void handle___pkvm_init_shadow(struct kvm_cpu_context *host_ctxt) > +{ > + DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); > + DECLARE_REG(unsigned long, host_shadow_va, host_ctxt, 2); > + DECLARE_REG(size_t, shadow_size, host_ctxt, 3); > + DECLARE_REG(unsigned long, pgd, host_ctxt, 4); > + > + cpu_reg(host_ctxt, 1) = __pkvm_init_shadow(host_kvm, host_shadow_va, > + shadow_size, pgd); > +} > + > +static void handle___pkvm_teardown_shadow(struct kvm_cpu_context *host_ctxt) > +{ > + DECLARE_REG(unsigned int, shadow_handle, host_ctxt, 1); > + > + cpu_reg(host_ctxt, 1) = __pkvm_teardown_shadow(shadow_handle); > +} > + > typedef void (*hcall_t)(struct kvm_cpu_context *); > > #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x > @@ -220,6 +239,8 @@ static const hcall_t host_hcall[] = { > HANDLE_FUNC(__vgic_v3_save_aprs), > HANDLE_FUNC(__vgic_v3_restore_aprs), > HANDLE_FUNC(__pkvm_vcpu_init_traps), > + HANDLE_FUNC(__pkvm_init_shadow), > + HANDLE_FUNC(__pkvm_teardown_shadow), > }; > > static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) > diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c > index e2e3b30b072e..9baf731736be 100644 > --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c > +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c > @@ -141,6 +141,20 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) > return 0; > } > > +int kvm_guest_prepare_stage2(struct kvm_shadow_vm *vm, void *pgd) > +{ > + vm->pgt.pgd = pgd; > + return 0; > +} > + > +void reclaim_guest_pages(struct kvm_shadow_vm *vm) > +{ > + unsigned long nr_pages; > + > + nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; > + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(vm->pgt.pgd), nr_pages)); > +} > + > int __pkvm_prot_finalize(void) > { > struct kvm_s2_mmu *mmu = 
&host_kvm.arch.mmu; > diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c > index 99c8d8b73e70..77aeb787670b 100644 > --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c > +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c > @@ -7,6 +7,9 @@ > #include <linux/kvm_host.h> > #include <linux/mm.h> > #include <nvhe/fixed_config.h> > +#include <nvhe/mem_protect.h> > +#include <nvhe/memory.h> I don't think this one is necessary, it is already included in mm.h. > +#include <nvhe/pkvm.h> > #include <nvhe/trap_handler.h> > > /* > @@ -183,3 +186,398 @@ void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) > pvm_init_traps_aa64mmfr0(vcpu); > pvm_init_traps_aa64mmfr1(vcpu); > } > + > +/* > + * Start the shadow table handle at the offset defined instead of at 0. > + * Mainly for sanity checking and debugging. > + */ > +#define HANDLE_OFFSET 0x1000 > + > +static unsigned int shadow_handle_to_idx(unsigned int shadow_handle) > +{ > + return shadow_handle - HANDLE_OFFSET; > +} > + > +static unsigned int idx_to_shadow_handle(unsigned int idx) > +{ > + return idx + HANDLE_OFFSET; > +} > + > +/* > + * Spinlock for protecting the shadow table related state. > + * Protects writes to shadow_table, nr_shadow_entries, and next_shadow_alloc, > + * as well as reads and writes to last_shadow_vcpu_lookup. > + */ > +static DEFINE_HYP_SPINLOCK(shadow_lock); > + > +/* > + * The table of shadow entries for protected VMs in hyp. > + * Allocated at hyp initialization and setup. > + */ > +static struct kvm_shadow_vm **shadow_table; > + > +/* Current number of vms in the shadow table. */ > +static unsigned int nr_shadow_entries; > + > +/* The next entry index to try to allocate from. */ > +static unsigned int next_shadow_alloc; > + > +void hyp_shadow_table_init(void *tbl) > +{ > + WARN_ON(shadow_table); > + shadow_table = tbl; > +} > + > +/* > + * Return the shadow vm corresponding to the handle. 
> + */ > +static struct kvm_shadow_vm *find_shadow_by_handle(unsigned int shadow_handle) > +{ > + unsigned int shadow_idx = shadow_handle_to_idx(shadow_handle); > + > + if (unlikely(shadow_idx >= KVM_MAX_PVMS)) > + return NULL; > + > + return shadow_table[shadow_idx]; > +} > + > +static void unpin_host_vcpus(struct kvm_shadow_vcpu_state *shadow_vcpu_states, > + unsigned int nr_vcpus) > +{ > + int i; > + > + for (i = 0; i < nr_vcpus; i++) { > + struct kvm_vcpu *host_vcpu = shadow_vcpu_states[i].host_vcpu; IIRC, checkpatch likes an empty line after declarations. > + hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); > + } > +} > + > +static int set_host_vcpus(struct kvm_shadow_vcpu_state *shadow_vcpu_states, > + unsigned int nr_vcpus, > + struct kvm_vcpu **vcpu_array, > + size_t vcpu_array_size) > +{ > + int i; > + > + if (vcpu_array_size < sizeof(*vcpu_array) * nr_vcpus) > + return -EINVAL; > + > + for (i = 0; i < nr_vcpus; i++) { > + struct kvm_vcpu *host_vcpu = kern_hyp_va(vcpu_array[i]); > + > + if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) { > + unpin_host_vcpus(shadow_vcpu_states, i); > + return -EBUSY; > + } > + > + shadow_vcpu_states[i].host_vcpu = host_vcpu; > + } > + > + return 0; > +} > + > +static int init_shadow_structs(struct kvm *kvm, struct kvm_shadow_vm *vm, > + struct kvm_vcpu **vcpu_array, > + unsigned int nr_vcpus) > +{ > + int i; > + > + vm->host_kvm = kvm; > + vm->kvm.created_vcpus = nr_vcpus; > + vm->kvm.arch.vtcr = host_kvm.arch.vtcr; > + > + for (i = 0; i < nr_vcpus; i++) { > + struct kvm_shadow_vcpu_state *shadow_vcpu_state = &vm->shadow_vcpu_states[i]; > + struct kvm_vcpu *shadow_vcpu = &shadow_vcpu_state->shadow_vcpu; > + struct kvm_vcpu *host_vcpu = shadow_vcpu_state->host_vcpu; > + > + shadow_vcpu_state->shadow_vm = vm; > + > + shadow_vcpu->kvm = &vm->kvm; > + shadow_vcpu->vcpu_id = READ_ONCE(host_vcpu->vcpu_id); > + shadow_vcpu->vcpu_idx = i; > + > + shadow_vcpu->arch.hw_mmu = &vm->kvm.arch.mmu; In the end, we don't seem to use much 
from the struct kvm_vcpu. Is it for convenience that a smaller struct kvm_shadow_cpu hasn't been created, or do we anticipate a later wider usage? > + } > + > + return 0; > +} > + > +static bool __exists_shadow(struct kvm *host_kvm) > +{ > + int i; > + unsigned int nr_checked = 0; > + > + for (i = 0; i < KVM_MAX_PVMS && nr_checked < nr_shadow_entries; i++) { > + if (!shadow_table[i]) > + continue; > + > + if (unlikely(shadow_table[i]->host_kvm == host_kvm)) > + return true; > + > + nr_checked++; > + } > + > + return false; > +} > + > +/* > + * Allocate a shadow table entry and insert a pointer to the shadow vm. > + * > + * Return a unique handle to the protected VM on success, > + * negative error code on failure. > + */ > +static unsigned int insert_shadow_table(struct kvm *kvm, > + struct kvm_shadow_vm *vm, > + size_t shadow_size) > +{ > + struct kvm_s2_mmu *mmu = &vm->kvm.arch.mmu; > + unsigned int shadow_handle; > + unsigned int vmid; > + > + hyp_assert_lock_held(&shadow_lock); > + > + if (unlikely(nr_shadow_entries >= KVM_MAX_PVMS)) > + return -ENOMEM; > + > + /* > + * Initializing protected state might have failed, yet a malicious host > + * could trigger this function. Thus, ensure that shadow_table exists. > + */ > + if (unlikely(!shadow_table)) > + return -EINVAL; > + > + /* Check that a shadow hasn't been created before for this host KVM. */ > + if (unlikely(__exists_shadow(kvm))) > + return -EEXIST; > + > + /* Find the next free entry in the shadow table. */ > + while (shadow_table[next_shadow_alloc]) > + next_shadow_alloc = (next_shadow_alloc + 1) % KVM_MAX_PVMS; Couldn't it be merged with __exists_shadow, which already knows the first free shadow_table idx? 
> + shadow_handle = idx_to_shadow_handle(next_shadow_alloc); > + > + vm->kvm.arch.pkvm.shadow_handle = shadow_handle; > + vm->shadow_area_size = shadow_size; > + > + /* VMID 0 is reserved for the host */ > + vmid = next_shadow_alloc + 1; > + if (vmid > 0xff) Couldn't the 0xff be found with get_vmid_bits() or even from host_kvm.arch.vtcr? Or does that depend on something completely different? Also, apologies if this has been discussed already and I missed it, but maybe KVM_MAX_PVMS could be changed to that value - 1. Unless we think that archs supporting 16 bits would waste way too much memory for that? > + return -ENOMEM; > + > + atomic64_set(&mmu->vmid.id, vmid); > + mmu->arch = &vm->kvm.arch; > + mmu->pgt = &vm->pgt; > + > + shadow_table[next_shadow_alloc] = vm; > + next_shadow_alloc = (next_shadow_alloc + 1) % KVM_MAX_PVMS; > + nr_shadow_entries++; > + > + return shadow_handle; > +} > + [...]