If we are faulting on a shadow stage 2 translation, we first walk the guest hypervisor's stage 2 page table to see if it has a mapping. If not, we inject a stage 2 page fault to the virtual EL2. Otherwise, we create a mapping in the shadow stage 2 page table. Note that we have to deal with two IPAs when we got a shadow stage 2 page fault. One is the address we faulted on, and is in the L2 guest phys space. The other is from the guest stage-2 page table walk, and is in the L1 guest phys space. To differentiate them, we rename variables so that fault_ipa is used for the former and ipa is used for the latter. Co-developed-by: Christoffer Dall <christoffer.dall@xxxxxxxxxx> Co-developed-by: Jintack Lim <jintack.lim@xxxxxxxxxx> Signed-off-by: Christoffer Dall <christoffer.dall@xxxxxxxxxx> Signed-off-by: Jintack Lim <jintack.lim@xxxxxxxxxx> [maz: rewrote this multiple times...] Signed-off-by: Marc Zyngier <maz@xxxxxxxxxx> --- arch/arm64/include/asm/kvm_emulate.h | 7 +++ arch/arm64/include/asm/kvm_nested.h | 19 ++++++ arch/arm64/kvm/mmu.c | 89 ++++++++++++++++++++++++---- arch/arm64/kvm/nested.c | 48 +++++++++++++++ 4 files changed, 153 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 8ccf8a1d37ff..5173f8cf2904 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -649,4 +649,11 @@ static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) kvm_write_cptr_el2(val); } + +static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu && + vcpu->arch.hw_mmu->nested_stage2_enabled); +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index a2e719d1cd53..a9aec29bf7a1 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -77,8 +77,27 @@ struct kvm_s2_trans { u64 upper_attr; }; +static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans) +{ + return trans->output; +} + +static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans) +{ + return trans->block_size; +} + +static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans) +{ + return trans->esr; +} + extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, struct kvm_s2_trans *result); +extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, + struct kvm_s2_trans *trans); +extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2); +int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe); extern bool forward_smc_trap(struct kvm_vcpu *vcpu); extern bool __check_nv_sr_forward(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 588ce46c0ad0..41de7616b735 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1412,14 +1412,16 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - struct kvm_memory_slot *memslot, unsigned long hva, - unsigned long fault_status) + struct kvm_s2_trans *nested, + struct kvm_memory_slot *memslot, + unsigned long hva, unsigned long fault_status) { int ret = 0; bool write_fault, writable, force_pte = false; bool exec_fault, mte_allowed; bool device = false; unsigned long mmu_seq; + phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; @@ -1504,10 +1506,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } vma_pagesize = 1UL << vma_shift; + + if (nested) { + unsigned long max_map_size; + + max_map_size = force_pte ? PUD_SIZE : PAGE_SIZE; + + ipa = kvm_s2_trans_output(nested); + + /* + * If we're about to create a shadow stage 2 entry, then we + * can only create a block mapping if the guest stage 2 page + * table uses at least as big a mapping. + */ + max_map_size = min(kvm_s2_trans_size(nested), max_map_size); + + /* + * Be careful that if the mapping size falls between + * two host sizes, take the smallest of the two. + */ + if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) + max_map_size = PMD_SIZE; + else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) + max_map_size = PAGE_SIZE; + + force_pte = (max_map_size == PAGE_SIZE); + vma_pagesize = min(vma_pagesize, (long)max_map_size); + } + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) fault_ipa &= ~(vma_pagesize - 1); - gfn = fault_ipa >> PAGE_SHIFT; + gfn = ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); /* Don't use the VMA after the unlock -- it may have vanished */ @@ -1657,8 +1687,10 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) */ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) { + struct kvm_s2_trans nested_trans, *nested = NULL; unsigned long fault_status; - phys_addr_t fault_ipa; + phys_addr_t fault_ipa; /* The address we faulted on */ + phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ struct kvm_memory_slot *memslot; unsigned long hva; bool is_iabt, write_fault, writable; @@ -1667,7 +1699,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) fault_status = kvm_vcpu_trap_get_fault_type(vcpu); - fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); if (fault_status == ESR_ELx_FSC_FAULT) { @@ -1708,6 +1740,12 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) if (fault_status != ESR_ELx_FSC_FAULT && fault_status != ESR_ELx_FSC_PERM && fault_status != ESR_ELx_FSC_ACCESS) { + /* + * We must never see an address size fault on shadow stage 2 + * page table walk, because we would have injected an addr + * size fault when we walked the nested s2 page and not + * create the shadow entry. + */ kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), @@ -1717,7 +1755,37 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) idx = srcu_read_lock(&vcpu->kvm->srcu); - gfn = fault_ipa >> PAGE_SHIFT; + /* + * We may have faulted on a shadow stage 2 page table if we are + * running a nested guest. In this case, we have to resolve the L2 + * IPA to the L1 IPA first, before knowing what kind of memory should + * back the L1 IPA. + * + * If the shadow stage 2 page table walk faults, then we simply inject + * this to the guest and carry on. + */ + if (kvm_is_shadow_s2_fault(vcpu)) { + u32 esr; + + ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ipa = kvm_s2_trans_output(&nested_trans); + nested = &nested_trans; + } + + gfn = ipa >> PAGE_SHIFT; memslot = gfn_to_memslot(vcpu->kvm, gfn); hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); write_fault = kvm_is_write_fault(vcpu); @@ -1761,13 +1829,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * faulting VA. This is always 12 bits, irrespective * of the page size. */ - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, fault_ipa); + ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); + ret = io_mem_abort(vcpu, ipa); goto out_unlock; } /* Userspace should not be able to register out-of-bounds IPAs */ - VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); + VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); if (fault_status == ESR_ELx_FSC_ACCESS) { handle_access_fault(vcpu, fault_ipa); @@ -1775,7 +1843,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) goto out_unlock; } - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + ret = user_mem_abort(vcpu, fault_ipa, nested, + memslot, hva, fault_status); if (ret == 0) ret = 1; out: diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index cd74d8ee93cb..f4014ae0f901 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -108,6 +108,15 @@ static u32 compute_fsc(int level, u32 fsc) return fsc | (level & 0x3); } +static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc) +{ + u32 esr; + + esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC; + esr |= compute_fsc(level, fsc); + return esr; +} + static int check_base_s2_limits(struct s2_walk_info *wi, int level, int input_size, int stride) { @@ -472,6 +481,45 @@ void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) } } +/* + * Returns non-zero if permission fault is handled by injecting it to the next + * level hypervisor. + */ +int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans) +{ + unsigned long fault_status = kvm_vcpu_trap_get_fault_type(vcpu); + bool forward_fault = false; + + trans->esr = 0; + + if (fault_status != ESR_ELx_FSC_PERM) + return 0; + + if (kvm_vcpu_trap_is_iabt(vcpu)) { + forward_fault = (trans->upper_attr & BIT(54)); + } else { + bool write_fault = kvm_is_write_fault(vcpu); + + forward_fault = ((write_fault && !trans->writable) || + (!write_fault && !trans->readable)); + } + + if (forward_fault) { + trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM); + return 1; + } + + return 0; +} + +int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2) +{ + vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2); + vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2); + + return kvm_inject_nested_sync(vcpu, esr_el2); +} + void kvm_arch_flush_shadow_all(struct kvm *kvm) { int i; -- 2.39.2