On 2024/6/14 22:45, Marc Zyngier wrote: > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c > index 8984b7c213e1..5aed2e9d380d 100644 > --- a/arch/arm64/kvm/mmu.c > +++ b/arch/arm64/kvm/mmu.c > @@ -1407,6 +1407,7 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) > } > > static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > + struct kvm_s2_trans *nested, > struct kvm_memory_slot *memslot, unsigned long hva, > bool fault_is_perm) > { > @@ -1415,6 +1416,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > bool exec_fault, mte_allowed; > bool device = false, vfio_allow_any_uc = false; > unsigned long mmu_seq; > + phys_addr_t ipa = fault_ipa; > struct kvm *kvm = vcpu->kvm; > struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; > struct vm_area_struct *vma; > @@ -1498,10 +1500,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > } > > vma_pagesize = 1UL << vma_shift; > + > + if (nested) { > + unsigned long max_map_size; > + > + max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; > + > + ipa = kvm_s2_trans_output(nested); > + > + /* > + * If we're about to create a shadow stage 2 entry, then we > + * can only create a block mapping if the guest stage 2 page > + * table uses at least as big a mapping. > + */ > + max_map_size = min(kvm_s2_trans_size(nested), max_map_size); > + > + /* > + * Be careful that if the mapping size falls between > + * two host sizes, take the smallest of the two. 
> + */ > + if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) > + max_map_size = PMD_SIZE; > + else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) > + max_map_size = PAGE_SIZE; > + > + force_pte = (max_map_size == PAGE_SIZE); > + vma_pagesize = min(vma_pagesize, (long)max_map_size); > + } > + > if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) > fault_ipa &= ~(vma_pagesize - 1); > > - gfn = fault_ipa >> PAGE_SHIFT; > + gfn = ipa >> PAGE_SHIFT; I hit a non-nested guest boot failure (with vma_pagesize == PUD_SIZE), and bisection led me here. In the non-nested case, ipa is assigned from fault_ipa *before* the block-alignment masking above, so the ~(vma_pagesize - 1) adjustment applied to fault_ipa is no longer reflected in the gfn calculation. Is it intentional to compute gfn from the unaligned address when the guest memory is backed by hugetlbfs? This looks broken for the non-nested case. But since I haven't looked at user_mem_abort() for a long time, I'm not sure if I've missed something... Thanks, Zenghui