From: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx> FNAME(is_self_change_mapping) has two functionalities. If the fault is on a huge page but at least one of the pagetable on the walk is also on the terminal huge page, disable the huge page mapping for the fault. If the fault is modifying at least one of the pagetable on the walk, set something to tell the emulator. The first functionality is much better handled by kvm_mmu_hugepage_adjust() now, and it has a defect that it blindly disables the huge page mapping rather than trying to reduce the size of the huge page first. Huang Hang reported that when a guest is writing to a 1G page, but only a 4K page is mapped because of the first functionality in a case in which we think a 2M page should be mapped. The 1G page includes a pagetable on the pagetable-walk, but the narrowed 2M page doesn't. To fix the problem, remove FNAME(is_self_change_mapping) for its first functionality is already and better handled by kvm_mmu_hugepage_adjust(), and re-implement the second functionality in FNAME(fetch). Reported-by: Huang Hang <hhuang@xxxxxxxxxxxxxxxxx> Signed-off-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx> --- arch/x86/kvm/mmu/paging_tmpl.h | 66 ++++++++-------------------------- 1 file changed, 15 insertions(+), 51 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 8b83abf1d8bc..c69e30737cd2 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -630,6 +630,13 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, top_level = vcpu->arch.mmu->cpu_role.base.level; if (top_level == PT32E_ROOT_LEVEL) top_level = PT32_ROOT_LEVEL; + + /* + * write_fault_to_shadow_pgtable will be set if the fault gfn is + * currently used as its pagetable on the path of the pagetable walk. + */ + vcpu->arch.write_fault_to_shadow_pgtable = false; + /* * Verify that the top-level gpte is still there. Since the page * is a root page, it is either write protected (and cannot be @@ -685,8 +692,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); + + if (fault->write && table_gfn == fault->gfn) + vcpu->arch.write_fault_to_shadow_pgtable = true; } + /* + * Adjust huge page after getting non-direct shadow page which might + * affect the huge page info. + */ kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); @@ -733,46 +747,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, return RET_PF_RETRY; } - /* - * To see whether the mapped gfn can write its page table in the current - * mapping. - * - * It is the helper function of FNAME(page_fault). When guest uses large page - * size to map the writable gfn which is used as current page table, we should - * force kvm to use small page size to map it because new shadow page will be - * created when kvm establishes shadow page table that stop kvm using large - * page size. Do it early can avoid unnecessary #PF and emulation. - * - * @write_fault_to_shadow_pgtable will return true if the fault gfn is - * currently used as its page table. - * - * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok - * since the PDPT is always shadowed, that means, we can not use large page - * size to map the gfn which is used as PDPT. - */ -static bool -FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, bool user_fault, - bool *write_fault_to_shadow_pgtable) -{ - int level; - gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); - bool self_changed = false; - - if (!(walker->pte_access & ACC_WRITE_MASK || - (!is_cr0_wp(vcpu->arch.mmu) && !user_fault))) - return false; - - for (level = walker->level; level <= walker->max_level; level++) { - gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; - - self_changed |= !(gfn & mask); - *write_fault_to_shadow_pgtable |= !gfn; - } - - return self_changed; -} - /* * Page fault handler. There are several causes for a page fault: * - there is no shadow pte for the guest pte @@ -791,7 +765,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { struct guest_walker walker; int r; - bool is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); WARN_ON_ONCE(fault->is_tdp); @@ -816,6 +789,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault } fault->gfn = walker.gfn; + fault->max_level = walker.level; fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); if (page_fault_handle_page_track(vcpu, fault)) { @@ -827,16 +801,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - vcpu->arch.write_fault_to_shadow_pgtable = false; - - is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, - &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable); - - if (is_self_change_mapping) - fault->max_level = PG_LEVEL_4K; - else - fault->max_level = walker.level; - r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; -- 2.19.1.6.gb485710b