On Wed, Apr 25, 2012 at 12:03:48PM +0800, Xiao Guangrong wrote:
> If the the present bit of page fault error code is set, it indicates
> the shadow page is populated on all levels, it means what we do is
> only modify the access bit which can be done out of mmu-lock
>
> Currently, in order to simplify the code, we only fix the page fault
> caused by write-protect on the fast path
>
> In order to better review, we hold mmu-lock to update spte in this
> patch, the concurrent update will be introduced in the later patch
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@xxxxxxxxxxxxxxxxxx>
> ---
>  arch/x86/kvm/mmu.c         |  113 ++++++++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/paging_tmpl.h |    3 +
>  2 files changed, 112 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e7d8ffe..96a9d5b 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2683,18 +2683,117 @@ exit:
>          return ret;
>  }
>
> +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
> +                                   u32 error_code)
> +{
> +        /*
> +         * #PF can be fast only if the shadow page table is present and it
> +         * is caused by write-protect, that means we just need change the
> +         * W bit of the spte which can be done out of mmu-lock.
> +         */
> +        if (!(error_code & PFERR_PRESENT_MASK) ||
> +              !(error_code & PFERR_WRITE_MASK))
> +                return false;
> +
> +        return true;
> +}
> +
> +static bool
> +fast_pf_fix_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> +                 u64 *sptep, u64 spte)
> +{
> +        gfn_t gfn;
> +
> +        spin_lock(&vcpu->kvm->mmu_lock);
> +
> +        /* The spte has been changed. */
> +        if (*sptep != spte)
> +                goto exit;
> +
> +        gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +
> +        *sptep = spte | PT_WRITABLE_MASK;
> +        mark_page_dirty(vcpu->kvm, gfn);
> +
> +exit:
> +        spin_unlock(&vcpu->kvm->mmu_lock);

There was a misunderstanding. The suggestion is to first change the
codepaths that today assume, as a side effect of holding mmu_lock, that
sptes are not read or updated concurrently, and only then introduce any
lockless spte updates.

example)

        u64 mask, old_spte = *sptep;

        if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
                __update_clear_spte_fast(sptep, new_spte);
        else
                old_spte = __update_clear_spte_slow(sptep, new_spte);

The local old_spte copy might be stale by the time
"spte_has_volatile_bits(old_spte)" reads the writable bit.

example)

VCPU0                                   VCPU1

set_spte
mmu_spte_update decides fast write
mov newspte to sptep
(non-locked write instruction)
newspte in store buffer

                                        lockless spte read path reads stale value

spin_unlock(mmu_lock)

Depending on what stale value VCPU1 read and what decision it took,
this can be a problem, say with the new bits (and we do not want to
verify every single case). Should the spte write from inside mmu_lock
always be atomic now?

So these details must be exposed to the reader; they are hidden now
(note that mmu_spte_update is already a maze, and it's getting too
deep).
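To make that concrete, here is a minimal sketch of an always-atomic
spte write (the helper name is made up for illustration and is not
code from this series; xchg() is the ordinary locked-exchange
primitive). The point is that the writer bases its follow-up decisions
on the value it actually replaced, rather than on a possibly stale
local copy:

        /*
         * Hypothetical sketch, not part of this patch: publish the new
         * spte with a locked instruction, so the returned old_spte is
         * the value that really was in the page table, even if another
         * CPU updated it locklessly in the meantime.
         */
        static u64 mmu_spte_update_atomic(u64 *sptep, u64 new_spte)
        {
                /*
                 * xchg() is a locked operation: it serializes with any
                 * concurrent lockless update and returns the spte it
                 * actually replaced.
                 */
                u64 old_spte = xchg(sptep, new_spte);

                /*
                 * Accessed/dirty bookkeeping and TLB-flush decisions
                 * can then be made on old_spte without it being stale.
                 */
                return old_spte;
        }

With something along those lines, the assumptions the lockless fast
path relies on are visible in the update helper itself, instead of
being implied by mmu_lock.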
> +
> +        return true;
> +}
> +
> +/*
> + * Return value:
> + * - true: let the vcpu to access on the same address again.
> + * - false: let the real page fault path to fix it.
> + */
> +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
> +                            int level, u32 error_code)
> +{
> +        struct kvm_shadow_walk_iterator iterator;
> +        struct kvm_mmu_page *sp;
> +        bool ret = false;
> +        u64 spte = 0ull;
> +
> +        if (!page_fault_can_be_fast(vcpu, gfn, error_code))
> +                return false;
> +
> +        walk_shadow_page_lockless_begin(vcpu);
> +        for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
> +                if (!is_shadow_present_pte(spte) || iterator.level < level)
> +                        break;
> +
> +        /*
> +         * If the mapping has been changed, let the vcpu fault on the
> +         * same address again.
> +         */
> +        if (!is_rmap_spte(spte)) {
> +                ret = true;
> +                goto exit;
> +        }
> +
> +        if (!is_last_spte(spte, level))
> +                goto exit;
> +
> +        /*
> +         * Check if it is a spurious fault caused by TLB lazily flushed.
> +         *
> +         * Need not check the access of upper level table entries since
> +         * they are always ACC_ALL.
> +         */
> +        if (is_writable_pte(spte)) {
> +                ret = true;
> +                goto exit;
> +        }
> +
> +        /*
> +         * Currently, to simplify the code, only the spte write-protected
> +         * by dirty-log can be fast fixed.
> +         */
> +        if (!spte_wp_by_dirty_log(spte))
> +                goto exit;
> +
> +        sp = page_header(__pa(iterator.sptep));
> +
> +        ret = fast_pf_fix_spte(vcpu, sp, iterator.sptep, spte);
> +
> +exit:
> +        walk_shadow_page_lockless_end(vcpu);
> +
> +        return ret;
> +}
> +
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>                           gva_t gva, pfn_t *pfn, bool write, bool *writable);
>
> -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
> -                         bool prefault)
> +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
> +                         gfn_t gfn, bool prefault)
>  {
>          int r;
>          int level;
>          int force_pt_level;
>          pfn_t pfn;
>          unsigned long mmu_seq;
> -        bool map_writable;
> +        bool map_writable, write = error_code & PFERR_WRITE_MASK;
>
>          force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
>          if (likely(!force_pt_level)) {
> @@ -2711,6 +2810,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
>          } else
>                  level = PT_PAGE_TABLE_LEVEL;
>
> +        if (fast_page_fault(vcpu, v, gfn, level, error_code))
> +                return 0;
> +
>          mmu_seq = vcpu->kvm->mmu_notifier_seq;
>          smp_rmb();
>
> @@ -3099,7 +3201,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
>          gfn = gva >> PAGE_SHIFT;
>
>          return nonpaging_map(vcpu, gva & PAGE_MASK,
> -                             error_code & PFERR_WRITE_MASK, gfn, prefault);
> +                             error_code, gfn, prefault);
>  }
>
>  static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> @@ -3179,6 +3281,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
>          } else
>                  level = PT_PAGE_TABLE_LEVEL;
>
> +        if (fast_page_fault(vcpu, gpa, gfn, level, error_code))
> +                return 0;
> +
>          mmu_seq = vcpu->kvm->mmu_notifier_seq;
>          smp_rmb();
>
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index df5a703..80493fb 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -617,6 +617,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>                  walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
>          }
>
> +        if (fast_page_fault(vcpu, addr, walker.gfn, level, error_code))
> +                return 0;
> +
>          mmu_seq = vcpu->kvm->mmu_notifier_seq;
>          smp_rmb();
>
> --
> 1.7.7.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html