On Mon, Apr 23, 2012 at 07:16:52PM +0300, Avi Kivity wrote:
> Using RCU for lockless shadow walking can increase the amount of memory
> in use by the system, since RCU grace periods are unpredictable. We also
> have an unconditional write to a shared variable (reader_counter), which
> isn't good for scaling.
>
> Replace that with a scheme similar to x86's get_user_pages_fast(): disable
> interrupts during lockless shadow walk to force the freer
> (kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
> processor with interrupts enabled.
>
> We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
> kvm_flush_remote_tlbs() from avoiding the IPI.
>
> Signed-off-by: Avi Kivity <avi@xxxxxxxxxx>
> ---
>
> Turned out to be simpler than expected. However, I think there's a problem
> with make_all_cpus_request() possibly reading an incorrect vcpu->cpu.
>
>  arch/x86/include/asm/kvm_host.h |    4 ---
>  arch/x86/kvm/mmu.c              |   61 +++++++++++----------------------
>  include/linux/kvm_host.h        |    3 +-
>  3 files changed, 19 insertions(+), 49 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f624ca7..67e66e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -237,8 +237,6 @@ struct kvm_mmu_page {
>  #endif
>
>          int write_flooding_count;
> -
> -        struct rcu_head rcu;
>  };
>
>  struct kvm_pio_request {
> @@ -536,8 +534,6 @@ struct kvm_arch {
>          u64 hv_guest_os_id;
>          u64 hv_hypercall;
>
> -        atomic_t reader_counter;
> -
>  #ifdef CONFIG_KVM_MMU_AUDIT
>          int audit_point;
>  #endif
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 07424cf..903af5e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -551,19 +551,23 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
>
>  static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
>  {
> -        rcu_read_lock();
> -        atomic_inc(&vcpu->kvm->arch.reader_counter);
> -
> -        /* Increase the counter before walking shadow page table */
> -        smp_mb__after_atomic_inc();
> +        /*
> +         * Prevent page table teardown by making any free-er wait during
> +         * kvm_flush_remote_tlbs() IPI to all active vcpus.
> +         */
> +        local_irq_disable();
> +        vcpu->mode = READING_SHADOW_PAGE_TABLES;
> +        /*
> +         * wmb: advertise vcpu->mode change
> +         * rmb: make sure we see updated sptes
> +         */
> +        smp_mb();
>  }
>
>  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>  {
> -        /* Decrease the counter after walking shadow page table finished */
> -        smp_mb__before_atomic_dec();
> -        atomic_dec(&vcpu->kvm->arch.reader_counter);
> -        rcu_read_unlock();
> +        vcpu->mode = OUTSIDE_GUEST_MODE;
> +        local_irq_enable();
>  }
>
>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
> @@ -1989,30 +1993,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>          return ret;
>  }
>
> -static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
> -{
> -        struct kvm_mmu_page *sp;
> -
> -        list_for_each_entry(sp, invalid_list, link)
> -                kvm_mmu_isolate_page(sp);
> -}
> -
> -static void free_pages_rcu(struct rcu_head *head)
> -{
> -        struct kvm_mmu_page *next, *sp;
> -
> -        sp = container_of(head, struct kvm_mmu_page, rcu);
> -        while (sp) {
> -                if (!list_empty(&sp->link))
> -                        next = list_first_entry(&sp->link,
> -                                      struct kvm_mmu_page, link);
> -                else
> -                        next = NULL;
> -                kvm_mmu_free_page(sp);
> -                sp = next;
> -        }
> -}
> -
>  static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>                                      struct list_head *invalid_list)
>  {
> @@ -2021,25 +2001,18 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>          if (list_empty(invalid_list))
>                  return;
>
> +        /*
> +         * Wait for all vcpus to exit guest mode and/or lockless shadow
> +         * page table walks.
> +         */
>          kvm_flush_remote_tlbs(kvm);
>
> -        if (atomic_read(&kvm->arch.reader_counter)) {
> -                kvm_mmu_isolate_pages(invalid_list);
> -                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> -                list_del_init(invalid_list);
> -
> -                trace_kvm_mmu_delay_free_pages(sp);
> -                call_rcu(&sp->rcu, free_pages_rcu);
> -                return;
> -        }
> -
>          do {
>                  sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>                  WARN_ON(!sp->role.invalid || sp->root_count);
>                  kvm_mmu_isolate_page(sp);
>                  kvm_mmu_free_page(sp);
>          } while (!list_empty(invalid_list));
> -
>  }
>
>  /*
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 186ffab..d1f1adf 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -128,7 +128,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
>  enum {
>          OUTSIDE_GUEST_MODE,
>          IN_GUEST_MODE,
> -        EXITING_GUEST_MODE
> +        EXITING_GUEST_MODE,
> +        READING_SHADOW_PAGE_TABLES,
>  };

Should we add an explicit mb after prepare_zap_page? (We currently rely on
unrelated barriers internal to flush_remote_tlbs.)
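For reference, a minimal user-space sketch of the ordering argument above. This
is purely illustrative, not kernel code: the names walker/zapper/spte_page are
made up, only one "vcpu" is modelled, and the wait on the mode variable is a
stand-in for the real mechanism, where kvm_flush_remote_tlbs() sends a TLB-flush
IPI that cannot be serviced while the walker runs with interrupts disabled.

/*
 * Model of the get_user_pages_fast()-style handshake: the walker publishes
 * READING_SHADOW_PAGE_TABLES before touching the page, and the zapper must
 * not free a page it unlinked until the walker has left that state.
 * Build with: gcc -std=c11 -pthread model.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

enum vcpu_mode { OUTSIDE_GUEST_MODE, READING_SHADOW_PAGE_TABLES };

static _Atomic int mode = OUTSIDE_GUEST_MODE;   /* stand-in for vcpu->mode    */
static int *_Atomic spte_page;                  /* stand-in for a shadow page */

static void *walker(void *arg)                  /* lockless shadow walk */
{
        (void)arg;
        for (int i = 0; i < 200000; i++) {
                /* walk_shadow_page_lockless_begin(): publish mode, then full barrier */
                atomic_store(&mode, READING_SHADOW_PAGE_TABLES);
                atomic_thread_fence(memory_order_seq_cst);      /* the smp_mb() */

                int *p = atomic_load(&spte_page);
                if (p) {
                        int v = *(volatile int *)p;     /* safe: zapper waits below */
                        (void)v;
                }

                /* walk_shadow_page_lockless_end() */
                atomic_store(&mode, OUTSIDE_GUEST_MODE);
        }
        return NULL;
}

static void *zapper(void *arg)                  /* plays kvm_mmu_commit_zap_page() */
{
        (void)arg;
        for (int i = 0; i < 200000; i++) {
                int *old = atomic_exchange(&spte_page, NULL);   /* unlink the page */

                /*
                 * Stand-in for the kvm_flush_remote_tlbs() IPI + ack: spin
                 * until the walker is no longer inside its critical section.
                 */
                while (atomic_load(&mode) == READING_SHADOW_PAGE_TABLES)
                        ;

                free(old);                                      /* now safe to free */
                atomic_store(&spte_page, calloc(1, sizeof(int)));
        }
        return NULL;
}

int main(void)
{
        pthread_t w, z;

        atomic_store(&spte_page, calloc(1, sizeof(int)));
        pthread_create(&w, NULL, walker, NULL);
        pthread_create(&z, NULL, zapper, NULL);
        pthread_join(w, NULL);
        pthread_join(z, NULL);
        free(atomic_load(&spte_page));
        puts("done");
        return 0;
}

The spin on mode is only a model of waiting for the IPI acknowledgement; the
point it illustrates is that the free on the zapper side cannot happen while
the walker is between publishing READING_SHADOW_PAGE_TABLES and clearing it,
which is the window in which it dereferences the page.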