On Mon, Apr 23, 2012 at 07:16:52PM +0300, Avi Kivity wrote:
> Using RCU for lockless shadow walking can increase the amount of memory
> in use by the system, since RCU grace periods are unpredictable. We also
> have an unconditional write to a shared variable (reader_counter), which
> isn't good for scaling.
>
> Replace that with a scheme similar to x86's get_user_pages_fast(): disable
> interrupts during lockless shadow walk to force the freer
> (kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
> processor with interrupts enabled.
>
> We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
> kvm_flush_remote_tlbs() from avoiding the IPI.
>
> Signed-off-by: Avi Kivity <avi@xxxxxxxxxx>
> ---
>
> Turned out to be simpler than expected. However, I think there's a problem
> with make_all_cpus_request() possibly reading an incorrect vcpu->cpu.
>
>  arch/x86/include/asm/kvm_host.h |    4 ---
>  arch/x86/kvm/mmu.c              |   61 +++++++++++----------------------
>  include/linux/kvm_host.h        |    3 +-
>  3 files changed, 19 insertions(+), 49 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f624ca7..67e66e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -237,8 +237,6 @@ struct kvm_mmu_page {
>  #endif
>
>          int write_flooding_count;
> -
> -        struct rcu_head rcu;
>  };
>
>  struct kvm_pio_request {
> @@ -536,8 +534,6 @@ struct kvm_arch {
>          u64 hv_guest_os_id;
>          u64 hv_hypercall;
>
> -        atomic_t reader_counter;
> -
>  #ifdef CONFIG_KVM_MMU_AUDIT
>          int audit_point;
>  #endif
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 07424cf..903af5e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -551,19 +551,23 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
>
>  static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
>  {
> -        rcu_read_lock();
> -        atomic_inc(&vcpu->kvm->arch.reader_counter);
> -
> -        /* Increase the counter before walking shadow page table */
> -        smp_mb__after_atomic_inc();
> +        /*
> +         * Prevent page table teardown by making any free-er wait during
> +         * kvm_flush_remote_tlbs() IPI to all active vcpus.
> +         */
> +        local_irq_disable();
> +        vcpu->mode = READING_SHADOW_PAGE_TABLES;
> +        /*
> +         * wmb: advertise vcpu->mode change
> +         * rmb: make sure we see updated sptes
> +         */
> +        smp_mb();
>  }
>
>  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>  {
> -        /* Decrease the counter after walking shadow page table finished */
> -        smp_mb__before_atomic_dec();
> -        atomic_dec(&vcpu->kvm->arch.reader_counter);
> -        rcu_read_unlock();
> +        vcpu->mode = OUTSIDE_GUEST_MODE;
> +        local_irq_enable();
>  }
>
>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
> @@ -1989,30 +1993,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>          return ret;
>  }
>
> -static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
> -{
> -        struct kvm_mmu_page *sp;
> -
> -        list_for_each_entry(sp, invalid_list, link)
> -                kvm_mmu_isolate_page(sp);
> -}
> -
> -static void free_pages_rcu(struct rcu_head *head)
> -{
> -        struct kvm_mmu_page *next, *sp;
> -
> -        sp = container_of(head, struct kvm_mmu_page, rcu);
> -        while (sp) {
> -                if (!list_empty(&sp->link))
> -                        next = list_first_entry(&sp->link,
> -                                      struct kvm_mmu_page, link);
> -                else
> -                        next = NULL;
> -                kvm_mmu_free_page(sp);
> -                sp = next;
> -        }
> -}
> -
>  static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>                                      struct list_head *invalid_list)
>  {
> @@ -2021,25 +2001,18 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>          if (list_empty(invalid_list))
>                  return;
>
> +        /*
> +         * Wait for all vcpus to exit guest mode and/or lockless shadow
> +         * page table walks.
> +         */
>          kvm_flush_remote_tlbs(kvm);
>
> -        if (atomic_read(&kvm->arch.reader_counter)) {
> -                kvm_mmu_isolate_pages(invalid_list);
> -                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> -                list_del_init(invalid_list);
> -
> -                trace_kvm_mmu_delay_free_pages(sp);
> -                call_rcu(&sp->rcu, free_pages_rcu);
> -                return;
> -        }
> -
>          do {
>                  sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>                  WARN_ON(!sp->role.invalid || sp->root_count);
>                  kvm_mmu_isolate_page(sp);
>                  kvm_mmu_free_page(sp);
>          } while (!list_empty(invalid_list));
> -
>  }
>
>  /*
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 186ffab..d1f1adf 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -128,7 +128,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
>  enum {
>          OUTSIDE_GUEST_MODE,
>          IN_GUEST_MODE,
> -        EXITING_GUEST_MODE
> +        EXITING_GUEST_MODE,
> +        READING_SHADOW_PAGE_TABLES,
>  };

Should we add an explicit mb after prepare_zap_page? (We currently rely on
unrelated barriers internal to flush_remote_tlbs.)
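For reference, a minimal user-space sketch of the ordering argument above. This
is purely illustrative, not kernel code: the names walker/zapper/spte_page are
made up, only one "vcpu" is modelled, and the wait on the mode variable is a
stand-in for the real mechanism, where kvm_flush_remote_tlbs() sends a TLB-flush
IPI that cannot be serviced while the walker runs with interrupts disabled.

/*
 * Model of the get_user_pages_fast()-style handshake: the walker publishes
 * READING_SHADOW_PAGE_TABLES before touching the page, and the zapper must
 * not free a page it unlinked until the walker has left that state.
 * Build with: gcc -std=c11 -pthread model.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

enum vcpu_mode { OUTSIDE_GUEST_MODE, READING_SHADOW_PAGE_TABLES };

static _Atomic int mode = OUTSIDE_GUEST_MODE;   /* stand-in for vcpu->mode    */
static int *_Atomic spte_page;                  /* stand-in for a shadow page */

static void *walker(void *arg)                  /* lockless shadow walk */
{
        (void)arg;
        for (int i = 0; i < 200000; i++) {
                /* walk_shadow_page_lockless_begin(): publish mode, then full barrier */
                atomic_store(&mode, READING_SHADOW_PAGE_TABLES);
                atomic_thread_fence(memory_order_seq_cst);      /* the smp_mb() */

                int *p = atomic_load(&spte_page);
                if (p) {
                        int v = *(volatile int *)p;     /* safe: zapper waits below */
                        (void)v;
                }

                /* walk_shadow_page_lockless_end() */
                atomic_store(&mode, OUTSIDE_GUEST_MODE);
        }
        return NULL;
}

static void *zapper(void *arg)                  /* plays kvm_mmu_commit_zap_page() */
{
        (void)arg;
        for (int i = 0; i < 200000; i++) {
                int *old = atomic_exchange(&spte_page, NULL);   /* unlink the page */

                /*
                 * Stand-in for the kvm_flush_remote_tlbs() IPI + ack: spin
                 * until the walker is no longer inside its critical section.
                 */
                while (atomic_load(&mode) == READING_SHADOW_PAGE_TABLES)
                        ;

                free(old);                                      /* now safe to free */
                atomic_store(&spte_page, calloc(1, sizeof(int)));
        }
        return NULL;
}

int main(void)
{
        pthread_t w, z;

        atomic_store(&spte_page, calloc(1, sizeof(int)));
        pthread_create(&w, NULL, walker, NULL);
        pthread_create(&z, NULL, zapper, NULL);
        pthread_join(w, NULL);
        pthread_join(z, NULL);
        free(atomic_load(&spte_page));
        puts("done");
        return 0;
}

The spin on mode is only a model of waiting for the IPI acknowledgement; the
point it illustrates is that the free on the zapper side cannot happen while
the walker is between publishing READING_SHADOW_PAGE_TABLES and clearing it,
which is the window in which it dereferences the page.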