Re: [PATCH v5 17/19] KVM: Terminate memslot walks via used_slots

Peter Xu <peterx@xxxxxxxxxx> · Thu, 6 Feb 2020 16:09:44 -0500

On Tue, Jan 21, 2020 at 02:31:55PM -0800, Sean Christopherson wrote:
> Refactor memslot handling to treat the number of used slots as the de
> facto size of the memslot array, e.g. return NULL from id_to_memslot()
> when an invalid index is provided instead of relying on npages==0 to
> detect an invalid memslot.  Rework the sorting and walking of memslots
> in advance of dynamically sizing memslots to aid bisection and debug,
> e.g. with luck, a bug in the refactoring will bisect here and/or hit a
> WARN instead of randomly corrupting memory.
> 
> Alternatively, a global null/invalid memslot could be returned, i.e. so
> callers of id_to_memslot() don't have to explicitly check for a NULL
> memslot, but that approach runs the risk of introducing difficult-to-
> debug issues, e.g. if the global null slot is modified.  Constifying
> the return from id_to_memslot() to combat such issues is possible, but
> would require a massive refactoring of arch specific code and would
> still be susceptible to casting shenanigans.
> 
> Add function comments to update_memslots() and search_memslots() to
> explicitly (and loudly) state how memslots are sorted.
> 
> No functional change intended.
> 
> Tested-by: Christoffer Dall <christoffer.dall@xxxxxxx>
> Tested-by: Marc Zyngier <maz@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> ---
>  arch/powerpc/kvm/book3s_hv.c |   2 +-
>  arch/x86/kvm/x86.c           |  14 +--
>  include/linux/kvm_host.h     |  18 ++-
>  virt/kvm/arm/mmu.c           |   9 +-
>  virt/kvm/kvm_main.c          | 220 ++++++++++++++++++++++++++---------
>  5 files changed, 189 insertions(+), 74 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 4afabedcbace..ea03cb868151 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4397,7 +4397,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
>  	slots = kvm_memslots(kvm);
>  	memslot = id_to_memslot(slots, log->slot);
>  	r = -ENOENT;
> -	if (!memslot->dirty_bitmap)
> +	if (!memslot || !memslot->dirty_bitmap)
>  		goto out;
>  
>  	/*
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 07f7d6458b89..53d2a86cc91e 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9630,9 +9630,9 @@ void kvm_arch_sync_events(struct kvm *kvm)
>  int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
>  {
>  	int i, r;
> -	unsigned long hva;
> +	unsigned long hva, uninitialized_var(old_npages);
>  	struct kvm_memslots *slots = kvm_memslots(kvm);
> -	struct kvm_memory_slot *slot, old;
> +	struct kvm_memory_slot *slot;
>  
>  	/* Called with kvm->slots_lock held.  */
>  	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
> @@ -9640,7 +9640,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
>  
>  	slot = id_to_memslot(slots, id);
>  	if (size) {
> -		if (slot->npages)
> +		if (slot && slot->npages)
>  			return -EEXIST;
>  
>  		/*
> @@ -9652,13 +9652,13 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
>  		if (IS_ERR((void *)hva))
>  			return PTR_ERR((void *)hva);
>  	} else {
> -		if (!slot->npages)
> +		if (!slot || !slot->npages)
>  			return 0;
>  
> -		hva = 0;
> +		hva = slot->userspace_addr;

Is this intended?

> +		old_npages = slot->npages;
>  	}
>  
> -	old = *slot;
>  	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
>  		struct kvm_userspace_memory_region m;
>  
> @@ -9673,7 +9673,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
>  	}
>  
>  	if (!size)
> -		vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
> +		vm_munmap(hva, old_npages * PAGE_SIZE);
>  
>  	return 0;
>  }
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f05be99dc44a..60ddfdb69378 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -572,10 +572,11 @@ static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
>  	return vcpu->vcpu_idx;
>  }
>  
> -#define kvm_for_each_memslot(memslot, slots)	\
> -	for (memslot = &slots->memslots[0];	\
> -	      memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\
> -		memslot++)
> +#define kvm_for_each_memslot(memslot, slots)				\
> +	for (memslot = &slots->memslots[0];				\
> +	     memslot < slots->memslots + slots->used_slots; memslot++)	\
> +		if (WARN_ON_ONCE(!memslot->npages)) {			\
> +		} else
>  
>  void kvm_vcpu_destroy(struct kvm_vcpu *vcpu);
>  
> @@ -635,12 +636,15 @@ static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
>  	return __kvm_memslots(vcpu->kvm, as_id);
>  }
>  
> -static inline struct kvm_memory_slot *
> -id_to_memslot(struct kvm_memslots *slots, int id)
> +static inline
> +struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
>  {
>  	int index = slots->id_to_index[id];
>  	struct kvm_memory_slot *slot;
>  
> +	if (index < 0)
> +		return NULL;
> +
>  	slot = &slots->memslots[index];
>  
>  	WARN_ON(slot->id != id);
> @@ -1005,6 +1009,8 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
>   * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
>   * gfn_to_memslot() itself isn't here as an inline because that would
>   * bloat other code too much.
> + *
> + * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!

(this confused me too..)

>   */
>  static inline struct kvm_memory_slot *
>  search_memslots(struct kvm_memslots *slots, gfn_t gfn)
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 23af65f8fd0f..a1d3813bad76 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1535,8 +1535,13 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
>  {
>  	struct kvm_memslots *slots = kvm_memslots(kvm);
>  	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
> -	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
> -	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
> +	phys_addr_t start, end;
> +
> +	if (WARN_ON_ONCE(!memslot))
> +		return;
> +
> +	start = memslot->base_gfn << PAGE_SHIFT;
> +	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
>  
>  	spin_lock(&kvm->mmu_lock);
>  	stage2_wp_range(kvm, start, end);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 190c065da48d..9b614cf2ca20 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -565,7 +565,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
>  		return NULL;
>  
>  	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
> -		slots->id_to_index[i] = slots->memslots[i].id = i;
> +		slots->id_to_index[i] = slots->memslots[i].id = -1;
>  
>  	return slots;
>  }
> @@ -869,63 +869,162 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
>  }
>  
>  /*
> - * Insert memslot and re-sort memslots based on their GFN,
> - * so binary search could be used to lookup GFN.
> - * Sorting algorithm takes advantage of having initially
> - * sorted array and known changed memslot position.
> + * Delete a memslot by decrementing the number of used slots and shifting all
> + * other entries in the array forward one spot.
> + */
> +static inline void kvm_memslot_delete(struct kvm_memslots *slots,
> +				      struct kvm_memory_slot *memslot)
> +{
> +	struct kvm_memory_slot *mslots = slots->memslots;
> +	int i;
> +
> +	if (WARN_ON(slots->id_to_index[memslot->id] == -1))
> +		return;
> +
> +	slots->used_slots--;
> +
> +	for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
> +		mslots[i] = mslots[i + 1];
> +		slots->id_to_index[mslots[i].id] = i;
> +	}
> +	mslots[i] = *memslot;
> +	slots->id_to_index[memslot->id] = -1;
> +}
> +
> +/*
> + * "Insert" a new memslot by incrementing the number of used slots.  Returns
> + * the new slot's initial index into the memslots array.
> + */
> +static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)

The naming here didn't help me to understand but a bit more
confused...

How about "kvm_memslot_insert_end"?  Or even unwrap it.

> +{
> +	return slots->used_slots++;
> +}
> +
> +/*
> + * Move a changed memslot backwards in the array by shifting existing slots
> + * with a higher GFN toward the front of the array.  Note, the changed memslot
> + * itself is not preserved in the array, i.e. not swapped at this time, only
> + * its new index into the array is tracked.  Returns the changed memslot's
> + * current index into the memslots array.
> + */
> +static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
> +					    struct kvm_memory_slot *memslot)

"backward" makes me feel like it's moving towards smaller index,
instead it's moving to bigger index.  Same applies to "forward" below.
I'm not sure whether I'm the only one, though...

> +{
> +	struct kvm_memory_slot *mslots = slots->memslots;
> +	int i;
> +
> +	if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
> +	    WARN_ON_ONCE(!slots->used_slots))
> +		return -1;
> +
> +	/*
> +	 * Move the target memslot backward in the array by shifting existing
> +	 * memslots with a higher GFN (than the target memslot) towards the
> +	 * front of the array.
> +	 */
> +	for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
> +		if (memslot->base_gfn > mslots[i + 1].base_gfn)
> +			break;
> +
> +		WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);

Will this trigger?  Note that in __kvm_set_memory_region() we have
already checked overlap of memslots.

> +
> +		/* Shift the next memslot forward one and update its index. */
> +		mslots[i] = mslots[i + 1];
> +		slots->id_to_index[mslots[i].id] = i;
> +	}
> +	return i;
> +}
> +
> +/*
> + * Move a changed memslot forwards in the array by shifting existing slots with
> + * a lower GFN toward the back of the array.  Note, the changed memslot itself
> + * is not preserved in the array, i.e. not swapped at this time, only its new
> + * index into the array is tracked.  Returns the changed memslot's final index
> + * into the memslots array.
> + */
> +static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
> +					   struct kvm_memory_slot *memslot,
> +					   int start)

Same question on the naming.

> +{
> +	struct kvm_memory_slot *mslots = slots->memslots;
> +	int i;
> +
> +	for (i = start; i > 0; i--) {
> +		if (memslot->base_gfn < mslots[i - 1].base_gfn)
> +			break;

(The very careful ">=" converted to "<" here, looks correct, though
 after the refactoring it should not matter any more)

> +
> +		WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);

Same here.

> +
> +		/* Shift the next memslot back one and update its index. */
> +		mslots[i] = mslots[i - 1];
> +		slots->id_to_index[mslots[i].id] = i;
> +	}
> +	return i;
> +}
> +
> +/*
> + * Re-sort memslots based on their GFN to account for an added, deleted, or
> + * moved memslot.  Sorting memslots by GFN allows using a binary search during
> + * memslot lookup.
> + *
> + * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
> + * at memslots[0] has the highest GFN.
> + *
> + * The sorting algorithm takes advantage of having initially sorted memslots
> + * and knowing the position of the changed memslot.  Sorting is also optimized
> + * by not swapping the updated memslot and instead only shifting other memslots
> + * and tracking the new index for the update memslot.  Only once its final
> + * index is known is the updated memslot copied into its position in the array.
> + *
> + *  - When deleting a memslot, the deleted memslot simply needs to be moved to
> + *    the end of the array.
> + *
> + *  - When creating a memslot, the algorithm "inserts" the new memslot at the
> + *    end of the array and then it forward to its correct location.
> + *
> + *  - When moving a memslot, the algorithm first moves the updated memslot
> + *    backward to handle the scenario where the memslot's GFN was changed to a
> + *    lower value.  update_memslots() then falls through and runs the same flow
> + *    as creating a memslot to move the memslot forward to handle the scenario
> + *    where its GFN was changed to a higher value.
> + *
> + * Note, slots are sorted from highest->lowest instead of lowest->highest for
> + * historical reasons.  Originally, invalid memslots where denoted by having
> + * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
> + * to the end of the array.  The current algorithm uses dedicated logic to
> + * delete a memslot and thus does not rely on invalid memslots having GFN=0.
> + *
> + * The other historical motiviation for highest->lowest was to improve the
> + * performance of memslot lookup.  KVM originally used a linear search starting
> + * at memslots[0].  On x86, the largest memslot usually has one of the highest,
> + * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
> + * single memslot above the 4gb boundary.  As the largest memslot is also the
> + * most likely to be referenced, sorting it to the front of the array was
> + * advantageous.  The current binary search starts from the middle of the array
> + * and uses an LRU pointer to improve performance for all memslots and GFNs.
>   */
>  static void update_memslots(struct kvm_memslots *slots,
> -			    struct kvm_memory_slot *new,
> +			    struct kvm_memory_slot *memslot,
>  			    enum kvm_mr_change change)
>  {
> -	int id = new->id;
> -	int i = slots->id_to_index[id];
> -	struct kvm_memory_slot *mslots = slots->memslots;
> +	int i;
>  
> -	WARN_ON(mslots[i].id != id);
> -	switch (change) {
> -	case KVM_MR_CREATE:
> -		slots->used_slots++;
> -		WARN_ON(mslots[i].npages || !new->npages);
> -		break;
> -	case KVM_MR_DELETE:
> -		slots->used_slots--;
> -		WARN_ON(new->npages || !mslots[i].npages);
> -		break;
> -	default:
> -		break;
> -	}
> +	if (change == KVM_MR_DELETE) {
> +		kvm_memslot_delete(slots, memslot);
> +	} else {
> +		if (change == KVM_MR_CREATE)
> +			i = kvm_memslot_insert_back(slots);
> +		else
> +			i = kvm_memslot_move_backward(slots, memslot);
> +		i = kvm_memslot_move_forward(slots, memslot, i);
>  
> -	while (i < KVM_MEM_SLOTS_NUM - 1 &&
> -	       new->base_gfn <= mslots[i + 1].base_gfn) {
> -		if (!mslots[i + 1].npages)
> -			break;
> -		mslots[i] = mslots[i + 1];
> -		slots->id_to_index[mslots[i].id] = i;
> -		i++;
> +		/*
> +		 * Copy the memslot to its new position in memslots and update
> +		 * its index accordingly.
> +		 */
> +		slots->memslots[i] = *memslot;
> +		slots->id_to_index[memslot->id] = i;
>  	}
> -
> -	/*
> -	 * The ">=" is needed when creating a slot with base_gfn == 0,
> -	 * so that it moves before all those with base_gfn == npages == 0.
> -	 *
> -	 * On the other hand, if new->npages is zero, the above loop has
> -	 * already left i pointing to the beginning of the empty part of
> -	 * mslots, and the ">=" would move the hole backwards in this
> -	 * case---which is wrong.  So skip the loop when deleting a slot.
> -	 */
> -	if (new->npages) {
> -		while (i > 0 &&
> -		       new->base_gfn >= mslots[i - 1].base_gfn) {
> -			mslots[i] = mslots[i - 1];
> -			slots->id_to_index[mslots[i].id] = i;
> -			i--;
> -		}
> -	} else
> -		WARN_ON_ONCE(i != slots->used_slots);
> -
> -	mslots[i] = *new;
> -	slots->id_to_index[mslots[i].id] = i;
>  }
>  
>  static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
> @@ -1104,8 +1203,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  	 * when the memslots are re-sorted by update_memslots().
>  	 */
>  	tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
> -	old = *tmp;
> -	tmp = NULL;

I was confused in that patch, then...

> +	if (tmp) {
> +		old = *tmp;
> +		tmp = NULL;

... now I still don't know why it needs to set to NULL?

> +	} else {
> +		memset(&old, 0, sizeof(old));
> +		old.id = id;
> +	}
>  
>  	if (!mem->memory_size)
>  		return kvm_delete_memslot(kvm, mem, &old, as_id);
> @@ -1223,7 +1327,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
>  
>  	slots = __kvm_memslots(kvm, as_id);
>  	*memslot = id_to_memslot(slots, id);
> -	if (!(*memslot)->dirty_bitmap)
> +	if (!(*memslot) || !(*memslot)->dirty_bitmap)
>  		return -ENOENT;
>  
>  	kvm_arch_sync_dirty_log(kvm, *memslot);
> @@ -1281,10 +1385,10 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
>  
>  	slots = __kvm_memslots(kvm, as_id);
>  	memslot = id_to_memslot(slots, id);
> +	if (!memslot || !memslot->dirty_bitmap)
> +		return -ENOENT;
>  
>  	dirty_bitmap = memslot->dirty_bitmap;
> -	if (!dirty_bitmap)
> -		return -ENOENT;
>  
>  	kvm_arch_sync_dirty_log(kvm, memslot);
>  
> @@ -1392,10 +1496,10 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
>  
>  	slots = __kvm_memslots(kvm, as_id);
>  	memslot = id_to_memslot(slots, id);
> +	if (!memslot || !memslot->dirty_bitmap)
> +		return -ENOENT;
>  
>  	dirty_bitmap = memslot->dirty_bitmap;
> -	if (!dirty_bitmap)
> -		return -ENOENT;
>  
>  	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
>  
> -- 
> 2.24.1
> 

-- 
Peter Xu