Re: index-out-of-range ubsan warnings

Xiao Guangrong <guangrong.xiao@xxxxxxxxxxxxxxx> · Wed, 24 Feb 2016 14:23:19 +0800

On 02/23/2016 09:21 PM, Paolo Bonzini wrote:

+#define INVALID_INDEX        (-1)
+
   static int mmu_unsync_walk(struct kvm_mmu_page *sp,
                              struct kvm_mmu_pages *pvec)
   {
           if (!sp->unsync_children)
                   return 0;

-        mmu_pages_add(pvec, sp, 0);
+        /*
+         * do not count the index in the parent of the sp we're
+         * walking start from.
+         */
+        mmu_pages_add(pvec, sp, INVALID_INDEX);
           return __mmu_unsync_walk(sp, pvec);
   }

@@ -1980,8 +1986,11 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
                           return n;
                   }

-                parents->parent[sp->role.level-2] = sp;
-                parents->idx[sp->role.level-1] = pvec->page[n].idx;
+                parents->parent[sp->role.level - 2] = sp;
+
+                /* skip setting idex of the sp we start from. */
+                if (pvec->page[n].idx != INVALID_INDEX)
+                        parents->idx[sp->role.level - 1] = pvec->page[n].idx;

So far so good.

           }

           return n;
@@ -1999,6 +2008,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
                   if (!sp)
                           return;

+                WARN_ON(idx != INVALID_INDEX);

Shouldn't this warn if idx is *equal* to INVALID_INDEX?

Yes, indeed. It seems i am spending so much time on QEMU and mixed it with
assert() :(


                   clear_unsync_child_bit(sp, idx);
                   level++;
           } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
@@ -2008,7 +2018,7 @@ static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
                                  struct mmu_page_path *parents,
                                  struct kvm_mmu_pages *pvec)
   {
-        parents->parent[parent->role.level-1] = NULL;
+        parents->parent[parent->role.level - 2] = NULL;

This is meant to stop mmu_pages_clear_parents _after_ it has
processed sp, so the "-1" is correct.  The right fix would be:

         if (parent->role.level < PT64_ROOT_LEVEL-1)
                 parents->parent[parent->role.level - 1] = NULL;


it is okay as mmu_pages_next() will refill the highest level.

and no one has noticed it so far because if the bug hits you just write
over the beginning of ->idx (which so far is uninitialized) and nothing
bad happens.

I'm thinking of the following (untested) patch instead:

-------- 8< --------------
From: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Subject: [PATCH] KVM: MMU: Fix ubsan warnings

kvm_mmu_pages_init is doing some really yucky stuff.  It is setting
up a sentinel for mmu_page_clear_parents; however, because of a) the
way levels are numbered starting from 1 and b) the way mmu_page_path
sizes its arrays with PT64_ROOT_LEVEL-1 elements, the access can be
out of bounds.  This is harmless because the code overwrites up to the
first two elements of parents->idx and these are initialized, and
because the sentinel is not needed in this case---mmu_page_clear_parents
exits anyway when it gets to the end of the array.  However ubsan
complains, and everyone else should too.

This fix does three things.  First it makes the mmu_page_path arrays
PT64_ROOT_LEVEL elements in size, so that we can write to them without
checking the level in advance.  Second it disintegrates kvm_mmu_pages_init
between mmu_unsync_walk (which resets struct kvm_mmu_pages) and
mmu_pages_next (which unconditionally places a NULL sentinel at the
end of the current path).  This is okay because the mmu_page_path is
only used in mmu_pages_clear_parents; mmu_pages_clear_parents itself
is called within a for_each_sp iterator, and hence always after a
call to mmu_pages_next.  Third it changes mmu_pages_clear_parents to
just use the sentinel to stop iteration, without checking the bounds
on level.

Reported-by: Sasha Levin <sasha.levin@xxxxxxxxxx>
Reported-by: Mike Krinkin <krinkin.m.u@xxxxxxxxx>
Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 33cc9f3f5b02..f4f5e7ca14dd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1843,6 +1843,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
  static int mmu_unsync_walk(struct kvm_mmu_page *sp,
  			   struct kvm_mmu_pages *pvec)
  {
+	pvec->nr = 0;
  	if (!sp->unsync_children)
  		return 0;

@@ -1956,8 +1957,8 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
  }

  struct mmu_page_path {
-	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
-	unsigned int idx[PT64_ROOT_LEVEL-1];
+	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+	unsigned int idx[PT64_ROOT_LEVEL];

So that it wastes 1 entry as we only use it to save parent sp.

  };

  #define for_each_sp(pvec, sp, parents, i)			\
@@ -1971,19 +1972,21 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
  			  int i)
  {
  	int n;
+	int level = PT_PAGE_TABLE_LEVEL;

  	for (n = i+1; n < pvec->nr; n++) {
  		struct kvm_mmu_page *sp = pvec->page[n].sp;

-		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
-			parents->idx[0] = pvec->page[n].idx;
-			return n;
-		}
+		level = sp->role.level;
+		parents->idx[level-1] = pvec->page[n].idx;

-		parents->parent[sp->role.level-2] = sp;
-		parents->idx[sp->role.level-1] = pvec->page[n].idx;
+		if (level == PT_PAGE_TABLE_LEVEL)
+			break;
+
+		parents->parent[level-2] = sp;
  	}

+	parents->parent[level-1] = NULL;

Why?

Considering we have already got a unsync page, so break the loop
and reach here. It results parents->parent[ 0 ] = NULL and see
below......

  	return n;
  }

@@ -1994,22 +1996,13 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)

  	do {
  		unsigned int idx = parents->idx[level];
-
  		sp = parents->parent[level];
  		if (!sp)
  			return;

So the first level is NULL, it stops clearing unsync_children.


  		clear_unsync_child_bit(sp, idx);
  		level++;
-	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
-}
-
-static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
-			       struct mmu_page_path *parents,
-			       struct kvm_mmu_pages *pvec)
-{
-	parents->parent[parent->role.level-1] = NULL;
-	pvec->nr = 0;
+	} while (!sp->unsync_children);
  }

  static void mmu_sync_children(struct kvm_vcpu *vcpu,
@@ -2021,7 +2014,6 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
  	struct kvm_mmu_pages pages;
  	LIST_HEAD(invalid_list);

-	kvm_mmu_pages_init(parent, &parents, &pages);
  	while (mmu_unsync_walk(parent, &pages)) {
  		bool protected = false;

@@ -2037,7 +2029,6 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
  		}
  		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
  		cond_resched_lock(&vcpu->kvm->mmu_lock);
-		kvm_mmu_pages_init(parent, &parents, &pages);
  	}
  }

@@ -2269,7 +2260,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
  	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
  		return 0;

-	kvm_mmu_pages_init(parent, &parents, &pages);
  	while (mmu_unsync_walk(parent, &pages)) {
  		struct kvm_mmu_page *sp;

@@ -2278,7 +2268,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
  			mmu_pages_clear_parents(&parents);
  			zapped++;
  		}
-		kvm_mmu_pages_init(parent, &parents, &pages);
  	}

  	return zapped;


Each time i looked into these functions, it costs my several hours. The trick is
that the item, sp and idx, saved to kvm_mmu_pages is the sp and the index in the
_parent_ level.

How about clarify/simplify it as this patch:

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 57cf30b..14b2800 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1848,7 +1848,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 		child = page_header(ent & PT64_BASE_ADDR_MASK);

 		if (child->unsync_children) {
-			if (mmu_pages_add(pvec, child, i))
+			if (mmu_pages_add(pvec, sp, i))
 				return -ENOSPC;

 			ret = __mmu_unsync_walk(child, pvec);
@@ -1861,7 +1861,14 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 				return ret;
 		} else if (child->unsync) {
 			nr_unsync_leaf++;
-			if (mmu_pages_add(pvec, child, i))
+			if (mmu_pages_add(pvec, sp, i))
+				return -ENOSPC;
+
+			/*
+			 * the unsync is on the last level so its 'idx' is
+			 * useless, we set it to 0 to catch potential bugs.
+			 */
+			if (mmu_pages_add(pvec, child, 0))
 				return -ENOSPC;
 		} else
 			clear_unsync_child_bit(sp, i);
@@ -1876,7 +1883,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 	if (!sp->unsync_children)
 		return 0;

-	mmu_pages_add(pvec, sp, 0);
 	return __mmu_unsync_walk(sp, pvec);
 }

@@ -2002,16 +2008,18 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
 {
 	int n;

-	for (n = i+1; n < pvec->nr; n++) {
+	for (n = i + 1; n < pvec->nr; n++) {
 		struct kvm_mmu_page *sp = pvec->page[n].sp;
+		unsigned int idx = pvec->page[n].idx;

 		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
-			parents->idx[0] = pvec->page[n].idx;
+			/* we always set the idex as 0 for the unsync page. */
+			WARN_ON(idx != 0);
 			return n;
 		}

-		parents->parent[sp->role.level-2] = sp;
-		parents->idx[sp->role.level-1] = pvec->page[n].idx;
+		parents->parent[sp->role.level - 2] = sp;
+		parents->idx[sp->role.level - 2] = idx;
 	}

 	return n;
@@ -2031,14 +2039,18 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)

 		clear_unsync_child_bit(sp, idx);
 		level++;
-	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
+	} while (level < PT64_ROOT_LEVEL - 1 && !sp->unsync_children);
 }

 static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
 			       struct mmu_page_path *parents,
 			       struct kvm_mmu_pages *pvec)
 {
-	parents->parent[parent->role.level-1] = NULL;
+	/*
+	 * set the highest level to NULL to stop mmu_pages_clear_parents()
+	 * handing it on 32 bit guest.
+	 */
+	parents->parent[parent->role.level - 2] = NULL;
 	pvec->nr = 0;
 }



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html