[PATCH] KVM: Remove dirty handling from gfn_to_pfn_cache completely

David Woodhouse <dwmw2@xxxxxxxxxxxxx> · Sun, 27 Feb 2022 15:11:08 +0000

From: David Woodhouse <dwmw@xxxxxxxxxxxx>

It isn't OK to cache the dirty status of a page in internal structures
for an indefinite period of time.

Any time a vCPU exits the run loop to userspace might be its last; the
VMM might do its final check of the dirty log, flush the last remaining
dirty pages to the destination and complete a live migration. If we
have internal 'dirty' state which doesn't get flushed until the vCPU
is finally destroyed on the source after migration is complete, then
we have lost data because that will escape the final copy.

This problem already exists with the use of kvm_vcpu_unmap() to mark
pages dirty in e.g. VMX nesting.

Note that the actual Linux MM already considers the page to be dirty
since we have a writeable mapping of it. This is just about the KVM
dirty logging.

Make the PV clock mark the page dirty immediately (which is fine as
it's happening in vCPU context). Document the Xen shinfo/vcpu_info
case more completely as being exempt, because we might dirty those
from interrupt context as we deliver event channels.

For the nesting-style use cases (KVM_GUEST_USES_PFN) we will need to
track which gfn_to_pfn_caches have been used and explicitly mark the
corresponding pages dirty before returning to userspace. But we would
have needed external tracking of that anyway, rather than walking the
full list of GPCs to find those belonging to this vCPU which are dirty.

So let's rely *solely* on that external tracking, and keep it simple
rather than laying a tempting trap for callers to fall into.

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---

This sits on top of the Xen event channel series, because that adds a
few users of gfn_to_pfn_cache and I wanted to see it actually being
used this way. If we're OK with this approach, I'll refactor that
series to put this first.

I think this is OK for the KVM_GUEST_USES_PFN cases (mostly nesting)
because, as I note in the commit message, they're going to want to keep
track of which caches they have used *anyway* to avoid walking the full
GPC list on every exit to userspace.

 Documentation/virt/kvm/api.rst |  4 ++++
 arch/x86/kvm/x86.c             |  8 +++----
 arch/x86/kvm/xen.c             | 24 +++++++-------------
 include/linux/kvm_host.h       | 14 +++++-------
 include/linux/kvm_types.h      |  3 +--
 virt/kvm/pfncache.c            | 41 +++++++---------------------------
 6 files changed, 30 insertions(+), 64 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 982fcdf8cfa8..77b631452036 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5288,6 +5288,10 @@ type values:
 
 KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO
   Sets the guest physical address of the vcpu_info for a given vCPU.
+  As with the shared_info page for the VM, the corresponding page may be
+  dirtied at any time if event channel interrupt delivery is enabled, so
+  userspace should always assume that the page is dirty without relying
+  on dirty logging.
 
 KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO
   Sets the guest physical address of an additional pvclock structure
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 04f86cb94069..08317ad5c93b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2237,8 +2237,7 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
 	if (system_time & 1) {
 		kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
 					  KVM_HOST_USES_PFN, system_time & ~1ULL,
-					  sizeof(struct pvclock_vcpu_time_info),
-					  false);
+					  sizeof(struct pvclock_vcpu_time_info));
 	} else {
 		kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
 	}
@@ -2958,8 +2957,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
 		read_unlock_irqrestore(&gpc->lock, flags);
 
 		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
-						 offset + sizeof(*guest_hv_clock),
-						 true))
+						 offset + sizeof(*guest_hv_clock)))
 			return;
 
 		read_lock_irqsave(&gpc->lock, flags);
@@ -2989,6 +2987,8 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
 	smp_wmb();
 
 	guest_hv_clock->version = ++vcpu->hv_clock.version;
+
+	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
 	read_unlock_irqrestore(&gpc->lock, flags);
 
 	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index f59dca40d7c3..71311bb03d53 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -49,7 +49,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
 
 	do {
 		ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN,
-						gpa, PAGE_SIZE, false);
+						gpa, PAGE_SIZE);
 		if (ret)
 			goto out;
 
@@ -245,8 +245,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 		if (state == RUNSTATE_runnable)
 			return;
 
-		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
-						 user_len, false))
+		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
 			return;
 
 		read_lock_irqsave(&gpc->lock, flags);
@@ -377,8 +376,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 		read_unlock_irqrestore(&gpc->lock, flags);
 
 		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
-						 sizeof(struct vcpu_info),
-						 false))
+						 sizeof(struct vcpu_info)))
 			return;
 
 		read_lock_irqsave(&gpc->lock, flags);
@@ -454,8 +452,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 			return 1;
 
 		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
-						 sizeof(struct vcpu_info),
-						 false)) {
+						 sizeof(struct vcpu_info))) {
 			/*
 			 * If this failed, userspace has screwed up the
 			 * vcpu_info mapping. No interrupts for you.
@@ -583,7 +580,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
 					      &vcpu->arch.xen.vcpu_info_cache,
 					      NULL, KVM_HOST_USES_PFN, data->u.gpa,
-					      sizeof(struct vcpu_info), false);
+					      sizeof(struct vcpu_info));
 		if (!r)
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
@@ -600,8 +597,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
 					      &vcpu->arch.xen.vcpu_time_info_cache,
 					      NULL, KVM_HOST_USES_PFN, data->u.gpa,
-					      sizeof(struct pvclock_vcpu_time_info),
-					      false);
+					      sizeof(struct pvclock_vcpu_time_info));
 		if (!r)
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 		break;
@@ -621,8 +617,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
 					      &vcpu->arch.xen.runstate_cache,
 					      NULL, KVM_HOST_USES_PFN, data->u.gpa,
-					      sizeof(struct vcpu_runstate_info),
-					      false);
+					      sizeof(struct vcpu_runstate_info));
 		break;
 
 	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
@@ -1513,8 +1508,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 			break;
 
 		idx = srcu_read_lock(&kvm->srcu);
-		rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa,
-						  PAGE_SIZE, false);
+		rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE);
 		srcu_read_unlock(&kvm->srcu, idx);
 	} while(!rc);
 
@@ -1829,10 +1823,8 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
 
 	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
 				     &vcpu->arch.xen.runstate_cache);
-	vcpu->arch.xen.vcpu_info_cache.dirty = false;
 	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
 				     &vcpu->arch.xen.vcpu_info_cache);
-	vcpu->arch.xen.vcpu_time_info_cache.dirty = false;
 	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
 				     &vcpu->arch.xen.vcpu_time_info_cache);
 	del_timer_sync(&vcpu->arch.xen.poll_timer);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6e5bbb1b3e0d..aa596fd19578 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1228,7 +1228,6 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
  *		   by KVM (and thus needs a kernel virtual mapping).
  * @gpa:	   guest physical address to map.
  * @len:	   sanity check; the range being access must fit a single page.
- * @dirty:         mark the cache dirty immediately.
  *
  * @return:	   0 for success.
  *		   -EINVAL for a mapping which would cross a page boundary.
@@ -1242,7 +1241,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
  */
 int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 			      struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
-			      gpa_t gpa, unsigned long len, bool dirty);
+			      gpa_t gpa, unsigned long len);
 
 /**
  * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache.
@@ -1251,7 +1250,6 @@ int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
  * @gpc:	   struct gfn_to_pfn_cache object.
  * @gpa:	   current guest physical address to map.
  * @len:	   sanity check; the range being access must fit a single page.
- * @dirty:         mark the cache dirty immediately.
  *
  * @return:	   %true if the cache is still valid and the address matches.
  *		   %false if the cache is not valid.
@@ -1273,7 +1271,6 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
  * @gpc:	   struct gfn_to_pfn_cache object.
  * @gpa:	   updated guest physical address to map.
  * @len:	   sanity check; the range being access must fit a single page.
- * @dirty:         mark the cache dirty immediately.
  *
  * @return:	   0 for success.
  *		   -EINVAL for a mapping which would cross a page boundary.
@@ -1286,7 +1283,7 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
  * with the lock still held to permit access.
  */
 int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
-				 gpa_t gpa, unsigned long len, bool dirty);
+				 gpa_t gpa, unsigned long len);
 
 /**
  * kvm_gfn_to_pfn_cache_unmap - temporarily unmap a gfn_to_pfn_cache.
@@ -1294,10 +1291,9 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
  * @kvm:	   pointer to kvm instance.
  * @gpc:	   struct gfn_to_pfn_cache object.
  *
- * This unmaps the referenced page and marks it dirty, if appropriate. The
- * cache is left in the invalid state but at least the mapping from GPA to
- * userspace HVA will remain cached and can be reused on a subsequent
- * refresh.
+ * This unmaps the referenced page. The cache is left in the invalid state
+ * but at least the mapping from GPA to userspace HVA will remain cached
+ * and can be reused on a subsequent refresh.
  */
 void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
 
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 784f37cbf33e..d1447801fc68 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
+#/* SPDX-License-Identifier: GPL-2.0-only */
 
 #ifndef __KVM_TYPES_H__
 #define __KVM_TYPES_H__
@@ -74,7 +74,6 @@ struct gfn_to_pfn_cache {
 	enum pfn_cache_usage usage;
 	bool active;
 	bool valid;
-	bool dirty;
 };
 
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 9b3a192cb18c..d789f2705e5e 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -49,19 +49,6 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 				}
 				__set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
 			}
-
-			/*
-			 * We cannot call mark_page_dirty() from here because
-			 * this physical CPU might not have an active vCPU
-			 * with which to do the KVM dirty tracking.
-			 *
-			 * Neither is there any point in telling the kernel MM
-			 * that the underlying page is dirty. A vCPU in guest
-			 * mode might still be writing to it up to the point
-			 * where we wake them a few lines further down anyway.
-			 *
-			 * So all the dirty marking happens on the unmap.
-			 */
 		}
 		write_unlock_irq(&gpc->lock);
 	}
@@ -104,8 +91,7 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 }
 EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_check);
 
-static void __release_gpc(struct kvm *kvm, kvm_pfn_t pfn, void *khva,
-			  gpa_t gpa, bool dirty)
+static void __release_gpc(struct kvm *kvm, kvm_pfn_t pfn, void *khva, gpa_t gpa)
 {
 	/* Unmap the old page if it was mapped before, and release it */
 	if (!is_error_noslot_pfn(pfn)) {
@@ -118,9 +104,7 @@ static void __release_gpc(struct kvm *kvm, kvm_pfn_t pfn, void *khva,
 #endif
 		}
 
-		kvm_release_pfn(pfn, dirty);
-		if (dirty)
-			mark_page_dirty(kvm, gpa);
+		kvm_release_pfn(pfn, false);
 	}
 }
 
@@ -152,7 +136,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, unsigned long uhva)
 }
 
 int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
-				 gpa_t gpa, unsigned long len, bool dirty)
+				 gpa_t gpa, unsigned long len)
 {
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	unsigned long page_offset = gpa & ~PAGE_MASK;
@@ -160,7 +144,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	unsigned long old_uhva;
 	gpa_t old_gpa;
 	void *old_khva;
-	bool old_valid, old_dirty;
+	bool old_valid;
 	int ret = 0;
 
 	/*
@@ -177,14 +161,12 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	old_khva = gpc->khva - offset_in_page(gpc->khva);
 	old_uhva = gpc->uhva;
 	old_valid = gpc->valid;
-	old_dirty = gpc->dirty;
 
 	/* If the userspace HVA is invalid, refresh that first */
 	if (gpc->gpa != gpa || gpc->generation != slots->generation ||
 	    kvm_is_error_hva(gpc->uhva)) {
 		gfn_t gfn = gpa_to_gfn(gpa);
 
-		gpc->dirty = false;
 		gpc->gpa = gpa;
 		gpc->generation = slots->generation;
 		gpc->memslot = __gfn_to_memslot(slots, gfn);
@@ -255,14 +237,9 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	}
 
  out:
-	if (ret)
-		gpc->dirty = false;
-	else
-		gpc->dirty = dirty;
-
 	write_unlock_irq(&gpc->lock);
 
-	__release_gpc(kvm, old_pfn, old_khva, old_gpa, old_dirty);
+	__release_gpc(kvm, old_pfn, old_khva, old_gpa);
 
 	return ret;
 }
@@ -272,7 +249,6 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 {
 	void *old_khva;
 	kvm_pfn_t old_pfn;
-	bool old_dirty;
 	gpa_t old_gpa;
 
 	write_lock_irq(&gpc->lock);
@@ -280,7 +256,6 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 	gpc->valid = false;
 
 	old_khva = gpc->khva - offset_in_page(gpc->khva);
-	old_dirty = gpc->dirty;
 	old_gpa = gpc->gpa;
 	old_pfn = gpc->pfn;
 
@@ -293,14 +268,14 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 
 	write_unlock_irq(&gpc->lock);
 
-	__release_gpc(kvm, old_pfn, old_khva, old_gpa, old_dirty);
+	__release_gpc(kvm, old_pfn, old_khva, old_gpa);
 }
 EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap);
 
 
 int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 			      struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
-			      gpa_t gpa, unsigned long len, bool dirty)
+			      gpa_t gpa, unsigned long len)
 {
 	WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
 
@@ -319,7 +294,7 @@ int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 		list_add(&gpc->list, &kvm->gpc_list);
 		spin_unlock(&kvm->gpc_lock);
 	}
-	return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len, dirty);
+	return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len);
 }
 EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_init);
 
-- 
2.33.1

Attachment:
smime.p7s

Description: S/MIME cryptographic signature