[PATCH Part2 RFC v4 35/40] KVM: Add arch hooks to track the host write to guest memory

Brijesh Singh <brijesh.singh@xxxxxxx> · Wed, 7 Jul 2021 13:36:11 -0500

The kvm_write_guest{_page} and kvm_vcpu_write_guest{_page} are used by
the hypevisor to write to the guest memory. The kvm_vcpu_map() and
kvm_map_gfn() are used by the hypervisor to map the guest memory and
and access it later.

When SEV-SNP is enabled in the guest VM, the guest memory pages can
either be a private or shared. A write from the hypervisor goes through
the RMP checks. If hardware sees that hypervisor is attempting to write
to a guest private page, then it triggers an RMP violation (i.e, #PF with
RMP bit set).

Enhance the KVM guest write helpers to invoke an architecture specific
hooks (kvm_arch_write_gfn_{begin,end}) to track the write access from the
hypervisor.

When SEV-SNP is enabled, the guest uses the PAGE_STATE vmgexit to ask the
hypervisor to change the page state from shared to private or vice versa.
While changing the page state to private, use the
kvm_host_write_track_is_active() to check whether the page is being
tracked for the host write access (i.e either mapped or kvm_write_guest
is in progress). If its tracked, then do not change the page state.

Signed-off-by: Brijesh Singh <brijesh.singh@xxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |  6 +++
 arch/x86/kvm/svm/sev.c          | 51 +++++++++++++++++++++
 arch/x86/kvm/svm/svm.c          |  2 +
 arch/x86/kvm/svm/svm.h          |  1 +
 arch/x86/kvm/x86.c              | 78 +++++++++++++++++++++++++++++++++
 include/linux/kvm_host.h        |  3 ++
 virt/kvm/kvm_main.c             | 21 +++++++--
 7 files changed, 159 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 59185b6bc82a..678992e9966a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -865,10 +865,13 @@ struct kvm_lpage_info {
 	int disallow_lpage;
 };
 
+bool kvm_host_write_track_is_active(struct kvm *kvm, gfn_t gfn);
+
 struct kvm_arch_memory_slot {
 	struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
 	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 	unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
+	unsigned short *host_write_track[KVM_PAGE_TRACK_MAX];
 };
 
 /*
@@ -1393,6 +1396,9 @@ struct kvm_x86_ops {
 	void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
 	void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
 	int (*get_tdp_max_page_level)(struct kvm_vcpu *vcpu, gpa_t gpa, int max_level);
+
+	void (*write_page_begin)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn);
+	void (*write_page_end)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0155d9b3127d..839cf321c6dd 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2884,6 +2884,19 @@ static int snp_make_page_shared(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t pfn,
 	return rmpupdate(pfn_to_page(pfn), &val);
 }
 
+static inline bool kvm_host_write_track_gpa_range_is_active(struct kvm *kvm,
+							    gpa_t start, gpa_t end)
+{
+	while (start < end) {
+		if (kvm_host_write_track_is_active(kvm, gpa_to_gfn(start)))
+			return 1;
+
+		start += PAGE_SIZE;
+	}
+
+	return false;
+}
+
 static int snp_make_page_private(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t pfn, int level)
 {
 	struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
@@ -2895,6 +2908,14 @@ static int snp_make_page_private(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t pfn
 	if (!e)
 		return -EINVAL;
 
+	/*
+	 * If the GPA is tracked for the write access then do not change the
+	 * page state from shared to private.
+	 */
+	if (kvm_host_write_track_gpa_range_is_active(vcpu->kvm,
+		gpa, gpa + page_level_size(level)))
+		return -EBUSY;
+
 	/* Log if the entry is validated */
 	if (rmpentry_validated(e))
 		pr_warn_ratelimited("Asked to make a pre-validated gpa %llx private\n", gpa);
@@ -3468,3 +3489,33 @@ int sev_get_tdp_max_page_level(struct kvm_vcpu *vcpu, gpa_t gpa, int max_level)
 
 	return min_t(uint32_t, level, max_level);
 }
+
+void sev_snp_write_page_begin(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	struct rmpentry *e;
+	int level, rc;
+	kvm_pfn_t pfn;
+
+	if (!sev_snp_guest(kvm))
+		return;
+
+	pfn = gfn_to_pfn(kvm, gfn);
+	if (is_error_noslot_pfn(pfn))
+		return;
+
+	e = snp_lookup_page_in_rmptable(pfn_to_page(pfn), &level);
+	if (unlikely(!e))
+		return;
+
+	/*
+	 * A hypervisor should never write to the guest private page. A write to the
+	 * guest private will cause an RMP violation. If the guest page is private,
+	 * then make it shared.
+	 */
+	if (rmpentry_assigned(e)) {
+		pr_err("SEV-SNP: write to guest private gfn %llx\n", gfn);
+		rc = snp_make_page_shared(kvm_get_vcpu(kvm, 0),
+				gfn << PAGE_SHIFT, pfn, PG_LEVEL_4K);
+		BUG_ON(rc != 0);
+	}
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2632eae52aa3..4ff6fc86dd18 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4577,6 +4577,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 
 	.alloc_apic_backing_page = svm_alloc_apic_backing_page,
 	.get_tdp_max_page_level = sev_get_tdp_max_page_level,
+
+	.write_page_begin = sev_snp_write_page_begin,
 };
 
 static struct kvm_x86_init_ops svm_init_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index af4cce39b30f..e0276ad8a1ae 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -576,6 +576,7 @@ void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
 void sev_es_unmap_ghcb(struct vcpu_svm *svm);
 struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
 int sev_get_tdp_max_page_level(struct kvm_vcpu *vcpu, gpa_t gpa, int max_level);
+void sev_snp_write_page_begin(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn);
 
 /* vmenter.S */
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bbc4e04e67ad..1398b8021982 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9076,6 +9076,48 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
 }
 
+static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
+			     enum kvm_page_track_mode mode, short count)
+{
+	int index, val;
+
+	index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
+
+	val = slot->arch.host_write_track[mode][index];
+
+	if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
+		return;
+
+	slot->arch.host_write_track[mode][index] += count;
+}
+
+bool kvm_host_write_track_is_active(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot;
+	int index;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	if (!slot)
+		return false;
+
+	index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
+	return !!READ_ONCE(slot->arch.host_write_track[KVM_PAGE_TRACK_WRITE][index]);
+}
+EXPORT_SYMBOL_GPL(kvm_host_write_track_is_active);
+
+void kvm_arch_write_gfn_begin(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	update_gfn_track(slot, gfn, KVM_PAGE_TRACK_WRITE, 1);
+
+	if (kvm_x86_ops.write_page_begin)
+		kvm_x86_ops.write_page_begin(kvm, slot, gfn);
+}
+
+void kvm_arch_write_gfn_end(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	update_gfn_track(slot, gfn, KVM_PAGE_TRACK_WRITE, -1);
+}
+
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 {
 	if (!lapic_in_kernel(vcpu))
@@ -10896,6 +10938,36 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_hv_destroy_vm(kvm);
 }
 
+static void kvm_write_page_track_free_memslot(struct kvm_memory_slot *slot)
+{
+	int i;
+
+	for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+		kvfree(slot->arch.host_write_track[i]);
+		slot->arch.host_write_track[i] = NULL;
+	}
+}
+
+static int kvm_write_page_track_create_memslot(struct kvm_memory_slot *slot,
+					       unsigned long npages)
+{
+	int  i;
+
+	for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+		slot->arch.host_write_track[i] =
+			kvcalloc(npages, sizeof(*slot->arch.host_write_track[i]),
+				 GFP_KERNEL_ACCOUNT);
+		if (!slot->arch.host_write_track[i])
+			goto track_free;
+	}
+
+	return 0;
+
+track_free:
+	kvm_write_page_track_free_memslot(slot);
+	return -ENOMEM;
+}
+
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
 	int i;
@@ -10969,8 +11041,14 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
 	if (kvm_page_track_create_memslot(slot, npages))
 		goto out_free;
 
+	if (kvm_write_page_track_create_memslot(slot, npages))
+		goto e_free_page_track;
+
 	return 0;
 
+e_free_page_track:
+	kvm_page_track_free_memslot(slot);
+
 out_free:
 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 		kvfree(slot->arch.rmap[i]);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2f34487e21f2..f22e22cd2179 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1550,6 +1550,9 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 					    unsigned long start, unsigned long end);
 
+void kvm_arch_write_gfn_begin(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_arch_write_gfn_end(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn);
+
 #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
 int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
 #else
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6b4feb92dc79..bc805c15d0de 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -160,6 +160,14 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 {
 }
 
+__weak void kvm_arch_write_gfn_begin(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+}
+
+__weak void kvm_arch_write_gfn_end(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+}
+
 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
 {
 	/*
@@ -2309,7 +2317,8 @@ static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
 	cache->generation = gen;
 }
 
-static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
+static int __kvm_map_gfn(struct kvm *kvm,
+			 struct kvm_memslots *slots, gfn_t gfn,
 			 struct kvm_host_map *map,
 			 struct gfn_to_pfn_cache *cache,
 			 bool atomic)
@@ -2361,20 +2370,22 @@ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
 	map->pfn = pfn;
 	map->gfn = gfn;
 
+	kvm_arch_write_gfn_begin(kvm, slot, map->gfn);
+
 	return 0;
 }
 
 int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
 		struct gfn_to_pfn_cache *cache, bool atomic)
 {
-	return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
+	return __kvm_map_gfn(vcpu->kvm, kvm_memslots(vcpu->kvm), gfn, map,
 			cache, atomic);
 }
 EXPORT_SYMBOL_GPL(kvm_map_gfn);
 
 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
 {
-	return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
+	return __kvm_map_gfn(vcpu->kvm, kvm_vcpu_memslots(vcpu), gfn, map,
 		NULL, false);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
@@ -2412,6 +2423,8 @@ static void __kvm_unmap_gfn(struct kvm *kvm,
 	else
 		kvm_release_pfn(map->pfn, dirty, NULL);
 
+	kvm_arch_write_gfn_end(kvm, memslot, map->gfn);
+
 	map->hva = NULL;
 	map->page = NULL;
 }
@@ -2612,7 +2625,9 @@ static int __kvm_write_guest_page(struct kvm *kvm,
 	addr = gfn_to_hva_memslot(memslot, gfn);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
+	kvm_arch_write_gfn_begin(kvm, memslot, gfn);
 	r = __copy_to_user((void __user *)addr + offset, data, len);
+	kvm_arch_write_gfn_end(kvm, memslot, gfn);
 	if (r)
 		return -EFAULT;
 	mark_page_dirty_in_slot(kvm, memslot, gfn);
-- 
2.17.1