Implement the shadow physical ID table and its write-tracking code,
which will soon be used by nested AVIC.

Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
---
 arch/x86/kvm/svm/avic.c | 461 +++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/svm/svm.h  |  71 +++++++
 2 files changed, 524 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index e5cbbb97fbab6..f462b7e48e3ca 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -51,6 +51,433 @@ static u32 next_vm_id = 0;
 static bool next_vm_id_wrapped = 0;
 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
+
+static inline struct kvm_vcpu *avic_vcpu_by_l1_apicid(struct kvm *kvm,
+						      int l1_apicid)
+{
+	WARN_ON(l1_apicid == -1);
+	return kvm_get_vcpu_by_id(kvm, l1_apicid);
+}
+
+static void avic_physid_shadow_entry_set_vcpu(struct kvm *kvm,
+					      struct avic_physid_table *t,
+					      int n,
+					      int new_l1_apicid)
+{
+	struct avic_physid_entry_descr *e = &t->entries[n];
+	u64 sentry = READ_ONCE(*e->sentry);
+	u64 old_sentry = sentry;
+	struct kvm_vcpu *new_vcpu = NULL;
+	int l0_apicid = -1;
+
+	WARN_ON(!test_bit(n, t->valid_entries));
+
+	if (!list_empty(&e->link))
+		list_del_init(&e->link);
+
+	if (new_l1_apicid != -1)
+		new_vcpu = avic_vcpu_by_l1_apicid(kvm, new_l1_apicid);
+
+	if (new_vcpu)
+		l0_apicid = kvm_cpu_get_apicid(new_vcpu->cpu);
+
+	physid_entry_set_apicid(&sentry, l0_apicid);
+
+	trace_kvm_avic_physid_update_vcpu_guest(new_l1_apicid, l0_apicid);
+
+	if (sentry != old_sentry)
+		WRITE_ONCE(*e->sentry, sentry);
+}
+
+static void avic_physid_shadow_entry_create(struct kvm *kvm,
+					    struct avic_physid_table *t,
+					    int n,
+					    u64 gentry)
+{
+	struct avic_physid_entry_descr *e = &t->entries[n];
+	struct page *backing_page;
+	u64 backing_page_gpa = physid_entry_get_backing_table(gentry);
+	int l1_apic_id = physid_entry_get_apicid(gentry);
+	hpa_t backing_page_hpa;
+	u64 sentry = 0;
+
+	if (backing_page_gpa == INVALID_BACKING_PAGE)
+		return;
+
+	/* Pin the APIC backing page */
+	backing_page = gfn_to_page(kvm, gpa_to_gfn(backing_page_gpa));
+
+	if (is_error_page(backing_page))
+		/* Invalid GPA in the guest entry - point to a dummy entry */
+		backing_page_hpa = t->dummy_page_hpa;
+	else
+		backing_page_hpa = page_to_phys(backing_page);
+
+	physid_entry_set_backing_table(&sentry, backing_page_hpa);
+
+	e->gentry = gentry;
+	*e->sentry = sentry;
+
+	if (test_and_set_bit(n, t->valid_entries))
+		WARN_ON(1);
+
+	if (backing_page_hpa != t->dummy_page_hpa)
+		avic_physid_shadow_entry_set_vcpu(kvm, t, n, l1_apic_id);
+}
+
+static void avic_physid_shadow_entry_remove(struct kvm *kvm,
+					    struct avic_physid_table *t,
+					    int n)
+{
+	struct avic_physid_entry_descr *e = &t->entries[n];
+	hpa_t backing_page_hpa;
+
+	if (!test_and_clear_bit(n, t->valid_entries))
+		WARN_ON(1);
+
+	/* Release the APIC backing page */
+	backing_page_hpa = physid_entry_get_backing_table(*e->sentry);
+
+	if (backing_page_hpa != t->dummy_page_hpa)
+		kvm_release_pfn_dirty(backing_page_hpa >> PAGE_SHIFT);
+
+	if (!list_empty(&e->link))
+		list_del_init(&e->link);
+
+	e->gentry = 0;
+	*e->sentry = 0;
+}
+
+static bool
+avic_physid_shadow_table_setup_write_tracking(struct kvm *kvm,
+					      struct avic_physid_table *t,
+					      bool enable)
+{
+	struct kvm_memory_slot *slot;
+
+	write_lock(&kvm->mmu_lock);
+	slot = gfn_to_memslot(kvm, t->gfn);
+	if (!slot) {
+		write_unlock(&kvm->mmu_lock);
+		return false;
+	}
+
+	if (enable)
+		kvm_slot_page_track_add_page(kvm, slot, t->gfn,
+					     KVM_PAGE_TRACK_WRITE);
+	else
+		kvm_slot_page_track_remove_page(kvm, slot, t->gfn,
+						KVM_PAGE_TRACK_WRITE);
+	write_unlock(&kvm->mmu_lock);
+	return true;
+}
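For reference, the packing that physid_entry_set_apicid() and
physid_entry_set_backing_table() perform on a shadow entry can be modeled
in a few lines of standalone C. The bit positions below follow AMD's AVIC
physical APIC ID entry layout (host physical APIC ID in bits 7:0, backing
page address in bits 51:12, is_running in bit 62, valid in bit 63); treat
the exact masks and helper names as illustrative assumptions, not
definitions taken from this patch:

#include <stdint.h>
#include <stdio.h>

#define ENTRY_HOST_APICID_MASK	((uint64_t)0xFF)
#define ENTRY_BACKING_PAGE_MASK	((uint64_t)0xFFFFFFFFFFULL << 12)
#define ENTRY_IS_RUNNING_BIT	((uint64_t)1 << 62)
#define ENTRY_VALID_BIT		((uint64_t)1 << 63)

static uint64_t entry_set_backing_page(uint64_t entry, uint64_t hpa)
{
	/* Point the entry at a (page-aligned) APIC backing page */
	entry &= ~ENTRY_BACKING_PAGE_MASK;
	entry |= (hpa & ENTRY_BACKING_PAGE_MASK) | ENTRY_VALID_BIT;
	return entry;
}

static uint64_t entry_set_apicid(uint64_t entry, int apicid)
{
	/* apicid == -1 means "no physical CPU": clear is_running */
	entry &= ~(ENTRY_HOST_APICID_MASK | ENTRY_IS_RUNNING_BIT);
	if (apicid != -1)
		entry |= (uint64_t)apicid | ENTRY_IS_RUNNING_BIT;
	return entry;
}

int main(void)
{
	uint64_t e = 0;

	e = entry_set_backing_page(e, 0x123456000ULL);
	e = entry_set_apicid(e, 5);
	printf("entry = %#llx\n", (unsigned long long)e);
	return 0;
}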
+
+static void
+avic_physid_shadow_table_erase(struct kvm *kvm, struct avic_physid_table *t)
+{
+	int i;
+
+	if (!t->nentries)
+		return;
+
+	avic_physid_shadow_table_setup_write_tracking(kvm, t, false);
+
+	for_each_set_bit(i, t->valid_entries, AVIC_MAX_PHYSICAL_ID_COUNT)
+		avic_physid_shadow_entry_remove(kvm, t, i);
+
+	t->nentries = 0;
+	t->flood_count = 0;
+}
+
+static struct avic_physid_table *
+avic_physid_shadow_table_alloc(struct kvm *kvm, gfn_t gfn)
+{
+	struct avic_physid_entry_descr *e;
+	struct avic_physid_table *t;
+	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+	u64 *shadow_table_address;
+	int i;
+
+	if (kvm_page_track_write_tracking_enable(kvm))
+		return NULL;
+
+	lockdep_assert_held(&kvm_svm->avic.tables_lock);
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL_ACCOUNT);
+	if (!t)
+		return NULL;
+
+	t->shadow_table = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!t->shadow_table)
+		goto err_free_table;
+
+	shadow_table_address = page_address(t->shadow_table);
+	t->shadow_table_hpa = __sme_set(page_to_phys(t->shadow_table));
+
+	for (i = 0; i < ARRAY_SIZE(t->entries); i++) {
+		e = &t->entries[i];
+		e->sentry = &shadow_table_address[i];
+		e->gentry = 0;
+		INIT_LIST_HEAD(&e->link);
+	}
+
+	t->gfn = gfn;
+	t->refcount = 1;
+
+	list_add_tail(&t->link, &kvm_svm->avic.physid_tables);
+
+	t->dummy_page_hpa = page_to_phys(kvm_svm->avic.invalid_physid_page);
+
+	trace_kvm_avic_physid_table_alloc(gfn_to_gpa(gfn));
+	return t;
+
+err_free_table:
+	kfree(t);
+	return NULL;
+}
+
+static void
+avic_physid_shadow_table_free(struct kvm *kvm, struct avic_physid_table *t)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+	lockdep_assert_held(&kvm_svm->avic.tables_lock);
+
+	WARN_ON(t->refcount);
+
+	avic_physid_shadow_table_erase(kvm, t);
+
+	trace_kvm_avic_physid_table_free(gfn_to_gpa(t->gfn));
+
+	hlist_del(&t->hash_link);
+	list_del(&t->link);
+	__free_page(t->shadow_table);
+	kfree(t);
+}
+
+static struct avic_physid_table *
+__avic_physid_shadow_table_get(struct hlist_head *head, gfn_t gfn)
+{
+	struct avic_physid_table *t;
+
+	hlist_for_each_entry(t, head, hash_link)
+		if (t->gfn == gfn) {
+			t->refcount++;
+			return t;
+		}
+	return NULL;
+}
+
+struct avic_physid_table *
+avic_physid_shadow_table_get(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+	struct hlist_head *hlist;
+	struct avic_physid_table *t;
+
+	mutex_lock(&kvm_svm->avic.tables_lock);
+
+	hlist = &kvm_svm->avic.physid_gpa_hash[avic_physid_hash(gfn)];
+	t = __avic_physid_shadow_table_get(hlist, gfn);
+	if (!t) {
+		t = avic_physid_shadow_table_alloc(vcpu->kvm, gfn);
+		if (!t)
+			goto out_unlock;
+		hlist_add_head(&t->hash_link, hlist);
+	}
+out_unlock:
+	mutex_unlock(&kvm_svm->avic.tables_lock);
+	return t;
+}
+
+static void
+__avic_physid_shadow_table_put(struct kvm *kvm, struct avic_physid_table *t)
+{
+	WARN_ON(t->refcount <= 0);
+	if (--t->refcount == 0)
+		avic_physid_shadow_table_free(kvm, t);
+}
+
+void avic_physid_shadow_table_put(struct kvm *kvm, struct avic_physid_table *t)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+	mutex_lock(&kvm_svm->avic.tables_lock);
+	__avic_physid_shadow_table_put(kvm, t);
+	mutex_unlock(&kvm_svm->avic.tables_lock);
+}
+
+static void avic_physid_shadow_table_invalidate(struct kvm *kvm,
+						struct avic_physid_table *t)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+	lockdep_assert_held(&kvm_svm->avic.tables_lock);
+	avic_physid_shadow_table_erase(kvm, t);
+}
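To make the intended lifecycle concrete, here is a sketch of how a
(hypothetical) nested AVIC caller would be expected to use the
get/sync/put API above on a nested VMRUN; the function name and the
error handling are assumptions, not part of this series:

static int nested_avic_load_physid_table(struct kvm_vcpu *vcpu,
					 gpa_t table_gpa, int nentries)
{
	struct avic_physid_table *t;

	/* Take a reference; allocates and hashes the table on first use */
	t = avic_physid_shadow_table_get(vcpu, gpa_to_gfn(table_gpa));
	if (!t)
		return -ENOMEM;

	/* Shadow any guest entries that are not shadowed yet */
	if (avic_physid_shadow_table_sync(vcpu, t, nentries)) {
		avic_physid_shadow_table_put(vcpu->kvm, t);
		return -EFAULT;
	}

	/* vmcb02 would be pointed at t->shadow_table_hpa here */
	return 0;
}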
+int avic_physid_shadow_table_sync(struct kvm_vcpu *vcpu,
+				  struct avic_physid_table *t, int nentries)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+	struct kvm_host_map map;
+	u64 *gentries;
+	int i;
+	int ret = 0;
+
+	mutex_lock(&kvm_svm->avic.tables_lock);
+
+	if (t->nentries >= nentries)
+		goto out_unlock;
+
+	trace_kvm_avic_physid_table_reload(gfn_to_gpa(t->gfn), t->nentries, nentries);
+
+	if (t->nentries == 0) {
+		if (!avic_physid_shadow_table_setup_write_tracking(vcpu->kvm, t, true)) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+	}
+
+	if (kvm_vcpu_map(vcpu, t->gfn, &map)) {
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	gentries = (u64 *)map.hva;
+
+	for (i = t->nentries; i < nentries; i++)
+		avic_physid_shadow_entry_create(vcpu->kvm, t, i, gentries[i]);
+
+	/* Publish the new entries before publishing nentries */
+	wmb();
+	WRITE_ONCE(t->nentries, nentries);
+
+	kvm_vcpu_unmap(vcpu, &map, false);
+out_unlock:
+	mutex_unlock(&kvm_svm->avic.tables_lock);
+	return ret;
+}
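The wmb() before the WRITE_ONCE() of t->nentries orders publication: a
reader that observes the new nentries value is guaranteed to also see
the shadow entries created above it. A minimal sketch of the matching
lock-free reader side, which this patch does not add yet (the function
is hypothetical):

static bool physid_entry_is_shadowed(struct avic_physid_table *t, int n)
{
	int nentries = READ_ONCE(t->nentries);

	/* Pairs with the wmb() in avic_physid_shadow_table_sync() */
	smp_rmb();

	return n < nentries && test_bit(n, t->valid_entries);
}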
+static void avic_physid_shadow_table_track_write(struct kvm_vcpu *vcpu,
+						 gpa_t gpa,
+						 const u8 *new,
+						 int bytes,
+						 struct kvm_page_track_notifier_node *node)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+	struct hlist_head *hlist;
+	struct avic_physid_table *t;
+	gfn_t gfn = gpa_to_gfn(gpa);
+	unsigned int page_offset = offset_in_page(gpa);
+	unsigned int entry_offset = page_offset & 0x7;
+	int first = page_offset / sizeof(u64);
+	int last = (page_offset + bytes - 1) / sizeof(u64);
+	u64 new_entry, old_entry;
+	int l1_apic_id;
+
+	if (WARN_ON_ONCE(bytes == 0))
+		return;
+
+	mutex_lock(&kvm_svm->avic.tables_lock);
+
+	hlist = &kvm_svm->avic.physid_gpa_hash[avic_physid_hash(gfn)];
+	t = __avic_physid_shadow_table_get(hlist, gfn);
+
+	if (!t)
+		goto out_unlock;
+
+	trace_kvm_avic_physid_table_write(gpa, bytes);
+
+	/*
+	 * Update policy:
+	 *
+	 * A write is applied in place only if it targets a single entry,
+	 * that entry had a valid backing page on the last VM entry with
+	 * this page, and the write touches nothing but the is_running
+	 * and/or apic_id parts of that entry.
+	 *
+	 * Writes beyond the known number of entries are ignored, to
+	 * support a guest that appends entries to the end of the page
+	 * while hotplugging vCPUs.
+	 *
+	 * All other writes, which are not supposed to happen while the
+	 * page is in use, invalidate the page; it is then re-read as a
+	 * whole the next time a vCPU uses it for VM entry.
+	 */
+
+	if (first >= t->nentries)
+		goto out_table_put;
+
+	if (first != last || !test_bit(first, t->valid_entries))
+		goto invalidate;
+
+	/* Update the entry with the written bytes */
+	old_entry = t->entries[first].gentry;
+	new_entry = old_entry;
+	memcpy(((u8 *)&new_entry) + entry_offset, new, bytes);
+
+	/* If the backing page changed, invalidate the whole page */
+	if (physid_entry_get_backing_table(old_entry) !=
+	    physid_entry_get_backing_table(new_entry))
+		goto invalidate;
+
+	/*
+	 * Detect write flooding to physid pages that might no longer
+	 * be used for their original purpose.
+	 */
+	if (!atomic_read(&t->usecount)) {
+		if (++t->flood_count > t->nentries * AVIC_PHYSID_FLOOD_COUNT)
+			goto invalidate;
+	} else {
+		t->flood_count = 0;
+	}
+
+	/* Update the backing CPU */
+	l1_apic_id = physid_entry_get_apicid(new_entry);
+	avic_physid_shadow_entry_set_vcpu(vcpu->kvm, t, first, l1_apic_id);
+	t->entries[first].gentry = new_entry;
+	goto out_table_put;
+invalidate:
+	avic_physid_shadow_table_invalidate(vcpu->kvm, t);
+out_table_put:
+	__avic_physid_shadow_table_put(vcpu->kvm, t);
+out_unlock:
+	mutex_unlock(&kvm_svm->avic.tables_lock);
+}
+
+static void avic_physid_shadow_table_flush_memslot(struct kvm *kvm,
+						   struct kvm_memory_slot *slot,
+						   struct kvm_page_track_notifier_node *node)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+	struct avic_physid_table *t, *n;
+	int i;
+
+	mutex_lock(&kvm_svm->avic.tables_lock);
+
+	list_for_each_entry_safe(t, n, &kvm_svm->avic.physid_tables, link) {
+
+		if (gfn_in_memslot(slot, t->gfn)) {
+			avic_physid_shadow_table_invalidate(kvm, t);
+			continue;
+		}
+
+		for_each_set_bit(i, t->valid_entries, AVIC_MAX_PHYSICAL_ID_COUNT) {
+			u64 gentry = t->entries[i].gentry;
+			gpa_t gpa = physid_entry_get_backing_table(gentry);
+
+			if (gfn_in_memslot(slot, gpa_to_gfn(gpa))) {
+				avic_physid_shadow_table_invalidate(kvm, t);
+				break;
+			}
+		}
+	}
+	mutex_unlock(&kvm_svm->avic.tables_lock);
+}
+
 
 /*
  * This is a wrapper of struct amd_iommu_ir_data.
  */
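The offset arithmetic in avic_physid_shadow_table_track_write() is
easiest to see in isolation. Below is a standalone, runnable model of
how an intercepted write (gpa, bytes) is reduced to a single 8-byte
entry and patched in place; the GPA and payload are made-up values for
illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint64_t table[512] = { 0 };		/* one 4KiB page of entries */
	uint64_t gpa = 0x100000 + 0x10c;	/* example intercepted write */
	uint8_t new_bytes[2] = { 0x12, 0x34 };	/* example payload */
	unsigned int bytes = sizeof(new_bytes);

	unsigned int page_offset = gpa & 0xFFF;
	unsigned int entry_offset = page_offset & 0x7;
	unsigned int first = page_offset / sizeof(uint64_t);
	unsigned int last = (page_offset + bytes - 1) / sizeof(uint64_t);

	/* The patch only fast-paths writes contained in a single entry */
	assert(first == last);

	uint64_t entry = table[first];
	memcpy((uint8_t *)&entry + entry_offset, new_bytes, bytes);
	table[first] = entry;

	printf("entry %u updated to %#llx\n", first,
	       (unsigned long long)table[first]);
	return 0;
}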
@@ -113,18 +540,22 @@ void avic_vm_destroy(struct kvm *kvm)
 		__free_page(avic->logical_id_table_page);
 	if (avic->physical_id_table_page)
 		__free_page(avic->physical_id_table_page);
+	if (avic->invalid_physid_page)
+		__free_page(avic->invalid_physid_page);
 
 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
 	hash_del(&avic->hnode);
 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+	kvm_page_track_unregister_notifier(kvm, &avic->write_tracker);
 }
 
 int avic_vm_init(struct kvm *kvm)
 {
 	unsigned long flags;
 	int err = -ENOMEM;
-	struct page *p_page;
-	struct page *l_page;
+	struct page *page;
 	struct kvm_svm_avic *avic = &to_kvm_svm(kvm)->avic;
 	u32 vm_id;
@@ -132,18 +563,25 @@ int avic_vm_init(struct kvm *kvm)
 		return 0;
 
 	/* Allocating physical APIC ID table (4KB) */
-	p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (!p_page)
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!page)
 		goto free_avic;
 
-	avic->physical_id_table_page = p_page;
+	avic->physical_id_table_page = page;
 
 	/* Allocating logical APIC ID table (4KB) */
-	l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (!l_page)
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!page)
 		goto free_avic;
 
-	avic->logical_id_table_page = l_page;
+	avic->logical_id_table_page = page;
+
+	/* Allocating a dummy page for invalid nested AVIC physid entries */
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!page)
+		goto free_avic;
+
+	avic->invalid_physid_page = page;
 
 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
 again:
@@ -165,6 +603,13 @@ int avic_vm_init(struct kvm *kvm)
 	hash_add(svm_vm_data_hash, &avic->hnode, avic->vm_id);
 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 
+	mutex_init(&avic->tables_lock);
+	INIT_LIST_HEAD(&avic->physid_tables);
+
+	avic->write_tracker.track_write = avic_physid_shadow_table_track_write;
+	avic->write_tracker.track_flush_slot = avic_physid_shadow_table_flush_memslot;
+
+	kvm_page_track_register_notifier(kvm, &avic->write_tracker);
 	return 0;
 
 free_avic:
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index dfca4c06e2071..fc15e1f938793 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -18,6 +18,7 @@
 #include <linux/kvm_types.h>
 #include <linux/kvm_host.h>
 #include <linux/bits.h>
+#include <linux/hash.h>
 
 #include <asm/svm.h>
 #include <asm/sev-common.h>
@@ -89,13 +90,33 @@ struct kvm_sev_info {
 };
 
+#define AVIC_PHYSID_HASH_SHIFT 8
+#define AVIC_PHYSID_HASH_SIZE (1 << AVIC_PHYSID_HASH_SHIFT)
+
 struct kvm_svm_avic {
 	u32 vm_id;
 	struct page *logical_id_table_page;
 	struct page *physical_id_table_page;
 	struct hlist_node hnode;
+
+	struct mutex tables_lock;
+
+	/* List of all shadow tables */
+	struct list_head physid_tables;
+
+	/* Hash table to find a shadow table by the GFN of the guest table */
+	struct hlist_head physid_gpa_hash[AVIC_PHYSID_HASH_SIZE];
+
+	struct kvm_page_track_notifier_node write_tracker;
+
+	struct page *invalid_physid_page;
 };
 
+static __always_inline unsigned int avic_physid_hash(gfn_t gfn)
+{
+	return hash_64(gfn, AVIC_PHYSID_HASH_SHIFT);
+}
+
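hash_64() here boils down to a multiply by the 64-bit golden-ratio
constant, keeping the top AVIC_PHYSID_HASH_SHIFT bits, so even adjacent
GFNs scatter across the 256 buckets. A standalone model (the constant
matches the kernel's GOLDEN_RATIO_64; the rest is illustrative):

#include <stdint.h>
#include <stdio.h>

#define GOLDEN_RATIO_64 0x61C8864680B583EBull
#define AVIC_PHYSID_HASH_SHIFT 8

static unsigned int physid_hash(uint64_t gfn)
{
	/* Equivalent of hash_64(gfn, AVIC_PHYSID_HASH_SHIFT) */
	return (unsigned int)((gfn * GOLDEN_RATIO_64) >>
			      (64 - AVIC_PHYSID_HASH_SHIFT));
}

int main(void)
{
	/* Nearby GFNs land in well-separated buckets */
	for (uint64_t gfn = 0x100; gfn < 0x104; gfn++)
		printf("gfn %#llx -> bucket %u\n",
		       (unsigned long long)gfn, physid_hash(gfn));
	return 0;
}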
 struct kvm_svm {
 	struct kvm kvm;
 	struct kvm_svm_avic avic;
@@ -147,6 +168,49 @@ struct vmcb_ctrl_area_cached {
 	u8 reserved_sw[32];
 };
 
+struct avic_physid_entry_descr {
+	struct list_head link;
+
+	/* Cached value of the guest entry */
+	u64 gentry;
+
+	/* Pointer to the shadow table entry */
+	u64 *sentry;
+};
+
+#define AVIC_PHYSID_FLOOD_COUNT 1000
+
+struct avic_physid_table {
+	/* Member of the list of all shadow tables */
+	struct list_head link;
+
+	/* Member of the GFN hash table of all tables */
+	struct hlist_node hash_link;
+
+	/* GFN of the table in guest memory */
+	gfn_t gfn;
+
+	/* Number of shadowed entries and a bitmap of the valid ones */
+	int nentries;
+	DECLARE_BITMAP(valid_entries, AVIC_MAX_PHYSICAL_ID_COUNT);
+
+	struct avic_physid_entry_descr entries[AVIC_MAX_PHYSICAL_ID_COUNT];
+
+	/* Guest-visible shadow table */
+	struct page *shadow_table;
+	hpa_t shadow_table_hpa;
+	hpa_t dummy_page_hpa;
+
+	/* Number of vCPUs which have a reference to this table */
+	int refcount;
+
+	/* Number of vCPUs that are in guest mode and use this table */
+	atomic_t usecount;
+
+	/* Number of writes to this page between uses of it */
+	int flood_count;
+};
+
 struct svm_nested_state {
 	struct kvm_vmcb_info vmcb02;
 	u64 hsave_msr;
@@ -628,6 +692,13 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void avic_ring_doorbell(struct kvm_vcpu *vcpu);
 unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
 
+struct avic_physid_table *
+avic_physid_shadow_table_get(struct kvm_vcpu *vcpu, gfn_t gfn);
+void avic_physid_shadow_table_put(struct kvm *kvm, struct avic_physid_table *t);
+int avic_physid_shadow_table_sync(struct kvm_vcpu *vcpu,
+				  struct avic_physid_table *t, int nentries);
+
 #define INVALID_BACKING_PAGE	(~(u64)0)
 
 static inline u64 physid_entry_get_backing_table(u64 entry)
-- 
2.26.3