Implement ring-based dirty memory tracking.

Each vcpu gets a ring of dirty-page (slot, offset) entries that userspace
can mmap and harvest, plus a global ring on the VM for pages dirtied
without a vcpu context; a vcpu requests an exit when its ring fills past a
soft limit, and a reset ioctl write-protects harvested pages again.

Signed-off-by: Lei Cao <lei.cao@xxxxxxxxxxx>
---
 arch/x86/kvm/Makefile    |   3 +-
 include/linux/kvm_host.h |  12 +++
 virt/kvm/gfn_ring.c      | 135 +++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c      | 220 ++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 357 insertions(+), 13 deletions(-)
 create mode 100644 virt/kvm/gfn_ring.c

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 3bff207..d832622 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -8,7 +8,8 @@ CFLAGS_vmx.o := -I.
 KVM := ../../../virt/kvm
 
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
+				$(KVM)/gfn_ring.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b0783da..082a2b2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -34,6 +34,7 @@
 #include <linux/kvm_types.h>
 
 #include <asm/kvm_host.h>
+#include <linux/kvm_gfn_ring.h>
 
 #ifndef KVM_MAX_VCPU_ID
 #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
@@ -266,6 +267,10 @@ struct kvm_vcpu {
 	bool preempted;
 	struct kvm_vcpu_arch arch;
 	struct dentry *debugfs_dentry;
+
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	struct kvm_gfn_ring dirty_ring;
+#endif
 };
 
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -431,6 +436,11 @@ struct kvm {
 	struct list_head devices;
 	struct dentry *debugfs_dentry;
 	struct kvm_stat_data **debugfs_stat_data;
+
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	u32 dirty_ring_size;
+	struct kvm_gfn_ring dirty_ring;
+#endif
 };
 
 #define kvm_err(fmt, ...) \
@@ -714,6 +724,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					gfn_t gfn_offset,
 					unsigned long mask);
 
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask);
+
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				struct kvm_dirty_log *log);
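Note: <linux/kvm_gfn_ring.h> is introduced by another patch in this series
and is not part of this diff. From the way gfn_ring.c below uses it, the
shared structures presumably look roughly like the sketch here; the field
widths and ordering are assumptions, only the names are pinned down by the
code.

  /* Hypothetical reconstruction of linux/kvm_gfn_ring.h, inferred
   * solely from the usage in gfn_ring.c; not the series' actual header. */
  struct kvm_dirty_gfn {
          __u32 slot;                     /* (as_id << 16) | slot id */
          __u64 offset;                   /* gfn offset into the slot */
  };

  struct kvm_dirty_ring_indices {
          __u32 avail_index;              /* written by the kernel */
          __u32 fetch_index;              /* written by userspace */
  };

  struct kvm_dirty_ring {
          struct kvm_dirty_ring_indices indices;
          struct kvm_dirty_gfn dirty_gfns[];      /* mmap'ed by userspace */
  };

  struct kvm_gfn_ring {
          u32 size;               /* entries in dirty_gfns, power of two */
          u32 soft_limit;         /* request an exit above this fill level */
          u32 dirty_index;        /* producer cursor (kernel only) */
          u32 reset_index;        /* reset cursor (kernel only) */
          spinlock_t lock;        /* serializes pushes without a vcpu */
          struct kvm_dirty_ring *dirty_ring;      /* vmalloc'ed, shared */
  };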
diff --git a/virt/kvm/gfn_ring.c b/virt/kvm/gfn_ring.c
new file mode 100644
index 0000000..cb0f455
--- /dev/null
+++ b/virt/kvm/gfn_ring.c
@@ -0,0 +1,135 @@
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/vmalloc.h>
+#include <linux/kvm_gfn_ring.h>
+
+int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring, u32 size, u32 limit)
+{
+	gfnring->dirty_ring = vmalloc(size);
+	if (!gfnring->dirty_ring)
+		return -ENOMEM;
+	memset(gfnring->dirty_ring, 0, size);
+
+	gfnring->size = size / sizeof(struct kvm_dirty_gfn);
+	gfnring->soft_limit = limit;
+	gfnring->dirty_index = 0;
+	gfnring->reset_index = 0;
+	spin_lock_init(&gfnring->lock);
+
+	return 0;
+}
+
+int kvm_gfn_ring_reset(struct kvm *kvm, struct kvm_gfn_ring *gfnring)
+{
+	u32 cur_slot, next_slot;
+	u64 cur_offset, next_offset;
+	unsigned long mask;
+	u32 fetch;
+	int count = 0;
+	struct kvm_dirty_gfn *entry;
+	struct kvm_dirty_ring *ring = gfnring->dirty_ring;
+
+	fetch = READ_ONCE(ring->indices.fetch_index);
+	if (fetch == gfnring->reset_index)
+		return 0;
+
+	entry = &ring->dirty_gfns[gfnring->reset_index &
+				  (gfnring->size - 1)];
+	/*
+	 * The ring buffer is shared with userspace, which might mmap
+	 * it and concurrently modify slot and offset.  Userspace must
+	 * not be trusted!  READ_ONCE prevents the compiler from changing
+	 * the values after they've been range-checked (the checks are
+	 * in kvm_reset_dirty_gfn).
+	 */
+	smp_read_barrier_depends();
+	cur_slot = READ_ONCE(entry->slot);
+	cur_offset = READ_ONCE(entry->offset);
+	mask = 1;
+	count++;
+	gfnring->reset_index++;
+	while (gfnring->reset_index != fetch) {
+		entry = &ring->dirty_gfns[gfnring->reset_index &
+					  (gfnring->size - 1)];
+		smp_read_barrier_depends();
+		next_slot = READ_ONCE(entry->slot);
+		next_offset = READ_ONCE(entry->offset);
+		gfnring->reset_index++;
+		count++;
+		/*
+		 * Try to coalesce the reset operations when the guest is
+		 * scanning pages in the same slot.
+		 */
+		if (next_slot == cur_slot) {
+			int delta = next_offset - cur_offset;
+
+			if (delta >= 0 && delta < BITS_PER_LONG) {
+				mask |= 1ull << delta;
+				continue;
+			}
+
+			/* Backwards visit, careful about overflows! */
+			if (delta > -BITS_PER_LONG && delta < 0 &&
+			    (mask << -delta >> -delta) == mask) {
+				cur_offset = next_offset;
+				mask = (mask << -delta) | 1;
+				continue;
+			}
+		}
+		kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+		cur_slot = next_slot;
+		cur_offset = next_offset;
+		mask = 1;
+	}
+	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+
+	return count;
+}
+
+int kvm_gfn_ring_push(struct kvm_gfn_ring *gfnring,
+		      u32 slot,
+		      u64 offset,
+		      bool locked)
+{
+	int ret;
+	u16 num;
+	struct kvm_dirty_gfn *entry;
+
+	if (locked)
+		spin_lock(&gfnring->lock);
+
+	num = (u16)(gfnring->dirty_index - gfnring->reset_index);
+	if (num >= gfnring->size) {
+		WARN_ON_ONCE(num > gfnring->size);
+		ret = -EBUSY;
+		goto out;
+	}
+
+	entry = &gfnring->dirty_ring->dirty_gfns[gfnring->dirty_index &
+						 (gfnring->size - 1)];
+	entry->slot = slot;
+	entry->offset = offset;
+	smp_wmb();
+	gfnring->dirty_index++;
+	num = gfnring->dirty_index - gfnring->reset_index;
+	gfnring->dirty_ring->indices.avail_index = gfnring->dirty_index;
+	ret = num >= gfnring->soft_limit;
+
+out:
+	if (locked)
+		spin_unlock(&gfnring->lock);
+
+	return ret;
+}
+
+struct page *kvm_gfn_ring_get_page(struct kvm_gfn_ring *ring, u32 i)
+{
+	return vmalloc_to_page((void *)ring->dirty_ring + i * PAGE_SIZE);
+}
+
+void kvm_gfn_ring_free(struct kvm_gfn_ring *gfnring)
+{
+	if (gfnring->dirty_ring)
+		vfree(gfnring->dirty_ring);
+}
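The coalescing loop in kvm_gfn_ring_reset() folds consecutive ring entries
that land within BITS_PER_LONG pages of each other in the same slot into a
single (offset, mask) pair, so kvm_reset_dirty_gfn() can re-protect up to
64 pages per call rather than one. A minimal standalone demo of just that
mask arithmetic (illustrative userspace code, assuming a 64-bit host; not
part of the patch):

  #include <stdio.h>

  #define BITS_PER_LONG 64

  int main(void)
  {
          /* dirty offsets within one slot, as they might sit in the ring */
          unsigned long long offs[] = { 100, 101, 103, 99, 160 };
          unsigned long long cur = offs[0], mask = 1;
          int i;

          for (i = 1; i < 5; i++) {
                  long long delta = (long long)offs[i] - (long long)cur;

                  if (delta >= 0 && delta < BITS_PER_LONG) {
                          mask |= 1ull << delta;          /* forward visit */
                          continue;
                  }
                  if (delta > -BITS_PER_LONG && delta < 0 &&
                      (mask << -delta >> -delta) == mask) {
                          cur = offs[i];                  /* rebase backwards */
                          mask = (mask << -delta) | 1;
                          continue;
                  }
                  /* here the kernel would call kvm_reset_dirty_gfn() */
                  printf("flush: base=%llu mask=%#llx\n", cur, mask);
                  cur = offs[i];
                  mask = 1;
          }
          printf("flush: base=%llu mask=%#llx\n", cur, mask);
          return 0;
  }

All five offsets collapse into a single flush (base 99, mask bits 0, 1, 2,
4 and 61), i.e. one mmu_lock acquisition instead of five.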
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 016be4d..9cf4a5e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -63,9 +63,16 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
+#include <linux/kvm_gfn_ring.h>
+
 /* Worst case buffer size needed for holding an integer. */
 #define ITOA_MAX_LEN 12
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+/* headroom for entries logged after a ring-full exit is requested */
+#define DIRTY_RING_BUFFER_ENTRY_NUM 16
+#endif
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -121,7 +128,16 @@ static void hardware_disable_all(void);
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
 static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
+static void mark_page_dirty_in_slot(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *memslot,
+				    gfn_t gfn);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *slot,
+				    gfn_t gfn);
+#endif
 
 __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -258,11 +274,36 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	kvm_vcpu_set_dy_eligible(vcpu, false);
 	vcpu->preempted = false;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (kvm->dirty_ring_size) {
+		u32 limit = (kvm->dirty_ring_size /
+			     sizeof(struct kvm_dirty_gfn)) -
+			    DIRTY_RING_BUFFER_ENTRY_NUM -
+			    kvm_cpu_dirty_log_size();
+		r = kvm_gfn_ring_alloc(&vcpu->dirty_ring,
+				       kvm->dirty_ring_size,
+				       limit);
+		if (r) {
+			kvm->dirty_ring_size = 0;
+			goto fail_free_run;
+		}
+	}
+#endif
+
 	r = kvm_arch_vcpu_init(vcpu);
 	if (r < 0)
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+		goto fail_free_ring;
+#else
 		goto fail_free_run;
+#endif
 	return 0;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+fail_free_ring:
+	if (kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
 fail_free_run:
 	free_page((unsigned long)vcpu->run);
 fail:
@@ -275,6 +316,10 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 	put_pid(vcpu->pid);
 	kvm_arch_vcpu_uninit(vcpu);
 	free_page((unsigned long)vcpu->run);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (vcpu->kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&vcpu->dirty_ring);
+#endif
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
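The soft limit computed in kvm_vcpu_init() above deliberately undershoots
the ring capacity: after the ring-full exit has been requested, the vcpu
may still log a few entries on its way out (DIRTY_RING_BUFFER_ENTRY_NUM),
and kvm_cpu_dirty_log_size() presumably covers whatever hardware-assisted
logging (such as Intel's PML buffer) can still flush. A worked example
with assumed numbers (16-byte entries, 512-entry hardware log; neither
value is defined in this patch):

  #include <stdio.h>

  int main(void)
  {
          unsigned ring_bytes = 65536;            /* example ring size */
          unsigned entries = ring_bytes / 16;     /* 4096 entries */
          unsigned limit = entries - 16 - 512;    /* headroom + hw log */

          printf("exit requested after %u of %u entries\n",
                 limit, entries);                 /* 3568 of 4096 */
          return 0;
  }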
@@ -726,6 +771,10 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	for (i = 0; i < KVM_NR_BUSES; i++)
 		kvm_io_bus_destroy(kvm->buses[i]);
 	kvm_coalesced_mmio_free(kvm);
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	if (kvm->dirty_ring_size)
+		kvm_gfn_ring_free(&kvm->dirty_ring);
+#endif
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 #else
@@ -1861,7 +1910,8 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
 
-static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
+static int __kvm_write_guest_page(struct kvm *kvm, struct kvm_vcpu *vcpu,
+				  struct kvm_memory_slot *memslot, gfn_t gfn,
 				  const void *data, int offset, int len)
 {
 	int r;
@@ -1873,7 +1923,7 @@ static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
 	r = __copy_to_user((void __user *)addr + offset, data, len);
 	if (r)
 		return -EFAULT;
-	mark_page_dirty_in_slot(memslot, gfn);
+	mark_page_dirty_in_slot(kvm, vcpu, memslot, gfn);
 	return 0;
 }
 
@@ -1882,7 +1932,8 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(kvm, NULL, slot, gfn, data,
+				      offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
 
@@ -1891,7 +1942,8 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(vcpu->kvm, vcpu, slot, gfn, data,
+				      offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
 
@@ -1995,7 +2047,7 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
 	if (r)
 		return -EFAULT;
-	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
+	mark_page_dirty_in_slot(kvm, NULL, ghc->memslot, gpa >> PAGE_SHIFT);
 
 	return 0;
 }
@@ -2060,12 +2112,17 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest);
 
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
+static void mark_page_dirty_in_slot(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *memslot,
 				    gfn_t gfn)
 {
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+		mark_page_dirty_in_ring(kvm, vcpu, memslot, gfn);
+#endif
 		set_bit_le(rel_gfn, memslot->dirty_bitmap);
 	}
 }
@@ -2075,7 +2132,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 	struct kvm_memory_slot *memslot;
 
 	memslot = gfn_to_memslot(kvm, gfn);
-	mark_page_dirty_in_slot(memslot, gfn);
+	mark_page_dirty_in_slot(kvm, NULL, memslot, gfn);
 }
 EXPORT_SYMBOL_GPL(mark_page_dirty);
 
@@ -2084,7 +2141,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 	struct kvm_memory_slot *memslot;
 
 	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-	mark_page_dirty_in_slot(memslot, gfn);
+	mark_page_dirty_in_slot(vcpu->kvm, vcpu, memslot, gfn);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
@@ -2363,6 +2420,13 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
 #endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	else if ((vmf->pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
+		 (vmf->pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
+		  vcpu->kvm->dirty_ring_size / PAGE_SIZE))
+		page = kvm_gfn_ring_get_page(&vcpu->dirty_ring,
+				vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
+#endif
 	else
 		return kvm_arch_vcpu_fault(vcpu, vmf);
 	get_page(page);
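With the kvm_vcpu_fault() change above, each vcpu's ring pages appear in
the vcpu fd's mmap space starting at page offset KVM_DIRTY_LOG_PAGE_OFFSET.
A hedged userspace sketch of mapping one vcpu's ring (the fallback value
for the offset macro is an assumption for illustration; the real constant
comes from the uapi headers):

  #include <unistd.h>
  #include <sys/mman.h>

  /* assumed fallback; the series exports the real value via <linux/kvm.h> */
  #ifndef KVM_DIRTY_LOG_PAGE_OFFSET
  #define KVM_DIRTY_LOG_PAGE_OFFSET 64
  #endif

  static void *map_vcpu_dirty_ring(int vcpu_fd, size_t ring_bytes)
  {
          /* mmap offsets on the vcpu fd are expressed in pages */
          off_t off = (off_t)KVM_DIRTY_LOG_PAGE_OFFSET * getpagesize();
          void *ring = mmap(NULL, ring_bytes, PROT_READ | PROT_WRITE,
                            MAP_SHARED, vcpu_fd, off);

          return ring == MAP_FAILED ? NULL : ring;
  }

PROT_WRITE is needed because userspace publishes fetch_index through the
same mapping.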
@@ -2946,14 +3010,120 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 }
 
 #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
-static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, __u32 size)
+static void mark_page_dirty_in_ring(struct kvm *kvm,
+				    struct kvm_vcpu *vcpu,
+				    struct kvm_memory_slot *slot,
+				    gfn_t gfn)
 {
-	return -EINVAL;
+	struct kvm_gfn_ring *gfnlist;
+	u32 as_id = 0;
+	u64 offset;
+	struct kvm_vcpu *exit_vcpu = vcpu;
+	int ret;
+	bool locked;
+
+	if (!kvm->dirty_ring_size)
+		return;
+
+	offset = gfn - slot->base_gfn;
+
+	if (test_bit_le(offset, slot->dirty_bitmap))
+		return;
+
+	if (vcpu)
+		as_id = kvm_arch_vcpu_memslots_id(vcpu);
+
+	locked = (vcpu == NULL);
+
+	if (vcpu)
+		gfnlist = &vcpu->dirty_ring;
+	else
+		gfnlist = &kvm->dirty_ring;
+
+	ret = kvm_gfn_ring_push(gfnlist, (as_id << 16) | slot->id,
+				offset, locked);
+	if (ret < 0) {
+		if (vcpu)
+			WARN_ONCE(1, "vcpu %d dirty log overflow\n",
+				  vcpu->vcpu_id);
+		else
+			WARN_ONCE(1, "global dirty log overflow\n");
+		return;
+	}
+
+	if (ret) {
+		if (!exit_vcpu)
+			exit_vcpu = kvm->vcpus[0];
+		kvm_make_request(KVM_REQ_EXIT_DIRTY_LOG_FULL, exit_vcpu);
+	}
+}
+
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
+{
+	struct kvm_memory_slot *memslot;
+	int as_id, id;
+
+	as_id = slot >> 16;
+	id = (u16)slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+		return;
+
+	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+	if (offset >= memslot->npages)
+		return;
+
+	spin_lock(&kvm->mmu_lock);
+	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
+	spin_unlock(&kvm->mmu_lock);
+
+	while (mask) {
+		clear_bit_le(offset + __ffs(mask), memslot->dirty_bitmap);
+		mask &= mask - 1;
+	}
+}
+
+static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
+{
+	int r;
+	u32 limit;
+
+	/* the size must be a power of two */
+	if (!size || (size & (size - 1)))
+		return -EINVAL;
+
+	limit = (size / sizeof(struct kvm_dirty_gfn)) -
+		DIRTY_RING_BUFFER_ENTRY_NUM;
+	r = kvm_gfn_ring_alloc(&kvm->dirty_ring, size, limit);
+	if (r)
+		return r;
+
+	kvm->dirty_ring_size = size;
+	return 0;
 }
 
 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
 {
-	return -EINVAL;
+	int i;
+	struct kvm_vcpu *vcpu;
+	int cleared = 0;
+
+	if (!kvm->dirty_ring_size)
+		return -EINVAL;
+
+	mutex_lock(&kvm->slots_lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		cleared += kvm_gfn_ring_reset(kvm, &vcpu->dirty_ring);
+
+	cleared += kvm_gfn_ring_reset(kvm, &kvm->dirty_ring);
+
+	mutex_unlock(&kvm->slots_lock);
+
+	if (cleared)
+		kvm_flush_remote_tlbs(kvm);
+
+	return cleared;
 }
 #endif
 
@@ -3202,6 +3372,29 @@ static long kvm_vm_compat_ioctl(struct file *filp,
 }
 #endif
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvm *kvm = vma->vm_file->private_data;
+	struct page *page;
+
+	page = kvm_gfn_ring_get_page(&kvm->dirty_ring, vmf->pgoff);
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_vm_vm_ops = {
+	.fault = kvm_vm_fault,
+};
+
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_vm_vm_ops;
+	return 0;
+}
+#endif
+
 static struct file_operations kvm_vm_fops = {
 	.release        = kvm_vm_release,
 	.unlocked_ioctl = kvm_vm_ioctl,
@@ -3209,6 +3402,9 @@ static struct file_operations kvm_vm_fops = {
 	.compat_ioctl   = kvm_vm_compat_ioctl,
 #endif
 	.llseek		= noop_llseek,
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+	.mmap           = kvm_vm_mmap,
+#endif
 };
 
 static int kvm_dev_ioctl_create_vm(unsigned long type)
-- 
2.5.0
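Usage note: putting the pieces together, a VMM's harvest pass reads
avail_index, consumes entries up to it, publishes fetch_index, and then
invokes the reset ioctl so kvm_vm_ioctl_reset_dirty_pages() can
write-protect the harvested pages. A hedged sketch using the hypothetical
structures from the top; the reset ioctl's name and number are defined
elsewhere in the series, so it is passed in here rather than invented:

  #include <stdio.h>
  #include <stdint.h>
  #include <sys/ioctl.h>

  static uint32_t fetch;  /* consumer cursor, persists across passes */

  static void harvest(struct kvm_dirty_ring *ring, uint32_t entries,
                      int vm_fd, unsigned long reset_ioctl)
  {
          uint32_t avail = __atomic_load_n(&ring->indices.avail_index,
                                           __ATOMIC_ACQUIRE);

          while (fetch != avail) {
                  struct kvm_dirty_gfn *e =
                          &ring->dirty_gfns[fetch & (entries - 1)];

                  printf("slot %u, page offset %llu\n", e->slot,
                         (unsigned long long)e->offset);
                  fetch++;
          }

          /* publish progress, then let the kernel reset protection */
          __atomic_store_n(&ring->indices.fetch_index, fetch,
                           __ATOMIC_RELEASE);
          ioctl(vm_fd, reset_ioctl, 0);
  }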