Signed-off-by: Cao, Lei <Lei.Cao@xxxxxxxxxxx>
Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
---
 Documentation/virtual/kvm/api.txt | 96 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h          | 33 ++++++++++++++
 virt/kvm/kvm_main.c               | 36 ++++++++++++++-
 3 files changed, 164 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index fc7fd75..4b82452 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -168,6 +168,7 @@ Based on their initialization different VMs may have different capabilities.
 It is thus encouraged to use the vm ioctl to query for capabilities (available
 with KVM_CAP_CHECK_EXTENSION_VM on the vm fd)
 
+
 4.5 KVM_GET_VCPU_MMAP_SIZE
 
 Capability: basic
@@ -180,6 +181,18 @@ The KVM_RUN ioctl (cf.) communicates with userspace via a shared memory
 region.  This ioctl returns the size of that region.  See the KVM_RUN
 documentation for details.
 
+Besides the size of the KVM_RUN communication region, other areas of
+the VCPU file descriptor can be mmapped, including:
+
+- if KVM_CAP_COALESCED_MMIO is available, a page at
+  KVM_COALESCED_MMIO_PAGE_OFFSET * PAGE_SIZE; for historical reasons,
+  this page is included in the result of KVM_GET_VCPU_MMAP_SIZE.
+  KVM_CAP_COALESCED_MMIO is not documented yet.
+
+- if KVM_CAP_DIRTY_LOG_RING is available, a number of pages at
+  KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE.  For more information on
+  KVM_CAP_DIRTY_LOG_RING, see section 8.14.
+
 4.6 KVM_SET_MEMORY_REGION
 
@@ -4374,3 +4387,86 @@ Parameters: none
 This capability indicates if the flic device will be able to get/set the
 AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows
 to discover this without having to create a flic device.
+
+8.14 KVM_CAP_DIRTY_LOG_RING
+
+Architectures: x86
+Parameters: args[0] - size of the dirty log ring
+
+The kernel is capable of tracking dirty memory using ring buffers,
+which are stored in memory regions that can be mmapped into userspace.
+
+There is one dirty ring per vcpu and one global ring.
+
+The dirty ring has the following structure:
+
+struct kvm_dirty_gfn {
+        __u32 pad;
+        __u32 slot; /* as_id | slot_id */
+        __u64 offset;
+};
+
+struct kvm_dirty_ring {
+        union {
+                struct {
+                        __u16 avail_index; /* set by kernel */
+                        __u16 fetch_index; /* set by userspace */
+                } indices;
+                struct kvm_dirty_gfn dirty_gfns[0];
+        };
+};
+
+The two indices in the ring buffer are free running counters.
+They are _not_ limited to the range 0..size-1, where "size" is
+the number of elements in the ring buffer.  This makes it easy
+to compute the number of entries in the ring buffer, which is
+simply (u16)(ring->avail_index - ring->fetch_index).
+
+In pseudocode, processing the ring buffer looks like this:
+
+        idx = load-acquire(&ring->fetch_index);
+        while (idx != ring->avail_index) {
+                struct kvm_dirty_gfn *entry;
+                entry = &ring->dirty_gfns[idx & (size - 1)];
+                ...
+
+                idx++;
+        }
+        ring->fetch_index = idx;
+
+Userspace calls the KVM_ENABLE_CAP ioctl right after the KVM_CREATE_VM
+ioctl to enable this capability for the new guest and to set the size
+of the rings.  The size of the ring must be a power of two.  The larger
+the ring buffer, the less likely it is that the ring fills up and the
+VM is forced to exit to userspace.  The optimal size depends on the
+workload, but it is recommended that it be at least 64 KiB (4096
+entries).
+
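+For illustration only, enabling the rings could look like the sketch
+below; error handling is trimmed, and vm_fd merely stands for the file
+descriptor returned by KVM_CREATE_VM (it is not part of the interface):
+
+        struct kvm_enable_cap cap = {
+                .cap = KVM_CAP_DIRTY_LOG_RING,
+                .args[0] = 65536,       /* 64 KiB, the recommended minimum */
+        };
+
+        if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
+                perror("KVM_ENABLE_CAP(KVM_CAP_DIRTY_LOG_RING)");
+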
+After the capability is enabled, userspace mmaps the global ring
+buffer from the VM file descriptor.  The per-vcpu dirty ring is
+instead mmapped when the vcpu is created, similar to the kvm_run
+struct.  The per-vcpu dirty ring is located at offset
+KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE of the memory mapped region.
+
+To enable the dirty logging ring buffer, userspace calls the
+KVM_SET_USER_MEMORY_REGION ioctl on all the user memory regions with
+the KVM_MEM_LOG_DIRTY_PAGES bit set.
+
+To disable the dirty logging ring buffer, userspace calls the
+KVM_SET_USER_MEMORY_REGION ioctl on all the user memory regions with
+the KVM_MEM_LOG_DIRTY_PAGES bit cleared.
+
+Once dirty logging is enabled, userspace can start harvesting
+dirty pages.
+
+To harvest the dirty pages, userspace accesses the mmapped ring
+buffer to read the dirty GFNs up to avail_index, and sets fetch_index
+accordingly.  Harvesting can be done while the guest is running or
+paused, and dirty pages do not need to be harvested all at once.  To
+rearm the dirty traps, userspace calls the VM ioctl
+KVM_RESET_DIRTY_PAGES.
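+
+The following fragment is a purely illustrative variant of the
+pseudocode above.  vm_fd, vcpu_fd, ring_size and page_size are not
+part of the interface: they stand for the VM and vcpu file
+descriptors, the size passed to KVM_ENABLE_CAP, and the host page
+size.  The mapping length and the record_dirty_page() helper are
+likewise only assumptions of the sketch:
+
+        struct kvm_dirty_ring *ring;
+        uint32_t nents = ring_size / sizeof(struct kvm_dirty_gfn);
+        uint16_t idx;
+
+        ring = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                    vcpu_fd, KVM_DIRTY_LOG_PAGE_OFFSET * page_size);
+
+        idx = ring->indices.fetch_index;
+        while (idx != __atomic_load_n(&ring->indices.avail_index,
+                                      __ATOMIC_ACQUIRE)) {
+                struct kvm_dirty_gfn *e = &ring->dirty_gfns[idx & (nents - 1)];
+
+                /* e->slot is (as_id | slot_id), e->offset the gfn in the slot */
+                record_dirty_page(e->slot, e->offset);
+                idx++;
+        }
+        __atomic_store_n(&ring->indices.fetch_index, idx, __ATOMIC_RELEASE);
+
+        /* rearm the dirty traps for the harvested pages */
+        ioctl(vm_fd, KVM_RESET_DIRTY_PAGES, 0);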
+
+If one of the dirty rings is full, the guest will exit to userspace
+with the exit reason set to KVM_EXIT_DIRTY_LOG_FULL, and the KVM_RUN
+ioctl will return -EINTR.  Once that happens, userspace should pause
+all the vcpus, then harvest all the dirty pages and rearm the dirty
+traps.  It can unpause the guest after that.
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 496e59a..903a016 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -235,6 +235,7 @@ struct kvm_hyperv_exit {
 #define KVM_EXIT_S390_STSI        25
 #define KVM_EXIT_IOAPIC_EOI       26
 #define KVM_EXIT_HYPERV           27
+#define KVM_EXIT_DIRTY_LOG_FULL   28
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -932,6 +933,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_HYPERV_SYNIC2 148
 #define KVM_CAP_HYPERV_VP_INDEX 149
 #define KVM_CAP_S390_AIS_MIGRATION 150
+#define KVM_CAP_DIRTY_LOG_RING 151
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1358,6 +1360,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_S390_CMMA_MIGRATION */
 #define KVM_S390_GET_CMMA_BITS      _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log)
 #define KVM_S390_SET_CMMA_BITS      _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
+/* Available with KVM_CAP_DIRTY_LOG_RING */
+#define KVM_RESET_DIRTY_PAGES       _IO(KVMIO, 0xba)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3      (1 << 1)
@@ -1419,4 +1423,33 @@ struct kvm_assigned_msix_entry {
 #define KVM_ARM_DEV_EL1_PTIMER      (1 << 1)
 #define KVM_ARM_DEV_PMU             (1 << 2)
 
+/*
+ * The following are the requirements for supporting dirty log ring
+ * (by enabling KVM_DIRTY_LOG_PAGE_OFFSET).
+ *
+ * 1. Memory accesses by KVM should call kvm_vcpu_write_* instead
+ *    of kvm_write_* so that the global dirty ring is not filled up
+ *    too quickly.
+ * 2. kvm_arch_mmu_enable_log_dirty_pt_masked should be defined for
+ *    enabling dirty logging.
+ * 3. There should not be a separate step to synchronize hardware
+ *    dirty bitmap with KVM's.
+ */
+
+struct kvm_dirty_gfn {
+        __u32 pad;
+        __u32 slot;
+        __u64 offset;
+};
+
+struct kvm_dirty_ring {
+        union {
+                struct {
+                        __u16 avail_index; /* set by kernel */
+                        __u16 fetch_index; /* set by userspace */
+                } indices;
+                struct kvm_dirty_gfn dirty_gfns[0];
+        };
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6e4d71c..bdccaf8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2945,6 +2945,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
         case KVM_CAP_MULTI_ADDRESS_SPACE:
                 return KVM_ADDRESS_SPACE_NUM;
 #endif
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+        case KVM_CAP_DIRTY_LOG_RING:
+                return KVM_DIRTY_LOG_PAGE_OFFSET;
+#endif
         case KVM_CAP_MAX_VCPU_ID:
                 return KVM_MAX_VCPU_ID;
         default:
@@ -2953,12 +2957,37 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
         return kvm_vm_ioctl_check_extension(kvm, arg);
 }
 
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
+{
+        return -EINVAL;
+}
+
+static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
+{
+        return -EINVAL;
+}
+#endif
+
 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                                                   struct kvm_enable_cap *cap)
 {
         return -EINVAL;
 }
 
+static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
+                                           struct kvm_enable_cap *cap)
+{
+        switch (cap->cap) {
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+        case KVM_CAP_DIRTY_LOG_RING:
+                return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
+#endif
+        default:
+                return kvm_vm_ioctl_enable_cap(kvm, cap);
+        }
+}
+
 static long kvm_vm_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
 {
@@ -2978,7 +3007,7 @@ static long kvm_vm_ioctl(struct file *filp,
                 r = -EFAULT;
                 if (copy_from_user(&cap, argp, sizeof(cap)))
                         goto out;
-                r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+                r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
                 break;
         }
         case KVM_SET_USER_MEMORY_REGION: {
@@ -3129,6 +3158,11 @@ static long kvm_vm_ioctl(struct file *filp,
         case KVM_CHECK_EXTENSION:
                 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
                 break;
+#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
+        case KVM_RESET_DIRTY_PAGES:
+                r = kvm_vm_ioctl_reset_dirty_pages(kvm);
+                break;
+#endif /* KVM_DIRTY_LOG_PAGE_OFFSET */
         default:
                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
         }
-- 
1.8.3.4
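
A brief usage note, not part of the patch itself: because
kvm_vm_ioctl_check_extension_generic() above returns
KVM_DIRTY_LOG_PAGE_OFFSET for this capability, userspace can both
probe for the feature and learn the mmap page offset with a single
KVM_CHECK_EXTENSION call.  In the sketch below, vm_fd is assumed to be
a VM file descriptor and page_size the host page size:

        /* > 0 means the ring is supported; the value is the mmap page offset */
        int offset = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DIRTY_LOG_RING);

        if (offset <= 0)
                fprintf(stderr, "dirty log ring not supported\n");
        /* otherwise, the per-vcpu ring is mmapped at offset * page_size */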