Add support for capabilities that can be enabled in a generic way. Introduce new capability: ring-based dirty memory logging Signed-off-by: Lei Cao <lei.cao@xxxxxxxxxxx> --- Documentation/virtual/kvm/api.txt | 99 +++++++++++++++++++++++++++++++++++++-- arch/powerpc/kvm/powerpc.c | 14 +----- arch/s390/kvm/kvm-s390.c | 11 +---- arch/x86/kvm/x86.c | 14 +----- include/linux/kvm_host.h | 2 + include/uapi/linux/kvm.h | 1 + virt/kvm/kvm_main.c | 32 +++++++++++++ 7 files changed, 135 insertions(+), 38 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 03145b7..4335190 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1006,10 +1006,15 @@ documentation when it pops into existence). 4.37 KVM_ENABLE_CAP -Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM -Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM), - mips (only KVM_CAP_ENABLE_CAP), ppc, s390 -Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM) +Capability: KVM_CAP_ENABLE_CAP +Architectures: mips, ppc, s390 +Type: vcpu ioctl +Parameters: struct kvm_enable_cap (in) +Returns: 0 on success; -1 on error + +Capability: KVM_CAP_ENABLE_CAP_VM +Architectures: all +Type: vm ioctl Parameters: struct kvm_enable_cap (in) Returns: 0 on success; -1 on error @@ -3942,3 +3947,89 @@ In order to use SynIC, it has to be activated by setting this capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this will disable the use of APIC hardware virtualization even if supported by the CPU, as it's incompatible with SynIC auto-EOI behavior. + +8.3 KVM_CAP_DIRTY_LOG_RING + +Architectures: x86 +Parameters: args[0] - size of the dirty log ring + +Kernel is capable of tracking dirty memory using rings, which +are stored in memory regions that can be mmapped into userspace. + +There is one dirty ring per vcpu and one global ring. + +The dirty ring has the following structure. 
+ +struct kvm_dirty_gfn { + __u32 pad; + __u32 slot; /* as_id | slot_id */ + __u64 offset; +}; + +struct kvm_dirty_ring { + union { + struct { + __u16 avail_index; /* set by kernel */ + __u16 fetch_index; /* set by userspace */ + } indices; + struct kvm_dirty_gfn dirty_gfns[0]; + }; +}; + +The two indices in the ring buffer are free-running counters. +They are _not_ limited to the range 0..size-1 where "size" is +the number of elements in the ring buffer. This makes it simpler +to compute the number of entries in the ring buffer, which is +simply (u16)(ring->avail_index - ring->fetch_index). + +In pseudocode, processing the ring buffer looks like this: + + idx = load-acquire(&ring->fetch_index); + while (idx != ring->avail_index) { + struct kvm_dirty_gfn *entry; + entry = &ring->dirty_gfns[idx & (size - 1)]; + ... + + idx++; + } + ring->fetch_index = idx; + +Userspace calls KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM +ioctl to enable this capability for the new guest and set the +size of the rings. The size of the ring must be a power of two. +The larger the ring, the less likely the ring is full and the VM +is forced to exit to userspace. The optimal size depends on the +workload, but it is recommended that it be at least 64 KiB (4096 +entries). + +After the capability is enabled, userspace mmaps the global +dirty ring. The per-vcpu dirty ring is mmapped along with kvm_run +when the vcpu is created. The per-vcpu dirty ring is located at offset +KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE of the memory mapped region. + +To enable dirty logging with ring, userspace calls +KVM_SET_USER_MEMORY_REGION ioctls on all the user memory regions +with KVM_MEM_LOG_DIRTY_PAGES bit set. + +To disable dirty logging with ring, userspace calls +KVM_SET_USER_MEMORY_REGION ioctls on all the user memory regions +with KVM_MEM_LOG_DIRTY_PAGES bit clear. + +Once the dirty logging is enabled, userspace can start harvesting +dirty pages. 
+ +To harvest the dirty pages, userspace accesses the mmapped dirty +list to read the dirty GFNs up to avail_index and sets the +fetch_index accordingly. Harvesting can be done when the guest is +running or paused. Dirty pages don't need to be harvested all at +once. + +To rearm the dirty traps, userspace calls KVM_RESET_DIRTY_PAGES +ioctl. This should be done only when the guest is paused and +all the dirty pages have been harvested. + +If one of the dirty lists is full, the guest will exit to userspace +with the exit reason set to KVM_EXIT_DIRTY_LOG_FULL, and the +KVM_RUN ioctl will return -EINTR. Once that happens, userspace +should pause all the vcpus, then harvest all the dirty pages and +rearm the dirty traps. It can unpause the guest after that. diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cd892de..0edae1b 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -507,7 +507,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_ONE_REG: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: @@ -1358,8 +1357,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, - struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) { int r; @@ -1412,15 +1411,6 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } - case KVM_ENABLE_CAP: - { - struct kvm_enable_cap cap; - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CREATE_SPAPR_TCE_64: { struct kvm_create_spapr_tce_64 create_tce_64; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6484a25..3192e52 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -366,7 +366,6 @@ int 
kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: @@ -480,7 +479,7 @@ static void icpt_operexc_on_all_vcpus(struct kvm *kvm) } } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; @@ -1232,14 +1231,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_s390_inject_vm(kvm, &s390int); break; } - case KVM_ENABLE_CAP: { - struct kvm_enable_cap cap; - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - break; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } case KVM_CREATE_IRQCHIP: { struct kvm_irq_routing_entry routing; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e52c908..240fb75 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2629,7 +2629,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: @@ -3869,8 +3868,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, return 0; } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, - struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) { int r; @@ -4177,15 +4176,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } - case KVM_ENABLE_CAP: { - struct kvm_enable_cap cap; - - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } default: r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1c5190d..33d9974 100644 --- 
a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -718,6 +718,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status); +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cac48ed..117f1f9 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -871,6 +871,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 #define KVM_CAP_PPC_HTM 132 +#define KVM_CAP_DIRTY_LOG_RING 133 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 482612b..f2744ce 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2927,6 +2927,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: + case KVM_CAP_ENABLE_CAP_VM: return 1; #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: @@ -2944,6 +2945,28 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return kvm_vm_ioctl_check_extension(kvm, arg); } +static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, __u32 size) +{ + return -EINVAL; +} + +int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + return -EINVAL; +} + +static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + switch (cap->cap) { + case KVM_CAP_DIRTY_LOG_RING: + return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); + default: + return kvm_vm_ioctl_enable_cap(kvm, cap); + } +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2957,6 +2980,15 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_CREATE_VCPU: r = 
kvm_vm_ioctl_create_vcpu(kvm, arg); break; + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); + break; + } case KVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region kvm_userspace_mem; -- 2.5.0