On Sat, 25 Feb 2023 20:47:57 +0000,
Shivam Kumar <shivam.kumar1@xxxxxxxxxxx> wrote:
> 
> Define dirty_quota_bytes variable to track and throttle memory
> dirtying for every vcpu. This variable stores the number of bytes the
> vcpu is allowed to dirty. To dirty more, the vcpu needs to request
> more quota by exiting to userspace.
> 
> Implement update_dirty_quota function which
> 
> i) Decreases dirty_quota_bytes by arch-specific page size whenever a
> page is dirtied.
> ii) Raises a KVM request KVM_REQ_DIRTY_QUOTA_EXIT whenever the dirty
> quota is exhausted (i.e. dirty_quota_bytes <= 0).
> 
> Suggested-by: Shaju Abraham <shaju.abraham@xxxxxxxxxxx>
> Suggested-by: Manish Mishra <manish.mishra@xxxxxxxxxxx>
> Co-developed-by: Anurag Madnawat <anurag.madnawat@xxxxxxxxxxx>
> Signed-off-by: Anurag Madnawat <anurag.madnawat@xxxxxxxxxxx>
> Signed-off-by: Shivam Kumar <shivam.kumar1@xxxxxxxxxxx>
> ---
>  Documentation/virt/kvm/api.rst | 17 +++++++++++++++++
>  include/linux/kvm_host.h       |  5 +++++
>  include/uapi/linux/kvm.h       |  8 ++++++++
>  tools/include/uapi/linux/kvm.h |  1 +
>  virt/kvm/Kconfig               |  3 +++
>  virt/kvm/kvm_main.c            | 31 +++++++++++++++++++++++++++++++
>  6 files changed, 65 insertions(+)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 62de0768d6aa..3a283fe212d8 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6688,6 +6688,23 @@ Please note that the kernel is allowed to use the kvm_run structure as the
>  primary storage for certain register types. Therefore, the kernel may use the
>  values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.
>  
> +::
> +
> +  /*
> +   * Number of bytes the vCPU is allowed to dirty if KVM_CAP_DIRTY_QUOTA is
> +   * enabled. KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if this quota
> +   * is exhausted, i.e. dirty_quota_bytes <= 0.
> +   */
> +  long dirty_quota_bytes;
> +
> +Please note that enforcing the quota is best effort. Dirty quota is reduced by
> +arch-specific page size when any guest page is dirtied. Also, the guest may dirty
> +multiple pages before KVM can recheck the quota.

What are the events that trigger such quota reduction?
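It would also help if the document spelled out the expected userspace
flow. Something like the sketch below is what I'd assume, though the
refill policy, run_vcpu() and refill_bytes are entirely made up for
illustration:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Illustrative only: vcpu_fd and run come from the usual
 * KVM_CREATE_VCPU + KVM_GET_VCPU_MMAP_SIZE + mmap() setup.
 */
static void run_vcpu(int vcpu_fd, struct kvm_run *run, long refill_bytes)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			break;

		switch (run->exit_reason) {
		case KVM_EXIT_DIRTY_QUOTA_EXHAUSTED:
			/*
			 * Enforcement is best effort, so the quota may
			 * already be slightly negative here; reset it
			 * rather than adding to it.
			 */
			run->dirty_quota_bytes = refill_bytes;
			break;
		default:
			/* handle the other exit reasons as usual */
			break;
		}
	}
}

If that matches your intent, documenting exactly which write paths
consume the quota is what lets userspace pick a sane refill value.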
> +
> +::
> +  };
> +
> +
> 
>  6. Capabilities that can be enabled on vCPUs
>  ============================================
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 8ada23756b0e..f5ce343c64f2 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -167,6 +167,7 @@ static inline bool is_error_page(struct page *page)
>  #define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_UNBLOCK 2
>  #define KVM_REQ_DIRTY_RING_SOFT_FULL 3
> +#define KVM_REQ_DIRTY_QUOTA_EXIT 4
>  #define KVM_REQUEST_ARCH_BASE 8
> 
>  /*
> @@ -800,6 +801,9 @@ struct kvm {
>  	bool dirty_ring_with_bitmap;
>  	bool vm_bugged;
>  	bool vm_dead;
> +#ifdef CONFIG_HAVE_KVM_DIRTY_QUOTA
> +	bool dirty_quota_enabled;
> +#endif
> 
>  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
>  	struct notifier_block pm_notifier;
> @@ -1235,6 +1239,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
>  bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
>  bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
>  unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
> +void update_dirty_quota(struct kvm *kvm, unsigned long page_size_bytes);
>  void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn);
>  void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
> 
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index d77aef872a0a..ddb9d3d797c4 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -264,6 +264,7 @@ struct kvm_xen_exit {
>  #define KVM_EXIT_RISCV_SBI 35
>  #define KVM_EXIT_RISCV_CSR 36
>  #define KVM_EXIT_NOTIFY 37
> +#define KVM_EXIT_DIRTY_QUOTA_EXHAUSTED 38
> 
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -526,6 +527,12 @@ struct kvm_run {
>  		struct kvm_sync_regs regs;
>  		char padding[SYNC_REGS_SIZE_BYTES];
>  	} s;
> +	/*
> +	 * Number of bytes the vCPU is allowed to dirty if KVM_CAP_DIRTY_QUOTA is
> +	 * enabled. KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if this quota
> +	 * is exhausted, i.e. dirty_quota_bytes <= 0.
> +	 */
> +	long dirty_quota_bytes;
>  };
> 
>  /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
> @@ -1184,6 +1191,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
>  #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
>  #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
> +#define KVM_CAP_DIRTY_QUOTA 227
> 
>  #ifdef KVM_CAP_IRQ_ROUTING
> 
> diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
> index 55155e262646..48f236e2b836 100644
> --- a/tools/include/uapi/linux/kvm.h
> +++ b/tools/include/uapi/linux/kvm.h
> @@ -1175,6 +1175,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
>  #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
>  #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
> +#define KVM_CAP_DIRTY_QUOTA 227
> 
>  #ifdef KVM_CAP_IRQ_ROUTING
> 
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index b74916de5183..ccaa332d88f9 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -19,6 +19,9 @@ config HAVE_KVM_IRQ_ROUTING
>  config HAVE_KVM_DIRTY_RING
>  	bool
> 
> +config HAVE_KVM_DIRTY_QUOTA
> +	bool
> +
>  # Only strongly ordered architectures can select this, as it doesn't
>  # put any explicit constraint on userspace ordering. They can also
>  # select the _ACQ_REL version.
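For anyone reading along without the full series in front of them: the
update_dirty_quota() helper wired up below isn't visible in the quoted
hunks, but from the commit message I take it to boil down to something
like this (my reconstruction, not the patch text):

void update_dirty_quota(struct kvm *kvm, unsigned long page_size_bytes)
{
	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

	/* Writes outside of vcpu context have nobody to charge. */
	if (!vcpu || !READ_ONCE(kvm->dirty_quota_enabled))
		return;

	vcpu->run->dirty_quota_bytes -= page_size_bytes;
	if (vcpu->run->dirty_quota_bytes <= 0)
		kvm_make_request(KVM_REQ_DIRTY_QUOTA_EXIT, vcpu);
}

If that reconstruction is accurate, the behaviour for writes that do
not originate from a vcpu thread deserves to be spelled out, which
leads into my questions on the hunk below.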
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d255964ec331..744b955514ce 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3096,6 +3096,9 @@ static int __kvm_write_guest_page(struct kvm *kvm,
>  	r = __copy_to_user((void __user *)addr + offset, data, len);
>  	if (r)
>  		return -EFAULT;
> +#ifdef CONFIG_HAVE_KVM_DIRTY_QUOTA
> +	update_dirty_quota(kvm, PAGE_SIZE);
> +#endif

Why PAGE_SIZE? Why not 'len'? Why if the page was already dirtied? Why
should it be accounted for multiple times?

In most cases, this is the *hypervisor* writing to the guest, not the
vcpu. Why should this be accounted to the vcpu quota?

	M.

-- 
Without deviation from the norm, progress is not possible.