On Thu, Jun 01, 2023 at 05:58:59PM -0700, Sean Christopherson wrote: > Add a "never" option to the nx_huge_pages module param to allow userspace > to do a one-way hard disabling of the mitigation, and don't create the > per-VM recovery threads when the mitigation is hard disabled. Letting > userspace pinky swear that userspace doesn't want to enable NX mitigation > (without reloading KVM) allows certain use cases to avoid the latency > problems associated with spawning a kthread for each VM. > > E.g. in FaaS use cases, the guest kernel is trusted and the host may > create 100+ VMs per logical CPU, which can result in 100ms+ latencies when > a burst of VMs is created. Tested-by: Luiz Capitulino <luizcap@xxxxxxxxxx> Without this patch I can see the 100ms+ latencies on KVM_CREATE_VM even with a single VM. Just run a VM with strace -T and grep for KVM_CREATE_VM. When using kvmtool I get (latency in seconds - kernel HEAD is a4d7d70112): ioctl(3, KVM_CREATE_VM, 0) = 4 <0.023567> ioctl(3, KVM_CREATE_VM, 0) = 4 <0.076709> ioctl(3, KVM_CREATE_VM, 0) = 4 <0.109109> With this patch and nx_huge_pages=never: ioctl(3, KVM_CREATE_VM, 0) = 4 <0.000518> ioctl(3, KVM_CREATE_VM, 0) = 4 <0.000495> ioctl(3, KVM_CREATE_VM, 0) = 4 <0.000513> Now, I had already debugged the single-VM case before seeing this patch, and the latency can be avoided by building the kernel with CONFIG_CGROUP_FAVOR_DYNMODS=y or mounting the cgroup v2 filesystem with the favordynmods mount option. This is because the high latency comes from a call to cgroup_attach_task_all() in: kvm_vm_worker_thread() cgroup_attach_task_all() percpu_down_write(&cgroup_threadgroup_rwsem) /* calls synchronize_rcu() */ This happens while kvm_vm_create_worker_thread() is waiting on a completion. See commit 6a010a49b63a for more information. This patch is preferable because the favordynmods solution has a trade-off. However, why don't we make nx_huge_pages=never the default behavior if the CPU is not vulnerable? 
If there are concerns about not being able to restart the worker thread, then maybe we could make this a .config option? - Luiz > > Reported-by: Li RongQing <lirongqing@xxxxxxxxx> > Closes: https://lore.kernel.org/all/1679555884-32544-1-git-send-email-lirongqing@xxxxxxxxx > Cc: Yong He <zhuangel570@xxxxxxxxx> > Cc: Robert Hoo <robert.hoo.linux@xxxxxxxxx> > Cc: Kai Huang <kai.huang@xxxxxxxxx> > Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx> > --- > arch/x86/kvm/mmu/mmu.c | 41 ++++++++++++++++++++++++++++++++++++----- > 1 file changed, 36 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index c8961f45e3b1..2ed38916b904 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -58,6 +58,8 @@ > > extern bool itlb_multihit_kvm_mitigation; > > +static bool nx_hugepage_mitigation_hard_disabled; > + > int __read_mostly nx_huge_pages = -1; > static uint __read_mostly nx_huge_pages_recovery_period_ms; > #ifdef CONFIG_PREEMPT_RT > @@ -67,12 +69,13 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; > static uint __read_mostly nx_huge_pages_recovery_ratio = 60; > #endif > > +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp); > static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); > static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); > > static const struct kernel_param_ops nx_huge_pages_ops = { > .set = set_nx_huge_pages, > - .get = param_get_bool, > + .get = get_nx_huge_pages, > }; > > static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { > @@ -6844,6 +6847,14 @@ static void mmu_destroy_caches(void) > kmem_cache_destroy(mmu_page_header_cache); > } > > +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) > +{ > + if (nx_hugepage_mitigation_hard_disabled) > + return sprintf(buffer, "never\n"); > + > + return param_get_bool(buffer, kp); > +} > + > static bool 
get_nx_auto_mode(void) > { > /* Return true when CPU has the bug, and mitigations are ON */ > @@ -6860,15 +6871,29 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) > bool old_val = nx_huge_pages; > bool new_val; > > + if (nx_hugepage_mitigation_hard_disabled) > + return -EPERM; > + > /* In "auto" mode deploy workaround only if CPU has the bug. */ > - if (sysfs_streq(val, "off")) > + if (sysfs_streq(val, "off")) { > new_val = 0; > - else if (sysfs_streq(val, "force")) > + } else if (sysfs_streq(val, "force")) { > new_val = 1; > - else if (sysfs_streq(val, "auto")) > + } else if (sysfs_streq(val, "auto")) { > new_val = get_nx_auto_mode(); > - else if (kstrtobool(val, &new_val) < 0) > + } else if (sysfs_streq(val, "never")) { > + new_val = 0; > + > + mutex_lock(&kvm_lock); > + if (!list_empty(&vm_list)) { > + mutex_unlock(&kvm_lock); > + return -EBUSY; > + } > + nx_hugepage_mitigation_hard_disabled = true; > + mutex_unlock(&kvm_lock); > + } else if (kstrtobool(val, &new_val) < 0) { > return -EINVAL; > + } > > __set_nx_huge_pages(new_val); > > @@ -7006,6 +7031,9 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel > uint old_period, new_period; > int err; > > + if (nx_hugepage_mitigation_hard_disabled) > + return -EPERM; > + > was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); > > err = param_set_uint(val, kp); > @@ -7161,6 +7189,9 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) > { > int err; > > + if (nx_hugepage_mitigation_hard_disabled) > + return 0; > + > err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, > "kvm-nx-lpage-recovery", > &kvm->arch.nx_huge_page_recovery_thread); > > base-commit: 39428f6ea9eace95011681628717062ff7f5eb5f > -- > 2.41.0.rc2.161.g9c6817b8e7-goog >