On Mon, Dec 13, 2021, David Matlack wrote:
> When dirty logging is enabled without initially-all-set, attempt to
> split all huge pages in the memslot down to 4KB pages so that vCPUs
> do not have to take expensive write-protection faults to split huge
> pages.
>
> Huge page splitting is best-effort only. This commit only adds the
> support for the TDP MMU, and even there splitting may fail due to out
> of memory conditions. Failures to split a huge page is fine from a
> correctness standpoint because we still always follow it up by write-
> protecting any remaining huge pages.
>
> Signed-off-by: David Matlack <dmatlack@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |   3 +
>  arch/x86/kvm/mmu/mmu.c          |  14 +++
>  arch/x86/kvm/mmu/spte.c         |  59 ++++++++++++
>  arch/x86/kvm/mmu/spte.h         |   1 +
>  arch/x86/kvm/mmu/tdp_mmu.c      | 165 ++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu/tdp_mmu.h      |   5 +
>  arch/x86/kvm/x86.c              |  10 ++
>  7 files changed, 257 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e863d569c89a..4a507109e886 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1573,6 +1573,9 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
>  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
>  				      const struct kvm_memory_slot *memslot,
>  				      int start_level);
> +void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
> +				       const struct kvm_memory_slot *memslot,
> +				       int target_level);
>  void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
>  				   const struct kvm_memory_slot *memslot);
>  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 3c2cb4dd1f11..9116c6a4ced1 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5807,6 +5807,20 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
>  		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
>  }
>
> +void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
> +				       const struct kvm_memory_slot *memslot,
> +				       int target_level)
> +{
> +	u64 start = memslot->base_gfn;
> +	u64 end = start + memslot->npages;
> +
> +	if (is_tdp_mmu_enabled(kvm)) {
> +		read_lock(&kvm->mmu_lock);
> +		kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
> +		read_unlock(&kvm->mmu_lock);
> +	}
> +}
> +
>  static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
>  					 struct kvm_rmap_head *rmap_head,
>  					 const struct kvm_memory_slot *slot)
> diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
> index fd34ae5d6940..11d0b3993ba5 100644
> --- a/arch/x86/kvm/mmu/spte.c
> +++ b/arch/x86/kvm/mmu/spte.c
> @@ -191,6 +191,65 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>  	return wrprot;
>  }
>
> +static u64 mark_spte_executable(u64 spte)
> +{
> +	bool is_access_track = is_access_track_spte(spte);
> +
> +	if (is_access_track)
> +		spte = restore_acc_track_spte(spte);
> +
> +	spte &= ~shadow_nx_mask;
> +	spte |= shadow_x_mask;
> +
> +	if (is_access_track)
> +		spte = mark_spte_for_access_track(spte);
> +
> +	return spte;
> +}
> +
> +/*
> + * Construct an SPTE that maps a sub-page of the given huge page SPTE where
> + * `index` identifies which sub-page.
> + *
> + * This is used during huge page splitting to build the SPTEs that make up the

Nit, to be consistent with the kernel, s/huge page/hugepage.

> + * new page table.
> + */
> +u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index, unsigned int access)

Newline.  Actually, just drop @access since there's exactly one caller that unconditionally passes ACC_ALL.
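I.e. the prototype and the lone caller (from the tdp_mmu.c hunk below) would simply become (untested, obviously):

	u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);

	child_spte = make_huge_page_split_spte(huge_spte, level, i);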
> +{
> +	u64 child_spte;
> +	int child_level;
> +
> +	if (WARN_ON(is_mmio_spte(huge_spte)))
> +		return 0;

Unnecessary, this is covered by is_shadow_present_pte().

> +
> +	if (WARN_ON(!is_shadow_present_pte(huge_spte)))
> +		return 0;
> +
> +	if (WARN_ON(!is_large_pte(huge_spte)))

Probably best to make these WARN_ON_ONCE, I gotta imagine if KVM screws up then this will flood dmesg.

> +		return 0;
> +
> +	child_spte = huge_spte;
> +	child_level = huge_level - 1;
> +
> +	/*
> +	 * The child_spte already has the base address of the huge page being
> +	 * split. So we just have to OR in the offset to the page at the next
> +	 * lower level for the given index.
> +	 */
> +	child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
> +
> +	if (child_level == PG_LEVEL_4K) {
> +		child_spte &= ~PT_PAGE_SIZE_MASK;
> +
> +		/* Allow execution for 4K pages if it was disabled for NX HugePages. */

Nit, this just reiterates the "what".  Even though the "why" is fairly obvious, maybe this instead?

		/*
		 * When splitting to a 4K page, make the new SPTE executable as
		 * the NX hugepage mitigation no longer applies.
		 */

> +		if (is_nx_huge_page_enabled() && access & ACC_EXEC_MASK)

Because the caller always passes @access=ACC_ALL, the "access & ACC_EXEC_MASK" part goes away (which addresses other good feedback about parentheses).

> +			child_spte = mark_spte_executable(child_spte);

Maybe s/mark/make?  KVM usually uses "mark" when making modifications for tracking purposes.  This is simply making a SPTE executable, there's no tracking involved.
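Folding all of the above together, the end result could look something like this (completely untested sketch, with make_spte_executable() being the renamed helper suggested above):

	u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
	{
		u64 child_spte;
		int child_level;

		if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)) ||
		    WARN_ON_ONCE(!is_large_pte(huge_spte)))
			return 0;

		child_spte = huge_spte;
		child_level = huge_level - 1;

		/*
		 * The child_spte already has the base address of the huge page
		 * being split, so just OR in the offset to the page at the next
		 * lower level for the given index.
		 */
		child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;

		if (child_level == PG_LEVEL_4K) {
			child_spte &= ~PT_PAGE_SIZE_MASK;

			/*
			 * When splitting to a 4K page, make the new SPTE executable
			 * as the NX hugepage mitigation no longer applies.
			 */
			if (is_nx_huge_page_enabled())
				child_spte = make_spte_executable(child_spte);
		}

		return child_spte;
	}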
> +	}
> +
> +	return child_spte;
> +}
> +
> +
>  u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
>  {
>  	u64 spte = SPTE_MMU_PRESENT_MASK;
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 9b0c7b27f23f..e13f335b4fef 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -334,6 +334,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>  	       unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
>  	       u64 old_spte, bool prefetch, bool can_unsync,
>  	       bool host_writable, u64 *new_spte);
> +u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index, unsigned int access);
>  u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
>  u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
>  u64 mark_spte_for_access_track(u64 spte);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index a8354d8578f1..be5eb74ac053 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1264,6 +1264,171 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
>  	return spte_set;
>  }
>
> +static struct kvm_mmu_page *alloc_tdp_mmu_page_from_kernel(gfp_t gfp)
> +{
> +	struct kvm_mmu_page *sp;
> +
> +	gfp |= __GFP_ZERO;
> +
> +	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
> +	if (!sp)
> +		return NULL;
> +
> +	sp->spt = (void *)__get_free_page(gfp);
> +	if (!sp->spt) {
> +		kmem_cache_free(mmu_page_header_cache, sp);
> +		return NULL;
> +	}
> +
> +	return sp;
> +}
> +
> +static struct kvm_mmu_page *alloc_tdp_mmu_page_for_split(struct kvm *kvm, bool *dropped_lock)
> +{
> +	struct kvm_mmu_page *sp;
> +
> +	lockdep_assert_held_read(&kvm->mmu_lock);
> +
> +	*dropped_lock = false;
> +
> +	/*
> +	 * Since we are allocating while under the MMU lock we have to be
> +	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
> +	 * reclaim and to avoid making any filesystem callbacks (which can end
> +	 * up invoking KVM MMU notifiers, resulting in a deadlock).
> +	 *
> +	 * If this allocation fails we drop the lock and retry with reclaim
> +	 * allowed.
> +	 */
> +	sp = alloc_tdp_mmu_page_from_kernel(GFP_NOWAIT | __GFP_ACCOUNT);
> +	if (sp)
> +		return sp;
> +
> +	rcu_read_unlock();
> +	read_unlock(&kvm->mmu_lock);
> +
> +	*dropped_lock = true;
> +
> +	sp = alloc_tdp_mmu_page_from_kernel(GFP_KERNEL_ACCOUNT);
> +
> +	read_lock(&kvm->mmu_lock);
> +	rcu_read_lock();
> +
> +	return sp;
> +}
> +
> +static bool

Please put the return type and attributes on the same line as the function name, splitting them makes grep sad.  And separating these rarely saves more than a line, e.g. even with conservative wrapping, this goes from 2=>3 lines.

static bool tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   struct kvm_mmu_page *sp)

And it doesn't save anything if we want to run over by a few chars, which IMO isn't worth it in this case, but that's not a sticking point.

static bool tdp_mmu_split_huge_page_atomic(struct kvm *kvm, struct tdp_iter *iter,
					   struct kvm_mmu_page *sp)

> +tdp_mmu_split_huge_page_atomic(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp)
> +{
> +	const u64 huge_spte = iter->old_spte;
> +	const int level = iter->level;
> +	u64 child_spte;
> +	int i;
> +
> +	init_child_tdp_mmu_page(sp, iter);
> +
> +	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
> +		child_spte = make_huge_page_split_spte(huge_spte, level, i, ACC_ALL);

No need for child_spte, not saving any chars versus setting sp->spt[i] directly.  And then if you hoist the comment above the for-loop you can drop the braces and shave a line from the comment:

	/*
	 * No need for atomics since child_sp has not been installed in the
	 * table yet and thus is not reachable by any other thread.
	 */
	for (i = 0; i < PT64_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i, ACC_ALL);

> +
> +		/*
> +		 * No need for atomics since child_sp has not been installed
> +		 * in the table yet and thus is not reachable by any other
> +		 * thread.
> +		 */
> +		sp->spt[i] = child_spte;
> +	}
> +
> +	if (!tdp_mmu_install_sp_atomic(kvm, iter, sp, false))
> +		return false;
> +
> +	/*
> +	 * tdp_mmu_install_sp_atomic will handle subtracting the split huge

Please add () when referencing functions, it helps readers visually identify that the comment refers to a different function.

> +	 * page from stats, but we have to manually update the new present child
> +	 * pages.
> +	 */
> +	kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
> +
> +	return true;
> +}
> +
> +static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, struct kvm_mmu_page *root,
> +					 gfn_t start, gfn_t end, int target_level)
> +{
> +	struct kvm_mmu_page *sp = NULL;
> +	struct tdp_iter iter;
> +	bool dropped_lock;
> +
> +	rcu_read_lock();
> +
> +	/*
> +	 * Traverse the page table splitting all huge pages above the target
> +	 * level into one lower level. For example, if we encounter a 1GB page
> +	 * we split it into 512 2MB pages.
> +	 *
> +	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
> +	 * to visit an SPTE before ever visiting its children, which means we
> +	 * will correctly recursively split huge pages that are more than one
> +	 * level above the target level (e.g. splitting 1GB to 2MB to 4KB).
> +	 */
> +	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
> +retry:
> +		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
> +			continue;
> +
> +		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
> +			continue;
> +
> +		if (!sp) {
> +			sp = alloc_tdp_mmu_page_for_split(kvm, &dropped_lock);
> +			if (!sp)
> +				return -ENOMEM;
> +
> +			if (dropped_lock) {
> +				tdp_iter_restart(&iter);

Ah, this needs to be rebased to play nice with commit 3a0f64de479c ("KVM: x86/mmu: Don't advance iterator after restart due to yielding").  With that in play, I think the best approach would be to drop dropped_lock (ha!) and just do:

			sp = alloc_tdp_mmu_page_for_split(kvm, &iter);
			if (!sp)
				return -ENOMEM;

			if (iter.yielded)
				continue;

> +				continue;
> +			}
> +		}
> +
> +		if (!tdp_mmu_split_huge_page_atomic(kvm, &iter, sp))
> +			goto retry;
> +
> +		sp = NULL;
> +	}
> +
> +	/*
> +	 * It's possible to exit the loop having never used the last sp if, for
> +	 * example, a vCPU doing HugePage NX splitting wins the race and
> +	 * installs its own sp in place of the last sp we tried to split.
> +	 */
> +	if (sp)
> +		tdp_mmu_free_sp(sp);
> +
> +	rcu_read_unlock();

Uber nit, an unused sp can be freed after dropping RCU.
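I.e., just shuffling the existing lines (untested):

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return 0;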
> +
> +	return 0;
> +}
> +
> +int kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
> +				     const struct kvm_memory_slot *slot,
> +				     gfn_t start, gfn_t end,
> +				     int target_level)
> +{
> +	struct kvm_mmu_page *root;
> +	int r = 0;
> +
> +	lockdep_assert_held_read(&kvm->mmu_lock);
> +
> +	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) {
> +		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level);
> +		if (r) {
> +			kvm_tdp_mmu_put_root(kvm, root, true);
> +			break;
> +		}
> +	}
> +
> +	return r;
> +}
> +
>  /*
>   * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
>   * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 3899004a5d91..3557a7fcf927 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -71,6 +71,11 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
>  				   struct kvm_memory_slot *slot, gfn_t gfn,
>  				   int min_level);
>
> +int kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
> +				     const struct kvm_memory_slot *slot,
> +				     gfn_t start, gfn_t end,
> +				     int target_level);
> +
>  static inline void kvm_tdp_mmu_walk_lockless_begin(void)
>  {
>  	rcu_read_lock();
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 85127b3e3690..fb5592bf2eee 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -187,6 +187,9 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
>  int __read_mostly pi_inject_timer = -1;
>  module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
>
> +static bool __read_mostly eagerly_split_huge_pages_for_dirty_logging = true;
> +module_param(eagerly_split_huge_pages_for_dirty_logging, bool, 0644);

Heh, can we use a shorter name for the module param?  There's 0% chance I'll ever type that correctly.  Maybe eager_hugepage_splitting?  Though even that is a bit too long for my tastes.

This should also be documented somewhere, which is where we can/should explain exactly what the param does instead of trying to shove all that info into the name.
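E.g. something along these lines, where the name is purely a strawman and the actual explanation of the behavior lives in the documentation for KVM's module params rather than in the identifier:

	static bool __read_mostly eager_hugepage_splitting = true;
	module_param(eager_hugepage_splitting, bool, 0644);

and at the existing check in kvm_mmu_slot_apply_flags():

	if (READ_ONCE(eager_hugepage_splitting))
		kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);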
> +
>  /*
>   * Restoring the host value for MSRs that are only consumed when running in
>   * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
> @@ -11837,6 +11840,13 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
>  		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
>  			return;
>
> +		/*
> +		 * Attempt to split all large pages into 4K pages so that vCPUs
> +		 * do not have to take write-protection faults.
> +		 */
> +		if (READ_ONCE(eagerly_split_huge_pages_for_dirty_logging))
> +			kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
> +
>  		if (kvm_x86_ops.cpu_dirty_log_size) {
>  			kvm_mmu_slot_leaf_clear_dirty(kvm, new);
>  			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
> --
> 2.34.1.173.g76aa8bc2d0-goog
>