Adding everyone that's cc'd on
[kvm PATCH 1/2] mm: export __vmalloc_node_range().
Thanks,
Marc

On Sat, Oct 20, 2018 at 5:13 PM Marc Orr <marcorr@xxxxxxxxxx> wrote:
>
> Previously, vcpus were allocated through the kmem_cache_zalloc() API,
> which requires the underlying physical memory to be contiguous.
> Because the x86 vcpu struct, struct vcpu_vmx, is relatively large
> (e.g., currently 47680 bytes on my setup), it can become hard to find
> contiguous memory.
>
> At the same time, the comments in the code indicate that the primary
> reason for using the kmem_cache_zalloc() API is to align the memory
> rather than to provide physical contiguity.
>
> Thus, this patch updates the vcpu allocation logic for vmx to use the
> vmalloc() API.
>
> Signed-off-by: Marc Orr <marcorr@xxxxxxxxxx>
> ---
>  arch/x86/kvm/vmx.c  | 98 +++++++++++++++++++++++++++++++++++++++++----
>  virt/kvm/kvm_main.c | 28 +++++++------
>  2 files changed, 106 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index abeeb45d1c33..d480a2cc0667 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -898,7 +898,14 @@ struct nested_vmx {
>  #define POSTED_INTR_ON  0
>  #define POSTED_INTR_SN  1
>
> -/* Posted-Interrupt Descriptor */
> +/*
> + * Posted-Interrupt Descriptor
> + *
> + * Note, the physical address of this structure is used by VMX. Furthermore, the
> + * translation code assumes that the entire pi_desc struct resides within a
> + * single page, which will be true because the struct is 64 bytes and 64-byte
> + * aligned.
> + */
>  struct pi_desc {
>          u32 pir[8];     /* Posted interrupt requested */
>          union {
> @@ -970,8 +977,25 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
>
>  struct vmx_msrs {
>          unsigned int nr;
> -        struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
> +        struct vmx_msr_entry *val;
>  };
> +struct kmem_cache *vmx_msr_entry_cache;
> +
> +/*
> + * To prevent vmx_msr_entry array from crossing a page boundary, require:
> + * sizeof(*vmx_msrs.vmx_msr_entry.val) to be a power of two. This is guaranteed
> + * through compile-time asserts that:
> + * - NR_AUTOLOAD_MSRS * sizeof(struct vmx_msr_entry) is a power of two
> + * - NR_AUTOLOAD_MSRS * sizeof(struct vmx_msr_entry) <= PAGE_SIZE
> + * - The allocation of vmx_msrs.vmx_msr_entry.val is aligned to its size.
> + */
> +#define CHECK_POWER_OF_TWO(val) \
> +        BUILD_BUG_ON_MSG(!((val) && !((val) & ((val) - 1))), \
> +                         #val " is not a power of two.")
> +#define CHECK_INTRA_PAGE(val) do { \
> +                CHECK_POWER_OF_TWO(val); \
> +                BUILD_BUG_ON(!(val <= PAGE_SIZE)); \
> +        } while (0)
>
>  struct vcpu_vmx {
>          struct kvm_vcpu vcpu;
> @@ -6616,6 +6640,14 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
>          }
>
>          if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
> +                /*
> +                 * Note, pi_desc is contained within a single
> +                 * page because the struct is 64 bytes and 64-byte aligned.
> +                 */
> +                phys_addr_t pi_desc_phys =
> +                        page_to_phys(vmalloc_to_page(&vmx->pi_desc)) +
> +                        (u64)&vmx->pi_desc % PAGE_SIZE;
> +
>                  vmcs_write64(EOI_EXIT_BITMAP0, 0);
>                  vmcs_write64(EOI_EXIT_BITMAP1, 0);
>                  vmcs_write64(EOI_EXIT_BITMAP2, 0);
> @@ -6624,7 +6656,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
>                  vmcs_write16(GUEST_INTR_STATUS, 0);
>
>                  vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
> -                vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
> +                vmcs_write64(POSTED_INTR_DESC_ADDR, pi_desc_phys);
>          }
>
>          if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
> @@ -11476,19 +11508,43 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
>          free_loaded_vmcs(vmx->loaded_vmcs);
>          kfree(vmx->guest_msrs);
>          kvm_vcpu_uninit(vcpu);
> -        kmem_cache_free(kvm_vcpu_cache, vmx);
> +        kmem_cache_free(vmx_msr_entry_cache, vmx->msr_autoload.guest.val);
> +        kmem_cache_free(vmx_msr_entry_cache, vmx->msr_autoload.host.val);
> +        vfree(vmx);
>  }
>
>  static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>  {
>          int err;
> -        struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
> +        struct vcpu_vmx *vmx = __vmalloc_node_range(
> +                sizeof(struct vcpu_vmx),
> +                __alignof__(struct vcpu_vmx),
> +                VMALLOC_START,
> +                VMALLOC_END,
> +                GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | __GFP_ACCOUNT,
> +                PAGE_KERNEL,
> +                0,
> +                NUMA_NO_NODE,
> +                __builtin_return_address(0));
>          unsigned long *msr_bitmap;
>          int cpu;
>
>          if (!vmx)
>                  return ERR_PTR(-ENOMEM);
>
> +        vmx->msr_autoload.guest.val =
> +                kmem_cache_zalloc(vmx_msr_entry_cache, GFP_KERNEL);
> +        if (!vmx->msr_autoload.guest.val) {
> +                err = -ENOMEM;
> +                goto free_vmx;
> +        }
> +        vmx->msr_autoload.host.val =
> +                kmem_cache_zalloc(vmx_msr_entry_cache, GFP_KERNEL);
> +        if (!vmx->msr_autoload.host.val) {
> +                err = -ENOMEM;
> +                goto free_msr_autoload_guest;
> +        }
> +
>          vmx->vpid = allocate_vpid();
>
>          err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
> @@ -11576,7 +11632,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>          kvm_vcpu_uninit(&vmx->vcpu);
>  free_vcpu:
>          free_vpid(vmx->vpid);
> -        kmem_cache_free(kvm_vcpu_cache, vmx);
> +        kmem_cache_free(vmx_msr_entry_cache, vmx->msr_autoload.host.val);
> +free_msr_autoload_guest:
> +        kmem_cache_free(vmx_msr_entry_cache, vmx->msr_autoload.guest.val);
> +free_vmx:
> +        vfree(vmx);
>          return ERR_PTR(err);
>  }
>
> @@ -15153,6 +15213,10 @@ module_exit(vmx_exit);
>  static int __init vmx_init(void)
>  {
>          int r;
> +        size_t vmx_msr_entry_size =
> +                sizeof(struct vmx_msr_entry) * NR_AUTOLOAD_MSRS;
> +
> +        CHECK_INTRA_PAGE(vmx_msr_entry_size);
>
>  #if IS_ENABLED(CONFIG_HYPERV)
>          /*
> @@ -15183,10 +15247,25 @@ static int __init vmx_init(void)
>          }
>  #endif
>
> -        r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
> -                     __alignof__(struct vcpu_vmx), THIS_MODULE);
> +        /*
> +         * Disable kmem cache; vmalloc will be used instead
> +         * to avoid OOM'ing when memory is available but not contiguous.
> +         */
> +        r = kvm_init(&vmx_x86_ops, 0, 0, THIS_MODULE);
>          if (r)
>                  return r;
> +        /*
> +         * A vmx_msr_entry array resides exclusively within the kernel. Thus,
> +         * use kmem_cache_create_usercopy(), with the usersize argument set to
> +         * ZERO, to blacklist copying vmx_msr_entry to/from user space.
> +         */
> +        vmx_msr_entry_cache =
> +                kmem_cache_create_usercopy("vmx_msr_entry", vmx_msr_entry_size,
> +                                           vmx_msr_entry_size, SLAB_ACCOUNT, 0, 0, NULL);
> +        if (!vmx_msr_entry_cache) {
> +                r = -ENOMEM;
> +                goto out;
> +        }
>
>          /*
>           * Must be called after kvm_init() so enable_ept is properly set
> @@ -15210,5 +15289,8 @@ static int __init vmx_init(void)
>          vmx_check_vmcs12_offsets();
>
>          return 0;
> +out:
> +        kvm_exit();
> +        return r;
>  }
>  module_init(vmx_init);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 786ade1843a2..8b979e7c3ecd 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4038,18 +4038,22 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
>                  goto out_free_2;
>          register_reboot_notifier(&kvm_reboot_notifier);
>
> -        /* A kmem cache lets us meet the alignment requirements of fx_save. */
> -        if (!vcpu_align)
> -                vcpu_align = __alignof__(struct kvm_vcpu);
> -        kvm_vcpu_cache =
> -                kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
> -                                           SLAB_ACCOUNT,
> -                                           offsetof(struct kvm_vcpu, arch),
> -                                           sizeof_field(struct kvm_vcpu, arch),
> -                                           NULL);
> -        if (!kvm_vcpu_cache) {
> -                r = -ENOMEM;
> -                goto out_free_3;
> +        /*
> +         * When vcpu_size is zero,
> +         * architecture-specific code manages its own vcpu allocation.
> +         */
> +        kvm_vcpu_cache = NULL;
> +        if (vcpu_size) {
> +                if (!vcpu_align)
> +                        vcpu_align = __alignof__(struct kvm_vcpu);
> +                kvm_vcpu_cache = kmem_cache_create_usercopy(
> +                        "kvm_vcpu", vcpu_size, vcpu_align, SLAB_ACCOUNT,
> +                        offsetof(struct kvm_vcpu, arch),
> +                        sizeof_field(struct kvm_vcpu, arch), NULL);
> +                if (!kvm_vcpu_cache) {
> +                        r = -ENOMEM;
> +                        goto out_free_3;
> +                }
>          }
>
>          r = kvm_async_pf_init();
> --
> 2.19.1.568.g152ad8e336-goog
>
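
A note on the POSTED_INTR_DESC_ADDR change in the quoted patch: __pa() is only
valid for addresses in the kernel's linear map, and vmalloc() returns addresses
outside that map, so the patch walks the page tables with vmalloc_to_page()
instead. Below is a minimal sketch of the same translation, assuming (as the
patch's comments argue) that the object never crosses a page boundary; the
helper name vmalloc_obj_to_phys() is hypothetical, not something from the
patch:

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch: physical address of a vmalloc'd object that fits in one page. */
static phys_addr_t vmalloc_obj_to_phys(const void *addr)
{
        /* Physical base of the page backing this vmalloc address ... */
        phys_addr_t page_phys = page_to_phys(vmalloc_to_page(addr));

        /*
         * ... plus the offset within that page. The patch spells this as
         * (u64)addr % PAGE_SIZE; offset_in_page() is the equivalent helper.
         */
        return page_phys + offset_in_page(addr);
}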
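
Similarly, the CHECK_INTRA_PAGE() asserts and the size-aligned vmx_msr_entry
cache rest on one arithmetic fact: an object whose size is a power of two, at
most PAGE_SIZE, and whose start is aligned to that size cannot straddle a page
boundary (its offset within the page is a multiple of the size, so
offset + size <= PAGE_SIZE). A sketch under those assumptions; the cache name
"example_entries" and the wrapper function are illustrative only:

#include <linux/slab.h>

/*
 * Sketch: create a cache whose objects are aligned to their own
 * (power-of-two) size, so each object stays within a single page.
 * usersize == 0 whitelists no part of the object for copying to or
 * from user space under hardened usercopy, matching the patch's use.
 */
static struct kmem_cache *example_entries_cache_create(size_t obj_size)
{
        return kmem_cache_create_usercopy("example_entries", obj_size,
                                          obj_size /* align */, SLAB_ACCOUNT,
                                          0 /* useroffset */,
                                          0 /* usersize */, NULL);
}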