Re: [PATCH v2] kvm: x86: Add memcg accounting to KVM allocations

Sean Christopherson <sean.j.christopherson@xxxxxxxxx> · Fri, 8 Feb 2019 11:13:17 -0800

On Fri, Feb 08, 2019 at 10:16:34AM -0800, Ben Gardon wrote:
> There are many KVM kernel memory allocations which are tied to the life of
> the VM process and should be charged to the VM process's cgroup. If the
> allocations aren't tied to the process, the OOM killer will not know
> that killing the process will free the associated kernel memory.
> Add __GFP_ACCOUNT flags to many of the allocations which are not yet being
> charged to the VM process's cgroup.
> 
> Tested:
> 	Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch
> 	introduced no new failures.
> 	Ran a kernel memory accounting test which creates a VM to touch
> 	memory and then checks that the kernel memory allocated for the
> 	process is within certain bounds.
> 	With this patch we account for much more of the vmalloc and slab memory
> 	allocated for the VM.
> 
> There remain a few allocations which should be charged to the VM's
> cgroup but are not. In x86, they include:
> 	vcpu->run
> 	vcpu->arch.pio_data
> 	kvm->coalesced_mmio_ring
> There allocations are unaccounted in this patch because they are mapped
> to userspace, and accounting them to a cgroup causes problems. This
> should be addressed in a future patch.

The vzalloc() calls in svm_vm_alloc() and vmx_vm_alloc() are implicitly
GFP_KERNEL, those should be converted to __vmalloc() so they can be
accounted.

> Signed-off-by: Ben Gardon <bgardon@xxxxxxxxxx>
> Reviewed-by: Shakeel Butt <shakeelb@xxxxxxxxxx>
> ---
>  arch/x86/kvm/hyperv.c     |  2 +-
>  arch/x86/kvm/i8254.c      |  2 +-
>  arch/x86/kvm/i8259.c      |  2 +-
>  arch/x86/kvm/ioapic.c     |  2 +-
>  arch/x86/kvm/lapic.c      |  7 ++++---
>  arch/x86/kvm/mmu.c        |  6 +++---
>  arch/x86/kvm/page_track.c |  2 +-
>  arch/x86/kvm/svm.c        | 23 ++++++++++++-----------
>  arch/x86/kvm/vmx/nested.c |  9 +++++++--
>  arch/x86/kvm/vmx/vmx.c    | 23 +++++++++++++++++------
>  arch/x86/kvm/x86.c        | 16 +++++++++-------
>  virt/kvm/coalesced_mmio.c |  3 ++-
>  virt/kvm/eventfd.c        |  7 ++++---
>  virt/kvm/irqchip.c        |  4 ++--
>  virt/kvm/kvm_main.c       | 18 +++++++++---------
>  virt/kvm/vfio.c           |  4 ++--

It'd be helpful to split this into ~4 commits, e.g.:

  virt/kvm/
  arch/x86/svm.c
  arch/x86/vmx/
  arch/x86/

Reviewing one big patch isn't a big deal since the code changes are
trivial, but it'd be nice to be able to give Reviewed-by tags with a
little more granularity.

>  16 files changed, 76 insertions(+), 54 deletions(-)

[...]

> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 307e5bddb6d97..01cc4b1f7bfa0 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -1797,7 +1797,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
>  	if (size > PAGE_SIZE)
>  		pages = vmalloc(size);

This vmalloc() should be converted to __vmalloc() w/ GFP_KERNEL_ACCOUNT.

Relevant to this flow, svm_register_enc_region()'s "region" allocation
should be accounted.

>  	else
> -		pages = kmalloc(size, GFP_KERNEL);
> +		pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
>  
>  	if (!pages)
>  		return NULL;
> @@ -1940,7 +1940,7 @@ static int avic_vm_init(struct kvm *kvm)
>  		return 0;
>  
>  	/* Allocating physical APIC ID table (4KB) */
> -	p_page = alloc_page(GFP_KERNEL);
> +	p_page = alloc_page(GFP_KERNEL_ACCOUNT);
>  	if (!p_page)
>  		goto free_avic;
>  

[...]

> @@ -6309,7 +6310,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
>  	if (ret)
>  		return ret;
>  
> -	data = kzalloc(sizeof(*data), GFP_KERNEL);
> +	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);

sev_unbind_asid() and several other SEV command functions have similar
temporary allocations, those should also be accounted, e.g.:

  __sev_issue_dbg_cmd
  sev_guest_status
  sev_launch_*

>  	if (!data)
>  		return -ENOMEM;
>  
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 3170e291215d0..88d20904b16e0 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -4140,11 +4140,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
>  	if (r < 0)
>  		goto out_vmcs02;
>  
> -	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
> +	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
>  	if (!vmx->nested.cached_vmcs12)
>  		goto out_cached_vmcs12;
>  
> -	vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
> +	vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE,
> +						   GFP_KERNEL_ACCOUNT);
>  	if (!vmx->nested.cached_shadow_vmcs12)
>  		goto out_cached_shadow_vmcs12;
>  
> @@ -5686,6 +5687,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
>  		enable_shadow_vmcs = 0;
>  	if (enable_shadow_vmcs) {
>  		for (i = 0; i < VMX_BITMAP_NR; i++) {
> +			/*
> +			 * The vmx_bitmap is not tied to a VM and so should
> +			 * not be charged to a memcg.
> +			 */
>  			vmx_bitmap[i] = (unsigned long *)
>  				__get_free_page(GFP_KERNEL);

Since "global" allocations are fairly rare, would it make sense to add a
KVM #define to explicitly call out such cases without having to add a
comment every time?  I feel almost all such cases are fairly obvious once
you think about them, the unique flag would just be a way of saying "hey,
this is intentional!".

>  			if (!vmx_bitmap[i]) {
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 4d39f731bc332..6c2779cf9472c 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -245,6 +245,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
>  
>  	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
>  	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
> +		/*
> +		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
> +		 * lifetime and so should not be charged to a memcg.
> +		 */
>  		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
>  		if (!page)
>  			return -ENOMEM;
> @@ -2395,7 +2399,11 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
>  	struct page *pages;
>  	struct vmcs *vmcs;
>  
> -	pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
> +	/*
> +	 * Since the page for the VMCS is inherently tied to the VM lifetime,
> +	 * we should charge this allocation to the VM's memcg.
> +	 */
> +	pages = __alloc_pages_node(node, GFP_KERNEL_ACCOUNT, vmcs_config.order);

Except for the root VMCS that is allocated by alloc_kvm_area() to do
VMXON.  Maybe just pass a parameter?  That'd let you drop the comment.

>  	if (!pages)
>  		return NULL;
>  	vmcs = page_address(pages);
> @@ -2442,7 +2450,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
>  	loaded_vmcs_init(loaded_vmcs);
>  
>  	if (cpu_has_vmx_msr_bitmap()) {
> -		loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
> +		loaded_vmcs->msr_bitmap = (unsigned long *)
> +				__get_free_page(GFP_KERNEL_ACCOUNT);
>  		if (!loaded_vmcs->msr_bitmap)
>  			goto out_vmcs;
>  		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);

[...]

> @@ -4064,7 +4065,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		break;
>  	}
>  	case KVM_GET_XSAVE: {
> -		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
> +		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);

There are a number of other temporary allocations in kvm_arch_vcpu_ioctl()
that should be accounted, e.g.:

  KVM_GET_FPU
  KVM_GET_SREGS
  KVM_SET_REGS
  KVM_GET_REGS

kvm_uevent_notify_change() also has temporary allocations that should be
accounted.

>  		r = -ENOMEM;
>  		if (!u.xsave)
>  			break;
> @@ -4088,7 +4089,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		break;
>  	}
>  	case KVM_GET_XCRS: {
> -		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
> +		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
>  		r = -ENOMEM;
>  		if (!u.xcrs)
>  			break;