Re: [RFC][Patch v8 6/7] KVM: Enables the kernel to isolate and report free pages

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
> 
> On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> > On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> >> This patch enables the kernel to scan the per cpu array and
> >> compress it by removing the repetitive/re-allocated pages.
> >> Once the per cpu array is completely filled with pages in the
> >> buddy it wakes up the kernel per cpu thread which re-scans the
> >> entire per cpu array by acquiring a zone lock corresponding to
> >> the page which is being scanned. If the page is still free and
> >> present in the buddy it tries to isolate the page and adds it
> >> to another per cpu array.
> >>
> >> Once this scanning process is complete and if there are any
> >> isolated pages added to the new per cpu array kernel thread
> >> invokes hyperlist_ready().
> >>
> >> In hyperlist_ready() a hypercall is made to report these pages to
> >> the host using the virtio-balloon framework. In order to do so
> >> another virtqueue 'hinting_vq' is added to the balloon framework.
> >> As the host frees all the reported pages, the kernel thread returns
> >> them back to the buddy.
> >>
> >> Signed-off-by: Nitesh Narayan Lal <nitesh@xxxxxxxxxx>
> >
> > This looks kind of like what early iterations of Wei's patches did.
> >
> > But this has lots of issues, for example you might end up with
> > a hypercall per a 4K page.
> > So in the end, he switched over to just reporting only
> > MAX_ORDER - 1 pages.
> You mean that I should only capture/attempt to isolate pages with order
> MAX_ORDER - 1?
> >
> > Would that be a good idea for you too?
> Will it help if we have a threshold value based on the amount of memory
> captured instead of the number of entries/pages in the array?

This is what Wei's patches do at least.

> >
> > An alternative would be a different much lighter weight
> > way to report these pages and to free them on the host.
> >
> >> ---
> >>  drivers/virtio/virtio_balloon.c     |  56 +++++++-
> >>  include/linux/page_hinting.h        |  18 ++-
> >>  include/uapi/linux/virtio_balloon.h |   1 +
> >>  mm/page_alloc.c                     |   2 +-
> >>  virt/kvm/page_hinting.c             | 202 +++++++++++++++++++++++++++-
> >>  5 files changed, 269 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> >> index 728ecd1eea30..8af34e0b9a32 100644
> >> --- a/drivers/virtio/virtio_balloon.c
> >> +++ b/drivers/virtio/virtio_balloon.c
> >> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
> >>  	VIRTIO_BALLOON_VQ_INFLATE,
> >>  	VIRTIO_BALLOON_VQ_DEFLATE,
> >>  	VIRTIO_BALLOON_VQ_STATS,
> >> +	VIRTIO_BALLOON_VQ_HINTING,
> >>  	VIRTIO_BALLOON_VQ_FREE_PAGE,
> >>  	VIRTIO_BALLOON_VQ_MAX
> >>  };
> >>  
> >>  struct virtio_balloon {
> >>  	struct virtio_device *vdev;
> >> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> >> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
> >> +								*hinting_vq;
> >>  
> >>  	/* Balloon's own wq for cpu-intensive work items */
> >>  	struct workqueue_struct *balloon_wq;
> >> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
> >>  	{ 0 },
> >>  };
> >>  
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
> >> +			      int hyper_entries)
> >> +{
> >> +	u64 gpaddr = virt_to_phys((void *)gvaddr);
> >> +
> >> +	virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
> >> +	virtqueue_kick_sync(vb->hinting_vq);
> >> +}
> >> +
> >> +static void hinting_ack(struct virtqueue *vq)
> >> +{
> >> +	struct virtio_balloon *vb = vq->vdev->priv;
> >> +
> >> +	wake_up(&vb->acked);
> >> +}
> >> +
> >> +static void enable_hinting(struct virtio_balloon *vb)
> >> +{
> >> +	guest_page_hinting_flag = 1;
> >> +	static_branch_enable(&guest_page_hinting_key);
> >> +	request_hypercall = (void *)&virtballoon_page_hinting;
> >> +	balloon_ptr = vb;
> >> +	WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
> >> +}
> >> +
> >> +static void disable_hinting(void)
> >> +{
> >> +	guest_page_hinting_flag = 0;
> >> +	static_branch_enable(&guest_page_hinting_key);
> >> +	balloon_ptr = NULL;
> >> +}
> >> +#endif
> >> +
> >>  static u32 page_to_balloon_pfn(struct page *page)
> >>  {
> >>  	unsigned long pfn = page_to_pfn(page);
> >> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
> >>  	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> >>  	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> >>  	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >> +	names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
> >>  
> >>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >>  		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> >> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
> >>  		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >>  	}
> >>  
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
> >> +		names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
> >> +		callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
> >> +	}
> >>  	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> >>  					 vqs, callbacks, names, NULL, NULL);
> >>  	if (err)
> >>  		return err;
> >>  
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +		vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
> >> +
> >>  	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> >>  	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> >>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
> >>  		if (err)
> >>  			goto out_del_balloon_wq;
> >>  	}
> >> +
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +		enable_hinting(vb);
> >> +#endif
> >>  	virtio_device_ready(vdev);
> >>  
> >>  	if (towards_target(vb))
> >> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev)
> >>  	cancel_work_sync(&vb->update_balloon_size_work);
> >>  	cancel_work_sync(&vb->update_balloon_stats_work);
> >>  
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +		disable_hinting();
> >> +#endif
> >>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> >>  		cancel_work_sync(&vb->report_free_page_work);
> >>  		destroy_workqueue(vb->balloon_wq);
> >> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
> >>  	VIRTIO_BALLOON_F_MUST_TELL_HOST,
> >>  	VIRTIO_BALLOON_F_STATS_VQ,
> >>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> >> +	VIRTIO_BALLOON_F_HINTING,
> >>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> >>  	VIRTIO_BALLOON_F_PAGE_POISON,
> >>  };
> >> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
> >> index e800c6b07561..3ba8c1f3b4a4 100644
> >> --- a/include/linux/page_hinting.h
> >> +++ b/include/linux/page_hinting.h
> >> @@ -1,15 +1,12 @@
> >>  #include <linux/smpboot.h>
> >>  
> >> -/*
> >> - * Size of the array which is used to store the freed pages is defined by
> >> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
> >> - * we can get rid of the hardcoded array size.
> >> - */
> >>  #define MAX_FGPT_ENTRIES	1000
> >>  /*
> >>   * hypervisor_pages - It is a dummy structure passed with the hypercall.
> >> - * @pfn: page frame number for the page which needs to be sent to the host.
> >> - * @order: order of the page needs to be reported to the host.
> >> + * @pfn - page frame number for the page which is to be freed.
> >> + * @pages - number of pages which are supposed to be freed.
> >> + * A global array object is used to to hold the list of pfn and pages and is
> >> + * passed as part of the hypercall.
> >>   */
> >>  struct hypervisor_pages {
> >>  	unsigned long pfn;
> >> @@ -19,11 +16,18 @@ struct hypervisor_pages {
> >>  extern int guest_page_hinting_flag;
> >>  extern struct static_key_false guest_page_hinting_key;
> >>  extern struct smp_hotplug_thread hinting_threads;
> >> +extern void (*request_hypercall)(void *, u64, int);
> >> +extern void *balloon_ptr;
> >>  extern bool want_page_poisoning;
> >>  
> >>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>  			      void __user *buffer, size_t *lenp, loff_t *ppos);
> >>  void guest_free_page(struct page *page, int order);
> >> +extern int __isolate_free_page(struct page *page, unsigned int order);
> >> +extern void free_one_page(struct zone *zone,
> >> +			  struct page *page, unsigned long pfn,
> >> +			  unsigned int order,
> >> +			  int migratetype);
> >>  
> >>  static inline void disable_page_poisoning(void)
> >>  {
> > I guess you will want to put this in some other header.  Function
> > declarations belong close to where they are implemented, not used.
> I will find a better place.
> >
> >> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> >> index a1966cd7b677..2b0f62814e22 100644
> >> --- a/include/uapi/linux/virtio_balloon.h
> >> +++ b/include/uapi/linux/virtio_balloon.h
> >> @@ -36,6 +36,7 @@
> >>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
> >>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
> >>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> >> +#define VIRTIO_BALLOON_F_HINTING	5 /* Page hinting virtqueue */
> >>  
> >>  /* Size of a PFN in the balloon interface. */
> >>  #define VIRTIO_BALLOON_PFN_SHIFT 12
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index d295c9bc01a8..93224cba9243 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
> >>  	spin_unlock(&zone->lock);
> >>  }
> >>  
> >> -static void free_one_page(struct zone *zone,
> >> +void free_one_page(struct zone *zone,
> >>  				struct page *page, unsigned long pfn,
> >>  				unsigned int order,
> >>  				int migratetype)
> >> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> >> index be529f6f2bc0..315099fcda43 100644
> >> --- a/virt/kvm/page_hinting.c
> >> +++ b/virt/kvm/page_hinting.c
> >> @@ -1,6 +1,8 @@
> >>  #include <linux/gfp.h>
> >>  #include <linux/mm.h>
> >> +#include <linux/page_ref.h>
> >>  #include <linux/kvm_host.h>
> >> +#include <linux/sort.h>
> >>  #include <linux/kernel.h>
> >>  
> >>  /*
> >> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
> >>  EXPORT_SYMBOL(guest_page_hinting_flag);
> >>  static DEFINE_PER_CPU(struct task_struct *, hinting_task);
> >>  
> >> +void (*request_hypercall)(void *, u64, int);
> >> +EXPORT_SYMBOL(request_hypercall);
> >> +void *balloon_ptr;
> >> +EXPORT_SYMBOL(balloon_ptr);
> >> +
> >>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>  			      void __user *buffer, size_t *lenp,
> >>  			      loff_t *ppos)
> >> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>  	return ret;
> >>  }
> >>  
> >> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
> >> +{
> >> +	int i = 0;
> >> +	int mt = 0;
> >> +
> >> +	if (balloon_ptr)
> >> +		request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
> >> +				  entries);
> >> +
> >> +	while (i < entries) {
> >> +		struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
> >> +
> >> +		mt = get_pageblock_migratetype(page);
> >> +		free_one_page(page_zone(page), page, page_to_pfn(page),
> >> +			      guest_isolated_pages[i].order, mt);
> >> +		i++;
> >> +	}
> >> +}
> >> +
> >> +struct page *get_buddy_page(struct page *page)
> >> +{
> >> +	unsigned long pfn = page_to_pfn(page);
> >> +	unsigned int order;
> >> +
> >> +	for (order = 0; order < MAX_ORDER; order++) {
> >> +		struct page *page_head = page - (pfn & ((1 << order) - 1));
> >> +
> >> +		if (PageBuddy(page_head) && page_private(page_head) >= order)
> >> +			return page_head;
> >> +	}
> >> +	return NULL;
> >> +}
> >> +
> >>  static void hinting_fn(unsigned int cpu)
> >>  {
> >>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +	int idx = 0, ret = 0;
> >> +	struct zone *zone_cur;
> >> +	unsigned long flags = 0;
> >> +
> >> +	while (idx < MAX_FGPT_ENTRIES) {
> >> +		unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
> >> +		unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
> >> +			(1 << page_hinting_obj->kvm_pt[idx].order) - 1;
> >> +
> >> +		while (pfn <= pfn_end) {
> >> +			struct page *page = pfn_to_page(pfn);
> >> +			struct page *buddy_page = NULL;
> >> +
> >> +			zone_cur = page_zone(page);
> >> +			spin_lock_irqsave(&zone_cur->lock, flags);
> >> +
> >> +			if (PageCompound(page)) {
> >> +				struct page *head_page = compound_head(page);
> >> +				unsigned long head_pfn = page_to_pfn(head_page);
> >> +				unsigned int alloc_pages =
> >> +					1 << compound_order(head_page);
> >> +
> >> +				pfn = head_pfn + alloc_pages;
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +
> >> +			if (page_ref_count(page)) {
> >> +				pfn++;
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +
> >> +			if (PageBuddy(page)) {
> >> +				int buddy_order = page_private(page);
> >>  
> >> +				ret = __isolate_free_page(page, buddy_order);
> >> +				if (!ret) {
> >> +				} else {
> >> +					int l_idx = page_hinting_obj->hyp_idx;
> >> +					struct hypervisor_pages *l_obj =
> >> +					page_hinting_obj->hypervisor_pagelist;
> >> +
> >> +					l_obj[l_idx].pfn = pfn;
> >> +					l_obj[l_idx].order = buddy_order;
> >> +					page_hinting_obj->hyp_idx += 1;
> >> +				}
> >> +				pfn = pfn + (1 << buddy_order);
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +
> >> +			buddy_page = get_buddy_page(page);
> >> +			if (buddy_page) {
> >> +				int buddy_order = page_private(buddy_page);
> >> +
> >> +				ret = __isolate_free_page(buddy_page,
> >> +							  buddy_order);
> >> +				if (!ret) {
> >> +				} else {
> >> +					int l_idx = page_hinting_obj->hyp_idx;
> >> +					struct hypervisor_pages *l_obj =
> >> +					page_hinting_obj->hypervisor_pagelist;
> >> +					unsigned long buddy_pfn =
> >> +						page_to_pfn(buddy_page);
> >> +
> >> +					l_obj[l_idx].pfn = buddy_pfn;
> >> +					l_obj[l_idx].order = buddy_order;
> >> +					page_hinting_obj->hyp_idx += 1;
> >> +				}
> >> +				pfn = page_to_pfn(buddy_page) +
> >> +					(1 << buddy_order);
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +			spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +			pfn++;
> >> +		}
> >> +		page_hinting_obj->kvm_pt[idx].pfn = 0;
> >> +		page_hinting_obj->kvm_pt[idx].order = -1;
> >> +		page_hinting_obj->kvm_pt[idx].zonenum = -1;
> >> +		idx++;
> >> +	}
> >> +	if (page_hinting_obj->hyp_idx > 0) {
> >> +		hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
> >> +				page_hinting_obj->hyp_idx);
> >> +		page_hinting_obj->hyp_idx = 0;
> >> +	}
> >>  	page_hinting_obj->kvm_pt_idx = 0;
> >>  	put_cpu_var(hinting_obj);
> >>  }
> >>  
> >> +int if_exist(struct page *page)
> >> +{
> >> +	int i = 0;
> >> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +
> >> +	while (i < MAX_FGPT_ENTRIES) {
> >> +		if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
> >> +			return 1;
> >> +		i++;
> >> +	}
> >> +	return 0;
> >> +}
> >> +
> >> +void pack_array(void)
> >> +{
> >> +	int i = 0, j = 0;
> >> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +
> >> +	while (i < MAX_FGPT_ENTRIES) {
> >> +		if (page_hinting_obj->kvm_pt[i].pfn != 0) {
> >> +			if (i != j) {
> >> +				page_hinting_obj->kvm_pt[j].pfn =
> >> +					page_hinting_obj->kvm_pt[i].pfn;
> >> +				page_hinting_obj->kvm_pt[j].order =
> >> +					page_hinting_obj->kvm_pt[i].order;
> >> +				page_hinting_obj->kvm_pt[j].zonenum =
> >> +					page_hinting_obj->kvm_pt[i].zonenum;
> >> +			}
> >> +			j++;
> >> +		}
> >> +		i++;
> >> +	}
> >> +	i = j;
> >> +	page_hinting_obj->kvm_pt_idx = j;
> >> +	while (j < MAX_FGPT_ENTRIES) {
> >> +		page_hinting_obj->kvm_pt[j].pfn = 0;
> >> +		page_hinting_obj->kvm_pt[j].order = -1;
> >> +		page_hinting_obj->kvm_pt[j].zonenum = -1;
> >> +		j++;
> >> +	}
> >> +}
> >> +
> >>  void scan_array(void)
> >>  {
> >>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +	int i = 0;
> >>  
> >> +	while (i < MAX_FGPT_ENTRIES) {
> >> +		struct page *page =
> >> +			pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
> >> +		struct page *buddy_page = get_buddy_page(page);
> >> +
> >> +		if (!PageBuddy(page) && buddy_page) {
> >> +			if (if_exist(buddy_page)) {
> >> +				page_hinting_obj->kvm_pt[i].pfn = 0;
> >> +				page_hinting_obj->kvm_pt[i].order = -1;
> >> +				page_hinting_obj->kvm_pt[i].zonenum = -1;
> >> +			} else {
> >> +				page_hinting_obj->kvm_pt[i].pfn =
> >> +					page_to_pfn(buddy_page);
> >> +				page_hinting_obj->kvm_pt[i].order =
> >> +					page_private(buddy_page);
> >> +			}
> >> +		}
> >> +		i++;
> >> +	}
> >> +	pack_array();
> >>  	if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> >>  		wake_up_process(__this_cpu_read(hinting_task));
> >>  }
> >> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
> >>  		page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
> >>  							order;
> >>  		page_hinting_obj->kvm_pt_idx += 1;
> >> -		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> >> +		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
> >> +			/*
> >> +			 * We are depending on the buddy free-list to identify
> >> +			 * if a page is free or not. Hence, we are dumping all
> >> +			 * the per-cpu pages back into the buddy allocator. This
> >> +			 * will ensure less failures when we try to isolate free
> >> +			 * captured pages and hence more memory reporting to the
> >> +			 * host.
> >> +			 */
> >> +			drain_local_pages(NULL);
> >>  			scan_array();
> >> +		}
> >>  	}
> >>  	local_irq_restore(flags);
> >>  }
> >> -- 
> >> 2.17.2
> -- 
> Regards
> Nitesh
> 






[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux