On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> This patch enables the kernel to scan the per cpu array and
> compress it by removing the repetitive/re-allocated pages.
> Once the per cpu array is completely filled with pages in the
> buddy it wakes up the kernel per cpu thread which re-scans the
> entire per cpu array by acquiring a zone lock corresponding to
> the page which is being scanned. If the page is still free and
> present in the buddy it tries to isolate the page and adds it
> to another per cpu array.
>
> Once this scanning process is complete and if there are any
> isolated pages added to the new per cpu array kernel thread
> invokes hyperlist_ready().
>
> In hyperlist_ready() a hypercall is made to report these pages to
> the host using the virtio-balloon framework. In order to do so
> another virtqueue 'hinting_vq' is added to the balloon framework.
> As the host frees all the reported pages, the kernel thread returns
> them back to the buddy.
>
> Signed-off-by: Nitesh Narayan Lal <nitesh@xxxxxxxxxx>

This looks kind of like what early iterations of Wei's patches did. But
that approach has lots of issues, for example you might end up with a
hypercall per 4K page. In the end he switched over to reporting only
MAX_ORDER - 1 pages. Would that be a good idea for you too?

An alternative would be a different, much lighter-weight way to report
these pages and to free them on the host.
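Something along these lines is what I mean - purely a sketch, reusing
your guest_free_page() entry point, and HINTING_MIN_ORDER is a name I
made up here:

        /*
         * Only capture blocks big enough to be worth a report;
         * anything smaller goes back to the buddy without ever
         * being logged.
         */
        #define HINTING_MIN_ORDER       (MAX_ORDER - 1)

        void guest_free_page(struct page *page, int order)
        {
                if (order < HINTING_MIN_ORDER)
                        return;
                /* ... record pfn/order in kvm_pt[] as you do today ... */
        }

With MAX_ORDER = 11 each reported entry then covers 4M instead of
possibly 4K, so the hypercall and the host-side freeing are amortized
over 1024 times more memory.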
> ---
>  drivers/virtio/virtio_balloon.c     |  56 +++++++-
>  include/linux/page_hinting.h        |  18 ++-
>  include/uapi/linux/virtio_balloon.h |   1 +
>  mm/page_alloc.c                     |   2 +-
>  virt/kvm/page_hinting.c             | 202 +++++++++++++++++++++++++++-
>  5 files changed, 269 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 728ecd1eea30..8af34e0b9a32 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
>  	VIRTIO_BALLOON_VQ_INFLATE,
>  	VIRTIO_BALLOON_VQ_DEFLATE,
>  	VIRTIO_BALLOON_VQ_STATS,
> +	VIRTIO_BALLOON_VQ_HINTING,
>  	VIRTIO_BALLOON_VQ_FREE_PAGE,
>  	VIRTIO_BALLOON_VQ_MAX
>  };
>
>  struct virtio_balloon {
>  	struct virtio_device *vdev;
> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
> +			 *hinting_vq;
>
>  	/* Balloon's own wq for cpu-intensive work items */
>  	struct workqueue_struct *balloon_wq;
> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
>  	{ 0 },
>  };
>
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
> +			      int hyper_entries)
> +{
> +	u64 gpaddr = virt_to_phys((void *)gvaddr);
> +
> +	virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
> +	virtqueue_kick_sync(vb->hinting_vq);
> +}
> +
> +static void hinting_ack(struct virtqueue *vq)
> +{
> +	struct virtio_balloon *vb = vq->vdev->priv;
> +
> +	wake_up(&vb->acked);
> +}
> +
> +static void enable_hinting(struct virtio_balloon *vb)
> +{
> +	guest_page_hinting_flag = 1;
> +	static_branch_enable(&guest_page_hinting_key);
> +	request_hypercall = (void *)&virtballoon_page_hinting;
> +	balloon_ptr = vb;
> +	WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
> +}
> +
> +static void disable_hinting(void)
> +{
> +	guest_page_hinting_flag = 0;
> +	static_branch_enable(&guest_page_hinting_key);
> +	balloon_ptr = NULL;
> +}
> +#endif
> +
>  static u32 page_to_balloon_pfn(struct page *page)
>  {
>  	unsigned long pfn = page_to_pfn(page);
> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
>  	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
>  	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
>  	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> +	names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
>
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>  		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
>  		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>  	}
>
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
> +		names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
> +		callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
> +	}
>  	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
>  					 vqs, callbacks, names, NULL, NULL);
>  	if (err)
>  		return err;
>
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> +		vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
> +
>  	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
>  	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  		if (err)
>  			goto out_del_balloon_wq;
>  	}
> +
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> +		enable_hinting(vb);
> +#endif
>  	virtio_device_ready(vdev);
>
>  	if (towards_target(vb))
> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev)
>  	cancel_work_sync(&vb->update_balloon_size_work);
>  	cancel_work_sync(&vb->update_balloon_stats_work);
>
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> +		disable_hinting();
> +#endif
>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>  		cancel_work_sync(&vb->report_free_page_work);
>  		destroy_workqueue(vb->balloon_wq);
> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
>  	VIRTIO_BALLOON_F_MUST_TELL_HOST,
>  	VIRTIO_BALLOON_F_STATS_VQ,
>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> +	VIRTIO_BALLOON_F_HINTING,
>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>  	VIRTIO_BALLOON_F_PAGE_POISON,
>  };
> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
> index e800c6b07561..3ba8c1f3b4a4 100644
> --- a/include/linux/page_hinting.h
> +++ b/include/linux/page_hinting.h
> @@ -1,15 +1,12 @@
>  #include <linux/smpboot.h>
>
> -/*
> - * Size of the array which is used to store the freed pages is defined by
> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
> - * we can get rid of the hardcoded array size.
> - */
>  #define MAX_FGPT_ENTRIES	1000
>  /*
>   * hypervisor_pages - It is a dummy structure passed with the hypercall.
> - * @pfn: page frame number for the page which needs to be sent to the host.
> - * @order: order of the page needs to be reported to the host.
> + * @pfn - page frame number for the page which is to be freed.
> + * @pages - number of pages which are supposed to be freed.
> + * A global array object is used to to hold the list of pfn and pages and is
> + * passed as part of the hypercall.
>   */
>  struct hypervisor_pages {
>  	unsigned long pfn;
> @@ -19,11 +16,18 @@ struct hypervisor_pages {
>  extern int guest_page_hinting_flag;
>  extern struct static_key_false guest_page_hinting_key;
>  extern struct smp_hotplug_thread hinting_threads;
> +extern void (*request_hypercall)(void *, u64, int);
> +extern void *balloon_ptr;
>  extern bool want_page_poisoning;
>
>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>  			      void __user *buffer, size_t *lenp, loff_t *ppos);
>  void guest_free_page(struct page *page, int order);
> +extern int __isolate_free_page(struct page *page, unsigned int order);
> +extern void free_one_page(struct zone *zone,
> +			  struct page *page, unsigned long pfn,
> +			  unsigned int order,
> +			  int migratetype);
>
>  static inline void disable_page_poisoning(void)
>  {

I guess you will want to put this in some other header. Function
declarations belong close to where they are implemented, not used.
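For instance (sketch only - mm/internal.h already carries the
__isolate_free_page() declaration, at least in current trees, so only
free_one_page() would need a new home):

        /* mm/internal.h */
        extern void free_one_page(struct zone *zone, struct page *page,
                                  unsigned long pfn, unsigned int order,
                                  int migratetype);

Which also suggests the scanning code itself might be better off living
under mm/ than under virt/kvm/, so it can simply include mm/internal.h.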
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index a1966cd7b677..2b0f62814e22 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -36,6 +36,7 @@
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> +#define VIRTIO_BALLOON_F_HINTING	5 /* Page hinting virtqueue */
>
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d295c9bc01a8..93224cba9243 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
>  	spin_unlock(&zone->lock);
>  }
>
> -static void free_one_page(struct zone *zone,
> +void free_one_page(struct zone *zone,
>  				struct page *page, unsigned long pfn,
>  				unsigned int order,
>  				int migratetype)
> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> index be529f6f2bc0..315099fcda43 100644
> --- a/virt/kvm/page_hinting.c
> +++ b/virt/kvm/page_hinting.c
> @@ -1,6 +1,8 @@
>  #include <linux/gfp.h>
>  #include <linux/mm.h>
> +#include <linux/page_ref.h>
>  #include <linux/kvm_host.h>
> +#include <linux/sort.h>
>  #include <linux/kernel.h>
>
>  /*
> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
>  EXPORT_SYMBOL(guest_page_hinting_flag);
>  static DEFINE_PER_CPU(struct task_struct *, hinting_task);
>
> +void (*request_hypercall)(void *, u64, int);
> +EXPORT_SYMBOL(request_hypercall);
> +void *balloon_ptr;
> +EXPORT_SYMBOL(balloon_ptr);
> +
>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>  			      void __user *buffer, size_t *lenp,
>  			      loff_t *ppos)
> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>  	return ret;
>  }
>
> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
> +{
> +	int i = 0;
> +	int mt = 0;
> +
> +	if (balloon_ptr)
> +		request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
> +				  entries);
> +
> +	while (i < entries) {
> +		struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
> +
> +		mt = get_pageblock_migratetype(page);
> +		free_one_page(page_zone(page), page, page_to_pfn(page),
> +			      guest_isolated_pages[i].order, mt);
> +		i++;
> +	}
> +}
> +
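To make the per-entry cost on the other side concrete: for every entry
in this array the host ends up doing roughly the below (pseudo code, not
the actual QEMU implementation - gpa_to_hva() is a stand-in for the
device's GPA->HVA translation, and "pages" mirrors your struct
hypervisor_pages layout):

        for (i = 0; i < entries; i++) {
                void *hva = gpa_to_hva(pages[i].pfn << VIRTIO_BALLOON_PFN_SHIFT);
                size_t len = (size_t)4096 << pages[i].order;

                /* drop the backing memory of the reported range */
                madvise(hva, len, MADV_DONTNEED);
        }

So that is one madvise() per entry - yet another reason to prefer a few
big entries over a pile of 4K ones.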
> +struct page *get_buddy_page(struct page *page)
> +{
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned int order;
> +
> +	for (order = 0; order < MAX_ORDER; order++) {
> +		struct page *page_head = page - (pfn & ((1 << order) - 1));
> +
> +		if (PageBuddy(page_head) && page_private(page_head) >= order)
> +			return page_head;
> +	}
> +	return NULL;
> +}
> +
>  static void hinting_fn(unsigned int cpu)
>  {
>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +	int idx = 0, ret = 0;
> +	struct zone *zone_cur;
> +	unsigned long flags = 0;
> +
> +	while (idx < MAX_FGPT_ENTRIES) {
> +		unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
> +		unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
> +			(1 << page_hinting_obj->kvm_pt[idx].order) - 1;
> +
> +		while (pfn <= pfn_end) {
> +			struct page *page = pfn_to_page(pfn);
> +			struct page *buddy_page = NULL;
> +
> +			zone_cur = page_zone(page);
> +			spin_lock_irqsave(&zone_cur->lock, flags);
> +
> +			if (PageCompound(page)) {
> +				struct page *head_page = compound_head(page);
> +				unsigned long head_pfn = page_to_pfn(head_page);
> +				unsigned int alloc_pages =
> +					1 << compound_order(head_page);
> +
> +				pfn = head_pfn + alloc_pages;
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +
> +			if (page_ref_count(page)) {
> +				pfn++;
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +
> +			if (PageBuddy(page)) {
> +				int buddy_order = page_private(page);
> +
> +				ret = __isolate_free_page(page, buddy_order);
> +				if (!ret) {
> +				} else {
> +					int l_idx = page_hinting_obj->hyp_idx;
> +					struct hypervisor_pages *l_obj =
> +						page_hinting_obj->hypervisor_pagelist;
> +
> +					l_obj[l_idx].pfn = pfn;
> +					l_obj[l_idx].order = buddy_order;
> +					page_hinting_obj->hyp_idx += 1;
> +				}
> +				pfn = pfn + (1 << buddy_order);
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +
> +			buddy_page = get_buddy_page(page);
> +			if (buddy_page) {
> +				int buddy_order = page_private(buddy_page);
> +
> +				ret = __isolate_free_page(buddy_page,
> +							  buddy_order);
> +				if (!ret) {
> +				} else {
> +					int l_idx = page_hinting_obj->hyp_idx;
> +					struct hypervisor_pages *l_obj =
> +						page_hinting_obj->hypervisor_pagelist;
> +					unsigned long buddy_pfn =
> +						page_to_pfn(buddy_page);
> +
> +					l_obj[l_idx].pfn = buddy_pfn;
> +					l_obj[l_idx].order = buddy_order;
> +					page_hinting_obj->hyp_idx += 1;
> +				}
> +				pfn = page_to_pfn(buddy_page) +
> +					(1 << buddy_order);
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +			spin_unlock_irqrestore(&zone_cur->lock, flags);
> +			pfn++;
> +		}
> +		page_hinting_obj->kvm_pt[idx].pfn = 0;
> +		page_hinting_obj->kvm_pt[idx].order = -1;
> +		page_hinting_obj->kvm_pt[idx].zonenum = -1;
> +		idx++;
> +	}
> +	if (page_hinting_obj->hyp_idx > 0) {
> +		hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
> +				page_hinting_obj->hyp_idx);
> +		page_hinting_obj->hyp_idx = 0;
> +	}
>  	page_hinting_obj->kvm_pt_idx = 0;
>  	put_cpu_var(hinting_obj);
>  }
>
> +int if_exist(struct page *page)
> +{
> +	int i = 0;
> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
> +			return 1;
> +		i++;
> +	}
> +	return 0;
> +}
> +
> +void pack_array(void)
> +{
> +	int i = 0, j = 0;
> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		if (page_hinting_obj->kvm_pt[i].pfn != 0) {
> +			if (i != j) {
> +				page_hinting_obj->kvm_pt[j].pfn =
> +					page_hinting_obj->kvm_pt[i].pfn;
> +				page_hinting_obj->kvm_pt[j].order =
> +					page_hinting_obj->kvm_pt[i].order;
> +				page_hinting_obj->kvm_pt[j].zonenum =
> +					page_hinting_obj->kvm_pt[i].zonenum;
> +			}
> +			j++;
> +		}
> +		i++;
> +	}
> +	i = j;
> +	page_hinting_obj->kvm_pt_idx = j;
> +	while (j < MAX_FGPT_ENTRIES) {
> +		page_hinting_obj->kvm_pt[j].pfn = 0;
> +		page_hinting_obj->kvm_pt[j].order = -1;
> +		page_hinting_obj->kvm_pt[j].zonenum = -1;
> +		j++;
> +	}
> +}
> +
>  void scan_array(void)
>  {
>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +	int i = 0;
>
> +	while (i < MAX_FGPT_ENTRIES) {
> +		struct page *page =
> +			pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
> +		struct page *buddy_page = get_buddy_page(page);
> +
> +		if (!PageBuddy(page) && buddy_page) {
> +			if (if_exist(buddy_page)) {
> +				page_hinting_obj->kvm_pt[i].pfn = 0;
> +				page_hinting_obj->kvm_pt[i].order = -1;
> +				page_hinting_obj->kvm_pt[i].zonenum = -1;
> +			} else {
> +				page_hinting_obj->kvm_pt[i].pfn =
> +					page_to_pfn(buddy_page);
> +				page_hinting_obj->kvm_pt[i].order =
> +					page_private(buddy_page);
> +			}
> +		}
> +		i++;
> +	}
> +	pack_array();
>  	if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
>  		wake_up_process(__this_cpu_read(hinting_task));
>  }
> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
>  		page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
>  							order;
>  		page_hinting_obj->kvm_pt_idx += 1;
> -		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> +		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
> +			/*
> +			 * We are depending on the buddy free-list to identify
> +			 * if a page is free or not. Hence, we are dumping all
> +			 * the per-cpu pages back into the buddy allocator. This
> +			 * will ensure less failures when we try to isolate free
> +			 * captured pages and hence more memory reporting to the
> +			 * host.
> +			 */
> +			drain_local_pages(NULL);
>  			scan_array();
> +		}
>  	}
>  	local_irq_restore(flags);
>  }
> --
> 2.17.2