On 2/5/19 3:45 PM, Michael S. Tsirkin wrote: > On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote: >> This patch enables the kernel to scan the per cpu array and >> compress it by removing the repetitive/re-allocated pages. >> Once the per cpu array is completely filled with pages in the >> buddy it wakes up the kernel per cpu thread which re-scans the >> entire per cpu array by acquiring a zone lock corresponding to >> the page which is being scanned. If the page is still free and >> present in the buddy it tries to isolate the page and adds it >> to another per cpu array. >> >> Once this scanning process is complete and if there are any >> isolated pages added to the new per cpu array kernel thread >> invokes hyperlist_ready(). >> >> In hyperlist_ready() a hypercall is made to report these pages to >> the host using the virtio-balloon framework. In order to do so >> another virtqueue 'hinting_vq' is added to the balloon framework. >> As the host frees all the reported pages, the kernel thread returns >> them back to the buddy. >> >> Signed-off-by: Nitesh Narayan Lal <nitesh@xxxxxxxxxx> > > This looks kind of like what early iterations of Wei's patches did. > > But this has lots of issues, for example you might end up with > a hypercall per a 4K page. > So in the end, he switched over to just reporting only > MAX_ORDER - 1 pages. You mean that I should only capture/attempt to isolate pages with order MAX_ORDER - 1? > > Would that be a good idea for you too? Will it help if we have a threshold value based on the amount of memory captured instead of the number of entries/pages in the array? > > An alternative would be a different much lighter weight > way to report these pages and to free them on the host. > >> --- >> drivers/virtio/virtio_balloon.c | 56 +++++++- >> include/linux/page_hinting.h | 18 ++- >> include/uapi/linux/virtio_balloon.h | 1 + >> mm/page_alloc.c | 2 +- >> virt/kvm/page_hinting.c | 202 +++++++++++++++++++++++++++- >> 5 files changed, 269 insertions(+), 10 deletions(-) >> >> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c >> index 728ecd1eea30..8af34e0b9a32 100644 >> --- a/drivers/virtio/virtio_balloon.c >> +++ b/drivers/virtio/virtio_balloon.c >> @@ -57,13 +57,15 @@ enum virtio_balloon_vq { >> VIRTIO_BALLOON_VQ_INFLATE, >> VIRTIO_BALLOON_VQ_DEFLATE, >> VIRTIO_BALLOON_VQ_STATS, >> + VIRTIO_BALLOON_VQ_HINTING, >> VIRTIO_BALLOON_VQ_FREE_PAGE, >> VIRTIO_BALLOON_VQ_MAX >> }; >> >> struct virtio_balloon { >> struct virtio_device *vdev; >> - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq; >> + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq, >> + *hinting_vq; >> >> /* Balloon's own wq for cpu-intensive work items */ >> struct workqueue_struct *balloon_wq; >> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = { >> { 0 }, >> }; >> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING >> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr, >> + int hyper_entries) >> +{ >> + u64 gpaddr = virt_to_phys((void *)gvaddr); >> + >> + virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0); >> + virtqueue_kick_sync(vb->hinting_vq); >> +} >> + >> +static void hinting_ack(struct virtqueue *vq) >> +{ >> + struct virtio_balloon *vb = vq->vdev->priv; >> + >> + wake_up(&vb->acked); >> +} >> + >> +static void enable_hinting(struct virtio_balloon *vb) >> +{ >> + guest_page_hinting_flag = 1; >> + static_branch_enable(&guest_page_hinting_key); >> + request_hypercall = (void *)&virtballoon_page_hinting; >> + balloon_ptr = vb; >> + WARN_ON(smpboot_register_percpu_thread(&hinting_threads)); >> +} >> + >> +static void disable_hinting(void) >> +{ >> + guest_page_hinting_flag = 0; >> + static_branch_enable(&guest_page_hinting_key); >> + balloon_ptr = NULL; >> +} >> +#endif >> + >> static u32 page_to_balloon_pfn(struct page *page) >> { >> unsigned long pfn = page_to_pfn(page); >> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb) >> names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate"; >> names[VIRTIO_BALLOON_VQ_STATS] = NULL; >> names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; >> + names[VIRTIO_BALLOON_VQ_HINTING] = NULL; >> >> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { >> names[VIRTIO_BALLOON_VQ_STATS] = "stats"; >> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb) >> callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; >> } >> >> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) { >> + names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq"; >> + callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack; >> + } >> err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, >> vqs, callbacks, names, NULL, NULL); >> if (err) >> return err; >> >> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) >> + vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING]; >> + >> vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE]; >> vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE]; >> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { >> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev) >> if (err) >> goto out_del_balloon_wq; >> } >> + >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING >> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) >> + enable_hinting(vb); >> +#endif >> virtio_device_ready(vdev); >> >> if (towards_target(vb)) >> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev) >> cancel_work_sync(&vb->update_balloon_size_work); >> cancel_work_sync(&vb->update_balloon_stats_work); >> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING >> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) >> + disable_hinting(); >> +#endif >> if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { >> cancel_work_sync(&vb->report_free_page_work); >> destroy_workqueue(vb->balloon_wq); >> @@ -1009,6 +1062,7 @@ static unsigned int features[] = { >> VIRTIO_BALLOON_F_MUST_TELL_HOST, >> VIRTIO_BALLOON_F_STATS_VQ, >> VIRTIO_BALLOON_F_DEFLATE_ON_OOM, >> + VIRTIO_BALLOON_F_HINTING, >> VIRTIO_BALLOON_F_FREE_PAGE_HINT, >> VIRTIO_BALLOON_F_PAGE_POISON, >> }; >> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h >> index e800c6b07561..3ba8c1f3b4a4 100644 >> --- a/include/linux/page_hinting.h >> +++ b/include/linux/page_hinting.h >> @@ -1,15 +1,12 @@ >> #include <linux/smpboot.h> >> >> -/* >> - * Size of the array which is used to store the freed pages is defined by >> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which >> - * we can get rid of the hardcoded array size. >> - */ >> #define MAX_FGPT_ENTRIES 1000 >> /* >> * hypervisor_pages - It is a dummy structure passed with the hypercall. >> - * @pfn: page frame number for the page which needs to be sent to the host. >> - * @order: order of the page needs to be reported to the host. >> + * @pfn - page frame number for the page which is to be freed. >> + * @pages - number of pages which are supposed to be freed. >> + * A global array object is used to to hold the list of pfn and pages and is >> + * passed as part of the hypercall. >> */ >> struct hypervisor_pages { >> unsigned long pfn; >> @@ -19,11 +16,18 @@ struct hypervisor_pages { >> extern int guest_page_hinting_flag; >> extern struct static_key_false guest_page_hinting_key; >> extern struct smp_hotplug_thread hinting_threads; >> +extern void (*request_hypercall)(void *, u64, int); >> +extern void *balloon_ptr; >> extern bool want_page_poisoning; >> >> int guest_page_hinting_sysctl(struct ctl_table *table, int write, >> void __user *buffer, size_t *lenp, loff_t *ppos); >> void guest_free_page(struct page *page, int order); >> +extern int __isolate_free_page(struct page *page, unsigned int order); >> +extern void free_one_page(struct zone *zone, >> + struct page *page, unsigned long pfn, >> + unsigned int order, >> + int migratetype); >> >> static inline void disable_page_poisoning(void) >> { > I guess you will want to put this in some other header. Function > declarations belong close to where they are implemented, not used. I will find a better place. > >> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h >> index a1966cd7b677..2b0f62814e22 100644 >> --- a/include/uapi/linux/virtio_balloon.h >> +++ b/include/uapi/linux/virtio_balloon.h >> @@ -36,6 +36,7 @@ >> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ >> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ >> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ >> +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */ >> >> /* Size of a PFN in the balloon interface. */ >> #define VIRTIO_BALLOON_PFN_SHIFT 12 >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c >> index d295c9bc01a8..93224cba9243 100644 >> --- a/mm/page_alloc.c >> +++ b/mm/page_alloc.c >> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, >> spin_unlock(&zone->lock); >> } >> >> -static void free_one_page(struct zone *zone, >> +void free_one_page(struct zone *zone, >> struct page *page, unsigned long pfn, >> unsigned int order, >> int migratetype) >> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c >> index be529f6f2bc0..315099fcda43 100644 >> --- a/virt/kvm/page_hinting.c >> +++ b/virt/kvm/page_hinting.c >> @@ -1,6 +1,8 @@ >> #include <linux/gfp.h> >> #include <linux/mm.h> >> +#include <linux/page_ref.h> >> #include <linux/kvm_host.h> >> +#include <linux/sort.h> >> #include <linux/kernel.h> >> >> /* >> @@ -39,6 +41,11 @@ int guest_page_hinting_flag; >> EXPORT_SYMBOL(guest_page_hinting_flag); >> static DEFINE_PER_CPU(struct task_struct *, hinting_task); >> >> +void (*request_hypercall)(void *, u64, int); >> +EXPORT_SYMBOL(request_hypercall); >> +void *balloon_ptr; >> +EXPORT_SYMBOL(balloon_ptr); >> + >> int guest_page_hinting_sysctl(struct ctl_table *table, int write, >> void __user *buffer, size_t *lenp, >> loff_t *ppos) >> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write, >> return ret; >> } >> >> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries) >> +{ >> + int i = 0; >> + int mt = 0; >> + >> + if (balloon_ptr) >> + request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0], >> + entries); >> + >> + while (i < entries) { >> + struct page *page = pfn_to_page(guest_isolated_pages[i].pfn); >> + >> + mt = get_pageblock_migratetype(page); >> + free_one_page(page_zone(page), page, page_to_pfn(page), >> + guest_isolated_pages[i].order, mt); >> + i++; >> + } >> +} >> + >> +struct page *get_buddy_page(struct page *page) >> +{ >> + unsigned long pfn = page_to_pfn(page); >> + unsigned int order; >> + >> + for (order = 0; order < MAX_ORDER; order++) { >> + struct page *page_head = page - (pfn & ((1 << order) - 1)); >> + >> + if (PageBuddy(page_head) && page_private(page_head) >= order) >> + return page_head; >> + } >> + return NULL; >> +} >> + >> static void hinting_fn(unsigned int cpu) >> { >> struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj); >> + int idx = 0, ret = 0; >> + struct zone *zone_cur; >> + unsigned long flags = 0; >> + >> + while (idx < MAX_FGPT_ENTRIES) { >> + unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn; >> + unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn + >> + (1 << page_hinting_obj->kvm_pt[idx].order) - 1; >> + >> + while (pfn <= pfn_end) { >> + struct page *page = pfn_to_page(pfn); >> + struct page *buddy_page = NULL; >> + >> + zone_cur = page_zone(page); >> + spin_lock_irqsave(&zone_cur->lock, flags); >> + >> + if (PageCompound(page)) { >> + struct page *head_page = compound_head(page); >> + unsigned long head_pfn = page_to_pfn(head_page); >> + unsigned int alloc_pages = >> + 1 << compound_order(head_page); >> + >> + pfn = head_pfn + alloc_pages; >> + spin_unlock_irqrestore(&zone_cur->lock, flags); >> + continue; >> + } >> + >> + if (page_ref_count(page)) { >> + pfn++; >> + spin_unlock_irqrestore(&zone_cur->lock, flags); >> + continue; >> + } >> + >> + if (PageBuddy(page)) { >> + int buddy_order = page_private(page); >> >> + ret = __isolate_free_page(page, buddy_order); >> + if (!ret) { >> + } else { >> + int l_idx = page_hinting_obj->hyp_idx; >> + struct hypervisor_pages *l_obj = >> + page_hinting_obj->hypervisor_pagelist; >> + >> + l_obj[l_idx].pfn = pfn; >> + l_obj[l_idx].order = buddy_order; >> + page_hinting_obj->hyp_idx += 1; >> + } >> + pfn = pfn + (1 << buddy_order); >> + spin_unlock_irqrestore(&zone_cur->lock, flags); >> + continue; >> + } >> + >> + buddy_page = get_buddy_page(page); >> + if (buddy_page) { >> + int buddy_order = page_private(buddy_page); >> + >> + ret = __isolate_free_page(buddy_page, >> + buddy_order); >> + if (!ret) { >> + } else { >> + int l_idx = page_hinting_obj->hyp_idx; >> + struct hypervisor_pages *l_obj = >> + page_hinting_obj->hypervisor_pagelist; >> + unsigned long buddy_pfn = >> + page_to_pfn(buddy_page); >> + >> + l_obj[l_idx].pfn = buddy_pfn; >> + l_obj[l_idx].order = buddy_order; >> + page_hinting_obj->hyp_idx += 1; >> + } >> + pfn = page_to_pfn(buddy_page) + >> + (1 << buddy_order); >> + spin_unlock_irqrestore(&zone_cur->lock, flags); >> + continue; >> + } >> + spin_unlock_irqrestore(&zone_cur->lock, flags); >> + pfn++; >> + } >> + page_hinting_obj->kvm_pt[idx].pfn = 0; >> + page_hinting_obj->kvm_pt[idx].order = -1; >> + page_hinting_obj->kvm_pt[idx].zonenum = -1; >> + idx++; >> + } >> + if (page_hinting_obj->hyp_idx > 0) { >> + hyperlist_ready(page_hinting_obj->hypervisor_pagelist, >> + page_hinting_obj->hyp_idx); >> + page_hinting_obj->hyp_idx = 0; >> + } >> page_hinting_obj->kvm_pt_idx = 0; >> put_cpu_var(hinting_obj); >> } >> >> +int if_exist(struct page *page) >> +{ >> + int i = 0; >> + struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj); >> + >> + while (i < MAX_FGPT_ENTRIES) { >> + if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn) >> + return 1; >> + i++; >> + } >> + return 0; >> +} >> + >> +void pack_array(void) >> +{ >> + int i = 0, j = 0; >> + struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj); >> + >> + while (i < MAX_FGPT_ENTRIES) { >> + if (page_hinting_obj->kvm_pt[i].pfn != 0) { >> + if (i != j) { >> + page_hinting_obj->kvm_pt[j].pfn = >> + page_hinting_obj->kvm_pt[i].pfn; >> + page_hinting_obj->kvm_pt[j].order = >> + page_hinting_obj->kvm_pt[i].order; >> + page_hinting_obj->kvm_pt[j].zonenum = >> + page_hinting_obj->kvm_pt[i].zonenum; >> + } >> + j++; >> + } >> + i++; >> + } >> + i = j; >> + page_hinting_obj->kvm_pt_idx = j; >> + while (j < MAX_FGPT_ENTRIES) { >> + page_hinting_obj->kvm_pt[j].pfn = 0; >> + page_hinting_obj->kvm_pt[j].order = -1; >> + page_hinting_obj->kvm_pt[j].zonenum = -1; >> + j++; >> + } >> +} >> + >> void scan_array(void) >> { >> struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj); >> + int i = 0; >> >> + while (i < MAX_FGPT_ENTRIES) { >> + struct page *page = >> + pfn_to_page(page_hinting_obj->kvm_pt[i].pfn); >> + struct page *buddy_page = get_buddy_page(page); >> + >> + if (!PageBuddy(page) && buddy_page) { >> + if (if_exist(buddy_page)) { >> + page_hinting_obj->kvm_pt[i].pfn = 0; >> + page_hinting_obj->kvm_pt[i].order = -1; >> + page_hinting_obj->kvm_pt[i].zonenum = -1; >> + } else { >> + page_hinting_obj->kvm_pt[i].pfn = >> + page_to_pfn(buddy_page); >> + page_hinting_obj->kvm_pt[i].order = >> + page_private(buddy_page); >> + } >> + } >> + i++; >> + } >> + pack_array(); >> if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) >> wake_up_process(__this_cpu_read(hinting_task)); >> } >> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order) >> page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order = >> order; >> page_hinting_obj->kvm_pt_idx += 1; >> - if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) >> + if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) { >> + /* >> + * We are depending on the buddy free-list to identify >> + * if a page is free or not. Hence, we are dumping all >> + * the per-cpu pages back into the buddy allocator. This >> + * will ensure less failures when we try to isolate free >> + * captured pages and hence more memory reporting to the >> + * host. >> + */ >> + drain_local_pages(NULL); >> scan_array(); >> + } >> } >> local_irq_restore(flags); >> } >> -- >> 2.17.2 -- Regards Nitesh
Attachment:
signature.asc
Description: OpenPGP digital signature