On Mon, 11 Jun 2018 11:19:00 -0400 nilal@xxxxxxxxxx wrote: > From: Nitesh Narayan Lal <nilal@xxxxxxxxxx> > > This patch creates a new vq (hinting_vq) to be used for page hinting > and adds support in the existing virtio balloon infrastructure so > that the hyper list carrying pages which are supposed to be freed > could be sent to the host (QEMU) for processing by using hinting_vq. > > Signed-off-by: Nitesh Narayan Lal <nilal@xxxxxxxxxx> > --- > drivers/virtio/virtio_balloon.c | 99 ++++++++++++++++++++++++++++++++++++- > include/linux/page_hinting.h | 16 ++++++ > include/uapi/linux/virtio_balloon.h | 1 + > virt/kvm/page_hinting.c | 36 ++++++-------- > 4 files changed, 129 insertions(+), 23 deletions(-) > create mode 100644 include/linux/page_hinting.h > > diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c > index 6b237e3..217523f 100644 > --- a/drivers/virtio/virtio_balloon.c > +++ b/drivers/virtio/virtio_balloon.c > @@ -32,6 +32,7 @@ > #include <linux/mm.h> > #include <linux/mount.h> > #include <linux/magic.h> > +#include <linux/page_hinting.h> > > /* > * Balloon device works in 4K page units. So each page is pointed to by > @@ -53,8 +54,11 @@ static struct vfsmount *balloon_mnt; > > struct virtio_balloon { > struct virtio_device *vdev; > +#ifdef CONFIG_KVM_FREE_PAGE_HINTING > + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *hinting_vq; > +#else > struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; > - > +#endif This is duplicating the exiting virtqueue pointers. It's only hinting_vq that has to be protected by CONFIG_KVM_FREE_PAGE_HINTING. > /* The balloon servicing is delegated to a freezable workqueue. */ > struct work_struct update_balloon_stats_work; > struct work_struct update_balloon_size_work; > @@ -95,6 +99,33 @@ static struct virtio_device_id id_table[] = { > { 0 }, > }; > > +#ifdef CONFIG_KVM_FREE_PAGE_HINTING > +static void tell_host_one_page(struct virtio_balloon *vb, struct virtqueue *vq, > + u64 gvaddr, int len) > +{ I think this function actually tells the host the whole array at once, right (vs. a single page). In this case, maybe it should be renamed to tell_host_free_pfns? > + unsigned int id = VIRTQUEUE_DESC_ID_INIT; > + u64 gpaddr = virt_to_phys((void *)gvaddr); > + > + virtqueue_add_chain_desc(vq, gpaddr, len, &id, &id, 0); > + virtqueue_add_chain(vq, id, 0, NULL, (void *)gpaddr, NULL); > +} > + > +void virtballoon_page_hinting(struct virtio_balloon *vb, int hyper_entries) > +{ > + u64 gvaddr = (u64)hypervisor_pagelist; > + > + vb->num_pfns = hyper_entries; > + tell_host_one_page(vb, vb->hinting_vq, gvaddr, hyper_entries); > +} > + > +static void hinting_ack(struct virtqueue *vq) > +{ > + struct virtio_balloon *vb = vq->vdev->priv; > + > + wake_up(&vb->acked); Honest question: is there a process to be woken up? If I understood correctly, the guest busy-waits for the host ACK, so the calling process is never put to sleep. Or did I misunderstand? > +} > +#endif > + > static u32 page_to_balloon_pfn(struct page *page) > { > unsigned long pfn = page_to_pfn(page); > @@ -425,6 +456,62 @@ static void update_balloon_size_func(struct work_struct *work) > queue_work(system_freezable_wq, work); > } > > +#ifdef CONFIG_KVM_FREE_PAGE_HINTING > +static int init_vqs(struct virtio_balloon *vb) > +{ I don't think you can duplicate init_vqs(). > + struct virtqueue *vqs[4]; > + vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request, > + hinting_ack }; > + static const char * const names[] = { "inflate", "deflate", "stats", > + "hinting" }; > + int err, nvqs; > + bool stats_vq_support, page_hinting_support; > + > + /* > + * We expect two virtqueues: inflate and deflate, and > + * optionally stat and hinting. > + */ > + stats_vq_support = virtio_has_feature(vb->vdev, > + VIRTIO_BALLOON_F_STATS_VQ); > + page_hinting_support = virtio_has_feature(vb->vdev, > + VIRTIO_GUEST_PAGE_HINTING_VQ > + ); > + if (stats_vq_support && page_hinting_support) > + nvqs = 4; > + else if (stats_vq_support || page_hinting_support) > + nvqs = 3; > + else > + nvqs = 2; > + > + err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL); > + if (err) > + return err; > + > + vb->inflate_vq = vqs[0]; > + vb->deflate_vq = vqs[1]; > + if (page_hinting_support) > + vb->hinting_vq = vqs[3]; > + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { > + struct scatterlist sg; > + unsigned int num_stats; > + > + vb->stats_vq = vqs[2]; > + > + /* > + * Prime this virtqueue with one buffer so the hypervisor can > + * use it to signal us later (it can't be broken yet!). > + */ > + num_stats = update_balloon_stats(vb); > + > + sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats); > + if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL) > + < 0) > + BUG(); > + virtqueue_kick(vb->stats_vq); > + } > + return 0; > +} > +#else > static int init_vqs(struct virtio_balloon *vb) > { > struct virtqueue *vqs[3]; > @@ -463,6 +550,8 @@ static int init_vqs(struct virtio_balloon *vb) > return 0; > } > > +#endif > + > #ifdef CONFIG_BALLOON_COMPACTION > /* > * virtballoon_migratepage - perform the balloon page migration on behalf of > @@ -604,6 +693,13 @@ static int virtballoon_probe(struct virtio_device *vdev) > > virtio_device_ready(vdev); > > +#ifdef CONFIG_KVM_FREE_PAGE_HINTING > + if (virtio_has_feature(vb->vdev, VIRTIO_GUEST_PAGE_HINTING_VQ)) { > + request_hypercall = (void *)&virtballoon_page_hinting; > + balloon_ptr = vb; > + } > +#endif Very minor, but I think it would be nicer if you used a function from virt/kvm/page_hinting.c instead of assigning global variables. Say: page_hinting_enable(vb, virtballoon_page_hinting); > + > if (towards_target(vb)) > virtballoon_changed(vdev); > return 0; > @@ -692,6 +788,7 @@ static unsigned int features[] = { > VIRTIO_BALLOON_F_MUST_TELL_HOST, > VIRTIO_BALLOON_F_STATS_VQ, > VIRTIO_BALLOON_F_DEFLATE_ON_OOM, > + VIRTIO_GUEST_PAGE_HINTING_VQ, > }; > > static struct virtio_driver virtio_balloon_driver = { > diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h > new file mode 100644 > index 0000000..0bfb646 > --- /dev/null > +++ b/include/linux/page_hinting.h > @@ -0,0 +1,16 @@ > +#define MAX_FGPT_ENTRIES 1000 > +/* > + * hypervisor_pages - It is a dummy structure passed with the hypercall. > + * @pfn - page frame number for the page which is to be freed. > + * @pages - number of pages which are supposed to be freed. > + * A global array object is used to to hold the list of pfn and pages and is > + * passed as part of the hypercall. > + */ > +struct hypervisor_pages { > + unsigned long pfn; > + unsigned int pages; > +}; > + > +extern struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES]; > +extern void (*request_hypercall)(void *, int); > +extern void *balloon_ptr; > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h > index 13b8cb5..16d299f 100644 > --- a/include/uapi/linux/virtio_balloon.h > +++ b/include/uapi/linux/virtio_balloon.h > @@ -34,6 +34,7 @@ > #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ > #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ > #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ > +#define VIRTIO_GUEST_PAGE_HINTING_VQ 3 /* Page hinting virtqueue */ > > /* Size of a PFN in the balloon interface. */ > #define VIRTIO_BALLOON_PFN_SHIFT 12 > diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c > index 97500bb..417582a 100644 > --- a/virt/kvm/page_hinting.c > +++ b/virt/kvm/page_hinting.c > @@ -5,8 +5,8 @@ > #include <linux/sort.h> > #include <linux/kernel.h> > #include <trace/events/kmem.h> > +#include <linux/page_hinting.h> > > -#define MAX_FGPT_ENTRIES 1000 > #define HYPERLIST_THRESHOLD 1 /* FIXME: find a good threshold */ > /* > * struct kvm_free_pages - Tracks the pages which are freed by the guest. > @@ -21,22 +21,15 @@ struct kvm_free_pages { > unsigned int pages; > }; > > -/* > - * hypervisor_pages - It is a dummy structure passed with the hypercall. > - * @pfn - page frame number for the page which is to be freed. > - * @pages - number of pages which are supposed to be freed. > - * A global array object is used to to hold the list of pfn and pages and is > - * passed as part of the hypercall. > - */ > -struct hypervisor_pages { > - unsigned long pfn; > - unsigned int pages; > -}; > - > static __cacheline_aligned_in_smp DEFINE_SEQLOCK(guest_page_lock); > DEFINE_PER_CPU(struct kvm_free_pages [MAX_FGPT_ENTRIES], kvm_pt); > DEFINE_PER_CPU(int, kvm_pt_idx); > struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES]; > +EXPORT_SYMBOL(hypervisor_pagelist); > +void (*request_hypercall)(void *, int); > +EXPORT_SYMBOL(request_hypercall); > +void *balloon_ptr; > +EXPORT_SYMBOL(balloon_ptr); > > static void empty_hyperlist(void) > { > @@ -49,13 +42,11 @@ static void empty_hyperlist(void) > } > } > > -void make_hypercall(void) > +void hyperlist_ready(int entries) > { > - /* > - * Dummy function: Tobe filled later. > - */ > - empty_hyperlist(); > trace_guest_str_dump("Hypercall to host...:"); > + request_hypercall(balloon_ptr, entries); > + empty_hyperlist(); > } > > static int sort_pfn(const void *a1, const void *b1) > @@ -156,7 +147,7 @@ int compress_hyperlist(void) > if (merge_counter != 0) > ret = pack_hyperlist() - 1; > else > - ret = MAX_FGPT_ENTRIES - 1; > + ret = MAX_FGPT_ENTRIES; > return ret; > } > > @@ -227,16 +218,16 @@ void arch_free_page_slowpath(void) > */ > if (!prev_free) { > hyper_idx++; > - hypervisor_pagelist[hyper_idx].pfn = pfn; > - hypervisor_pagelist[hyper_idx].pages = 1; > trace_guest_free_page_slowpath( > hypervisor_pagelist[hyper_idx].pfn, > hypervisor_pagelist[hyper_idx].pages); > + hypervisor_pagelist[hyper_idx].pfn = pfn; > + hypervisor_pagelist[hyper_idx].pages = 1; > if (hyper_idx == MAX_FGPT_ENTRIES - 1) { > hyper_idx = compress_hyperlist(); > if (hyper_idx >= > HYPERLIST_THRESHOLD) { > - make_hypercall(); > + hyperlist_ready(hyper_idx); > hyper_idx = 0; > } > } > @@ -272,6 +263,7 @@ void arch_alloc_page(struct page *page, int order) > * free pages is full and a hypercall will be made. Until complete free > * page list is traversed no further allocaiton will be allowed. > */ > + > do { > seq = read_seqbegin(&guest_page_lock); > } while (read_seqretry(&guest_page_lock, seq));