On Mon, 11 Jun 2018 11:20:05 -0400 nilal@xxxxxxxxxx wrote: > From: Nitesh Narayan Lal <nilal@xxxxxxxxxx> > > This patch enables QEMU to handle page hinting requests > from the guest. Once the guest kicks QEMU to free a list of > page, QEMU retrives the guest physical address in the list > and converts each to host virtual address and then > MADVISE that memory. > > Signed-off-by: Nitesh Narayan Lal <nilal@xxxxxxxxxx> > --- > hw/virtio/virtio-balloon.c | 117 +++++++++++++++++++++++- > hw/virtio/virtio.c | 23 +++++ > include/hw/virtio/virtio-access.h | 1 + > include/hw/virtio/virtio-balloon.h | 2 +- > include/qemu/osdep.h | 7 ++ > include/standard-headers/linux/virtio_balloon.h | 1 + > 6 files changed, 149 insertions(+), 2 deletions(-) > > diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c > index 1f7a87f..46deacb 100644 > --- a/hw/virtio/virtio-balloon.c > +++ b/hw/virtio/virtio-balloon.c > @@ -34,6 +34,8 @@ > > #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) > > +void page_hinting_request(uint64_t addr, uint32_t len); > + > static void balloon_page(void *addr, int deflate) > { > if (!qemu_balloon_is_inhibited() && (!kvm_enabled() || > @@ -77,11 +79,22 @@ static bool balloon_stats_supported(const VirtIOBalloon *s) > return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ); > } > > +static bool balloon_hinting_supported(const VirtIOBalloon *s) > +{ > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > + return virtio_vdev_has_feature(vdev, VIRTIO_GUEST_PAGE_HINTING_VQ); > +} > + > static bool balloon_stats_enabled(const VirtIOBalloon *s) > { > return s->stats_poll_interval > 0; > } > > +static bool page_hinting_enabled(const VirtIOBalloon *s) > +{ > + return s->stats_poll_interval > 0; > +} > + > static void balloon_stats_destroy_timer(VirtIOBalloon *s) > { > if (balloon_stats_enabled(s)) { > @@ -97,14 +110,20 @@ static void balloon_stats_change_timer(VirtIOBalloon *s, int64_t secs) > timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000); > } > > +static void page_hinting_change_timer(VirtIOBalloon *s, int64_t secs) > +{ > + timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000); > +} > + > static void balloon_stats_poll_cb(void *opaque) > { > VirtIOBalloon *s = opaque; > VirtIODevice *vdev = VIRTIO_DEVICE(s); > > - if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) { > + if (s->stats_vq_elem == NULL || !balloon_stats_supported(s) || !balloon_hinting_supported(s)) { > /* re-schedule */ > balloon_stats_change_timer(s, s->stats_poll_interval); > + page_hinting_change_timer(s, s->stats_poll_interval); > return; > } > > @@ -201,12 +220,101 @@ static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, > balloon_stats_change_timer(s, value); > return; > } > + > + if (page_hinting_enabled(s)) { > + /* timer interval change */ > + s->stats_poll_interval = value; > + page_hinting_change_timer(s, value); > + return; > + } > > /* create a new timer */ > g_assert(s->stats_timer == NULL); > s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s); > s->stats_poll_interval = value; > balloon_stats_change_timer(s, 0); > + /* create a new timer */ > + g_assert(s->stats_timer == NULL); > + s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s); > + s->stats_poll_interval = value; > + page_hinting_change_timer(s, 0); So, I'm not sure I understand the changes to the balloon stats polling code. Why is this needed? > +} > + > +static void *gpa2hva(MemoryRegion **p_mr, hwaddr addr, Error **errp) > +{ > + MemoryRegionSection mrs = memory_region_find(get_system_memory(), > + addr, 1); > + > + if (!mrs.mr) { > + error_setg(errp, "No memory is mapped at address 0x%" HWADDR_PRIx, addr); > + return NULL; > + } > + > + if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { > + error_setg(errp, "Memory at address 0x%" HWADDR_PRIx "is not RAM", addr); > + memory_region_unref(mrs.mr); > + return NULL; > + } > + > + *p_mr = mrs.mr; > + return qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); > +} > + > +struct guest_pages { > + unsigned long pfn; > + unsigned int pages; > +}; > + > + > +void page_hinting_request(uint64_t addr, uint32_t len) > +{ > + Error *local_err = NULL; > + MemoryRegion *mr = NULL; > + void *hvaddr; > + int ret = 0; > + struct guest_pages *guest_obj; > + int i = 0; > + void *hvaddr_to_free; > + unsigned long pfn, pfn_end; > + uint64_t gpaddr_to_free; > + > + hvaddr = gpa2hva(&mr, addr, &local_err); > + if (local_err) { > + error_report_err(local_err); > + return; > + } > + guest_obj = hvaddr; > + > + while (i < len) { > + pfn = guest_obj[i].pfn; > + pfn_end = guest_obj[i].pfn + guest_obj[i].pages - 1; > + while (pfn <= pfn_end) { > + gpaddr_to_free = pfn << VIRTIO_BALLOON_PFN_SHIFT; > + hvaddr_to_free = gpa2hva(&mr, gpaddr_to_free, &local_err); > + if (local_err) { > + error_report_err(local_err); > + return; > + } > + ret = qemu_madvise((void *)hvaddr_to_free, 4096, QEMU_MADV_FREE); I asked this question on the kernel patches, but if I understood correctly you're passing the whole free pages array to the host, right? If that's correct, would it work to madvise an entire region instead of going page by page? If that's possible, it would allow you to kill the inner while loop. > + if (ret == -1) > + printf("\n%d:%s Error: Madvise failed with error:%d\n", __LINE__, __func__, ret); > + pfn++; > + } > + i++; > + } > +} > + > + > +static void virtio_balloon_page_hinting(VirtIODevice *vdev, VirtQueue *vq) > +{ > + uint64_t addr; > + uint32_t len; > + VirtQueueElement elem = {}; > + > + pop_hinting_addr(vq, &addr, &len); > + page_hinting_request(addr, len); > + virtqueue_push(vq, &elem, 0); > + virtio_notify(vdev, vq); > } > > static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) > @@ -379,6 +487,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, > VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); > f |= dev->host_features; > virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); > + virtio_add_feature(&f, VIRTIO_GUEST_PAGE_HINTING_VQ); > return f; > } > > @@ -412,6 +521,9 @@ static int virtio_balloon_post_load_device(void *opaque, int version_id) > if (balloon_stats_enabled(s)) { > balloon_stats_change_timer(s, s->stats_poll_interval); > } > + if (page_hinting_enabled(s)) { > + page_hinting_change_timer(s, s->stats_poll_interval); > + } > return 0; > } > > @@ -448,6 +560,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) > s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); > + s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_page_hinting); > > reset_stats(s); > } > @@ -491,6 +604,8 @@ static void virtio_balloon_instance_init(Object *obj) > > object_property_add(obj, "guest-stats", "guest statistics", > balloon_stats_get_all, NULL, NULL, s, NULL); > + object_property_add(obj, "guest-page-hinting", "guest page hinting", > + NULL, NULL, NULL, s, NULL); > > object_property_add(obj, "guest-stats-polling-interval", "int", > balloon_stats_get_poll_interval, > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c > index 1debb01..11ddc13 100644 > --- a/hw/virtio/virtio.c > +++ b/hw/virtio/virtio.c > @@ -825,6 +825,29 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu > return elem; > } > > +void pop_hinting_addr(VirtQueue *vq, uint64_t *addr, uint32_t *len) > +{ > + VRingMemoryRegionCaches *caches; > + VRingDesc desc; > + MemoryRegionCache *desc_cache; > + VirtIODevice *vdev = vq->vdev; > + unsigned int head, max; > + > + max = vq->vring.num; > + if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) { > + printf("\n%d:%sError: Unable to read head\n", __LINE__, __func__); > + } > + > + caches = vring_get_region_caches(vq); > + if (caches->desc.len < max * sizeof(VRingDesc)) { > + virtio_error(vdev, "Cannot map descriptor ring"); > + } Should you really continue on error? > + desc_cache = &caches->desc; > + vring_desc_read(vdev, &desc, desc_cache, head); > + *addr = desc.addr; > + *len = desc.len; > +} > + > void *virtqueue_pop(VirtQueue *vq, size_t sz) > { > unsigned int i, head, max; > diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h > index 2e92074..568d71f 100644 > --- a/include/hw/virtio/virtio-access.h > +++ b/include/hw/virtio/virtio-access.h > @@ -24,6 +24,7 @@ > #define LEGACY_VIRTIO_IS_BIENDIAN 1 > #endif > > +void pop_hinting_addr(VirtQueue *vq, uint64_t *addr, uint32_t *len); > static inline bool virtio_access_is_big_endian(VirtIODevice *vdev) > { > #if defined(LEGACY_VIRTIO_IS_BIENDIAN) > diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h > index e0df352..774498a 100644 > --- a/include/hw/virtio/virtio-balloon.h > +++ b/include/hw/virtio/virtio-balloon.h > @@ -32,7 +32,7 @@ typedef struct virtio_balloon_stat_modern { > > typedef struct VirtIOBalloon { > VirtIODevice parent_obj; > - VirtQueue *ivq, *dvq, *svq; > + VirtQueue *ivq, *dvq, *svq, *hvq; > uint32_t num_pages; > uint32_t actual; > uint64_t stats[VIRTIO_BALLOON_S_NR]; > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > index 4165806..dc62f17 100644 > --- a/include/qemu/osdep.h > +++ b/include/qemu/osdep.h > @@ -305,6 +305,11 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #else > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > #endif > +#ifdef MADV_FREE > +#define QEMU_MADV_FREE MADV_FREE > +#else > +#define QEMU_MADV_FREE QEMU_MADV_INVALID > +#endif > > #elif defined(CONFIG_POSIX_MADVISE) > > @@ -318,6 +323,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > +#define QEMU_MADV_FREE QEMU_MADV_INVALID > > #else /* no-op */ > > @@ -331,6 +337,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > +#define QEMU_MADV_FREE QEMU_MADV_INVALID > > #endif > > diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h > index e446805..1372da1 100644 > --- a/include/standard-headers/linux/virtio_balloon.h > +++ b/include/standard-headers/linux/virtio_balloon.h > @@ -34,6 +34,7 @@ > #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ > #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ > #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ > +#define VIRTIO_GUEST_PAGE_HINTING_VQ 3 /* Page hinting virtqueue */ > > /* Size of a PFN in the balloon interface. */ > #define VIRTIO_BALLOON_PFN_SHIFT 12