Hi Nitesh, > > This patch enables QEMU to handle page hinting requests > from the guest. Once the guest kicks QEMU to free a page, > QEMU retrives the guest physical address and converts it to > host virtual address and then MADVISE that memory. > > Signed-off-by: Nitesh Narayan Lal <nilal@xxxxxxxxxx> > --- > hw/virtio/virtio-balloon.c | 81 > ++++++++++++++++++++++++++++++++++++++ > hw/virtio/virtio.c | 24 +++++++++++ > include/hw/virtio/virtio-access.h | 1 + > include/hw/virtio/virtio-balloon.h | 2 +- > include/qemu/osdep.h | 7 ++++ > 5 files changed, 114 insertions(+), 1 deletion(-) > > diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c > index 37cde38..de10350 100644 > --- a/hw/virtio/virtio-balloon.c > +++ b/hw/virtio/virtio-balloon.c > @@ -33,6 +33,8 @@ > > #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) > > +void page_hinting_request(uint64_t addr, uint32_t len); > + > static void balloon_page(void *addr, int deflate) > { > if (!qemu_balloon_is_inhibited() && (!kvm_enabled() || > @@ -205,6 +207,84 @@ static void balloon_stats_set_poll_interval(Object *obj, > Visitor *v, > balloon_stats_change_timer(s, 0); > } > > +static void *gpa2hva(MemoryRegion **p_mr, hwaddr addr, Error **errp) > +{ > + MemoryRegionSection mrs = memory_region_find(get_system_memory(), > + addr, 1); > + > + if (!mrs.mr) { > + error_setg(errp, "No memory is mapped at address 0x%" HWADDR_PRIx, > addr); > + return NULL; > + } > + > + if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { > + error_setg(errp, "Memory at address 0x%" HWADDR_PRIx "is not RAM", > addr); > + memory_region_unref(mrs.mr); > + return NULL; > + } > + > + *p_mr = mrs.mr; > + return qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); > +} > + > +struct guest_pages { > + unsigned long pfn; > + unsigned int pages; > +}; > + > + > +void page_hinting_request(uint64_t addr, uint32_t len) > +{ > + Error *local_err = NULL; > + MemoryRegion *mr = NULL; > + void *hvaddr; > + 
int ret = 0; > + struct guest_pages *guest_obj; > + int i = 0; > + void *hvaddr_to_free; > + unsigned long pfn, pfn_end; > + uint64_t gpaddr_to_free; > + > + /*ptr is the host physical address*/ I could not understand this comment: no variable named 'ptr' is declared here, and the value returned by gpa2hva() below is a host virtual address, not a host physical address. > + hvaddr = gpa2hva(&mr, addr, &local_err); > + if (local_err) { > + error_report_err(local_err); > + return; > + } > + guest_obj = hvaddr; > + > + while (i < len) { > + pfn = guest_obj[i].pfn; > + pfn_end = guest_obj[i].pfn + guest_obj[i].pages - 1; > + while (pfn <= pfn_end) { > + gpaddr_to_free = pfn << VIRTIO_BALLOON_PFN_SHIFT; > + hvaddr_to_free = gpa2hva(&mr, gpaddr_to_free, &local_err); > + if (local_err) { > + error_report_err(local_err); > + return; > + } > + ret = qemu_madvise((void *)hvaddr_to_free, 4096, QEMU_MADV_FREE); > + if (ret == -1) > + printf("\n%d:%s Error: Madvise failed with error:%d\n", __LINE__, > __func__, ret); > + pfn++; > + } > + i++; > + } > +} > + > + > +static void virtio_balloon_page_hinting(VirtIODevice *vdev, VirtQueue *vq) > +{ > + uint64_t addr; > + uint32_t len; > + VirtQueueElement elem = {}; > + > + pop_hinting_addr(vq, &addr, &len); > + page_hinting_request(addr, len); > + virtqueue_push(vq, &elem, 0); > + virtio_notify(vdev, vq); > +} > + > static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) > { > VirtIOBalloon *s = VIRTIO_BALLOON(vdev); > @@ -443,6 +523,7 @@ static void virtio_balloon_device_realize(DeviceState > *dev, Error **errp) > s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); > s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); > + s->hvq = virtio_add_queue(vdev, 128, virtio_balloon_page_hinting); > > reset_stats(s); > } > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c > index 311929e..24e26ec 100644 > --- a/hw/virtio/virtio.c > +++ b/hw/virtio/virtio.c > @@ -825,6 +825,30 @@ static void *virtqueue_alloc_element(size_t sz, unsigned > out_num, unsigned in_nu 
> return elem; > } > > +void pop_hinting_addr(VirtQueue *vq, uint64_t *addr, uint32_t *len) > +{ > + VRingMemoryRegionCaches *caches; > + VRingDesc desc; > + MemoryRegionCache *desc_cache; > + VirtIODevice *vdev = vq->vdev; > + unsigned int head, i, max; > + > + max = vq->vring.num; > + if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) { > + printf("\n%d:%sError: Unable to read head\n", __LINE__, __func__); > + } > + i = head; Can we avoid 'i' here and pass 'head' directly to the 'vring_desc_read' function? > + > + caches = vring_get_region_caches(vq); > + if (caches->desc.len < max * sizeof(VRingDesc)) { > + virtio_error(vdev, "Cannot map descriptor ring"); > + } > + desc_cache = &caches->desc; > + vring_desc_read(vdev, &desc, desc_cache, i); > + *addr = desc.addr; > + *len = desc.len; > +} > + > void *virtqueue_pop(VirtQueue *vq, size_t sz) > { > unsigned int i, head, max; > diff --git a/include/hw/virtio/virtio-access.h > b/include/hw/virtio/virtio-access.h > index 2e92074..568d71f 100644 > --- a/include/hw/virtio/virtio-access.h > +++ b/include/hw/virtio/virtio-access.h > @@ -24,6 +24,7 @@ > #define LEGACY_VIRTIO_IS_BIENDIAN 1 > #endif > > +void pop_hinting_addr(VirtQueue *vq, uint64_t *addr, uint32_t *len); > static inline bool virtio_access_is_big_endian(VirtIODevice *vdev) > { > #if defined(LEGACY_VIRTIO_IS_BIENDIAN) > diff --git a/include/hw/virtio/virtio-balloon.h > b/include/hw/virtio/virtio-balloon.h > index 1ea13bd..dfb5782 100644 > --- a/include/hw/virtio/virtio-balloon.h > +++ b/include/hw/virtio/virtio-balloon.h > @@ -33,7 +33,7 @@ typedef struct virtio_balloon_stat_modern { > > typedef struct VirtIOBalloon { > VirtIODevice parent_obj; > - VirtQueue *ivq, *dvq, *svq; > + VirtQueue *ivq, *dvq, *svq, *hvq; > uint32_t num_pages; > uint32_t actual; > uint64_t stats[VIRTIO_BALLOON_S_NR]; > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > index 9dd318a..033d64c 100644 > --- a/include/qemu/osdep.h > +++ b/include/qemu/osdep.h > @@ -278,6 +278,11 
@@ void qemu_anon_ram_free(void *ptr, size_t size); > #else > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > #endif > +#ifdef MADV_FREE > +#define QEMU_MADV_FREE MADV_FREE > +#else > +#define QEMU_MADV_FREE QEMU_MAD_INVALID > +#endif > > #elif defined(CONFIG_POSIX_MADVISE) > > @@ -291,6 +296,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > +#define QEMU_MADV_FREE QEMU_MAD_INVALID > > #else /* no-op */ > > @@ -304,6 +310,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); > #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID > #define QEMU_MADV_REMOVE QEMU_MADV_INVALID > +#define QEMU_MADV_FREE QEMU_MAD_INVALID > > #endif > > -- > 2.9.4 > >