Automatic ballooning consists of dynamically adjusting the guest's balloon according to memory pressure in the host and in the guest. This commit implements the host side of automatic ballooning, which basically consists of: 1. Registering with the memory.pressure_level API (from the Linux memory controller cgroup) for the MEDIUM pressure event. This is a new feature starting with Linux kernel 3.10. For more information on this, please check Documentation/cgroups/memory.txt in the Linux kernel sources. 2. On MEDIUM pressure event reception, QEMU asks the guest kernel to inflate the balloon by 16MB. 3. This is only done if the guest negotiates VIRTIO_BALLOON_F_AUTO_BALLOON, which means the guest's kernel virtio-balloon driver also supports automatic ballooning. Automatic deflate is performed by the guest. Here are some numbers. The test-case is to run 35 VMs (1G of RAM each) in parallel doing a kernel build. Host has 32GB of RAM and 16GB of swap. SWAP IN and SWAP OUT correspond to the number of pages swapped in and swapped out, respectively. Auto-ballooning disabled: RUN TIME(s) SWAP IN SWAP OUT 1 634 930980 1588522 2 610 627422 1362174 3 649 1079847 1616367 4 543 953289 1635379 5 642 913237 1514000 Auto-ballooning enabled: RUN TIME(s) SWAP IN SWAP OUT 1 629 901 12537 2 624 981 18506 3 626 573 9085 4 631 2250 42534 5 627 1610 20808 FIXMEs/TODOs: - Should we have a lower limit for guest memory? Otherwise it can reach 0 if too many events are received. - Or maybe we should rate-limit events? 
- It seems that events are being lost when too many of them are sent at the same time on a busy host - Allow this to be dynamically enabled by mngt Signed-off-by: Luiz Capitulino <lcapitulino@xxxxxxxxxx> --- o You can find my test script here: http://repo.or.cz/w/qemu/qmp-unstable.git/blob/refs/heads/balloon/auto-ballooning/memcg/rfc:/scripts/autob-test o You can find the guest driver counterpart code at: http://repo.or.cz/w/linux-2.6/luiz-linux-2.6.git/shortlog/refs/heads/virtio-balloon/auto-deflate/rfc o To play with automatic ballooning, do the following: 1. You'll need 3.9+ for the host kernel 2. Get the guest kernel bits from: git://repo.or.cz/linux-2.6/luiz-linux-2.6.git virtio-balloon/auto-deflate/rfc 3. Apply this patch to QEMU 4. Enable the balloon device in qemu with: -device virtio-balloon-pci,auto-balloon=true 5. Generate memory pressure in the host, or put QEMU in a memcg cgroup with limited memory. Watch the VM memory going down 6. Generate pressure in the guest to see it going up again (say, a kernel build with -j16) hw/virtio/virtio-balloon.c | 162 +++++++++++++++++++++++++++++++++++++ hw/virtio/virtio-pci.c | 5 ++ hw/virtio/virtio-pci.h | 1 + include/hw/virtio/virtio-balloon.h | 15 ++++ 4 files changed, 183 insertions(+) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index d669756..4b23360 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -31,6 +31,12 @@ #include "hw/virtio/virtio-bus.h" +void virtio_balloon_set_conf(DeviceState *dev, VirtIOBalloonConf *bconf) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(dev); + memcpy(&(s->bconf), bconf, sizeof(struct VirtIOBalloonConf)); +} + static void balloon_page(void *addr, int deflate) { #if defined(__linux__) @@ -279,9 +285,21 @@ static void virtio_balloon_set_config(VirtIODevice *vdev, } } +static bool auto_balloon_enabled(const VirtIOBalloon *s) +{ + return s->bconf.auto_balloon; +} + static uint32_t virtio_balloon_get_features(VirtIODevice *vdev, uint32_t f) { + 
VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + f |= (1 << VIRTIO_BALLOON_F_STATS_VQ); + + if (auto_balloon_enabled(s)) { + f |= (1 << VIRTIO_BALLOON_F_AUTO_BALLOON); + } + return f; } @@ -336,6 +354,141 @@ static int virtio_balloon_load(QEMUFile *f, void *opaque, int version_id) return 0; } +static int open_sysfile(const char *path, const char *file, mode_t mode) +{ + char *p; + int fd; + + p = g_strjoin("/", path, file, NULL); + fd = qemu_open(p, mode); + if (fd < 0) { + error_report("balloon: can't open '%s': %s", p, strerror(errno)); + } + + g_free(p); + return fd; +} + +static int write_fd(int fd, const char *fmt, ...) +{ + va_list ap; + char *str; + int ret; + + va_start(ap, fmt); + str = g_strdup_vprintf(fmt, ap); + va_end(ap); + + do { + ret = write(fd, str, strlen(str)); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + error_report("balloon: write failed: %s", strerror(errno)); + } + + g_free(str); + return ret; +} + +static bool guest_supports_auto_balloon(const VirtIOBalloon *s) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(s); + return vdev->guest_features & (1 << VIRTIO_BALLOON_F_AUTO_BALLOON); +} + +static int auto_balloon_ack_event(EventNotifier *ev) +{ + uint64_t res; + int ret, fd; + + fd = event_notifier_get_fd(ev); + + do { + ret = read(fd, &res, sizeof(res)); + } while (ret == -1 && errno == EINTR); + + return (ret < 0 ? 
ret : 0); +} + +#define AUTO_INFLATE_INCREASE (16 * 1024 * 1024) /* 16 MB */ + +static void auto_balloon_event_medium(EventNotifier *ev) +{ + VirtIOBalloon *s = container_of(ev, VirtIOBalloon, event); + int ret; + + ret = auto_balloon_ack_event(ev); + if (ret < 0) { + fprintf(stderr, "balloon: failied to ack memory pressure event\n"); + return; + } + + if (!guest_supports_auto_balloon(s)) { + fprintf(stderr, "balloon: warning: guest doesn't support auto-ballooning, skipping memory pressure event\n"); + return; + } + + s->num_pages = s->actual + + (AUTO_INFLATE_INCREASE >> VIRTIO_BALLOON_PFN_SHIFT); + virtio_notify_config(VIRTIO_DEVICE(s)); +} + +#define LINUX_MEMCG_PATH "/sys/fs/cgroup/memory" + +static int auto_balloon_init(VirtIOBalloon *s) +{ + const char *path; + int ret; + + path = s->bconf.auto_balloon_memcg_path; + if (!path) { + path = LINUX_MEMCG_PATH; + } + + s->lfd = open_sysfile(path, "memory.pressure_level", O_RDONLY); + if (s->lfd < 0) { + return -1; + } + + s->cfd = open_sysfile(path, "cgroup.event_control", O_WRONLY); + if (s->cfd < 0) { + close(s->lfd); + return -1; + } + + ret = event_notifier_init(&s->event, false); + if (ret < 0) { + error_report("failed to create notifier: %s", strerror(-ret)); + goto out_err; + } + + ret = write_fd(s->cfd, "%d %d medium", + event_notifier_get_fd(&s->event), s->lfd); + if (ret < 0) { + goto out_ev; + } + + event_notifier_set_handler(&s->event, auto_balloon_event_medium); + return 0; + +out_ev: + event_notifier_cleanup(&s->event); +out_err: + close(s->lfd); + close(s->cfd); + return -1; +} + +static void auto_balloon_cleanup(VirtIOBalloon *s) +{ + if (auto_balloon_enabled(s)) { + event_notifier_cleanup(&s->event); + close(s->lfd); + close(s->cfd); + } +} + static int virtio_balloon_device_init(VirtIODevice *vdev) { DeviceState *qdev = DEVICE(vdev); @@ -344,6 +497,14 @@ static int virtio_balloon_device_init(VirtIODevice *vdev) virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON, 8); + if 
(auto_balloon_enabled(s)) { + ret = auto_balloon_init(s); + if (ret < 0) { + virtio_cleanup(VIRTIO_DEVICE(s)); + return -1; + } + } + ret = qemu_add_balloon_handler(virtio_balloon_to_target, virtio_balloon_stat, s); @@ -374,6 +535,7 @@ static int virtio_balloon_device_exit(DeviceState *qdev) VirtIOBalloon *s = VIRTIO_BALLOON(qdev); VirtIODevice *vdev = VIRTIO_DEVICE(qdev); + auto_balloon_cleanup(s); balloon_stats_destroy_timer(s); qemu_remove_balloon_handler(s); unregister_savevm(qdev, "virtio-balloon", s); diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index ec0066b..378fe8d 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1246,6 +1246,10 @@ static void balloon_pci_stats_set_poll_interval(Object *obj, struct Visitor *v, static Property virtio_balloon_pci_properties[] = { DEFINE_VIRTIO_COMMON_FEATURES(VirtIOPCIProxy, host_features), DEFINE_PROP_HEX32("class", VirtIOPCIProxy, class_code, 0), +#ifdef __linux__ + DEFINE_PROP_BIT("auto-balloon", VirtIOBalloonPCI, bconf.auto_balloon, 0, false), + DEFINE_PROP_STRING("auto-balloon-memory-cgroup-path", VirtIOBalloonPCI, bconf.auto_balloon_memcg_path), +#endif DEFINE_PROP_END_OF_LIST(), }; @@ -1259,6 +1263,7 @@ static int virtio_balloon_pci_init(VirtIOPCIProxy *vpci_dev) vpci_dev->class_code = PCI_CLASS_OTHERS; } + virtio_balloon_set_conf(vdev, &(dev->bconf)); qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); if (qdev_init(vdev) < 0) { return -1; diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h index 917bcc5..eb46401 100644 --- a/hw/virtio/virtio-pci.h +++ b/hw/virtio/virtio-pci.h @@ -145,6 +145,7 @@ struct VirtIOBlkPCI { struct VirtIOBalloonPCI { VirtIOPCIProxy parent_obj; VirtIOBalloon vdev; + VirtIOBalloonConf bconf; }; /* diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h index f863bfe..1a0c255 100644 --- a/include/hw/virtio/virtio-balloon.h +++ b/include/hw/virtio/virtio-balloon.h @@ -30,10 +30,17 @@ /* The feature bitmap for virtio balloon 
*/ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory stats virtqueue */ +#define VIRTIO_BALLOON_F_AUTO_BALLOON 2 /* Automatic ballooning */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 +typedef struct VirtIOBalloonConf +{ + uint32_t auto_balloon; + char *auto_balloon_memcg_path; +} VirtIOBalloonConf; + struct virtio_balloon_config { /* Number of pages host wants Guest to give up. */ @@ -67,6 +74,14 @@ typedef struct VirtIOBalloon { QEMUTimer *stats_timer; int64_t stats_last_update; int64_t stats_poll_interval; + VirtIOBalloonConf bconf; + + /* auto-balloon */ + int cfd; + int lfd; + EventNotifier event; } VirtIOBalloon; +void virtio_balloon_set_conf(DeviceState *dev, VirtIOBalloonConf *bconf); + #endif -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html