qemu 2.9 patch to support kernel ring-based dirty memory tracking

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch adds support for kernel ring-based dirty memory tracking to qemu stable-2.9. 

Kernel support for ring-based dirty memory tracking currently lives in the 'dirty-ring-buffer'
branch of KVM, against which this patch was tested. The patch changes the live-migration
code to use ring-based rather than bitmap-based dirty memory tracking.

---
 hw/usb/bus.c                  |   3 +-
 include/migration/migration.h |   6 ++
 include/qom/cpu.h             |  18 ++++++
 include/sysemu/kvm.h          |  10 +++
 kvm-all.c                     | 138 ++++++++++++++++++++++++++++++++++++++++++
 linux-headers/linux/kvm.h     |   3 +
 migration/migration.c         |   6 +-
 migration/ram.c               | 102 ++++++++++++++++++++++++++++++-
 net/slirp.c                   |   9 ++-
 9 files changed, 288 insertions(+), 7 deletions(-)

diff --git a/hw/usb/bus.c b/hw/usb/bus.c
index 24f1608..50b6913 100644
--- a/hw/usb/bus.c
+++ b/hw/usb/bus.c
@@ -407,8 +407,9 @@ void usb_register_companion(const char *masterbus, USBPort *ports[],
 void usb_port_location(USBPort *downstream, USBPort *upstream, int portnr)
 {
     if (upstream) {
-        snprintf(downstream->path, sizeof(downstream->path), "%s.%d",
+        int l = snprintf(downstream->path, sizeof(downstream->path), "%s.%d",
                  upstream->path, portnr);
+	assert(l < sizeof(downstream->path));
         downstream->hubcount = upstream->hubcount + 1;
     } else {
         snprintf(downstream->path, sizeof(downstream->path), "%d", portnr);
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 5720c88..08d1b31 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -385,4 +385,10 @@ uint64_t ram_pagesize_summary(void);
 PostcopyState postcopy_state_get(void);
 /* Set the state and return the old state */
 PostcopyState postcopy_state_set(PostcopyState new_state);
+
+#ifdef KVM_CAP_DIRTY_LOG_RING
+int get_dirty_count_from_ring(void);
+int fetch_dirty_pages(bool);
+#endif
+
 #endif
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 5d10359..6839b5b 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -231,6 +231,24 @@ struct CPUWatchpoint {
 struct KVMState;
 struct kvm_run;
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+/*
+ * One entry in a kernel dirty ring: a (slot, offset) pair naming a guest
+ * page that was dirtied.  Layout must match the kernel's 'dirty-ring-buffer'
+ * KVM branch, which shares these rings with userspace via mmap.
+ */
+struct dirty_gfn {
+    uint32_t pad;
+    uint32_t slot;   /* address-space id in the top 16 bits, memslot id below */
+    uint64_t offset; /* page offset within the memslot */
+};
+
+/*
+ * Ring header shared with the kernel.  The index pair overlays the first
+ * entry slot: the kernel advances avail_index as it publishes dirty GFNs,
+ * userspace advances fetch_index as it consumes them.
+ */
+struct dirty_ring {
+    union {
+        struct {
+            uint16_t avail_index;
+            uint16_t fetch_index;
+        } indices;
+        struct dirty_gfn dirty_gfns[0]; /* slot/offset */
+    };
+};
+#endif
+
 struct hax_vcpu_state;
 
 #define TB_JMP_CACHE_BITS 12
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 24281fc..f283d7d 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -227,6 +227,16 @@ int kvm_init_vcpu(CPUState *cpu);
 int kvm_cpu_exec(CPUState *cpu);
 int kvm_destroy_vcpu(CPUState *cpu);
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+/* A dirtied guest page, resolved to its RAMBlock and offset within it. */
+struct dirty_page {
+    RAMBlock *block;
+    ram_addr_t offset;
+};
+/* Total number of published-but-unfetched dirty-ring entries, all rings. */
+int kvm_get_dirty_page_count(void);
+/*
+ * Drain up to @max entries from the per-vcpu and vm-wide dirty rings into
+ * @pages starting at index *@dirty_index; advances *@dirty_index by the
+ * number of entries written.
+ */
+void kvm_sync_dirty_page_from_ring(struct dirty_page *pages,
+                                   int *dirty_index, int max);
+#endif
+
 #ifdef NEED_CPU_H
 #include "cpu.h"
 
diff --git a/kvm-all.c b/kvm-all.c
index 90b8573..5ee57ef 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -62,6 +62,10 @@
 
 #define KVM_MSI_HASHTAB_SIZE    256
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+#define KVM_DIRTY_LOG_PAGE_NUM 16
+#endif
+
 struct KVMParkedVcpu {
     unsigned long vcpu_id;
     int kvm_fd;
@@ -102,6 +106,10 @@ struct KVMState
 #endif
     KVMMemoryListener memory_listener;
     QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
+#ifdef KVM_CAP_DIRTY_LOG_RING
+    struct dirty_ring *dirty_logs;
+    int max_dirty_logs;
+#endif
 };
 
 KVMState *kvm_state;
@@ -301,6 +309,99 @@ static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
     return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
 }
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+/*
+ * Count the dirty-ring entries the kernel has published but userspace has
+ * not yet fetched, summed over every vcpu ring plus the vm-wide ring.
+ * Used by the migration thread to estimate pending dirty memory.
+ */
+int kvm_get_dirty_page_count(void)
+{
+    CPUState *cpu;
+    struct dirty_ring *ring;
+    uint16_t avail, fetch;
+    int total = 0;
+
+    CPU_FOREACH(cpu) {
+        /* Each vcpu's ring is mapped 64 pages into the kvm_run mapping. */
+        ring = (void *)cpu->kvm_run + 64 * PAGE_SIZE;
+        avail = ring->indices.avail_index;
+        fetch = ring->indices.fetch_index;
+        /*
+         * The indices are free-running 16-bit counters (consumers mask with
+         * max_dirty_logs-1), so the pending count is their 16-bit
+         * difference.  The previous '(avail-fetch) % max_dirty_logs'
+         * reported 0 for a completely full ring and went wrong once the
+         * counters wrapped past the ring size.
+         */
+        total += (uint16_t)(avail - fetch);
+    }
+
+    ring = kvm_state->dirty_logs;
+    avail = ring->indices.avail_index;
+    fetch = ring->indices.fetch_index;
+    total += (uint16_t)(avail - fetch);
+
+    return total;
+}
+
+/*
+ * Drain dirty-GFN entries from every vcpu ring, then from the vm-wide
+ * ring, translating each (slot, offset) entry into a (RAMBlock, offset)
+ * pair stored at pages[*dirty_index ...].  Stops after @max entries.
+ * On return *dirty_index has been advanced by the number of entries
+ * written, and if anything was fetched the kernel is asked to reset
+ * (re-protect) the harvested pages via KVM_RESET_DIRTY_PAGES.
+ *
+ * NOTE(review): the per-vcpu loop and the vm-wide loop below are
+ * identical apart from where 'ring' points; a shared helper would
+ * remove the duplication.
+ */
+void kvm_sync_dirty_page_from_ring(struct dirty_page *pages,
+                                   int *dirty_index,
+                                   int max)
+{
+    CPUState *cpu;
+    int i, j;
+    struct dirty_ring *ring;
+    unsigned int avail;
+    void *host;
+    int as_id;
+    uint16_t slot_id;
+    int index = 0; /* entries written into 'pages' during this call */
+
+    CPU_FOREACH(cpu) {
+        /* Each vcpu's ring is mapped 64 pages into the kvm_run mapping. */
+        ring = (void *)cpu->kvm_run + 64 * PAGE_SIZE;
+        avail = ring->indices.avail_index;
+        while(ring->indices.fetch_index != avail) {
+            for (i = 0; i < kvm_state->nr_slots; i++) {
+                /* Ring entry index; the ring size is a power of two. */
+                j = ring->indices.fetch_index & (kvm_state->max_dirty_logs-1);
+                /* 'slot' packs the address-space id (high 16 bits) and the
+                 * memslot id (low 16 bits). */
+                as_id = ring->dirty_gfns[j].slot >> 16;
+                slot_id = (uint16_t)ring->dirty_gfns[j].slot;
+                if ((kvm_state->memory_listener.as_id == as_id) &&
+                    (kvm_state->memory_listener.slots[i].slot == slot_id)) {
+                    host = kvm_state->memory_listener.slots[i].ram + 
+                           (ring->dirty_gfns[j].offset << TARGET_PAGE_BITS);
+                    /* Resolve the host address back to its RAMBlock. */
+                    pages[*dirty_index+index].block = qemu_ram_block_from_host(host, 0,
+                        &pages[*dirty_index+index].offset);
+                    index++;
+                    if (index == max) {
+                        /* Output array full: consume this entry and stop. */
+                        ring->indices.fetch_index++;
+                        goto reset;
+                    }
+                }
+            }
+            /* NOTE(review): an entry whose slot matches no memslot is
+             * silently consumed here -- confirm that is intended. */
+            ring->indices.fetch_index++;
+        }
+    }
+
+    /* Same harvesting for the vm-wide ring mapped from the VM fd. */
+    ring = kvm_state->dirty_logs;
+    avail = ring->indices.avail_index;
+    while(ring->indices.fetch_index != avail) {
+        for (i = 0; i < kvm_state->nr_slots; i++) {
+            j = ring->indices.fetch_index & (kvm_state->max_dirty_logs-1);
+            as_id = ring->dirty_gfns[j].slot >> 16;
+            slot_id = (uint16_t)ring->dirty_gfns[j].slot;
+            if ((kvm_state->memory_listener.as_id == as_id) &&
+                (kvm_state->memory_listener.slots[i].slot == slot_id)) {
+                host = kvm_state->memory_listener.slots[i].ram +
+                       (ring->dirty_gfns[j].offset << TARGET_PAGE_BITS);
+                pages[*dirty_index+index].block = qemu_ram_block_from_host(host, 0,
+                    &pages[*dirty_index+index].offset);
+                index++;
+                if (index == max) {
+                    ring->indices.fetch_index++;
+                    goto reset;
+                }
+            }
+        }
+        ring->indices.fetch_index++;
+    }
+
+reset:
+    /* Ask the kernel to re-protect everything harvested above. */
+    if (index) {
+        kvm_vm_ioctl(kvm_state, KVM_RESET_DIRTY_PAGES);
+    }
+
+    *dirty_index += index;
+}
+#endif
+
 int kvm_init_vcpu(CPUState *cpu)
 {
     KVMState *s = kvm_state;
@@ -326,8 +427,13 @@ int kvm_init_vcpu(CPUState *cpu)
         goto err;
     }
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+    cpu->kvm_run = mmap(NULL, (64+KVM_DIRTY_LOG_PAGE_NUM)*PAGE_SIZE,
+                        PROT_READ | PROT_WRITE, MAP_SHARED, cpu->kvm_fd, 0);
+#else
     cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                         cpu->kvm_fd, 0);
+#endif
     if (cpu->kvm_run == MAP_FAILED) {
         ret = -errno;
         DPRINTF("mmap'ing vcpu state failed\n");
@@ -1581,6 +1687,9 @@ static int kvm_init(MachineState *ms)
     int ret;
     int type = 0;
     const char *kvm_type;
+#ifdef KVM_CAP_DIRTY_LOG_RING
+    int size = KVM_DIRTY_LOG_PAGE_NUM*PAGE_SIZE;
+#endif
 
     s = KVM_STATE(ms->accelerator);
 
@@ -1684,6 +1793,24 @@ static int kvm_init(MachineState *ms)
     }
 
     s->vmfd = ret;
+
+#ifdef KVM_CAP_DIRTY_LOG_RING
+    ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, size);
+    if (ret < 0) {
+        fprintf(stderr, "ioctl KVM_CAP_DIRTY_LOG_RING %d: failed: %d %s\n", size, -ret,
+                strerror(-ret));
+        goto err;
+    }
+    s->max_dirty_logs = size/sizeof(struct dirty_gfn);
+    s->dirty_logs =  mmap(NULL, KVM_DIRTY_LOG_PAGE_NUM*PAGE_SIZE,
+                          PROT_READ | PROT_WRITE, MAP_SHARED, s->vmfd, 0);
+    if (s->dirty_logs == MAP_FAILED) {
+        ret = -errno;
+        fprintf(stderr, "mmap'ing vm dirty logs failed\n");
+        goto err;
+    }
+#endif
+
     missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
     if (!missing_cap) {
         missing_cap =
@@ -1951,6 +2078,8 @@ static void kvm_eat_signals(CPUState *cpu)
     } while (sigismember(&chkset, SIG_IPI));
 }
 
+extern int fetch_dirty_pages(bool);
+
 int kvm_cpu_exec(CPUState *cpu)
 {
     struct kvm_run *run = cpu->kvm_run;
@@ -2005,6 +2134,15 @@ int kvm_cpu_exec(CPUState *cpu)
 
         if (run_ret < 0) {
             if (run_ret == -EINTR || run_ret == -EAGAIN) {
+#ifdef KVM_CAP_DIRTY_LOG_RING
+                if ((run_ret == -EINTR) && (run->exit_reason == KVM_EXIT_DIRTY_LOG_FULL)) {
+                    //error_report("kvm dirty log full\n");
+	            ret = fetch_dirty_pages(true);
+		    kvm_eat_signals(cpu);
+                    continue;
+		}
+#endif
+
                 DPRINTF("io window exit\n");
                 kvm_eat_signals(cpu);
                 ret = EXCP_INTERRUPT;
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 4e082a8..0546d0a 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -205,6 +205,7 @@ struct kvm_hyperv_exit {
 #define KVM_EXIT_S390_STSI        25
 #define KVM_EXIT_IOAPIC_EOI       26
 #define KVM_EXIT_HYPERV           27
+#define KVM_EXIT_DIRTY_LOG_FULL   28
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -883,6 +884,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_MMU_RADIX 134
 #define KVM_CAP_PPC_MMU_HASH_V3 135
 #define KVM_CAP_IMMEDIATE_EXIT 136
+#define KVM_CAP_DIRTY_LOG_RING 151
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1300,6 +1302,7 @@ struct kvm_s390_ucas_mapping {
 #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
 /* Available with KVM_CAP_X86_SMM */
 #define KVM_SMI                   _IO(KVMIO,   0xb7)
+#define KVM_RESET_DIRTY_PAGES _IO(KVMIO, 0xba)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
diff --git a/migration/migration.c b/migration/migration.c
index ad4036f..63b8a0f 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -17,6 +17,7 @@
 #include "qemu/cutils.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
+#include "linux/kvm.h"
 #include "migration/migration.h"
 #include "migration/qemu-file.h"
 #include "sysemu/sysemu.h"
@@ -1959,10 +1960,13 @@ static void *migration_thread(void *opaque)
 
         if (!qemu_file_rate_limit(s->to_dst_file)) {
             uint64_t pend_post, pend_nonpost;
-
             qemu_savevm_state_pending(s->to_dst_file, max_size, &pend_nonpost,
                                       &pend_post);
+#ifdef KVM_CAP_DIRTY_LOG_RING
+            pending_size = pend_nonpost + pend_post + get_dirty_count_from_ring();
+#else
             pending_size = pend_nonpost + pend_post;
+#endif
             trace_migrate_pending(pending_size, max_size,
                                   pend_post, pend_nonpost);
             if (pending_size && pending_size >= max_size) {
diff --git a/migration/ram.c b/migration/ram.c
index de1e0a3..05b1dff 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -35,6 +35,7 @@
 #include "qemu/bitmap.h"
 #include "qemu/timer.h"
 #include "qemu/main-loop.h"
+#include "linux/kvm.h"
 #include "migration/migration.h"
 #include "migration/postcopy-ram.h"
 #include "exec/address-spaces.h"
@@ -44,6 +45,7 @@
 #include "exec/ram_addr.h"
 #include "qemu/rcu_queue.h"
 #include "migration/colo.h"
+#include "sysemu/kvm.h"
 
 static int dirty_rate_high_cnt;
 
@@ -230,6 +232,8 @@ struct PageSearchStatus {
     ram_addr_t   offset;
     /* Set once we wrap around */
     bool         complete_round;
+    /* page is from dirty ring */
+    bool         from_ring;
 };
 typedef struct PageSearchStatus PageSearchStatus;
 
@@ -1055,6 +1059,70 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
     }
 }
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+/* Capacity of the staging array, in entries.  Parenthesized so the macro
+ * expands safely inside arithmetic expressions. */
+#define NUM_DIRTY_PAGES (1024 * 1024)
+/* Staging array filled from the kernel dirty rings; guarded by pages_mutex. */
+struct dirty_page *pages;
+static pthread_mutex_t pages_mutex = PTHREAD_MUTEX_INITIALIZER;
+int dirty_index = 0;    /* number of valid entries in 'pages' */
+int fetch_index = 0;    /* next entry to hand out to the migration loop */
+
+/*
+ * Drain the kernel dirty rings into the global 'pages' staging array.
+ *
+ * @lock: take pages_mutex around the update (pass false when the caller
+ *        already holds it, as find_dirty_block_from_ring() does).
+ *
+ * Returns 0 on success, -ENOMEM if the staging array cannot be allocated.
+ */
+int fetch_dirty_pages(bool lock)
+{
+    int ret = 0;
+
+    if (lock) {
+        pthread_mutex_lock(&pages_mutex);
+    }
+    if (!pages) {
+        pages = malloc(sizeof(struct dirty_page) * NUM_DIRTY_PAGES);
+    }
+
+    if (!pages) {
+        /* The allocation was previously unchecked and a failure led to a
+         * NULL dereference in kvm_sync_dirty_page_from_ring(). */
+        error_report("failed to allocate dirty page staging array");
+        ret = -ENOMEM;
+    } else if (dirty_index == NUM_DIRTY_PAGES) {
+        /* No room left; the rings will be drained on a later call.
+         * (error_report() appends its own newline.) */
+        error_report("dirty ring full");
+    } else {
+        kvm_sync_dirty_page_from_ring(pages, &dirty_index,
+                                      NUM_DIRTY_PAGES - dirty_index);
+    }
+
+    if (lock) {
+        pthread_mutex_unlock(&pages_mutex);
+    }
+
+    return ret;
+}
+
+/*
+ * Pull one dirty page out of the ring-fed staging array and point @pss
+ * at it.  Returns true when a page was found.  *@again is cleared only
+ * when the array is empty and @last (final migration pass) is set, which
+ * lets the caller's do/while loop terminate.  @f is unused here but kept
+ * so the signature mirrors find_dirty_block().
+ */
+static bool find_dirty_block_from_ring(QEMUFile *f, PageSearchStatus *pss,
+                             bool *again, ram_addr_t *ram_addr_abs, bool last)
+{
+    bool found = false;
+
+    *again = true;
+
+    /* We hold pages_mutex ourselves, so tell fetch_dirty_pages() not to. */
+    pthread_mutex_lock(&pages_mutex);
+    fetch_dirty_pages(false);
+    if (!dirty_index) {
+        if (last)
+            *again = false;
+        goto out;
+    }
+
+    /* Hand out the next staged entry, if any remain unconsumed. */
+    if (fetch_index < dirty_index) {
+        pss->block = pages[fetch_index].block;
+        pss->offset = pages[fetch_index].offset;
+        *ram_addr_abs = pss->block->offset + pss->offset;
+        found = true;
+        fetch_index++;
+    }
+
+    /* Everything consumed: rewind so the array refills from index 0. */
+    if (dirty_index == fetch_index) {
+        dirty_index = 0;
+        fetch_index = 0;
+    }
+
+out:
+    pthread_mutex_unlock(&pages_mutex);
+    return found;
+}
+
+/*
+ * Report the pending dirty-ring backlog as a byte count, for the
+ * migration thread's pending-size estimate.
+ */
+int get_dirty_count_from_ring(void)
+{
+    int npages = kvm_get_dirty_page_count();
+
+    return npages * TARGET_PAGE_SIZE;
+}
+#endif
+
 /*
  * Helper for 'get_queued_page' - gets a page off the queue
  *      ms:      MigrationState in
@@ -1264,7 +1332,7 @@ static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
     int res = 0;
 
     /* Check the pages is dirty and if it is send it */
-    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
+    if (pss->from_ring || migration_bitmap_clear_dirty(dirty_ram_abs)) {
         unsigned long *unsentmap;
         if (compression_switch && migrate_use_compression()) {
             res = ram_save_compressed_page(ms, f, pss,
@@ -1364,6 +1432,10 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
     bool again, found;
     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
                                  ram_addr_t space */
+#ifdef KVM_CAP_DIRTY_LOG_RING
+    PageSearchStatus pss_ring;
+#endif
+    PageSearchStatus *ptr = NULL;
 
     /* No dirty page as there is zero RAM */
     if (!ram_bytes_total()) {
@@ -1373,6 +1445,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
     pss.block = last_seen_block;
     pss.offset = last_offset;
     pss.complete_round = false;
+    pss.from_ring = false;
 
     if (!pss.block) {
         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
@@ -1381,14 +1454,32 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
     do {
         again = true;
         found = get_queued_page(ms, &pss, &dirty_ram_abs);
+        if (found)
+            ptr = &pss;
 
         if (!found) {
             /* priority queue empty, so just search for something dirty */
             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
+            if (found) {
+                ptr = &pss;
+            }
         }
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+        if (!found && !again) {
+            pss_ring.block = NULL;
+            pss_ring.offset = 0;
+            pss_ring.complete_round = false;
+            pss_ring.from_ring = true;
+            found = find_dirty_block_from_ring(f, &pss_ring, &again, &dirty_ram_abs, last_stage);
+            if (found) {
+                ptr = &pss_ring;
+            }
+        }
+#endif
+
         if (found) {
-            pages = ram_save_host_page(ms, f, &pss,
+            pages = ram_save_host_page(ms, f, ptr,
                                        last_stage, bytes_transferred,
                                        dirty_ram_abs);
         }
@@ -1415,6 +1506,7 @@ void acct_update_position(QEMUFile *f, size_t size, bool zero)
 static ram_addr_t ram_save_remaining(void)
 {
     return migration_dirty_pages;
+
 }
 
 uint64_t ram_bytes_remaining(void)
@@ -1979,7 +2071,9 @@ static int ram_save_init_globals(void)
     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
 
     memory_global_dirty_log_start();
+#ifndef KVM_CAP_DIRTY_LOG_RING
     migration_bitmap_sync();
+#endif
     qemu_mutex_unlock_ramlist();
     qemu_mutex_unlock_iothread();
     rcu_read_unlock();
@@ -2097,7 +2191,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
     rcu_read_lock();
 
     if (!migration_in_postcopy(migrate_get_current())) {
+#ifndef KVM_CAP_DIRTY_LOG_RING
         migration_bitmap_sync();
+#endif
     }
 
     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
@@ -2136,11 +2232,13 @@ static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
 
     if (!migration_in_postcopy(migrate_get_current()) &&
         remaining_size < max_size) {
+#ifndef KVM_CAP_DIRTY_LOG_RING
         qemu_mutex_lock_iothread();
         rcu_read_lock();
         migration_bitmap_sync();
         rcu_read_unlock();
         qemu_mutex_unlock_iothread();
+#endif
         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
     }
 
diff --git a/net/slirp.c b/net/slirp.c
index f97ec23..4471af3 100644
--- a/net/slirp.c
+++ b/net/slirp.c
@@ -562,7 +562,8 @@ static void slirp_smb_cleanup(SlirpState *s)
     int ret;
 
     if (s->smb_dir[0] != '\0') {
-        snprintf(cmd, sizeof(cmd), "rm -rf %s", s->smb_dir);
+        int l = snprintf(cmd, sizeof(cmd), "rm -rf %s", s->smb_dir);
+	assert(l < 128);
         ret = system(cmd);
         if (ret == -1 || !WIFEXITED(ret)) {
             error_report("'%s' failed.", cmd);
@@ -606,7 +607,8 @@ static int slirp_smb(SlirpState* s, const char *exported_dir,
         s->smb_dir[0] = 0;
         return -1;
     }
-    snprintf(smb_conf, sizeof(smb_conf), "%s/%s", s->smb_dir, "smb.conf");
+    int l = snprintf(smb_conf, sizeof(smb_conf), "%s/%s", s->smb_dir, "smb.conf");
+    assert(l < 128);
 
     f = fopen(smb_conf, "w");
     if (!f) {
@@ -651,8 +653,9 @@ static int slirp_smb(SlirpState* s, const char *exported_dir,
             );
     fclose(f);
 
-    snprintf(smb_cmdline, sizeof(smb_cmdline), "%s -l %s -s %s",
+    l = snprintf(smb_cmdline, sizeof(smb_cmdline), "%s -l %s -s %s",
              CONFIG_SMBD_COMMAND, s->smb_dir, smb_conf);
+    assert(l < 128);
 
     if (slirp_add_exec(s->slirp, 0, smb_cmdline, &vserver_addr, 139) < 0 ||
         slirp_add_exec(s->slirp, 0, smb_cmdline, &vserver_addr, 445) < 0) {
-- 
1.8.3.4








[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux