This patch adds support for kernel ring-based dirty memory tracking to
QEMU stable-2.9. Kernel support for ring-based dirty memory tracking is
currently in the 'dirty-ring-buffer' branch of KVM, against which this
patch was tested. The patch modifies the live migration code to use
ring-based instead of bitmap-based dirty memory tracking.

All new code is compiled only when KVM_CAP_DIRTY_LOG_RING is defined,
which the updated linux-headers/linux/kvm.h in this patch does.

How it works:

  * kvm_init() enables KVM_CAP_DIRTY_LOG_RING with a ring size of
    KVM_DIRTY_LOG_PAGE_NUM pages and mmaps a VM-wide ring from the VM fd.
    kvm_init_vcpu() enlarges the vCPU mmap so that the per-vCPU ring,
    which is read at page offset 64 of that mapping, is covered.

  * kvm_sync_dirty_page_from_ring() walks every per-vCPU ring and then
    the VM-wide ring, translates each (slot, offset) entry into a
    (RAMBlock, offset) pair, advances fetch_index and, if anything was
    harvested, issues KVM_RESET_DIRTY_PAGES.

  * When KVM_RUN returns -EINTR with exit reason KVM_EXIT_DIRTY_LOG_FULL,
    the vCPU thread calls fetch_dirty_pages() to drain the rings into an
    array shared with the migration code; the array is protected by
    pages_mutex.

  * ram_find_and_save_block() falls back to that array once the bitmap
    search reports nothing left, and ram_save_target_page() sends pages
    that came from the ring without consulting the migration bitmap.
    The migration_bitmap_sync() calls are compiled out.

  * migration_thread() adds the number of not-yet-fetched ring entries
    (in bytes) to the pending size.

Unrelated to dirty tracking, the patch also asserts that the snprintf()
results in hw/usb/bus.c and net/slirp.c were not truncated.
---
 hw/usb/bus.c                  |   3 +-
 include/migration/migration.h |   6 ++
 include/qom/cpu.h             |  18 ++++++
 include/sysemu/kvm.h          |  10 +++
 kvm-all.c                     | 138 ++++++++++++++++++++++++++++++++++++++++++
 linux-headers/linux/kvm.h     |   3 +
 migration/migration.c         |   6 +-
 migration/ram.c               | 102 ++++++++++++++++++++++++++++++-
 net/slirp.c                   |   9 ++-
 9 files changed, 288 insertions(+), 7 deletions(-)

diff --git a/hw/usb/bus.c b/hw/usb/bus.c
index 24f1608..50b6913 100644
--- a/hw/usb/bus.c
+++ b/hw/usb/bus.c
@@ -407,8 +407,9 @@ void usb_register_companion(const char *masterbus, USBPort *ports[],
 void usb_port_location(USBPort *downstream, USBPort *upstream, int portnr)
 {
     if (upstream) {
-        snprintf(downstream->path, sizeof(downstream->path), "%s.%d",
+        int l = snprintf(downstream->path, sizeof(downstream->path), "%s.%d",
                  upstream->path, portnr);
+        assert(l < sizeof(downstream->path));
         downstream->hubcount = upstream->hubcount + 1;
     } else {
         snprintf(downstream->path, sizeof(downstream->path), "%d", portnr);
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 5720c88..08d1b31 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -385,4 +385,10 @@ uint64_t ram_pagesize_summary(void);
 PostcopyState postcopy_state_get(void);
 /* Set the state and return the old state */
 PostcopyState postcopy_state_set(PostcopyState new_state);
+
+#ifdef KVM_CAP_DIRTY_LOG_RING
+int get_dirty_count_from_ring(void);
+int fetch_dirty_pages(bool);
+#endif
+
 #endif
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 5d10359..6839b5b 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -231,6 +231,24 @@ struct CPUWatchpoint {
 struct KVMState;
 struct kvm_run;
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+struct dirty_gfn {
+    uint32_t pad;
+    uint32_t slot;
+    uint64_t offset;
+};
+
+struct dirty_ring {
+    union {
+        struct {
+            uint16_t avail_index;
+            uint16_t fetch_index;
+        } indices;
+        struct dirty_gfn dirty_gfns[0]; /* slot/offset */
+    };
+};
+#endif
+
 struct hax_vcpu_state;
 
 #define TB_JMP_CACHE_BITS 12
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 24281fc..f283d7d 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -227,6 +227,16 @@ int kvm_init_vcpu(CPUState *cpu);
 int kvm_cpu_exec(CPUState *cpu);
 int kvm_destroy_vcpu(CPUState *cpu);
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+struct dirty_page {
+    RAMBlock *block;
+    ram_addr_t offset;
+};
+int kvm_get_dirty_page_count(void);
+void kvm_sync_dirty_page_from_ring(struct dirty_page *pages,
+                                   int *dirty_index, int max);
+#endif
+
 #ifdef NEED_CPU_H
 #include "cpu.h"
 
diff --git a/kvm-all.c b/kvm-all.c
index 90b8573..5ee57ef 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -62,6 +62,10 @@
 
 #define KVM_MSI_HASHTAB_SIZE 256
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+#define KVM_DIRTY_LOG_PAGE_NUM 16
+#endif
+
 struct KVMParkedVcpu {
     unsigned long vcpu_id;
     int kvm_fd;
@@ -102,6 +106,10 @@ struct KVMState
 #endif
     KVMMemoryListener memory_listener;
     QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
+#ifdef KVM_CAP_DIRTY_LOG_RING
+    struct dirty_ring *dirty_logs;
+    int max_dirty_logs;
+#endif
 };
 
 KVMState *kvm_state;
@@ -301,6 +309,99 @@ static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
     return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
 }
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+int kvm_get_dirty_page_count(void)
+{
+    CPUState *cpu;
+    struct dirty_ring *ring;
+    unsigned int avail, fetch;
+    int total = 0;
+
+    CPU_FOREACH(cpu) {
+        ring = (void *)cpu->kvm_run + 64 * PAGE_SIZE;
+        avail = ring->indices.avail_index;
+        fetch = ring->indices.fetch_index;
+        total += (avail-fetch)%kvm_state->max_dirty_logs;
+    }
+
+    ring = kvm_state->dirty_logs;
+    avail = ring->indices.avail_index;
+    fetch = ring->indices.fetch_index;
+    total += (avail-fetch)%kvm_state->max_dirty_logs;
+
+    return total;
+}
+
+void kvm_sync_dirty_page_from_ring(struct dirty_page *pages,
+                                   int *dirty_index,
+                                   int max)
+{
+    CPUState *cpu;
+    int i, j;
+    struct dirty_ring *ring;
+    unsigned int avail;
+    void *host;
+    int as_id;
+    uint16_t slot_id;
+    int index = 0;
+
+    CPU_FOREACH(cpu) {
+        ring = (void *)cpu->kvm_run + 64 * PAGE_SIZE;
+        avail = ring->indices.avail_index;
+        while (ring->indices.fetch_index != avail) {
+            for (i = 0; i < kvm_state->nr_slots; i++) {
+                j = ring->indices.fetch_index & (kvm_state->max_dirty_logs-1);
+                as_id = ring->dirty_gfns[j].slot >> 16;
+                slot_id = (uint16_t)ring->dirty_gfns[j].slot;
+                if ((kvm_state->memory_listener.as_id == as_id) &&
+                    (kvm_state->memory_listener.slots[i].slot == slot_id)) {
+                    host = kvm_state->memory_listener.slots[i].ram +
+                        (ring->dirty_gfns[j].offset << TARGET_PAGE_BITS);
+                    pages[*dirty_index+index].block = qemu_ram_block_from_host(host, 0,
+                        &pages[*dirty_index+index].offset);
+                    index++;
+                    if (index == max) {
+                        ring->indices.fetch_index++;
+                        goto reset;
+                    }
+                }
+            }
+            ring->indices.fetch_index++;
+        }
+    }
+
+    ring = kvm_state->dirty_logs;
+    avail = ring->indices.avail_index;
+    while (ring->indices.fetch_index != avail) {
+        for (i = 0; i < kvm_state->nr_slots; i++) {
+            j = ring->indices.fetch_index & (kvm_state->max_dirty_logs-1);
+            as_id = ring->dirty_gfns[j].slot >> 16;
+            slot_id = (uint16_t)ring->dirty_gfns[j].slot;
+            if ((kvm_state->memory_listener.as_id == as_id) &&
+                (kvm_state->memory_listener.slots[i].slot == slot_id)) {
+                host = kvm_state->memory_listener.slots[i].ram +
+                    (ring->dirty_gfns[j].offset << TARGET_PAGE_BITS);
+                pages[*dirty_index+index].block = qemu_ram_block_from_host(host, 0,
+                    &pages[*dirty_index+index].offset);
+                index++;
+                if (index == max) {
+                    ring->indices.fetch_index++;
+                    goto reset;
+                }
+            }
+        }
+        ring->indices.fetch_index++;
+    }
+
+reset:
+    if (index) {
+        kvm_vm_ioctl(kvm_state, KVM_RESET_DIRTY_PAGES);
+    }
+
+    *dirty_index += index;
+}
+#endif
+
 int kvm_init_vcpu(CPUState *cpu)
 {
     KVMState *s = kvm_state;
@@ -326,8 +427,13 @@ int kvm_init_vcpu(CPUState *cpu)
         goto err;
     }
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+    cpu->kvm_run = mmap(NULL, (64+KVM_DIRTY_LOG_PAGE_NUM)*PAGE_SIZE,
+                        PROT_READ | PROT_WRITE, MAP_SHARED, cpu->kvm_fd, 0);
+#else
     cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                         cpu->kvm_fd, 0);
+#endif
     if (cpu->kvm_run == MAP_FAILED) {
         ret = -errno;
         DPRINTF("mmap'ing vcpu state failed\n");
@@ -1581,6 +1687,9 @@ static int kvm_init(MachineState *ms)
     int ret;
     int type = 0;
     const char *kvm_type;
+#ifdef KVM_CAP_DIRTY_LOG_RING
+    int size = KVM_DIRTY_LOG_PAGE_NUM*PAGE_SIZE;
+#endif
 
     s = KVM_STATE(ms->accelerator);
 
@@ -1684,6 +1793,24 @@ static int kvm_init(MachineState *ms)
     }
 
     s->vmfd = ret;
+
+#ifdef KVM_CAP_DIRTY_LOG_RING
+    ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, size);
+    if (ret < 0) {
+        fprintf(stderr, "ioctl KVM_CAP_DIRTY_LOG_RING %d: failed: %d %s\n", size, -ret,
+                strerror(-ret));
+        goto err;
+    }
+    s->max_dirty_logs = size/sizeof(struct dirty_gfn);
+    s->dirty_logs = mmap(NULL, KVM_DIRTY_LOG_PAGE_NUM*PAGE_SIZE,
+                         PROT_READ | PROT_WRITE, MAP_SHARED, s->vmfd, 0);
+    if (s->dirty_logs == MAP_FAILED) {
+        ret = -errno;
+        fprintf(stderr, "mmap'ing vm dirty logs failed\n");
+        goto err;
+    }
+#endif
+
     missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
     if (!missing_cap) {
         missing_cap =
@@ -1951,6 +2078,8 @@ static void kvm_eat_signals(CPUState *cpu)
     } while (sigismember(&chkset, SIG_IPI));
 }
 
+extern int fetch_dirty_pages(bool);
+
 int kvm_cpu_exec(CPUState *cpu)
 {
     struct kvm_run *run = cpu->kvm_run;
@@ -2005,6 +2134,15 @@ int kvm_cpu_exec(CPUState *cpu)
 
         if (run_ret < 0) {
             if (run_ret == -EINTR || run_ret == -EAGAIN) {
+#ifdef KVM_CAP_DIRTY_LOG_RING
+                if ((run_ret == -EINTR) && (run->exit_reason == KVM_EXIT_DIRTY_LOG_FULL)) {
+                    /* Dirty ring is full: drain it before resuming the run loop. */
+                    ret = fetch_dirty_pages(true);
+                    kvm_eat_signals(cpu);
+                    continue;
+                }
+#endif
+
                 DPRINTF("io window exit\n");
                 kvm_eat_signals(cpu);
                 ret = EXCP_INTERRUPT;
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 4e082a8..0546d0a 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -205,6 +205,7 @@ struct kvm_hyperv_exit {
 #define KVM_EXIT_S390_STSI 25
 #define KVM_EXIT_IOAPIC_EOI 26
 #define KVM_EXIT_HYPERV 27
+#define KVM_EXIT_DIRTY_LOG_FULL 28
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -883,6 +884,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_MMU_RADIX 134
 #define KVM_CAP_PPC_MMU_HASH_V3 135
 #define KVM_CAP_IMMEDIATE_EXIT 136
+#define KVM_CAP_DIRTY_LOG_RING 151
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1300,6 +1302,7 @@ struct kvm_s390_ucas_mapping {
 #define KVM_S390_GET_IRQ_STATE _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
 /* Available with KVM_CAP_X86_SMM */
 #define KVM_SMI _IO(KVMIO, 0xb7)
+#define KVM_RESET_DIRTY_PAGES _IO(KVMIO, 0xba)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
diff --git a/migration/migration.c b/migration/migration.c
index ad4036f..63b8a0f 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -17,6 +17,7 @@
 #include "qemu/cutils.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
+#include "linux/kvm.h"
 #include "migration/migration.h"
 #include "migration/qemu-file.h"
 #include "sysemu/sysemu.h"
@@ -1959,10 +1960,13 @@ static void *migration_thread(void *opaque)
 
         if (!qemu_file_rate_limit(s->to_dst_file)) {
             uint64_t pend_post, pend_nonpost;
-
             qemu_savevm_state_pending(s->to_dst_file, max_size, &pend_nonpost,
                                       &pend_post);
+#ifdef KVM_CAP_DIRTY_LOG_RING
+            pending_size = pend_nonpost + pend_post + get_dirty_count_from_ring();
+#else
             pending_size = pend_nonpost + pend_post;
+#endif
             trace_migrate_pending(pending_size, max_size,
                                   pend_post, pend_nonpost);
             if (pending_size && pending_size >= max_size) {
diff --git a/migration/ram.c b/migration/ram.c
index de1e0a3..05b1dff 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -35,6 +35,7 @@
 #include "qemu/bitmap.h"
 #include "qemu/timer.h"
 #include "qemu/main-loop.h"
+#include "linux/kvm.h"
 #include "migration/migration.h"
 #include "migration/postcopy-ram.h"
 #include "exec/address-spaces.h"
@@ -44,6 +45,7 @@
 #include "exec/ram_addr.h"
 #include "qemu/rcu_queue.h"
 #include "migration/colo.h"
+#include "sysemu/kvm.h"
 
 static int dirty_rate_high_cnt;
 
@@ -230,6 +232,8 @@ struct PageSearchStatus {
     ram_addr_t offset;
     /* Set once we wrap around */
     bool complete_round;
+    /* page is from dirty ring */
+    bool from_ring;
 };
 typedef struct PageSearchStatus PageSearchStatus;
 
@@ -1055,6 +1059,70 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
     }
 }
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+#define NUM_DIRTY_PAGES (1024 * 1024)
+static struct dirty_page *pages;
+static pthread_mutex_t pages_mutex = PTHREAD_MUTEX_INITIALIZER;
+static int dirty_index = 0;
+static int fetch_index = 0;
+
+int fetch_dirty_pages(bool lock)
+{
+    if (lock)
+        pthread_mutex_lock(&pages_mutex);
+    if (!pages)
+        pages = g_new(struct dirty_page, NUM_DIRTY_PAGES);
+
+    if (!(NUM_DIRTY_PAGES-dirty_index))
+        error_report("dirty ring full");
+    else
+        kvm_sync_dirty_page_from_ring(pages, &dirty_index, NUM_DIRTY_PAGES-dirty_index);
+
+    if (lock)
+        pthread_mutex_unlock(&pages_mutex);
+
+    return 0;
+}
+
+static bool find_dirty_block_from_ring(QEMUFile *f, PageSearchStatus *pss,
+                                       bool *again, ram_addr_t *ram_addr_abs, bool last)
+{
+    bool found = false;
+
+    *again = true;
+
+    pthread_mutex_lock(&pages_mutex);
+    fetch_dirty_pages(false);
+    if (!dirty_index) {
+        if (last)
+            *again = false;
+        goto out;
+    }
+
+    if (fetch_index < dirty_index) {
+        pss->block = pages[fetch_index].block;
+        pss->offset = pages[fetch_index].offset;
+        *ram_addr_abs = pss->block->offset + pss->offset;
+        found = true;
+        fetch_index++;
+    }
+
+    if (dirty_index == fetch_index) {
+        dirty_index = 0;
+        fetch_index = 0;
+    }
+
+out:
+    pthread_mutex_unlock(&pages_mutex);
+    return found;
+}
+
+int get_dirty_count_from_ring(void)
+{
+    return kvm_get_dirty_page_count() * TARGET_PAGE_SIZE;
+}
+#endif
+
 /*
  * Helper for 'get_queued_page' - gets a page off the queue
  * ms: MigrationState in
@@ -1264,7 +1332,7 @@ static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
     int res = 0;
 
     /* Check the pages is dirty and if it is send it */
-    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
+    if (pss->from_ring || migration_bitmap_clear_dirty(dirty_ram_abs)) {
         unsigned long *unsentmap;
         if (compression_switch && migrate_use_compression()) {
             res = ram_save_compressed_page(ms, f, pss,
@@ -1364,6 +1432,10 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
     bool again, found;
     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
                                  ram_addr_t space */
+#ifdef KVM_CAP_DIRTY_LOG_RING
+    PageSearchStatus pss_ring;
+#endif
+    PageSearchStatus *ptr = NULL;
 
     /* No dirty page as there is zero RAM */
     if (!ram_bytes_total()) {
@@ -1373,6 +1445,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
     pss.block = last_seen_block;
     pss.offset = last_offset;
     pss.complete_round = false;
+    pss.from_ring = false;
 
     if (!pss.block) {
         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
@@ -1381,14 +1454,32 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
     do {
         again = true;
         found = get_queued_page(ms, &pss, &dirty_ram_abs);
+        if (found)
+            ptr = &pss;
 
         if (!found) {
             /* priority queue empty, so just search for something dirty */
             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
+            if (found) {
+                ptr = &pss;
+            }
         }
 
+#ifdef KVM_CAP_DIRTY_LOG_RING
+        if (!found && !again) {
+            pss_ring.block = NULL;
+            pss_ring.offset = 0;
+            pss_ring.complete_round = false;
+            pss_ring.from_ring = true;
+            found = find_dirty_block_from_ring(f, &pss_ring, &again, &dirty_ram_abs, last_stage);
+            if (found) {
+                ptr = &pss_ring;
+            }
+        }
+#endif
+
         if (found) {
-            pages = ram_save_host_page(ms, f, &pss,
+            pages = ram_save_host_page(ms, f, ptr,
                                        last_stage, bytes_transferred,
                                        dirty_ram_abs);
         }
@@ -1415,6 +1506,7 @@ void acct_update_position(QEMUFile *f, size_t size, bool zero)
 static ram_addr_t ram_save_remaining(void)
 {
     return migration_dirty_pages;
+
 }
 
 uint64_t ram_bytes_remaining(void)
@@ -1979,7 +2071,9 @@ static int ram_save_init_globals(void)
     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
 
     memory_global_dirty_log_start();
+#ifndef KVM_CAP_DIRTY_LOG_RING
     migration_bitmap_sync();
+#endif
     qemu_mutex_unlock_ramlist();
     qemu_mutex_unlock_iothread();
     rcu_read_unlock();
@@ -2097,7 +2191,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
     rcu_read_lock();
 
     if (!migration_in_postcopy(migrate_get_current())) {
+#ifndef KVM_CAP_DIRTY_LOG_RING
         migration_bitmap_sync();
+#endif
     }
 
     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
@@ -2136,11 +2232,13 @@ static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
 
     if (!migration_in_postcopy(migrate_get_current()) &&
         remaining_size < max_size) {
+#ifndef KVM_CAP_DIRTY_LOG_RING
         qemu_mutex_lock_iothread();
         rcu_read_lock();
         migration_bitmap_sync();
         rcu_read_unlock();
         qemu_mutex_unlock_iothread();
+#endif
         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
     }
 
diff --git a/net/slirp.c b/net/slirp.c
index f97ec23..4471af3 100644
--- a/net/slirp.c
+++ b/net/slirp.c
@@ -562,7 +562,8 @@ static void slirp_smb_cleanup(SlirpState *s)
     int ret;
 
     if (s->smb_dir[0] != '\0') {
-        snprintf(cmd, sizeof(cmd), "rm -rf %s", s->smb_dir);
+        int l = snprintf(cmd, sizeof(cmd), "rm -rf %s", s->smb_dir);
+        assert(l < sizeof(cmd));
         ret = system(cmd);
         if (ret == -1 || !WIFEXITED(ret)) {
             error_report("'%s' failed.", cmd);
@@ -606,7 +607,8 @@ static int slirp_smb(SlirpState* s, const char *exported_dir,
         s->smb_dir[0] = 0;
         return -1;
     }
-    snprintf(smb_conf, sizeof(smb_conf), "%s/%s", s->smb_dir, "smb.conf");
+    int l = snprintf(smb_conf, sizeof(smb_conf), "%s/%s", s->smb_dir, "smb.conf");
+    assert(l < sizeof(smb_conf));
 
     f = fopen(smb_conf, "w");
     if (!f) {
@@ -651,8 +653,9 @@ static int slirp_smb(SlirpState* s, const char *exported_dir,
             );
     fclose(f);
 
-    snprintf(smb_cmdline, sizeof(smb_cmdline), "%s -l %s -s %s",
+    l = snprintf(smb_cmdline, sizeof(smb_cmdline), "%s -l %s -s %s",
              CONFIG_SMBD_COMMAND, s->smb_dir, smb_conf);
+    assert(l < sizeof(smb_cmdline));
 
     if (slirp_add_exec(s->slirp, 0, smb_cmdline, &vserver_addr, 139) < 0 ||
         slirp_add_exec(s->slirp, 0, smb_cmdline, &vserver_addr, 445) < 0) {
-- 
1.8.3.4