On Tue, Jun 12, 2018 at 05:33:08PM +0000, Cao, Lei wrote: > This patch adds support for kernel ring-based dirty memory tracking to qemu stable-2.9. > > Kernel support for ring-based dirty memory tracking is currently in the 'dirty-ring-buffer' > branch of KVM, against which this patch was tested. The patch modifies the live migration > code to utilize ring-based instead of bitmap-based dirty memory tracking. > > --- > hw/usb/bus.c | 3 +- > include/migration/migration.h | 6 ++ > include/qom/cpu.h | 18 ++++++ > include/sysemu/kvm.h | 10 +++ > kvm-all.c | 138 ++++++++++++++++++++++++++++++++++++++++++ > linux-headers/linux/kvm.h | 3 + > migration/migration.c | 6 +- > migration/ram.c | 102 ++++++++++++++++++++++++++++++- > net/slirp.c | 9 ++- > 9 files changed, 288 insertions(+), 7 deletions(-) Please send QEMU patches to qemu-devel@xxxxxxxxxx. For more information on submitting patches, please see: https://wiki.qemu.org/Contribute/SubmitAPatch > diff --git a/hw/usb/bus.c b/hw/usb/bus.c > index 24f1608..50b6913 100644 > --- a/hw/usb/bus.c > +++ b/hw/usb/bus.c > @@ -407,8 +407,9 @@ void usb_register_companion(const char *masterbus, USBPort *ports[], > void usb_port_location(USBPort *downstream, USBPort *upstream, int portnr) > { > if (upstream) { > - snprintf(downstream->path, sizeof(downstream->path), "%s.%d", > + int l = snprintf(downstream->path, sizeof(downstream->path), "%s.%d", > upstream->path, portnr); > + assert(l < sizeof(downstream->path)); > downstream->hubcount = upstream->hubcount + 1; > } else { > snprintf(downstream->path, sizeof(downstream->path), "%d", portnr); > diff --git a/include/migration/migration.h b/include/migration/migration.h > index 5720c88..08d1b31 100644 > --- a/include/migration/migration.h > +++ b/include/migration/migration.h > @@ -385,4 +385,10 @@ uint64_t ram_pagesize_summary(void); > PostcopyState postcopy_state_get(void); > /* Set the state and return the old state */ > PostcopyState postcopy_state_set(PostcopyState new_state); > + > +#ifdef KVM_CAP_DIRTY_LOG_RING > +int get_dirty_count_from_ring(void); > +int fetch_dirty_pages(bool); > +#endif > + > #endif > diff --git a/include/qom/cpu.h b/include/qom/cpu.h > index 5d10359..6839b5b 100644 > --- a/include/qom/cpu.h > +++ b/include/qom/cpu.h > @@ -231,6 +231,24 @@ struct CPUWatchpoint { > struct KVMState; > struct kvm_run; > > +#ifdef KVM_CAP_DIRTY_LOG_RING > +struct dirty_gfn { > + uint32_t pad; > + uint32_t slot; > + uint64_t offset; > +}; > + > +struct dirty_ring { > + union { > + struct { > + uint16_t avail_index; > + uint16_t fetch_index; > + } indices; > + struct dirty_gfn dirty_gfns[0]; /* slot/offset */ > + }; > +}; > +#endif > + > struct hax_vcpu_state; > > #define TB_JMP_CACHE_BITS 12 > diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h > index 24281fc..f283d7d 100644 > --- a/include/sysemu/kvm.h > +++ b/include/sysemu/kvm.h > @@ -227,6 +227,16 @@ int kvm_init_vcpu(CPUState *cpu); > int kvm_cpu_exec(CPUState *cpu); > int kvm_destroy_vcpu(CPUState *cpu); > > +#ifdef KVM_CAP_DIRTY_LOG_RING > +struct dirty_page { > + RAMBlock *block; > + ram_addr_t offset; > +}; > +int kvm_get_dirty_page_count(void); > +void kvm_sync_dirty_page_from_ring(struct dirty_page *pages, > + int *dirty_index, int max); > +#endif > + > #ifdef NEED_CPU_H > #include "cpu.h" > > diff --git a/kvm-all.c b/kvm-all.c > index 90b8573..5ee57ef 100644 > --- a/kvm-all.c > +++ b/kvm-all.c > @@ -62,6 +62,10 @@ > > #define KVM_MSI_HASHTAB_SIZE 256 > > +#ifdef KVM_CAP_DIRTY_LOG_RING > +#define KVM_DIRTY_LOG_PAGE_NUM 16 > +#endif > + > struct KVMParkedVcpu { > unsigned long vcpu_id; > int kvm_fd; > @@ -102,6 +106,10 @@ struct KVMState > #endif > KVMMemoryListener memory_listener; > QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; > +#ifdef KVM_CAP_DIRTY_LOG_RING > + struct dirty_ring *dirty_logs; > + int max_dirty_logs; > +#endif > }; > > KVMState *kvm_state; > @@ -301,6 +309,99 @@ static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) > return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); > } > > +#ifdef KVM_CAP_DIRTY_LOG_RING > +int kvm_get_dirty_page_count(void) > +{ > + CPUState *cpu; > + struct dirty_ring *ring; > + unsigned int avail, fetch; > + int total = 0; > + > + CPU_FOREACH(cpu) { > + ring = (void *)cpu->kvm_run + 64 * PAGE_SIZE; > + avail = ring->indices.avail_index; > + fetch = ring->indices.fetch_index; > + total += (avail-fetch)%kvm_state->max_dirty_logs; > + } > + > + ring = kvm_state->dirty_logs; > + avail = ring->indices.avail_index; > + fetch = ring->indices.fetch_index; > + total += (avail-fetch)%kvm_state->max_dirty_logs; > + > + return total; > +} > + > +void kvm_sync_dirty_page_from_ring(struct dirty_page *pages, > + int *dirty_index, > + int max) > +{ > + CPUState *cpu; > + int i, j; > + struct dirty_ring *ring; > + unsigned int avail; > + void *host; > + int as_id; > + uint16_t slot_id; > + int index = 0; > + > + CPU_FOREACH(cpu) { > + ring = (void *)cpu->kvm_run + 64 * PAGE_SIZE; > + avail = ring->indices.avail_index; > + while(ring->indices.fetch_index != avail) { > + for (i = 0; i < kvm_state->nr_slots; i++) { > + j = ring->indices.fetch_index & (kvm_state->max_dirty_logs-1); > + as_id = ring->dirty_gfns[j].slot >> 16; > + slot_id = (uint16_t)ring->dirty_gfns[j].slot; > + if ((kvm_state->memory_listener.as_id == as_id) && > + (kvm_state->memory_listener.slots[i].slot == slot_id)) { > + host = kvm_state->memory_listener.slots[i].ram + > + (ring->dirty_gfns[j].offset << TARGET_PAGE_BITS); > + pages[*dirty_index+index].block = qemu_ram_block_from_host(host, 0, > + &pages[*dirty_index+index].offset); > + index++; > + if (index == max) { > + ring->indices.fetch_index++; > + goto reset; > + } > + } > + } > + ring->indices.fetch_index++; > + } > + } > + > + ring = kvm_state->dirty_logs; > + avail = ring->indices.avail_index; > + while(ring->indices.fetch_index != avail) { > + for (i = 0; i < kvm_state->nr_slots; i++) { > + j = ring->indices.fetch_index & (kvm_state->max_dirty_logs-1); > + as_id = ring->dirty_gfns[j].slot >> 16; > + slot_id = (uint16_t)ring->dirty_gfns[j].slot; > + if ((kvm_state->memory_listener.as_id == as_id) && > + (kvm_state->memory_listener.slots[i].slot == slot_id)) { > + host = kvm_state->memory_listener.slots[i].ram + > + (ring->dirty_gfns[j].offset << TARGET_PAGE_BITS); > + pages[*dirty_index+index].block = qemu_ram_block_from_host(host, 0, > + &pages[*dirty_index+index].offset); > + index++; > + if (index == max) { > + ring->indices.fetch_index++; > + goto reset; > + } > + } > + } > + ring->indices.fetch_index++; > + } > + > +reset: > + if (index) { > + kvm_vm_ioctl(kvm_state, KVM_RESET_DIRTY_PAGES); > + } > + > + *dirty_index += index; > +} > +#endif > + > int kvm_init_vcpu(CPUState *cpu) > { > KVMState *s = kvm_state; > @@ -326,8 +427,13 @@ int kvm_init_vcpu(CPUState *cpu) > goto err; > } > > +#ifdef KVM_CAP_DIRTY_LOG_RING > + cpu->kvm_run = mmap(NULL, (64+KVM_DIRTY_LOG_PAGE_NUM)*PAGE_SIZE, > + PROT_READ | PROT_WRITE, MAP_SHARED, cpu->kvm_fd, 0); > +#else > cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, > cpu->kvm_fd, 0); > +#endif > if (cpu->kvm_run == MAP_FAILED) { > ret = -errno; > DPRINTF("mmap'ing vcpu state failed\n"); > @@ -1581,6 +1687,9 @@ static int kvm_init(MachineState *ms) > int ret; > int type = 0; > const char *kvm_type; > +#ifdef KVM_CAP_DIRTY_LOG_RING > + int size = KVM_DIRTY_LOG_PAGE_NUM*PAGE_SIZE; > +#endif > > s = KVM_STATE(ms->accelerator); > > @@ -1684,6 +1793,24 @@ static int kvm_init(MachineState *ms) > } > > s->vmfd = ret; > + > +#ifdef KVM_CAP_DIRTY_LOG_RING > + ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, size); > + if (ret < 0) { > + fprintf(stderr, "ioctl KVM_CAP_DIRTY_LOG_RING %d: failed: %d %s\n", size, -ret, > + strerror(-ret)); > + goto err; > + } > + s->max_dirty_logs = size/sizeof(struct dirty_gfn); > + s->dirty_logs = mmap(NULL, KVM_DIRTY_LOG_PAGE_NUM*PAGE_SIZE, > + PROT_READ | PROT_WRITE, MAP_SHARED, s->vmfd, 0); > + if (s->dirty_logs == MAP_FAILED) { > + ret = -errno; > + fprintf(stderr, "mmap'ing vm dirty logs failed\n"); > + goto err; > + } > +#endif > + > missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); > if (!missing_cap) { > missing_cap = > @@ -1951,6 +2078,8 @@ static void kvm_eat_signals(CPUState *cpu) > } while (sigismember(&chkset, SIG_IPI)); > } > > +extern int fetch_dirty_pages(bool); > + > int kvm_cpu_exec(CPUState *cpu) > { > struct kvm_run *run = cpu->kvm_run; > @@ -2005,6 +2134,15 @@ int kvm_cpu_exec(CPUState *cpu) > > if (run_ret < 0) { > if (run_ret == -EINTR || run_ret == -EAGAIN) { > +#ifdef KVM_CAP_DIRTY_LOG_RING > + if ((run_ret == -EINTR) && (run->exit_reason == KVM_EXIT_DIRTY_LOG_FULL)) { > + //error_report("kvm dirty log full\n"); > + ret = fetch_dirty_pages(true); > + kvm_eat_signals(cpu); > + continue; > + } > +#endif > + > DPRINTF("io window exit\n"); > kvm_eat_signals(cpu); > ret = EXCP_INTERRUPT; > diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h > index 4e082a8..0546d0a 100644 > --- a/linux-headers/linux/kvm.h > +++ b/linux-headers/linux/kvm.h > @@ -205,6 +205,7 @@ struct kvm_hyperv_exit { > #define KVM_EXIT_S390_STSI 25 > #define KVM_EXIT_IOAPIC_EOI 26 > #define KVM_EXIT_HYPERV 27 > +#define KVM_EXIT_DIRTY_LOG_FULL 28 > > /* For KVM_EXIT_INTERNAL_ERROR */ > /* Emulate instruction failed. */ > @@ -883,6 +884,7 @@ struct kvm_ppc_resize_hpt { > #define KVM_CAP_PPC_MMU_RADIX 134 > #define KVM_CAP_PPC_MMU_HASH_V3 135 > #define KVM_CAP_IMMEDIATE_EXIT 136 > +#define KVM_CAP_DIRTY_LOG_RING 151 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -1300,6 +1302,7 @@ struct kvm_s390_ucas_mapping { > #define KVM_S390_GET_IRQ_STATE _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state) > /* Available with KVM_CAP_X86_SMM */ > #define KVM_SMI _IO(KVMIO, 0xb7) > +#define KVM_RESET_DIRTY_PAGES _IO(KVMIO, 0xba) > > #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) > #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) > diff --git a/migration/migration.c b/migration/migration.c > index ad4036f..63b8a0f 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -17,6 +17,7 @@ > #include "qemu/cutils.h" > #include "qemu/error-report.h" > #include "qemu/main-loop.h" > +#include "linux/kvm.h" > #include "migration/migration.h" > #include "migration/qemu-file.h" > #include "sysemu/sysemu.h" > @@ -1959,10 +1960,13 @@ static void *migration_thread(void *opaque) > > if (!qemu_file_rate_limit(s->to_dst_file)) { > uint64_t pend_post, pend_nonpost; > - > qemu_savevm_state_pending(s->to_dst_file, max_size, &pend_nonpost, > &pend_post); > +#ifdef KVM_CAP_DIRTY_LOG_RING > + pending_size = pend_nonpost + pend_post + get_dirty_count_from_ring(); > +#else > pending_size = pend_nonpost + pend_post; > +#endif > trace_migrate_pending(pending_size, max_size, > pend_post, pend_nonpost); > if (pending_size && pending_size >= max_size) { > diff --git a/migration/ram.c b/migration/ram.c > index de1e0a3..05b1dff 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -35,6 +35,7 @@ > #include "qemu/bitmap.h" > #include "qemu/timer.h" > #include "qemu/main-loop.h" > +#include "linux/kvm.h" > #include "migration/migration.h" > #include "migration/postcopy-ram.h" > #include "exec/address-spaces.h" > @@ -44,6 +45,7 @@ > #include "exec/ram_addr.h" > #include "qemu/rcu_queue.h" > #include "migration/colo.h" > +#include "sysemu/kvm.h" > > static int dirty_rate_high_cnt; > > @@ -230,6 +232,8 @@ struct PageSearchStatus { > ram_addr_t offset; > /* Set once we wrap around */ > bool complete_round; > + /* page is from dirty ring */ > + bool from_ring; > }; > typedef struct PageSearchStatus PageSearchStatus; > > @@ -1055,6 +1059,70 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, > } > } > > +#ifdef KVM_CAP_DIRTY_LOG_RING > +#define NUM_DIRTY_PAGES 1024*1024 > +struct dirty_page *pages; > +static pthread_mutex_t pages_mutex = PTHREAD_MUTEX_INITIALIZER; > +int dirty_index = 0; > +int fetch_index = 0; > + > +int fetch_dirty_pages(bool lock) > +{ > + if (lock) > + pthread_mutex_lock(&pages_mutex); > + if (!pages) > + pages = malloc(sizeof(struct dirty_page)*NUM_DIRTY_PAGES); > + > + if (!(NUM_DIRTY_PAGES-dirty_index)) > + error_report("dirty ring full\n"); > + else > + kvm_sync_dirty_page_from_ring(pages, &dirty_index, NUM_DIRTY_PAGES-dirty_index); > + > + if (lock) > + pthread_mutex_unlock(&pages_mutex); > + > + return 0; > +} > + > +static bool find_dirty_block_from_ring(QEMUFile *f, PageSearchStatus *pss, > + bool *again, ram_addr_t *ram_addr_abs, bool last) > +{ > + bool found = false; > + > + *again = true; > + > + pthread_mutex_lock(&pages_mutex); > + fetch_dirty_pages(false); > + if (!dirty_index) { > + if (last) > + *again = false; > + goto out; > + } > + > + if (fetch_index < dirty_index) { > + pss->block = pages[fetch_index].block; > + pss->offset = pages[fetch_index].offset; > + *ram_addr_abs = pss->block->offset + pss->offset; > + found = true; > + fetch_index++; > + } > + > + if (dirty_index == fetch_index) { > + dirty_index = 0; > + fetch_index = 0; > + } > + > +out: > + pthread_mutex_unlock(&pages_mutex); > + return found; > +} > + > +int get_dirty_count_from_ring(void) > +{ > + return kvm_get_dirty_page_count() * TARGET_PAGE_SIZE; > +} > +#endif > + > /* > * Helper for 'get_queued_page' - gets a page off the queue > * ms: MigrationState in > @@ -1264,7 +1332,7 @@ static int ram_save_target_page(MigrationState *ms, QEMUFile *f, > int res = 0; > > /* Check the pages is dirty and if it is send it */ > - if (migration_bitmap_clear_dirty(dirty_ram_abs)) { > + if (pss->from_ring || migration_bitmap_clear_dirty(dirty_ram_abs)) { > unsigned long *unsentmap; > if (compression_switch && migrate_use_compression()) { > res = ram_save_compressed_page(ms, f, pss, > @@ -1364,6 +1432,10 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, > bool again, found; > ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in > ram_addr_t space */ > +#ifdef KVM_CAP_DIRTY_LOG_RING > + PageSearchStatus pss_ring; > +#endif > + PageSearchStatus *ptr = NULL; > > /* No dirty page as there is zero RAM */ > if (!ram_bytes_total()) { > @@ -1373,6 +1445,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, > pss.block = last_seen_block; > pss.offset = last_offset; > pss.complete_round = false; > + pss.from_ring = false; > > if (!pss.block) { > pss.block = QLIST_FIRST_RCU(&ram_list.blocks); > @@ -1381,14 +1454,32 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, > do { > again = true; > found = get_queued_page(ms, &pss, &dirty_ram_abs); > + if (found) > + ptr = &pss; > > if (!found) { > /* priority queue empty, so just search for something dirty */ > found = find_dirty_block(f, &pss, &again, &dirty_ram_abs); > + if (found) { > + ptr = &pss; > + } > } > > +#ifdef KVM_CAP_DIRTY_LOG_RING > + if (!found && !again) { > + pss_ring.block = NULL; > + pss_ring.offset = 0; > + pss_ring.complete_round = false; > + pss_ring.from_ring = true; > + found = find_dirty_block_from_ring(f, &pss_ring, &again, &dirty_ram_abs, last_stage); > + if (found) { > + ptr = &pss_ring; > + } > + } > +#endif > + > if (found) { > - pages = ram_save_host_page(ms, f, &pss, > + pages = ram_save_host_page(ms, f, ptr, > last_stage, bytes_transferred, > dirty_ram_abs); > } > @@ -1415,6 +1506,7 @@ void acct_update_position(QEMUFile *f, size_t size, bool zero) > static ram_addr_t ram_save_remaining(void) > { > return migration_dirty_pages; > + > } > > uint64_t ram_bytes_remaining(void) > @@ -1979,7 +2071,9 @@ static int ram_save_init_globals(void) > migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; > > memory_global_dirty_log_start(); > +#ifndef KVM_CAP_DIRTY_LOG_RING > migration_bitmap_sync(); > +#endif > qemu_mutex_unlock_ramlist(); > qemu_mutex_unlock_iothread(); > rcu_read_unlock(); > @@ -2097,7 +2191,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque) > rcu_read_lock(); > > if (!migration_in_postcopy(migrate_get_current())) { > +#ifndef KVM_CAP_DIRTY_LOG_RING > migration_bitmap_sync(); > +#endif > } > > ram_control_before_iterate(f, RAM_CONTROL_FINISH); > @@ -2136,11 +2232,13 @@ static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, > > if (!migration_in_postcopy(migrate_get_current()) && > remaining_size < max_size) { > +#ifndef KVM_CAP_DIRTY_LOG_RING > qemu_mutex_lock_iothread(); > rcu_read_lock(); > migration_bitmap_sync(); > rcu_read_unlock(); > qemu_mutex_unlock_iothread(); > +#endif > remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE; > } > > diff --git a/net/slirp.c b/net/slirp.c > index f97ec23..4471af3 100644 > --- a/net/slirp.c > +++ b/net/slirp.c > @@ -562,7 +562,8 @@ static void slirp_smb_cleanup(SlirpState *s) > int ret; > > if (s->smb_dir[0] != '\0') { > - snprintf(cmd, sizeof(cmd), "rm -rf %s", s->smb_dir); > + int l = snprintf(cmd, sizeof(cmd), "rm -rf %s", s->smb_dir); > + assert(l < 128); > ret = system(cmd); > if (ret == -1 || !WIFEXITED(ret)) { > error_report("'%s' failed.", cmd); > @@ -606,7 +607,8 @@ static int slirp_smb(SlirpState* s, const char *exported_dir, > s->smb_dir[0] = 0; > return -1; > } > - snprintf(smb_conf, sizeof(smb_conf), "%s/%s", s->smb_dir, "smb.conf"); > + int l = snprintf(smb_conf, sizeof(smb_conf), "%s/%s", s->smb_dir, "smb.conf"); > + assert(l < 128); > > f = fopen(smb_conf, "w"); > if (!f) { > @@ -651,8 +653,9 @@ static int slirp_smb(SlirpState* s, const char *exported_dir, > ); > fclose(f); > > - snprintf(smb_cmdline, sizeof(smb_cmdline), "%s -l %s -s %s", > + l = snprintf(smb_cmdline, sizeof(smb_cmdline), "%s -l %s -s %s", > CONFIG_SMBD_COMMAND, s->smb_dir, smb_conf); > + assert(l < 128); > > if (slirp_add_exec(s->slirp, 0, smb_cmdline, &vserver_addr, 139) < 0 || > slirp_add_exec(s->slirp, 0, smb_cmdline, &vserver_addr, 445) < 0) { > -- > 1.8.3.4 > > > >
Attachment:
signature.asc
Description: PGP signature