This patch implements postcopy livemigration. Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx> --- Makefile.target | 4 + arch_init.c | 26 +- cpu-all.h | 7 + exec.c | 20 +- migration-exec.c | 8 + migration-fd.c | 30 + migration-postcopy-stub.c | 77 ++ migration-postcopy.c | 1891 +++++++++++++++++++++++++++++++++++++++++++++ migration-tcp.c | 37 +- migration-unix.c | 32 +- migration.c | 31 + migration.h | 30 + qemu-common.h | 1 + qemu-options.hx | 5 +- umem.c | 379 +++++++++ umem.h | 105 +++ vl.c | 14 +- 17 files changed, 2677 insertions(+), 20 deletions(-) create mode 100644 migration-postcopy-stub.c create mode 100644 migration-postcopy.c create mode 100644 umem.c create mode 100644 umem.h diff --git a/Makefile.target b/Makefile.target index 3261383..d94c53f 100644 --- a/Makefile.target +++ b/Makefile.target @@ -4,6 +4,7 @@ GENERATED_HEADERS = config-target.h CONFIG_NO_PCI = $(if $(subst n,,$(CONFIG_PCI)),n,y) CONFIG_NO_KVM = $(if $(subst n,,$(CONFIG_KVM)),n,y) CONFIG_NO_XEN = $(if $(subst n,,$(CONFIG_XEN)),n,y) +CONFIG_NO_POSTCOPY = $(if $(subst n,,$(CONFIG_POSTCOPY)),n,y) include ../config-host.mak include config-devices.mak @@ -199,6 +200,9 @@ obj-$(CONFIG_NO_KVM) += kvm-stub.o obj-y += memory.o LIBS+=-lz +common-obj-$(CONFIG_POSTCOPY) += migration-postcopy.o umem.o +common-obj-$(CONFIG_NO_POSTCOPY) += migration-postcopy-stub.o + QEMU_CFLAGS += $(VNC_TLS_CFLAGS) QEMU_CFLAGS += $(VNC_SASL_CFLAGS) QEMU_CFLAGS += $(VNC_JPEG_CFLAGS) diff --git a/arch_init.c b/arch_init.c index bc53092..8b3130d 100644 --- a/arch_init.c +++ b/arch_init.c @@ -102,6 +102,13 @@ static int is_dup_page(uint8_t *page, uint8_t ch) return 1; } +static bool outgoing_postcopy = false; + +void ram_save_set_params(const MigrationParams *params, void *opaque) +{ + outgoing_postcopy = params->postcopy; +} + static RAMBlock *last_block_sent = NULL; int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) @@ -284,6 +291,17 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, 
void *opaque) uint64_t expected_time = 0; int ret; + if (stage == 1) { + last_block_sent = NULL; + + bytes_transferred = 0; + last_block = NULL; + last_offset = 0; + } + if (outgoing_postcopy) { + return postcopy_outgoing_ram_save_live(mon, f, stage, opaque); + } + if (stage < 0) { cpu_physical_memory_set_dirty_tracking(0); return 0; @@ -295,10 +313,6 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) } if (stage == 1) { - bytes_transferred = 0; - last_block_sent = NULL; - last_block = NULL; - last_offset = 0; sort_ram_list(); /* Make sure all dirty bits are set */ @@ -436,6 +450,10 @@ int ram_load(QEMUFile *f, void *opaque, int version_id) int flags; int error; + if (incoming_postcopy) { + return postcopy_incoming_ram_load(f, opaque, version_id); + } + if (version_id < 3 || version_id > RAM_SAVE_VERSION_ID) { return -EINVAL; } diff --git a/cpu-all.h b/cpu-all.h index 0244f7a..2e9d8a7 100644 --- a/cpu-all.h +++ b/cpu-all.h @@ -475,6 +475,9 @@ extern ram_addr_t ram_size; /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ #define RAM_PREALLOC_MASK (1 << 0) +/* RAM is allocated via umem for postcopy incoming mode */ +#define RAM_POSTCOPY_UMEM_MASK (1 << 1) + typedef struct RAMBlock { uint8_t *host; ram_addr_t offset; @@ -485,6 +488,10 @@ typedef struct RAMBlock { #if defined(__linux__) && !defined(TARGET_S390X) int fd; #endif + +#ifdef CONFIG_POSTCOPY + UMem *umem; /* for incoming postcopy mode */ +#endif } RAMBlock; typedef struct RAMList { diff --git a/exec.c b/exec.c index c8c6692..90b0491 100644 --- a/exec.c +++ b/exec.c @@ -35,6 +35,7 @@ #include "qemu-timer.h" #include "memory.h" #include "exec-memory.h" +#include "migration.h" #if defined(CONFIG_USER_ONLY) #include <qemu.h> #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) @@ -2949,6 +2950,13 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, new_block->host = host; new_block->flags |= RAM_PREALLOC_MASK; } else { +#ifdef CONFIG_POSTCOPY + if 
(incoming_postcopy) { + postcopy_incoming_ram_alloc(name, size, + &new_block->host, &new_block->umem); + new_block->flags |= RAM_POSTCOPY_UMEM_MASK; + } else +#endif if (mem_path) { #if defined (__linux__) && !defined(TARGET_S390X) new_block->host = file_ram_alloc(new_block, size, mem_path); @@ -3027,7 +3035,13 @@ void qemu_ram_free(ram_addr_t addr) QLIST_REMOVE(block, next); if (block->flags & RAM_PREALLOC_MASK) { ; - } else if (mem_path) { + } +#ifdef CONFIG_POSTCOPY + else if (block->flags & RAM_POSTCOPY_UMEM_MASK) { + postcopy_incoming_ram_free(block->umem); + } +#endif + else if (mem_path) { #if defined (__linux__) && !defined(TARGET_S390X) if (block->fd) { munmap(block->host, block->length); @@ -3073,6 +3087,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) } else { flags = MAP_FIXED; munmap(vaddr, length); + if (block->flags & RAM_POSTCOPY_UMEM_MASK) { + postcopy_incoming_qemu_pages_unmapped(addr, length); + block->flags &= ~RAM_POSTCOPY_UMEM_MASK; + } if (mem_path) { #if defined(__linux__) && !defined(TARGET_S390X) if (block->fd) { diff --git a/migration-exec.c b/migration-exec.c index e14552e..2bd0c3b 100644 --- a/migration-exec.c +++ b/migration-exec.c @@ -62,6 +62,10 @@ int exec_start_outgoing_migration(MigrationState *s, const char *command) { FILE *f; + if (s->params.postcopy) { + return -ENOSYS; + } + f = popen(command, "w"); if (f == NULL) { DPRINTF("Unable to popen exec target\n"); @@ -104,6 +108,10 @@ int exec_start_incoming_migration(const char *command) { QEMUFile *f; + if (incoming_postcopy) { + return -ENOSYS; + } + DPRINTF("Attempting to start an incoming migration\n"); f = qemu_popen_cmd(command, "r"); if(f == NULL) { diff --git a/migration-fd.c b/migration-fd.c index 6211124..5a62ab9 100644 --- a/migration-fd.c +++ b/migration-fd.c @@ -88,6 +88,23 @@ int fd_start_outgoing_migration(MigrationState *s, const char *fdname) s->write = fd_write; s->close = fd_close; + if (s->params.postcopy) { + int flags = fcntl(s->fd, F_GETFL); + if 
((flags & O_ACCMODE) != O_RDWR) { + goto err_after_open; + } + + s->fd_read = dup(s->fd); + if (s->fd_read == -1) { + goto err_after_open; + } + s->file_read = qemu_fdopen(s->fd_read, "r"); + if (s->file_read == NULL) { + close(s->fd_read); + goto err_after_open; + } + } + migrate_fd_connect(s); return 0; @@ -103,7 +120,14 @@ static void fd_accept_incoming_migration(void *opaque) process_incoming_migration(f); qemu_set_fd_handler2(qemu_stdio_fd(f), NULL, NULL, NULL, NULL); + if (incoming_postcopy) { + postcopy_incoming_fork_umemd(qemu_stdio_fd(f), f); + } qemu_fclose(f); + if (incoming_postcopy) { + postcopy_incoming_qemu_ready(); + } + return; } int fd_start_incoming_migration(const char *infd) @@ -114,6 +138,12 @@ int fd_start_incoming_migration(const char *infd) DPRINTF("Attempting to start an incoming migration via fd\n"); fd = strtol(infd, NULL, 0); + if (incoming_postcopy) { + int flags = fcntl(fd, F_GETFL); + if ((flags & O_ACCMODE) != O_RDWR) { + return -EINVAL; + } + } f = qemu_fdopen(fd, "rb"); if(f == NULL) { DPRINTF("Unable to apply qemu wrapper to file descriptor\n"); diff --git a/migration-postcopy-stub.c b/migration-postcopy-stub.c new file mode 100644 index 0000000..0b78de7 --- /dev/null +++ b/migration-postcopy-stub.c @@ -0,0 +1,77 @@ +/* + * migration-postcopy-stub.c: postcopy livemigration + * stub functions for non-supported hosts + * + * Copyright (c) 2011 + * National Institute of Advanced Industrial Science and Technology + * + * https://sites.google.com/site/grivonhome/quick-kvm-migration + * Author: Isaku Yamahata <yamahata at valinux co jp> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "sysemu.h" +#include "migration.h" + +int postcopy_outgoing_create_read_socket(MigrationState *s) +{ + return -ENOSYS; +} + +int postcopy_outgoing_ram_save_live(Monitor *mon, + QEMUFile *f, int stage, void *opaque) +{ + return -ENOSYS; +} + +void *postcopy_outgoing_begin(MigrationState *ms) +{ + return NULL; +} + +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f, + void *postcopy) +{ + return -ENOSYS; +} + +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy) +{ + return -ENOSYS; +} + +void postcopy_incoming_prepare(void) +{ +} + +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id) +{ + return -ENOSYS; +} + +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read) +{ +} + +void postcopy_incoming_qemu_ready(void) +{ +} + +void postcopy_incoming_qemu_cleanup(void) +{ +} + +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size) +{ +} diff --git a/migration-postcopy.c b/migration-postcopy.c new file mode 100644 index 0000000..ed0d574 --- /dev/null +++ b/migration-postcopy.c @@ -0,0 +1,1891 @@ +/* + * migration-postcopy.c: postcopy livemigration + * + * Copyright (c) 2011 + * National Institute of Advanced Industrial Science and Technology + * + * https://sites.google.com/site/grivonhome/quick-kvm-migration + * Author: Isaku Yamahata <yamahata at valinux co jp> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "bitmap.h" +#include "sysemu.h" +#include "hw/hw.h" +#include "arch_init.h" +#include "migration.h" +#include "umem.h" + +#include "memory.h" +#define WANT_EXEC_OBSOLETE +#include "exec-obsolete.h" + +//#define DEBUG_POSTCOPY +#ifdef DEBUG_POSTCOPY +#include <sys/syscall.h> +#define DPRINTF(fmt, ...) \ + do { \ + printf("%d:%ld %s:%d: " fmt, getpid(), syscall(SYS_gettid), \ + __func__, __LINE__, ## __VA_ARGS__); \ + } while (0) +#else +#define DPRINTF(fmt, ...) do { } while (0) +#endif + +#define ALIGN_UP(size, align) (((size) + (align) - 1) & ~((align) - 1)) + +static void fd_close(int *fd) +{ + if (*fd >= 0) { + close(*fd); + *fd = -1; + } +} + +/*************************************************************************** + * QEMUFile for non blocking pipe + */ + +/* read only */ +struct QEMUFilePipe { + int fd; + QEMUFile *file; +}; +typedef struct QEMUFilePipe QEMUFilePipe; + +static int pipe_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +{ + QEMUFilePipe *s = opaque; + ssize_t len = 0; + + while (size > 0) { + ssize_t ret = read(s->fd, buf, size); + if (ret == -1) { + if (errno == EINTR) { + continue; + } + if (len == 0) { + len = -errno; + } + break; + } + + if (ret == 0) { + /* the write end of the pipe is closed */ + break; + } + len += ret; + buf += ret; + size -= ret; + } + + return len; +} + +static int pipe_close(void *opaque) +{ + QEMUFilePipe *s = opaque; + g_free(s); + return 0; +} + +static QEMUFile *qemu_fopen_pipe(int fd) +{ + QEMUFilePipe *s = g_malloc0(sizeof(*s)); + + s->fd = fd; + fcntl_setfl(fd, O_NONBLOCK); + s->file = qemu_fopen_ops(s, NULL, pipe_get_buffer, pipe_close, + NULL, NULL, NULL); + return s->file; +} + +/* write only */ +struct QEMUFileNonblock { + int fd; + QEMUFile *file; + + /* for pipe-write 
nonblocking mode */ +#define BUF_SIZE_INC (32 * 1024) /* = IO_BUF_SIZE */ + uint8_t *buffer; + size_t buffer_size; + size_t buffer_capacity; + bool freeze_output; +}; +typedef struct QEMUFileNonblock QEMUFileNonblock; + +static void nonblock_flush_buffer(QEMUFileNonblock *s) +{ + size_t offset = 0; + ssize_t ret; + + while (offset < s->buffer_size) { + ret = write(s->fd, s->buffer + offset, s->buffer_size - offset); + if (ret == -1) { + if (errno == EINTR) { + continue; + } else if (errno == EAGAIN) { + s->freeze_output = true; + } else { + qemu_file_set_error(s->file, errno); + } + break; + } + + if (ret == 0) { + DPRINTF("ret == 0\n"); + break; + } + + offset += ret; + } + + if (offset > 0) { + assert(s->buffer_size >= offset); + memmove(s->buffer, s->buffer + offset, s->buffer_size - offset); + s->buffer_size -= offset; + } + if (s->buffer_size > 0) { + s->freeze_output = true; + } +} + +static int nonblock_put_buffer(void *opaque, + const uint8_t *buf, int64_t pos, int size) +{ + QEMUFileNonblock *s = opaque; + int error; + ssize_t len = 0; + + error = qemu_file_get_error(s->file); + if (error) { + return error; + } + + nonblock_flush_buffer(s); + error = qemu_file_get_error(s->file); + if (error) { + return error; + } + + while (!s->freeze_output && size > 0) { + ssize_t ret; + assert(s->buffer_size == 0); + + ret = write(s->fd, buf, size); + if (ret == -1) { + if (errno == EINTR) { + continue; + } else if (errno == EAGAIN) { + s->freeze_output = true; + } else { + qemu_file_set_error(s->file, errno); + } + break; + } + + len += ret; + buf += ret; + size -= ret; + } + + if (size > 0) { + int inc = size - (s->buffer_capacity - s->buffer_size); + if (inc > 0) { + s->buffer_capacity += + DIV_ROUND_UP(inc, BUF_SIZE_INC) * BUF_SIZE_INC; + s->buffer = g_realloc(s->buffer, s->buffer_capacity); + } + memcpy(s->buffer + s->buffer_size, buf, size); + s->buffer_size += size; + + len += size; + } + + return len; +} + +static int nonblock_pending_size(QEMUFileNonblock *s) 
+{ + return qemu_pending_size(s->file) + s->buffer_size; +} + +static void nonblock_fflush(QEMUFileNonblock *s) +{ + s->freeze_output = false; + nonblock_flush_buffer(s); + if (!s->freeze_output) { + qemu_fflush(s->file); + } +} + +static void nonblock_wait_for_flush(QEMUFileNonblock *s) +{ + while (nonblock_pending_size(s) > 0) { + fd_set fds; + FD_ZERO(&fds); + FD_SET(s->fd, &fds); + select(s->fd + 1, NULL, &fds, NULL, NULL); + + nonblock_fflush(s); + } +} + +static int nonblock_close(void *opaque) +{ + QEMUFileNonblock *s = opaque; + nonblock_wait_for_flush(s); + g_free(s->buffer); + g_free(s); + return 0; +} + +static QEMUFileNonblock *qemu_fopen_nonblock(int fd) +{ + QEMUFileNonblock *s = g_malloc0(sizeof(*s)); + + s->fd = fd; + fcntl_setfl(fd, O_NONBLOCK); + s->file = qemu_fopen_ops(s, nonblock_put_buffer, NULL, nonblock_close, + NULL, NULL, NULL); + return s; +} + +/*************************************************************************** + * umem daemon on destination <-> qemu on source protocol + */ + +#define QEMU_UMEM_REQ_INIT 0x00 +#define QEMU_UMEM_REQ_ON_DEMAND 0x01 +#define QEMU_UMEM_REQ_ON_DEMAND_CONT 0x02 +#define QEMU_UMEM_REQ_BACKGROUND 0x03 +#define QEMU_UMEM_REQ_BACKGROUND_CONT 0x04 +#define QEMU_UMEM_REQ_REMOVE 0x05 +#define QEMU_UMEM_REQ_EOC 0x06 + +struct qemu_umem_req { + int8_t cmd; + uint8_t len; + char *idstr; /* ON_DEMAND, BACKGROUND, REMOVE */ + uint32_t nr; /* ON_DEMAND, ON_DEMAND_CONT, + BACKGROUND, BACKGROUND_CONT, REMOVE */ + + /* in target page size as qemu migration protocol */ + uint64_t *pgoffs; /* ON_DEMAND, ON_DEMAND_CONT, + BACKGROUND, BACKGROUND_CONT, REMOVE */ +}; + +static void postcopy_incoming_send_req_idstr(QEMUFile *f, const char* idstr) +{ + qemu_put_byte(f, strlen(idstr)); + qemu_put_buffer(f, (uint8_t *)idstr, strlen(idstr)); +} + +static void postcopy_incoming_send_req_pgoffs(QEMUFile *f, uint32_t nr, + const uint64_t *pgoffs) +{ + uint32_t i; + + qemu_put_be32(f, nr); + for (i = 0; i < nr; i++) { + 
qemu_put_be64(f, pgoffs[i]); + } +} + +static void postcopy_incoming_send_req_one(QEMUFile *f, + const struct qemu_umem_req *req) +{ + DPRINTF("cmd %d\n", req->cmd); + qemu_put_byte(f, req->cmd); + switch (req->cmd) { + case QEMU_UMEM_REQ_INIT: + case QEMU_UMEM_REQ_EOC: + /* nothing */ + break; + case QEMU_UMEM_REQ_ON_DEMAND: + case QEMU_UMEM_REQ_BACKGROUND: + case QEMU_UMEM_REQ_REMOVE: + postcopy_incoming_send_req_idstr(f, req->idstr); + postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs); + break; + case QEMU_UMEM_REQ_ON_DEMAND_CONT: + case QEMU_UMEM_REQ_BACKGROUND_CONT: + postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs); + break; + default: + abort(); + break; + } +} + +/* QEMUFile can buffer up to IO_BUF_SIZE = 32 * 1024. + * So one message size must be <= IO_BUF_SIZE + * cmd: 1 + * id len: 1 + * id: 256 + * nr: 2 + */ +#define MAX_PAGE_NR ((32 * 1024 - 1 - 1 - 256 - 2) / sizeof(uint64_t)) +static void postcopy_incoming_send_req(QEMUFile *f, + const struct qemu_umem_req *req) +{ + uint32_t nr = req->nr; + struct qemu_umem_req tmp = *req; + + switch (req->cmd) { + case QEMU_UMEM_REQ_INIT: + case QEMU_UMEM_REQ_EOC: + postcopy_incoming_send_req_one(f, &tmp); + break; + case QEMU_UMEM_REQ_ON_DEMAND: + case QEMU_UMEM_REQ_BACKGROUND: + tmp.nr = MIN(nr, MAX_PAGE_NR); + postcopy_incoming_send_req_one(f, &tmp); + + nr -= tmp.nr; + tmp.pgoffs += tmp.nr; + if (tmp.cmd == QEMU_UMEM_REQ_ON_DEMAND) { + tmp.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT; + }else { + tmp.cmd = QEMU_UMEM_REQ_BACKGROUND_CONT; + } + /* fall through */ + case QEMU_UMEM_REQ_REMOVE: + case QEMU_UMEM_REQ_ON_DEMAND_CONT: + case QEMU_UMEM_REQ_BACKGROUND_CONT: + while (nr > 0) { + tmp.nr = MIN(nr, MAX_PAGE_NR); + postcopy_incoming_send_req_one(f, &tmp); + + nr -= tmp.nr; + tmp.pgoffs += tmp.nr; + } + break; + default: + abort(); + break; + } +} + +static int postcopy_outgoing_recv_req_idstr(QEMUFile *f, + struct qemu_umem_req *req, + size_t *offset) +{ + int ret; + + req->len = qemu_peek_byte(f, 
*offset); + *offset += 1; + if (req->len == 0) { + return -EAGAIN; + } + req->idstr = g_malloc((int)req->len + 1); + ret = qemu_peek_buffer(f, (uint8_t*)req->idstr, req->len, *offset); + *offset += ret; + if (ret != req->len) { + g_free(req->idstr); + req->idstr = NULL; + return -EAGAIN; + } + req->idstr[req->len] = 0; + return 0; +} + +static int postcopy_outgoing_recv_req_pgoffs(QEMUFile *f, + struct qemu_umem_req *req, + size_t *offset) +{ + int ret; + uint32_t be32; + uint32_t i; + + ret = qemu_peek_buffer(f, (uint8_t*)&be32, sizeof(be32), *offset); + *offset += sizeof(be32); + if (ret != sizeof(be32)) { + return -EAGAIN; + } + + req->nr = be32_to_cpu(be32); + req->pgoffs = g_new(uint64_t, req->nr); + for (i = 0; i < req->nr; i++) { + uint64_t be64; + ret = qemu_peek_buffer(f, (uint8_t*)&be64, sizeof(be64), *offset); + *offset += sizeof(be64); + if (ret != sizeof(be64)) { + g_free(req->pgoffs); + req->pgoffs = NULL; + return -EAGAIN; + } + req->pgoffs[i] = be64_to_cpu(be64); + } + return 0; +} + +static int postcopy_outgoing_recv_req(QEMUFile *f, struct qemu_umem_req *req) +{ + int size; + int ret; + size_t offset = 0; + + size = qemu_peek_buffer(f, (uint8_t*)&req->cmd, 1, offset); + if (size <= 0) { + return -EAGAIN; + } + offset += 1; + + switch (req->cmd) { + case QEMU_UMEM_REQ_INIT: + case QEMU_UMEM_REQ_EOC: + /* nothing */ + break; + case QEMU_UMEM_REQ_ON_DEMAND: + case QEMU_UMEM_REQ_BACKGROUND: + case QEMU_UMEM_REQ_REMOVE: + ret = postcopy_outgoing_recv_req_idstr(f, req, &offset); + if (ret < 0) { + return ret; + } + ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset); + if (ret < 0) { + return ret; + } + break; + case QEMU_UMEM_REQ_ON_DEMAND_CONT: + case QEMU_UMEM_REQ_BACKGROUND_CONT: + ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset); + if (ret < 0) { + return ret; + } + break; + default: + abort(); + break; + } + qemu_file_skip(f, offset); + DPRINTF("cmd %d\n", req->cmd); + return 0; +} + +static void postcopy_outgoing_free_req(struct 
qemu_umem_req *req) +{ + g_free(req->idstr); + g_free(req->pgoffs); +} + +/*************************************************************************** + * outgoing part + */ + +#define QEMU_SAVE_LIVE_STAGE_START 0x01 /* = QEMU_VM_SECTION_START */ +#define QEMU_SAVE_LIVE_STAGE_PART 0x02 /* = QEMU_VM_SECTION_PART */ +#define QEMU_SAVE_LIVE_STAGE_END 0x03 /* = QEMU_VM_SECTION_END */ + +enum POState { + PO_STATE_ERROR_RECEIVE, + PO_STATE_ACTIVE, + PO_STATE_EOC_RECEIVED, + PO_STATE_ALL_PAGES_SENT, + PO_STATE_COMPLETED, +}; +typedef enum POState POState; + +struct PostcopyOutgoingState { + POState state; + QEMUFile *mig_read; + int fd_read; + RAMBlock *last_block_read; + + QEMUFile *mig_buffered_write; + MigrationState *ms; + + /* For nobg mode. Check if all pages are sent */ + RAMBlock *block; + ram_addr_t addr; +}; +typedef struct PostcopyOutgoingState PostcopyOutgoingState; + +int postcopy_outgoing_create_read_socket(MigrationState *s) +{ + if (!s->params.postcopy) { + return 0; + } + + s->fd_read = dup(s->fd); + if (s->fd_read == -1) { + int ret = -errno; + perror("dup"); + return ret; + } + s->file_read = qemu_fopen_socket(s->fd_read); + if (s->file_read == NULL) { + return -EINVAL; + } + return 0; +} + +int postcopy_outgoing_ram_save_live(Monitor *mon, + QEMUFile *f, int stage, void *opaque) +{ + int ret = 0; + DPRINTF("stage %d\n", stage); + if (stage == QEMU_SAVE_LIVE_STAGE_START) { + sort_ram_list(); + ram_save_live_mem_size(f); + } + if (stage == QEMU_SAVE_LIVE_STAGE_PART) { + ret = 1; + } + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + return ret; +} + +static RAMBlock *postcopy_outgoing_find_block(const char *idstr) +{ + RAMBlock *block; + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (!strncmp(idstr, block->idstr, strlen(idstr))) { + return block; + } + } + return NULL; +} + +/* + * return value + * 0: continue postcopy mode + * > 0: completed postcopy mode. 
+ * < 0: error + */ +static int postcopy_outgoing_handle_req(PostcopyOutgoingState *s, + const struct qemu_umem_req *req, + bool *written) +{ + int i; + RAMBlock *block; + + DPRINTF("cmd %d state %d\n", req->cmd, s->state); + switch(req->cmd) { + case QEMU_UMEM_REQ_INIT: + /* nothing */ + break; + case QEMU_UMEM_REQ_EOC: + /* tell to finish migration. */ + if (s->state == PO_STATE_ALL_PAGES_SENT) { + s->state = PO_STATE_COMPLETED; + DPRINTF("-> PO_STATE_COMPLETED\n"); + } else { + s->state = PO_STATE_EOC_RECEIVED; + DPRINTF("-> PO_STATE_EOC_RECEIVED\n"); + } + return 1; + case QEMU_UMEM_REQ_ON_DEMAND: + case QEMU_UMEM_REQ_BACKGROUND: + DPRINTF("idstr: %s\n", req->idstr); + block = postcopy_outgoing_find_block(req->idstr); + if (block == NULL) { + return -EINVAL; + } + s->last_block_read = block; + /* fall through */ + case QEMU_UMEM_REQ_ON_DEMAND_CONT: + case QEMU_UMEM_REQ_BACKGROUND_CONT: + DPRINTF("nr %d\n", req->nr); + for (i = 0; i < req->nr; i++) { + DPRINTF("offs[%d] 0x%"PRIx64"\n", i, req->pgoffs[i]); + int ret = ram_save_page(s->mig_buffered_write, s->last_block_read, + req->pgoffs[i] << TARGET_PAGE_BITS); + if (ret > 0) { + *written = true; + } + } + break; + case QEMU_UMEM_REQ_REMOVE: + block = postcopy_outgoing_find_block(req->idstr); + if (block == NULL) { + return -EINVAL; + } + for (i = 0; i < req->nr; i++) { + ram_addr_t addr = block->offset + + (req->pgoffs[i] << TARGET_PAGE_BITS); + cpu_physical_memory_reset_dirty(addr, + addr + TARGET_PAGE_SIZE, + MIGRATION_DIRTY_FLAG); + } + break; + default: + return -EINVAL; + } + return 0; +} + +static void postcopy_outgoing_close_mig_read(PostcopyOutgoingState *s) +{ + if (s->mig_read != NULL) { + qemu_set_fd_handler(s->fd_read, NULL, NULL, NULL); + qemu_fclose(s->mig_read); + s->mig_read = NULL; + fd_close(&s->fd_read); + + s->ms->file_read = NULL; + s->ms->fd_read = -1; + } +} + +static void postcopy_outgoing_completed(PostcopyOutgoingState *s) +{ + postcopy_outgoing_close_mig_read(s); + s->ms->postcopy = 
NULL; + g_free(s); +} + +static void postcopy_outgoing_recv_handler(void *opaque) +{ + PostcopyOutgoingState *s = opaque; + bool written = false; + int ret = 0; + + assert(s->state == PO_STATE_ACTIVE || + s->state == PO_STATE_ALL_PAGES_SENT); + + do { + struct qemu_umem_req req = {.idstr = NULL, + .pgoffs = NULL}; + + ret = postcopy_outgoing_recv_req(s->mig_read, &req); + if (ret < 0) { + if (ret == -EAGAIN) { + ret = 0; + } + break; + } + if (s->state == PO_STATE_ACTIVE) { + ret = postcopy_outgoing_handle_req(s, &req, &written); + } + postcopy_outgoing_free_req(&req); + } while (ret == 0); + + /* + * flush buffered_file. + * Although mig_write is rate-limited buffered file, those written pages + * are requested on demand by the destination. So forcibly push + * those pages ignoring rate limiting + */ + if (written) { + qemu_fflush(s->mig_buffered_write); + /* qemu_buffered_file_drain(s->mig_buffered_write); */ + } + + if (ret < 0) { + switch (s->state) { + case PO_STATE_ACTIVE: + s->state = PO_STATE_ERROR_RECEIVE; + DPRINTF("-> PO_STATE_ERROR_RECEIVE\n"); + break; + case PO_STATE_ALL_PAGES_SENT: + s->state = PO_STATE_COMPLETED; + DPRINTF("-> PO_STATE_ALL_PAGES_SENT\n"); + break; + default: + abort(); + } + } + if (s->state == PO_STATE_ERROR_RECEIVE || s->state == PO_STATE_COMPLETED) { + postcopy_outgoing_close_mig_read(s); + } + if (s->state == PO_STATE_COMPLETED) { + DPRINTF("PO_STATE_COMPLETED\n"); + MigrationState *ms = s->ms; + postcopy_outgoing_completed(s); + migrate_fd_completed(ms); + } +} + +void *postcopy_outgoing_begin(MigrationState *ms) +{ + PostcopyOutgoingState *s = g_new(PostcopyOutgoingState, 1); + DPRINTF("outgoing begin\n"); + qemu_fflush(ms->file); + + s->ms = ms; + s->state = PO_STATE_ACTIVE; + s->fd_read = ms->fd_read; + s->mig_read = ms->file_read; + s->mig_buffered_write = ms->file; + s->block = NULL; + s->addr = 0; + + /* Make sure all dirty bits are set */ + ram_save_memory_set_dirty(); + + qemu_set_fd_handler(s->fd_read, + 
&postcopy_outgoing_recv_handler, NULL, s); + return s; +} + +static void postcopy_outgoing_ram_all_sent(QEMUFile *f, + PostcopyOutgoingState *s) +{ + assert(s->state == PO_STATE_ACTIVE); + + s->state = PO_STATE_ALL_PAGES_SENT; + /* tell incoming side that all pages are sent */ + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + qemu_fflush(f); + qemu_buffered_file_drain(f); + DPRINTF("sent RAM_SAVE_FLAG_EOS\n"); + migrate_fd_cleanup(s->ms); + + /* Later migrate_fd_complete() will be called which calls + * migrate_fd_cleanup() again. So dummy file is created + * for qemu monitor to keep working. + */ + s->ms->file = qemu_fopen_ops(NULL, NULL, NULL, NULL, NULL, + NULL, NULL); +} + +static int postcopy_outgoing_check_all_ram_sent(PostcopyOutgoingState *s, + RAMBlock *block, + ram_addr_t addr) +{ + if (block == NULL) { + block = QLIST_FIRST(&ram_list.blocks); + addr = block->offset; + } + + for (; block != NULL; + s->block = QLIST_NEXT(s->block, next), addr = block->offset) { + for (; addr < block->offset + block->length; + addr += TARGET_PAGE_SIZE) { + if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) { + s->block = block; + s->addr = addr; + return 0; + } + } + } + + return 1; +} + +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f, + void *postcopy) +{ + PostcopyOutgoingState *s = postcopy; + + assert(s->state == PO_STATE_ACTIVE || + s->state == PO_STATE_EOC_RECEIVED || + s->state == PO_STATE_ERROR_RECEIVE); + + switch (s->state) { + case PO_STATE_ACTIVE: + /* nothing. processed below */ + break; + case PO_STATE_EOC_RECEIVED: + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + s->state = PO_STATE_COMPLETED; + postcopy_outgoing_completed(s); + DPRINTF("PO_STATE_COMPLETED\n"); + return 1; + case PO_STATE_ERROR_RECEIVE: + postcopy_outgoing_completed(s); + DPRINTF("PO_STATE_ERROR_RECEIVE\n"); + return -1; + default: + abort(); + } + + if (s->ms->params.nobg) { + /* See if all pages are sent. 
*/ + if (postcopy_outgoing_check_all_ram_sent(s, s->block, s->addr) == 0) { + return 0; + } + /* ram_list can be reordered. (it doesn't seem so during migration, + though) So the whole list needs to be checked again */ + if (postcopy_outgoing_check_all_ram_sent(s, NULL, 0) == 0) { + return 0; + } + + postcopy_outgoing_ram_all_sent(f, s); + return 0; + } + + DPRINTF("outgoing background state: %d\n", s->state); + + while (qemu_file_rate_limit(f) == 0) { + if (ram_save_block(f) == 0) { /* no more blocks */ + assert(s->state == PO_STATE_ACTIVE); + postcopy_outgoing_ram_all_sent(f, s); + return 0; + } + } + + return 0; +} + +/*************************************************************************** + * incoming part + */ + +/* flags for incoming mode to modify the behavior. + This is for benchmark/debug purpose */ +#define INCOMING_FLAGS_FAULT_REQUEST 0x01 + + +static void postcopy_incoming_umemd(void); + +#define PIS_STATE_QUIT_RECEIVED 0x01 +#define PIS_STATE_QUIT_QUEUED 0x02 +#define PIS_STATE_QUIT_SENT 0x04 + +#define PIS_STATE_QUIT_MASK (PIS_STATE_QUIT_RECEIVED | \ + PIS_STATE_QUIT_QUEUED | \ + PIS_STATE_QUIT_SENT) + +struct PostcopyIncomingState { + /* dest qemu state */ + uint32_t state; + + UMemDev *dev; + int host_page_size; + int host_page_shift; + + /* qemu side */ + int to_umemd_fd; + QEMUFileNonblock *to_umemd; +#define MAX_FAULTED_PAGES 256 + struct umem_pages *faulted_pages; + + int from_umemd_fd; + QEMUFile *from_umemd; + int version_id; /* save/load format version id */ +}; +typedef struct PostcopyIncomingState PostcopyIncomingState; + + +#define UMEM_STATE_EOS_RECEIVED 0x01 /* umem daemon <-> src qemu */ +#define UMEM_STATE_EOC_SENT 0x02 /* umem daemon <-> src qemu */ +#define UMEM_STATE_QUIT_RECEIVED 0x04 /* umem daemon <-> dst qemu */ +#define UMEM_STATE_QUIT_QUEUED 0x08 /* umem daemon <-> dst qemu */ +#define UMEM_STATE_QUIT_SENT 0x10 /* umem daemon <-> dst qemu */ + +#define UMEM_STATE_QUIT_MASK (UMEM_STATE_QUIT_QUEUED | \ + UMEM_STATE_QUIT_SENT 
| \ + UMEM_STATE_QUIT_RECEIVED) +#define UMEM_STATE_END_MASK (UMEM_STATE_EOS_RECEIVED | \ + UMEM_STATE_EOC_SENT | \ + UMEM_STATE_QUIT_MASK) + +struct PostcopyIncomingUMemDaemon { + /* umem daemon side */ + uint32_t state; + + int host_page_size; + int host_page_shift; + int nr_host_pages_per_target_page; + int host_to_target_page_shift; + int nr_target_pages_per_host_page; + int target_to_host_page_shift; + int version_id; /* save/load format version id */ + + int to_qemu_fd; + QEMUFileNonblock *to_qemu; + int from_qemu_fd; + QEMUFile *from_qemu; + + int mig_read_fd; + QEMUFile *mig_read; /* qemu on source -> umem daemon */ + + int mig_write_fd; + QEMUFileNonblock *mig_write; /* umem daemon -> qemu on source */ + + /* = KVM_MAX_VCPUS * (ASYNC_PF_PER_VCPUS + 1) */ +#define MAX_REQUESTS (512 * (64 + 1)) + + struct umem_page_request page_request; + struct umem_page_cached page_cached; + +#define MAX_PRESENT_REQUESTS MAX_FAULTED_PAGES + struct umem_pages *present_request; + + uint64_t *target_pgoffs; + + /* bitmap indexed by target page offset */ + unsigned long *phys_requested; + + /* bitmap indexed by target page offset */ + unsigned long *phys_received; + + RAMBlock *last_block_read; /* qemu on source -> umem daemon */ + RAMBlock *last_block_write; /* umem daemon -> qemu on source */ +}; +typedef struct PostcopyIncomingUMemDaemon PostcopyIncomingUMemDaemon; + +static PostcopyIncomingState state = { + .state = 0, + .dev = NULL, + .to_umemd_fd = -1, + .to_umemd = NULL, + .from_umemd_fd = -1, + .from_umemd = NULL, +}; + +static PostcopyIncomingUMemDaemon umemd = { + .state = 0, + .to_qemu_fd = -1, + .to_qemu = NULL, + .from_qemu_fd = -1, + .from_qemu = NULL, + .mig_read_fd = -1, + .mig_read = NULL, + .mig_write_fd = -1, + .mig_write = NULL, +}; + +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy) +{ + /* incoming_postcopy makes sense only when incoming migration mode */ + if (!incoming && incoming_postcopy) { + return -EINVAL; + } + + if 
(!incoming_postcopy) { + return 0; + } + + state.state = 0; + state.dev = umem_dev_new(); + state.host_page_size = getpagesize(); + state.host_page_shift = ffs(state.host_page_size) - 1; + state.version_id = RAM_SAVE_VERSION_ID; /* = save version of + ram_save_live() */ + return 0; +} + +void postcopy_incoming_ram_alloc(const char *name, + size_t size, uint8_t **hostp, UMem **umemp) +{ + UMem *umem; + size = ALIGN_UP(size, state.host_page_size); + umem = umem_dev_create(state.dev, size, name); + + *umemp = umem; + *hostp = umem->umem; +} + +void postcopy_incoming_ram_free(UMem *umem) +{ + umem_unmap(umem); + umem_close(umem); + umem_destroy(umem); +} + +void postcopy_incoming_prepare(void) +{ + RAMBlock *block; + + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (block->umem != NULL) { + umem_mmap(block->umem); + } + } +} + +static int postcopy_incoming_ram_load_get64(QEMUFile *f, + ram_addr_t *addr, int *flags) +{ + *addr = qemu_get_be64(f); + *flags = *addr & ~TARGET_PAGE_MASK; + *addr &= TARGET_PAGE_MASK; + return qemu_file_get_error(f); +} + +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id) +{ + ram_addr_t addr; + int flags; + int error; + + DPRINTF("incoming ram load\n"); + /* + * RAM_SAVE_FLAGS_EOS or + * RAM_SAVE_FLAGS_MEM_SIZE + mem size + RAM_SAVE_FLAGS_EOS + * see postcopy_outgoing_ram_save_live() + */ + + if (version_id != RAM_SAVE_VERSION_ID) { + DPRINTF("RAM_SAVE_VERSION_ID %d != %d\n", + version_id, RAM_SAVE_VERSION_ID); + return -EINVAL; + } + error = postcopy_incoming_ram_load_get64(f, &addr, &flags); + DPRINTF("addr 0x%lx flags 0x%x\n", addr, flags); + if (error) { + DPRINTF("error %d\n", error); + return error; + } + if (flags == RAM_SAVE_FLAG_EOS && addr == 0) { + DPRINTF("EOS\n"); + return 0; + } + + if (flags != RAM_SAVE_FLAG_MEM_SIZE) { + DPRINTF("-EINVAL flags 0x%x\n", flags); + return -EINVAL; + } + error = ram_load_mem_size(f, addr); + if (error) { + DPRINTF("addr 0x%lx error %d\n", addr, error); + return error; 
+ } + + error = postcopy_incoming_ram_load_get64(f, &addr, &flags); + if (error) { + DPRINTF("addr 0x%lx flags 0x%x error %d\n", addr, flags, error); + return error; + } + if (flags == RAM_SAVE_FLAG_EOS && addr == 0) { + DPRINTF("done\n"); + return 0; + } + DPRINTF("-EINVAL\n"); + return -EINVAL; +} + +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read) +{ + int fds[2]; + RAMBlock *block; + + DPRINTF("fork\n"); + + /* socketpair(AF_UNIX)? */ + + if (qemu_pipe(fds) == -1) { + perror("qemu_pipe"); + abort(); + } + state.from_umemd_fd = fds[0]; + umemd.to_qemu_fd = fds[1]; + + if (qemu_pipe(fds) == -1) { + perror("qemu_pipe"); + abort(); + } + umemd.from_qemu_fd = fds[0]; + state.to_umemd_fd = fds[1]; + + pid_t child = fork(); + if (child < 0) { + perror("fork"); + abort(); + } + + if (child == 0) { + int mig_write_fd; + + fd_close(&state.to_umemd_fd); + fd_close(&state.from_umemd_fd); + umemd.host_page_size = state.host_page_size; + umemd.host_page_shift = state.host_page_shift; + + umemd.nr_host_pages_per_target_page = + TARGET_PAGE_SIZE / umemd.host_page_size; + umemd.nr_target_pages_per_host_page = + umemd.host_page_size / TARGET_PAGE_SIZE; + + umemd.target_to_host_page_shift = + ffs(umemd.nr_host_pages_per_target_page) - 1; + umemd.host_to_target_page_shift = + ffs(umemd.nr_target_pages_per_host_page) - 1; + + umemd.state = 0; + umemd.version_id = state.version_id; + umemd.mig_read_fd = mig_read_fd; + umemd.mig_read = mig_read; + + mig_write_fd = dup(mig_read_fd); + if (mig_write_fd < 0) { + perror("could not dup for writable socket \n"); + abort(); + } + umemd.mig_write_fd = mig_write_fd; + umemd.mig_write = qemu_fopen_nonblock(mig_write_fd); + + postcopy_incoming_umemd(); /* noreturn */ + } + + DPRINTF("qemu pid: %d daemon pid: %d\n", getpid(), child); + fd_close(&umemd.to_qemu_fd); + fd_close(&umemd.from_qemu_fd); + state.faulted_pages = g_malloc(umem_pages_size(MAX_FAULTED_PAGES)); + state.faulted_pages->nr = 0; + + /* close all 
UMem.shmem_fd */ + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (block->umem != NULL) { umem_close_shmem(block->umem); } /* skip blocks not backed by umem */ + } + umem_qemu_wait_for_daemon(state.from_umemd_fd); +} + +static void postcopy_incoming_qemu_recv_quit(void) +{ + RAMBlock *block; + if (state.state & PIS_STATE_QUIT_RECEIVED) { + return; + } + + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (block->umem != NULL) { + umem_destroy(block->umem); + block->umem = NULL; + block->flags &= ~RAM_POSTCOPY_UMEM_MASK; + } + } + + DPRINTF("|= PIS_STATE_QUIT_RECEIVED\n"); + state.state |= PIS_STATE_QUIT_RECEIVED; + qemu_set_fd_handler(state.from_umemd_fd, NULL, NULL, NULL); + qemu_fclose(state.from_umemd); + state.from_umemd = NULL; + fd_close(&state.from_umemd_fd); +} + +static void postcopy_incoming_qemu_fflush_to_umemd_handler(void *opaque) +{ + assert(state.to_umemd != NULL); + + nonblock_fflush(state.to_umemd); + if (nonblock_pending_size(state.to_umemd) > 0) { + return; + } + + qemu_set_fd_handler(state.to_umemd->fd, NULL, NULL, NULL); + if (state.state & PIS_STATE_QUIT_QUEUED) { + DPRINTF("|= PIS_STATE_QUIT_SENT\n"); + state.state |= PIS_STATE_QUIT_SENT; + qemu_fclose(state.to_umemd->file); + state.to_umemd = NULL; + fd_close(&state.to_umemd_fd); + g_free(state.faulted_pages); + state.faulted_pages = NULL; + } +} + +static void postcopy_incoming_qemu_fflush_to_umemd(void) +{ + qemu_set_fd_handler(state.to_umemd->fd, NULL, + postcopy_incoming_qemu_fflush_to_umemd_handler, NULL); + postcopy_incoming_qemu_fflush_to_umemd_handler(NULL); +} + +static void postcopy_incoming_qemu_queue_quit(void) +{ + if (state.state & PIS_STATE_QUIT_QUEUED) { + return; + } + + DPRINTF("|= PIS_STATE_QUIT_QUEUED\n"); + umem_qemu_quit(state.to_umemd->file); + state.state |= PIS_STATE_QUIT_QUEUED; +} + +static void postcopy_incoming_qemu_send_pages_present(void) +{ + if (state.faulted_pages->nr > 0) { + umem_qemu_send_pages_present(state.to_umemd->file, + state.faulted_pages); + state.faulted_pages->nr = 0; + } +} + +static void
postcopy_incoming_qemu_faulted_pages( + const struct umem_pages *pages) +{ + assert(pages->nr <= MAX_FAULTED_PAGES); + assert(state.faulted_pages != NULL); + + if (state.faulted_pages->nr + pages->nr > MAX_FAULTED_PAGES) { + postcopy_incoming_qemu_send_pages_present(); + } + memcpy(&state.faulted_pages->pgoffs[state.faulted_pages->nr], + &pages->pgoffs[0], sizeof(pages->pgoffs[0]) * pages->nr); + state.faulted_pages->nr += pages->nr; +} + +static void postcopy_incoming_qemu_cleanup_umem(void); + +static int postcopy_incoming_qemu_handle_req_one(void) +{ + int offset = 0; + int ret; + uint8_t cmd; + + ret = qemu_peek_buffer(state.from_umemd, &cmd, sizeof(cmd), offset); + offset += sizeof(cmd); + if (ret != sizeof(cmd)) { + return -EAGAIN; + } + DPRINTF("cmd %c\n", cmd); + + switch (cmd) { + case UMEM_DAEMON_QUIT: + postcopy_incoming_qemu_recv_quit(); + postcopy_incoming_qemu_queue_quit(); + postcopy_incoming_qemu_cleanup_umem(); + break; + case UMEM_DAEMON_TRIGGER_PAGE_FAULT: { + struct umem_pages *pages = + umem_qemu_trigger_page_fault(state.from_umemd, &offset); + if (pages == NULL) { + return -EAGAIN; + } + if (state.to_umemd_fd >= 0 && !(state.state & PIS_STATE_QUIT_QUEUED)) { + postcopy_incoming_qemu_faulted_pages(pages); + } + g_free(pages); /* free even if the request is dropped */ + break; + } + case UMEM_DAEMON_ERROR: + /* umem daemon hit troubles, so it warned us to stop vm execution */ + vm_stop(RUN_STATE_IO_ERROR); /* or RUN_STATE_INTERNAL_ERROR */ + break; + default: + abort(); + break; + } + + if (state.from_umemd != NULL) { + qemu_file_skip(state.from_umemd, offset); + } + return 0; +} + +static void postcopy_incoming_qemu_handle_req(void *opaque) +{ + do { + int ret = postcopy_incoming_qemu_handle_req_one(); + if (ret == -EAGAIN) { + break; + } + } while (state.from_umemd != NULL && + qemu_pending_size(state.from_umemd) > 0); + + if (state.to_umemd != NULL) { + if (state.faulted_pages->nr > 0) { + postcopy_incoming_qemu_send_pages_present(); + } + postcopy_incoming_qemu_fflush_to_umemd(); +
} +} + +void postcopy_incoming_qemu_ready(void) +{ + umem_qemu_ready(state.to_umemd_fd); + + state.from_umemd = qemu_fopen_pipe(state.from_umemd_fd); + state.to_umemd = qemu_fopen_nonblock(state.to_umemd_fd); + qemu_set_fd_handler(state.from_umemd_fd, + postcopy_incoming_qemu_handle_req, NULL, NULL); +} + +static void postcopy_incoming_qemu_cleanup_umem(void) +{ + /* when qemu will quit before completing postcopy, tell umem daemon + to tear down umem device and exit. */ + if (state.to_umemd_fd >= 0) { + postcopy_incoming_qemu_queue_quit(); + postcopy_incoming_qemu_fflush_to_umemd(); + } + + if (state.dev) { + umem_dev_destroy(state.dev); + state.dev = NULL; + } +} + +void postcopy_incoming_qemu_cleanup(void) +{ + postcopy_incoming_qemu_cleanup_umem(); + if (state.to_umemd != NULL) { + nonblock_wait_for_flush(state.to_umemd); + } +} + +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size) +{ + uint64_t nr = DIV_ROUND_UP(size, state.host_page_size); + size_t len = umem_pages_size(nr); + ram_addr_t end = addr + size; + struct umem_pages *pages; + int i; + + if (state.to_umemd_fd < 0 || state.state & PIS_STATE_QUIT_QUEUED) { + return; + } + pages = g_malloc(len); + pages->nr = nr; + for (i = 0; addr < end; addr += state.host_page_size, i++) { + pages->pgoffs[i] = addr >> state.host_page_shift; + } + umem_qemu_send_pages_unmapped(state.to_umemd->file, pages); + g_free(pages); + assert(state.to_umemd != NULL); + postcopy_incoming_qemu_fflush_to_umemd(); +} + +/************************************************************************** + * incoming umem daemon + */ + +static void postcopy_incoming_umem_recv_quit(void) +{ + if (umemd.state & UMEM_STATE_QUIT_RECEIVED) { + return; + } + DPRINTF("|= UMEM_STATE_QUIT_RECEIVED\n"); + umemd.state |= UMEM_STATE_QUIT_RECEIVED; + qemu_fclose(umemd.from_qemu); + umemd.from_qemu = NULL; + fd_close(&umemd.from_qemu_fd); +} + +static void postcopy_incoming_umem_queue_quit(void) +{ + if (umemd.state & 
UMEM_STATE_QUIT_QUEUED) { + return; + } + DPRINTF("|= UMEM_STATE_QUIT_QUEUED\n"); + umem_daemon_quit(umemd.to_qemu->file); + umemd.state |= UMEM_STATE_QUIT_QUEUED; +} + +static void postcopy_incoming_umem_send_eoc_req(void) +{ + struct qemu_umem_req req; + + if (umemd.state & UMEM_STATE_EOC_SENT) { + return; + } + + DPRINTF("|= UMEM_STATE_EOC_SENT\n"); + req.cmd = QEMU_UMEM_REQ_EOC; + postcopy_incoming_send_req(umemd.mig_write->file, &req); + umemd.state |= UMEM_STATE_EOC_SENT; + qemu_fclose(umemd.mig_write->file); + umemd.mig_write = NULL; + fd_close(&umemd.mig_write_fd); +} + +static void postcopy_incoming_umem_send_page_req(RAMBlock *block) +{ + struct qemu_umem_req req; + int bit; + uint64_t target_pgoff; + int i; + + umemd.page_request.nr = MAX_REQUESTS; + umem_get_page_request(block->umem, &umemd.page_request); + DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n", + block->idstr, umemd.page_request.nr, + (uint64_t)umemd.page_request.pgoffs[0], + (uint64_t)umemd.page_request.pgoffs[1]); + + if (umemd.last_block_write != block) { + req.cmd = QEMU_UMEM_REQ_ON_DEMAND; + req.idstr = block->idstr; + } else { + req.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT; + } + + req.nr = 0; + req.pgoffs = umemd.target_pgoffs; + if (TARGET_PAGE_SIZE >= umemd.host_page_size) { + for (i = 0; i < umemd.page_request.nr; i++) { + target_pgoff = + umemd.page_request.pgoffs[i] >> umemd.host_to_target_page_shift; + bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff; + + if (!test_and_set_bit(bit, umemd.phys_requested)) { + req.pgoffs[req.nr] = target_pgoff; + req.nr++; + } + } + } else { + for (i = 0; i < umemd.page_request.nr; i++) { + int j; + target_pgoff = + umemd.page_request.pgoffs[i] << umemd.host_to_target_page_shift; + bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff; + + for (j = 0; j < umemd.nr_target_pages_per_host_page; j++) { + if (!test_and_set_bit(bit + j, umemd.phys_requested)) { + req.pgoffs[req.nr] = target_pgoff + j; + req.nr++; + } + } + } + } + + DPRINTF("id 
%s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n", + block->idstr, req.nr, req.pgoffs[0], req.pgoffs[1]); + if (req.nr > 0 && umemd.mig_write != NULL) { + postcopy_incoming_send_req(umemd.mig_write->file, &req); + umemd.last_block_write = block; + } +} + +static void postcopy_incoming_umem_send_pages_present(void) +{ + if (umemd.present_request->nr > 0) { + umem_daemon_send_pages_present(umemd.to_qemu->file, + umemd.present_request); + umemd.present_request->nr = 0; + } +} + +static void postcopy_incoming_umem_pages_present_one( + uint32_t nr, const __u64 *pgoffs, uint64_t ramblock_pgoffset) +{ + uint32_t i; + assert(nr <= MAX_PRESENT_REQUESTS); + + if (umemd.present_request->nr + nr > MAX_PRESENT_REQUESTS) { + postcopy_incoming_umem_send_pages_present(); + } + + for (i = 0; i < nr; i++) { + umemd.present_request->pgoffs[umemd.present_request->nr + i] = + pgoffs[i] + ramblock_pgoffset; + } + umemd.present_request->nr += nr; +} + +static void postcopy_incoming_umem_pages_present( + const struct umem_page_cached *page_cached, uint64_t ramblock_pgoffset) +{ + uint32_t left = page_cached->nr; + uint32_t offset = 0; + + while (left > 0) { + uint32_t nr = MIN(left, MAX_PRESENT_REQUESTS); + postcopy_incoming_umem_pages_present_one( + nr, &page_cached->pgoffs[offset], ramblock_pgoffset); + + left -= nr; + offset += nr; + } +} + +static int postcopy_incoming_umem_ram_load(void) +{ + ram_addr_t offset; + int flags; + int error; + void *shmem; + int i; + int bit; + + if (umemd.version_id != RAM_SAVE_VERSION_ID) { + return -EINVAL; + } + + offset = qemu_get_be64(umemd.mig_read); + + flags = offset & ~TARGET_PAGE_MASK; + offset &= TARGET_PAGE_MASK; + + assert(!(flags & RAM_SAVE_FLAG_MEM_SIZE)); + + if (flags & RAM_SAVE_FLAG_EOS) { + DPRINTF("RAM_SAVE_FLAG_EOS\n"); + postcopy_incoming_umem_send_eoc_req(); + + qemu_fclose(umemd.mig_read); + umemd.mig_read = NULL; + fd_close(&umemd.mig_read_fd); + umemd.state |= UMEM_STATE_EOS_RECEIVED; + + postcopy_incoming_umem_queue_quit(); + 
DPRINTF("|= UMEM_STATE_EOS_RECEIVED\n"); + return 0; + } + + shmem = ram_load_host_from_stream_offset(umemd.mig_read, offset, flags, + &umemd.last_block_read); + if (!shmem) { + DPRINTF("shmem == NULL\n"); + return -EINVAL; + } + + if (flags & RAM_SAVE_FLAG_COMPRESS) { + uint8_t ch = qemu_get_byte(umemd.mig_read); + memset(shmem, ch, TARGET_PAGE_SIZE); + } else if (flags & RAM_SAVE_FLAG_PAGE) { + qemu_get_buffer(umemd.mig_read, shmem, TARGET_PAGE_SIZE); + } + + error = qemu_file_get_error(umemd.mig_read); + if (error) { + DPRINTF("error %d\n", error); + return error; + } + + umemd.page_cached.nr = 0; + bit = (umemd.last_block_read->offset + offset) >> TARGET_PAGE_BITS; + if (!test_and_set_bit(bit, umemd.phys_received)) { + if (TARGET_PAGE_SIZE >= umemd.host_page_size) { + __u64 pgoff = offset >> umemd.host_page_shift; + for (i = 0; i < umemd.nr_host_pages_per_target_page; i++) { + umemd.page_cached.pgoffs[umemd.page_cached.nr] = pgoff + i; + umemd.page_cached.nr++; + } + } else { + bool mark_cache = true; + for (i = 0; i < umemd.nr_target_pages_per_host_page; i++) { + if (!test_bit(bit + i, umemd.phys_received)) { + mark_cache = false; + break; + } + } + if (mark_cache) { + umemd.page_cached.pgoffs[0] = offset >> umemd.host_page_shift; + umemd.page_cached.nr = 1; + } + } + } + + if (umemd.page_cached.nr > 0) { + umem_mark_page_cached(umemd.last_block_read->umem, &umemd.page_cached); + + if (!(umemd.state & UMEM_STATE_QUIT_QUEUED) && umemd.to_qemu_fd >=0 && + (incoming_postcopy_flags & INCOMING_FLAGS_FAULT_REQUEST)) { + uint64_t ramblock_pgoffset; + + ramblock_pgoffset = + umemd.last_block_read->offset >> umemd.host_page_shift; + postcopy_incoming_umem_pages_present(&umemd.page_cached, + ramblock_pgoffset); + } + } + + return 0; +} + +static bool postcopy_incoming_umem_check_umem_done(void) +{ + bool all_done = true; + RAMBlock *block; + + QLIST_FOREACH(block, &ram_list.blocks, next) { + UMem *umem = block->umem; + if (umem != NULL && umem->nsets == umem->nbits) { + 
umem_unmap_shmem(umem); + umem_destroy(umem); + block->umem = NULL; + } + if (block->umem != NULL) { + all_done = false; + } + } + return all_done; +} + +static bool postcopy_incoming_umem_page_faulted(const struct umem_pages *pages) +{ + int i; + + for (i = 0; i < pages->nr; i++) { + ram_addr_t addr = pages->pgoffs[i] << umemd.host_page_shift; + RAMBlock *block = qemu_get_ram_block(addr); + addr -= block->offset; + umem_remove_shmem(block->umem, addr, umemd.host_page_size); + } + return postcopy_incoming_umem_check_umem_done(); +} + +static bool +postcopy_incoming_umem_page_unmapped(const struct umem_pages *pages) +{ + RAMBlock *block; + ram_addr_t addr; + int i; + + struct qemu_umem_req req = { + .cmd = QEMU_UMEM_REQ_REMOVE, + .nr = 0, + .pgoffs = (uint64_t*)pages->pgoffs, + }; + + addr = pages->pgoffs[0] << umemd.host_page_shift; + block = qemu_get_ram_block(addr); + + for (i = 0; i < pages->nr; i++) { + int pgoff; + + addr = pages->pgoffs[i] << umemd.host_page_shift; + pgoff = addr >> TARGET_PAGE_BITS; + if (!test_bit(pgoff, umemd.phys_received) && + !test_bit(pgoff, umemd.phys_requested)) { + req.pgoffs[req.nr] = pgoff; + req.nr++; + } + set_bit(pgoff, umemd.phys_received); + set_bit(pgoff, umemd.phys_requested); + + umem_remove_shmem(block->umem, + addr - block->offset, umemd.host_page_size); + } + if (req.nr > 0 && umemd.mig_write != NULL) { + req.idstr = block->idstr; + postcopy_incoming_send_req(umemd.mig_write->file, &req); + } + + return postcopy_incoming_umem_check_umem_done(); +} + +static void postcopy_incoming_umem_done(void) +{ + postcopy_incoming_umem_send_eoc_req(); + postcopy_incoming_umem_queue_quit(); +} + +static int postcopy_incoming_umem_handle_qemu(void) +{ + int ret; + int offset = 0; + uint8_t cmd; + + ret = qemu_peek_buffer(umemd.from_qemu, &cmd, sizeof(cmd), offset); + offset += sizeof(cmd); + if (ret != sizeof(cmd)) { + return -EAGAIN; + } + DPRINTF("cmd %c\n", cmd); + switch (cmd) { + case UMEM_QEMU_QUIT: + 
postcopy_incoming_umem_recv_quit(); + postcopy_incoming_umem_done(); + break; + case UMEM_QEMU_PAGE_FAULTED: { + struct umem_pages *pages = umem_recv_pages(umemd.from_qemu, + &offset); + if (pages == NULL) { + return -EAGAIN; + } + if (postcopy_incoming_umem_page_faulted(pages)){ + postcopy_incoming_umem_done(); + } + g_free(pages); + break; + } + case UMEM_QEMU_PAGE_UNMAPPED: { + struct umem_pages *pages = umem_recv_pages(umemd.from_qemu, + &offset); + if (pages == NULL) { + return -EAGAIN; + } + if (postcopy_incoming_umem_page_unmapped(pages)){ + postcopy_incoming_umem_done(); + } + g_free(pages); + break; + } + default: + abort(); + break; + } + if (umemd.from_qemu != NULL) { + qemu_file_skip(umemd.from_qemu, offset); + } + return 0; +} + +static void set_fd(int fd, fd_set *fds, int *nfds) +{ + FD_SET(fd, fds); + if (fd > *nfds) { + *nfds = fd; + } +} + +static int postcopy_incoming_umemd_main_loop(void) +{ + fd_set writefds; + fd_set readfds; + int nfds; + RAMBlock *block; + int ret; + + int pending_size; + bool get_page_request; + + nfds = -1; + FD_ZERO(&writefds); + FD_ZERO(&readfds); + + if (umemd.mig_write != NULL) { + pending_size = nonblock_pending_size(umemd.mig_write); + if (pending_size > 0) { + set_fd(umemd.mig_write_fd, &writefds, &nfds); + } + } else { + pending_size = 0; + } + +#define PENDING_SIZE_MAX (MAX_REQUESTS * sizeof(uint64_t) * 2) + /* If page request to the migration source is accumulated, + suspend getting page fault request. 
*/ + get_page_request = (pending_size <= PENDING_SIZE_MAX); + + if (get_page_request) { + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (block->umem != NULL) { + set_fd(block->umem->fd, &readfds, &nfds); + } + } + } + + if (umemd.mig_read_fd >= 0) { + set_fd(umemd.mig_read_fd, &readfds, &nfds); + } + + if (umemd.to_qemu != NULL && + nonblock_pending_size(umemd.to_qemu) > 0) { + set_fd(umemd.to_qemu_fd, &writefds, &nfds); + } + if (umemd.from_qemu_fd >= 0) { + set_fd(umemd.from_qemu_fd, &readfds, &nfds); + } + + ret = select(nfds + 1, &readfds, &writefds, NULL, NULL); + if (ret == -1) { + if (errno == EINTR) { + return 0; + } + return ret; + } + + if (umemd.mig_write_fd >= 0 && FD_ISSET(umemd.mig_write_fd, &writefds)) { + nonblock_fflush(umemd.mig_write); + } + if (umemd.to_qemu_fd >= 0 && FD_ISSET(umemd.to_qemu_fd, &writefds)) { + nonblock_fflush(umemd.to_qemu); + } + if (get_page_request) { + QLIST_FOREACH(block, &ram_list.blocks, next) { + if (block->umem != NULL && FD_ISSET(block->umem->fd, &readfds)) { + postcopy_incoming_umem_send_page_req(block); + } + } + } + if (umemd.mig_read_fd >= 0 && FD_ISSET(umemd.mig_read_fd, &readfds)) { + do { + ret = postcopy_incoming_umem_ram_load(); + if (ret < 0) { + return ret; + } + } while (umemd.mig_read != NULL && + qemu_pending_size(umemd.mig_read) > 0); + } + if (umemd.from_qemu_fd >= 0 && FD_ISSET(umemd.from_qemu_fd, &readfds)) { + do { + ret = postcopy_incoming_umem_handle_qemu(); + if (ret == -EAGAIN) { + break; + } + } while (umemd.from_qemu != NULL && + qemu_pending_size(umemd.from_qemu) > 0); + } + + if (umemd.mig_write != NULL) { + nonblock_fflush(umemd.mig_write); + } + if (umemd.to_qemu != NULL) { + if (!(umemd.state & UMEM_STATE_QUIT_QUEUED)) { + postcopy_incoming_umem_send_pages_present(); + } + nonblock_fflush(umemd.to_qemu); + if ((umemd.state & UMEM_STATE_QUIT_QUEUED) && + nonblock_pending_size(umemd.to_qemu) == 0) { + DPRINTF("|= UMEM_STATE_QUIT_SENT\n"); + qemu_fclose(umemd.to_qemu->file); + 
umemd.to_qemu = NULL; + fd_close(&umemd.to_qemu_fd); + umemd.state |= UMEM_STATE_QUIT_SENT; + } + } + + return (umemd.state & UMEM_STATE_END_MASK) == UMEM_STATE_END_MASK; +} + +static void postcopy_incoming_umemd(void) +{ + ram_addr_t last_ram_offset; + int nbits; + RAMBlock *block; + int ret; + + qemu_daemon(1, 1); + signal(SIGPIPE, SIG_IGN); + DPRINTF("daemon pid: %d\n", getpid()); + + umemd.page_request.pgoffs = g_new(__u64, MAX_REQUESTS); + umemd.page_cached.pgoffs = + g_new(__u64, MAX_REQUESTS * + /* >= 1 even when the host page is larger than the target page */ + MAX(1, umemd.nr_host_pages_per_target_page)); + umemd.target_pgoffs = + g_new(uint64_t, MAX_REQUESTS * + MAX(umemd.nr_host_pages_per_target_page, + umemd.nr_target_pages_per_host_page)); + umemd.present_request = g_malloc(umem_pages_size(MAX_PRESENT_REQUESTS)); + umemd.present_request->nr = 0; + + last_ram_offset = qemu_last_ram_offset(); + nbits = last_ram_offset >> TARGET_PAGE_BITS; + umemd.phys_requested = g_new0(unsigned long, BITS_TO_LONGS(nbits)); + umemd.phys_received = g_new0(unsigned long, BITS_TO_LONGS(nbits)); + umemd.last_block_read = NULL; + umemd.last_block_write = NULL; + + QLIST_FOREACH(block, &ram_list.blocks, next) { + UMem *umem = block->umem; + if (umem == NULL) { continue; } /* block is not backed by umem */ + umem->umem = NULL; /* VM_DONT_COPY mapping was lost by fork */ + block->host = umem_map_shmem(umem); + umem_close_shmem(umem); + } + umem_daemon_ready(umemd.to_qemu_fd); + umemd.to_qemu = qemu_fopen_nonblock(umemd.to_qemu_fd); + + /* wait for qemu to disown migration_fd */ + umem_daemon_wait_for_qemu(umemd.from_qemu_fd); + umemd.from_qemu = qemu_fopen_pipe(umemd.from_qemu_fd); + + DPRINTF("entering umemd main loop\n"); + for (;;) { + ret = postcopy_incoming_umemd_main_loop(); + if (ret != 0) { + break; + } + } + DPRINTF("exiting umemd main loop\n"); + + /* This daemon forked from qemu and the parent qemu is still running.
+ * Cleanups of linked libraries like SDL should not be triggered, + * otherwise the parent qemu may use resources which was already freed. + */ + fflush(stdout); + fflush(stderr); + _exit(ret < 0? EXIT_FAILURE: 0); +} diff --git a/migration-tcp.c b/migration-tcp.c index cf6a9b8..aa35050 100644 --- a/migration-tcp.c +++ b/migration-tcp.c @@ -63,18 +63,25 @@ static void tcp_wait_for_connect(void *opaque) } while (ret == -1 && (socket_error()) == EINTR); if (ret < 0) { - migrate_fd_error(s); - return; + goto error_out; } qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); - if (val == 0) + if (val == 0) { + ret = postcopy_outgoing_create_read_socket(s); + if (ret < 0) { + goto error_out; + } migrate_fd_connect(s); - else { + } else { DPRINTF("error connecting %d\n", val); - migrate_fd_error(s); + goto error_out; } + return; + +error_out: + migrate_fd_error(s); } int tcp_start_outgoing_migration(MigrationState *s, const char *host_port) @@ -112,11 +119,19 @@ int tcp_start_outgoing_migration(MigrationState *s, const char *host_port) if (ret < 0) { DPRINTF("connect failed\n"); - migrate_fd_error(s); - return ret; + goto error_out; + } + + ret = postcopy_outgoing_create_read_socket(s); + if (ret < 0) { + goto error_out; } migrate_fd_connect(s); return 0; + +error_out: + migrate_fd_error(s); + return ret; } static void tcp_accept_incoming_migration(void *opaque) @@ -145,7 +160,15 @@ static void tcp_accept_incoming_migration(void *opaque) } process_incoming_migration(f); + if (incoming_postcopy) { + postcopy_incoming_fork_umemd(c, f); + } qemu_fclose(f); + if (incoming_postcopy) { + /* now socket is disowned. 
+ So tell umem server that it's safe to use it */ + postcopy_incoming_qemu_ready(); + } out: close(c); out2: diff --git a/migration-unix.c b/migration-unix.c index dfcf203..3707505 100644 --- a/migration-unix.c +++ b/migration-unix.c @@ -69,12 +69,20 @@ static void unix_wait_for_connect(void *opaque) qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); - if (val == 0) + if (val == 0) { + ret = postcopy_outgoing_create_read_socket(s); + if (ret < 0) { + goto error_out; + } migrate_fd_connect(s); - else { + } else { DPRINTF("error connecting %d\n", val); - migrate_fd_error(s); + goto error_out; } + return; + +error_out: + migrate_fd_error(s); } int unix_start_outgoing_migration(MigrationState *s, const char *path) @@ -109,11 +117,19 @@ int unix_start_outgoing_migration(MigrationState *s, const char *path) if (ret < 0) { DPRINTF("connect failed\n"); - migrate_fd_error(s); - return ret; + goto error_out; + } + + ret = postcopy_outgoing_create_read_socket(s); + if (ret < 0) { + goto error_out; } migrate_fd_connect(s); return 0; + +error_out: + migrate_fd_error(s); + return ret; } static void unix_accept_incoming_migration(void *opaque) @@ -142,7 +158,13 @@ static void unix_accept_incoming_migration(void *opaque) } process_incoming_migration(f); + if (incoming_postcopy) { + postcopy_incoming_fork_umemd(c, f); + } qemu_fclose(f); + if (incoming_postcopy) { + postcopy_incoming_qemu_ready(); + } out: close(c); out2: diff --git a/migration.c b/migration.c index 0149ab3..51efe44 100644 --- a/migration.c +++ b/migration.c @@ -39,6 +39,11 @@ enum { MIG_STATE_COMPLETED, }; +enum { + MIG_SUBSTATE_PRECOPY, + MIG_SUBSTATE_POSTCOPY, +}; + #define MAX_THROTTLE (32 << 20) /* Migration speed throttling */ static NotifierList migration_state_notifiers = @@ -255,6 +260,18 @@ static void migrate_fd_put_ready(void *opaque) return; } + if (s->substate == MIG_SUBSTATE_POSTCOPY) { + /* PRINTF("postcopy background\n"); */ + ret = postcopy_outgoing_ram_save_background(s->mon, s->file, + 
s->postcopy); + if (ret > 0) { + migrate_fd_completed(s); + } else if (ret < 0) { + migrate_fd_error(s); + } + return; + } + DPRINTF("iterate\n"); ret = qemu_savevm_state_iterate(s->mon, s->file); if (ret < 0) { @@ -265,6 +282,19 @@ static void migrate_fd_put_ready(void *opaque) DPRINTF("done iterating\n"); vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); + if (s->params.postcopy) { + if (qemu_savevm_state_complete(s->mon, s->file) < 0) { + migrate_fd_error(s); + if (old_vm_running) { + vm_start(); + } + return; + } + s->substate = MIG_SUBSTATE_POSTCOPY; + s->postcopy = postcopy_outgoing_begin(s); + return; + } + if (qemu_savevm_state_complete(s->mon, s->file) < 0) { migrate_fd_error(s); } else { @@ -357,6 +387,7 @@ void migrate_fd_connect(MigrationState *s) int ret; s->state = MIG_STATE_ACTIVE; + s->substate = MIG_SUBSTATE_PRECOPY; s->file = qemu_fopen_ops_buffered(s, s->bandwidth_limit, migrate_fd_put_buffer, diff --git a/migration.h b/migration.h index 90ae362..2809e99 100644 --- a/migration.h +++ b/migration.h @@ -40,6 +40,12 @@ struct MigrationState int (*write)(MigrationState *s, const void *buff, size_t size); void *opaque; MigrationParams params; + + /* for postcopy */ + int substate; /* precopy or postcopy */ + int fd_read; + QEMUFile *file_read; /* connection from the destination */ + void *postcopy; }; void process_incoming_migration(QEMUFile *f); @@ -86,6 +92,7 @@ uint64_t ram_bytes_remaining(void); uint64_t ram_bytes_transferred(void); uint64_t ram_bytes_total(void); +void ram_save_set_params(const MigrationParams *params, void *opaque); void sort_ram_list(void); int ram_save_block(QEMUFile *f); void ram_save_memory_set_dirty(void); @@ -107,7 +114,30 @@ void migrate_add_blocker(Error *reason); */ void migrate_del_blocker(Error *reason); +/* For outgoing postcopy */ +int postcopy_outgoing_create_read_socket(MigrationState *s); +int postcopy_outgoing_ram_save_live(Monitor *mon, + QEMUFile *f, int stage, void *opaque); +void
*postcopy_outgoing_begin(MigrationState *s); +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f, + void *postcopy); + +/* For incoming postcopy */ extern bool incoming_postcopy; extern unsigned long incoming_postcopy_flags; +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy); +void postcopy_incoming_ram_alloc(const char *name, + size_t size, uint8_t **hostp, UMem **umemp); +void postcopy_incoming_ram_free(UMem *umem); +void postcopy_incoming_prepare(void); + +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id); +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read); +void postcopy_incoming_qemu_ready(void); +void postcopy_incoming_qemu_cleanup(void); +#ifdef NEED_CPU_H +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size); +#endif + #endif diff --git a/qemu-common.h b/qemu-common.h index 725922b..d74a8c9 100644 --- a/qemu-common.h +++ b/qemu-common.h @@ -17,6 +17,7 @@ typedef struct DeviceState DeviceState; struct Monitor; typedef struct Monitor Monitor; +typedef struct UMem UMem; /* we put basic includes here to avoid repeating them in device drivers */ #include <stdlib.h> diff --git a/qemu-options.hx b/qemu-options.hx index 5c5b8f3..19e20f9 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2510,7 +2510,10 @@ DEF("postcopy-flags", HAS_ARG, QEMU_OPTION_postcopy_flags, "-postcopy-flags unsigned-int(flags)\n" " flags for postcopy incoming migration\n" " when -incoming and -postcopy are specified.\n" - " This is for benchmark/debug purpose (default: 0)\n", + " This is for benchmark/debug purpose (default: 0)\n" + " Currently supprted flags are\n" + " 1: enable fault request from umemd to qemu\n" + " (default: disabled)\n", QEMU_ARCH_ALL) STEXI @item -postcopy-flags int diff --git a/umem.c b/umem.c new file mode 100644 index 0000000..b7be006 --- /dev/null +++ b/umem.c @@ -0,0 +1,379 @@ +/* + * umem.c: user process backed memory module for postcopy livemigration + 
*
+ * Copyright (c) 2011
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <limits.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/umem.h>
+
+#include "bitops.h"
+#include "sysemu.h"
+#include "hw/hw.h"
+#include "umem.h"
+
+/* Uncomment to trace every call with pid, tid, function and line. */
+//#define DEBUG_UMEM
+#ifdef DEBUG_UMEM
+#include <sys/syscall.h>
+#define DPRINTF(format, ...)                                            \
+    do {                                                                \
+        printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid),    \
+               __func__, __LINE__, ## __VA_ARGS__);                     \
+    } while (0)
+#else
+#define DPRINTF(format, ...)    do { } while (0)
+#endif
+
+#define DEV_UMEM        "/dev/umem"
+
+/* Handle on the umem control device. */
+struct UMemDev {
+    int fd;             /* descriptor of DEV_UMEM */
+    int page_shift;     /* log2 of the host page size */
+};
+
+/*
+ * Open DEV_UMEM read/write and record the host page shift.
+ *
+ * Aborts if the device cannot be opened.  The returned object is released
+ * with umem_dev_destroy().
+ */
+UMemDev *umem_dev_new(void)
+{
+    UMemDev *umem_dev;
+    int umem_dev_fd = open(DEV_UMEM, O_RDWR);
+    if (umem_dev_fd < 0) {
+        perror("can't open "DEV_UMEM);
+        abort();
+    }
+
+    umem_dev = g_new(UMemDev, 1);
+    umem_dev->fd = umem_dev_fd;
+    umem_dev->page_shift = ffs(getpagesize()) - 1;
+    return umem_dev;
+}
+
+/* Close the control device and free @dev. */
+void umem_dev_destroy(UMemDev *dev)
+{
+    close(dev->fd);
+    g_free(dev);
+}
+
+/*
+ * Create a umem object of @size bytes labelled @name on device @dev.
+ * @size must be a multiple of the host page size.
+ *
+ * Obtains a umem fd / shmem fd pair through UMEM_DEV_CREATE_UMEM, sizes the
+ * shmem file with ftruncate() and reserves an anonymous private mapping
+ * which umem_mmap() later replaces with the device mapping.
+ *
+ * Aborts on any failure.  The returned object is released with
+ * umem_destroy().
+ */
+UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name)
+{
+    struct umem_create create = {
+        .size = size,
+        .async_req_max = 0,
+        .sync_req_max = 0,
+    };
+    UMem *umem;
+
+    assert((size % getpagesize()) == 0);
+
+    /*
+     * snprintf() always NUL-terminates and the rest of create is already
+     * zero-initialized, so both identifiers are well formed even when
+     * truncated.  The pid is printed as the unsigned value it is cast to.
+     */
+    snprintf(create.name.id, sizeof(create.name.id),
+             "pid-%"PRIu64, (uint64_t)getpid());
+    snprintf(create.name.name, sizeof(create.name.name), "%s", name);
+
+    if (ioctl(dev->fd, UMEM_DEV_CREATE_UMEM, &create) < 0) {
+        perror("UMEM_DEV_CREATE_UMEM");
+        abort();
+    }
+    if (ftruncate(create.shmem_fd, create.size) < 0) {
+        perror("truncate(\"shmem_fd\")");
+        abort();
+    }
+
+    /* g_new0(): fields not assigned below (notably shmem) start as NULL/0 */
+    umem = g_new0(UMem, 1);
+    umem->nbits = 0;
+    umem->nsets = 0;
+    umem->faulted = NULL;
+    umem->page_shift = dev->page_shift;
+    umem->fd = create.umem_fd;
+    umem->shmem_fd = create.shmem_fd;
+    umem->size = create.size;
+    /*
+     * Reserve exactly the length that umem_mmap() will map with MAP_FIXED,
+     * so the fixed mapping can never extend past this reservation.
+     */
+    umem->umem = mmap(NULL, umem->size, PROT_EXEC | PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (umem->umem == MAP_FAILED) {
+        perror("mmap(UMem) failed");
+        abort();
+    }
+    return umem;
+}
+
+/*
+ * Map the umem device fd over the range reserved by umem_dev_create()
+ * (MAP_PRIVATE | MAP_FIXED at umem->umem).  Aborts on failure.
+ */
+void umem_mmap(UMem *umem)
+{
+    void *ret = mmap(umem->umem, umem->size,
+                     PROT_EXEC | PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_FIXED, umem->fd, 0);
+    if (ret == MAP_FAILED) {
+        perror("umem_mmap(UMem) failed");
+        abort();
+    }
+}
+
+/*
+ * Close whichever of the two descriptors is still open and free the
+ * faulted bitmap and @umem itself.  Mappings are not torn down here; use
+ * umem_unmap() / umem_unmap_shmem() for that.
+ */
+void umem_destroy(UMem *umem)
+{
+    if (umem->fd != -1) {
+        close(umem->fd);
+    }
+    if (umem->shmem_fd != -1) {
+        close(umem->shmem_fd);
+    }
+    g_free(umem->faulted);
+    g_free(umem);
+}
+
+/* Issue UMEM_GET_PAGE_REQUEST on the umem fd; aborts if the ioctl fails. */
+void umem_get_page_request(UMem *umem, struct umem_page_request *page_request)
+{
+    if (ioctl(umem->fd, UMEM_GET_PAGE_REQUEST, page_request)) {
+        perror("daemon: UMEM_GET_PAGE_REQUEST");
+        abort();
+    }
+}
+
+/* Issue UMEM_MARK_PAGE_CACHED on the umem fd; aborts if the ioctl fails. */
+void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached)
+{
+    if (ioctl(umem->fd, UMEM_MARK_PAGE_CACHED, page_cached)) {
+        perror("daemon: UMEM_MARK_PAGE_CACHED");
+        abort();
+    }
+}
+
+/* Drop the mapping at umem->umem.  Calling it again is a no-op. */
+void umem_unmap(UMem *umem)
+{
+    if (umem->umem == NULL) {
+        return;
+    }
+    munmap(umem->umem, umem->size);
+    umem->umem = NULL;
+}
+
+/* Close the umem device fd and mark it closed for umem_destroy(). */
+void umem_close(UMem *umem)
+{
+    close(umem->fd);
+    umem->fd = -1;
+}
+
+/*
+ * Allocate the zeroed per-page "faulted" bitmap and map the shmem fd
+ * shared read/write.  Returns the mapping address; aborts on failure.
+ */
+void *umem_map_shmem(UMem *umem)
+{
+    umem->nbits = umem->size >> umem->page_shift;
+    umem->nsets = 0;
+    umem->faulted = g_new0(unsigned long, BITS_TO_LONGS(umem->nbits));
+
+    umem->shmem = mmap(NULL, umem->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                       umem->shmem_fd, 0);
+    if (umem->shmem == MAP_FAILED) {
+        perror("daemon: mmap(\"shmem\")");
+        abort();
+    }
+    return umem->shmem;
+}
+
+/* Drop the shmem mapping.  A no-op when nothing is mapped. */
+void umem_unmap_shmem(UMem *umem)
+{
+    if (umem->shmem == NULL) {
+        return;
+    }
+    munmap(umem->shmem, umem->size);
+    umem->shmem = NULL;
+}
+
+/*
+ * Mark every whole page in [@offset, @offset + @size) as faulted, counting
+ * newly marked pages in nsets.  If at least one page was newly marked and
+ * MADV_REMOVE is available, the range is released from the shmem mapping.
+ *
+ * Requires umem_map_shmem() to have been called (it allocates the bitmap).
+ */
+void umem_remove_shmem(UMem *umem, size_t offset, size_t size)
+{
+    int s = offset >> umem->page_shift;
+    int e = (offset + size) >> umem->page_shift;
+    bool newly_marked = false;
+    int i;
+
+    for (i = s; i < e; i++) {
+        if (!test_and_set_bit(i, umem->faulted)) {
+            umem->nsets++;
+            newly_marked = true;
+        }
+    }
+
+    /*
+     * One call for the whole range is enough: repeating the identical
+     * madvise() for each newly marked page changes nothing but the cost.
+     */
+#if defined(CONFIG_MADVISE) && defined(MADV_REMOVE)
+    if (newly_marked) {
+        madvise((uint8_t *)umem->shmem + offset, size, MADV_REMOVE);
+    }
+#else
+    (void)newly_marked;
+#endif
+}
+
+/* Close the shmem fd and mark it closed for umem_destroy(). */
+void umem_close_shmem(UMem *umem)
+{
+    close(umem->shmem_fd);
+    umem->shmem_fd = -1;
+}
+
+/***************************************************************************/
+/* qemu <-> umem daemon communication */
+
+/* Size in bytes of a struct umem_pages message carrying @nr page offsets. */
+size_t umem_pages_size(uint64_t nr)
+{
+    return sizeof(struct umem_pages) + nr * sizeof(uint64_t);
+}
+
+/*
+ * Write the single command byte @cmd to @fd, retrying on EINTR.
+ * A closed pipe (EPIPE) is reported and tolerated; any other error aborts.
+ */
+static void umem_write_cmd(int fd, uint8_t cmd)
+{
+    DPRINTF("write cmd %c\n", cmd);
+
+    for (;;) {
+        ssize_t ret = write(fd, &cmd, 1);
+        int err;
+
+        if (ret != -1) {
+            break;
+        }
+
+        err = errno;    /* capture before perror() can overwrite it */
+        if (err == EINTR) {
+            continue;
+        }
+
+        perror("pipe");
+        if (err == EPIPE) {
+            DPRINTF("write cmd %c %zd %d: pipe is closed\n",
+                    cmd, ret, err);
+            break;
+        }
+
+        DPRINTF("write cmd %c %zd %d\n", cmd, ret, err);
+        abort();
+    }
+}
+
+/*
+ * Read one command byte from @fd, retrying on EINTR, and check that it is
+ * @expect.  Aborts on read error, end of file or an unexpected command.
+ */
+static void umem_read_cmd(int fd, uint8_t expect)
+{
+    uint8_t cmd = 0;    /* defined value for the traces on the error paths */
+
+    for (;;) {
+        ssize_t ret = read(fd, &cmd, 1);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            }
+            perror("pipe");
+            DPRINTF("read error cmd %c %zd %d\n", cmd, ret, errno);
+            abort();
+        }
+
+        if (ret == 0) {
+            DPRINTF("read cmd %c %zd: pipe is closed\n", cmd, ret);
+            abort();
+        }
+
+        break;
+    }
+
+    DPRINTF("read cmd %c\n", cmd);
+    if (cmd != expect) {
+        DPRINTF("cmd %c expect %d\n", cmd, expect);
+        abort();
+    }
+}
+
+/*
+ * Peek a struct umem_pages message from @f starting at *@offset.
+ *
+ * *@offset is advanced past whatever this function tried to read, even
+ * when it fails.  Returns a g_malloc()ed message, or NULL when the count
+ * or the offsets could not be peeked in full or the count is 0.  Aborts on
+ * a count too large to be peeked in one request.
+ */
+struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset)
+{
+    int ret;
+    uint64_t nr = 0;    /* stays defined for the trace on a short peek */
+    size_t size;
+    struct umem_pages *pages;
+
+    ret = qemu_peek_buffer(f, (uint8_t*)&nr, sizeof(nr), *offset);
+    *offset += sizeof(nr);
+    DPRINTF("ret %d nr %"PRIu64"\n", ret, nr);
+    if (ret != (int)sizeof(nr) || nr == 0) {
+        return NULL;
+    }
+
+    /*
+     * nr comes off the wire.  Bound it so that the message size neither
+     * wraps nor exceeds what the int-typed peek result can report.
+     */
+    if (nr > (INT_MAX - sizeof(struct umem_pages)) / sizeof(uint64_t)) {
+        fprintf(stderr, "umem: bogus page count %"PRIu64"\n", nr);
+        abort();
+    }
+
+    size = umem_pages_size(nr);
+    pages = g_malloc(size);
+    pages->nr = nr;
+    size -= sizeof(pages->nr);
+
+    ret = qemu_peek_buffer(f, (uint8_t*)pages->pgoffs, size, *offset);
+    *offset += size;
+    if (ret < 0 || (size_t)ret != size) {
+        g_free(pages);
+        return NULL;
+    }
+    return pages;
+}
+
+/* Queue the whole message (count followed by the offsets) on @f. */
+static void umem_send_pages(QEMUFile *f, const struct umem_pages *pages)
+{
+    size_t len = umem_pages_size(pages->nr);
+    qemu_put_buffer(f, (const uint8_t*)pages, len);
+}
+
+/* umem daemon -> qemu */
+
+/* Send UMEM_DAEMON_READY over the raw descriptor @to_qemu_fd. */
+void umem_daemon_ready(int to_qemu_fd)
+{
+    umem_write_cmd(to_qemu_fd, UMEM_DAEMON_READY);
+}
+
+/* Queue UMEM_DAEMON_QUIT on @to_qemu. */
+void umem_daemon_quit(QEMUFile *to_qemu)
+{
+    qemu_put_byte(to_qemu, UMEM_DAEMON_QUIT);
+}
+
+/* Queue UMEM_DAEMON_TRIGGER_PAGE_FAULT followed by @pages on @to_qemu. */
+void umem_daemon_send_pages_present(QEMUFile *to_qemu,
+                                    struct umem_pages *pages)
+{
+    qemu_put_byte(to_qemu, UMEM_DAEMON_TRIGGER_PAGE_FAULT);
+    umem_send_pages(to_qemu, pages);
+}
+
+/* Block until UMEM_QEMU_READY arrives on @from_qemu_fd; aborts otherwise. */
+void umem_daemon_wait_for_qemu(int from_qemu_fd)
+{
+    umem_read_cmd(from_qemu_fd, UMEM_QEMU_READY);
+}
+
+/* qemu -> umem daemon */
+
+/* Block until UMEM_DAEMON_READY arrives on @from_umemd_fd; aborts otherwise. */
+void umem_qemu_wait_for_daemon(int from_umemd_fd)
+{
+    umem_read_cmd(from_umemd_fd, UMEM_DAEMON_READY);
+}
+
+/* Send UMEM_QEMU_READY over the raw descriptor @to_umemd_fd. */
+void umem_qemu_ready(int to_umemd_fd)
+{
+    umem_write_cmd(to_umemd_fd, UMEM_QEMU_READY);
+}
+
+/* Queue UMEM_QEMU_QUIT on @to_umemd. */
+void umem_qemu_quit(QEMUFile *to_umemd)
+{
+    qemu_put_byte(to_umemd, UMEM_QEMU_QUIT);
+}
+
+/*
+ * qemu side handler.
+ *
+ * Receive a page list from @from_umemd with umem_recv_pages() and read the
+ * first byte of every listed page so that each one is faulted in.
+ *
+ * Returns the received list, which this function does not free, or NULL
+ * when umem_recv_pages() returned NULL.  *@offset is advanced by
+ * umem_recv_pages().
+ */
+struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
+                                                int *offset)
+{
+    uint64_t i;
+    int page_shift = ffs(getpagesize()) - 1;
+    struct umem_pages *pages = umem_recv_pages(from_umemd, offset);
+    if (pages == NULL) {
+        return NULL;
+    }
+
+    for (i = 0; i < pages->nr; i++) {
+        /* pgoffs[] holds page indices; scale to a RAM address */
+        ram_addr_t addr = pages->pgoffs[i] << page_shift;
+
+        /* make pages present by forcibly triggering page fault. */
+        volatile uint8_t *ram = qemu_get_ram_ptr(addr);
+        uint8_t dummy_read = ram[0];
+        (void)dummy_read;   /* suppress unused variable warning */
+    }
+
+    return pages;
+}
+
+/* Queue UMEM_QEMU_PAGE_FAULTED followed by @pages on @to_umemd. */
+void umem_qemu_send_pages_present(QEMUFile *to_umemd,
+                                  const struct umem_pages *pages)
+{
+    qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_FAULTED);
+    umem_send_pages(to_umemd, pages);
+}
+
+/* Queue UMEM_QEMU_PAGE_UNMAPPED followed by @pages on @to_umemd. */
+void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
+                                   const struct umem_pages *pages)
+{
+    qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_UNMAPPED);
+    umem_send_pages(to_umemd, pages);
+}
diff --git a/umem.h b/umem.h
new file mode 100644
index 0000000..5ca19ef
--- /dev/null
+++ b/umem.h
@@ -0,0 +1,105 @@
+/*
+ * umem.h: user process backed memory module for postcopy livemigration
+ *
+ * Copyright (c) 2011
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_UMEM_H
+#define QEMU_UMEM_H
+
+#include <linux/umem.h>
+
+#include "qemu-common.h"
+
+typedef struct UMemDev UMemDev;
+
+struct UMem {
+    void *umem;         /* reserved range, device-mapped by umem_mmap() */
+    int fd;             /* umem device fd, -1 after umem_close() */
+    void *shmem;        /* valid from umem_map_shmem() to umem_unmap_shmem() */
+    int shmem_fd;       /* shmem fd, -1 after umem_close_shmem() */
+    uint64_t size;      /* length in bytes of both mappings */
+
+    /* page bookkeeping, in units of the host page size */
+    int page_shift;     /* log2 of the host page size */
+    int nbits;          /* number of pages, i.e. bits in faulted */
+    int nsets;          /* number of bits set in faulted */
+    unsigned long *faulted;     /* bitmap, allocated by umem_map_shmem() */
+};
+
+UMemDev *umem_dev_new(void);
+void umem_dev_destroy(UMemDev *dev);
+UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name);
+void umem_mmap(UMem *umem);
+
+void umem_destroy(UMem *umem);
+
+/* umem device operations (ioctl wrappers abort on failure) */
+void umem_get_page_request(UMem *umem, struct umem_page_request *page_request);
+void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached);
+void umem_unmap(UMem *umem);
+void umem_close(UMem *umem);
+
+/* umem shmem operations */
+void *umem_map_shmem(UMem *umem);
+void umem_unmap_shmem(UMem *umem);
+void umem_remove_shmem(UMem *umem, size_t offset, size_t size);
+void umem_close_shmem(UMem *umem);
+
+/* qemu on source <-> umem daemon communication */
+
+struct umem_pages {
+    uint64_t nr;        /* nr = 0 means completed */
+    uint64_t pgoffs[];  /* nr page indices (C99 flexible array member) */
+};
+
+/* daemon -> qemu */
+#define UMEM_DAEMON_READY               'R'
+#define UMEM_DAEMON_QUIT                'Q'
+#define UMEM_DAEMON_TRIGGER_PAGE_FAULT  'T'
+#define UMEM_DAEMON_ERROR               'E'
+
+/* qemu -> daemon */
+#define UMEM_QEMU_READY                 'r'
+#define UMEM_QEMU_QUIT                  'q'
+#define UMEM_QEMU_PAGE_FAULTED          't'
+#define UMEM_QEMU_PAGE_UNMAPPED         'u'
+
+struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset);
+size_t umem_pages_size(uint64_t nr);
+
+/* for umem daemon */
+void umem_daemon_ready(int to_qemu_fd);
+void umem_daemon_wait_for_qemu(int from_qemu_fd);
+void umem_daemon_quit(QEMUFile *to_qemu);
+void umem_daemon_send_pages_present(QEMUFile *to_qemu,
+                                    struct umem_pages *pages);
+
+/* for qemu */
+void umem_qemu_wait_for_daemon(int from_umemd_fd);
+void umem_qemu_ready(int to_umemd_fd);
+void umem_qemu_quit(QEMUFile *to_umemd);
+struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
+                                                int *offset);
+void umem_qemu_send_pages_present(QEMUFile *to_umemd,
+                                  const struct umem_pages *pages);
+void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
+                                   const struct umem_pages *pages);
+
+#endif /* QEMU_UMEM_H */
diff --git a/vl.c b/vl.c
index 5430b8c..17427a0 100644
--- a/vl.c
+++ b/vl.c
@@ -3274,8 +3274,12 @@ int main(int argc, char **argv, char **envp)
     default_drive(default_sdcard, snapshot, machine->use_scsi,
                   IF_SD, 0, SD_OPTS);
 
-    register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID, NULL,
-                         ram_save_live, NULL, ram_load, NULL);
+    if (postcopy_incoming_init(incoming, incoming_postcopy) < 0) {
+        exit(1);
+    }
+    register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID,
+                         ram_save_set_params, ram_save_live, NULL,
+                         ram_load, NULL);
 
     if (nb_numa_nodes > 0) {
         int i;
@@ -3471,6 +3475,9 @@ int main(int argc, char **argv, char **envp)
 
     if (incoming) {
         runstate_set(RUN_STATE_INMIGRATE);
+        if (incoming_postcopy) {
+            postcopy_incoming_prepare();
+        }
         int ret = qemu_start_incoming_migration(incoming);
         if (ret < 0) {
             fprintf(stderr, "Migration failed. Exit code %s(%d), exiting.\n",
@@ -3488,6 +3495,9 @@ int main(int argc, char **argv, char **envp)
     bdrv_close_all();
     pause_all_vcpus();
     net_cleanup();
+    if (incoming_postcopy) {
+        postcopy_incoming_qemu_cleanup();
+    }
     res_free();
 
     return 0;
--
1.7.1.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html