This patch introduces helper functions for postcopy to access the umem char device and to communicate between incoming-qemu and umemd. Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx> --- changes v1 -> v2: - code simplification - make fault trigger more robust - introduce struct umem_pages --- umem.c | 364 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ umem.h | 101 ++++++++++++++++++ 2 files changed, 465 insertions(+), 0 deletions(-) create mode 100644 umem.c create mode 100644 umem.h diff --git a/umem.c b/umem.c new file mode 100644 index 0000000..64eaab5 --- /dev/null +++ b/umem.c @@ -0,0 +1,364 @@ +/* + * umem.c: user process backed memory module for postcopy livemigration + * + * Copyright (c) 2011 + * National Institute of Advanced Industrial Science and Technology + * + * https://sites.google.com/site/grivonhome/quick-kvm-migration + * Author: Isaku Yamahata <yamahata at valinux co jp> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <linux/umem.h> + +#include "bitops.h" +#include "sysemu.h" +#include "hw/hw.h" +#include "umem.h" + +//#define DEBUG_UMEM +#ifdef DEBUG_UMEM +#include <sys/syscall.h> +#define DPRINTF(format, ...) \ + do { \ + printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid), \ + __func__, __LINE__, ## __VA_ARGS__); \ + } while (0) +#else +#define DPRINTF(format, ...) 
do { } while (0)
+#endif
+
+#define DEV_UMEM "/dev/umem"
+
+/*
+ * Create a UMem object backed by the umem character device.
+ *
+ * Opens DEV_UMEM, issues UMEM_INIT for the requested size, truncates
+ * uinit.shmem_fd (presumably filled in by UMEM_INIT -- confirm against the
+ * driver) to that size and maps the device at the fixed address hostp
+ * (MAP_PRIVATE | MAP_FIXED).
+ *
+ * size must be a multiple of the host page size (asserted).
+ * Every failure is fatal: the error is reported with perror() and the
+ * process aborts.  Returns the newly allocated UMem.
+ */
+UMem *umem_new(void *hostp, size_t size)
+{
+    struct umem_init uinit = {
+        .size = size,
+    };
+    UMem *umem;
+
+    assert((size % getpagesize()) == 0);
+    umem = g_new(UMem, 1);
+    umem->fd = open(DEV_UMEM, O_RDWR);
+    if (umem->fd < 0) {
+        perror("can't open "DEV_UMEM);
+        abort();
+    }
+
+    if (ioctl(umem->fd, UMEM_INIT, &uinit) < 0) {
+        perror("UMEM_INIT");
+        abort();
+    }
+    if (ftruncate(uinit.shmem_fd, uinit.size) < 0) {
+        perror("truncate(\"shmem_fd\")");
+        abort();
+    }
+
+    umem->nbits = 0;
+    umem->nsets = 0;
+    umem->faulted = NULL;
+    /* g_new() does not zero the object; shmem is mapped by umem_map_shmem() */
+    umem->shmem = NULL;
+    umem->page_shift = ffs(getpagesize()) - 1;
+    umem->shmem_fd = uinit.shmem_fd;
+    umem->size = uinit.size;
+    umem->umem = mmap(hostp, size, PROT_EXEC | PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_FIXED, umem->fd, 0);
+    if (umem->umem == MAP_FAILED) {
+        perror("mmap(UMem) failed");
+        abort();
+    }
+    return umem;
+}
+
+/*
+ * Close the file descriptors that are still open (value != -1) and free
+ * the faulted bitmap and the UMem itself.  No mapping is unmapped here;
+ * that is done by umem_unmap() and umem_unmap_shmem().
+ */
+void umem_destroy(UMem *umem)
+{
+    if (umem->fd != -1) {
+        close(umem->fd);
+    }
+    if (umem->shmem_fd != -1) {
+        close(umem->shmem_fd);
+    }
+    g_free(umem->faulted);
+    g_free(umem);
+}
+
+/*
+ * Read page requests from the umem device.
+ *
+ * On entry page_request->nr is the capacity of page_request->pgoffs[];
+ * on return it is the number of entries actually read.
+ * The read is retried when interrupted by a signal; any other error is
+ * fatal (perror() + abort()).
+ */
+void umem_get_page_request(UMem *umem, struct umem_pages *page_request)
+{
+    ssize_t ret;
+
+    do {
+        ret = read(umem->fd, page_request->pgoffs,
+                   page_request->nr * sizeof(page_request->pgoffs[0]));
+    } while (ret == -1 && errno == EINTR);
+
+    if (ret < 0) {
+        perror("daemon: umem read");
+        abort();
+    }
+    page_request->nr = ret / sizeof(page_request->pgoffs[0]);
+}
+
+/*
+ * Write the page offsets in page_cached to the umem device.
+ *
+ * Short writes are continued and EINTR is retried until all
+ * page_cached->nr entries have been written; any other error is fatal.
+ */
+void umem_mark_page_cached(UMem *umem, struct umem_pages *page_cached)
+{
+    /* byte pointer: arithmetic on void * is a GNU extension */
+    const uint8_t *buf = (const uint8_t *)page_cached->pgoffs;
+    ssize_t left = page_cached->nr * sizeof(page_cached->pgoffs[0]);
+
+    while (left > 0) {
+        ssize_t ret = write(umem->fd, buf, left);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            }
+
+            perror("daemon: umem write");
+            abort();
+        }
+
+        left -= ret;
+        buf += ret;
+    }
+}
+
+/* Unmap the device mapping created by umem_new(). */
+void umem_unmap(UMem *umem)
+{
+    munmap(umem->umem, umem->size);
+    umem->umem = NULL;
+}
+
+/* Close the umem device fd and mark it closed (-1) for umem_destroy(). */
+void umem_close(UMem *umem)
+{
+    close(umem->fd);
+    umem->fd = -1;
+}
+
+/*
+ * Map the shmem fd (MAP_SHARED, read/write) and allocate the zeroed
+ * "faulted" bitmap with one bit per host page of umem->size.
+ * mmap failure is fatal.  Returns the address of the mapping.
+ */
+void *umem_map_shmem(UMem *umem)
+{
+    umem->nbits = umem->size >> umem->page_shift;
+    umem->nsets = 0;
+    umem->faulted = g_new0(unsigned long, BITS_TO_LONGS(umem->nbits));
+
+    umem->shmem = mmap(NULL, umem->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                       umem->shmem_fd, 0);
+    if (umem->shmem == MAP_FAILED) {
+        perror("daemon: mmap(\"shmem\")");
+        abort();
+    }
+    return umem->shmem;
+}
+
+/* Unmap the mapping created by umem_map_shmem(). */
+void umem_unmap_shmem(UMem *umem)
+{
+    munmap(umem->shmem, umem->size);
+    umem->shmem = NULL;
+}
+
+/*
+ * Mark the host pages fully covered by [offset, offset + size) as faulted
+ * and, where MADV_REMOVE is available, drop that range from the shmem
+ * mapping.
+ *
+ * nsets is incremented once for every bit that was not already set.
+ * The range is only removed when at least one bit was newly set.
+ */
+void umem_remove_shmem(UMem *umem, size_t offset, size_t size)
+{
+    int s = offset >> umem->page_shift;
+    int e = (offset + size) >> umem->page_shift;
+    int i;
+    int newly_set = 0;
+
+    for (i = s; i < e; i++) {
+        if (!test_and_set_bit(i, umem->faulted)) {
+            umem->nsets++;
+            newly_set = 1;
+        }
+    }
+
+#if defined(CONFIG_MADVISE) && defined(MADV_REMOVE)
+    /*
+     * The arguments do not depend on the page index, so one call after the
+     * loop has the same effect as one call per newly set bit.
+     * Best effort: a failure only means the memory is not released early.
+     */
+    if (newly_set) {
+        madvise((uint8_t *)umem->shmem + offset, size, MADV_REMOVE);
+    }
+#else
+    (void)newly_set;
+#endif
+}
+
+/* Close the shmem fd and mark it closed (-1) for umem_destroy(). */
+void umem_close_shmem(UMem *umem)
+{
+    close(umem->shmem_fd);
+    umem->shmem_fd = -1;
+}
+
+/***************************************************************************/
+/* qemu <-> umem daemon communication */
+
+/*
+ * Size in bytes of a struct umem_pages carrying nr page offsets.
+ * The multiplication is not checked for overflow; callers must bound nr.
+ */
+size_t umem_pages_size(uint64_t nr)
+{
+    return sizeof(struct umem_pages) + nr * sizeof(uint64_t);
+}
+
+/*
+ * Write the one-byte command cmd to fd, retrying on EINTR.
+ * EPIPE is reported and then ignored (the peer has gone away);
+ * any other error is fatal.
+ */
+static void umem_write_cmd(int fd, uint8_t cmd)
+{
+    DPRINTF("write cmd %c\n", cmd);
+
+    for (;;) {
+        ssize_t ret = write(fd, &cmd, 1);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            } else if (errno == EPIPE) {
+                perror("pipe");
+                DPRINTF("write cmd %c %zd %d: pipe is closed\n",
+                        cmd, ret, errno);
+                break;
+            }
+
+            perror("pipe");
+            DPRINTF("write cmd %c %zd %d\n", cmd, ret, errno);
+            abort();
+        }
+
+        break;
+    }
+}
+
+/*
+ * Read a one-byte command from fd, retrying on EINTR, and abort unless it
+ * equals expect.  A read error or end-of-file is fatal as well.
+ */
+static void umem_read_cmd(int fd, uint8_t expect)
+{
+    uint8_t cmd = 0; /* defined value for the debug output on error paths */
+    for (;;) {
+        ssize_t ret = read(fd, &cmd, 1);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            }
+            perror("pipe");
+            DPRINTF("read error cmd %c %zd %d\n", cmd, ret, errno);
+            abort();
+        }
+
+        if (ret == 0) {
+            DPRINTF("read cmd %c %zd: pipe is closed\n", cmd, ret);
+            abort();
+        }
+
+        break;
+    }
+
+    DPRINTF("read cmd %c\n", cmd);
+    if (cmd != expect) {
+        DPRINTF("cmd %c expect %c\n", cmd, expect);
+        abort();
+    }
+}
+
+/*
+ * Peek a struct umem_pages (count followed by that many page offsets)
+ * from f, starting at *offset.
+ *
+ * *offset is advanced past each part that was asked for, whether or not
+ * the peek returned all of it.
+ * Returns a g_malloc()ed structure the caller must g_free(), or NULL when
+ * the data is not (yet) completely available, the count is zero, or the
+ * count is too large to be represented.
+ */
+struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset)
+{
+    /* largest value an int can hold; the payload size is added to *offset */
+    const uint64_t int_max = (unsigned int)-1 >> 1;
+    int ret;
+    uint64_t nr = 0;
+    size_t size;
+    struct umem_pages *pages;
+
+    ret = qemu_peek_buffer(f, (uint8_t*)&nr, sizeof(nr), *offset);
+    *offset += sizeof(nr);
+    DPRINTF("ret %d nr %llu\n", ret, (unsigned long long)nr);
+    if (ret != sizeof(nr) || nr == 0) {
+        return NULL;
+    }
+    if (nr > int_max / sizeof(pages->pgoffs[0])) {
+        /* would overflow the size computation and the int offset */
+        return NULL;
+    }
+
+    size = umem_pages_size(nr);
+    pages = g_malloc(size);
+    pages->nr = nr;
+    size -= sizeof(pages->nr);
+
+    ret = qemu_peek_buffer(f, (uint8_t*)pages->pgoffs, size, *offset);
+    *offset += size;
+    if ((size_t)ret != size) {
+        g_free(pages);
+        return NULL;
+    }
+    return pages;
+}
+
+/* Put the whole structure (count and page offsets) into f. */
+static void umem_send_pages(QEMUFile *f, const struct umem_pages *pages)
+{
+    size_t len = umem_pages_size(pages->nr);
+    qemu_put_buffer(f, (const uint8_t*)pages, len);
+}
+
+/* umem daemon -> qemu */
+
+/* Tell qemu that the daemon is ready (UMEM_DAEMON_READY). */
+void umem_daemon_ready(int to_qemu_fd)
+{
+    umem_write_cmd(to_qemu_fd, UMEM_DAEMON_READY);
+}
+
+/* Queue UMEM_DAEMON_QUIT for qemu. */
+void umem_daemon_quit(QEMUFile *to_qemu)
+{
+    qemu_put_byte(to_qemu, UMEM_DAEMON_QUIT);
+}
+
+/* Queue UMEM_DAEMON_TRIGGER_PAGE_FAULT followed by the page list. */
+void umem_daemon_send_pages_present(QEMUFile *to_qemu,
+                                    struct umem_pages *pages)
+{
+    qemu_put_byte(to_qemu, UMEM_DAEMON_TRIGGER_PAGE_FAULT);
+    umem_send_pages(to_qemu, pages);
+}
+
+/* Block until qemu sends UMEM_QEMU_READY; anything else aborts. */
+void umem_daemon_wait_for_qemu(int from_qemu_fd)
+{
+    umem_read_cmd(from_qemu_fd, UMEM_QEMU_READY);
+}
+
+/* qemu -> umem daemon */
+
+/* Block until the daemon sends UMEM_DAEMON_READY; anything else aborts. */
+void umem_qemu_wait_for_daemon(int from_umemd_fd)
+{
+    umem_read_cmd(from_umemd_fd, UMEM_DAEMON_READY);
+}
+
+/* Tell the daemon that qemu is ready (UMEM_QEMU_READY). */
+void umem_qemu_ready(int to_umemd_fd)
+{
+    umem_write_cmd(to_umemd_fd, UMEM_QEMU_READY);
+}
+
+/* Queue UMEM_QEMU_QUIT for the daemon. */
+void umem_qemu_quit(QEMUFile *to_umemd)
+{
+    qemu_put_byte(to_umemd, UMEM_QEMU_QUIT);
+}
+
+/* qemu side handler */
+
+/*
+ * Receive a page list from the daemon and touch the first byte of every
+ * listed page so that it becomes present.
+ *
+ * Returns the received list (to be g_free()d by the caller), or NULL when
+ * umem_recv_pages() could not deliver one.
+ */
+struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
+                                                int *offset)
+{
+    uint64_t i;
+    int page_shift = ffs(getpagesize()) - 1;
+    struct umem_pages *pages = umem_recv_pages(from_umemd, offset);
+    if (pages == NULL) {
+        return NULL;
+    }
+
+    for (i = 0; i < pages->nr; i++) {
+        ram_addr_t addr = pages->pgoffs[i] << page_shift;
+
+        /* make pages present by forcibly triggering page fault. */
+        volatile uint8_t *ram = qemu_get_ram_ptr(addr);
+        uint8_t dummy_read = ram[0];
+        (void)dummy_read;       /* suppress unused variable warning */
+    }
+
+    /*
+     * Very Linux implementation specific.
+     * Make sure that no other thread faults on the above virtual
+     * addresses. (More exactly, that no other thread calls the fault
+     * handler with these offsets.)
+     * The fault handler is called with mmap_sem read locked;
+     * madvise() does down/up_write(mmap_sem).
+     */
+    qemu_madvise(NULL, 0, MADV_NORMAL);
+
+    return pages;
+}
+
+/* Queue UMEM_QEMU_PAGE_FAULTED followed by the page list. */
+void umem_qemu_send_pages_present(QEMUFile *to_umemd,
+                                  const struct umem_pages *pages)
+{
+    qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_FAULTED);
+    umem_send_pages(to_umemd, pages);
+}
+
+/* Queue UMEM_QEMU_PAGE_UNMAPPED followed by the page list. */
+void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
+                                   const struct umem_pages *pages)
+{
+    qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_UNMAPPED);
+    umem_send_pages(to_umemd, pages);
+}
diff --git a/umem.h b/umem.h
new file mode 100644
index 0000000..058cac6
--- /dev/null
+++ b/umem.h
@@ -0,0 +1,101 @@
+/*
+ * umem.h: user process backed memory module for postcopy livemigration
+ *
+ * Copyright (c) 2011
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_UMEM_H
+#define QEMU_UMEM_H
+
+#include <linux/umem.h>
+
+#include "qemu-common.h"
+
+/*
+ * NOTE(review): the structure below is tagged UMem, not UMemDev, and the
+ * prototypes use a plain "UMem" type name.  Presumably the UMem typedef
+ * comes from qemu-common.h -- confirm, and check whether this typedef is
+ * still needed.
+ */
+typedef struct UMemDev UMemDev;
+
+/* State of one umem device instance and its shmem backing. */
+struct UMem {
+    void *umem;         /* mapping of the umem device fd */
+    int fd;             /* umem device fd, -1 once closed */
+    void *shmem;        /* mapping of shmem_fd */
+    int shmem_fd;       /* shmem fd, -1 once closed */
+    uint64_t size;      /* size in bytes of both mappings */
+
+    /* indexed by host page size */
+    int page_shift;     /* log2 of the host page size */
+    int nbits;          /* number of bits in the faulted bitmap */
+    int nsets;          /* number of bits set in the faulted bitmap */
+    unsigned long *faulted;     /* one bit per host page */
+};
+
+/* A counted list of page offsets; pgoffs[] holds nr entries. */
+struct umem_pages {
+    uint64_t nr;
+    uint64_t pgoffs[];  /* C99 flexible array member */
+};
+
+UMem *umem_new(void *hostp, size_t size);
+void umem_destroy(UMem *umem);
+
+/* umem device operations */
+void umem_get_page_request(UMem *umem, struct umem_pages *page_request);
+void umem_mark_page_cached(UMem *umem, struct umem_pages *page_cached);
+void umem_unmap(UMem *umem);
+void umem_close(UMem *umem);
+
+/* umem shmem operations */
+void *umem_map_shmem(UMem *umem);
+void umem_unmap_shmem(UMem *umem);
+void umem_remove_shmem(UMem *umem, size_t offset, size_t size);
+void umem_close_shmem(UMem *umem);
+
+/* incoming qemu <-> umem daemon communication */
+
+/* daemon -> qemu */
+#define UMEM_DAEMON_READY               'R'
+#define UMEM_DAEMON_QUIT                'Q'
+#define UMEM_DAEMON_TRIGGER_PAGE_FAULT  'T'
+#define UMEM_DAEMON_ERROR               'E'
+
+/* qemu -> daemon */
+#define UMEM_QEMU_READY                 'r'
+#define UMEM_QEMU_QUIT                  'q'
+#define UMEM_QEMU_PAGE_FAULTED          't'
+#define UMEM_QEMU_PAGE_UNMAPPED         'u'
+
+struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset);
+size_t umem_pages_size(uint64_t nr);
+
+/* for umem daemon */
+void umem_daemon_ready(int to_qemu_fd);
+void umem_daemon_wait_for_qemu(int from_qemu_fd);
+void umem_daemon_quit(QEMUFile *to_qemu);
+void umem_daemon_send_pages_present(QEMUFile *to_qemu,
+                                    struct umem_pages *pages);
+
+/* for qemu */
+void umem_qemu_wait_for_daemon(int from_umemd_fd);
+void umem_qemu_ready(int to_umemd_fd);
+void umem_qemu_quit(QEMUFile *to_umemd);
+struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
+                                                int *offset);
+void umem_qemu_send_pages_present(QEMUFile *to_umemd,
+                                  const struct umem_pages *pages);
+void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
+                                   const struct umem_pages *pages);
+
+#endif /* QEMU_UMEM_H */
-- 
1.7.1.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html