Here is what I have right now for the postcopy chardevice. The sample
userland will follow; I hope it will give you a more concrete idea and help
further discussion. This is just for discussion, so it's incomplete. I'm open
to other ideas and quite happy to throw away this patch and go for a better
approach.

thanks,
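Since the userland sample isn't attached yet, here is a rough sketch of how a
daemon on the destination side is expected to drive this interface, to give a
more concrete picture. It is illustrative only, not the real sample: error
handling is omitted, fetch_page_from_source() is a stand-in for the migration
protocol, the 1GB size is arbitrary, and the READY/WAIT_READY handshake
direction is only a guess. It assumes <linux/kvm.h> with this patch applied:

/* postcopy daemon sketch; needs the patched <linux/kvm.h> */
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

#define REQ_MAX 32

static long page_size;

/* stub: the real daemon would read the page from the migration source */
static void fetch_page_from_source(__u64 pgoff, void *buf)
{
        (void)pgoff;
        memset(buf, 0, page_size);
}

int main(void)
{
        page_size = sysconf(_SC_PAGESIZE);

        int kvm_fd = open("/dev/kvm", O_RDWR);
        int dev_fd = ioctl(kvm_fd, KVM_CREATE_VMEM_DEV);

        struct kvm_vmem_create create = {
                .size = 1ULL << 30,     /* guest RAM size; arbitrary here */
        };
        ioctl(dev_fd, KVM_CREATE_VMEM, &create);

        /* qemu mmap()s create.vmem_fd as guest RAM; this daemon stages page
           contents in the shmem file behind it */
        char *staging = mmap(NULL, create.size, PROT_READ | PROT_WRITE,
                             MAP_SHARED, create.shmem_fd, 0);

        /* guess at the handshake: the daemon says READY, the other side
           blocks in KVM_VMEM_WAIT_READY until then */
        ioctl(create.vmem_fd, KVM_VMEM_READY);

        __u64 pgoffs[REQ_MAX];
        struct pollfd pfd = { .fd = create.vmem_fd, .events = POLLIN };

        for (;;) {
                poll(&pfd, 1, -1);

                /* nr is the capacity of pgoffs[] on entry and the number of
                   faulting page offsets returned on exit */
                struct kvm_vmem_page_request req = {
                        .nr = REQ_MAX,
                        .pgoffs = pgoffs,
                };
                if (ioctl(create.vmem_fd, KVM_VMEM_GET_PAGE_REQUEST, &req) < 0)
                        break;

                /* fill the requested pages into the shmem staging area ... */
                for (__u32 i = 0; i < req.nr; i++)
                        fetch_page_from_source(pgoffs[i],
                                               staging + pgoffs[i] * page_size);

                /* ... then wake up the vcpu threads blocked on those pages */
                struct kvm_vmem_page_cached cached = {
                        .nr = req.nr,
                        .pgoffs = pgoffs,
                };
                ioctl(create.vmem_fd, KVM_VMEM_MARK_PAGE_CACHED, &cached);
        }
        return 0;
}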
>From e262979e95b3c5a095c8cb0bc178309baa861a3f Mon Sep 17 00:00:00 2001
Message-Id: <e262979e95b3c5a095c8cb0bc178309baa861a3f.1313146664.git.yamahata@xxxxxxxxxxxxx>
From: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
Date: Wed, 10 Aug 2011 18:28:05 +0900
Subject: [PATCH] kvm/postcopy: chardevice for postcopy

This is a character device to hook page access.
The page fault in the area is reported to another user process by
this chardriver. Then, the process fills the page contents and
resolves the page fault.

Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
---
 arch/x86/kvm/Kconfig     |    1 +
 arch/x86/kvm/Makefile    |    1 +
 include/linux/kvm.h      |   45 +++
 include/linux/kvm_host.h |    2 +
 mm/memcontrol.c          |    1 +
 mm/shmem.c               |    1 +
 virt/kvm/Kconfig         |    3 +
 virt/kvm/kvm_main.c      |    6 +
 virt/kvm/vmem.c          |  847 ++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/vmem.h          |   68 ++++
 10 files changed, 975 insertions(+), 0 deletions(-)
 create mode 100644 virt/kvm/vmem.c
 create mode 100644 virt/kvm/vmem.h

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 0a09b58..dcbd52e 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE
 	select KVM_ASYNC_PF
+	select KVM_VMEM
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
 	select TASKSTATS
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..6125f4c 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -10,6 +10,7 @@ kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				assigned-dev.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
+kvm-$(CONFIG_KVM_VMEM)	+= $(addprefix ../../../virt/kvm/, vmem.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o timer.o
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 55f5afb..623109e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA	65
 #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
+#define KVM_CAP_POST_COPY_MEMORY 67
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -760,6 +761,50 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA	_IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
 
+struct kvm_vmem_create {
+	__u64 size;	/* in bytes */
+	__s32 vmem_fd;
+	__s32 shmem_fd;
+};
+
+struct kvm_vmem_page_request {
+	__u32 nr;
+	__u64 __user *pgoffs;
+};
+
+struct kvm_vmem_page_cached {
+	__u32 nr;
+	__u64 __user *pgoffs;
+};
+
+struct kvm_vmem_page_range {
+	__u64 pgoff;
+	__u64 nr_pages;
+};
+
+struct kvm_vmem_make_pages_present {
+	__u32 nr;
+	struct kvm_vmem_page_range __user *ranges;
+};
+
+/* Available with KVM_CAP_POST_COPY_MEMORY */
+#define KVM_CREATE_VMEM_DEV	_IO(KVMIO, 0xb0)
+
+/* ioctl for vmem_dev fd */
+#define KVM_CREATE_VMEM		_IOR(KVMIO, 0xb1, __u32)
+
+/* ioctl for vmem fd */
+#define KVM_VMEM_WAIT_READY	_IO(KVMIO, 0xb2)
+#define KVM_VMEM_READY		_IO(KVMIO, 0xb3)
+#define KVM_VMEM_GET_PAGE_REQUEST \
+	_IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request)
+#define KVM_VMEM_MARK_PAGE_CACHED \
+	_IOW(KVMIO, 0xb5, struct kvm_vmem_page_cached)
+#define KVM_VMEM_MAKE_PAGES_PRESENT \
+	_IOW(KVMIO, 0xb6, struct kvm_vmem_make_pages_present)
+#define KVM_VMEM_MAKE_VMA_ANONYMOUS	_IO(KVMIO, 0xb7)
+
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
 struct kvm_assigned_pci_dev {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ff4d406..8b3dafa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -222,6 +222,8 @@ struct kvm_irq_routing_table {};
 
 #endif
 
+long kvm_dev_ioctl_create_vmem_dev(void);
+
 struct kvm_memslots {
 	int nmemslots;
 	u64 generation;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e..7f3fc4e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2838,6 +2838,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(mem_cgroup_cache_charge);
 
 /*
  * While swap-in, try_charge -> commit or cancel, the page is locked.
diff --git a/mm/shmem.c b/mm/shmem.c
index fcedf54..ae7d61f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3035,6 +3035,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(shmem_zero_setup);
 
 /**
  * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index f63ccb0..d3040ea 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -18,3 +18,6 @@ config KVM_MMIO
 
 config KVM_ASYNC_PF
        bool
+
+config KVM_VMEM
+       bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aefdda3..9e47e20 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2184,6 +2184,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
 	case KVM_CAP_SET_BOOT_CPU_ID:
 #endif
 	case KVM_CAP_INTERNAL_ERROR_DATA:
+	case KVM_CAP_POST_COPY_MEMORY:
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	case KVM_CAP_IRQ_ROUTING:
@@ -2233,6 +2234,11 @@ static long kvm_dev_ioctl(struct file *filp,
 	case KVM_TRACE_DISABLE:
 		r = -EOPNOTSUPP;
 		break;
+#ifdef CONFIG_KVM_VMEM
+	case KVM_CREATE_VMEM_DEV:
+		r = kvm_dev_ioctl_create_vmem_dev();
+		break;
+#endif
 	default:
 		return kvm_arch_dev_ioctl(filp, ioctl, arg);
 	}
diff --git a/virt/kvm/vmem.c b/virt/kvm/vmem.c
new file mode 100644
index 0000000..b413663
--- /dev/null
+++ b/virt/kvm/vmem.c
@@ -0,0 +1,847 @@
+/*
+ * KVM post copy vmem
+ *
+ * Copyright (c) 2011,
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include "vmem.h"
+
+static void kvm_vmem_release_fake_vmf(int ret, struct vm_fault *fake_vmf)
+{
+	if (ret & VM_FAULT_LOCKED) {
+		unlock_page(fake_vmf->page);
+	}
+	page_cache_release(fake_vmf->page);
+}
+
+static int kvm_vmem_minor_fault(struct kvm_vmem *vmem,
+				struct vm_area_struct *vma,
+				struct vm_fault *vmf)
+{
+	struct vm_fault fake_vmf;
+	int ret;
+	struct page *page;
+
+	BUG_ON(!test_bit(vmf->pgoff, vmem->cached));
+	fake_vmf = *vmf;
+	fake_vmf.page = NULL;
+	ret = vmem->vma->vm_ops->fault(vmem->vma, &fake_vmf);
+	if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
+		return ret;
+
+	/*
+	 * TODO: pull out fake_vmf->page from shmem file and donate it
+	 * to this vma resolving the page fault.
+	 * vmf->page = fake_vmf->page;
+	 */
+
+	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+	if (!page)
+		return VM_FAULT_OOM;
+	if (mem_cgroup_cache_charge(page, vma->vm_mm, GFP_KERNEL)) {
+		kvm_vmem_release_fake_vmf(ret, &fake_vmf);
+		page_cache_release(page);
+		return VM_FAULT_OOM;
+	}
+
+	copy_highpage(page, fake_vmf.page);
+	kvm_vmem_release_fake_vmf(ret, &fake_vmf);
+
+	ret |= VM_FAULT_LOCKED;
+	SetPageUptodate(page);
+	vmf->page = page;
+	set_bit(vmf->pgoff, vmem->faulted);
+
+	return ret;
+}
+
+static int kvm_vmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *filp = vma->vm_file;
+	struct kvm_vmem *vmem = filp->private_data;
+
+	if (vmf->pgoff >= vmem->pgoff_end) {
+		return VM_FAULT_SIGBUS;
+	}
+
+	BUG_ON(test_bit(vmf->pgoff, vmem->faulted));
+
+	if (!test_bit(vmf->pgoff, vmem->cached)) {
+		/* major fault */
+		unsigned long bit;
+		DEFINE_WAIT(wait);
+
+		if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
+			/* async page fault */
+			spin_lock(&vmem->lock);
+			if (vmem->async_req_nr < vmem->async_req_max) {
+				vmem->async_req[vmem->async_req_nr] =
+					vmf->pgoff;
+				vmem->async_req_nr++;
+			}
+			spin_unlock(&vmem->lock);
+			wake_up_poll(&vmem->req_wait, POLLIN);
+
+			if (test_bit(vmf->pgoff, vmem->cached))
+				return kvm_vmem_minor_fault(vmem, vma, vmf);
+			return VM_FAULT_MAJOR | VM_FAULT_RETRY;
+		}
+
+		spin_lock(&vmem->lock);
+		bit = find_first_zero_bit(vmem->sync_wait_bitmap,
+					  vmem->sync_req_max);
+		if (likely(bit < vmem->sync_req_max)) {
+			vmem->sync_req[bit] = vmf->pgoff;
+			prepare_to_wait(&vmem->page_wait[bit], &wait,
+					TASK_UNINTERRUPTIBLE);
+			set_bit(bit, vmem->sync_req_bitmap);
+			set_bit(bit, vmem->sync_wait_bitmap);
+			spin_unlock(&vmem->lock);
+			wake_up_poll(&vmem->req_wait, POLLIN);
+
+			if (!test_bit(vmf->pgoff, vmem->cached))
+				schedule();
+			finish_wait(&vmem->page_wait[bit], &wait);
+			clear_bit(bit, vmem->sync_wait_bitmap);
+		} else {
+			struct kvm_vmem_page_req_list page_req_list = {
+				.pgoff = vmf->pgoff,
+			};
+			vmem->req_list_nr++;
+			list_add_tail(&page_req_list.list, &vmem->req_list);
+			wake_up_poll(&vmem->req_wait, POLLIN);
+			for (;;) {
+				prepare_to_wait(&vmem->req_list_wait, &wait,
+						TASK_UNINTERRUPTIBLE);
+				if (test_bit(vmf->pgoff, vmem->cached)) {
+					vmem->req_list_nr--;
+					break;
+				}
+				spin_unlock(&vmem->lock);
+				schedule();
+				spin_lock(&vmem->lock);
+			}
+			spin_unlock(&vmem->lock);
+			finish_wait(&vmem->req_list_wait, &wait);
+		}
+
+		return kvm_vmem_minor_fault(vmem, vma, vmf) | VM_FAULT_MAJOR;
+	}
+
+	return kvm_vmem_minor_fault(vmem, vma, vmf);
+}
+
+/* for partial munmap */
+static void kvm_vmem_vma_open(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct kvm_vmem *vmem = filp->private_data;
+
+	spin_lock(&vmem->lock);
+	vmem->vma_nr++;
+	spin_unlock(&vmem->lock);
+}
+
+static void kvm_vmem_vma_close(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct kvm_vmem *vmem = filp->private_data;
+	struct task_struct *task = NULL;
+
+	spin_lock(&vmem->lock);
+	vmem->vma_nr--;
+	if (vmem->vma_nr == 0) {
+		task = vmem->task;
+		vmem->task = NULL;
+	}
+	spin_unlock(&vmem->lock);
+
+	if (task)
+		put_task_struct(task);
+}
+
+static const struct vm_operations_struct kvm_vmem_vm_ops = {
+	.open = kvm_vmem_vma_open,
+	.close = kvm_vmem_vma_close,
+	.fault = kvm_vmem_fault,
+};
+
+static int kvm_vmem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct kvm_vmem *vmem = filp->private_data;
+	int error;
+
+	/* allow mmap() only once */
+	spin_lock(&vmem->lock);
+	if (vmem->mmapped) {
+		error = -EBUSY;
+		goto out;
+	}
+	if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff >
+	    vmem->pgoff_end) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	vmem->mmapped = true;
+	vmem->vma_nr = 1;
+	vmem->vm_start = vma->vm_start;
+	get_task_struct(current);
+	vmem->task = current;
+	spin_unlock(&vmem->lock);
+
+	vma->vm_ops = &kvm_vmem_vm_ops;
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+	vma->vm_flags &= ~VM_SHARED;
+	return 0;
+
+out:
+	spin_unlock(&vmem->lock);
+	return error;
+}
+
+static bool kvm_vmem_req_pending(struct kvm_vmem* vmem)
+{
+	return !list_empty(&vmem->req_list) ||
+		!bitmap_empty(vmem->sync_req_bitmap, vmem->sync_req_max) ||
+		(vmem->async_req_nr > 0);
+}
+
+static unsigned int kvm_vmem_poll(struct file* filp, poll_table *wait)
+{
+	struct kvm_vmem *vmem = filp->private_data;
+	unsigned int events = 0;
+
+	poll_wait(filp, &vmem->req_wait, wait);
+
+	spin_lock(&vmem->lock);
+	if (kvm_vmem_req_pending(vmem))
+		events |= POLLIN;
+	spin_unlock(&vmem->lock);
+
+	return events;
+}
+
+/*
+ * return value
+ * true: finished
+ * false: more request
+ */
+static bool kvm_vmem_copy_page_request(struct kvm_vmem *vmem,
+				       pgoff_t *pgoffs, int req_max,
+				       int *req_nr)
+{
+	struct kvm_vmem_page_req_list *req_list;
+	struct kvm_vmem_page_req_list *tmp;
+
+	unsigned long bit;
+
+	*req_nr = 0;
+	list_for_each_entry_safe(req_list, tmp, &vmem->req_list, list) {
+		list_del(&req_list->list);
+		pgoffs[*req_nr] = req_list->pgoff;
+		(*req_nr)++;
+		if (*req_nr >= req_max)
+			return false;
+	}
+
+	bit = 0;
+	for (;;) {
+		bit = find_next_bit(vmem->sync_req_bitmap, vmem->sync_req_max,
+				    bit);
+		if (bit >= vmem->sync_req_max)
+			break;
+		pgoffs[*req_nr] = vmem->sync_req[bit];
+		(*req_nr)++;
+		clear_bit(bit, vmem->sync_req_bitmap);
+		if (*req_nr >= req_max)
+			return false;
+		bit++;
+	}
+
+	if (vmem->async_req_nr > 0) {
+		int nr = min(req_max - *req_nr, vmem->async_req_nr);
+		memcpy(pgoffs + *req_nr, vmem->async_req,
+		       sizeof(*vmem->async_req) * nr);
+		vmem->async_req_nr -= nr;
+		*req_nr += nr;
+		memmove(vmem->async_req, vmem->sync_req + nr,
+			vmem->async_req_nr * sizeof(*vmem->async_req));
+
+	}
+	return vmem->async_req_nr == 0;
+}
+
+static int kvm_vmem_get_page_request(struct kvm_vmem *vmem,
+				     struct kvm_vmem_page_request *page_req)
+{
+	DEFINE_WAIT(wait);
+#define REQ_MAX	((__u32)32)
+	pgoff_t pgoffs[REQ_MAX];
+	__u32 req_copied = 0;
+	int ret = 0;
+
+	spin_lock(&vmem->lock);
+	for (;;) {
+		prepare_to_wait(&vmem->req_wait, &wait, TASK_INTERRUPTIBLE);
+		if (kvm_vmem_req_pending(vmem)) {
+			break;
+		}
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		spin_unlock(&vmem->lock);
+		schedule();
+		spin_lock(&vmem->lock);
+	}
+	finish_wait(&vmem->req_wait, &wait);
+	if (ret)
+		goto out_unlock;
+
+	while (req_copied < page_req->nr) {
+		int req_max;
+		int req_nr;
+		bool finished;
+		req_max = min(page_req->nr - req_copied, REQ_MAX);
+		finished = kvm_vmem_copy_page_request(vmem, pgoffs, req_max,
+						      &req_nr);
+
+		spin_unlock(&vmem->lock);
+
+		if (req_nr > 0) {
+			ret = 0;
+			if (copy_to_user(page_req->pgoffs + req_copied, pgoffs,
+					 sizeof(*pgoffs) * req_nr)) {
+				ret = -EFAULT;
+				goto out;
+			}
+		}
+		req_copied += req_nr;
+		if (finished)
+			goto out;
+
+		spin_lock(&vmem->lock);
+	}
+
+out_unlock:
+	spin_unlock(&vmem->lock);
+out:
+	page_req->nr = req_copied;
+	return ret;
+}
+
+static int kvm_vmem_mark_page_cached(struct kvm_vmem *vmem,
+				     struct kvm_vmem_page_cached *page_cached)
+{
+	int ret = 0;
+#define PG_MAX	((__u32)32)
+	__u64 pgoffs[PG_MAX];
+	__u32 nr;
+	unsigned long bit;
+	bool wake_up_list = false;
+
+	nr = 0;
+	while (nr < page_cached->nr) {
+		__u32 todo = min(PG_MAX, (page_cached->nr - nr));
+		int i;
+
+		if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
+				   sizeof(*pgoffs) * todo)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		for (i = 0; i < todo; ++i) {
+			if (pgoffs[i] >= vmem->pgoff_end) {
+				ret = -EINVAL;
+				goto out;
+			}
+			set_bit(pgoffs[i], vmem->cached);
+		}
+		nr += todo;
+	}
+
+	spin_lock(&vmem->lock);
+	bit = 0;
+	for (;;) {
+		bit = find_next_bit(vmem->sync_wait_bitmap, vmem->sync_req_max,
+				    bit);
+		if (bit >= vmem->sync_req_max)
+			break;
+		if (test_bit(vmem->sync_req[bit], vmem->cached))
+			wake_up(&vmem->page_wait[bit]);
+		bit++;
+	}
+
+	if (vmem->req_list_nr > 0)
+		wake_up_list = true;
+	spin_unlock(&vmem->lock);
+
+	if (wake_up_list)
+		wake_up_all(&vmem->req_list_wait);
+
+out:
+	return ret;
+}
+
+static bool kvm_vmem_is_vmem_vma(const struct kvm_vmem *vmem,
+				 const struct vm_area_struct *vma)
+{
+	return vma->vm_file && vma->vm_file->private_data == vmem;
+}
+
+static void kvm_vmem_make_pages_present_entry(struct kvm_vmem *vmem,
+					      struct kvm_vmem_page_range *range,
+					      struct task_struct *task,
+					      struct mm_struct *mm,
+					      unsigned long vm_start)
+{
+	unsigned long pgoff = range->pgoff;
+	unsigned long range_end = range->pgoff + range->nr_pages;
+
+	down_read(&mm->mmap_sem);
+
+	while (pgoff < range->pgoff + range->nr_pages) {
+		unsigned long pgoff_end;
+		struct vm_area_struct *vma;
+		unsigned long saddr;
+		unsigned long eaddr;
+
+		/* search unfaulted range */
+		spin_lock(&vmem->lock);
+		pgoff = find_next_zero_bit(vmem->faulted, range_end, pgoff);
+		if (pgoff >= range_end) {
+			spin_unlock(&vmem->lock);
+			break;
+		}
+		pgoff_end = find_next_bit(vmem->faulted, range_end, pgoff);
+		spin_unlock(&vmem->lock);
+
+		saddr = vm_start + (pgoff << PAGE_SHIFT);
+		eaddr = vm_start + (pgoff_end << PAGE_SHIFT);
+		vma = find_vma(mm, saddr);
+		if (vma == NULL) {
+			break;
+		}
+		if (eaddr < vma->vm_start) {
+			pgoff = (vma->vm_start - vm_start) >> PAGE_SHIFT;
+			continue;
+		}
+
+		if (kvm_vmem_is_vmem_vma(vmem, vma)) {
+			unsigned long start = max(vma->vm_start, saddr);
+			unsigned long end = min(vma->vm_end, eaddr);
+			int nr_pages = (end - start) >> PAGE_SHIFT;
+			get_user_pages(task, mm, start, nr_pages,
+				       1, 1, NULL, NULL);
+			pgoff = (end - vm_start) >> PAGE_SHIFT;
+		} else {
+			pgoff = (vma->vm_end - vm_start) >> PAGE_SHIFT;
+		}
+	}
+
+	up_read(&mm->mmap_sem);
+}
+
+static int kvm_vmem_make_pages_present(
+	struct kvm_vmem *vmem,
+	struct kvm_vmem_make_pages_present *pages_present)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+	pgoff_t pgoff_end;
+	unsigned long vm_start;
+	unsigned long vm_eaddr;
+
+#define NUM_ENTRIES	((__u32)32)
+	struct kvm_vmem_page_range kranges[NUM_ENTRIES];
+	__u32 nr = 0;
+	int ret;
+
+	spin_lock(&vmem->lock);
+	task = vmem->task;
+	pgoff_end = vmem->pgoff_end;
+	vm_start = vmem->vm_start;
+	vm_eaddr = vm_start + vmem->size;
+	spin_unlock(&vmem->lock);
+	if (task == NULL)
+		return 0;
+	mm = get_task_mm(task);
+	if (mm == NULL)
+		return 0;
+
+	ret = 0;
+	while (nr < pages_present->nr) {
+		int nr_ranges = min(NUM_ENTRIES, pages_present->nr - nr);
+		int i;
+
+		if (copy_from_user(&kranges, pages_present->ranges + nr,
+				   sizeof(kranges[0]) * nr_ranges)) {
+			ret = -EFAULT;
+			break;
+		}
+		for (i = 0; i < nr_ranges; ++i) {
+			struct kvm_vmem_page_range *range = &kranges[i];
+			if (range->pgoff >= pgoff_end ||
+			    range->nr_pages >= pgoff_end ||
+			    range->pgoff + range->nr_pages >= pgoff_end) {
+				ret = -EINVAL;
+				break;
+			}
+			kvm_vmem_make_pages_present_entry(vmem, range,
+							  task, mm, vm_start);
+		}
+		nr += nr_ranges;
+	}
+
+	mmput(mm);
+	return ret;
+}
+
+static int kvm_vmem_make_vma_anonymous(struct kvm_vmem *vmem)
+{
+#if 1
+	return -ENOSYS;
+#else
+	unsigned long saddr;
+	unsigned long eaddr;
+	unsigned long addr;
+	unsigned long bit;
+	struct task_struct *task;
+	struct mm_struct *mm;
+
+	spin_lock(&vmem->lock);
+	task = vmem->task;
+	saddr = vmem->vm_start;
+	eaddr = saddr + vmem->size;
+	bit = find_first_zero_bit(vmem->faulted, vmem->pgoff_end);
+	if (bit < vmem->pgoff_end) {
+		spin_unlock(&vmem->lock);
+		return -EBUSY;
+	}
+	spin_unlock(&vmem->lock);
+	if (task == NULL)
+		return 0;
+	mm = get_task_mm(task);
+	if (mm == NULL)
+		return 0;
+
+	addr = saddr;
+	down_write(&mm->mmap_sem);
+	while (addr < eaddr) {
+		struct vm_area_struct *vma;
+		vma = find_vma(mm, addr);
+		if (kvm_vmem_is_vmem_vma(vmem, vma)) {
+			/* XXX incorrect. race/locking and more fix up */
+			struct file *filp = vma->vm_file;
+			vma->vm_ops->close(vma);
+			vma->vm_ops = NULL;
+			vma->vm_file = NULL;
+			/* vma->vm_flags */
+			fput(filp);
+		}
+		addr = vma->vm_end;
+	}
+	up_write(&mm->mmap_sem);
+
+	mmput(mm);
+	return 0;
+#endif
+}
+
+static void kvm_vmem_ready(struct kvm_vmem *vmem)
+{
+	spin_lock(&vmem->lock);
+	vmem->ready = true;
+	spin_unlock(&vmem->lock);
+	wake_up_interruptible(&vmem->ready_wait);
+}
+
+static int kvm_vmem_wait_ready(struct kvm_vmem *vmem)
+{
+	int ret = 0;
+	DEFINE_WAIT(wait);
+
+	spin_lock(&vmem->lock);
+	for (;;) {
+		prepare_to_wait(&vmem->ready_wait, &wait, TASK_INTERRUPTIBLE);
+		if (vmem->ready) {
+			break;
+		}
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		spin_unlock(&vmem->lock);
+		schedule();
+		spin_lock(&vmem->lock);
+	}
+	spin_unlock(&vmem->lock);
+	finish_wait(&vmem->ready_wait, &wait);
+	return ret;
+}
+
+static long kvm_vmem_ioctl(struct file *filp, unsigned int ioctl,
+			   unsigned long arg)
+{
+	struct kvm_vmem *vmem = filp->private_data;
+	void __user *argp = (void __user *) arg;
+	long ret = 0;
+
+	switch (ioctl) {
+	case KVM_VMEM_READY:
+		kvm_vmem_ready(vmem);
+		ret = 0;
+		break;
+	case KVM_VMEM_WAIT_READY:
+		ret = kvm_vmem_wait_ready(vmem);
+		break;
+	case KVM_VMEM_GET_PAGE_REQUEST: {
+		struct kvm_vmem_page_request page_request;
+		ret = -EFAULT;
+		if (copy_from_user(&page_request, argp, sizeof(page_request)))
+			break;
+		ret = kvm_vmem_get_page_request(vmem, &page_request);
+		if (ret == 0 &&
+		    copy_to_user(argp +
+				 offsetof(struct kvm_vmem_page_request, nr),
+				 &page_request.nr,
+				 sizeof(page_request.nr))) {
+			ret = -EFAULT;
+			break;
+		}
+		break;
+	}
+	case KVM_VMEM_MARK_PAGE_CACHED: {
+		struct kvm_vmem_page_cached page_cached;
+		ret = -EFAULT;
+		if (copy_from_user(&page_cached, argp, sizeof(page_cached)))
+			break;
+		ret = kvm_vmem_mark_page_cached(vmem, &page_cached);
+		break;
+	}
+	case KVM_VMEM_MAKE_PAGES_PRESENT: {
+		struct kvm_vmem_make_pages_present pages_present;
+		ret = -EFAULT;
+		if (copy_from_user(&pages_present, argp,
+				   sizeof(pages_present)))
+			break;
+		ret = kvm_vmem_make_pages_present(vmem, &pages_present);
+		break;
+	}
+	case KVM_VMEM_MAKE_VMA_ANONYMOUS:
+		ret = kvm_vmem_make_vma_anonymous(vmem);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static unsigned long kvm_vmem_bitmap_bytes(const struct kvm_vmem *vmem)
+{
+	return round_up(vmem->pgoff_end, BITS_PER_LONG) / 8;
+}
+
+static void kvm_vmem_free(struct kvm_vmem *vmem)
+{
+	if (vmem->task) {
+		put_task_struct(vmem->task);
+		vmem->task = NULL;
+	}
+
+	if (vmem->shmem_filp)
+		fput(vmem->shmem_filp);
+	if (kvm_vmem_bitmap_bytes(vmem) > PAGE_SIZE) {
+		vfree(vmem->cached);
+		vfree(vmem->faulted);
+	} else {
+		kfree(vmem->cached);
+		kfree(vmem->faulted);
+	}
+	kfree(vmem->vma);
+	kfree(vmem->async_req);
+	kfree(vmem->sync_req_bitmap);
+	kfree(vmem->sync_wait_bitmap);
+	kfree(vmem->page_wait);
+	kfree(vmem->sync_req);
+	kfree(vmem);
+}
+
+static int kvm_vmem_release(struct inode *inode, struct file *filp)
+{
+	struct kvm_vmem *vmem = filp->private_data;
+	kvm_vmem_free(vmem);
+	return 0;
+}
+
+static struct file_operations kvm_vmem_fops = {
+	.release = kvm_vmem_release,
+	.unlocked_ioctl = kvm_vmem_ioctl,
+	.mmap = kvm_vmem_mmap,
+	.poll = kvm_vmem_poll,
+	.llseek = noop_llseek,
+};
+
+static int kvm_create_vmem(struct kvm_vmem_create *create)
+{
+	int error = 0;
+	struct kvm_vmem *vmem = NULL;
+	struct vm_area_struct *vma = NULL;
+	int shmem_fd;
+	unsigned long bitmap_bytes;
+	unsigned long sync_bitmap_bytes;
+	int i;
+
+	vmem = kzalloc(sizeof(*vmem), GFP_KERNEL);
+	vmem->task = NULL;
+	vmem->mmapped = false;
+	spin_lock_init(&vmem->lock);
+	vmem->size = roundup(create->size, PAGE_SIZE);
+	vmem->pgoff_end = vmem->size >> PAGE_SHIFT;
+	init_waitqueue_head(&vmem->req_wait);
+
+	vma = kzalloc(sizeof(*vma), GFP_KERNEL);
+	vma->vm_start = 0;
+	vma->vm_end = vmem->size;
+	/* this shmem file is used for temporal buffer for pages
+	   so it's unlikely that so many pages exists in this shmem file */
+	vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
+		VM_DONTEXPAND;
+	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+	vma->vm_pgoff = 0;
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
+
+	vmem->vma = vma;
+
+	shmem_fd = get_unused_fd();
+	if (shmem_fd < 0) {
+		error = shmem_fd;
+		goto out;
+	}
+	error = shmem_zero_setup(vma);
+	if (error < 0) {
+		put_unused_fd(shmem_fd);
+		goto out;
+	}
+	vmem->shmem_filp = vma->vm_file;
+	get_file(vmem->shmem_filp);
+	fd_install(shmem_fd, vma->vm_file);
+	create->shmem_fd = shmem_fd;
+
+	create->vmem_fd = anon_inode_getfd("kvm-vmem",
+					   &kvm_vmem_fops, vmem, O_RDWR);
+	if (create->vmem_fd < 0) {
+		error = create->vmem_fd;
+		goto out;
+	}
+
+	bitmap_bytes = kvm_vmem_bitmap_bytes(vmem);
+	if (bitmap_bytes > PAGE_SIZE) {
+		vmem->cached = vzalloc(bitmap_bytes);
+		vmem->faulted = vzalloc(bitmap_bytes);
+	} else {
+		vmem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
+		vmem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
+	}
+
+#define ASYNC_REQ_MAX	(ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
+	vmem->async_req_max = ASYNC_REQ_MAX;
+	vmem->async_req_nr = 0;
+	vmem->async_req = kzalloc(sizeof(*vmem->async_req), GFP_KERNEL);
+
+#define SYNC_REQ_MAX	(KVM_MAX_VCPUS)
+	vmem->sync_req_max = round_up(SYNC_REQ_MAX, BITS_PER_LONG);
+	sync_bitmap_bytes = sizeof(unsigned long) *
+		(vmem->sync_req_max / BITS_PER_LONG);
+	vmem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
+	vmem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
+	vmem->page_wait = kzalloc(sizeof(*vmem->page_wait) *
+				  vmem->sync_req_max, GFP_KERNEL);
+	for (i = 0; i < vmem->sync_req_max; ++i)
+		init_waitqueue_head(&vmem->page_wait[i]);
+	vmem->sync_req = kzalloc(sizeof(*vmem->sync_req) *
+				 vmem->sync_req_max, GFP_KERNEL);
+
+	vmem->req_list_nr = 0;
+	INIT_LIST_HEAD(&vmem->req_list);
+	init_waitqueue_head(&vmem->req_list_wait);
+
+	init_waitqueue_head(&vmem->ready_wait);
+	vmem->ready = false;
+
+	return 0;
+
+ out:
+	kvm_vmem_free(vmem);
+	return error;
+}
+
+static long kvm_vmem_dev_ioctl(struct file *filp, unsigned int ioctl,
+			       unsigned long arg)
+{
+	void __user *argp = (void __user *) arg;
+	long ret;
+
+	switch (ioctl) {
+	case KVM_CREATE_VMEM: {
+		struct kvm_vmem_create create;
+		if (copy_from_user(&create, argp, sizeof(create))) {
+			ret = -EFAULT;
+			break;
+		}
+		ret = kvm_create_vmem(&create);
+		if (copy_to_user(argp, &create, sizeof(create))) {
+			ret = -EFAULT;
+			break;
+		}
+		break;
+	}
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static int kvm_vmem_dev_release(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+static struct file_operations kvm_vmem_dev_fops = {
+	.release = kvm_vmem_dev_release,
+	.unlocked_ioctl = kvm_vmem_dev_ioctl,
+};
+
+long kvm_dev_ioctl_create_vmem_dev(void)
+{
+	return anon_inode_getfd("kvm-vmem-dev", &kvm_vmem_dev_fops,
+				NULL, O_RDWR);
+}
diff --git a/virt/kvm/vmem.h b/virt/kvm/vmem.h
new file mode 100644
index 0000000..bc7e8cf
--- /dev/null
+++ b/virt/kvm/vmem.h
@@ -0,0 +1,68 @@
+/*
+ * KVM post copy vmem
+ *
+ * Copyright (c) 2011,
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#ifndef __KVM_VMEM_H__
+#define __KVM_VMEM_H__
+
+struct kvm_vmem_page_req_list {
+	struct list_head list;
+	pgoff_t pgoff;
+};
+
+struct kvm_vmem {
+	loff_t size;
+	pgoff_t pgoff_end;
+	spinlock_t lock;
+
+	wait_queue_head_t req_wait;
+
+	int async_req_max;
+	int async_req_nr;
+	pgoff_t *async_req;
+
+	int sync_req_max;
+	unsigned long *sync_req_bitmap;
+	unsigned long *sync_wait_bitmap;
+	pgoff_t *sync_req;
+	wait_queue_head_t *page_wait;
+
+	int req_list_nr;
+	struct list_head req_list;
+	wait_queue_head_t req_list_wait;
+
+	unsigned long *cached;
+	unsigned long *faulted;
+
+	bool mmapped;
+	unsigned long vm_start;
+	unsigned int vma_nr;
+	struct task_struct *task;
+
+	wait_queue_head_t ready_wait;
+	bool ready;
+
+	struct file *shmem_filp;
+	struct vm_area_struct *vma;
+};
+
+#endif /* __KVM_VMEM_H__ */
-- 
1.7.1.1

-- 
yamahata
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html