On Fri, Aug 12, 2011 at 11:07 AM, Isaku Yamahata <yamahata@xxxxxxxxxxxxx> wrote: > Here is the what I have right now for post copy chardevice. > The sample user land will follow. > It would give you more concrete idea and help further discussion, I hope. > This is just for discussion, so it's incomplete. > > I'm open to other ideas and quite happy to throw away this patch and > go for better way. > > thanks, > > From e262979e95b3c5a095c8cb0bc178309baa861a3f Mon Sep 17 00:00:00 2001 > Message-Id: <e262979e95b3c5a095c8cb0bc178309baa861a3f.1313146664.git.yamahata@xxxxxxxxxxxxx> > From: Isaku Yamahata <yamahata@xxxxxxxxxxxxx> > Date: Wed, 10 Aug 2011 18:28:05 +0900 > Subject: [PATCH] kvm/postcopy: chardevice for postcopy > > This is a character device to hook page access. > The page fault in the area is reported to another user process by > this chardriver. Then, the process fills the page contents and > resolves the page fault. > > Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx> > --- > arch/x86/kvm/Kconfig | 1 + > arch/x86/kvm/Makefile | 1 + > include/linux/kvm.h | 45 +++ > include/linux/kvm_host.h | 2 + > mm/memcontrol.c | 1 + > mm/shmem.c | 1 + > virt/kvm/Kconfig | 3 + > virt/kvm/kvm_main.c | 6 + > virt/kvm/vmem.c | 847 ++++++++++++++++++++++++++++++++++++++++++++++ > virt/kvm/vmem.h | 68 ++++ > 10 files changed, 975 insertions(+), 0 deletions(-) > create mode 100644 virt/kvm/vmem.c > create mode 100644 virt/kvm/vmem.h > > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig > index 0a09b58..dcbd52e 100644 > --- a/arch/x86/kvm/Kconfig > +++ b/arch/x86/kvm/Kconfig > @@ -29,6 +29,7 @@ config KVM > select HAVE_KVM_EVENTFD > select KVM_APIC_ARCHITECTURE > select KVM_ASYNC_PF > + select KVM_VMEM > select USER_RETURN_NOTIFIER > select KVM_MMIO > select TASKSTATS > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > index f15501f..6125f4c 100644 > --- a/arch/x86/kvm/Makefile > +++ b/arch/x86/kvm/Makefile > @@ -10,6 +10,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > assigned-dev.o) > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) > +kvm-$(CONFIG_KVM_VMEM) += $(addprefix ../../../virt/kvm/, vmem.o) > > kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ > i8254.o timer.o > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > index 55f5afb..623109e 100644 > --- a/include/linux/kvm.h > +++ b/include/linux/kvm.h > @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo { > #define KVM_CAP_PPC_SMT 64 > #define KVM_CAP_PPC_RMA 65 > #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ > +#define KVM_CAP_POST_COPY_MEMORY 67 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -760,6 +761,50 @@ struct kvm_clock_data { > /* Available with KVM_CAP_RMA */ > #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) > > +struct kvm_vmem_create { > + __u64 size; /* in bytes */ > + __s32 vmem_fd; > + __s32 shmem_fd; > +}; > + > +struct kvm_vmem_page_request { > + __u32 nr; Padding will be needed here on 64 bit hosts unless the order is switched. > + __u64 __user *pgoffs; > +}; > + > +struct kvm_vmem_page_cached { > + __u32 nr; Also here. > + __u64 __user *pgoffs; > +}; > + > +struct kvm_vmem_page_range { > + __u64 pgoff; > + __u64 nr_pages; > +}; > + > +struct kvm_vmem_make_pages_present { > + __u32 nr; And here. 
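For all three structs, something along these lines would make the layout explicit (untested sketch only; widening nr to __u64 or moving the pointer first would work just as well):

struct kvm_vmem_page_request {
	__u32 nr;
	__u32 padding;	/* make the compiler-inserted hole explicit */
	__u64 __user *pgoffs;
};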
> + struct kvm_vmem_page_range __user *ranges; > +}; > + > +/* Available with KVM_CAP_POST_COPY_MEMORY */ > +#define KVM_CREATE_VMEM_DEV _IO(KVMIO, 0xb0) > + > +/* ioctl for vmem_dev fd */ > +#define KVM_CREATE_VMEM _IOR(KVMIO, 0xb1, __u32) > + > +/* ioctl for vmem fd */ > +#define KVM_VMEM_WAIT_READY _IO(KVMIO, 0xb2) > +#define KVM_VMEM_READY _IO(KVMIO, 0xb3) > +#define KVM_VMEM_GET_PAGE_REQUEST \ > + _IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request) > +#define KVM_VMEM_MARK_PAGE_CACHED \ > + _IOW(KVMIO, 0xb5, struct kvm_vmem_page_cached) > +#define KVM_VMEM_MAKE_PAGES_PRESENT \ > + _IOW(KVMIO, 0xb6, struct kvm_vmem_make_pages_present) > +#define KVM_VMEM_MAKE_VMA_ANONYMOUS _IO(KVMIO, 0xb7) > + > + > #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) > > struct kvm_assigned_pci_dev { > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index ff4d406..8b3dafa 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -222,6 +222,8 @@ struct kvm_irq_routing_table {}; > > #endif > > +long kvm_dev_ioctl_create_vmem_dev(void); > + > struct kvm_memslots { > int nmemslots; > u64 generation; > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index e013b8e..7f3fc4e 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -2838,6 +2838,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, > > return ret; > } > +EXPORT_SYMBOL_GPL(mem_cgroup_cache_charge); > > /* > * While swap-in, try_charge -> commit or cancel, the page is locked. > diff --git a/mm/shmem.c b/mm/shmem.c > index fcedf54..ae7d61f 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -3035,6 +3035,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) > vma->vm_flags |= VM_CAN_NONLINEAR; > return 0; > } > +EXPORT_SYMBOL_GPL(shmem_zero_setup); > > /** > * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig > index f63ccb0..d3040ea 100644 > --- a/virt/kvm/Kconfig > +++ b/virt/kvm/Kconfig > @@ -18,3 +18,6 @@ config KVM_MMIO > > config KVM_ASYNC_PF > bool > + > +config KVM_VMEM > + bool > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index aefdda3..9e47e20 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -2184,6 +2184,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) > case KVM_CAP_SET_BOOT_CPU_ID: > #endif > case KVM_CAP_INTERNAL_ERROR_DATA: > + case KVM_CAP_POST_COPY_MEMORY: > return 1; > #ifdef CONFIG_HAVE_KVM_IRQCHIP > case KVM_CAP_IRQ_ROUTING: > @@ -2233,6 +2234,11 @@ static long kvm_dev_ioctl(struct file *filp, > case KVM_TRACE_DISABLE: > r = -EOPNOTSUPP; > break; > +#ifdef CONFIG_KVM_VMEM > + case KVM_CREATE_VMEM_DEV: > + r = kvm_dev_ioctl_create_vmem_dev(); > + break; > +#endif > default: > return kvm_arch_dev_ioctl(filp, ioctl, arg); > } > diff --git a/virt/kvm/vmem.c b/virt/kvm/vmem.c > new file mode 100644 > index 0000000..b413663 > --- /dev/null > +++ b/virt/kvm/vmem.c > @@ -0,0 +1,847 @@ > +/* > + * KVM post copy vmem > + * > + * Copyright (c) 2011, > + * National Institute of Advanced Industrial Science and Technology > + * > + * https://sites.google.com/site/grivonhome/quick-kvm-migration > + * Author: Isaku Yamahata <yamahata at valinux co jp> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. 
> + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along with > + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple > + * Place - Suite 330, Boston, MA 02111-1307 USA. The current address is: 51 Franklin Street, Fifth Floor Boston, MA 02110-1301 USA Then there is the version used in QEMU: if not, see <http://www.gnu.org/licenses/>. I don't know which one is preferred with kernel. > + */ > + > +#include <linux/kvm_host.h> > +#include <linux/kvm.h> > +#include <linux/pagemap.h> > +#include <linux/mm.h> > +#include <linux/memcontrol.h> > +#include <linux/poll.h> > +#include <linux/file.h> > +#include <linux/anon_inodes.h> > +#include "vmem.h" > + > +static void kvm_vmem_release_fake_vmf(int ret, struct vm_fault *fake_vmf) > +{ > + if (ret & VM_FAULT_LOCKED) { > + unlock_page(fake_vmf->page); > + } > + page_cache_release(fake_vmf->page); > +} > + > +static int kvm_vmem_minor_fault(struct kvm_vmem *vmem, > + struct vm_area_struct *vma, > + struct vm_fault *vmf) > +{ > + struct vm_fault fake_vmf; > + int ret; > + struct page *page; > + > + BUG_ON(!test_bit(vmf->pgoff, vmem->cached)); > + fake_vmf = *vmf; > + fake_vmf.page = NULL; > + ret = vmem->vma->vm_ops->fault(vmem->vma, &fake_vmf); > + if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)) > + return ret; > + > + /* > + * TODO: pull out fake_vmf->page from shmem file and donate it > + * to this vma resolving the page fault. > + * vmf->page = fake_vmf->page; > + */ > + > + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); > + if (!page) > + return VM_FAULT_OOM; > + if (mem_cgroup_cache_charge(page, vma->vm_mm, GFP_KERNEL)) { > + kvm_vmem_release_fake_vmf(ret, &fake_vmf); > + page_cache_release(page); > + return VM_FAULT_OOM; > + } > + > + copy_highpage(page, fake_vmf.page); > + kvm_vmem_release_fake_vmf(ret, &fake_vmf); > + > + ret |= VM_FAULT_LOCKED; > + SetPageUptodate(page); > + vmf->page = page; > + set_bit(vmf->pgoff, vmem->faulted); > + > + return ret; > +} > + > +static int kvm_vmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > +{ > + struct file *filp = vma->vm_file; > + struct kvm_vmem *vmem = filp->private_data; > + > + if (vmf->pgoff >= vmem->pgoff_end) { > + return VM_FAULT_SIGBUS; > + } > + > + BUG_ON(test_bit(vmf->pgoff, vmem->faulted)); > + > + if (!test_bit(vmf->pgoff, vmem->cached)) { > + /* major fault */ > + unsigned long bit; > + DEFINE_WAIT(wait); > + > + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) { > + /* async page fault */ > + spin_lock(&vmem->lock); > + if (vmem->async_req_nr < vmem->async_req_max) { > + vmem->async_req[vmem->async_req_nr] = > + vmf->pgoff; > + vmem->async_req_nr++; > + } > + spin_unlock(&vmem->lock); > + wake_up_poll(&vmem->req_wait, POLLIN); > + > + if (test_bit(vmf->pgoff, vmem->cached)) > + return kvm_vmem_minor_fault(vmem, vma, vmf); > + return VM_FAULT_MAJOR | VM_FAULT_RETRY; > + } > + > + spin_lock(&vmem->lock); > + bit = find_first_zero_bit(vmem->sync_wait_bitmap, > + vmem->sync_req_max); > + if (likely(bit < vmem->sync_req_max)) { > + vmem->sync_req[bit] = vmf->pgoff; > + prepare_to_wait(&vmem->page_wait[bit], &wait, > + TASK_UNINTERRUPTIBLE); > + set_bit(bit, vmem->sync_req_bitmap); > + set_bit(bit, vmem->sync_wait_bitmap); > + 
spin_unlock(&vmem->lock); > + wake_up_poll(&vmem->req_wait, POLLIN); > + > + if (!test_bit(vmf->pgoff, vmem->cached)) > + schedule(); > + finish_wait(&vmem->page_wait[bit], &wait); > + clear_bit(bit, vmem->sync_wait_bitmap); > + } else { > + struct kvm_vmem_page_req_list page_req_list = { > + .pgoff = vmf->pgoff, > + }; > + vmem->req_list_nr++; > + list_add_tail(&page_req_list.list, &vmem->req_list); > + wake_up_poll(&vmem->req_wait, POLLIN); > + for (;;) { > + prepare_to_wait(&vmem->req_list_wait, &wait, > + TASK_UNINTERRUPTIBLE); > + if (test_bit(vmf->pgoff, vmem->cached)) { > + vmem->req_list_nr--; > + break; > + } > + spin_unlock(&vmem->lock); > + schedule(); > + spin_lock(&vmem->lock); > + } > + spin_unlock(&vmem->lock); > + finish_wait(&vmem->req_list_wait, &wait); > + } > + > + return kvm_vmem_minor_fault(vmem, vma, vmf) | VM_FAULT_MAJOR; > + } > + > + return kvm_vmem_minor_fault(vmem, vma, vmf); > +} > + > +/* for partial munmap */ > +static void kvm_vmem_vma_open(struct vm_area_struct *vma) > +{ > + struct file *filp = vma->vm_file; > + struct kvm_vmem *vmem = filp->private_data; > + > + spin_lock(&vmem->lock); > + vmem->vma_nr++; > + spin_unlock(&vmem->lock); > +} > + > +static void kvm_vmem_vma_close(struct vm_area_struct *vma) > +{ > + struct file *filp = vma->vm_file; > + struct kvm_vmem *vmem = filp->private_data; > + struct task_struct *task = NULL; > + > + spin_lock(&vmem->lock); > + vmem->vma_nr--; > + if (vmem->vma_nr == 0) { > + task = vmem->task; > + vmem->task = NULL; > + } > + spin_unlock(&vmem->lock); > + > + if (task) > + put_task_struct(task); > +} > + > +static const struct vm_operations_struct kvm_vmem_vm_ops = { > + .open = kvm_vmem_vma_open, > + .close = kvm_vmem_vma_close, > + .fault = kvm_vmem_fault, > +}; > + > +static int kvm_vmem_mmap(struct file *filp, struct vm_area_struct *vma) > +{ > + struct kvm_vmem *vmem = filp->private_data; > + int error; > + > + /* allow mmap() only once */ > + spin_lock(&vmem->lock); > + if (vmem->mmapped) { > + error = -EBUSY; > + goto out; > + } > + if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff > > + vmem->pgoff_end) { > + error = -EINVAL; > + goto out; > + } > + > + vmem->mmapped = true; > + vmem->vma_nr = 1; > + vmem->vm_start = vma->vm_start; > + get_task_struct(current); > + vmem->task = current; > + spin_unlock(&vmem->lock); > + > + vma->vm_ops = &kvm_vmem_vm_ops; > + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; > + vma->vm_flags &= ~VM_SHARED; > + return 0; > + > +out: > + spin_unlock(&vmem->lock); > + return error; > +} > + > +static bool kvm_vmem_req_pending(struct kvm_vmem* vmem) > +{ > + return !list_empty(&vmem->req_list) || > + !bitmap_empty(vmem->sync_req_bitmap, vmem->sync_req_max) || > + (vmem->async_req_nr > 0); > +} > + > +static unsigned int kvm_vmem_poll(struct file* filp, poll_table *wait) > +{ > + struct kvm_vmem *vmem = filp->private_data; > + unsigned int events = 0; > + > + poll_wait(filp, &vmem->req_wait, wait); > + > + spin_lock(&vmem->lock); > + if (kvm_vmem_req_pending(vmem)) > + events |= POLLIN; > + spin_unlock(&vmem->lock); > + > + return events; > +} > + > +/* > + * return value > + * true: finished > + * false: more request > + */ > +static bool kvm_vmem_copy_page_request(struct kvm_vmem *vmem, > + pgoff_t *pgoffs, int req_max, > + int *req_nr) > +{ > + struct kvm_vmem_page_req_list *req_list; > + struct kvm_vmem_page_req_list *tmp; > + > + unsigned long bit; > + > + *req_nr = 0; > + list_for_each_entry_safe(req_list, tmp, &vmem->req_list, list) { > + 
list_del(&req_list->list); > + pgoffs[*req_nr] = req_list->pgoff; > + (*req_nr)++; > + if (*req_nr >= req_max) > + return false; > + } > + > + bit = 0; > + for (;;) { > + bit = find_next_bit(vmem->sync_req_bitmap, vmem->sync_req_max, > + bit); > + if (bit >= vmem->sync_req_max) > + break; > + pgoffs[*req_nr] = vmem->sync_req[bit]; > + (*req_nr)++; > + clear_bit(bit, vmem->sync_req_bitmap); > + if (*req_nr >= req_max) > + return false; > + bit++; > + } > + > + if (vmem->async_req_nr > 0) { > + int nr = min(req_max - *req_nr, vmem->async_req_nr); > + memcpy(pgoffs + *req_nr, vmem->async_req, > + sizeof(*vmem->async_req) * nr); > + vmem->async_req_nr -= nr; > + *req_nr += nr; > + memmove(vmem->async_req, vmem->sync_req + nr, > + vmem->async_req_nr * sizeof(*vmem->async_req)); > + > + } > + return vmem->async_req_nr == 0; > +} > + > +static int kvm_vmem_get_page_request(struct kvm_vmem *vmem, > + struct kvm_vmem_page_request *page_req) > +{ > + DEFINE_WAIT(wait); > +#define REQ_MAX ((__u32)32) > + pgoff_t pgoffs[REQ_MAX]; > + __u32 req_copied = 0; > + int ret = 0; > + > + spin_lock(&vmem->lock); > + for (;;) { > + prepare_to_wait(&vmem->req_wait, &wait, TASK_INTERRUPTIBLE); > + if (kvm_vmem_req_pending(vmem)) { > + break; > + } > + if (signal_pending(current)) { > + ret = -ERESTARTSYS; > + break; > + } > + spin_unlock(&vmem->lock); > + schedule(); > + spin_lock(&vmem->lock); > + } > + finish_wait(&vmem->req_wait, &wait); > + if (ret) > + goto out_unlock; > + > + while (req_copied < page_req->nr) { > + int req_max; > + int req_nr; > + bool finished; > + req_max = min(page_req->nr - req_copied, REQ_MAX); > + finished = kvm_vmem_copy_page_request(vmem, pgoffs, req_max, > + &req_nr); > + > + spin_unlock(&vmem->lock); > + > + if (req_nr > 0) { > + ret = 0; > + if (copy_to_user(page_req->pgoffs + req_copied, pgoffs, > + sizeof(*pgoffs) * req_nr)) { > + ret = -EFAULT; > + goto out; > + } > + } > + req_copied += req_nr; > + if (finished) > + goto out; > + > + spin_lock(&vmem->lock); > + } > + > +out_unlock: > + spin_unlock(&vmem->lock); > +out: > + page_req->nr = req_copied; > + return ret; > +} > + > +static int kvm_vmem_mark_page_cached(struct kvm_vmem *vmem, > + struct kvm_vmem_page_cached *page_cached) > +{ > + int ret = 0; > +#define PG_MAX ((__u32)32) > + __u64 pgoffs[PG_MAX]; > + __u32 nr; > + unsigned long bit; > + bool wake_up_list = false; > + > + nr = 0; > + while (nr < page_cached->nr) { > + __u32 todo = min(PG_MAX, (page_cached->nr - nr)); > + int i; > + > + if (copy_from_user(pgoffs, page_cached->pgoffs + nr, > + sizeof(*pgoffs) * todo)) { > + ret = -EFAULT; > + goto out; > + } > + for (i = 0; i < todo; ++i) { > + if (pgoffs[i] >= vmem->pgoff_end) { > + ret = -EINVAL; > + goto out; > + } > + set_bit(pgoffs[i], vmem->cached); > + } > + nr += todo; > + } > + > + spin_lock(&vmem->lock); > + bit = 0; > + for (;;) { > + bit = find_next_bit(vmem->sync_wait_bitmap, vmem->sync_req_max, > + bit); > + if (bit >= vmem->sync_req_max) > + break; > + if (test_bit(vmem->sync_req[bit], vmem->cached)) > + wake_up(&vmem->page_wait[bit]); > + bit++; > + } > + > + if (vmem->req_list_nr > 0) > + wake_up_list = true; > + spin_unlock(&vmem->lock); > + > + if (wake_up_list) > + wake_up_all(&vmem->req_list_wait); > + > +out: > + return ret; > +} > + > +static bool kvm_vmem_is_vmem_vma(const struct kvm_vmem *vmem, > + const struct vm_area_struct *vma) > +{ > + return vma->vm_file && vma->vm_file->private_data == vmem; > +} > + > +static void kvm_vmem_make_pages_present_entry(struct kvm_vmem *vmem, > + struct 
kvm_vmem_page_range *range, > + struct task_struct *task, > + struct mm_struct *mm, > + unsigned long vm_start) > +{ > + unsigned long pgoff = range->pgoff; > + unsigned long range_end = range->pgoff + range->nr_pages; > + > + down_read(&mm->mmap_sem); > + > + while (pgoff < range->pgoff + range->nr_pages) { > + unsigned long pgoff_end; > + struct vm_area_struct *vma; > + unsigned long saddr; > + unsigned long eaddr; > + > + /* search unfaulted range */ > + spin_lock(&vmem->lock); > + pgoff = find_next_zero_bit(vmem->faulted, range_end, pgoff); > + if (pgoff >= range_end) { > + spin_unlock(&vmem->lock); > + break; > + } > + pgoff_end = find_next_bit(vmem->faulted, range_end, pgoff); > + spin_unlock(&vmem->lock); > + > + saddr = vm_start + (pgoff << PAGE_SHIFT); > + eaddr = vm_start + (pgoff_end << PAGE_SHIFT); > + vma = find_vma(mm, saddr); > + if (vma == NULL) { > + break; > + } > + if (eaddr < vma->vm_start) { > + pgoff = (vma->vm_start - vm_start) >> PAGE_SHIFT; > + continue; > + } > + > + if (kvm_vmem_is_vmem_vma(vmem, vma)) { > + unsigned long start = max(vma->vm_start, saddr); > + unsigned long end = min(vma->vm_end, eaddr); > + int nr_pages = (end - start) >> PAGE_SHIFT; > + get_user_pages(task, mm, start, nr_pages, > + 1, 1, NULL, NULL); > + pgoff = (end - vm_start) >> PAGE_SHIFT; > + } else { > + pgoff = (vma->vm_end - vm_start) >> PAGE_SHIFT; > + } > + } > + > + up_read(&mm->mmap_sem); > +} > + > +static int kvm_vmem_make_pages_present( > + struct kvm_vmem *vmem, > + struct kvm_vmem_make_pages_present *pages_present) > +{ > + struct task_struct *task; > + struct mm_struct *mm; > + pgoff_t pgoff_end; > + unsigned long vm_start; > + unsigned long vm_eaddr; > + > +#define NUM_ENTRIES ((__u32)32) > + struct kvm_vmem_page_range kranges[NUM_ENTRIES]; > + __u32 nr = 0; > + int ret; > + > + spin_lock(&vmem->lock); > + task = vmem->task; > + pgoff_end = vmem->pgoff_end; > + vm_start = vmem->vm_start; > + vm_eaddr = vm_start + vmem->size; > + spin_unlock(&vmem->lock); > + if (task == NULL) > + return 0; > + mm = get_task_mm(task); > + if (mm == NULL) > + return 0; > + > + ret = 0; > + while (nr < pages_present->nr) { > + int nr_ranges = min(NUM_ENTRIES, pages_present->nr - nr); > + int i; > + > + if (copy_from_user(&kranges, pages_present->ranges + nr, > + sizeof(kranges[0]) * nr_ranges)) { > + ret = -EFAULT; > + break; > + } > + for (i = 0; i < nr_ranges; ++i) { > + struct kvm_vmem_page_range *range = &kranges[i]; > + if (range->pgoff >= pgoff_end || > + range->nr_pages >= pgoff_end || > + range->pgoff + range->nr_pages >= pgoff_end) { > + ret = -EINVAL; > + break; > + } > + kvm_vmem_make_pages_present_entry(vmem, range, > + task, mm, vm_start); > + } > + nr += nr_ranges; > + } > + > + mmput(mm); > + return ret; > +} > + > +static int kvm_vmem_make_vma_anonymous(struct kvm_vmem *vmem) > +{ > +#if 1 > + return -ENOSYS; > +#else > + unsigned long saddr; > + unsigned long eaddr; > + unsigned long addr; > + unsigned long bit; > + struct task_struct *task; > + struct mm_struct *mm; > + > + spin_lock(&vmem->lock); > + task = vmem->task; > + saddr = vmem->vm_start; > + eaddr = saddr + vmem->size; > + bit = find_first_zero_bit(vmem->faulted, vmem->pgoff_end); > + if (bit < vmem->pgoff_end) { > + spin_unlock(&vmem->lock); > + return -EBUSY; > + } > + spin_unlock(&vmem->lock); > + if (task == NULL) > + return 0; > + mm = get_task_mm(task); > + if (mm == NULL) > + return 0; > + > + addr = saddr; > + down_write(&mm->mmap_sem); > + while (addr < eaddr) { > + struct vm_area_struct *vma; > + vma = 
find_vma(mm, addr); > + if (kvm_vmem_is_vmem_vma(vmem, vma)) { > + /* XXX incorrect. race/locking and more fix up */ > + struct file *filp = vma->vm_file; > + vma->vm_ops->close(vma); > + vma->vm_ops = NULL; > + vma->vm_file = NULL; > + /* vma->vm_flags */ > + fput(filp); > + } > + addr = vma->vm_end; > + } > + up_write(&mm->mmap_sem); > + > + mmput(mm); > + return 0; > +#endif > +} > + > +static void kvm_vmem_ready(struct kvm_vmem *vmem) > +{ > + spin_lock(&vmem->lock); > + vmem->ready = true; > + spin_unlock(&vmem->lock); > + wake_up_interruptible(&vmem->ready_wait); > +} > + > +static int kvm_vmem_wait_ready(struct kvm_vmem *vmem) > +{ > + int ret = 0; > + DEFINE_WAIT(wait); > + > + spin_lock(&vmem->lock); > + for (;;) { > + prepare_to_wait(&vmem->ready_wait, &wait, TASK_INTERRUPTIBLE); > + if (vmem->ready) { > + break; > + } > + if (signal_pending(current)) { > + ret = -ERESTARTSYS; > + break; > + } > + spin_unlock(&vmem->lock); > + schedule(); > + spin_lock(&vmem->lock); > + } > + spin_unlock(&vmem->lock); > + finish_wait(&vmem->ready_wait, &wait); > + return ret; > +} > + > +static long kvm_vmem_ioctl(struct file *filp, unsigned int ioctl, > + unsigned long arg) > +{ > + struct kvm_vmem *vmem = filp->private_data; > + void __user *argp = (void __user *) arg; > + long ret = 0; > + > + switch (ioctl) { > + case KVM_VMEM_READY: > + kvm_vmem_ready(vmem); > + ret = 0; > + break; > + case KVM_VMEM_WAIT_READY: > + ret = kvm_vmem_wait_ready(vmem); > + break; > + case KVM_VMEM_GET_PAGE_REQUEST: { > + struct kvm_vmem_page_request page_request; > + ret = -EFAULT; > + if (copy_from_user(&page_request, argp, sizeof(page_request))) > + break; > + ret = kvm_vmem_get_page_request(vmem, &page_request); > + if (ret == 0 && > + copy_to_user(argp + > + offsetof(struct kvm_vmem_page_request, nr), > + &page_request.nr, > + sizeof(page_request.nr))) { > + ret = -EFAULT; > + break; > + } > + break; > + } > + case KVM_VMEM_MARK_PAGE_CACHED: { > + struct kvm_vmem_page_cached page_cached; > + ret = -EFAULT; > + if (copy_from_user(&page_cached, argp, sizeof(page_cached))) > + break; > + ret = kvm_vmem_mark_page_cached(vmem, &page_cached); > + break; > + } > + case KVM_VMEM_MAKE_PAGES_PRESENT: { > + struct kvm_vmem_make_pages_present pages_present; > + ret = -EFAULT; > + if (copy_from_user(&pages_present, argp, > + sizeof(pages_present))) > + break; > + ret = kvm_vmem_make_pages_present(vmem, &pages_present); > + break; > + } > + case KVM_VMEM_MAKE_VMA_ANONYMOUS: > + ret = kvm_vmem_make_vma_anonymous(vmem); > + break; > + default: > + ret = -EINVAL; > + break; > + } > + return ret; > +} > + > +static unsigned long kvm_vmem_bitmap_bytes(const struct kvm_vmem *vmem) > +{ > + return round_up(vmem->pgoff_end, BITS_PER_LONG) / 8; > +} > + > +static void kvm_vmem_free(struct kvm_vmem *vmem) > +{ > + if (vmem->task) { > + put_task_struct(vmem->task); > + vmem->task = NULL; > + } > + > + if (vmem->shmem_filp) > + fput(vmem->shmem_filp); > + if (kvm_vmem_bitmap_bytes(vmem) > PAGE_SIZE) { > + vfree(vmem->cached); > + vfree(vmem->faulted); > + } else { > + kfree(vmem->cached); > + kfree(vmem->faulted); > + } > + kfree(vmem->vma); > + kfree(vmem->async_req); > + kfree(vmem->sync_req_bitmap); > + kfree(vmem->sync_wait_bitmap); > + kfree(vmem->page_wait); > + kfree(vmem->sync_req); > + kfree(vmem); > +} > + > +static int kvm_vmem_release(struct inode *inode, struct file *filp) > +{ > + struct kvm_vmem *vmem = filp->private_data; > + kvm_vmem_free(vmem); > + return 0; > +} > + > +static struct file_operations kvm_vmem_fops = { 
> + .release = kvm_vmem_release, > + .unlocked_ioctl = kvm_vmem_ioctl, > + .mmap = kvm_vmem_mmap, > + .poll = kvm_vmem_poll, > + .llseek = noop_llseek, > +}; > + > +static int kvm_create_vmem(struct kvm_vmem_create *create) > +{ > + int error = 0; > + struct kvm_vmem *vmem = NULL; > + struct vm_area_struct *vma = NULL; > + int shmem_fd; > + unsigned long bitmap_bytes; > + unsigned long sync_bitmap_bytes; > + int i; > + > + vmem = kzalloc(sizeof(*vmem), GFP_KERNEL); > + vmem->task = NULL; Is this needed, doesn't kzalloc() return zeroed memory? > + vmem->mmapped = false; > + spin_lock_init(&vmem->lock); > + vmem->size = roundup(create->size, PAGE_SIZE); > + vmem->pgoff_end = vmem->size >> PAGE_SHIFT; > + init_waitqueue_head(&vmem->req_wait); > + > + vma = kzalloc(sizeof(*vma), GFP_KERNEL); > + vma->vm_start = 0; Also here. > + vma->vm_end = vmem->size; > + /* this shmem file is used for temporal buffer for pages > + so it's unlikely that so many pages exists in this shmem file */ > + vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY | > + VM_DONTEXPAND; > + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); > + vma->vm_pgoff = 0; > + INIT_LIST_HEAD(&vma->anon_vma_chain); > + > + vmem->vma = vma; > + > + shmem_fd = get_unused_fd(); > + if (shmem_fd < 0) { > + error = shmem_fd; > + goto out; > + } > + error = shmem_zero_setup(vma); > + if (error < 0) { > + put_unused_fd(shmem_fd); > + goto out; > + } > + vmem->shmem_filp = vma->vm_file; > + get_file(vmem->shmem_filp); > + fd_install(shmem_fd, vma->vm_file); > + create->shmem_fd = shmem_fd; > + > + create->vmem_fd = anon_inode_getfd("kvm-vmem", > + &kvm_vmem_fops, vmem, O_RDWR); > + if (create->vmem_fd < 0) { > + error = create->vmem_fd; > + goto out; > + } > + > + bitmap_bytes = kvm_vmem_bitmap_bytes(vmem); > + if (bitmap_bytes > PAGE_SIZE) { > + vmem->cached = vzalloc(bitmap_bytes); > + vmem->faulted = vzalloc(bitmap_bytes); > + } else { > + vmem->cached = kzalloc(bitmap_bytes, GFP_KERNEL); > + vmem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL); > + } > + > +#define ASYNC_REQ_MAX (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS) > + vmem->async_req_max = ASYNC_REQ_MAX; > + vmem->async_req_nr = 0; > + vmem->async_req = kzalloc(sizeof(*vmem->async_req), GFP_KERNEL); > + > +#define SYNC_REQ_MAX (KVM_MAX_VCPUS) > + vmem->sync_req_max = round_up(SYNC_REQ_MAX, BITS_PER_LONG); > + sync_bitmap_bytes = sizeof(unsigned long) * > + (vmem->sync_req_max / BITS_PER_LONG); > + vmem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL); > + vmem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL); > + vmem->page_wait = kzalloc(sizeof(*vmem->page_wait) * > + vmem->sync_req_max, GFP_KERNEL); > + for (i = 0; i < vmem->sync_req_max; ++i) > + init_waitqueue_head(&vmem->page_wait[i]); > + vmem->sync_req = kzalloc(sizeof(*vmem->sync_req) * > + vmem->sync_req_max, GFP_KERNEL); > + > + vmem->req_list_nr = 0; > + INIT_LIST_HEAD(&vmem->req_list); > + init_waitqueue_head(&vmem->req_list_wait); > + > + init_waitqueue_head(&vmem->ready_wait); > + vmem->ready = false; > + > + return 0; > + > + out: > + kvm_vmem_free(vmem); > + return error; > +} > + > +static long kvm_vmem_dev_ioctl(struct file *filp, unsigned int ioctl, > + unsigned long arg) > +{ > + void __user *argp = (void __user *) arg; > + long ret; > + > + switch (ioctl) { > + case KVM_CREATE_VMEM: { > + struct kvm_vmem_create create; > + if (copy_from_user(&create, argp, sizeof(create))) { > + ret = -EFAULT; > + break; > + } > + ret = kvm_create_vmem(&create); > + if (copy_to_user(argp, &create, 
sizeof(create))) { > + ret = -EFAULT; > + break; > + } > + break; > + } > + default: > + ret = -EINVAL; > + break; > + } > + return ret; > +} > + > +static int kvm_vmem_dev_release(struct inode *inode, struct file *filp) > +{ > + return 0; > +} > + > +static struct file_operations kvm_vmem_dev_fops = { > + .release = kvm_vmem_dev_release, > + .unlocked_ioctl = kvm_vmem_dev_ioctl, > +}; > + > +long kvm_dev_ioctl_create_vmem_dev(void) > +{ > + return anon_inode_getfd("kvm-vmem-dev", &kvm_vmem_dev_fops, > + NULL, O_RDWR); > +} > diff --git a/virt/kvm/vmem.h b/virt/kvm/vmem.h > new file mode 100644 > index 0000000..bc7e8cf > --- /dev/null > +++ b/virt/kvm/vmem.h > @@ -0,0 +1,68 @@ > +/* > + * KVM post copy vmem > + * > + * Copyright (c) 2011, > + * National Institute of Advanced Industrial Science and Technology > + * > + * https://sites.google.com/site/grivonhome/quick-kvm-migration > + * Author: Isaku Yamahata <yamahata at valinux co jp> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along with > + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple > + * Place - Suite 330, Boston, MA 02111-1307 USA. Old address also here. > + */ > + > +#ifndef __KVM_VMEM_H__ > +#define __KVM_VMEM_H__ > + > +struct kvm_vmem_page_req_list { > + struct list_head list; > + pgoff_t pgoff; > +}; > + > +struct kvm_vmem { > + loff_t size; > + pgoff_t pgoff_end; > + spinlock_t lock; > + > + wait_queue_head_t req_wait; > + > + int async_req_max; > + int async_req_nr; > + pgoff_t *async_req; > + > + int sync_req_max; 'int' between pointers would mean 4 bytes of structure padding on 64 bit hosts. > + unsigned long *sync_req_bitmap; > + unsigned long *sync_wait_bitmap; > + pgoff_t *sync_req; > + wait_queue_head_t *page_wait; > + > + int req_list_nr; > + struct list_head req_list; > + wait_queue_head_t req_list_wait; > + > + unsigned long *cached; > + unsigned long *faulted; > + > + bool mmapped; > + unsigned long vm_start; > + unsigned int vma_nr; > + struct task_struct *task; > + > + wait_queue_head_t ready_wait; > + bool ready; > + > + struct file *shmem_filp; > + struct vm_area_struct *vma; > +}; > + > +#endif /* __KVM_VMEM_H__ */ > -- > 1.7.1.1 > > > -- > yamahata > >
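To check that I've understood the intended flow, here is roughly how I would expect userland to drive the interface, pieced together from the ioctl handlers above. This is an untested sketch only: the function name, the single-process structure, the READY/WAIT_READY split and the use of the shmem fd as the staging buffer are just my reading of the code, and error handling plus the actual page transfer from the source are left out.

#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/kvm.h>	/* patched headers providing the vmem structs/ioctls */

static void serve_postcopy_pages(__u64 ram_size)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	int dev_fd = ioctl(kvm_fd, KVM_CREATE_VMEM_DEV);
	long page_size = sysconf(_SC_PAGESIZE);

	struct kvm_vmem_create create = { .size = ram_size };
	ioctl(dev_fd, KVM_CREATE_VMEM, &create);

	/* qemu would mmap() create.vmem_fd once and use it as guest RAM;
	 * the daemon keeps create.shmem_fd as the staging buffer that the
	 * fault handler copies from after a page is marked cached. */
	char *staging = mmap(NULL, create.size, PROT_READ | PROT_WRITE,
			     MAP_SHARED, create.shmem_fd, 0);

	/* qemu presumably blocks in KVM_VMEM_WAIT_READY until this */
	ioctl(create.vmem_fd, KVM_VMEM_READY);

	__u64 pgoffs[32];
	for (;;) {
		struct pollfd pfd = { .fd = create.vmem_fd, .events = POLLIN };
		poll(&pfd, 1, -1);

		struct kvm_vmem_page_request req = { .nr = 32, .pgoffs = pgoffs };
		ioctl(create.vmem_fd, KVM_VMEM_GET_PAGE_REQUEST, &req);

		for (__u32 i = 0; i < req.nr; i++) {
			/* fetch page pgoffs[i] from the source and copy it to
			 * staging + pgoffs[i] * page_size */
		}

		struct kvm_vmem_page_cached cached = { .nr = req.nr, .pgoffs = pgoffs };
		ioctl(create.vmem_fd, KVM_VMEM_MARK_PAGE_CACHED, &cached);

		/* a background thread would keep streaming the remaining pages,
		 * force them in with KVM_VMEM_MAKE_PAGES_PRESENT, and finally
		 * call KVM_VMEM_MAKE_VMA_ANONYMOUS once everything has arrived */
	}
}

By the way, shouldn't KVM_CREATE_VMEM be _IOWR(KVMIO, 0xb1, struct kvm_vmem_create) rather than _IOR with __u32, given that the handler copies the struct in both directions?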