Thank you for the review.

On Thu, Dec 29, 2011 at 01:17:51PM +0200, Avi Kivity wrote:
> > +	default n
> > +	help
> > +	  User process backed memory driver provides /dev/umem device.
> > +	  The /dev/umem device is designed for some sort of distributed
> > +	  shared memory. Especially post-copy live migration with KVM.
> > +	  When in doubt, say "N".
> > +
>
> Need documentation of the protocol between the kernel and userspace; not
> just the ioctls, but also how faults are propagated.

Will do.

>
> > +
> > +struct umem_page_req_list {
> > +	struct list_head list;
> > +	pgoff_t pgoff;
> > +};
> > +
> >
> > +
> > +
> > +static int umem_mark_page_cached(struct umem *umem,
> > +				 struct umem_page_cached *page_cached)
> > +{
> > +	int ret = 0;
> > +#define PG_MAX	((__u32)32)
> > +	__u64 pgoffs[PG_MAX];
> > +	__u32 nr;
> > +	unsigned long bit;
> > +	bool wake_up_list = false;
> > +
> > +	nr = 0;
> > +	while (nr < page_cached->nr) {
> > +		__u32 todo = min(PG_MAX, (page_cached->nr - nr));
> > +		int i;
> > +
> > +		if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> > +				   sizeof(*pgoffs) * todo)) {
> > +			ret = -EFAULT;
> > +			goto out;
> > +		}
> > +		for (i = 0; i < todo; ++i) {
> > +			if (pgoffs[i] >= umem->pgoff_end) {
> > +				ret = -EINVAL;
> > +				goto out;
> > +			}
> > +			set_bit(pgoffs[i], umem->cached);
> > +		}
> > +		nr += todo;
> > +	}
> > +
>
> Probably need an smp_wmb() somewhere.
>
> > +	spin_lock(&umem->lock);
> > +	bit = 0;
> > +	for (;;) {
> > +		bit = find_next_bit(umem->sync_wait_bitmap, umem->sync_req_max,
> > +				    bit);
> > +		if (bit >= umem->sync_req_max)
> > +			break;
> > +		if (test_bit(umem->sync_req[bit], umem->cached))
> > +			wake_up(&umem->page_wait[bit]);
>
> Why not do this test in the loop above?
>
> > +		bit++;
> > +	}
> > +
> > +	if (umem->req_list_nr > 0)
> > +		wake_up_list = true;
> > +	spin_unlock(&umem->lock);
> > +
> > +	if (wake_up_list)
> > +		wake_up_all(&umem->req_list_wait);
> > +
> > +out:
> > +	return ret;
> > +}
> > +
> > +
> > +
> > +static void umem_put(struct umem *umem)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&umem_list_mutex);
> > +	ret = kref_put(&umem->kref, umem_free);
> > +	if (ret == 0) {
> > +		mutex_unlock(&umem_list_mutex);
> > +	}
>
> This looks wrong.
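
If the concern is that the mutex is only dropped when the object survives
(i.e. umem_free() is expected to unlock umem_list_mutex itself when the
refcount hits zero), one possible rework is below. This is only a rough,
untested sketch and assumes umem_free() is changed so that it no longer
touches the mutex:

	static void umem_put(struct umem *umem)
	{
		mutex_lock(&umem_list_mutex);
		/* kref_put() returns nonzero when umem_free() was called;
		 * the mutex is dropped here in both cases, so umem_free()
		 * must not unlock it itself. */
		kref_put(&umem->kref, umem_free);
		mutex_unlock(&umem_list_mutex);
	}
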
>
> > +}
> > +
> > +
> > +static int umem_create_umem(struct umem_create *create)
> > +{
> > +	int error = 0;
> > +	struct umem *umem = NULL;
> > +	struct vm_area_struct *vma;
> > +	int shmem_fd;
> > +	unsigned long bitmap_bytes;
> > +	unsigned long sync_bitmap_bytes;
> > +	int i;
> > +
> > +	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> > +	umem->name = create->name;
> > +	kref_init(&umem->kref);
> > +	INIT_LIST_HEAD(&umem->list);
> > +
> > +	mutex_lock(&umem_list_mutex);
> > +	error = umem_add_list(umem);
> > +	if (error) {
> > +		goto out;
> > +	}
> > +
> > +	umem->task = NULL;
> > +	umem->mmapped = false;
> > +	spin_lock_init(&umem->lock);
> > +	umem->size = roundup(create->size, PAGE_SIZE);
> > +	umem->pgoff_end = umem->size >> PAGE_SHIFT;
> > +	init_waitqueue_head(&umem->req_wait);
> > +
> > +	vma = &umem->vma;
> > +	vma->vm_start = 0;
> > +	vma->vm_end = umem->size;
> > +	/* this shmem file is used for temporal buffer for pages
> > +	   so it's unlikely that so many pages exists in this shmem file */
> > +	vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> > +		VM_DONTEXPAND;
> > +	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> > +	vma->vm_pgoff = 0;
> > +	INIT_LIST_HEAD(&vma->anon_vma_chain);
> > +
> > +	shmem_fd = get_unused_fd();
> > +	if (shmem_fd < 0) {
> > +		error = shmem_fd;
> > +		goto out;
> > +	}
> > +	error = shmem_zero_setup(vma);
> > +	if (error < 0) {
> > +		put_unused_fd(shmem_fd);
> > +		goto out;
> > +	}
> > +	umem->shmem_filp = vma->vm_file;
> > +	get_file(umem->shmem_filp);
> > +	fd_install(shmem_fd, vma->vm_file);
> > +	create->shmem_fd = shmem_fd;
> > +
> > +	create->umem_fd = anon_inode_getfd("umem",
> > +					   &umem_fops, umem, O_RDWR);
> > +	if (create->umem_fd < 0) {
> > +		error = create->umem_fd;
> > +		goto out;
> > +	}
> > +
> > +	bitmap_bytes = umem_bitmap_bytes(umem);
> > +	if (bitmap_bytes > PAGE_SIZE) {
> > +		umem->cached = vzalloc(bitmap_bytes);
> > +		umem->faulted = vzalloc(bitmap_bytes);
> > +	} else {
> > +		umem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> > +		umem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> > +	}
> > +
> > +	/* those constants are not exported.
> > +	   They are just used for default value */
> > +#define KVM_MAX_VCPUS 256
> > +#define ASYNC_PF_PER_VCPU 64
>
> Best to avoid defaults and require userspace choose.

Okay.
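
For illustration only (not part of the patch; guest_memory_bytes and
nr_vcpus below are made-up placeholders), userspace would then always
choose the limits itself when creating a umem:

	int umem_dev_fd = open("/dev/umem", O_RDWR);
	struct umem_create create = {
		.size          = guest_memory_bytes,
		.async_req_max = 64 * nr_vcpus,	/* caller-chosen, no kernel default */
		.sync_req_max  = nr_vcpus,	/* caller-chosen, no kernel default */
	};

	if (ioctl(umem_dev_fd, UMEM_DEV_CREATE_UMEM, &create) < 0)
		err(1, "UMEM_DEV_CREATE_UMEM");
	/* on return the kernel has filled in create.umem_fd and create.shmem_fd */
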
> > +
> > +#define ASYNC_REQ_MAX (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> > +	if (create->async_req_max == 0)
> > +		create->async_req_max = ASYNC_REQ_MAX;
> > +	umem->async_req_max = create->async_req_max;
> > +	umem->async_req_nr = 0;
> > +	umem->async_req = kzalloc(
> > +		sizeof(*umem->async_req) * umem->async_req_max,
> > +		GFP_KERNEL);
> > +
> > +#define SYNC_REQ_MAX	(KVM_MAX_VCPUS)
> > +	if (create->sync_req_max == 0)
> > +		create->sync_req_max = SYNC_REQ_MAX;
> > +	umem->sync_req_max = round_up(create->sync_req_max, BITS_PER_LONG);
> > +	sync_bitmap_bytes = sizeof(unsigned long) *
> > +		(umem->sync_req_max / BITS_PER_LONG);
> > +	umem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> > +	umem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> > +	umem->page_wait = kzalloc(sizeof(*umem->page_wait) *
> > +				  umem->sync_req_max, GFP_KERNEL);
> > +	for (i = 0; i < umem->sync_req_max; ++i)
> > +		init_waitqueue_head(&umem->page_wait[i]);
> > +	umem->sync_req = kzalloc(sizeof(*umem->sync_req) *
> > +				 umem->sync_req_max, GFP_KERNEL);
> > +
> > +	umem->req_list_nr = 0;
> > +	INIT_LIST_HEAD(&umem->req_list);
> > +	init_waitqueue_head(&umem->req_list_wait);
> > +
> > +	mutex_unlock(&umem_list_mutex);
> > +	return 0;
> > +
> > + out:
> > +	umem_free(&umem->kref);
> > +	return error;
> > +}
> > +
> > +
> > +static int umem_reattach_umem(struct umem_create *create)
> > +{
> > +	struct umem *entry;
> > +
> > +	mutex_lock(&umem_list_mutex);
> > +	list_for_each_entry(entry, &umem_list, list) {
> > +		if (umem_name_eq(&entry->name, &create->name)) {
> > +			kref_get(&entry->kref);
> > +			mutex_unlock(&umem_list_mutex);
> > +
> > +			create->shmem_fd = get_unused_fd();
> > +			if (create->shmem_fd < 0) {
> > +				umem_put(entry);
> > +				return create->shmem_fd;
> > +			}
> > +			create->umem_fd = anon_inode_getfd(
> > +				"umem", &umem_fops, entry, O_RDWR);
> > +			if (create->umem_fd < 0) {
> > +				put_unused_fd(create->shmem_fd);
> > +				umem_put(entry);
> > +				return create->umem_fd;
> > +			}
> > +			get_file(entry->shmem_filp);
> > +			fd_install(create->shmem_fd, entry->shmem_filp);
> > +
> > +			create->size = entry->size;
> > +			create->sync_req_max = entry->sync_req_max;
> > +			create->async_req_max = entry->async_req_max;
> > +			return 0;
> > +		}
> > +	}
> > +	mutex_unlock(&umem_list_mutex);
> > +
> > +	return -ENOENT;
> > +}
>
> Can you explain how reattach is used?
>
> > +
> > +static long umem_dev_ioctl(struct file *filp, unsigned int ioctl,
> > +			   unsigned long arg)
> > +{
> > +	void __user *argp = (void __user *) arg;
> > +	long ret;
> > +	struct umem_create *create = NULL;
> > +
> > +
> > +	switch (ioctl) {
> > +	case UMEM_DEV_CREATE_UMEM:
> > +		create = kmalloc(sizeof(*create), GFP_KERNEL);
> > +		if (copy_from_user(create, argp, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		ret = umem_create_umem(create);
> > +		if (copy_to_user(argp, create, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		break;
>
> A simpler approach is that open("/dev/umem") returns an mmap()able fd.
> You need to call an ioctl() to set the size, etc. but you only
> operate on that fd.

So you are suggesting that /dev/umem and /dev/umemctl should be
introduced, splitting the functionality between them.
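
Just to check my understanding of that model, a rough sketch (UMEM_SET_SIZE
and struct umem_config are made-up names, not part of this patch; size is a
placeholder; error handling and headers omitted):

	int fd = open("/dev/umem", O_RDWR);

	/* hypothetical ioctl on the same fd to configure the region */
	struct umem_config conf = { .size = size };
	ioctl(fd, UMEM_SET_SIZE, &conf);

	/* the fd itself is then mmap()able; no separate shmem fd is handed out */
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

The page request / mark-cached channel would then live either on the same
fd or on a separate control device such as /dev/umemctl.
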
> > +	case UMEM_DEV_LIST:
> > +		ret = umem_list_umem(argp);
> > +		break;
> > +	case UMEM_DEV_REATTACH:
> > +		create = kmalloc(sizeof(*create), GFP_KERNEL);
> > +		if (copy_from_user(create, argp, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		ret = umem_reattach_umem(create);
> > +		if (copy_to_user(argp, create, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		break;
> > +	default:
> > +		ret = -EINVAL;
> > +		break;
> > +	}
> > +
> > +	kfree(create);
> > +	return ret;
> > +}
> > +
> > +
> > +#ifdef __KERNEL__
> > +#include <linux/compiler.h>
> > +#else
> > +#define __user
> > +#endif
>
> I think a #include <linux/compiler.h> is sufficient, the export process
> (see include/linux/Kbuild, add an entry there) takes care of __user.
>
> > +
> > +#define UMEM_ID_MAX	256
> > +#define UMEM_NAME_MAX	256
> > +
> > +struct umem_name {
> > +	char id[UMEM_ID_MAX];		/* non-zero terminated */
> > +	char name[UMEM_NAME_MAX];	/* non-zero terminated */
> > +};
>
> IMO, it would be better to avoid names, and use opaque __u64 identifiers
> assigned by userspace, or perhaps file descriptors generated by the
> kernel.  With names come the complications of namespaces, etc.  One user
> can DoS another by grabbing a name that it knows the other user wants to
> use.

So how about having the kernel assign identifiers that are system-global?

> > +
> > +struct umem_create {
> > +	__u64 size;	/* in bytes */
> > +	__s32 umem_fd;
> > +	__s32 shmem_fd;
> > +	__u32 async_req_max;
> > +	__u32 sync_req_max;
> > +	struct umem_name name;
> > +};
> > +
> > +struct umem_page_request {
> > +	__u64 __user *pgoffs;
>
> Pointers change their size in 32-bit and 64-bit userspace, best to avoid
> them.

Ah yes, right. How about the following?

	struct {
		__u32 nr;
		__u32 padding;
		__u64 pgoffs[0];
	}

> > +	__u32 nr;
> > +	__u32 padding;
> > +};
> > +
> > +struct umem_page_cached {
> > +	__u64 __user *pgoffs;
> > +	__u32 nr;
> > +	__u32 padding;
> > +};
> > +
> > +#define UMEMIO	0x1E
> > +
> > +/* ioctl for umem_dev fd */
> > +#define UMEM_DEV_CREATE_UMEM	_IOWR(UMEMIO, 0x0, struct umem_create)
> > +#define UMEM_DEV_LIST		_IOWR(UMEMIO, 0x1, struct umem_list)
>
> Why is _LIST needed?
>
> > +#define UMEM_DEV_REATTACH	_IOWR(UMEMIO, 0x2, struct umem_create)
> > +
> > +/* ioctl for umem fd */
> > +#define UMEM_GET_PAGE_REQUEST	_IOWR(UMEMIO, 0x10, struct umem_page_request)
> > +#define UMEM_MARK_PAGE_CACHED	_IOW (UMEMIO, 0x11, struct umem_page_cached)
>
> You could make the GET_PAGE_REQUEST / MARK_PAGE_CACHED protocol run over
> file descriptors, instead of an ioctl.  It allows you to implement the
> other side in either the kernel or userspace.  This is similar to how
> kvm uses an eventfd for communication with vhost-net in the kernel, or
> an implementation in userspace.

Do you mean that read/write on the file descriptor is better than an ioctl?
Okay, it would be easy to convert the ioctls into read/write.

> > +#define UMEM_MAKE_VMA_ANONYMOUS	_IO  (UMEMIO, 0x12)
> > +
> > +#endif /* __LINUX_UMEM_H */
>
> --
> error compiling committee.c: too many arguments to function

--
yamahata