Re: [PATCH 2/2] umem: chardevice for kvm postcopy

Isaku Yamahata <yamahata@xxxxxxxxxxxxx> · Thu, 29 Dec 2011 21:22:50 +0900

Thank you for review.

On Thu, Dec 29, 2011 at 01:17:51PM +0200, Avi Kivity wrote:
> > +	default n
> > +	help
> > +	  User process backed memory driver provides /dev/umem device.
> > +	  The /dev/umem device is designed for some sort of distributed
> > +	  shared memory. Especially post-copy live migration with KVM.
> > +	  When in doubt, say "N".
> > +
> 
> Need documentation of the protocol between the kernel and userspace; not
> just the ioctls, but also how faults are propagated.

Will do.

> 
> > +
> > +struct umem_page_req_list {
> > +	struct list_head list;
> > +	pgoff_t pgoff;
> > +};
> > +
> >
> > +
> > +
> > +static int umem_mark_page_cached(struct umem *umem,
> > +				 struct umem_page_cached *page_cached)
> > +{
> > +	int ret = 0;
> > +#define PG_MAX	((__u32)32)
> > +	__u64 pgoffs[PG_MAX];
> > +	__u32 nr;
> > +	unsigned long bit;
> > +	bool wake_up_list = false;
> > +
> > +	nr = 0;
> > +	while (nr < page_cached->nr) {
> > +		__u32 todo = min(PG_MAX, (page_cached->nr - nr));
> > +		int i;
> > +
> > +		if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> > +				   sizeof(*pgoffs) * todo)) {
> > +			ret = -EFAULT;
> > +			goto out;
> > +		}
> > +		for (i = 0; i < todo; ++i) {
> > +			if (pgoffs[i] >= umem->pgoff_end) {
> > +				ret = -EINVAL;
> > +				goto out;
> > +			}
> > +			set_bit(pgoffs[i], umem->cached);
> > +		}
> > +		nr += todo;
> > +	}
> > +
> 
> Probably need an smp_wmb() where.
> 
> > +	spin_lock(&umem->lock);
> > +	bit = 0;
> > +	for (;;) {
> > +		bit = find_next_bit(umem->sync_wait_bitmap, umem->sync_req_max,
> > +				    bit);
> > +		if (bit >= umem->sync_req_max)
> > +			break;
> > +		if (test_bit(umem->sync_req[bit], umem->cached))
> > +			wake_up(&umem->page_wait[bit]);
> 
> Why not do this test in the loop above?
> 
> > +		bit++;
> > +	}
> > +
> > +	if (umem->req_list_nr > 0)
> > +		wake_up_list = true;
> > +	spin_unlock(&umem->lock);
> > +
> > +	if (wake_up_list)
> > +		wake_up_all(&umem->req_list_wait);
> > +
> > +out:
> > +	return ret;
> > +}
> > +
> > +
> > +
> > +static void umem_put(struct umem *umem)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&umem_list_mutex);
> > +	ret = kref_put(&umem->kref, umem_free);
> > +	if (ret == 0) {
> > +		mutex_unlock(&umem_list_mutex);
> > +	}
> 
> This looks wrong.
> 
> > +}
> > +
> > +
> > +static int umem_create_umem(struct umem_create *create)
> > +{
> > +	int error = 0;
> > +	struct umem *umem = NULL;
> > +	struct vm_area_struct *vma;
> > +	int shmem_fd;
> > +	unsigned long bitmap_bytes;
> > +	unsigned long sync_bitmap_bytes;
> > +	int i;
> > +
> > +	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> > +	umem->name = create->name;
> > +	kref_init(&umem->kref);
> > +	INIT_LIST_HEAD(&umem->list);
> > +
> > +	mutex_lock(&umem_list_mutex);
> > +	error = umem_add_list(umem);
> > +	if (error) {
> > +		goto out;
> > +	}
> > +
> > +	umem->task = NULL;
> > +	umem->mmapped = false;
> > +	spin_lock_init(&umem->lock);
> > +	umem->size = roundup(create->size, PAGE_SIZE);
> > +	umem->pgoff_end = umem->size >> PAGE_SHIFT;
> > +	init_waitqueue_head(&umem->req_wait);
> > +
> > +	vma = &umem->vma;
> > +	vma->vm_start = 0;
> > +	vma->vm_end = umem->size;
> > +	/* this shmem file is used for temporal buffer for pages
> > +	   so it's unlikely that so many pages exists in this shmem file */
> > +	vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> > +		VM_DONTEXPAND;
> > +	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> > +	vma->vm_pgoff = 0;
> > +	INIT_LIST_HEAD(&vma->anon_vma_chain);
> > +
> > +	shmem_fd = get_unused_fd();
> > +	if (shmem_fd < 0) {
> > +		error = shmem_fd;
> > +		goto out;
> > +	}
> > +	error = shmem_zero_setup(vma);
> > +	if (error < 0) {
> > +		put_unused_fd(shmem_fd);
> > +		goto out;
> > +	}
> > +	umem->shmem_filp = vma->vm_file;
> > +	get_file(umem->shmem_filp);
> > +	fd_install(shmem_fd, vma->vm_file);
> > +	create->shmem_fd = shmem_fd;
> > +
> > +	create->umem_fd = anon_inode_getfd("umem",
> > +					   &umem_fops, umem, O_RDWR);
> > +	if (create->umem_fd < 0) {
> > +		error = create->umem_fd;
> > +		goto out;
> > +	}
> > +
> > +	bitmap_bytes = umem_bitmap_bytes(umem);
> > +	if (bitmap_bytes > PAGE_SIZE) {
> > +		umem->cached = vzalloc(bitmap_bytes);
> > +		umem->faulted = vzalloc(bitmap_bytes);
> > +	} else {
> > +		umem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> > +		umem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> > +	}
> > +
> > +	/* those constants are not exported.
> > +	   They are just used for default value */
> > +#define KVM_MAX_VCPUS	256
> > +#define ASYNC_PF_PER_VCPU 64
> 
> Best to avoid defaults and require userspace choose.

Okay.

> > +
> > +#define ASYNC_REQ_MAX	(ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> > +	if (create->async_req_max == 0)
> > +		create->async_req_max = ASYNC_REQ_MAX;
> > +	umem->async_req_max = create->async_req_max;
> > +	umem->async_req_nr = 0;
> > +	umem->async_req = kzalloc(
> > +		sizeof(*umem->async_req) * umem->async_req_max,
> > +		GFP_KERNEL);
> > +
> > +#define SYNC_REQ_MAX	(KVM_MAX_VCPUS)
> > +	if (create->sync_req_max == 0)
> > +		create->sync_req_max = SYNC_REQ_MAX;
> > +	umem->sync_req_max = round_up(create->sync_req_max, BITS_PER_LONG);
> > +	sync_bitmap_bytes = sizeof(unsigned long) *
> > +		(umem->sync_req_max / BITS_PER_LONG);
> > +	umem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> > +	umem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> > +	umem->page_wait = kzalloc(sizeof(*umem->page_wait) *
> > +				  umem->sync_req_max, GFP_KERNEL);
> > +	for (i = 0; i < umem->sync_req_max; ++i)
> > +		init_waitqueue_head(&umem->page_wait[i]);
> > +	umem->sync_req = kzalloc(sizeof(*umem->sync_req) *
> > +				 umem->sync_req_max, GFP_KERNEL);
> > +
> > +	umem->req_list_nr = 0;
> > +	INIT_LIST_HEAD(&umem->req_list);
> > +	init_waitqueue_head(&umem->req_list_wait);
> > +
> > +	mutex_unlock(&umem_list_mutex);
> > +	return 0;
> > +
> > + out:
> > +	umem_free(&umem->kref);
> > +	return error;
> > +}
> > +
> > +
> > +static int umem_reattach_umem(struct umem_create *create)
> > +{
> > +	struct umem *entry;
> > +
> > +	mutex_lock(&umem_list_mutex);
> > +	list_for_each_entry(entry, &umem_list, list) {
> > +		if (umem_name_eq(&entry->name, &create->name)) {
> > +			kref_get(&entry->kref);
> > +			mutex_unlock(&umem_list_mutex);
> > +
> > +			create->shmem_fd = get_unused_fd();
> > +			if (create->shmem_fd < 0) {
> > +				umem_put(entry);
> > +				return create->shmem_fd;
> > +			}
> > +			create->umem_fd = anon_inode_getfd(
> > +				"umem", &umem_fops, entry, O_RDWR);
> > +			if (create->umem_fd < 0) {
> > +				put_unused_fd(create->shmem_fd);
> > +				umem_put(entry);
> > +				return create->umem_fd;
> > +			}
> > +			get_file(entry->shmem_filp);
> > +			fd_install(create->shmem_fd, entry->shmem_filp);
> > +
> > +			create->size = entry->size;
> > +			create->sync_req_max = entry->sync_req_max;
> > +			create->async_req_max = entry->async_req_max;
> > +			return 0;
> > +		}
> > +	}
> > +	mutex_unlock(&umem_list_mutex);
> > +
> > +	return -ENOENT;
> > +}
> 
> Can you explain how reattach is used?
> 
> > +
> > +static long umem_dev_ioctl(struct file *filp, unsigned int ioctl,
> > +			   unsigned long arg)
> > +{
> > +	void __user *argp = (void __user *) arg;
> > +	long ret;
> > +	struct umem_create *create = NULL;
> > +
> > +
> > +	switch (ioctl) {
> > +	case UMEM_DEV_CREATE_UMEM:
> > +		create = kmalloc(sizeof(*create), GFP_KERNEL);
> > +		if (copy_from_user(create, argp, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		ret = umem_create_umem(create);
> > +		if (copy_to_user(argp, create, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		break;
> 
> A simpler approach is the open("/dev/umem") returns an mmap()able fd. 
> You need to call an ioctl() to set the size, etc. but only you only
> operate on that fd.

So you are suggesting that /dev/umem and /dev/umemctl should be introduced
and split the functionality.

> > +	case UMEM_DEV_LIST:
> > +		ret = umem_list_umem(argp);
> > +		break;
> > +	case UMEM_DEV_REATTACH:
> > +		create = kmalloc(sizeof(*create), GFP_KERNEL);
> > +		if (copy_from_user(create, argp, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		ret = umem_reattach_umem(create);
> > +		if (copy_to_user(argp, create, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		break;
> > +	default:
> > +		ret = -EINVAL;
> > +		break;
> > +	}
> > +
> > +	kfree(create);
> > +	return ret;
> > +}
> > +
> > +
> > +#ifdef __KERNEL__
> > +#include <linux/compiler.h>
> > +#else
> > +#define __user
> > +#endif
> 
> I think a #include <linux/compiler.h> is sufficient, the export process
> (see include/linux/Kbuild, add an entry there) takes care of __user.
> 
> > +
> > +#define UMEM_ID_MAX	256
> > +#define UMEM_NAME_MAX	256
> > +
> > +struct umem_name {
> > +	char id[UMEM_ID_MAX];		/* non-zero terminated */
> > +	char name[UMEM_NAME_MAX];	/* non-zero terminated */
> > +};
> 
> IMO, it would be better to avoid names, and use opaque __u64 identifiers
> assigned by userspace, or perhaps file descriptors generated by the
> kernel.  With names come the complications of namespaces, etc.  One user
> can DoS another by grabbing a name that it knows the other user wants to
> use.

So how about the kernel assigning identifiers which is system global?

> > +
> > +struct umem_create {
> > +	__u64 size;	/* in bytes */
> > +	__s32 umem_fd;
> > +	__s32 shmem_fd;
> > +	__u32 async_req_max;
> > +	__u32 sync_req_max;
> > +	struct umem_name name;
> > +};
> > +
> > +struct umem_page_request {
> > +	__u64 __user *pgoffs;
> 
> Pointers change their size in 32-bit and 64-bit userspace, best to avoid
> them.

Ah yes, right. How about following?
struct {
       __u32 nr;
       __u32 padding;
       __u64 pgoffs[0];
}

> > +	__u32 nr;
> > +	__u32 padding;
> > +};
> > +
> > +struct umem_page_cached {
> > +	__u64 __user *pgoffs;
> > +	__u32 nr;
> > +	__u32 padding;
> > +};
> > +
> > +#define UMEMIO	0x1E
> > +
> > +/* ioctl for umem_dev fd */
> > +#define UMEM_DEV_CREATE_UMEM	_IOWR(UMEMIO, 0x0, struct umem_create)
> > +#define UMEM_DEV_LIST		_IOWR(UMEMIO, 0x1, struct umem_list)
> 
> Why is _LIST needed?
> 
> > +#define UMEM_DEV_REATTACH	_IOWR(UMEMIO, 0x2, struct umem_create)
> > +
> > +/* ioctl for umem fd */
> > +#define UMEM_GET_PAGE_REQUEST	_IOWR(UMEMIO, 0x10, struct umem_page_request)
> > +#define UMEM_MARK_PAGE_CACHED	_IOW (UMEMIO, 0x11, struct umem_page_cached)
> 
> You could make the GET_PAGE_REQUEST / MARK_PAGE_CACHED protocol run over
> file descriptors, instead of an ioctl.  It allows you to implement the
> other side in either the kernel or userspace.  This is similar to how
> kvm uses an eventfd for communication with vhost-net in the kernel, or
> an implementation in userspace.

Do you mean that read/write on file descriptors is better than ioctl?
Okay, it would be easy to convert ioctl into read/write.

> > +#define UMEM_MAKE_VMA_ANONYMOUS	_IO  (UMEMIO, 0x12)
> > +
> > +#endif /* __LINUX_UMEM_H */
> 
> 
> -- 
> error compiling committee.c: too many arguments to function
> 

-- 
yamahata
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html