Re: [PATCH 2/2] umem: chardevice for kvm postcopy

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 12/29/2011 03:26 AM, Isaku Yamahata wrote:
> This is a character device to hook page access.
> The page fault in the area is reported to another user process by
> this chardriver. Then, the process fills the page contents and
> resolves the page fault.
>
>  
> +config UMEM
> +        tristate "/dev/umem user process backed memory support"

Use a tab for the indentation here — this line is space-indented while the lines below use tabs.

> +	default n
> +	help
> +	  User process backed memory driver provides /dev/umem device.
> +	  The /dev/umem device is designed for some sort of distributed
> +	  shared memory. Especially post-copy live migration with KVM.
> +	  When in doubt, say "N".
> +

Need documentation of the protocol between the kernel and userspace; not
just the ioctls, but also how faults are propagated.

> +
> +struct umem_page_req_list {
> +	struct list_head list;
> +	pgoff_t pgoff;
> +};
> +
>
> +
> +
> +static int umem_mark_page_cached(struct umem *umem,
> +				 struct umem_page_cached *page_cached)
> +{
> +	int ret = 0;
> +#define PG_MAX	((__u32)32)
> +	__u64 pgoffs[PG_MAX];
> +	__u32 nr;
> +	unsigned long bit;
> +	bool wake_up_list = false;
> +
> +	nr = 0;
> +	while (nr < page_cached->nr) {
> +		__u32 todo = min(PG_MAX, (page_cached->nr - nr));
> +		int i;
> +
> +		if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> +				   sizeof(*pgoffs) * todo)) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +		for (i = 0; i < todo; ++i) {
> +			if (pgoffs[i] >= umem->pgoff_end) {
> +				ret = -EINVAL;
> +				goto out;
> +			}
> +			set_bit(pgoffs[i], umem->cached);
> +		}
> +		nr += todo;
> +	}
> +

Probably need an smp_wmb() where.

> +	spin_lock(&umem->lock);
> +	bit = 0;
> +	for (;;) {
> +		bit = find_next_bit(umem->sync_wait_bitmap, umem->sync_req_max,
> +				    bit);
> +		if (bit >= umem->sync_req_max)
> +			break;
> +		if (test_bit(umem->sync_req[bit], umem->cached))
> +			wake_up(&umem->page_wait[bit]);

Why not do this test in the loop above?

> +		bit++;
> +	}
> +
> +	if (umem->req_list_nr > 0)
> +		wake_up_list = true;
> +	spin_unlock(&umem->lock);
> +
> +	if (wake_up_list)
> +		wake_up_all(&umem->req_list_wait);
> +
> +out:
> +	return ret;
> +}
> +
> +
> +
> +static void umem_put(struct umem *umem)
> +{
> +	int ret;
> +
> +	mutex_lock(&umem_list_mutex);
> +	ret = kref_put(&umem->kref, umem_free);
> +	if (ret == 0) {
> +		mutex_unlock(&umem_list_mutex);
> +	}

This looks wrong: the mutex is released here only when the refcount did
not drop to zero, which means umem_free() must be unlocking it on the
other path.  Asymmetric locking like that is fragile — unlock
unconditionally here instead.

> +}
> +
> +
> +static int umem_create_umem(struct umem_create *create)
> +{
> +	int error = 0;
> +	struct umem *umem = NULL;
> +	struct vm_area_struct *vma;
> +	int shmem_fd;
> +	unsigned long bitmap_bytes;
> +	unsigned long sync_bitmap_bytes;
> +	int i;
> +
> +	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> +	umem->name = create->name;
> +	kref_init(&umem->kref);
> +	INIT_LIST_HEAD(&umem->list);
> +
> +	mutex_lock(&umem_list_mutex);
> +	error = umem_add_list(umem);
> +	if (error) {
> +		goto out;
> +	}
> +
> +	umem->task = NULL;
> +	umem->mmapped = false;
> +	spin_lock_init(&umem->lock);
> +	umem->size = roundup(create->size, PAGE_SIZE);
> +	umem->pgoff_end = umem->size >> PAGE_SHIFT;
> +	init_waitqueue_head(&umem->req_wait);
> +
> +	vma = &umem->vma;
> +	vma->vm_start = 0;
> +	vma->vm_end = umem->size;
> +	/* this shmem file is used for temporal buffer for pages
> +	   so it's unlikely that so many pages exists in this shmem file */
> +	vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> +		VM_DONTEXPAND;
> +	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> +	vma->vm_pgoff = 0;
> +	INIT_LIST_HEAD(&vma->anon_vma_chain);
> +
> +	shmem_fd = get_unused_fd();
> +	if (shmem_fd < 0) {
> +		error = shmem_fd;
> +		goto out;
> +	}
> +	error = shmem_zero_setup(vma);
> +	if (error < 0) {
> +		put_unused_fd(shmem_fd);
> +		goto out;
> +	}
> +	umem->shmem_filp = vma->vm_file;
> +	get_file(umem->shmem_filp);
> +	fd_install(shmem_fd, vma->vm_file);
> +	create->shmem_fd = shmem_fd;
> +
> +	create->umem_fd = anon_inode_getfd("umem",
> +					   &umem_fops, umem, O_RDWR);
> +	if (create->umem_fd < 0) {
> +		error = create->umem_fd;
> +		goto out;
> +	}
> +
> +	bitmap_bytes = umem_bitmap_bytes(umem);
> +	if (bitmap_bytes > PAGE_SIZE) {
> +		umem->cached = vzalloc(bitmap_bytes);
> +		umem->faulted = vzalloc(bitmap_bytes);
> +	} else {
> +		umem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> +		umem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> +	}
> +
> +	/* those constants are not exported.
> +	   They are just used for default value */
> +#define KVM_MAX_VCPUS	256
> +#define ASYNC_PF_PER_VCPU 64

Best to avoid defaults and require userspace choose.

> +
> +#define ASYNC_REQ_MAX	(ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> +	if (create->async_req_max == 0)
> +		create->async_req_max = ASYNC_REQ_MAX;
> +	umem->async_req_max = create->async_req_max;
> +	umem->async_req_nr = 0;
> +	umem->async_req = kzalloc(
> +		sizeof(*umem->async_req) * umem->async_req_max,
> +		GFP_KERNEL);
> +
> +#define SYNC_REQ_MAX	(KVM_MAX_VCPUS)
> +	if (create->sync_req_max == 0)
> +		create->sync_req_max = SYNC_REQ_MAX;
> +	umem->sync_req_max = round_up(create->sync_req_max, BITS_PER_LONG);
> +	sync_bitmap_bytes = sizeof(unsigned long) *
> +		(umem->sync_req_max / BITS_PER_LONG);
> +	umem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> +	umem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> +	umem->page_wait = kzalloc(sizeof(*umem->page_wait) *
> +				  umem->sync_req_max, GFP_KERNEL);
> +	for (i = 0; i < umem->sync_req_max; ++i)
> +		init_waitqueue_head(&umem->page_wait[i]);
> +	umem->sync_req = kzalloc(sizeof(*umem->sync_req) *
> +				 umem->sync_req_max, GFP_KERNEL);
> +
> +	umem->req_list_nr = 0;
> +	INIT_LIST_HEAD(&umem->req_list);
> +	init_waitqueue_head(&umem->req_list_wait);
> +
> +	mutex_unlock(&umem_list_mutex);
> +	return 0;
> +
> + out:
> +	umem_free(&umem->kref);
> +	return error;
> +}
> +
> +
> +static int umem_reattach_umem(struct umem_create *create)
> +{
> +	struct umem *entry;
> +
> +	mutex_lock(&umem_list_mutex);
> +	list_for_each_entry(entry, &umem_list, list) {
> +		if (umem_name_eq(&entry->name, &create->name)) {
> +			kref_get(&entry->kref);
> +			mutex_unlock(&umem_list_mutex);
> +
> +			create->shmem_fd = get_unused_fd();
> +			if (create->shmem_fd < 0) {
> +				umem_put(entry);
> +				return create->shmem_fd;
> +			}
> +			create->umem_fd = anon_inode_getfd(
> +				"umem", &umem_fops, entry, O_RDWR);
> +			if (create->umem_fd < 0) {
> +				put_unused_fd(create->shmem_fd);
> +				umem_put(entry);
> +				return create->umem_fd;
> +			}
> +			get_file(entry->shmem_filp);
> +			fd_install(create->shmem_fd, entry->shmem_filp);
> +
> +			create->size = entry->size;
> +			create->sync_req_max = entry->sync_req_max;
> +			create->async_req_max = entry->async_req_max;
> +			return 0;
> +		}
> +	}
> +	mutex_unlock(&umem_list_mutex);
> +
> +	return -ENOENT;
> +}

Can you explain how reattach is used?

> +
> +static long umem_dev_ioctl(struct file *filp, unsigned int ioctl,
> +			   unsigned long arg)
> +{
> +	void __user *argp = (void __user *) arg;
> +	long ret;
> +	struct umem_create *create = NULL;
> +
> +
> +	switch (ioctl) {
> +	case UMEM_DEV_CREATE_UMEM:
> +		create = kmalloc(sizeof(*create), GFP_KERNEL);
> +		if (copy_from_user(create, argp, sizeof(*create))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		ret = umem_create_umem(create);
> +		if (copy_to_user(argp, create, sizeof(*create))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		break;

A simpler approach is to have open("/dev/umem") return an mmap()able fd.
You still need to call an ioctl() to set the size, etc., but then you
only operate on that one fd.

> +	case UMEM_DEV_LIST:
> +		ret = umem_list_umem(argp);
> +		break;
> +	case UMEM_DEV_REATTACH:
> +		create = kmalloc(sizeof(*create), GFP_KERNEL);
> +		if (copy_from_user(create, argp, sizeof(*create))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		ret = umem_reattach_umem(create);
> +		if (copy_to_user(argp, create, sizeof(*create))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	}
> +
> +	kfree(create);
> +	return ret;
> +}
> +
> +
> +#ifdef __KERNEL__
> +#include <linux/compiler.h>
> +#else
> +#define __user
> +#endif

I think a #include <linux/compiler.h> is sufficient; the header export
process (see include/linux/Kbuild — add an entry there) takes care of
stripping __user.

> +
> +#define UMEM_ID_MAX	256
> +#define UMEM_NAME_MAX	256
> +
> +struct umem_name {
> +	char id[UMEM_ID_MAX];		/* non-zero terminated */
> +	char name[UMEM_NAME_MAX];	/* non-zero terminated */
> +};

IMO, it would be better to avoid names, and use opaque __u64 identifiers
assigned by userspace, or perhaps file descriptors generated by the
kernel.  With names come the complications of namespaces, etc.  One user
can DoS another by grabbing a name that it knows the other user wants to
use.

> +
> +struct umem_create {
> +	__u64 size;	/* in bytes */
> +	__s32 umem_fd;
> +	__s32 shmem_fd;
> +	__u32 async_req_max;
> +	__u32 sync_req_max;
> +	struct umem_name name;
> +};
> +
> +struct umem_page_request {
> +	__u64 __user *pgoffs;

Pointers change their size in 32-bit and 64-bit userspace, best to avoid
them.

> +	__u32 nr;
> +	__u32 padding;
> +};
> +
> +struct umem_page_cached {
> +	__u64 __user *pgoffs;
> +	__u32 nr;
> +	__u32 padding;
> +};
> +
> +#define UMEMIO	0x1E
> +
> +/* ioctl for umem_dev fd */
> +#define UMEM_DEV_CREATE_UMEM	_IOWR(UMEMIO, 0x0, struct umem_create)
> +#define UMEM_DEV_LIST		_IOWR(UMEMIO, 0x1, struct umem_list)

Why is _LIST needed?

> +#define UMEM_DEV_REATTACH	_IOWR(UMEMIO, 0x2, struct umem_create)
> +
> +/* ioctl for umem fd */
> +#define UMEM_GET_PAGE_REQUEST	_IOWR(UMEMIO, 0x10, struct umem_page_request)
> +#define UMEM_MARK_PAGE_CACHED	_IOW (UMEMIO, 0x11, struct umem_page_cached)

You could make the GET_PAGE_REQUEST / MARK_PAGE_CACHED protocol run over
file descriptors, instead of an ioctl.  It allows you to implement the
other side in either the kernel or userspace.  This is similar to how
kvm uses an eventfd for communication with vhost-net in the kernel, or
an implementation in userspace.

> +#define UMEM_MAKE_VMA_ANONYMOUS	_IO  (UMEMIO, 0x12)
> +
> +#endif /* __LINUX_UMEM_H */


-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux