Re: [PATCH v9] fs: Add VirtualBox guest shared folder (vboxsf) support

Al Viro <viro@xxxxxxxxxxxxxxxxxx> · Wed, 3 Apr 2019 21:49:23 +0100

On Tue, Apr 02, 2019 at 01:54:21PM +0200, Hans de Goede wrote:

> +static int sf_dir_open(struct inode *inode, struct file *file)
> +{
> +	struct sf_glob_info *sf_g = GET_GLOB_INFO(inode->i_sb);
> +	struct shfl_createparms params = {};
> +	struct sf_dir_info *sf_d;
> +	int err;
> +
> +	sf_d = vboxsf_dir_info_alloc();
> +	if (!sf_d)
> +		return -ENOMEM;
> +
> +	params.handle = SHFL_HANDLE_NIL;
> +	params.create_flags = 0
> +	    | SHFL_CF_DIRECTORY
> +	    | SHFL_CF_ACT_OPEN_IF_EXISTS
> +	    | SHFL_CF_ACT_FAIL_IF_NEW | SHFL_CF_ACCESS_READ;
> +
> +	err = vboxsf_create_at_dentry(file_dentry(file), &params);
> +	if (err == 0) {
> +		if (params.result == SHFL_FILE_EXISTS) {
> +			err = vboxsf_dir_read_all(sf_g, sf_d, params.handle);
> +			if (!err)
> +				file->private_data = sf_d;
> +		} else
> +			err = -ENOENT;
> +
> +		vboxsf_close(sf_g->root, params.handle);
> +	}
> +
> +	if (err)
> +		vboxsf_dir_info_free(sf_d);
> +
> +	return err;
> +}
> +
> +/**
> + * This is called when reference count of [file] goes to zero. Notify
> + * the host that it can free whatever is associated with this directory
> + * and deallocate our own internal buffers
> + * Return: 0 or negative errno value.
> + * @inode	inode
> + * @file	file
> + */
> +static int sf_dir_release(struct inode *inode, struct file *file)
> +{
> +	if (file->private_data)
> +		vboxsf_dir_info_free(file->private_data);
> +
> +	return 0;
> +}
> +
> +/**
> + * Translate RTFMODE into DT_xxx (in conjunction to rtDirType())
> + * Return: d_type
> + * @mode	file mode
> + */
> +static int sf_get_d_type(u32 mode)
> +{
> +	int d_type;
> +
> +	switch (mode & SHFL_TYPE_MASK) {
> +	case SHFL_TYPE_FIFO:
> +		d_type = DT_FIFO;
> +		break;
> +	case SHFL_TYPE_DEV_CHAR:
> +		d_type = DT_CHR;
> +		break;
> +	case SHFL_TYPE_DIRECTORY:
> +		d_type = DT_DIR;
> +		break;
> +	case SHFL_TYPE_DEV_BLOCK:
> +		d_type = DT_BLK;
> +		break;
> +	case SHFL_TYPE_FILE:
> +		d_type = DT_REG;
> +		break;
> +	case SHFL_TYPE_SYMLINK:
> +		d_type = DT_LNK;
> +		break;
> +	case SHFL_TYPE_SOCKET:
> +		d_type = DT_SOCK;
> +		break;
> +	case SHFL_TYPE_WHITEOUT:
> +		d_type = DT_WHT;
> +		break;
> +	default:
> +		d_type = DT_UNKNOWN;
> +		break;
> +	}
> +	return d_type;
> +}

> +/**
> + * Extract element ([dir]->f_pos) from the directory [dir] into [d_name].
> + * Return: 0 or negative errno value.
> + * @dir		Directory to get element at f_pos from
> + * @d_name	Buffer in which to return element name
> + * @d_type	Buffer in which to return element file-type
> + */
> +static int sf_getdent(struct file *dir, char d_name[NAME_MAX], int *d_type)
> +{
> +	struct sf_glob_info *sf_g = GET_GLOB_INFO(file_inode(dir)->i_sb);
> +	struct sf_dir_info *sf_d = dir->private_data;
> +	struct list_head *pos;
> +	loff_t cur = 0;
> +
> +	list_for_each(pos, &sf_d->info_list) {
> +		struct shfl_dirinfo *info;
> +		struct sf_dir_buf *b;
> +		loff_t i;
> +
> +		b = list_entry(pos, struct sf_dir_buf, head);

	list_for_each_entry(b, &sf_d->info_list, head) { , surely?

> +		if (dir->f_pos >= cur + b->entries) {
> +			cur += b->entries;
> +			continue;
> +		}
> +
> +		for (i = 0, info = b->buf; i < dir->f_pos - cur; ++i) {
> +			size_t size;
> +
> +			size = offsetof(struct shfl_dirinfo, name.string) +
> +			       info->name.size;
> +			info = (struct shfl_dirinfo *)((uintptr_t) info + size);

Umm...  How is alignment handled?

One note here: ->f_pos is not your concern.  Leave it alone, just use
ctx->pos.

> +static int sf_dir_iterate(struct file *dir, struct dir_context *ctx)
> +{
> +	for (;;) {
> +		int err;
> +		ino_t fake_ino;
> +		loff_t sanity;
> +		char d_name[NAME_MAX];
> +		int d_type = DT_UNKNOWN;
> +
> +		err = sf_getdent(dir, d_name, &d_type);
> +		switch (err) {
> +		case 1:
> +			return 0;
> +
> +		case 0:
> +			break;
> +
> +		case -1:
> +		default:
> +			/* skip erroneous entry and proceed */
> +			dir->f_pos += 1;
> +			ctx->pos += 1;
> +			continue;
> +		}

Yecchhh....  Took me a while to figure out that it's
		if (unlikely(err)) {	/* EOF or an error */
			if (err == 1)
				return 0;
			/* skip erroneous entry and proceed */
			ctx->pos += 1;
			continue;
		}

> +
> +		/* d_name now contains a valid entry name */
> +		sanity = ctx->pos + 0xbeef;
> +		fake_ino = sanity;
> +		/*
> +		 * On 32 bit systems pos is 64 signed, while ino is 32 bit
> +		 * unsigned so fake_ino may overflow, check for this.
> +		 */
> +		if (sanity - fake_ino) {
> +			vbg_err("vboxsf: can not compute ino\n");
> +			return -EINVAL;
> +		}
> +		if (!dir_emit(ctx, d_name, strlen(d_name), fake_ino, d_type))
> +			return 0;
> +
> +		dir->f_pos += 1;

Again, leave ->f_pos alone; it's caller's responsibility.

> +		ctx->pos += 1;
> +	}
> +}
> +
> +const struct file_operations vboxsf_dir_fops = {
> +	.open = sf_dir_open,
> +	.iterate = sf_dir_iterate,
> +	.release = sf_dir_release,
> +	.read = generic_read_dir,
> +	.llseek = generic_file_llseek,
> +};

> +static struct dentry *sf_lookup(struct inode *parent, struct dentry *dentry,
> +				unsigned int flags)
> +{
> +	struct shfl_fsobjinfo fsinfo;
> +	struct sf_inode_info *sf_i;
> +	struct sf_glob_info *sf_g;
> +	struct inode *inode;
> +	ino_t ino;
> +	int err;
> +
> +	sf_g = GET_GLOB_INFO(parent->i_sb);
> +	sf_i = GET_INODE_INFO(parent);
> +
> +	err = vboxsf_stat_dentry(dentry, &fsinfo);
> +	if (err) {
> +		if (err != -ENOENT)
> +			return ERR_PTR(err);
> +		/*
> +		 * -ENOENT: add NULL inode to dentry so it later can
> +		 * be created via call to create/mkdir/open
> +		 */
> +		inode = NULL;
> +	} else {
> +		ino = iunique(parent->i_sb, 1);
> +		inode = iget_locked(parent->i_sb, ino);
> +		if (!inode)
> +			return ERR_PTR(-ENOMEM);
> +
> +		vboxsf_init_inode(sf_g, inode, &fsinfo);
> +		unlock_new_inode(inode);
> +	}
> +
> +	dentry->d_time = jiffies;
> +	d_set_d_op(dentry, &sf_dentry_ops);
> +	d_add(dentry, inode);

	No.  Just return d_splice_alias(inode, dentry);
What's more, d_set_d_op() is not needed - just set
sb->s_d_op when setting superblock up.  So I would rather do this:

	dentry->d_time = jiffies;
	err = vboxsf_stat_dentry(dentry, &fsinfo);
	if (err) {
		inode = err == -ENOENT ? NULL : PTR_ERR(inode);
	} else {
		ino = iunique(parent->i_sb, 1);
		inode = iget_locked(parent->i_sb, ino);

		if (inode) {
			vboxsf_init_inode(sf_g, inode, &fsinfo);
			unlock_new_inode(inode);
		} else {
			inode = ERR_PTR(-ENOMEM);
		}
	}
	return d_splice_alias(inode, dentry);

Wait, what?  Why do you bother with picking an unique inumber,
followed by icache lookup, anyway?

> +static int sf_instantiate(struct inode *parent, struct dentry *dentry,
> +			  struct shfl_fsobjinfo *info, u64 handle)
> +{
> +	struct sf_glob_info *sf_g = GET_GLOB_INFO(parent->i_sb);
> +	struct sf_inode_info *sf_i;
> +	struct inode *inode;
> +	ino_t ino;
> +
> +	ino = iunique(parent->i_sb, 1);
> +	inode = iget_locked(parent->i_sb, ino);
> +	if (!inode) {
> +		if (handle != SHFL_HANDLE_NIL)
> +			vboxsf_close(sf_g->root, handle);
> +		return -ENOMEM;
> +	}
> +
> +	sf_i = GET_INODE_INFO(inode);
> +	/* the host may have given us different attr then requested */
> +	sf_i->force_restat = 1;
> +	sf_i->handle = handle;
> +	vboxsf_init_inode(sf_g, inode, info);
> +
> +	d_instantiate(dentry, inode);
> +	unlock_new_inode(inode);

	d_instantiate_new, please...
> +
> +	return 0;
> +}

And your use of iget_locked/iunique (both here and in lookup) is... odd.
You try to pick a unique inumber; check that it *is* unique (by icache
lookups), then you try to find an inode with that that inumber in there,
hopefully still finding none, then you allocate an inode, then you
insert it into the icache.

What the hell is that dance for?  To give them unique inumbers?  You
intend the hash lookup to fail; in fact, if it doesn't you are deep
in trouble.  There are many saner ways to get unique numbers allocated
(idr, for one)...

> +static int sf_unlink_aux(struct inode *parent, struct dentry *dentry,
> +			 int is_dir)
> +{
> +	struct sf_glob_info *sf_g = GET_GLOB_INFO(parent->i_sb);
> +	struct sf_inode_info *sf_parent_i = GET_INODE_INFO(parent);
> +	struct inode *inode = d_inode(dentry);
> +	struct shfl_string *path;
> +	uint32_t flags;
> +	int err;
> +
> +	flags = is_dir ? SHFL_REMOVE_DIR : SHFL_REMOVE_FILE;
> +	if (inode && (inode->i_mode & S_IFLNK) == S_IFLNK)
> +		flags |= SHFL_REMOVE_SYMLINK;

Um...  How can we get here with NULL inode and do we really
want to do anything in that case?

> +	path = vboxsf_path_from_dentry(sf_g, dentry);
> +	if (IS_ERR(path))
> +		return PTR_ERR(path);
> +
> +	err = vboxsf_remove(sf_g->root, path, flags);
> +	__putname(path);
> +	if (err)
> +		return err;
> +
> +	/* parent directory access/change time changed */
> +	sf_parent_i->force_restat = 1;
> +
> +	return 0;
> +}
> +
> +/**
> + * Remove a regular file.
> + * Return: 0 or negative errno value.
> + * @parent	inode of the directory
> + * @dentry	directory cache entry
> + */
> +static int sf_unlink(struct inode *parent, struct dentry *dentry)
> +{
> +	return sf_unlink_aux(parent, dentry, 0);
> +}
> +
> +/**
> + * Remove a directory.
> + * Return: 0 or negative errno value.
> + * @parent	inode of the directory
> + * @dentry	directory cache entry
> + */
> +static int sf_rmdir(struct inode *parent, struct dentry *dentry)
> +{
> +	return sf_unlink_aux(parent, dentry, 1);
> +}
> +
> +/**
> + * Rename a regular file / directory.
> + * Return: 0 or negative errno value.
> + * @old_parent	inode of the old parent directory
> + * @old_dentry	old directory cache entry
> + * @new_parent	inode of the new parent directory
> + * @new_dentry	new directory cache entry
> + * @flags	flags
> + */
> +static int sf_rename(struct inode *old_parent, struct dentry *old_dentry,
> +		     struct inode *new_parent, struct dentry *new_dentry,
> +		     unsigned int flags)
> +{
> +	struct sf_glob_info *sf_g = GET_GLOB_INFO(old_parent->i_sb);
> +	struct sf_inode_info *sf_old_parent_i = GET_INODE_INFO(old_parent);
> +	struct sf_inode_info *sf_new_parent_i = GET_INODE_INFO(new_parent);
> +	u32 shfl_flags = SHFL_RENAME_FILE | SHFL_RENAME_REPLACE_IF_EXISTS;
> +	struct shfl_string *old_path, *new_path;
> +	int err;
> +
> +	if (flags)
> +		return -EINVAL;
> +
> +	if (sf_g != GET_GLOB_INFO(new_parent->i_sb))
> +		return -EINVAL;

Huh?  How can that happen?  We are guaranteed that old_parent->i_sb ==
new_parent->i_sb == old_dentry->d_sb == new_dentry->d_sb.  That check
makes no sense...

> +static ssize_t sf_reg_write(struct file *file, const char __user *buf,
> +			    size_t size, loff_t *off)
> +{
> +	struct inode *inode = file_inode(file);
> +	struct sf_inode_info *sf_i = GET_INODE_INFO(inode);
> +	struct sf_glob_info *sf_g = GET_GLOB_INFO(inode->i_sb);
> +	struct sf_reg_info *sf_r = file->private_data;
> +	u32 nwritten;
> +	u64 pos;
> +	int err;
> +
> +	pos = *off;
> +	if (file->f_flags & O_APPEND) {
> +		pos = inode->i_size;
> +		*off = pos;
> +	}

Racy.  lseek() and you are in trouble here.

> +	if (!size)
> +		return 0;
> +
> +	if (size > SHFL_MAX_RW_COUNT)
> +		nwritten = SHFL_MAX_RW_COUNT;
> +	else
> +		nwritten = size;
> +
> +	/* Make sure any pending writes done through mmap are flushed */
> +	err = filemap_fdatawait_range(inode->i_mapping, pos, pos + nwritten);
> +	if (err)
> +		return err;
> +
> +	err = vboxsf_write(sf_g->root, sf_r->handle, pos, &nwritten,
> +			   (uintptr_t)buf, true);

Eww...  "true" being what, "it's a userland address, not a kernel one"?
Almost always a sign of bad interface...

> +	if (err)
> +		return err;
> +
> +	*off += nwritten;
> +	if (*off > inode->i_size)
> +		i_size_write(inode, *off);
> +
> +	/* Invalidate page-cache so that mmap using apps see the changes too */
> +	invalidate_mapping_pages(inode->i_mapping, pos >> PAGE_SHIFT,
> +				 *off >> PAGE_SHIFT);
> +
> +	/* mtime changed */
> +	sf_i->force_restat = 1;
> +	return nwritten;
> +}

> +	/* Already open? */
> +	if (sf_i->handle != SHFL_HANDLE_NIL) {
> +		sf_r->handle = sf_i->handle;
> +		sf_i->handle = SHFL_HANDLE_NIL;
> +		sf_i->file = file;
> +		file->private_data = sf_r;

Obviously racy.

> +	/* the host may have given us different attr then requested */
> +	sf_i->force_restat = 1;
> +	sf_r->handle = params.handle;
> +	sf_i->file = file;

Ditto.  Two opens in parallel and you are in trouble.

> +	file->private_data = sf_r;
> +	return 0;
> +}

> +static int sf_reg_release(struct inode *inode, struct file *file)
> +{
> +	struct sf_reg_info *sf_r;
> +	struct sf_glob_info *sf_g;
> +	struct sf_inode_info *sf_i = GET_INODE_INFO(inode);
> +
> +	sf_g = GET_GLOB_INFO(inode->i_sb);
> +	sf_r = file->private_data;
> +
> +	filemap_write_and_wait(inode->i_mapping);
> +
> +	vboxsf_close(sf_g->root, sf_r->handle);
> +
> +	kfree(sf_r);
> +	sf_i->file = NULL;
> +	sf_i->handle = SHFL_HANDLE_NIL;
> +	file->private_data = NULL;
> +	return 0;
> +}

Also looks confused re multiple overlapping opens...

> +	/**
> +	 * Number of hard links to this filesystem object (st_nlink).
> +	 * This field is 1 if the filesystem doesn't support hardlinking or
> +	 * the information isn't available.
> +	 */
> +	u32 hardlinks;

Wait a sec; what happens if you *do* have hardlinks?  Your lookup will
give each an inode of its own...  Confused...

> +int vboxsf_write(u32 root, u64 handle, u64 offset,
> +		 u32 *buf_len, uintptr_t buf, bool user)
> +{
> +	struct shfl_write parms;
> +	int err;
> +
> +	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
> +	parms.root.u.value32 = root;
> +
> +	parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
> +	parms.handle.u.value64 = handle;
> +	parms.offset.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
> +	parms.offset.u.value64 = offset;
> +	parms.cb.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
> +	parms.cb.u.value32 = *buf_len;
> +	if (user)
> +		parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_IN;
> +	else
> +		parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN;
> +	parms.buffer.u.pointer.size = *buf_len;
> +	parms.buffer.u.pointer.u.linear_addr = buf;
> +
> +	err = vboxsf_call(SHFL_FN_WRITE, &parms, SHFL_CPARMS_WRITE, NULL);
> +
> +	*buf_len = parms.cb.u.value32;
> +	return err;

Egads...  So let me see if I got it straight - your vboxsf_call() is in
the business of going through the process' page tables, etc.?  In
hgcm_call_preprocess_linaddr(), presumably?

Frankly, I would suggest to do an equivalent of that on your own, and
to hell with these kludges - there's no existing users in the kernel
right now and it would be much saner to get rid of the "copy_from_user()
deep in the guts of vboxsf_call()" thing.