Hi Al,
Thank you for the detailed review.
I'll reply to each of your remark below, please keep in mind that
I'm pretty green when it comes to fs development, so there are a lot
of questions from me and there are a lot of things which I still need
to learn .
On 03-04-19 22:49, Al Viro wrote:
On Tue, Apr 02, 2019 at 01:54:21PM +0200, Hans de Goede wrote:
+static int sf_dir_open(struct inode *inode, struct file *file)
+{
+ struct sf_glob_info *sf_g = GET_GLOB_INFO(inode->i_sb);
+ struct shfl_createparms params = {};
+ struct sf_dir_info *sf_d;
+ int err;
+
+ sf_d = vboxsf_dir_info_alloc();
+ if (!sf_d)
+ return -ENOMEM;
+
+ params.handle = SHFL_HANDLE_NIL;
+ params.create_flags = 0
+ | SHFL_CF_DIRECTORY
+ | SHFL_CF_ACT_OPEN_IF_EXISTS
+ | SHFL_CF_ACT_FAIL_IF_NEW | SHFL_CF_ACCESS_READ;
+
+ err = vboxsf_create_at_dentry(file_dentry(file), ¶ms);
+ if (err == 0) {
+ if (params.result == SHFL_FILE_EXISTS) {
+ err = vboxsf_dir_read_all(sf_g, sf_d, params.handle);
+ if (!err)
+ file->private_data = sf_d;
+ } else
+ err = -ENOENT;
+
+ vboxsf_close(sf_g->root, params.handle);
+ }
+
+ if (err)
+ vboxsf_dir_info_free(sf_d);
+
+ return err;
+}
+
+/**
+ * This is called when reference count of [file] goes to zero. Notify
+ * the host that it can free whatever is associated with this directory
+ * and deallocate our own internal buffers
+ * Return: 0 or negative errno value.
+ * @inode inode
+ * @file file
+ */
+static int sf_dir_release(struct inode *inode, struct file *file)
+{
+ if (file->private_data)
+ vboxsf_dir_info_free(file->private_data);
+
+ return 0;
+}
+
+/**
+ * Translate RTFMODE into DT_xxx (in conjunction to rtDirType())
+ * Return: d_type
+ * @mode file mode
+ */
+static int sf_get_d_type(u32 mode)
+{
+ int d_type;
+
+ switch (mode & SHFL_TYPE_MASK) {
+ case SHFL_TYPE_FIFO:
+ d_type = DT_FIFO;
+ break;
+ case SHFL_TYPE_DEV_CHAR:
+ d_type = DT_CHR;
+ break;
+ case SHFL_TYPE_DIRECTORY:
+ d_type = DT_DIR;
+ break;
+ case SHFL_TYPE_DEV_BLOCK:
+ d_type = DT_BLK;
+ break;
+ case SHFL_TYPE_FILE:
+ d_type = DT_REG;
+ break;
+ case SHFL_TYPE_SYMLINK:
+ d_type = DT_LNK;
+ break;
+ case SHFL_TYPE_SOCKET:
+ d_type = DT_SOCK;
+ break;
+ case SHFL_TYPE_WHITEOUT:
+ d_type = DT_WHT;
+ break;
+ default:
+ d_type = DT_UNKNOWN;
+ break;
+ }
+ return d_type;
+}
+/**
+ * Extract element ([dir]->f_pos) from the directory [dir] into [d_name].
+ * Return: 0 or negative errno value.
+ * @dir Directory to get element at f_pos from
+ * @d_name Buffer in which to return element name
+ * @d_type Buffer in which to return element file-type
+ */
+static int sf_getdent(struct file *dir, char d_name[NAME_MAX], int *d_type)
+{
+ struct sf_glob_info *sf_g = GET_GLOB_INFO(file_inode(dir)->i_sb);
+ struct sf_dir_info *sf_d = dir->private_data;
+ struct list_head *pos;
+ loff_t cur = 0;
+
+ list_for_each(pos, &sf_d->info_list) {
+ struct shfl_dirinfo *info;
+ struct sf_dir_buf *b;
+ loff_t i;
+
+ b = list_entry(pos, struct sf_dir_buf, head);
list_for_each_entry(b, &sf_d->info_list, head) { , surely?
Ack will fix for the next version.
+ if (dir->f_pos >= cur + b->entries) {
+ cur += b->entries;
+ continue;
+ }
+
+ for (i = 0, info = b->buf; i < dir->f_pos - cur; ++i) {
+ size_t size;
+
+ size = offsetof(struct shfl_dirinfo, name.string) +
+ info->name.size;
+ info = (struct shfl_dirinfo *)((uintptr_t) info + size);
Umm... How is alignment handled?
Good question, and the answer is, it is not handled. The array of variable
sized shfl_dirinfo structs which we are iterating over here is received
directly from the host/hypervisor with these potential alignment issues in
there.
VirtualBox is x86 only, where this is not really problem. Note that vboxsf
depends in Kconfig on VBOXGUEST which has:
depends on X86 && PCI && INPUT
I can also add a depends on X86 to config VBOXSF_FS to make this clear and
maybe add a comment about the alignment issues to the above piece of code?
One note here: ->f_pos is not your concern. Leave it alone, just use
ctx->pos.
Ok, will fix for the next version.
+static int sf_dir_iterate(struct file *dir, struct dir_context *ctx)
+{
+ for (;;) {
+ int err;
+ ino_t fake_ino;
+ loff_t sanity;
+ char d_name[NAME_MAX];
+ int d_type = DT_UNKNOWN;
+
+ err = sf_getdent(dir, d_name, &d_type);
+ switch (err) {
+ case 1:
+ return 0;
+
+ case 0:
+ break;
+
+ case -1:
+ default:
+ /* skip erroneous entry and proceed */
+ dir->f_pos += 1;
+ ctx->pos += 1;
+ continue;
+ }
Yecchhh.... Took me a while to figure out that it's
if (unlikely(err)) { /* EOF or an error */
if (err == 1)
return 0;
/* skip erroneous entry and proceed */
ctx->pos += 1;
continue;
}
Ok, I will replace this with the code you are suggesting.
+
+ /* d_name now contains a valid entry name */
+ sanity = ctx->pos + 0xbeef;
+ fake_ino = sanity;
+ /*
+ * On 32 bit systems pos is 64 signed, while ino is 32 bit
+ * unsigned so fake_ino may overflow, check for this.
+ */
+ if (sanity - fake_ino) {
+ vbg_err("vboxsf: can not compute ino\n");
+ return -EINVAL;
+ }
+ if (!dir_emit(ctx, d_name, strlen(d_name), fake_ino, d_type))
+ return 0;
+
+ dir->f_pos += 1;
Again, leave ->f_pos alone; it's caller's responsibility.
Ok.
+ ctx->pos += 1;
+ }
+}
+
+const struct file_operations vboxsf_dir_fops = {
+ .open = sf_dir_open,
+ .iterate = sf_dir_iterate,
+ .release = sf_dir_release,
+ .read = generic_read_dir,
+ .llseek = generic_file_llseek,
+};
+static struct dentry *sf_lookup(struct inode *parent, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct shfl_fsobjinfo fsinfo;
+ struct sf_inode_info *sf_i;
+ struct sf_glob_info *sf_g;
+ struct inode *inode;
+ ino_t ino;
+ int err;
+
+ sf_g = GET_GLOB_INFO(parent->i_sb);
+ sf_i = GET_INODE_INFO(parent);
+
+ err = vboxsf_stat_dentry(dentry, &fsinfo);
+ if (err) {
+ if (err != -ENOENT)
+ return ERR_PTR(err);
+ /*
+ * -ENOENT: add NULL inode to dentry so it later can
+ * be created via call to create/mkdir/open
+ */
+ inode = NULL;
+ } else {
+ ino = iunique(parent->i_sb, 1);
+ inode = iget_locked(parent->i_sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ vboxsf_init_inode(sf_g, inode, &fsinfo);
+ unlock_new_inode(inode);
+ }
+
+ dentry->d_time = jiffies;
+ d_set_d_op(dentry, &sf_dentry_ops);
+ d_add(dentry, inode);
No. Just return d_splice_alias(inode, dentry);
What's more, d_set_d_op() is not needed - just set
sb->s_d_op when setting superblock up. So I would rather do this:
dentry->d_time = jiffies;
err = vboxsf_stat_dentry(dentry, &fsinfo);
if (err) {
inode = err == -ENOENT ? NULL : PTR_ERR(inode);
} else {
ino = iunique(parent->i_sb, 1);
inode = iget_locked(parent->i_sb, ino);
if (inode) {
vboxsf_init_inode(sf_g, inode, &fsinfo);
unlock_new_inode(inode);
} else {
inode = ERR_PTR(-ENOMEM);
}
}
return d_splice_alias(inode, dentry);
Wait, what? Why do you bother with picking an unique inumber,
followed by icache lookup, anyway?
I'm afraid I'm a bit out of my depth with fs stuff here, I pretty
much copied this 1:1 from the upstream out-of-tree virtualbox.org
version of the vboxsf code.
There are multiple ways why they might be doing things this way
(speculation):
1) They copy and pasted this from somewhere at a point in time and
it works.
2) They support building there out of tree version against some
quite old kernels, so maybe this is the only way to do this on
those old kernels.
I take it from your reaction that there is a better way to do this,
can you explain what better way you have in mind?
I've done some reading up on this myself and I see now that
iunique can be quite expensive and that as you mention below since
the number should be unique the lookup should always fail
(unless we have a race where another path also called iunique
got the same value and created the inode before us).
So I take it, that it would be better to move to using idr for
generating unique inode numbers per superblock and then just
use that to get the inode-number and directly call new_inode()
+ insert_inode_locked() ?
+static int sf_instantiate(struct inode *parent, struct dentry *dentry,
+ struct shfl_fsobjinfo *info, u64 handle)
+{
+ struct sf_glob_info *sf_g = GET_GLOB_INFO(parent->i_sb);
+ struct sf_inode_info *sf_i;
+ struct inode *inode;
+ ino_t ino;
+
+ ino = iunique(parent->i_sb, 1);
+ inode = iget_locked(parent->i_sb, ino);
+ if (!inode) {
+ if (handle != SHFL_HANDLE_NIL)
+ vboxsf_close(sf_g->root, handle);
+ return -ENOMEM;
+ }
+
+ sf_i = GET_INODE_INFO(inode);
+ /* the host may have given us different attr then requested */
+ sf_i->force_restat = 1;
+ sf_i->handle = handle;
+ vboxsf_init_inode(sf_g, inode, info);
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
d_instantiate_new, please...
Ok.
+
+ return 0;
+}
And your use of iget_locked/iunique (both here and in lookup) is... odd.
You try to pick a unique inumber; check that it *is* unique (by icache
lookups), then you try to find an inode with that that inumber in there,
hopefully still finding none, then you allocate an inode, then you
insert it into the icache.
What the hell is that dance for? To give them unique inumbers? You
intend the hash lookup to fail; in fact, if it doesn't you are deep
in trouble. There are many saner ways to get unique numbers allocated
(idr, for one)...
See above, I'm all for switching to idr.
+static int sf_unlink_aux(struct inode *parent, struct dentry *dentry,
+ int is_dir)
+{
+ struct sf_glob_info *sf_g = GET_GLOB_INFO(parent->i_sb);
+ struct sf_inode_info *sf_parent_i = GET_INODE_INFO(parent);
+ struct inode *inode = d_inode(dentry);
+ struct shfl_string *path;
+ uint32_t flags;
+ int err;
+
+ flags = is_dir ? SHFL_REMOVE_DIR : SHFL_REMOVE_FILE;
+ if (inode && (inode->i_mode & S_IFLNK) == S_IFLNK)
+ flags |= SHFL_REMOVE_SYMLINK;
Um... How can we get here with NULL inode and do we really
want to do anything in that case?
I guess we cannot get here with a NULL inode and the check
can be dropped. The only 2 callers of this are the
rmdir and unlink inode_operations for directory-inodes
if those never get called with a dentry where d_inode(dentry)
may return NULL, then the check can be dropped?
+ path = vboxsf_path_from_dentry(sf_g, dentry);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ err = vboxsf_remove(sf_g->root, path, flags);
+ __putname(path);
+ if (err)
+ return err;
+
+ /* parent directory access/change time changed */
+ sf_parent_i->force_restat = 1;
+
+ return 0;
+}
+
+/**
+ * Remove a regular file.
+ * Return: 0 or negative errno value.
+ * @parent inode of the directory
+ * @dentry directory cache entry
+ */
+static int sf_unlink(struct inode *parent, struct dentry *dentry)
+{
+ return sf_unlink_aux(parent, dentry, 0);
+}
+
+/**
+ * Remove a directory.
+ * Return: 0 or negative errno value.
+ * @parent inode of the directory
+ * @dentry directory cache entry
+ */
+static int sf_rmdir(struct inode *parent, struct dentry *dentry)
+{
+ return sf_unlink_aux(parent, dentry, 1);
+}
+
+/**
+ * Rename a regular file / directory.
+ * Return: 0 or negative errno value.
+ * @old_parent inode of the old parent directory
+ * @old_dentry old directory cache entry
+ * @new_parent inode of the new parent directory
+ * @new_dentry new directory cache entry
+ * @flags flags
+ */
+static int sf_rename(struct inode *old_parent, struct dentry *old_dentry,
+ struct inode *new_parent, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ struct sf_glob_info *sf_g = GET_GLOB_INFO(old_parent->i_sb);
+ struct sf_inode_info *sf_old_parent_i = GET_INODE_INFO(old_parent);
+ struct sf_inode_info *sf_new_parent_i = GET_INODE_INFO(new_parent);
+ u32 shfl_flags = SHFL_RENAME_FILE | SHFL_RENAME_REPLACE_IF_EXISTS;
+ struct shfl_string *old_path, *new_path;
+ int err;
+
+ if (flags)
+ return -EINVAL;
+
+ if (sf_g != GET_GLOB_INFO(new_parent->i_sb))
+ return -EINVAL;
Huh? How can that happen? We are guaranteed that old_parent->i_sb ==
new_parent->i_sb == old_dentry->d_sb == new_dentry->d_sb. That check
makes no sense...
Ok I will drop the check for the next version.
+static ssize_t sf_reg_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *off)
+{
+ struct inode *inode = file_inode(file);
+ struct sf_inode_info *sf_i = GET_INODE_INFO(inode);
+ struct sf_glob_info *sf_g = GET_GLOB_INFO(inode->i_sb);
+ struct sf_reg_info *sf_r = file->private_data;
+ u32 nwritten;
+ u64 pos;
+ int err;
+
+ pos = *off;
+ if (file->f_flags & O_APPEND) {
+ pos = inode->i_size;
+ *off = pos;
+ }
Racy. lseek() and you are in trouble here.
Ok, so we do pass the APPEND flag to the host, so I guess we
could just rely on the host getting this right, but then how
do update i_size and pos after the write?
Do a stat to get the new size and then store that in pos?
+ if (!size)
+ return 0;
+
+ if (size > SHFL_MAX_RW_COUNT)
+ nwritten = SHFL_MAX_RW_COUNT;
+ else
+ nwritten = size;
+
+ /* Make sure any pending writes done through mmap are flushed */
+ err = filemap_fdatawait_range(inode->i_mapping, pos, pos + nwritten);
+ if (err)
+ return err;
+
+ err = vboxsf_write(sf_g->root, sf_r->handle, pos, &nwritten,
+ (uintptr_t)buf, true);
Eww... "true" being what, "it's a userland address, not a kernel one"?
Yes
Almost always a sign of bad interface...
See my reply to your comment on this at the end.
+ if (err)
+ return err;
+
+ *off += nwritten;
+ if (*off > inode->i_size)
+ i_size_write(inode, *off);
+
+ /* Invalidate page-cache so that mmap using apps see the changes too */
+ invalidate_mapping_pages(inode->i_mapping, pos >> PAGE_SHIFT,
+ *off >> PAGE_SHIFT);
+
+ /* mtime changed */
+ sf_i->force_restat = 1;
+ return nwritten;
+}
+ /* Already open? */
+ if (sf_i->handle != SHFL_HANDLE_NIL) {
+ sf_r->handle = sf_i->handle;
+ sf_i->handle = SHFL_HANDLE_NIL;
+ sf_i->file = file;
+ file->private_data = sf_r;
Obviously racy.
Ok, so how do we fix this? Add a mutex to the private
struct sf_inode_info and take that here and in other
relevant places I guess?
+ /* the host may have given us different attr then requested */
+ sf_i->force_restat = 1;
+ sf_r->handle = params.handle;
+ sf_i->file = file;
Ditto. Two opens in parallel and you are in trouble.
I assume what you fall over there is the "sf_i->file = file" which
is a hack which is purely implemented to have a handle for writepage.
Looking at this closer, I think the right thing todo might be to
implement our own .mmap callback + vma_close callback and then
refcount the handle and store it in vm_private_data so that we
can get it from there rather then having this hack of storing
a struct file * in the inode which as you rightly point out is
obviously buggy.
What do you think of using the vm_private_data approach?
+ file->private_data = sf_r;
+ return 0;
+}
+static int sf_reg_release(struct inode *inode, struct file *file)
+{
+ struct sf_reg_info *sf_r;
+ struct sf_glob_info *sf_g;
+ struct sf_inode_info *sf_i = GET_INODE_INFO(inode);
+
+ sf_g = GET_GLOB_INFO(inode->i_sb);
+ sf_r = file->private_data;
+
+ filemap_write_and_wait(inode->i_mapping);
+
+ vboxsf_close(sf_g->root, sf_r->handle);
+
+ kfree(sf_r);
+ sf_i->file = NULL;
+ sf_i->handle = SHFL_HANDLE_NIL;
+ file->private_data = NULL;
+ return 0;
+}
Also looks confused re multiple overlapping opens...
Ack.
+ /**
+ * Number of hard links to this filesystem object (st_nlink).
+ * This field is 1 if the filesystem doesn't support hardlinking or
+ * the information isn't available.
+ */
+ u32 hardlinks;
Wait a sec; what happens if you *do* have hardlinks? Your lookup will
give each an inode of its own... Confused...
Right, note that this data is not used, it is possible to ask
the host this *if* the host is running on an OS which provides
hardlinks. Since this is not always the case the vboxsf code
does not use this and instead always assumes the host does
not support hardlinks and thus treats each file as unique.
+int vboxsf_write(u32 root, u64 handle, u64 offset,
+ u32 *buf_len, uintptr_t buf, bool user)
+{
+ struct shfl_write parms;
+ int err;
+
+ parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
+ parms.root.u.value32 = root;
+
+ parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
+ parms.handle.u.value64 = handle;
+ parms.offset.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
+ parms.offset.u.value64 = offset;
+ parms.cb.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
+ parms.cb.u.value32 = *buf_len;
+ if (user)
+ parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_IN;
+ else
+ parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN;
+ parms.buffer.u.pointer.size = *buf_len;
+ parms.buffer.u.pointer.u.linear_addr = buf;
+
+ err = vboxsf_call(SHFL_FN_WRITE, &parms, SHFL_CPARMS_WRITE, NULL);
+
+ *buf_len = parms.cb.u.value32;
+ return err;
Egads... So let me see if I got it straight - your vboxsf_call() is in
the business of going through the process' page tables, etc.? In
hgcm_call_preprocess_linaddr(), presumably?
Frankly, I would suggest to do an equivalent of that on your own, and
to hell with these kludges - there's no existing users in the kernel
right now and it would be much saner to get rid of the "copy_from_user()
deep in the guts of vboxsf_call()" thing.
So the way vboxsf works is it makes hypervisor calls, called HCM calls to
the host which on the host then translate to open / read / write / readdir,
etc.
The vboxsf driver uses the vboxguest drivers code to actuall make these calls.
The vboxguest driver also offers the ability to make these call to userspace
through a /dev/vboxguest device. The bounce-buffer and copy_from_user()
guts as you call them are necessary to support the /dev/vboxguest ioctls for
this, so those are not going away.
VirtualBox upstream has very recently (about a month ago) moved toward
using get_user_pages and then passing a list of pages to the host for
regular file reads / writes replacing the need for the user flag here.
But this is purely a performance optimization and adds a bunch of extra
code. So TBH I would perfer to keep the ugly user flag and use that for
regular file read/write ops for now; and instead focus on fixing the other
issues which you have mentioned.
Regards,
Hans