On Mon, Oct 25, 2021 at 11:47 PM Ioannis Angelakopoulos <iangelak@xxxxxxxxxx> wrote: > > Every time a local watch is placed/modified/removed on/from an inode the > same operation has to take place in the FUSE server. > > Thus add the inode operation "fuse_fsnotify_update_mark", which is > specific to FUSE inodes. This operation is called from the > "inotify_add_watch" system call in the inotify subsystem. > > Specifically, the operation is called when a process tries to add, modify > or remove a watch from a FUSE inode and the remote fsnotify support is > enabled both in the guest kernel and the FUSE server (virtiofsd). > > Essentially, when the kernel adds/modifies a watch locally, also send a > fsnotify request to the FUSE server to do the same. We keep the local watch > placement since it is essential for the functionality of the fsnotify > notification subsystem. However, the local events generated by the guest > kernel will be suppressed if they affect FUSE inodes and the remote > fsnotify support is enabled. > > Also modify the "fsnotify_detach_mark" function in fs/notify/mark.c to add > support for the remote deletion of watches for FUSE inodes. In contrast to > the add/modify operation we do not modify the inotify subsystem, but the > fsnotify subsystem. That is because there are two ways of deleting a watch > from an inode. The first is by manually calling the "inotify_rm_watch" > system call and the second is automatically by the kernel when the process > that has created an inotify instance exits. In that case the kernel is > responsible for deleting all the watches corresponding to said inotify > instance. > > Thus we send the fsnotify request for the deletion of the remote watch at > the lowest level within "fsnotify_detach_mark" to catch both watch removal > cases. > > The "fuse_fsnotify_update_mark" function in turn calls the > "fuse_fsnotify_send_request" function, to send an fsnotify request to the > FUSE server related to an inode watch. > > > Signed-off-by: Ioannis Angelakopoulos <iangelak@xxxxxxxxxx> > --- > fs/fuse/dir.c | 29 +++++++++++++++++++++++++++++ > fs/notify/inotify/inotify_user.c | 11 +++++++++++ > fs/notify/mark.c | 10 ++++++++++ > include/linux/fs.h | 2 ++ > 4 files changed, 52 insertions(+) > > diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c > index d9b977c0f38d..f666aafc8d3f 100644 > --- a/fs/fuse/dir.c > +++ b/fs/fuse/dir.c > @@ -17,6 +17,8 @@ > #include <linux/xattr.h> > #include <linux/iversion.h> > #include <linux/posix_acl.h> > +#include <linux/fsnotify_backend.h> > +#include <linux/inotify.h> > > static void fuse_advise_use_readdirplus(struct inode *dir) > { > @@ -1805,6 +1807,30 @@ static int fuse_getattr(struct user_namespace *mnt_userns, > return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); > } > > +static int fuse_fsnotify_update_mark(struct inode *inode, uint32_t action, > + uint64_t group, uint32_t mask) > +{ > + /* > + * We have to remove the bits added to the mask before being attached > + * or detached to the inode, since these bits are going to be > + * added by the "remote" host kernel. If these bits were still enabled > + * in the mask that was sent to the "remote" kernel then the watch would > + * be rejected as an unsupported value. These bits are added by the > + * fsnotify subsystem thus we use the corresponding fsnotify bits here. > + */ > + mask = mask & ~(FS_IN_IGNORED | FS_UNMOUNT | FS_IN_ONESHOT | > + FS_EXCL_UNLINK | FS_EVENT_ON_CHILD); > + > + if (!(mask & IN_ALL_EVENTS)) > + return -EINVAL; > + > + /* > + * Action 0: Remove a watch > + * Action 1: Add/Modify watch > + */ > + return fuse_fsnotify_send_request(inode, mask, action, group); > +} > + > static const struct inode_operations fuse_dir_inode_operations = { > .lookup = fuse_lookup, > .mkdir = fuse_mkdir, > @@ -1824,6 +1850,7 @@ static const struct inode_operations fuse_dir_inode_operations = { > .set_acl = fuse_set_acl, > .fileattr_get = fuse_fileattr_get, > .fileattr_set = fuse_fileattr_set, > + .fsnotify_update = fuse_fsnotify_update_mark, > }; > > static const struct file_operations fuse_dir_operations = { > @@ -1846,6 +1873,7 @@ static const struct inode_operations fuse_common_inode_operations = { > .set_acl = fuse_set_acl, > .fileattr_get = fuse_fileattr_get, > .fileattr_set = fuse_fileattr_set, > + .fsnotify_update = fuse_fsnotify_update_mark, > }; > > static const struct inode_operations fuse_symlink_inode_operations = { > @@ -1853,6 +1881,7 @@ static const struct inode_operations fuse_symlink_inode_operations = { > .get_link = fuse_get_link, > .getattr = fuse_getattr, > .listxattr = fuse_listxattr, > + .fsnotify_update = fuse_fsnotify_update_mark, > }; > > void fuse_init_common(struct inode *inode) > diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c > index 62051247f6d2..3a0fee09a7c3 100644 > --- a/fs/notify/inotify/inotify_user.c > +++ b/fs/notify/inotify/inotify_user.c > @@ -46,6 +46,8 @@ > #define INOTIFY_WATCH_COST (sizeof(struct inotify_inode_mark) + \ > 2 * sizeof(struct inode)) > > +#define FSNOTIFY_ADD_MODIFY_MARK 1 > + > /* configurable via /proc/sys/fs/inotify/ */ > static int inotify_max_queued_events __read_mostly; > > @@ -764,6 +766,15 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, > > /* create/update an inode mark */ > ret = inotify_update_watch(group, inode, mask); > + /* > + * If the inode belongs to a remote filesystem/server that supports > + * remote inotify events then send the mark to the remote server > + */ > + if (ret >= 0 && inode->i_op->fsnotify_update) { > + inode->i_op->fsnotify_update(inode, > + FSNOTIFY_ADD_MODIFY_MARK, > + (uint64_t)group, mask); > + } > path_put(&path); > fput_and_out: > fdput(f); > diff --git a/fs/notify/mark.c b/fs/notify/mark.c > index fa1d99101f89..f0d37276afcb 100644 > --- a/fs/notify/mark.c > +++ b/fs/notify/mark.c > @@ -77,6 +77,7 @@ > #include "fsnotify.h" > > #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ > +#define FSNOTIFY_DELETE_MARK 0 /* Delete a mark in remote fsnotify */ This define is part of the vfs API it should be in an include file along side FSNOTIFY_ADD_MODIFY_MARK (if we keep them in the API). > > struct srcu_struct fsnotify_mark_srcu; > struct kmem_cache *fsnotify_mark_connector_cachep; > @@ -399,6 +400,7 @@ void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) > void fsnotify_detach_mark(struct fsnotify_mark *mark) > { > struct fsnotify_group *group = mark->group; > + struct inode *inode = NULL; > > WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); > WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && > @@ -411,6 +413,14 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) > spin_unlock(&mark->lock); > return; > } > + > + /* Only if the object is an inode send a request to FUSE server */ > + inode = fsnotify_conn_inode(mark->connector); > + if (inode && inode->i_op->fsnotify_update) { > + inode->i_op->fsnotify_update(inode, FSNOTIFY_DELETE_MARK, > + (uint64_t)group, mark->mask); > + } > + > mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; > list_del_init(&mark->g_list); > spin_unlock(&mark->lock); > diff --git a/include/linux/fs.h b/include/linux/fs.h > index e7a633353fd2..86bcc44e3ab8 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -2149,6 +2149,8 @@ struct inode_operations { > int (*fileattr_set)(struct user_namespace *mnt_userns, > struct dentry *dentry, struct fileattr *fa); > int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); > + int (*fsnotify_update)(struct inode *inode, uint32_t action, > + uint64_t group, uint32_t mask); > } ____cacheline_aligned; > Please split the patch that introduces the API from the FUSE implementation. Regarding the API, group does not belong in this interface. The inode object has an "aggregated mask" at i_fsnotify_mask indicating an interest for an event from any group. Remote servers should be notified when the aggregated mask changes. Hence, Miklos has proposed a "remote fsnotify update" API which does not carry the mask nor the action, only the watched object: https://lore.kernel.org/linux-fsdevel/20190501205541.GC30899@xxxxxxxxxxxxxxxxxxxxxxxxxx/ On that same thread, you will see that I also proposed the API to support full filesystem watch (by passing sb). I am not requiring that you implement sb watch for FUSE/virtiofs, but the API should take this future extension into account. Thanks, Amir.