On Sat, Nov 23, 2024 at 12:00 AM Song Liu <song@xxxxxxxxxx> wrote: > > fanotify filter enables handling fanotify events within the kernel, and > thus saves a trip to the user space. fanotify filter can be useful in > many use cases. For example, if a user is only interested in events for > some files in side a directory, a filter can be used to filter out > irrelevant events. > > fanotify filter is attached to fsnotify_group. At most one filter can > be attached to a fsnotify_group. The attach/detach of filter are > controlled by two new ioctls on the fanotify fds: FAN_IOC_ADD_FILTER > and FAN_IOC_DEL_FILTER. > > fanotify filter is packaged in a kernel module. In the future, it is > also possible to package fanotify filter in a BPF program. Since loading > modules requires CAP_SYS_ADMIN, _loading_ fanotify filter in kernel > modules is limited to CAP_SYS_ADMIN. However, non-SYS_CAP_ADMIN users > can _attach_ filter loaded by sys admin to their fanotify fds. The owner > of the fanotify fitler can use flag FAN_FILTER_F_SYS_ADMIN_ONLY to > make a filter available only to users with CAP_SYS_ADMIN. > > To make fanotify filters more flexible, a filter can take arguments at > attach time. > > sysfs entry /sys/kernel/fanotify_filter is added to help users know > which fanotify filters are available. At the moment, these files are > added for each filter: flags, desc, and init_args. It's a shame that we have fanotify knobs at /proc/sys/fs/fanotify/ and in sysfs, but understand we don't want to make more use of proc for this. Still I would add the filter files under a new /sys/fs/fanotify/ dir and not directly under /sys/kernel/ > > Signed-off-by: Song Liu <song@xxxxxxxxxx> > --- > fs/notify/fanotify/Kconfig | 13 ++ > fs/notify/fanotify/Makefile | 1 + > fs/notify/fanotify/fanotify.c | 44 +++- > fs/notify/fanotify/fanotify_filter.c | 289 +++++++++++++++++++++++++++ > fs/notify/fanotify/fanotify_user.c | 7 + > include/linux/fanotify.h | 128 ++++++++++++ > include/linux/fsnotify_backend.h | 6 +- > include/uapi/linux/fanotify.h | 36 ++++ > 8 files changed, 520 insertions(+), 4 deletions(-) > create mode 100644 fs/notify/fanotify/fanotify_filter.c > > diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig > index 0e36aaf379b7..abfd59d95f49 100644 > --- a/fs/notify/fanotify/Kconfig > +++ b/fs/notify/fanotify/Kconfig > @@ -24,3 +24,16 @@ config FANOTIFY_ACCESS_PERMISSIONS > hierarchical storage management systems. > > If unsure, say N. > + > +config FANOTIFY_FILTER > + bool "fanotify in kernel filter" > + depends on FANOTIFY > + default y > + help > + Say Y here if you want to use fanotify in kernel filter. > + The filter can be implemented in a kernel module or a > + BPF program. The filter can speed up fanotify in many > + use cases. For example, when the listener is only interested in > + a subset of events. > + > + If unsure, say Y. > \ No newline at end of file > diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile > index 25ef222915e5..d95ec0aeffb5 100644 > --- a/fs/notify/fanotify/Makefile > +++ b/fs/notify/fanotify/Makefile > @@ -1,2 +1,3 @@ > # SPDX-License-Identifier: GPL-2.0-only > obj-$(CONFIG_FANOTIFY) += fanotify.o fanotify_user.o > +obj-$(CONFIG_FANOTIFY_FILTER) += fanotify_filter.o > diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c > index 224bccaab4cc..c70184cd2d45 100644 > --- a/fs/notify/fanotify/fanotify.c > +++ b/fs/notify/fanotify/fanotify.c > @@ -18,6 +18,8 @@ > > #include "fanotify.h" > > +extern struct srcu_struct fsnotify_mark_srcu; > + > static bool fanotify_path_equal(const struct path *p1, const struct path *p2) > { > return p1->mnt == p2->mnt && p1->dentry == p2->dentry; > @@ -888,6 +890,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, > struct fsnotify_event *fsn_event; > __kernel_fsid_t fsid = {}; > u32 match_mask = 0; > + struct fanotify_filter_hook *filter_hook __maybe_unused; > > BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); > BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); > @@ -921,6 +924,39 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, > pr_debug("%s: group=%p mask=%x report_mask=%x\n", __func__, > group, mask, match_mask); > > + if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) > + fsid = fanotify_get_fsid(iter_info); > + > +#ifdef CONFIG_FANOTIFY_FILTER > + filter_hook = srcu_dereference(group->fanotify_data.filter_hook, &fsnotify_mark_srcu); Do we actually need the sleeping rcu protection for calling the hook? Can regular rcu read side be nested inside srcu read side? Jan, I don't remember why srcu is needed since we are not holding it when waiting for userspace anymore? > + if (filter_hook) { > + struct fanotify_filter_event filter_event = { > + .mask = mask, > + .data = data, > + .data_type = data_type, > + .dir = dir, > + .file_name = file_name, > + .fsid = &fsid, > + .match_mask = match_mask, > + }; > + > + ret = filter_hook->ops->filter(group, filter_hook, &filter_event); > + > + /* > + * The filter may return > + * - FAN_FILTER_RET_SEND_TO_USERSPACE => continue the rest; > + * - FAN_FILTER_RET_SKIP_EVENT => return 0 now; > + * - < 0 error => return error now. > + * > + * For the latter two cases, we can just return ret. > + */ > + BUILD_BUG_ON(FAN_FILTER_RET_SKIP_EVENT != 0); > + > + if (ret != FAN_FILTER_RET_SEND_TO_USERSPACE) > + return ret; > + } > +#endif > + > if (fanotify_is_perm_event(mask)) { > /* > * fsnotify_prepare_user_wait() fails if we race with mark > @@ -930,9 +966,6 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, > return 0; > } > > - if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) > - fsid = fanotify_get_fsid(iter_info); > - > event = fanotify_alloc_event(group, mask, data, data_type, dir, > file_name, &fsid, match_mask); > ret = -ENOMEM; > @@ -976,6 +1009,11 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) > > if (mempool_initialized(&group->fanotify_data.error_events_pool)) > mempool_exit(&group->fanotify_data.error_events_pool); > + > +#ifdef CONFIG_FANOTIFY_FILTER > + if (group->fanotify_data.filter_hook) > + fanotify_filter_hook_free(group->fanotify_data.filter_hook); > +#endif > } > > static void fanotify_free_path_event(struct fanotify_event *event) > diff --git a/fs/notify/fanotify/fanotify_filter.c b/fs/notify/fanotify/fanotify_filter.c > new file mode 100644 > index 000000000000..9215612e2fcb > --- /dev/null > +++ b/fs/notify/fanotify/fanotify_filter.c > @@ -0,0 +1,289 @@ > +// SPDX-License-Identifier: GPL-2.0 > +#include <linux/fanotify.h> > +#include <linux/kobject.h> > +#include <linux/module.h> > + > +#include "fanotify.h" > + > +extern struct srcu_struct fsnotify_mark_srcu; > + > +static DEFINE_SPINLOCK(filter_list_lock); > +static LIST_HEAD(filter_list); > + > +static struct kobject *fan_filter_root_kobj; > + > +static struct { > + enum fanotify_filter_flags flag; > + const char *name; > +} fanotify_filter_flags_names[] = { > + { > + .flag = FAN_FILTER_F_SYS_ADMIN_ONLY, > + .name = "SYS_ADMIN_ONLY", > + } > +}; > + > +static ssize_t flags_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + struct fanotify_filter_ops *ops; > + ssize_t len = 0; > + int i; > + > + ops = container_of(kobj, struct fanotify_filter_ops, kobj); > + for (i = 0; i < ARRAY_SIZE(fanotify_filter_flags_names); i++) { > + if (ops->flags & fanotify_filter_flags_names[i].flag) { > + len += sysfs_emit_at(buf, len, "%s%s", len ? " " : "", > + fanotify_filter_flags_names[i].name); > + } > + } > + len += sysfs_emit_at(buf, len, "\n"); > + return len; > +} > + > +static ssize_t desc_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + struct fanotify_filter_ops *ops; > + > + ops = container_of(kobj, struct fanotify_filter_ops, kobj); > + > + return sysfs_emit(buf, "%s\n", ops->desc ?: "N/A"); > +} > + > +static ssize_t init_args_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + struct fanotify_filter_ops *ops; > + > + ops = container_of(kobj, struct fanotify_filter_ops, kobj); > + > + return sysfs_emit(buf, "%s\n", ops->init_args ?: "N/A"); > +} > + > +static struct kobj_attribute flags_kobj_attr = __ATTR_RO(flags); > +static struct kobj_attribute desc_kobj_attr = __ATTR_RO(desc); > +static struct kobj_attribute init_args_kobj_attr = __ATTR_RO(init_args); > + > +static struct attribute *fan_filter_attrs[] = { > + &flags_kobj_attr.attr, > + &desc_kobj_attr.attr, > + &init_args_kobj_attr.attr, > + NULL, > +}; > +ATTRIBUTE_GROUPS(fan_filter); > + > +static void fan_filter_kobj_release(struct kobject *kobj) > +{ > +} > + > +static const struct kobj_type fan_filter_ktype = { > + .release = fan_filter_kobj_release, > + .sysfs_ops = &kobj_sysfs_ops, > + .default_groups = fan_filter_groups, > +}; > + > +static struct fanotify_filter_ops *fanotify_filter_find(const char *name) > +{ > + struct fanotify_filter_ops *ops; > + > + list_for_each_entry(ops, &filter_list, list) { > + if (!strcmp(ops->name, name)) > + return ops; > + } > + return NULL; > +} > + > +static void __fanotify_filter_unregister(struct fanotify_filter_ops *ops) > +{ > + spin_lock(&filter_list_lock); > + list_del_init(&ops->list); > + spin_unlock(&filter_list_lock); > +} > + > +/* > + * fanotify_filter_register - Register a new filter. > + * > + * Add a filter to the filter_list. These filter are > + * available for all users in the system. > + * > + * @ops: pointer to fanotify_filter_ops to add. > + * > + * Returns: > + * 0 - on success; > + * -EEXIST - filter of the same name already exists. > + * -ENODEV - fanotify filter was not properly initialized. > + */ > +int fanotify_filter_register(struct fanotify_filter_ops *ops) > +{ > + int ret; > + > + if (!fan_filter_root_kobj) > + return -ENODEV; > + > + spin_lock(&filter_list_lock); > + if (fanotify_filter_find(ops->name)) { > + /* cannot register two filters with the same name */ > + spin_unlock(&filter_list_lock); > + return -EEXIST; > + } > + list_add_tail(&ops->list, &filter_list); > + spin_unlock(&filter_list_lock); > + > + > + kobject_init(&ops->kobj, &fan_filter_ktype); > + ret = kobject_add(&ops->kobj, fan_filter_root_kobj, "%s", ops->name); > + if (ret) { > + __fanotify_filter_unregister(ops); > + return ret; > + } > + return 0; > +} > +EXPORT_SYMBOL_GPL(fanotify_filter_register); > + > +/* > + * fanotify_filter_unregister - Unregister a new filter. > + * > + * Remove a filter from filter_list. > + * > + * @ops: pointer to fanotify_filter_ops to remove. > + */ > +void fanotify_filter_unregister(struct fanotify_filter_ops *ops) > +{ > + kobject_put(&ops->kobj); > + __fanotify_filter_unregister(ops); > +} > +EXPORT_SYMBOL_GPL(fanotify_filter_unregister); > + > +/* > + * fanotify_filter_add - Add a filter to fsnotify_group. > + * > + * Add a filter from filter_list to a fsnotify_group. > + * > + * @group: fsnotify_group that will have add > + * @argp: fanotify_filter_args that specifies the filter > + * and the init arguments of the filter. > + * > + * Returns: > + * 0 - on success; > + * -EEXIST - filter of the same name already exists. > + */ > +int fanotify_filter_add(struct fsnotify_group *group, > + struct fanotify_filter_args __user *argp) > +{ > + struct fanotify_filter_hook *filter_hook; > + struct fanotify_filter_ops *filter_ops; > + struct fanotify_filter_args args; > + void *init_args = NULL; > + int ret = 0; > + > + ret = copy_from_user(&args, argp, sizeof(args)); > + if (ret) > + return -EFAULT; > + > + if (args.init_args_size > FAN_FILTER_ARGS_MAX) > + return -EINVAL; > + > + args.name[FAN_FILTER_NAME_MAX - 1] = '\0'; > + > + fsnotify_group_lock(group); > + > + if (rcu_access_pointer(group->fanotify_data.filter_hook)) { > + fsnotify_group_unlock(group); > + return -EBUSY; > + } > + > + filter_hook = kzalloc(sizeof(*filter_hook), GFP_KERNEL); > + if (!filter_hook) { > + ret = -ENOMEM; > + goto out; > + } > + > + spin_lock(&filter_list_lock); > + filter_ops = fanotify_filter_find(args.name); > + if (!filter_ops || !try_module_get(filter_ops->owner)) { > + spin_unlock(&filter_list_lock); > + ret = -ENOENT; > + goto err_free_hook; > + } > + spin_unlock(&filter_list_lock); > + > + if (!capable(CAP_SYS_ADMIN) && (filter_ops->flags & FAN_FILTER_F_SYS_ADMIN_ONLY)) { 1. feels better to opt-in for UNPRIV (and maybe later on) rather than make it the default. 2. need to check that filter_ops->flags has only "known" flags 3. probably need to add filter_ops->version check in case we want to change the ABI > + ret = -EPERM; > + goto err_module_put; > + } > + > + if (filter_ops->filter_init) { > + if (args.init_args_size != filter_ops->init_args_size) { > + ret = -EINVAL; > + goto err_module_put; > + } > + if (args.init_args_size) { > + init_args = kzalloc(args.init_args_size, GFP_KERNEL); > + if (!init_args) { > + ret = -ENOMEM; > + goto err_module_put; > + } > + if (copy_from_user(init_args, (void __user *)args.init_args, > + args.init_args_size)) { > + ret = -EFAULT; > + goto err_free_args; > + } > + > + } > + ret = filter_ops->filter_init(group, filter_hook, init_args); > + if (ret) > + goto err_free_args; > + kfree(init_args); > + } > + filter_hook->ops = filter_ops; > + rcu_assign_pointer(group->fanotify_data.filter_hook, filter_hook); > + > +out: > + fsnotify_group_unlock(group); > + return ret; > + > +err_free_args: > + kfree(init_args); > +err_module_put: > + module_put(filter_ops->owner); > +err_free_hook: > + kfree(filter_hook); > + goto out; > +} > + > +void fanotify_filter_hook_free(struct fanotify_filter_hook *filter_hook) > +{ > + if (filter_hook->ops->filter_free) > + filter_hook->ops->filter_free(filter_hook); > + > + module_put(filter_hook->ops->owner); > + kfree(filter_hook); > +} > + > +/* > + * fanotify_filter_del - Delete a filter from fsnotify_group. > + */ > +void fanotify_filter_del(struct fsnotify_group *group) > +{ > + struct fanotify_filter_hook *filter_hook; > + > + fsnotify_group_lock(group); > + filter_hook = group->fanotify_data.filter_hook; > + if (!filter_hook) > + goto out; > + > + rcu_assign_pointer(group->fanotify_data.filter_hook, NULL); > + fanotify_filter_hook_free(filter_hook); The read side is protected with srcu and there is no srcu/rcu delay of freeing. You will either need something along the lines of fsnotify_connector_destroy_workfn() with synchronize_srcu() or use regular rcu delay and read side (assuming that it can be nested inside srcu read side?). Thanks, Amir.