On Mon, Apr 26, 2021 at 9:42 PM Gabriel Krisman Bertazi
<krisman@xxxxxxxxxxxxx> wrote:
>
> This adds support for the ring buffer mode in fanotify. It is enabled
> by a new flag FAN_PREALLOC_QUEUE passed to fanotify_init. If this flag
> is enabled, the group only allows marks that support the ring buffer

I don't like this limitation. I think FAN_PREALLOC_QUEUE can work
with other events, why not?

In any case, if we keep the ring buffer, please hook it up through a
fanotify_ring_buffer_ops struct instead of spraying if/else all over
the event queue implementation.

> submission. In a following patch, FAN_ERROR will make use of this
> mechanism.
>
> Signed-off-by: Gabriel Krisman Bertazi <krisman@xxxxxxxxxxxxx>
> ---
>  fs/notify/fanotify/fanotify.c      | 77 +++++++++++++++++++---------
>  fs/notify/fanotify/fanotify_user.c | 81 ++++++++++++++++++------------
>  include/linux/fanotify.h           |  5 +-
>  include/uapi/linux/fanotify.h      |  1 +
>  4 files changed, 105 insertions(+), 59 deletions(-)
>
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index e3669d8a4a64..98591a8155a7 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -612,6 +612,26 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
>  	return event;
>  }
>
> +static struct fanotify_event *fanotify_ring_get_slot(struct fsnotify_group *group,
> +						     u32 mask, const void *data,
> +						     int data_type)
> +{
> +	size_t size = 0;
> +
> +	pr_debug("%s: group=%p mask=%x size=%lu\n", __func__, group, mask, size);
> +
> +	return FANOTIFY_E(fsnotify_ring_alloc_event_slot(group, size));
> +}
> +
> +static void fanotify_ring_write_event(struct fsnotify_group *group,
> +				      struct fanotify_event *event, u32 mask,
> +				      const void *data, __kernel_fsid_t *fsid)
> +{
> +	fanotify_init_event(group, event, 0, mask);
> +
> +	event->pid = get_pid(task_tgid(current));
> +}
> +
>  /*
>   * Get cached fsid of the filesystem containing the object from any connector.
>   * All connectors are supposed to have the same fsid, but we do not verify that
> @@ -701,31 +721,38 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
>  		return 0;
>  	}
>
> -	event = fanotify_alloc_event(group, mask, data, data_type, dir,
> -				     file_name, &fsid);
> -	ret = -ENOMEM;
> -	if (unlikely(!event)) {
> -		/*
> -		 * We don't queue overflow events for permission events as
> -		 * there the access is denied and so no event is in fact lost.
> -		 */
> -		if (!fanotify_is_perm_event(mask))
> -			fsnotify_queue_overflow(group);
> -		goto finish;
> -	}
> -
> -	fsn_event = &event->fse;
> -	ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
> -	if (ret) {
> -		/* Permission events shouldn't be merged */
> -		BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
> -		/* Our event wasn't used in the end. Free it. */
> -		fsnotify_destroy_event(group, fsn_event);
> -
> -		ret = 0;
> -	} else if (fanotify_is_perm_event(mask)) {
> -		ret = fanotify_get_response(group, FANOTIFY_PERM(event),
> -					    iter_info);
> +	if (group->flags & FSN_SUBMISSION_RING_BUFFER) {
> +		event = fanotify_ring_get_slot(group, mask, data, data_type);
> +		if (IS_ERR(event))
> +			return PTR_ERR(event);

So no FAN_Q_OVERFLOW with the ring buffer implementation?
This will be unexpected for fanotify users and frankly, less useful
IMO. I also don't see the technical reason to omit the overflow event.
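Even when the ring is full, the group can still report the drop.
Roughly (just a sketch, reusing the helpers introduced by this patch
and assuming an error from fsnotify_ring_alloc_event_slot() means "no
slot available"):

	if (group->flags & FSN_SUBMISSION_RING_BUFFER) {
		event = fanotify_ring_get_slot(group, mask, data, data_type);
		if (IS_ERR(event)) {
			/*
			 * No free slot - report the lost events with
			 * FAN_Q_OVERFLOW like the regular queue does.
			 */
			if (!fanotify_is_perm_event(mask))
				fsnotify_queue_overflow(group);
			return PTR_ERR(event);
		}
		fanotify_ring_write_event(group, event, mask, data, &fsid);
		fsnotify_ring_commit_slot(group, &event->fse);
	}

And to clarify the ops suggestion above, I mean something in this
spirit (all names below are made up):

	struct fsnotify_queue_ops {
		/* Allocate a new event or grab a pre-allocated slot */
		struct fsnotify_event *(*alloc_event)(struct fsnotify_group *group,
						      u32 mask, const void *data,
						      int data_type);
		/* Publish the event to readers */
		int (*queue_event)(struct fsnotify_group *group,
				   struct fsnotify_event *event);
		/* Release the event after it has been read */
		void (*free_event)(struct fsnotify_group *group,
				   struct fsnotify_event *event);
	};

Then fanotify_handle_event(), fanotify_read() and fanotify_release()
call the group's ops unconditionally and the ring buffer becomes just
another implementation of those ops, instead of a special case in
every path.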
> +		fanotify_ring_write_event(group, event, mask, data, &fsid);
> +		fsnotify_ring_commit_slot(group, &event->fse);
> +	} else {
> +		event = fanotify_alloc_event(group, mask, data, data_type, dir,
> +					     file_name, &fsid);
> +		ret = -ENOMEM;
> +		if (unlikely(!event)) {
> +			/*
> +			 * We don't queue overflow events for permission events as
> +			 * there the access is denied and so no event is in fact lost.
> +			 */
> +			if (!fanotify_is_perm_event(mask))
> +				fsnotify_queue_overflow(group);
> +			goto finish;
> +		}
> +		fsn_event = &event->fse;
> +		ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
> +		if (ret) {
> +			/* Permission events shouldn't be merged */
> +			BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
> +			/* Our event wasn't used in the end. Free it. */
> +			fsnotify_destroy_event(group, fsn_event);
> +
> +			ret = 0;
> +		} else if (fanotify_is_perm_event(mask)) {
> +			ret = fanotify_get_response(group, FANOTIFY_PERM(event),
> +						    iter_info);
> +		}
>  	}
>  finish:
>  	if (fanotify_is_perm_event(mask))
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index fe605359af88..5031198bf7db 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -521,7 +521,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
>  	 * Permission events get queued to wait for response. Other
>  	 * events can be destroyed now.
>  	 */
> -	if (!fanotify_is_perm_event(event->mask)) {
> +	if (group->fanotify_data.flags & FAN_PREALLOC_QUEUE) {
> +		fsnotify_ring_buffer_consume_event(group, &event->fse);
> +	} else if (!fanotify_is_perm_event(event->mask)) {
>  		fsnotify_destroy_event(group, &event->fse);
>  	} else {
>  		if (ret <= 0) {
> @@ -587,40 +589,39 @@ static int fanotify_release(struct inode *ignored, struct file *file)
>  	 */
>  	fsnotify_group_stop_queueing(group);
>
> -	/*
> -	 * Process all permission events on access_list and notification queue
> -	 * and simulate reply from userspace.
> -	 */
> -	spin_lock(&group->notification_lock);
> -	while (!list_empty(&group->fanotify_data.access_list)) {
> -		struct fanotify_perm_event *event;
> -
> -		event = list_first_entry(&group->fanotify_data.access_list,
> -					 struct fanotify_perm_event, fae.fse.list);
> -		list_del_init(&event->fae.fse.list);
> -		finish_permission_event(group, event, FAN_ALLOW);
> +	if (!(group->flags & FSN_SUBMISSION_RING_BUFFER)) {
> +		/*
> +		 * Process all permission events on access_list and notification queue
> +		 * and simulate reply from userspace.
> +		 */
>  		spin_lock(&group->notification_lock);
> -	}
> -
> -	/*
> -	 * Destroy all non-permission events. For permission events just
> -	 * dequeue them and set the response. They will be freed once the
> -	 * response is consumed and fanotify_get_response() returns.
> -	 */
> -	while (!fsnotify_notify_queue_is_empty(group)) {
> -		struct fanotify_event *event;
> -
> -		event = FANOTIFY_E(fsnotify_remove_first_event(group));
> -		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
> -			spin_unlock(&group->notification_lock);
> -			fsnotify_destroy_event(group, &event->fse);
> -		} else {
> -			finish_permission_event(group, FANOTIFY_PERM(event),
> -						FAN_ALLOW);
> +		while (!list_empty(&group->fanotify_data.access_list)) {
> +			struct fanotify_perm_event *event;
> +			event = list_first_entry(&group->fanotify_data.access_list,
> +						 struct fanotify_perm_event, fae.fse.list);
> +			list_del_init(&event->fae.fse.list);
> +			finish_permission_event(group, event, FAN_ALLOW);
> +			spin_lock(&group->notification_lock);
> +		}
> -		spin_lock(&group->notification_lock);
> +		/*
> +		 * Destroy all non-permission events. For permission events just
> +		 * dequeue them and set the response. They will be freed once the
> +		 * response is consumed and fanotify_get_response() returns.
> +		 */
> +		while (!fsnotify_notify_queue_is_empty(group)) {
> +			struct fanotify_event *event;
> +			event = FANOTIFY_E(fsnotify_remove_first_event(group));
> +			if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
> +				spin_unlock(&group->notification_lock);
> +				fsnotify_destroy_event(group, &event->fse);
> +			} else {
> +				finish_permission_event(group, FANOTIFY_PERM(event),
> +							FAN_ALLOW);
> +			}
> +			spin_lock(&group->notification_lock);
> +		}
> +		spin_unlock(&group->notification_lock);
>  	}
> -	spin_unlock(&group->notification_lock);
>
>  	/* Response for all permission events it set, wakeup waiters */
>  	wake_up(&group->fanotify_data.access_waitq);
> @@ -981,6 +982,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>  	if (flags & FAN_NONBLOCK)
>  		f_flags |= O_NONBLOCK;
>
> +	if (flags & FAN_PREALLOC_QUEUE) {
> +		if (!capable(CAP_SYS_ADMIN))
> +			return -EPERM;
> +
> +		if (flags & FAN_UNLIMITED_QUEUE)
> +			return -EINVAL;
> +
> +		fsn_flags = FSN_SUBMISSION_RING_BUFFER;
> +	}
> +
>  	/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
>  	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops, fsn_flags);
>  	if (IS_ERR(group)) {
> @@ -1223,6 +1234,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
>  		goto fput_and_out;
>  	}
>
> +	if ((group->flags & FSN_SUBMISSION_RING_BUFFER) &&
> +	    (mask & ~FANOTIFY_SUBMISSION_BUFFER_EVENTS))
> +		goto fput_and_out;
> +
>  	ret = fanotify_find_path(dfd, pathname, &path, flags,
>  				 (mask & ALL_FSNOTIFY_EVENTS), obj_type);
>  	if (ret)
> @@ -1327,7 +1342,7 @@ SYSCALL32_DEFINE6(fanotify_mark,
>   */
>  static int __init fanotify_user_setup(void)
>  {
> -	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
> +	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
>  	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
>
>  	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
> diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
> index 3e9c56ee651f..5a4cefb4b1c3 100644
> --- a/include/linux/fanotify.h
> +++ b/include/linux/fanotify.h
> @@ -23,7 +23,8 @@
>  #define FANOTIFY_INIT_FLAGS	(FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \
>  				 FAN_REPORT_TID | \
>  				 FAN_CLOEXEC | FAN_NONBLOCK | \
> -				 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
> +				 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS | \
> +				 FAN_PREALLOC_QUEUE)
>
>  #define FANOTIFY_MARK_TYPE_BITS	(FAN_MARK_INODE | FAN_MARK_MOUNT | \
>  				 FAN_MARK_FILESYSTEM)
>
> @@ -71,6 +72,8 @@
>  					 FANOTIFY_PERM_EVENTS | \
>  					 FAN_Q_OVERFLOW | FAN_ONDIR)
>
> +#define FANOTIFY_SUBMISSION_BUFFER_EVENTS	0

FANOTIFY_RING_BUFFER_EVENTS? FANOTIFY_PREALLOC_EVENTS?
Please leave a comment above to state what this group means.

I *think* there is no reason to limit the set of events, only the sort
of information that can be reported with FAN_PREALLOC_QUEUE.
Perhaps FAN_REPORT_FID cannot be allowed and as a result
FANOTIFY_INODE_EVENTS will not be allowed, but I am not even sure
that this limitation is needed.
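For example (the name FANOTIFY_RING_BUFFER_EVENTS and the comment
wording are only my suggestion - the actual set of event bits depends
on what FAN_ERROR ends up needing):

	/*
	 * Events that may be reported by a group initialized with
	 * FAN_PREALLOC_QUEUE, i.e. events that can be submitted via a
	 * pre-allocated ring buffer slot.
	 */
	#define FANOTIFY_RING_BUFFER_EVENTS	0

Thanks,
Amir.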