On Tue, Jun 29, 2021 at 10:13 PM Gabriel Krisman Bertazi <krisman@xxxxxxxxxxxxx> wrote: > > The FAN_FS_ERROR event is a new inode event used by filesystem wide > monitoring tools to receive notifications of type FS_ERROR_EVENT, > emitted directly by filesystems when a problem is detected. The error > notification includes a generic error descriptor and a FID identifying > the file affected. > > FID is sent for every FAN_FS_ERROR. Errors not linked to a regular inode > are reported against the root inode. > > An error reporting structure is attached per-mark, and only a single > error can be stored at a time. This is ok, since once an error occurs, > it is common for a stream of related errors to be reported. We only log > accumulate the total of errors occurred since the last notification. > > Signed-off-by: Gabriel Krisman Bertazi <krisman@xxxxxxxxxxxxx> > > --- > Changes since v2: > - Support and equire FID mode (amir) > - Goto error path instead of early return (amir) > - Simplify get_one_event (me) > - Base merging on error_count > - drop fanotify_queue_error_event > > Changes since v1: > - Pass dentry to fanotify_check_fsid (Amir) > - FANOTIFY_EVENT_TYPE_ERROR -> FANOTIFY_EVENT_TYPE_FS_ERROR > - Merge previous patch into it > - Use a single slot > - Move fanotify_mark.error_event definition to this commit > - Rename FAN_ERROR -> FAN_FS_ERROR > - Restrict FAN_FS_ERROR to FAN_MARK_FILESYSTEM > --- > fs/notify/fanotify/fanotify.c | 108 +++++++++++++++++++++++------ > fs/notify/fanotify/fanotify.h | 53 ++++++++++++++ > fs/notify/fanotify/fanotify_user.c | 97 +++++++++++++++++++++++++- > include/linux/fanotify.h | 11 ++- > include/uapi/linux/fanotify.h | 8 +++ > 5 files changed, 253 insertions(+), 24 deletions(-) > > diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c > index 769703ef2b9a..854adcbeb95d 100644 > --- a/fs/notify/fanotify/fanotify.c > +++ b/fs/notify/fanotify/fanotify.c > @@ -336,23 +336,6 @@ static u32 fanotify_group_event_mask( > return test_mask & user_mask; > } > > -/* > - * Check size needed to encode fanotify_fh. > - * > - * Return size of encoded fh without fanotify_fh header. > - * Return 0 on failure to encode. > - */ > -static int fanotify_encode_fh_len(struct inode *inode) > -{ > - int dwords = 0; > - > - if (!inode) > - return 0; > - > - exportfs_encode_inode_fh(inode, NULL, &dwords, NULL); > - > - return dwords << 2; > -} > > /* > * Encode fanotify_fh. > @@ -405,8 +388,12 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, > fh->type = type; > fh->len = fh_len; > > - /* Mix fh into event merge key */ > - *hash ^= fanotify_hash_fh(fh); > + /* > + * Mix fh into event merge key. Hash might be NULL in case of > + * unhashed FID events (i.e. FAN_FS_ERROR). > + */ > + if (hash) > + *hash ^= fanotify_hash_fh(fh); > > return FANOTIFY_FH_HDR_LEN + fh_len; > > @@ -692,6 +679,60 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) > return fsid; > } > > +static int fanotify_merge_error_event(struct fsnotify_group *group, > + struct fsnotify_event *event) > +{ > + struct fanotify_event *fae = FANOTIFY_E(event); > + > + /* > + * When err_count > 0, the reporting slot is full. Just account > + * the additional error and abort the insertion. > + */ > + if (atomic_fetch_inc(&FANOTIFY_EE(fae)->err_count) != 0) > + return 1; > + I'd like to address the review of this patch in two parts. First, I will comments on minor technical details. Stay tuned for another reply regarding this merge() method... > + return 0; > +} > + > +static void fanotify_insert_error_event(struct fsnotify_group *group, > + struct fsnotify_event *event, > + const void *data) > +{ > + const struct fsnotify_event_info *ei = > + (struct fsnotify_event_info *) data; > + struct fanotify_event *fae = FANOTIFY_E(event); > + const struct fs_error_report *report; > + struct fanotify_error_event *fee; > + struct inode *inode; > + int fh_len; > + > + /* This might be an unexpected type of event (i.e. overflow). */ > + if (!fanotify_is_error_event(fae->mask)) > + return; > + > + report = (struct fs_error_report *) ei->data; > + inode = report->inode ?: ei->sb->s_root->d_inode; > + > + fee = FANOTIFY_EE(fae); > + fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR; > + fee->error = report->error; > + fee->fsid = fee->mark->connector->fsid; > + > + fsnotify_get_mark(fee->mark); > + > + /* > + * Error reporting needs to happen in atomic context. If this > + * inode's file handler is more than we initially predicted, > + * there is nothing better we can do than report the error with > + * a bad FH. > + */ > + fh_len = fanotify_encode_fh_len(inode); > + if (WARN_ON(fh_len > fee->max_fh_len)) WARN_ON() is not acceptable for things that can logically happen if you think this is important you could use pr_warn_ratelimited() like we do in fanotify_encode_fh(), but since fs-monitor will observe the lack of FID anyway, I think there is little point in reporting this to kmsg. > + return; > + > + fanotify_encode_fh(&fee->object_fh, inode, fh_len, NULL, 0); > +} > + > /* > * Add an event to hash table for faster merge. > */ > @@ -742,8 +783,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, > BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); > BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); > BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); > + BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); > > - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); > + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); > > mask = fanotify_group_event_mask(group, mask, event_info, iter_info); > if (!mask) > @@ -767,6 +809,17 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, > return 0; > } > > + if (fanotify_is_error_event(mask)) { > + struct fanotify_sb_mark *sb_mark = > + FANOTIFY_SB_MARK(fsnotify_iter_sb_mark(iter_info)); > + > + ret = fsnotify_add_event(group, &sb_mark->error_event->fae.fse, > + fanotify_merge_error_event, > + fanotify_insert_error_event, > + event_info); > + goto finish; > + } > + > event = fanotify_alloc_event(group, mask, event_info, &fsid); > ret = -ENOMEM; > if (unlikely(!event)) { > @@ -834,6 +887,17 @@ static void fanotify_free_name_event(struct fanotify_event *event) > kfree(FANOTIFY_NE(event)); > } > > +static void fanotify_free_error_event(struct fanotify_event *event) > +{ > + /* > + * Just drop the reference acquired by > + * fanotify_insert_error_event. > + * > + * The actual memory is freed with the mark. > + */ > + fsnotify_put_mark(FANOTIFY_EE(event)->mark); > +} > + > static void fanotify_free_event(struct fsnotify_event *fsn_event) > { > struct fanotify_event *event; > @@ -856,6 +920,9 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) > case FANOTIFY_EVENT_TYPE_OVERFLOW: > kfree(event); > break; > + case FANOTIFY_EVENT_TYPE_FS_ERROR: > + fanotify_free_error_event(event); > + break; > default: > WARN_ON_ONCE(1); > } > @@ -873,6 +940,7 @@ static void fanotify_free_mark(struct fsnotify_mark *mark) > if (mark->flags & FSNOTIFY_MARK_FLAG_SB) { > struct fanotify_sb_mark *fa_mark = FANOTIFY_SB_MARK(mark); > > + kfree(fa_mark->error_event); > kmem_cache_free(fanotify_sb_mark_cache, fa_mark); > } else { > kmem_cache_free(fanotify_mark_cache, mark); > diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h > index 2e005b3a75f2..0259d631b8d8 100644 > --- a/fs/notify/fanotify/fanotify.h > +++ b/fs/notify/fanotify/fanotify.h > @@ -132,6 +132,7 @@ static inline void fanotify_info_copy_name(struct fanotify_info *info, > > struct fanotify_sb_mark { > struct fsnotify_mark fsn_mark; > + struct fanotify_error_event *error_event; > }; > > static inline > @@ -154,6 +155,7 @@ enum fanotify_event_type { > FANOTIFY_EVENT_TYPE_PATH, > FANOTIFY_EVENT_TYPE_PATH_PERM, > FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ > + FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */ > __FANOTIFY_EVENT_TYPE_NUM > }; > > @@ -209,12 +211,37 @@ FANOTIFY_NE(struct fanotify_event *event) > return container_of(event, struct fanotify_name_event, fae); > } > > +struct fanotify_error_event { > + struct fanotify_event fae; > + s32 error; /* Error reported by the Filesystem. */ > + atomic_t err_count; /* Suppressed errors count */ > + __kernel_fsid_t fsid; /* FSID this error refers to. */ > + > + struct fsnotify_mark *mark; /* Back reference to the mark. */ > + int max_fh_len; /* Maximum object_fh buffer size. */ > + > + /* > + * object_fh is followed by a variable sized buffer, so it must > + * be the last element of this structure. > + */ > + struct fanotify_fh object_fh; > +}; > + > + > +static inline struct fanotify_error_event * > +FANOTIFY_EE(struct fanotify_event *event) > +{ > + return container_of(event, struct fanotify_error_event, fae); > +} > + > static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event) > { > if (event->type == FANOTIFY_EVENT_TYPE_FID) > return &FANOTIFY_FE(event)->fsid; > else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) > return &FANOTIFY_NE(event)->fsid; > + else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) > + return &FANOTIFY_EE(event)->fsid; > else > return NULL; > } > @@ -226,6 +253,8 @@ static inline struct fanotify_fh *fanotify_event_object_fh( > return &FANOTIFY_FE(event)->object_fh; > else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) > return fanotify_info_file_fh(&FANOTIFY_NE(event)->info); > + else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) > + return &FANOTIFY_EE(event)->object_fh; > else > return NULL; > } > @@ -300,6 +329,11 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) > return container_of(fse, struct fanotify_event, fse); > } > > +static inline bool fanotify_is_error_event(u32 mask) > +{ > + return mask & FANOTIFY_ERROR_EVENTS; > +} > + > static inline bool fanotify_event_has_path(struct fanotify_event *event) > { > return event->type == FANOTIFY_EVENT_TYPE_PATH || > @@ -329,6 +363,7 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event) > static inline bool fanotify_is_hashed_event(u32 mask) > { > return !(fanotify_is_perm_event(mask) || > + fanotify_is_error_event(mask) || > fsnotify_is_overflow_event(mask)); > } > > @@ -338,3 +373,21 @@ static inline unsigned int fanotify_event_hash_bucket( > { > return event->hash & FANOTIFY_HTABLE_MASK; > } > + > +/* > + * Check size needed to encode fanotify_fh. > + * > + * Return size of encoded fh without fanotify_fh header. > + * Return 0 on failure to encode. > + */ > +static inline int fanotify_encode_fh_len(struct inode *inode) > +{ > + int dwords = 0; > + > + if (!inode) > + return 0; > + > + exportfs_encode_inode_fh(inode, NULL, &dwords, NULL); > + > + return dwords << 2; > +} > diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c > index a42521e140e6..8bcce26e21df 100644 > --- a/fs/notify/fanotify/fanotify_user.c > +++ b/fs/notify/fanotify/fanotify_user.c > @@ -107,6 +107,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; > #define FANOTIFY_EVENT_ALIGN 4 > #define FANOTIFY_INFO_HDR_LEN \ > (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) > +#define FANOTIFY_INFO_ERROR_LEN \ > + (sizeof(struct fanotify_event_info_error)) > > static int fanotify_fid_info_len(int fh_len, int name_len) > { > @@ -130,6 +132,9 @@ static size_t fanotify_event_len(struct fanotify_event *event, > if (!fid_mode) > return event_len; > > + if (fanotify_is_error_event(event->mask)) > + event_len += FANOTIFY_INFO_ERROR_LEN; > + > info = fanotify_event_info(event); > dir_fh_len = fanotify_event_dir_fh_len(event); > fh_len = fanotify_event_object_fh_len(event); > @@ -310,6 +315,30 @@ static int process_access_response(struct fsnotify_group *group, > return -ENOENT; > } > > +static size_t copy_error_info_to_user(struct fanotify_event *event, > + char __user *buf, int count) > +{ > + struct fanotify_event_info_error info; > + struct fanotify_error_event *fee = FANOTIFY_EE(event); > + > + info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; > + info.hdr.pad = 0; > + info.hdr.len = sizeof(struct fanotify_event_info_error); > + > + if (WARN_ON(count < info.hdr.len)) > + return -EFAULT; > + > + info.error = fee->error; > + > + /* This effectively releases the event for logging another error */ > + info.error_count = atomic_xchg(&fee->err_count, 0); > + > + if (copy_to_user(buf, &info, sizeof(info))) > + return -EFAULT; > + > + return info.hdr.len; > +} > + > static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, > int info_type, const char *name, size_t name_len, > char __user *buf, size_t count) > @@ -468,6 +497,14 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, > if (f) > fd_install(fd, f); > > + if (fanotify_is_error_event(event->mask)) { > + ret = copy_error_info_to_user(event, buf, count); > + if (ret < 0) > + goto out_close_fd; > + buf += ret; > + count -= ret; > + } > + > /* Event info records order is: dir fid + name, child fid */ > if (fanotify_event_dir_fh_len(event)) { > info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : > @@ -896,6 +933,43 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, > flags, umask); > } > > +static int fanotify_create_fs_error_event(struct fsnotify_mark *fsn_mark, > + fsnotify_connp_t *connp) > +{ > + struct fanotify_sb_mark *sb_mark = FANOTIFY_SB_MARK(fsn_mark); > + struct super_block *sb = > + container_of(connp, struct super_block, s_fsnotify_marks); > + struct fanotify_error_event *fee; > + int fh_len; > + > + /* > + * Since the allocation is done holding group->mark_mutex, the > + * error event allocation is guaranteed not to race with itself. If this is protected by a mutex then READ_ONCE/WRITE_ONCE are not need and the comment above is confusing. You should fire your code reviewer ;-) > + */ > + if (READ_ONCE(sb_mark->error_event)) > + return 0; > + > + /* Since, for error events, every memory must be preallocated, > + * the FH buffer size is predicted to be the same as the root > + * inode file handler size. This should work for file systems > + * without variable sized FH. > + */ > + fh_len = fanotify_encode_fh_len(sb->s_root->d_inode); > + > + fee = kzalloc(sizeof(*fee) + fh_len, GFP_KERNEL); GFP_KERNEL_ACCOUNT > + if (!fee) > + return -ENOMEM; > + > + fanotify_init_event(&fee->fae, 0, FS_ERROR); > + fee->mark = fsn_mark; > + fee->max_fh_len = fh_len; > + > + WRITE_ONCE(sb_mark->error_event, fee); > + > + return 0; > + > +} > + > static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, > __u32 mask, > unsigned int flags) > @@ -994,6 +1068,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, > { > struct fsnotify_mark *fsn_mark; > __u32 added; > + int ret = 0; > > mutex_lock(&group->mark_mutex); > fsn_mark = fsnotify_find_mark(connp, group); > @@ -1004,13 +1079,28 @@ static int fanotify_add_mark(struct fsnotify_group *group, > return PTR_ERR(fsn_mark); > } > } > + > + /* > + * Error events are allocated per super-block mark, but only if > + * strictly needed (i.e. FAN_FS_ERROR was requested). > + */ > + if (type == FSNOTIFY_OBJ_TYPE_SB && !(flags & FAN_MARK_IGNORED_MASK) && > + (mask & FAN_FS_ERROR)) { > + if (fanotify_create_fs_error_event(fsn_mark, connp)) { > + ret = -ENOMEM; > + goto out; > + } > + } > + > added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); > if (added & ~fsnotify_conn_mask(fsn_mark->connector)) > fsnotify_recalc_mask(fsn_mark->connector); > + > +out: > mutex_unlock(&group->mark_mutex); > > fsnotify_put_mark(fsn_mark); > - return 0; > + return ret; > } > > static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, > @@ -1427,6 +1517,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, > fsid = &__fsid; > } > > + if (mask & FAN_FS_ERROR && mark_type != FAN_MARK_FILESYSTEM) { > + ret = -EINVAL; > + goto path_put_and_out; > + } > + > /* inode held in place by reference to path; group by fget on fd */ > if (mark_type == FAN_MARK_INODE) > inode = path.dentry->d_inode; > diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h > index a16dbeced152..d086a19aff63 100644 > --- a/include/linux/fanotify.h > +++ b/include/linux/fanotify.h > @@ -81,13 +81,17 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ > */ > #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE) > > -/* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */ > +#define FANOTIFY_ERROR_EVENTS (FAN_FS_ERROR) > + > +/* Events that can only be reported to groups that support FID mode */ Let's not do that. How about the opposite: /* Events that can be reported with event->fd */ #define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) /* * Events that do not carry enough information to report event->fd * require a group that supports reporting fid. * Those events are not supported on a mount mark, because they do * not carry enough information (i.e. path) to be filtered by mount point. */ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); if (!(mask & FANOTIFY_FD_EVENTS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) Thanks, Amir.