On Tue, May 28, 2019 at 6:05 PM David Howells <dhowells@xxxxxxxxxx> wrote: > Add a superblock event notification facility whereby notifications about > superblock events, such as I/O errors (EIO), quota limits being hit > (EDQUOT) and running out of space (ENOSPC) can be reported to a monitoring > process asynchronously. Note that this does not cover vfsmount topology > changes. mount_notify() is used for that. [...] > +#ifdef CONFIG_SB_NOTIFICATIONS > +/* > + * Post superblock notifications. > + */ > +void post_sb_notification(struct super_block *s, struct superblock_notification *n) > +{ > + post_watch_notification(s->s_watchers, &n->watch, current_cred(), > + s->s_unique_id); > +} You're using current_cred() here? So the idea is that if some random process runs into a disk I/O error, the I/O error will come from that task's credentials? In general, you're not supposed to look at task credentials in ->read/->write handlers. > +static void release_sb_watch(struct watch_list *wlist, struct watch *watch) > +{ > + struct super_block *s = watch->private; > + > + put_super(s); > +} > + > +/** > + * sys_sb_notify - Watch for superblock events. > + * @dfd: Base directory to pathwalk from or fd referring to superblock. > + * @filename: Path to superblock to place the watch upon > + * @at_flags: Pathwalk control flags > + * @watch_fd: The watch queue to send notifications to. > + * @watch_id: The watch ID to be placed in the notification (-1 to remove watch) > + */ > +SYSCALL_DEFINE5(sb_notify, > + int, dfd, > + const char __user *, filename, > + unsigned int, at_flags, > + int, watch_fd, > + int, watch_id) > +{ > + struct watch_queue *wqueue; > + struct super_block *s; > + struct watch_list *wlist = NULL; > + struct watch *watch; > + struct path path; > + int ret; > + > + if (watch_id < -1 || watch_id > 0xff) > + return -EINVAL; > + > + ret = user_path_at(dfd, filename, at_flags, &path); As in the other patch, I don't think userspace is supposed to be able to supply user_path_at()'s third argument. It might make sense to require that the path points to the root inode of the superblock? That way you wouldn't be able to do this on a bind mount that exposes part of a shared filesystem to a container. > + if (ret) > + return ret; > + > + wqueue = get_watch_queue(watch_fd); > + if (IS_ERR(wqueue)) > + goto err_path; > + > + s = path.dentry->d_sb; > + if (watch_id >= 0) { > + if (!s->s_watchers) { > + wlist = kzalloc(sizeof(*wlist), GFP_KERNEL); > + if (!wlist) > + goto err_wqueue; > + INIT_HLIST_HEAD(&wlist->watchers); > + spin_lock_init(&wlist->lock); > + wlist->release_watch = release_sb_watch; > + } > + > + watch = kzalloc(sizeof(*watch), GFP_KERNEL); > + if (!watch) > + goto err_wlist; > + > + init_watch(watch, wqueue); > + watch->id = s->s_unique_id; > + watch->private = s; > + watch->info_id = (u32)watch_id << 24; > + > + down_write(&s->s_umount); > + ret = -EIO; > + if (atomic_read(&s->s_active)) { > + if (!s->s_watchers) { > + s->s_watchers = wlist; > + wlist = NULL; > + } > + > + ret = add_watch_to_object(watch, s->s_watchers); > + if (ret == 0) { > + spin_lock(&sb_lock); > + s->s_count++; > + spin_unlock(&sb_lock); Why do watches hold references on the superblock they're watching? > + } > + } > + up_write(&s->s_umount); > + if (ret < 0) > + kfree(watch); > + } else if (s->s_watchers) { This should probably have something like a READ_ONCE() for clarity? > + down_write(&s->s_umount); > + ret = remove_watch_from_object(s->s_watchers, wqueue, > + s->s_unique_id, false); > + up_write(&s->s_umount); > + } else { > + ret = -EBADSLT; > + } > + > +err_wlist: > + kfree(wlist); > +err_wqueue: > + put_watch_queue(wqueue); > +err_path: > + path_put(&path); > + return ret; > +} > +#endif