On Fri, Jan 6, 2017 at 12:43 PM, Jan Kara <jack@xxxxxxx> wrote:
> Currently notification marks are attached to object (inode or vfsmnt) by
> a hlist_head in the object. The list is also protected by a spinlock in
> the object. So while there is any mark attached to the list of marks,
> the object must be pinned in memory (and thus e.g. last iput() deleting
> inode cannot happen). Also for list iteration in fsnotify() to work, we
> must hold fsnotify_mark_srcu lock so that mark itself and
> mark->obj_list.next cannot get freed. Thus we are required to wait for
> response to fanotify events from userspace process with
> fsnotify_mark_srcu lock held. That causes issues when userspace process
> is buggy and does not reply to some event - basically the whole
> notification subsystem gets eventually stuck.
>
> So to be able to drop fsnotify_mark_srcu lock while waiting for
> response, we have to pin the mark in memory and make sure it stays in
> the object list (as removing the mark waiting for response could lead to
> lost notification events for groups later in the list). However we don't
> want inode reclaim to block on such mark as that would lead to system
> just locking up elsewhere.
>
> This commit tries to pave a way towards solving these conflicting
> lifetime needs. Instead of anchoring the list of marks directly in the
> object, we anchor it in a dedicated structure (fsnotify_mark_connector) and
> just point to that structure from the object. Also the list is protected
> by a spinlock contained in that structure. With this, we can detach
> notification marks from object without having to modify the list itself.
>
> Signed-off-by: Jan Kara <jack@xxxxxxx>
> ---

Looks good, except for some confusing grab/put semantics and a question
about rcu_assign_pointer() ...
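
To make sure I read the new data structure right: pieced together from the
hunks below, the connector looks roughly like this (my own reconstruction
for reference, not a quote from the patch; field order and whether the
object pointers share a union are my guess):

	struct fsnotify_mark_connector {
		spinlock_t lock;
		unsigned int flags;	/* FSNOTIFY_OBJ_TYPE_INODE / _VFSMOUNT */
		struct inode *inode;	/* object we are attached to ... */
		struct vfsmount *mnt;	/* ... or possibly a union of the two */
		struct hlist_head list;	/* list of marks, formerly in the object */
		struct fsnotify_mark_connector *destroy_next; /* reaper list */
	};

with inode->i_fsnotify_marks / mnt_fsnotify_marks now being just a pointer
to such a connector.
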
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index b41515d3f081..d512ef9f75fc 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -193,6 +193,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
> struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
> struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
> struct fsnotify_group *inode_group, *vfsmount_group;
> + struct fsnotify_mark_connector *inode_conn, *vfsmount_conn;
> struct mount *mnt;
> int idx, ret = 0;
> /* global tests shouldn't care about events on child only the specific event */
> @@ -210,8 +211,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
> * SRCU because we have no references to any objects and do not
> * need SRCU to keep them "alive".
> */
> - if (hlist_empty(&to_tell->i_fsnotify_marks) &&
> - (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
> + if (!to_tell->i_fsnotify_marks &&
> + (!mnt || !mnt->mnt_fsnotify_marks))
> return 0;
> /*
> * if this is a modify event we may need to clear the ignored masks
> @@ -226,16 +227,27 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
> idx = srcu_read_lock(&fsnotify_mark_srcu);
>
> if ((mask & FS_MODIFY) ||
> - (test_mask & to_tell->i_fsnotify_mask))
> - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
> + (test_mask & to_tell->i_fsnotify_mask)) {
> + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
> &fsnotify_mark_srcu);
> + if (inode_conn)
> + inode_node = srcu_dereference(inode_conn->list.first,
> + &fsnotify_mark_srcu);
> + }
>
> if (mnt && ((mask & FS_MODIFY) ||
> (test_mask & mnt->mnt_fsnotify_mask))) {
> - vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
> - &fsnotify_mark_srcu);
> - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
> + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
> &fsnotify_mark_srcu);
> + if (inode_conn)
> + inode_node = srcu_dereference(inode_conn->list.first,
> + &fsnotify_mark_srcu);
> + vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks,
> + &fsnotify_mark_srcu);
> + if (vfsmount_conn)
> + vfsmount_node = srcu_dereference(
> + vfsmount_conn->list.first,
> + &fsnotify_mark_srcu);
> }
> ...
> +
> +static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark)
> +{
> + struct fsnotify_mark_connector *conn;
> + struct inode *inode = NULL;
> + bool free_conn = false;
> +
> + conn = mark->connector;
> + spin_lock(&conn->lock);
> + hlist_del_init_rcu(&mark->obj_list);
> + if (hlist_empty(&conn->list)) {
> + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) {
> + inode = conn->inode;
> + inode->i_fsnotify_marks = NULL;

Shouldn't that assignment be done using rcu_assign_pointer() to match
srcu_dereference(to_tell->i_fsnotify_marks, &fsnotify_mark_srcu); ?
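
That is, something along these lines (illustrative only, untested):

	rcu_assign_pointer(inode->i_fsnotify_marks, NULL);

(or RCU_INIT_POINTER(), given that the value being stored here is NULL).
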
> + inode->i_fsnotify_mask = 0;
> + conn->inode = NULL;
> + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE;
> + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
> + real_mount(conn->mnt)->mnt_fsnotify_marks = NULL;

Same here

> + real_mount(conn->mnt)->mnt_fsnotify_mask = 0;
> + conn->mnt = NULL;
> + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT;
> + }
> + free_conn = true;
> + } else
> + __fsnotify_recalc_mask(conn);
> + mark->connector = NULL;
> + spin_unlock(&conn->lock);
> +
> + if (free_conn) {
> + spin_lock(&destroy_lock);
> + conn->destroy_next = connector_destroy_list;
> + connector_destroy_list = conn;
> + spin_unlock(&destroy_lock);
> + queue_work(system_unbound_wq, &connector_reaper_work);
> + }
> +
> + return inode;
> }
>
> /*
> @@ -137,13 +227,8 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
>
> mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
>
> - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
> - inode = mark->inode;
> - fsnotify_destroy_inode_mark(mark);
> - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
> - fsnotify_destroy_vfsmount_mark(mark);
> - else
> - BUG();
> + inode = fsnotify_detach_from_object(mark);
> +
> /*
> * Note that we didn't update flags telling whether inode cares about
> * what's happening with children. We update these flags from
> @@ -155,7 +240,7 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
>
> spin_unlock(&mark->lock);
>
> - if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
> + if (inode)
> iput(inode);
>
> atomic_dec(&group->num_marks);
> @@ -220,45 +305,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
> fsnotify_free_mark(mark);
> }
>
> -void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
> -{
> - struct fsnotify_mark *mark;
> -
> - while (1) {
> - /*
> - * We have to be careful since we can race with e.g.
> - * fsnotify_clear_marks_by_group() and once we drop 'lock',
> - * mark can get removed from the obj_list and destroyed. But
> - * we are holding mark reference so mark cannot be freed and
> - * calling fsnotify_destroy_mark() more than once is fine.
> - */
> - spin_lock(lock);
> - if (hlist_empty(head)) {
> - spin_unlock(lock);
> - break;
> - }
> - mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
> - /*
> - * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
> - * since inode / mount is going away anyway. So just remove
> - * mark from the list.
> - */
> - hlist_del_init_rcu(&mark->obj_list);
> - fsnotify_get_mark(mark);
> - spin_unlock(lock);
> - fsnotify_destroy_mark(mark, mark->group);
> - fsnotify_put_mark(mark);
> - }
> -}
> -
> void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
> {
> assert_spin_locked(&mark->lock);
>
> mark->mask = mask;
> -
> - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
> - fsnotify_set_inode_mark_mask_locked(mark, mask);
> }
>
> void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
> @@ -304,37 +355,118 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
> return -1;
> }
>
> -/* Add mark into proper place in given list of marks */
> -int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
> - int allow_dups)
> +/*
> + * Get mark connector, make sure it is alive and return with its lock held.
> + * This is for users that get connector pointer from inode or mount. Users that
> + * hold reference to a mark on the list may directly lock connector->lock as
> + * they are sure list cannot go away under them.
> + */
> +static struct fsnotify_mark_connector *fsnotify_grab_connector(
> + struct fsnotify_mark_connector * __rcu *connp)
> +{
> + struct fsnotify_mark_connector *conn;
> + int idx;
> +
> + idx = srcu_read_lock(&fsnotify_mark_srcu);
> + conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
> + if (!conn)
> + goto out;
> + spin_lock(&conn->lock);
> + if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE |
> + FSNOTIFY_OBJ_TYPE_VFSMOUNT))) {
> + spin_unlock(&conn->lock);
> + srcu_read_unlock(&fsnotify_mark_srcu, idx);
> + return NULL;
> + }
> +out:
> + srcu_read_unlock(&fsnotify_mark_srcu, idx);
> + return conn;
> +}
> +
> +static void fsnotify_put_connector(struct fsnotify_mark_connector *conn)
> +{
> + spin_unlock(&conn->lock);
> +}
> +

Is there a precedent in the kernel for using grab()/put() semantics for
what is essentially acquire()/release() of exclusive object access? The
counterpart of grab_cache_page(), for example, is unlock_page(page);
put_page(page); (and not just put_page()). igrab() does not return with
i_lock held, so igrab()/iput() semantics do not apply to this case.

I find a function called put_connector() that actually does an unlock
confusing, and since it is a one-line helper, I would prefer that it did
not exist at all and that the call sites use spin_unlock(&conn->lock);
explicitly instead of hiding it.
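
To illustrate (untested, just the shape I have in mind): with the helper
dropped, a call site like fsnotify_find_mark() further down in this patch
would read

	conn = fsnotify_grab_connector(connp);
	if (!conn)
		return NULL;
	hlist_for_each_entry(mark, &conn->list, obj_list) {
		if (mark->group == group) {
			fsnotify_get_mark(mark);
			spin_unlock(&conn->lock);
			return mark;
		}
	}
	spin_unlock(&conn->lock);
	return NULL;

which makes it obvious that the connector lock is held across the list
walk.
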
> +/*
> + * Add mark into proper place in given list of marks. These marks may be used
> + * for the fsnotify backend to determine which event types should be delivered
> + * to which group and for which inodes. These marks are ordered according to
> + * priority, highest number first, and then by the group's location in memory.
> + */
> +static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
> + struct inode *inode, struct vfsmount *mnt,
> + int allow_dups)
> {
> struct fsnotify_mark *lmark, *last = NULL;
> + struct fsnotify_mark_connector *conn;
> + struct fsnotify_mark_connector **connp;
> int cmp;
> + int err = 0;
> +
> + BUG_ON(!inode && !mnt);
> + if (inode)
> + connp = &inode->i_fsnotify_marks;
> + else
> + connp = &real_mount(mnt)->mnt_fsnotify_marks;
> +restart:
> + spin_lock(&mark->lock);
> + conn = fsnotify_grab_connector(connp);
> + if (!conn) {
> + spin_unlock(&mark->lock);
> + conn = kmem_cache_alloc(fsnotify_mark_connector_cachep,
> + GFP_KERNEL);
> + if (!conn)
> + return -ENOMEM;
> + spin_lock_init(&conn->lock);
> + INIT_HLIST_HEAD(&conn->list);
> + if (inode) {
> + conn->flags = FSNOTIFY_OBJ_TYPE_INODE;
> + conn->inode = igrab(inode);
> + } else {
> + conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
> + conn->mnt = mnt;
> + }
> + if (cmpxchg(connp, NULL, conn)) {

Same question as in fsnotify_detach_from_object(): is there a need for an
smp_wmb() here to match the srcu_dereference() in fsnotify(), or is there
no need because cmpxchg() from NULL to a live object is always safe for
the reader?
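
To spell the question out (purely illustrative, nothing here is a
suggested change): the publish side above boils down to

	/* conn->lock, conn->list and conn->inode/mnt initialized above */
	if (cmpxchg(connp, NULL, conn)) {
		/* lost the race, drop our copy */
	}

while the reader in fsnotify() does

	inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
				      &fsnotify_mark_srcu);
	if (inode_conn)
		inode_node = srcu_dereference(inode_conn->list.first,
					      &fsnotify_mark_srcu);

so the question is whether cmpxchg() by itself guarantees that a reader
that sees the new conn pointer also sees the initialized lock and list
head, the way an rcu_assign_pointer() style publish would.
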
> + /* Someone else created list structure for us */
> + if (inode)
> + iput(inode);
> + kmem_cache_free(fsnotify_mark_connector_cachep, conn);
> + }
> + goto restart;
> + }
>
> /* is mark the first mark? */
> - if (hlist_empty(head)) {
> - hlist_add_head_rcu(&mark->obj_list, head);
> - return 0;
> + if (hlist_empty(&conn->list)) {
> + hlist_add_head_rcu(&mark->obj_list, &conn->list);
> + goto added;
> }
>
> /* should mark be in the middle of the current list? */
> - hlist_for_each_entry(lmark, head, obj_list) {
> + hlist_for_each_entry(lmark, &conn->list, obj_list) {
> last = lmark;
>
> - if ((lmark->group == mark->group) && !allow_dups)
> - return -EEXIST;
> + if ((lmark->group == mark->group) && !allow_dups) {
> + err = -EEXIST;
> + goto out_err;
> + }
>
> cmp = fsnotify_compare_groups(lmark->group, mark->group);
> if (cmp >= 0) {
> hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
> - return 0;
> + goto added;
> }
> }
>
> BUG_ON(last == NULL);
> /* mark should be the last entry. last is the current last entry */
> hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
> - return 0;
> +added:
> + mark->connector = conn;
> +out_err:
> + fsnotify_put_connector(conn);
> + spin_unlock(&mark->lock);
> + return err;
> }
>
> /*
> @@ -356,7 +488,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
> * LOCKING ORDER!!!!
> * group->mark_mutex
> * mark->lock
> - * inode->i_lock
> + * mark->connector->lock
> */
> spin_lock(&mark->lock);
> mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
> @@ -366,25 +498,14 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
> list_add(&mark->g_list, &group->marks_list);
> atomic_inc(&group->num_marks);
> fsnotify_get_mark(mark); /* for i_list and g_list */
> -
> - if (inode) {
> - ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
> - if (ret)
> - goto err;
> - } else if (mnt) {
> - ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
> - if (ret)
> - goto err;
> - } else {
> - BUG();
> - }
> -
> - /* this will pin the object if appropriate */
> - fsnotify_set_mark_mask_locked(mark, mark->mask);
> spin_unlock(&mark->lock);
>
> - if (inode)
> - __fsnotify_update_child_dentry_flags(inode);
> + ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups);
> + if (ret)
> + goto err;
> +
> + if (mark->mask)
> + fsnotify_recalc_mask(mark->connector);
>
> return ret;
> err:
> @@ -419,17 +540,23 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
> * Given a list of marks, find the mark associated with given group. If found
> * take a reference to that mark and return it, else return NULL.
> */
> -struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
> +struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_mark_connector **connp,
> struct fsnotify_group *group)
> {
> struct fsnotify_mark *mark;
> + struct fsnotify_mark_connector *conn;
>
> - hlist_for_each_entry(mark, head, obj_list) {
> + conn = fsnotify_grab_connector(connp);
> + if (!conn)
> + return NULL;
> + hlist_for_each_entry(mark, &conn->list, obj_list) {
> if (mark->group == group) {
> fsnotify_get_mark(mark);
> + fsnotify_put_connector(conn);
> return mark;
> }
> }
> + fsnotify_put_connector(conn);
> return NULL;
> }
>
> @@ -453,7 +580,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
> */
> mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
> list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
> - if (mark->flags & flags)
> + if (mark->connector->flags & flags)
> list_move(&mark->g_list, &to_free);
> }
> mutex_unlock(&group->mark_mutex);
> @@ -499,6 +626,29 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group)
> }
> }
>
> +/* Destroy all marks attached to inode / vfsmount */
> +void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp)
> +{
> + struct fsnotify_mark_connector *conn;
> + struct fsnotify_mark *mark;
> +
> + while ((conn = fsnotify_grab_connector(connp))) {
> + /*
> + * We have to be careful since we can race with e.g.
> + * fsnotify_clear_marks_by_group() and once we drop the list
> + * lock, mark can get removed from the obj_list and destroyed.
> + * But we are holding mark reference so mark cannot be freed
> + * and calling fsnotify_destroy_mark() more than once is fine.
> + */
> + mark = hlist_entry(conn->list.first, struct fsnotify_mark,
> + obj_list);
> + fsnotify_get_mark(mark);
> + fsnotify_put_connector(conn);
> + fsnotify_destroy_mark(mark, mark->group);
> + fsnotify_put_mark(mark);
> + }
> +}
> +

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html