On Tue, Mar 28, 2017 at 06:13:19PM +0200, Jan Kara wrote: > Instead of removing mark from object list from fsnotify_detach_mark(), > remove the mark when last reference to the mark is dropped. This will > allow fanotify to wait for userspace response to event without having to > hold onto fsnotify_mark_srcu. > > To avoid pinning inodes by elevated refcount (and thus e.g. delaying > file deletion) while someone holds mark reference, we detach connector > from the object also from fsnotify_destroy_marks() and not only after > removing last mark from the list as it was now. > > Reviewed-by: Amir Goldstein <amir73il@xxxxxxxxx> > Signed-off-by: Jan Kara <jack@xxxxxxx> > --- > fs/notify/mark.c | 185 ++++++++++++++++++++++++--------------- > include/linux/fsnotify_backend.h | 4 +- > kernel/audit_tree.c | 24 ++--- > 3 files changed, 121 insertions(+), 92 deletions(-) > > diff --git a/fs/notify/mark.c b/fs/notify/mark.c > index 2bd9cdb094e1..7eef1824a76c 100644 > --- a/fs/notify/mark.c > +++ b/fs/notify/mark.c > @@ -49,7 +49,13 @@ > * > * A list of notification marks relating to inode / mnt is contained in > * fsnotify_mark_connector. That structure is alive as long as there are any > - * marks in the list and is also protected by fsnotify_mark_srcu. > + * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets > + * detached from fsnotify_mark_connector when last reference to the mark is > + * dropped. Thus having mark reference is enough to protect mark->connector > + * pointer and to make sure fsnotify_mark_connector cannot disappear. Also > + * because we remove mark from g_list before dropping mark reference associated > + * with that, any mark found through g_list is guaranteed to have > + * mark->connector set until we drop group->mark_mutex. > * > * LIFETIME: > * Inode marks survive between when they are added to an inode and when their > @@ -103,26 +109,16 @@ void fsnotify_get_mark(struct fsnotify_mark *mark) > atomic_inc(&mark->refcnt); > } > > -void fsnotify_put_mark(struct fsnotify_mark *mark) > -{ > - if (atomic_dec_and_test(&mark->refcnt)) { > - spin_lock(&destroy_lock); > - list_add(&mark->g_list, &destroy_list); > - spin_unlock(&destroy_lock); > - queue_delayed_work(system_unbound_wq, &reaper_work, > - FSNOTIFY_REAPER_DELAY); > - } > -} > - > static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) > { > u32 new_mask = 0; > struct fsnotify_mark *mark; > > assert_spin_locked(&conn->lock); > - hlist_for_each_entry(mark, &conn->list, obj_list) > - new_mask |= mark->mask; > - > + hlist_for_each_entry(mark, &conn->list, obj_list) { > + if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) > + new_mask |= mark->mask; > + } > if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) > conn->inode->i_fsnotify_mask = new_mask; > else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) > @@ -131,8 +127,9 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) > > /* > * Calculate mask of events for a list of marks. The caller must make sure > - * connector cannot disappear under us (usually by holding a mark->lock or > - * mark->group->mark_mutex for a mark on this list). > + * connector and connector->inode cannot disappear under us. Callers achieve > + * this by holding a mark->lock or mark->group->mark_mutex for a mark on this > + * list. > */ > void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) > { > @@ -164,30 +161,59 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) > } > } > > -static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) > +static struct inode *fsnotify_detach_connector_from_object( > + struct fsnotify_mark_connector *conn) > +{ > + struct inode *inode = NULL; > + > + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { > + inode = conn->inode; > + rcu_assign_pointer(inode->i_fsnotify_marks, NULL); > + inode->i_fsnotify_mask = 0; > + conn->inode = NULL; > + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; > + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { > + rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks, > + NULL); > + real_mount(conn->mnt)->mnt_fsnotify_mask = 0; > + conn->mnt = NULL; > + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; > + } > + > + return inode; > +} Could this helper been added in the previous patch where the code was introduced? > + > +static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) > +{ > + if (mark->group) > + fsnotify_put_group(mark->group); > + mark->free_mark(mark); > +} > + > +void fsnotify_put_mark(struct fsnotify_mark *mark) > { > struct fsnotify_mark_connector *conn; > struct inode *inode = NULL; > bool free_conn = false; > > + /* Catch marks that were actually never attached to object */ > + if (!mark->connector) { > + if (atomic_dec_and_test(&mark->refcnt)) > + fsnotify_final_mark_destroy(mark); > + return; > + } > + > + /* > + * We have to be careful so that traversals of obj_list under lock can > + * safely grab mark reference. > + */ > + if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock)) > + return; > + > conn = mark->connector; > - spin_lock(&conn->lock); > hlist_del_init_rcu(&mark->obj_list); > if (hlist_empty(&conn->list)) { > - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { > - inode = conn->inode; > - rcu_assign_pointer(inode->i_fsnotify_marks, NULL); > - inode->i_fsnotify_mask = 0; > - conn->inode = NULL; > - conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; > - } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { > - rcu_assign_pointer( > - real_mount(conn->mnt)->mnt_fsnotify_marks, > - NULL); > - real_mount(conn->mnt)->mnt_fsnotify_mask = 0; > - conn->mnt = NULL; > - conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; > - } > + inode = fsnotify_detach_connector_from_object(conn); > free_conn = true; > } else { > __fsnotify_recalc_mask(conn); > @@ -195,6 +221,9 @@ static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) > mark->connector = NULL; > spin_unlock(&conn->lock); > > + if (inode) > + iput(inode); > + iput() checks for non-NULL inode. > if (free_conn) { > spin_lock(&destroy_lock); > conn->destroy_next = connector_destroy_list; > @@ -202,20 +231,31 @@ static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) > spin_unlock(&destroy_lock); > queue_work(system_unbound_wq, &connector_reaper_work); > } > - > - return inode; > + /* > + * Note that we didn't update flags telling whether inode cares about > + * what's happening with children. We update these flags from > + * __fsnotify_parent() lazily when next event happens on one of our > + * children. > + */ > + spin_lock(&destroy_lock); > + list_add(&mark->g_list, &destroy_list); > + spin_unlock(&destroy_lock); > + queue_delayed_work(system_unbound_wq, &reaper_work, > + FSNOTIFY_REAPER_DELAY); > } > > /* > - * Remove mark from inode / vfsmount list, group list, drop inode reference > - * if we got one. > + * Mark mark as detached, remove it from group list. Mark still stays in object > + * list until its last reference is dropped. Note that we rely on mark being > + * removed from group list before corresponding reference to it is dropped. In > + * particular we rely on mark->connector being valid while we hold > + * group->mark_mutex if we found the mark through g_list. > * > * Must be called with group->mark_mutex held. The caller must either hold > * reference to the mark or be protected by fsnotify_mark_srcu. > */ > void fsnotify_detach_mark(struct fsnotify_mark *mark) > { > - struct inode *inode = NULL; > struct fsnotify_group *group = mark->group; > > WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); > @@ -224,31 +264,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) > !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); > > spin_lock(&mark->lock); > - > /* something else already called this function on this mark */ > if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { > spin_unlock(&mark->lock); > return; > } > - > mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; > - > - inode = fsnotify_detach_from_object(mark); > - > - /* > - * Note that we didn't update flags telling whether inode cares about > - * what's happening with children. We update these flags from > - * __fsnotify_parent() lazily when next event happens on one of our > - * children. > - */ > - > list_del_init(&mark->g_list); > - > spin_unlock(&mark->lock); > > - if (inode) > - iput(inode); > - > atomic_dec(&group->num_marks); > > /* Drop mark reference acquired in fsnotify_add_mark_locked() */ > @@ -448,7 +472,9 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, > hlist_for_each_entry(lmark, &conn->list, obj_list) { > last = lmark; > > - if ((lmark->group == mark->group) && !allow_dups) { > + if ((lmark->group == mark->group) && > + (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && > + !allow_dups) { > err = -EEXIST; > goto out_err; > } > @@ -499,7 +525,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, > mark->group = group; > list_add(&mark->g_list, &group->marks_list); > atomic_inc(&group->num_marks); > - fsnotify_get_mark(mark); /* for i_list and g_list */ > + fsnotify_get_mark(mark); /* for g_list */ > spin_unlock(&mark->lock); > > ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups); > @@ -547,7 +573,8 @@ struct fsnotify_mark *fsnotify_find_mark( > return NULL; > > hlist_for_each_entry(mark, &conn->list, obj_list) { > - if (mark->group == group) { > + if (mark->group == group && > + (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { > fsnotify_get_mark(mark); > spin_unlock(&conn->lock); > return mark; > @@ -627,23 +654,39 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) > void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) > { > struct fsnotify_mark_connector *conn; > - struct fsnotify_mark *mark; > + struct fsnotify_mark *mark, *old_mark = NULL; > + struct inode *inode; > > - while ((conn = fsnotify_grab_connector(connp))) { > - /* > - * We have to be careful since we can race with e.g. > - * fsnotify_clear_marks_by_group() and once we drop the list > - * lock, mark can get removed from the obj_list and destroyed. > - * But we are holding mark reference so mark cannot be freed > - * and calling fsnotify_destroy_mark() more than once is fine. > - */ > - mark = hlist_entry(conn->list.first, struct fsnotify_mark, > - obj_list); > + conn = fsnotify_grab_connector(connp); > + if (!conn) > + return; > + /* > + * We have to be careful since we can race with e.g. > + * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the > + * list can get modified. However we are holding mark reference and > + * thus our mark cannot be removed from obj_list so we can continue > + * iteration after regaining conn->lock. > + */ > + hlist_for_each_entry(mark, &conn->list, obj_list) { > fsnotify_get_mark(mark); > spin_unlock(&conn->lock); > + if (old_mark) > + fsnotify_put_mark(old_mark); > + old_mark = mark; > fsnotify_destroy_mark(mark, mark->group); > - fsnotify_put_mark(mark); > + spin_lock(&conn->lock); > } > + /* > + * Detach list from object now so that we don't pin inode until all > + * mark references get dropped. It would lead to strange results such > + * as delaying inode deletion or blocking unmount. > + */ > + inode = fsnotify_detach_connector_from_object(conn); > + spin_unlock(&conn->lock); > + if (old_mark) > + fsnotify_put_mark(old_mark); > + if (inode) > + iput(inode); > } > > /* > @@ -676,9 +719,7 @@ void fsnotify_mark_destroy_list(void) > > list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { > list_del_init(&mark->g_list); > - if (mark->group) > - fsnotify_put_group(mark->group); > - mark->free_mark(mark); > + fsnotify_final_mark_destroy(mark); > } > } > > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h > index 84d71b6f75f6..a483614b25d0 100644 > --- a/include/linux/fsnotify_backend.h > +++ b/include/linux/fsnotify_backend.h > @@ -245,9 +245,9 @@ struct fsnotify_mark { > struct list_head g_list; > /* Protects inode / mnt pointers, flags, masks */ > spinlock_t lock; > - /* List of marks for inode / vfsmount [connector->lock] */ > + /* List of marks for inode / vfsmount [connector->lock, mark ref] */ > struct hlist_node obj_list; > - /* Head of list of marks for an object [mark->lock, group->mark_mutex] */ > + /* Head of list of marks for an object [mark ref] */ > struct fsnotify_mark_connector *connector; > /* Events types to ignore [mark->lock, group->mark_mutex] */ > __u32 ignored_mask; > diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c > index c0e494fd8eca..152400e8d077 100644 > --- a/kernel/audit_tree.c > +++ b/kernel/audit_tree.c > @@ -172,27 +172,15 @@ static unsigned long inode_to_key(const struct inode *inode) > /* > * Function to return search key in our hash from chunk. Key 0 is special and > * should never be present in the hash. > - * > - * Must be called with chunk->mark.lock held to protect from connector > - * becoming NULL. > */ > -static unsigned long __chunk_to_key(struct audit_chunk *chunk) > +static unsigned long chunk_to_key(struct audit_chunk *chunk) > { > - if (!chunk->mark.connector) > + /* We have a reference to the mark so it should be attached. */ > + if (WARN_ON_ONCE(!chunk->mark.connector)) > return 0; Hmm, lifetime of mark previously: - created but not attached (connector is NULL, no FSNOTIFY_MARK_FLAG_ATTACHED) - attached (connector is non-NULL, FSNOTIFY_MARK_FLAG_ATTACHED) - detached (connector is NULL, no FSNOTIFY_MARK_FLAG_ATTACHED) The created, and attached states remain the same but detached now changed to: - detached (connector is non-NULL, no FSNOTIFY_MARK_FLAG_ATTACHED) Considering that, the warning seems right but the comment above not so. Maybe something like: /* We have a reference to the mark and it has been attached so we should have a valid connector. */ > return (unsigned long)chunk->mark.connector->inode; > } > > -static unsigned long chunk_to_key(struct audit_chunk *chunk) > -{ > - unsigned long key; > - > - spin_lock(&chunk->mark.lock); > - key = __chunk_to_key(chunk); > - spin_unlock(&chunk->mark.lock); > - return key; > -} > - > static inline struct list_head *chunk_hash(unsigned long key) > { > unsigned long n = key / L1_CACHE_BYTES; > @@ -202,7 +190,7 @@ static inline struct list_head *chunk_hash(unsigned long key) > /* hash_lock & entry->lock is held by caller */ > static void insert_hash(struct audit_chunk *chunk) > { > - unsigned long key = __chunk_to_key(chunk); > + unsigned long key = chunk_to_key(chunk); > struct list_head *list; > > if (!key) > @@ -263,7 +251,7 @@ static void untag_chunk(struct node *p) > > mutex_lock(&entry->group->mark_mutex); > spin_lock(&entry->lock); > - if (chunk->dead || !entry->connector) { > + if (chunk->dead || !entry->connector || !entry->connector->inode) { So should we be testing FSNOTIFY_MARK_FLAG_ATTACHED instead? Without understanding what audit is trying to do, that would be a lot more logical. Or maybe there is a reason this is correct, it just needs an explanation. > spin_unlock(&entry->lock); > mutex_unlock(&entry->group->mark_mutex); > if (new) > @@ -423,7 +411,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) > > mutex_lock(&old_entry->group->mark_mutex); > spin_lock(&old_entry->lock); > - if (!old_entry->connector) { > + if (!old_entry->connector || !old_entry->connector->inode) { > /* old_entry is being shot, lets just lie */ > spin_unlock(&old_entry->lock); > mutex_unlock(&old_entry->group->mark_mutex); > -- > 2.10.2 >