Instead of removing mark from object list from fsnotify_detach_mark(), remove the mark when last reference to the mark is dropped. This will allow fanotify to wait for userspace response to event without having to hold onto fsnotify_mark_srcu. To avoid pinning inodes by elevated refcount (and thus e.g. delaying file deletion) while someone holds mark reference, we detach connector from the object also from fsnotify_destroy_marks() and not only after removing last mark from the list as it was now. Reviewed-by: Amir Goldstein <amir73il@xxxxxxxxx> Signed-off-by: Jan Kara <jack@xxxxxxx> --- fs/notify/mark.c | 185 ++++++++++++++++++++++++--------------- include/linux/fsnotify_backend.h | 4 +- kernel/audit_tree.c | 24 ++--- 3 files changed, 121 insertions(+), 92 deletions(-) diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 2bd9cdb094e1..7eef1824a76c 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -49,7 +49,13 @@ * * A list of notification marks relating to inode / mnt is contained in * fsnotify_mark_connector. That structure is alive as long as there are any - * marks in the list and is also protected by fsnotify_mark_srcu. + * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets + * detached from fsnotify_mark_connector when last reference to the mark is + * dropped. Thus having mark reference is enough to protect mark->connector + * pointer and to make sure fsnotify_mark_connector cannot disappear. Also + * because we remove mark from g_list before dropping mark reference associated + * with that, any mark found through g_list is guaranteed to have + * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their @@ -103,26 +109,16 @@ void fsnotify_get_mark(struct fsnotify_mark *mark) atomic_inc(&mark->refcnt); } -void fsnotify_put_mark(struct fsnotify_mark *mark) -{ - if (atomic_dec_and_test(&mark->refcnt)) { - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - } -} - static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; struct fsnotify_mark *mark; assert_spin_locked(&conn->lock); - hlist_for_each_entry(mark, &conn->list, obj_list) - new_mask |= mark->mask; - + hlist_for_each_entry(mark, &conn->list, obj_list) { + if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) + new_mask |= mark->mask; + } if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) conn->inode->i_fsnotify_mask = new_mask; else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) @@ -131,8 +127,9 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) /* * Calculate mask of events for a list of marks. The caller must make sure - * connector cannot disappear under us (usually by holding a mark->lock or - * mark->group->mark_mutex for a mark on this list). + * connector and connector->inode cannot disappear under us. Callers achieve + * this by holding a mark->lock or mark->group->mark_mutex for a mark on this + * list. */ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { @@ -164,30 +161,59 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } -static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) +static struct inode *fsnotify_detach_connector_from_object( + struct fsnotify_mark_connector *conn) +{ + struct inode *inode = NULL; + + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { + inode = conn->inode; + rcu_assign_pointer(inode->i_fsnotify_marks, NULL); + inode->i_fsnotify_mask = 0; + conn->inode = NULL; + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks, + NULL); + real_mount(conn->mnt)->mnt_fsnotify_mask = 0; + conn->mnt = NULL; + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; + } + + return inode; +} + +static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) +{ + if (mark->group) + fsnotify_put_group(mark->group); + mark->free_mark(mark); +} + +void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn; struct inode *inode = NULL; bool free_conn = false; + /* Catch marks that were actually never attached to object */ + if (!mark->connector) { + if (atomic_dec_and_test(&mark->refcnt)) + fsnotify_final_mark_destroy(mark); + return; + } + + /* + * We have to be careful so that traversals of obj_list under lock can + * safely grab mark reference. + */ + if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + return; + conn = mark->connector; - spin_lock(&conn->lock); hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { - inode = conn->inode; - rcu_assign_pointer(inode->i_fsnotify_marks, NULL); - inode->i_fsnotify_mask = 0; - conn->inode = NULL; - conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; - } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { - rcu_assign_pointer( - real_mount(conn->mnt)->mnt_fsnotify_marks, - NULL); - real_mount(conn->mnt)->mnt_fsnotify_mask = 0; - conn->mnt = NULL; - conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; - } + inode = fsnotify_detach_connector_from_object(conn); free_conn = true; } else { __fsnotify_recalc_mask(conn); @@ -195,6 +221,9 @@ static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) mark->connector = NULL; spin_unlock(&conn->lock); + if (inode) + iput(inode); + if (free_conn) { spin_lock(&destroy_lock); conn->destroy_next = connector_destroy_list; @@ -202,20 +231,31 @@ static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) spin_unlock(&destroy_lock); queue_work(system_unbound_wq, &connector_reaper_work); } - - return inode; + /* + * Note that we didn't update flags telling whether inode cares about + * what's happening with children. We update these flags from + * __fsnotify_parent() lazily when next event happens on one of our + * children. + */ + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); } /* - * Remove mark from inode / vfsmount list, group list, drop inode reference - * if we got one. + * Mark mark as detached, remove it from group list. Mark still stays in object + * list until its last reference is dropped. Note that we rely on mark being + * removed from group list before corresponding reference to it is dropped. In + * particular we rely on mark->connector being valid while we hold + * group->mark_mutex if we found the mark through g_list. * * Must be called with group->mark_mutex held. The caller must either hold * reference to the mark or be protected by fsnotify_mark_srcu. */ void fsnotify_detach_mark(struct fsnotify_mark *mark) { - struct inode *inode = NULL; struct fsnotify_group *group = mark->group; WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); @@ -224,31 +264,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); - /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } - mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; - - inode = fsnotify_detach_from_object(mark); - - /* - * Note that we didn't update flags telling whether inode cares about - * what's happening with children. We update these flags from - * __fsnotify_parent() lazily when next event happens on one of our - * children. - */ - list_del_init(&mark->g_list); - spin_unlock(&mark->lock); - if (inode) - iput(inode); - atomic_dec(&group->num_marks); /* Drop mark reference acquired in fsnotify_add_mark_locked() */ @@ -448,7 +472,9 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; - if ((lmark->group == mark->group) && !allow_dups) { + if ((lmark->group == mark->group) && + (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && + !allow_dups) { err = -EEXIST; goto out_err; } @@ -499,7 +525,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, mark->group = group; list_add(&mark->g_list, &group->marks_list); atomic_inc(&group->num_marks); - fsnotify_get_mark(mark); /* for i_list and g_list */ + fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups); @@ -547,7 +573,8 @@ struct fsnotify_mark *fsnotify_find_mark( return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { - if (mark->group == group) { + if (mark->group == group && + (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); return mark; @@ -627,23 +654,39 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) { struct fsnotify_mark_connector *conn; - struct fsnotify_mark *mark; + struct fsnotify_mark *mark, *old_mark = NULL; + struct inode *inode; - while ((conn = fsnotify_grab_connector(connp))) { - /* - * We have to be careful since we can race with e.g. - * fsnotify_clear_marks_by_group() and once we drop the list - * lock, mark can get removed from the obj_list and destroyed. - * But we are holding mark reference so mark cannot be freed - * and calling fsnotify_destroy_mark() more than once is fine. - */ - mark = hlist_entry(conn->list.first, struct fsnotify_mark, - obj_list); + conn = fsnotify_grab_connector(connp); + if (!conn) + return; + /* + * We have to be careful since we can race with e.g. + * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the + * list can get modified. However we are holding mark reference and + * thus our mark cannot be removed from obj_list so we can continue + * iteration after regaining conn->lock. + */ + hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); + if (old_mark) + fsnotify_put_mark(old_mark); + old_mark = mark; fsnotify_destroy_mark(mark, mark->group); - fsnotify_put_mark(mark); + spin_lock(&conn->lock); } + /* + * Detach list from object now so that we don't pin inode until all + * mark references get dropped. It would lead to strange results such + * as delaying inode deletion or blocking unmount. + */ + inode = fsnotify_detach_connector_from_object(conn); + spin_unlock(&conn->lock); + if (old_mark) + fsnotify_put_mark(old_mark); + if (inode) + iput(inode); } /* @@ -676,9 +719,7 @@ void fsnotify_mark_destroy_list(void) list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); - if (mark->group) - fsnotify_put_group(mark->group); - mark->free_mark(mark); + fsnotify_final_mark_destroy(mark); } } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 84d71b6f75f6..a483614b25d0 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -245,9 +245,9 @@ struct fsnotify_mark { struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; - /* List of marks for inode / vfsmount [connector->lock] */ + /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; - /* Head of list of marks for an object [mark->lock, group->mark_mutex] */ + /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; /* Events types to ignore [mark->lock, group->mark_mutex] */ __u32 ignored_mask; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index c0e494fd8eca..152400e8d077 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -172,27 +172,15 @@ static unsigned long inode_to_key(const struct inode *inode) /* * Function to return search key in our hash from chunk. Key 0 is special and * should never be present in the hash. - * - * Must be called with chunk->mark.lock held to protect from connector - * becoming NULL. */ -static unsigned long __chunk_to_key(struct audit_chunk *chunk) +static unsigned long chunk_to_key(struct audit_chunk *chunk) { - if (!chunk->mark.connector) + /* We have a reference to the mark so it should be attached. */ + if (WARN_ON_ONCE(!chunk->mark.connector)) return 0; return (unsigned long)chunk->mark.connector->inode; } -static unsigned long chunk_to_key(struct audit_chunk *chunk) -{ - unsigned long key; - - spin_lock(&chunk->mark.lock); - key = __chunk_to_key(chunk); - spin_unlock(&chunk->mark.lock); - return key; -} - static inline struct list_head *chunk_hash(unsigned long key) { unsigned long n = key / L1_CACHE_BYTES; @@ -202,7 +190,7 @@ static inline struct list_head *chunk_hash(unsigned long key) /* hash_lock & entry->lock is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - unsigned long key = __chunk_to_key(chunk); + unsigned long key = chunk_to_key(chunk); struct list_head *list; if (!key) @@ -263,7 +251,7 @@ static void untag_chunk(struct node *p) mutex_lock(&entry->group->mark_mutex); spin_lock(&entry->lock); - if (chunk->dead || !entry->connector) { + if (chunk->dead || !entry->connector || !entry->connector->inode) { spin_unlock(&entry->lock); mutex_unlock(&entry->group->mark_mutex); if (new) @@ -423,7 +411,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) mutex_lock(&old_entry->group->mark_mutex); spin_lock(&old_entry->lock); - if (!old_entry->connector) { + if (!old_entry->connector || !old_entry->connector->inode) { /* old_entry is being shot, lets just lie */ spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); -- 2.10.2