Protect i_state updates with i_lock Signed-off-by: Nick Piggin <npiggin@xxxxxxxxx> --- fs/drop_caches.c | 9 +++-- fs/fs-writeback.c | 35 +++++++++++++++++--- fs/inode.c | 83 ++++++++++++++++++++++++++++++++++++++++++------- fs/notify/inode_mark.c | 23 +++++++++---- fs/quota/dquot.c | 28 ++++++++++++++-- 5 files changed, 148 insertions(+), 30 deletions(-) Index: linux-2.6/fs/drop_caches.c =================================================================== --- linux-2.6.orig/fs/drop_caches.c 2010-10-19 14:18:58.000000000 +1100 +++ linux-2.6/fs/drop_caches.c 2010-10-19 14:19:32.000000000 +1100 @@ -19,11 +19,14 @@ spin_lock(&inode_lock); spin_lock(&sb_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) - continue; - if (inode->i_mapping->nrpages == 0) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) + || inode->i_mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&sb_inode_list_lock); spin_unlock(&inode_lock); invalidate_mapping_pages(inode->i_mapping, 0, -1); Index: linux-2.6/fs/fs-writeback.c =================================================================== --- linux-2.6.orig/fs/fs-writeback.c 2010-10-19 14:18:58.000000000 +1100 +++ linux-2.6/fs/fs-writeback.c 2010-10-19 14:19:36.000000000 +1100 @@ -288,10 +288,12 @@ wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - while (inode->i_state & I_SYNC) { + while (inode->i_state & I_SYNC) { + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); } } @@ -345,6 +347,7 @@ /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); ret = do_writepages(mapping, wbc); @@ -366,8 +369,10 @@ * write_inode() */ spin_lock(&inode_lock); + spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { @@ -377,6 +382,7 @@ } spin_lock(&inode_lock); + spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { @@ -487,7 +493,9 @@ return 0; } + spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); requeue_io(inode); continue; } @@ -495,8 +503,10 @@ * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. */ - if (inode_dirtied_after(inode, wbc->wb_start)) + if (inode_dirtied_after(inode, wbc->wb_start)) { + spin_unlock(&inode->i_lock); return 1; + } BUG_ON(inode->i_state & I_FREEING); __iget(inode); @@ -509,6 +519,7 @@ */ redirty_tail(inode); } + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); iput(inode); cond_resched(); @@ -944,6 +955,7 @@ block_dump___mark_inode_dirty(inode); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -994,6 +1006,7 @@ } } out: + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); if (wakeup_bdi) @@ -1041,12 +1054,20 @@ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct address_space *mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); continue; + } + mapping = inode->i_mapping; - if (mapping->nrpages == 0) + if (mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); continue; - __iget(inode); + } + + __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&sb_inode_list_lock); spin_unlock(&inode_lock); /* @@ -1173,7 +1194,9 @@ might_sleep(); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); ret = writeback_single_inode(inode, &wbc); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); if (sync) inode_sync_wait(inode); @@ -1197,7 +1220,9 @@ int ret; spin_lock(&inode_lock); + spin_lock(&inode->i_lock); ret = writeback_single_inode(inode, wbc); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); return ret; } Index: linux-2.6/fs/inode.c =================================================================== --- linux-2.6.orig/fs/inode.c 2010-10-19 14:18:58.000000000 +1100 +++ linux-2.6/fs/inode.c 2010-10-19 14:19:37.000000000 +1100 @@ -31,10 +31,13 @@ * s_inodes, i_sb_list * inode_hash_lock protects: * inode hash table, i_hash + * inode->i_lock protects: + * i_state * * Ordering: * inode_lock * sb_inode_list_lock + * inode->i_lock * inode_lock * inode_hash_lock */ @@ -97,7 +100,7 @@ */ DEFINE_SPINLOCK(inode_lock); DEFINE_SPINLOCK(sb_inode_list_lock); -DEFINE_SPINLOCK(inode_hash_lock); +static DEFINE_SPINLOCK(inode_hash_lock); /* * iprune_sem provides exclusion between the kswapd or try_to_free_pages @@ -296,6 +299,8 @@ */ void __iget(struct inode *inode) { + assert_spin_locked(&inode->i_lock); + if (atomic_inc_return(&inode->i_count) != 1) return; @@ -396,16 +401,21 @@ if (tmp == head) break; inode = list_entry(tmp, struct inode, i_sb_list); - if (inode->i_state & I_NEW) + spin_lock(&inode->i_lock); + if (inode->i_state & I_NEW) { + spin_unlock(&inode->i_lock); continue; + } invalidate_inode_buffers(inode); if (!atomic_read(&inode->i_count)) { list_move(&inode->i_list, dispose); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); count++; continue; } + spin_unlock(&inode->i_lock); busy = 1; } /* only unused inodes may be cached with i_count zero */ @@ -484,12 +494,15 @@ inode = list_entry(inode_unused.prev, struct inode, i_list); + spin_lock(&inode->i_lock); if (inode->i_state || atomic_read(&inode->i_count)) { list_move(&inode->i_list, &inode_unused); + spin_unlock(&inode->i_lock); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); if (remove_inode_buffers(inode)) reap += invalidate_mapping_pages(&inode->i_data, @@ -500,12 +513,16 @@ if (inode != list_entry(inode_unused.next, struct inode, i_list)) continue; /* wrong inode or list_empty */ - if (!can_unuse(inode)) + spin_lock(&inode->i_lock); + if (!can_unuse(inode)) { + spin_unlock(&inode->i_lock); continue; + } } list_move(&inode->i_list, &freeable); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); nr_pruned++; } inodes_stat.nr_unused -= nr_pruned; @@ -577,8 +594,14 @@ hlist_for_each_entry(inode, node, head, i_hash) { if (inode->i_sb != sb) continue; - if (!test(inode, data)) + if (!spin_trylock(&inode->i_lock)) { + spin_unlock(&inode_hash_lock); + goto repeat; + } + if (!test(inode, data)) { + spin_unlock(&inode->i_lock); continue; + } if (inode->i_state & (I_FREEING|I_WILL_FREE)) { spin_unlock(&inode_hash_lock); __wait_on_freeing_inode(inode); @@ -607,6 +630,10 @@ continue; if (inode->i_sb != sb) continue; + if (!spin_trylock(&inode->i_lock)) { + spin_unlock(&inode_hash_lock); + goto repeat; + } if (inode->i_state & (I_FREEING|I_WILL_FREE)) { spin_unlock(&inode_hash_lock); __wait_on_freeing_inode(inode); @@ -633,10 +660,10 @@ struct inode *inode) { inodes_stat.nr_inodes++; - list_add(&inode->i_list, &inode_in_use); spin_lock(&sb_inode_list_lock); list_add(&inode->i_sb_list, &sb->s_inodes); spin_unlock(&sb_inode_list_lock); + list_add(&inode->i_list, &inode_in_use); if (head) { spin_lock(&inode_hash_lock); hlist_add_head(&inode->i_hash, head); @@ -693,9 +720,9 @@ inode = alloc_inode(sb); if (inode) { spin_lock(&inode_lock); - __inode_add_to_lists(sb, NULL, inode); inode->i_ino = ++last_ino; inode->i_state = 0; + __inode_add_to_lists(sb, NULL, inode); spin_unlock(&inode_lock); } return inode; @@ -762,8 +789,8 @@ if (set(inode, data)) goto set_failed; - __inode_add_to_lists(sb, head, inode); inode->i_state = I_NEW; + __inode_add_to_lists(sb, head, inode); spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -778,6 +805,7 @@ * allocated. */ __iget(old); + spin_unlock(&old->i_lock); spin_unlock(&inode_lock); destroy_inode(inode); inode = old; @@ -786,6 +814,7 @@ return inode; set_failed: + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); destroy_inode(inode); return NULL; @@ -809,8 +838,8 @@ old = find_inode_fast(sb, head, ino); if (!old) { inode->i_ino = ino; - __inode_add_to_lists(sb, head, inode); inode->i_state = I_NEW; + __inode_add_to_lists(sb, head, inode); spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -825,6 +854,7 @@ * allocated. */ __iget(old); + spin_unlock(&old->i_lock); spin_unlock(&inode_lock); destroy_inode(inode); inode = old; @@ -866,6 +896,7 @@ res = counter++; head = inode_hashtable + hash(sb, res); inode = find_inode_fast(sb, head, res); + spin_unlock(&inode->i_lock); } while (inode != NULL); spin_unlock(&inode_lock); @@ -875,7 +906,10 @@ struct inode *igrab(struct inode *inode) { + struct inode *ret = inode; + spin_lock(&inode_lock); + spin_lock(&inode->i_lock); if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) __iget(inode); else @@ -884,9 +918,11 @@ * called yet, and somebody is calling igrab * while the inode is getting freed. */ - inode = NULL; + ret = NULL; + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); - return inode; + + return ret; } EXPORT_SYMBOL(igrab); @@ -919,6 +955,7 @@ inode = find_inode(sb, head, test, data); if (inode) { __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); if (likely(wait)) wait_on_inode(inode); @@ -952,6 +989,7 @@ inode = find_inode_fast(sb, head, ino); if (inode) { __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); wait_on_inode(inode); return inode; @@ -1121,6 +1159,7 @@ struct inode *old = NULL; spin_lock(&inode_lock); +repeat: spin_lock(&inode_hash_lock); hlist_for_each_entry(old, node, head, i_hash) { if (old->i_ino != ino) @@ -1129,6 +1168,10 @@ continue; if (old->i_state & (I_FREEING|I_WILL_FREE)) continue; + if (!spin_trylock(&old->i_lock)) { + spin_unlock(&inode_hash_lock); + goto repeat; + } break; } if (likely(!node)) { @@ -1139,6 +1182,7 @@ } spin_unlock(&inode_hash_lock); __iget(old); + spin_unlock(&old->i_lock); spin_unlock(&inode_lock); wait_on_inode(old); if (unlikely(!hlist_unhashed(&old->i_hash))) { @@ -1163,6 +1207,7 @@ struct inode *old = NULL; spin_lock(&inode_lock); +repeat: spin_lock(&inode_hash_lock); hlist_for_each_entry(old, node, head, i_hash) { if (old->i_sb != sb) @@ -1171,6 +1216,10 @@ continue; if (old->i_state & (I_FREEING|I_WILL_FREE)) continue; + if (!spin_trylock(&old->i_lock)) { + spin_unlock(&inode_hash_lock); + goto repeat; + } break; } if (likely(!node)) { @@ -1181,6 +1230,7 @@ } spin_unlock(&inode_hash_lock); __iget(old); + spin_unlock(&old->i_lock); spin_unlock(&inode_lock); wait_on_inode(old); if (unlikely(!hlist_unhashed(&old->i_hash))) { @@ -1260,6 +1310,8 @@ const struct super_operations *op = inode->i_sb->s_op; int drop; + spin_lock(&sb_inode_list_lock); + spin_lock(&inode->i_lock); if (op && op->drop_inode) drop = op->drop_inode(inode); else @@ -1270,14 +1322,20 @@ list_move(&inode->i_list, &inode_unused); inodes_stat.nr_unused++; if (sb->s_flags & MS_ACTIVE) { + spin_unlock(&inode->i_lock); + spin_unlock(&sb_inode_list_lock); spin_unlock(&inode_lock); return; } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); + spin_unlock(&sb_inode_list_lock); spin_unlock(&inode_lock); write_inode_now(inode, 1); spin_lock(&inode_lock); + spin_lock(&sb_inode_list_lock); + spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; inodes_stat.nr_unused--; @@ -1286,12 +1344,12 @@ spin_unlock(&inode_hash_lock); } list_del_init(&inode->i_list); - spin_lock(&sb_inode_list_lock); list_del_init(&inode->i_sb_list); spin_unlock(&sb_inode_list_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); evict(inode); spin_lock(&inode_lock); @@ -1500,6 +1558,8 @@ * wake_up_inode() after removing from the hash list will DTRT. * * This is called with inode_lock held. + * + * Called with i_lock held and returns with it dropped. */ static void __wait_on_freeing_inode(struct inode *inode) { @@ -1507,6 +1567,7 @@ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); schedule(); finish_wait(wq, &wait.wait); Index: linux-2.6/fs/quota/dquot.c =================================================================== --- linux-2.6.orig/fs/quota/dquot.c 2010-10-19 14:18:58.000000000 +1100 +++ linux-2.6/fs/quota/dquot.c 2010-10-19 14:19:32.000000000 +1100 @@ -246,6 +246,7 @@ EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); +static qsize_t __inode_get_rsv_space(struct inode *inode); static void __dquot_initialize(struct inode *inode, int type); static inline unsigned int @@ -899,18 +900,26 @@ spin_lock(&inode_lock); spin_lock(&sb_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); continue; + } #ifdef CONFIG_QUOTA_DEBUG - if (unlikely(inode_get_rsv_space(inode) > 0)) + if (unlikely(__inode_get_rsv_space(inode) > 0)) reserved = 1; #endif - if (!atomic_read(&inode->i_writecount)) + if (!atomic_read(&inode->i_writecount)) { + spin_unlock(&inode->i_lock); continue; - if (!dqinit_needed(inode, type)) + } + if (!dqinit_needed(inode, type)) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&sb_inode_list_lock); spin_unlock(&inode_lock); @@ -1494,6 +1503,17 @@ } EXPORT_SYMBOL(inode_sub_rsv_space); +/* no i_lock variant of inode_get_rsv_space */ +static qsize_t __inode_get_rsv_space(struct inode *inode) +{ + qsize_t ret; + + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; + ret = *inode_reserved_space(inode); + return ret; +} + static qsize_t inode_get_rsv_space(struct inode *inode) { qsize_t ret; Index: linux-2.6/fs/notify/inode_mark.c =================================================================== --- linux-2.6.orig/fs/notify/inode_mark.c 2010-10-19 14:18:58.000000000 +1100 +++ linux-2.6/fs/notify/inode_mark.c 2010-10-19 14:19:37.000000000 +1100 @@ -243,13 +243,16 @@ list_for_each_entry_safe(inode, next_i, list, i_sb_list) { struct inode *need_iput_tmp; + spin_lock(&inode->i_lock); /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); continue; + } /* * If i_count is zero, the inode cannot have any watches and @@ -257,8 +260,10 @@ * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. */ - if (!atomic_read(&inode->i_count)) + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); continue; + } need_iput_tmp = need_iput; need_iput = NULL; @@ -268,13 +273,17 @@ __iget(inode); else need_iput_tmp = NULL; + spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. */ - if ((&next_i->i_sb_list != list) && - atomic_read(&next_i->i_count) && - !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { - __iget(next_i); - need_iput = next_i; + if (&next_i->i_sb_list != list) { + spin_lock(&next_i->i_lock); + if (atomic_read(&next_i->i_count) && + !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + spin_unlock(&next_i->i_lock); } /* -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html