dqptr_sem is one of the most contended locks: for now, each dquot_initialize and dquot_transfer results in down_write(dqptr_sem) Let's use inode->i_lock to protect i_dquot pointers. In that case all users which modify i_dquot are simply converted to that lock. But users who hold the dqptr_sem for read (charge/uncharge methods) usually look as follows down_read(&dqptr_sem) ___charge_quota() make_quota_dirty(inode->i_dquot) --> may_sleep up_read(&dqptr_sem) We must drop i_lock before make_quota_dirty or flush_warnings; to protect the dquot from being freed, let's grab an extra reference for the dquot, and drop it after we are done with the dquot object. XXX: There is a plan to get rid of extra dqget/dqput calls by protecting dquot destruction by SRCU. Signed-off-by: Dmitry Monakhov <dmonakhov@xxxxxxxxxx> --- fs/quota/dquot.c | 145 ++++++++++++++++++++++++++-------------------- fs/super.c | 1 - include/linux/quota.h | 1 - include/linux/quotaops.h | 4 +- 4 files changed, 84 insertions(+), 67 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a7a7670..5c8ad82 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -91,18 +91,16 @@ * in inode_add_bytes() and inode_sub_bytes(). * * The spinlock ordering is hence: - * dq_data_lock > dq_lock > dq_list_lock > i_lock + * dq_data_lock > i_lock > dq_lock > dq_list_lock * * Note that some things (eg. sb pointer, type, id) doesn't change during * the life of the dquot structure and so needn't to be protected by a lock * - * Any operation working on dquots via inode pointers must hold dqptr_sem. If - * operation is just reading pointers from inode (or not using them at all) the - * read lock is enough. If pointers are altered function must hold write lock. + * Any operation working on dquots via inode pointers must hold i_lock. * Special care needs to be taken about S_NOQUOTA inode flag (marking that * inode is a quota file). 
Functions adding pointers from inode to dquots have - * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they - * have to do all pointer modifications before dropping dqptr_sem. This makes + * to check this flag under i_lock and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping i_lock. This makes * sure they cannot race with quotaon which first sets S_NOQUOTA flag and * then drops all pointers to dquots from an inode. * @@ -117,14 +115,8 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_mutex > + * i_mutex > dqonoff_sem > journal_lock > dquot->dq_mutex > * dqio_mutex - * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > - * dqptr_sem. But filesystem has to count with the fact that functions such as - * dquot_alloc_space() acquire dqptr_sem and they usually have to be called - * from inside a transaction to keep filesystem consistency after a crash. Also - * filesystems usually want to do some IO on dquot from ->mark_dirty which is - * called with dqptr_sem held. * i_mutex on quota files is special (it's below dqio_mutex) */ @@ -1024,15 +1016,18 @@ static inline int dqput_blocks(struct dquot *dquot) /* * Remove references to dquots from inode and add dquot to list for freeing * if we have the last referece to dquot - * We can't race with anybody because we hold dqptr_sem for writing... 
*/ static int remove_inode_dquot_ref(struct inode *inode, int type, struct list_head *tofree_head) { - struct dquot *dquot = inode->i_dquot[type]; + struct dquot *dquot; struct quota_info *dqopt = dqopts(inode->i_sb); + spin_lock(&inode->i_lock); + dquot = inode->i_dquot[type]; inode->i_dquot[type] = NULL; + spin_unlock(&inode->i_lock); + if (dquot) { if (dqput_blocks(dquot)) { #ifdef CONFIG_QUOTA_DEBUG @@ -1086,7 +1081,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, * We have to scan also I_NEW inodes because they can already * have quota pointer initialized. Luckily, we need to touch * only quota pointers and these have separate locking - * (dqptr_sem). + * (i_lock). */ if (!IS_NOQUOTA(inode)) { if (unlikely(inode_get_rsv_space(inode) > 0)) @@ -1110,9 +1105,7 @@ static void drop_dquot_ref(struct super_block *sb, int type) LIST_HEAD(tofree_head); if (dqctl(sb)->dq_op) { - down_write(&dqctl(sb)->dqptr_sem); remove_dquot_ref(sb, type, &tofree_head); - up_write(&dqctl(sb)->dqptr_sem); put_dquot_list(&tofree_head); } } @@ -1426,9 +1419,6 @@ static int dquot_active(const struct inode *inode) /* * Initialize quota pointers in inode * - * We do things in a bit complicated way but by that we avoid calling - * find_get_dquot() and thus filesystem callbacks under dqptr_sem. - * * It is better to call this function outside of any transaction as it * might need a lot of space in journal for dquot structure allocation. 
*/ @@ -1461,7 +1451,8 @@ static void __dquot_initialize(struct inode *inode, int type) got[cnt] = find_get_dquot(sb, id, cnt); } - down_write(&dqctl(sb)->dqptr_sem); + spin_lock(&inode->i_lock); + rsv = __inode_get_rsv_space(inode); if (IS_NOQUOTA(inode)) goto out_err; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1480,7 +1471,6 @@ static void __dquot_initialize(struct inode *inode, int type) * Make quota reservation system happy if someone * did a write before quota was turned on */ - rsv = inode_get_rsv_space(inode); if (unlikely(rsv)) { spin_lock(&got[cnt]->dq_lock); dquot_resv_space(got[cnt], rsv); @@ -1489,7 +1479,7 @@ static void __dquot_initialize(struct inode *inode, int type) } } out_err: - up_write(&dqctl(sb)->dqptr_sem); + spin_unlock(&inode->i_lock); /* Drop unused references */ dqput_all(got); } @@ -1508,12 +1498,12 @@ static void __dquot_drop(struct inode *inode) int cnt; struct dquot *put[MAXQUOTAS]; - down_write(&dqctl(inode->i_sb)->dqptr_sem); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { put[cnt] = inode->i_dquot[cnt]; inode->i_dquot[cnt] = NULL; } - up_write(&dqctl(inode->i_sb)->dqptr_sem); + spin_unlock(&inode->i_lock); dqput_all(put); } @@ -1652,6 +1642,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) int warn = flags & DQUOT_SPACE_WARN; int reserve = flags & DQUOT_SPACE_RESERVE; int nofail = flags & DQUOT_SPACE_NOFAIL; + struct dquot *dquot[MAXQUOTAS] = {}; /* * First test before acquiring mutex - solves deadlocks when we @@ -1662,18 +1653,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) goto out; } - down_read(&dqctl(inode->i_sb)->dqptr_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; + spin_lock(&inode->i_lock); lock_inode_dquots(inode->i_dquot); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; + dquot[cnt] = inode->i_dquot[cnt]; + dqget(dquot[cnt]); ret = check_bdq(inode->i_dquot[cnt], number, !warn, warntype 
+ cnt); if (ret && !nofail) { unlock_inode_dquots(inode->i_dquot); + spin_unlock(&inode->i_lock); goto out_flush_warn; } } @@ -1685,15 +1679,14 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) else dquot_incr_space(inode->i_dquot[cnt], number); } - inode_incr_space(inode, number, reserve); + __inode_incr_space(inode, number, reserve); unlock_inode_dquots(inode->i_dquot); - - if (reserve) - goto out_flush_warn; - mark_all_dquot_dirty(inode->i_dquot); + spin_unlock(&inode->i_lock); + if (!reserve) + mark_all_dquot_dirty(dquot); out_flush_warn: - flush_warnings(inode->i_dquot, warntype); - up_read(&dqctl(inode->i_sb)->dqptr_sem); + flush_warnings(dquot, warntype); + dqput_all(dquot); out: return ret; } @@ -1702,10 +1695,11 @@ EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ -int dquot_alloc_inode(const struct inode *inode) +int dquot_alloc_inode(struct inode *inode) { int cnt, ret = 0; char warntype[MAXQUOTAS]; + struct dquot *dquot[MAXQUOTAS] = {}; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1713,11 +1707,14 @@ int dquot_alloc_inode(const struct inode *inode) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; - down_read(&dqctl(inode->i_sb)->dqptr_sem); + + spin_lock(&inode->i_lock); lock_inode_dquots(inode->i_dquot); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; + dquot[cnt] = inode->i_dquot[cnt]; + dqget(dquot[cnt]); ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); if (ret) goto warn_put_all; @@ -1731,10 +1728,11 @@ int dquot_alloc_inode(const struct inode *inode) warn_put_all: unlock_inode_dquots(inode->i_dquot); + spin_unlock(&inode->i_lock); if (ret == 0) - mark_all_dquot_dirty(inode->i_dquot); - flush_warnings(inode->i_dquot, warntype); - up_read(&dqctl(inode->i_sb)->dqptr_sem); + mark_all_dquot_dirty(dquot); + 
flush_warnings(dquot, warntype); + dqput_all(dquot); return ret; } EXPORT_SYMBOL(dquot_alloc_inode); @@ -1745,25 +1743,30 @@ EXPORT_SYMBOL(dquot_alloc_inode); int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { int cnt; + struct dquot *dquot[MAXQUOTAS] = {}; if (!dquot_active(inode)) { inode_claim_rsv_space(inode, number); return 0; } - down_read(&dqctl(inode->i_sb)->dqptr_sem); + spin_lock(&inode->i_lock); lock_inode_dquots(inode->i_dquot); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt]) + if (inode->i_dquot[cnt]) { + dquot[cnt] = inode->i_dquot[cnt]; + dqget(dquot[cnt]); dquot_claim_reserved_space(inode->i_dquot[cnt], number); + } } /* Update inode bytes */ - inode_claim_rsv_space(inode, number); + __inode_claim_rsv_space(inode, number); unlock_inode_dquots(inode->i_dquot); - mark_all_dquot_dirty(inode->i_dquot); - up_read(&dqctl(inode->i_sb)->dqptr_sem); + spin_unlock(&inode->i_lock); + mark_all_dquot_dirty(dquot); + dqput_all(dquot); return 0; } EXPORT_SYMBOL(dquot_claim_space_nodirty); @@ -1776,6 +1779,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) unsigned int cnt; char warntype[MAXQUOTAS]; int reserve = flags & DQUOT_SPACE_RESERVE; + struct dquot *dquot[MAXQUOTAS] = {}; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1784,54 +1788,60 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) return; } - down_read(&dqctl(inode->i_sb)->dqptr_sem); + spin_lock(&inode->i_lock); lock_inode_dquots(inode->i_dquot); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; + dquot[cnt] = inode->i_dquot[cnt]; + dqget(dquot[cnt]); warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); if (reserve) dquot_free_reserved_space(inode->i_dquot[cnt], number); else dquot_decr_space(inode->i_dquot[cnt], number); } - inode_decr_space(inode, 
number, reserve); + __inode_decr_space(inode, number, reserve); unlock_inode_dquots(inode->i_dquot); + spin_unlock(&inode->i_lock); - if (reserve) - goto out_unlock; - mark_all_dquot_dirty(inode->i_dquot); -out_unlock: - flush_warnings(inode->i_dquot, warntype); - up_read(&dqctl(inode->i_sb)->dqptr_sem); + if (!reserve) + mark_all_dquot_dirty(dquot); + flush_warnings(dquot, warntype); + dqput_all(dquot); } EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ -void dquot_free_inode(const struct inode *inode) +void dquot_free_inode(struct inode *inode) { unsigned int cnt; char warntype[MAXQUOTAS]; + struct dquot *dquot[MAXQUOTAS] = {}; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return; - down_read(&dqctl(inode->i_sb)->dqptr_sem); + spin_lock(&inode->i_lock); lock_inode_dquots(inode->i_dquot); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; + dquot[cnt] = inode->i_dquot[cnt]; + dqget(dquot[cnt]); warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); dquot_decr_inodes(inode->i_dquot[cnt], 1); } unlock_inode_dquots(inode->i_dquot); - mark_all_dquot_dirty(inode->i_dquot); - flush_warnings(inode->i_dquot, warntype); - up_read(&dqctl(inode->i_sb)->dqptr_sem); + spin_unlock(&inode->i_lock); + + mark_all_dquot_dirty(dquot); + flush_warnings(dquot, warntype); + dqput_all(dquot); } EXPORT_SYMBOL(dquot_free_inode); @@ -1862,9 +1872,10 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype_to[cnt] = QUOTA_NL_NOWARN; - down_write(&dqctl(inode->i_sb)->dqptr_sem); + + spin_lock(&inode->i_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? 
*/ - up_write(&dqctl(inode->i_sb)->dqptr_sem); + spin_unlock(&inode->i_lock); return 0; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1873,15 +1884,19 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) */ if (!transfer_to[cnt]) continue; + dqget(transfer_to[cnt]); /* Avoid races with quotaoff() */ if (!sb_has_quota_active(inode->i_sb, cnt)) continue; is_valid[cnt] = 1; transfer_from[cnt] = inode->i_dquot[cnt]; + if (transfer_from[cnt]) + dqget(transfer_from[cnt]); + } lock_dquot_double(transfer_from, transfer_to); - cur_space = inode_get_bytes(inode); - rsv_space = inode_get_rsv_space(inode); + cur_space = __inode_get_bytes(inode); + rsv_space = __inode_get_rsv_space(inode); space = cur_space + rsv_space; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1921,13 +1936,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) } unlock_inode_dquots(transfer_to); unlock_inode_dquots(transfer_from); - up_write(&dqctl(inode->i_sb)->dqptr_sem); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(transfer_from); mark_all_dquot_dirty(transfer_to); flush_warnings(transfer_to, warntype_to); flush_warnings(transfer_from, warntype_from_inodes); flush_warnings(transfer_from, warntype_from_space); + dqput_all(transfer_to); + dqput_all(transfer_from); /* Pass back references to put */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (is_valid[cnt]) @@ -1936,8 +1953,10 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) over_quota: unlock_inode_dquots(transfer_to); unlock_inode_dquots(transfer_from); - up_write(&dqctl(inode->i_sb)->dqptr_sem); + spin_unlock(&inode->i_lock); flush_warnings(transfer_to, warntype_to); + dqput_all(transfer_to); + dqput_all(transfer_from); return ret; } EXPORT_SYMBOL(__dquot_transfer); diff --git a/fs/super.c b/fs/super.c index 9eea8e9..2f2090c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -104,7 +104,6 @@ static struct super_block *alloc_super(struct 
file_system_type *type) mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); mutex_init(&s->s_dquot.dqonoff_mutex); - init_rwsem(&s->s_dquot.dqptr_sem); init_waitqueue_head(&s->s_wait_unfrozen); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; diff --git a/include/linux/quota.h b/include/linux/quota.h index 867848b..834ed1b 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -409,7 +409,6 @@ struct quota_ctl_info { unsigned int flags; /* Flags for diskquotas on this device */ struct mutex dqonoff_mutex; /* Serialize quotaon & quotaoff */ - struct rw_semaphore dqptr_sem; /* serialize ops using quota_info struct, pointers from inode to dquots */ const struct quotactl_ops *qcop; const struct dquot_operations *dq_op; struct quota_info *dq_opt; diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 4259da8..2675ff7 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -64,10 +64,10 @@ void dquot_destroy(struct dquot *dquot); int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags); void __dquot_free_space(struct inode *inode, qsize_t number, int flags); -int dquot_alloc_inode(const struct inode *inode); +int dquot_alloc_inode(struct inode *inode); int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); -void dquot_free_inode(const struct inode *inode); +void dquot_free_inode(struct inode *inode); int dquot_disable(struct super_block *sb, int type, unsigned int flags); /* Suspend quotas on remount RO */ -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html