[PATCH 2/4] ext4: fix reserved space transferring on chown() [V2]

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Currently all quota's functions except vfs_dq_reserve_block()
called without i_block_reservation_lock. This result in
ext4_reservation vs quota_reservation inconsistency which provoke
incorrect reservation transfer ==> incorrect quota after transfer.

Race (1)
| Task 1 (chown)                 | Task 2 (truncate)                    |
| dquot_transfer                 |                                      |
| ->down_write(dqptr_sem)        | ext4_da_release_spac                 |
| -->dquot_get_reserved_space    | ->lock(i_block_reservation_lock)     |
| --->get_reserved_space         | /* decrement reservation */          |
| ---->ext4_get_reserved_space   | ->unlock(i_block_reservation_lock)   |
| ----->lock(i_block_rsv_lock)   | /* During this time window           |
| /* Read ext4_rsv from inode */ | * fs's reservation not equals        |
| /* transfer it to new quota */ | * to quota's */                      |
| ->up_write(dqptr_sem)          | ->vfs_dq_release_reservation_block() |
|                                | /* quota_rsv goes negative here */   |
|                                |                                      |

Race (2)
| Task 1 (chown)                | Task 2 (flush-8:16)             |
| dquot_transfer()              | ext4_mb_mark_diskspace_used()   |
| ->down_write(dqptr_sem)       | ->vfs_dq_claim_block()          |
| --->get_reserved_space()      | /* After this moment         */ |
| --->ext4_get_reserved_space() | /* ext4_rsv != quota_ino_rsv */ |
| /* Read rsv from inode which  |                                 |
| ->dquot_free_reserved_space() |                                 |
| /* quota_rsv goes negative */ |                                 |
|                               |                                 |
|                               | dquot_free_reserved_space()     |
|                               | /* finally dec ext4_ino_rsv */  |

So, in order to protect us from this type of races we always have to
provides ext4_ino_rsv == quot_ino_rsv guarantee. And this is only
possible then i_block_reservation_lock is taken before entering any
quota operations.

In fact i_block_reservation_lock is held by ext4_da_reserve_space()
while calling vfs_dq_reserve_block(). Lock are held in following order
i_block_reservation_lock > dqptr_sem

This may result in deadlock because of different lock ordering:
ext4_da_reserve_space()             dquot_transfer()
lock(i_block_reservation_lock)	     down_write(dqptr_sem)
  down_write(dqptr_sem)		     lock(i_block_reservation_lock)

But this not happen only because both callers must have i_mutex so
serialization happens on i_mutex.

To prevent ext4_reservation vs dquot_reservation inconsistency we
have to make following changes
- Reorganize locking ordering like follows:
  i_block_reservation_lock > dqptr_sem
  This means what all functions which call vfs_dq_XXX must acquire
  i_block_reservation_lock before.

- Move space claiming to ext4_da_update_reserve_space()
  Only here we do know what we are dealing with data or metadata

- Delay dquot_claim for metadata before it until proper moment(
  until we are allowed to decrement inodes->i_reserved_meta_blocks)

The most useful test case for delalloc+quota is concurrent tasks
with write+truncate vs chown for the same file.

Signed-off-by: Dmitry Monakhov <dmonakhov@xxxxxxxxxx>
---
 fs/ext4/inode.c   |   33 ++++++++++++++++++++++++---------
 fs/ext4/mballoc.c |    6 ------
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cc4737e..979362d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1047,16 +1047,23 @@ cleanup:
 out:
 	return err;
 }
+/*
+ * Quota_transfer callback.
+ * During quota transfer we have to transfer rsv-blocks from one dquot to
+ * another. inode must be protected from concurrent reservation/reclamation.
+ * Locking ordering for all space reservation code:
+ *  i_block_reservation_lock > dqptr_sem
+ * This is differ from i_block,i_lock locking ordering, but this is the
+ * only possible way.
+ * NOTE: Caller must hold i_block_reservation_lock.
+ */
 
 qsize_t ext4_get_reserved_space(struct inode *inode)
 {
 	unsigned long long total;
 
-	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	total = EXT4_I(inode)->i_reserved_data_blocks +
 		EXT4_I(inode)->i_reserved_meta_blocks;
-	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
 	return (total << inode->i_blkbits);
 }
 /*
@@ -1096,7 +1103,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
 static void ext4_da_update_reserve_space(struct inode *inode, int used)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	int total, mdb, mdb_free;
+	int total, mdb, mdb_free, mdb_claim = 0;
 
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	/* recalculate the number of metablocks still need to be reserved */
@@ -1109,7 +1116,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 
 	if (mdb_free) {
 		/* Account for allocated meta_blocks */
-		mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+		mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks;
+		BUG_ON(mdb_free < mdb_claim);
+		mdb_free -= mdb_claim;
 
 		/* update fs dirty blocks counter */
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
@@ -1119,14 +1128,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 
 	/* update per-inode reservations */
 	BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
-	EXT4_I(inode)->i_reserved_data_blocks -= used;
-	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ 	EXT4_I(inode)->i_reserved_data_blocks -= used;
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim);
+	vfs_dq_claim_block(inode, used + mdb_claim);
 
 	/*
 	 * free those over-booking quota for metadata blocks
 	 */
 	if (mdb_free)
 		vfs_dq_release_reservation_block(inode, mdb_free);
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
 	/*
 	 * If we have done all the pending block allocations and if
@@ -1863,8 +1874,8 @@ repeat:
 	}
 
 	if (ext4_claim_free_blocks(sbi, total)) {
-		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 		vfs_dq_release_reservation_block(inode, total);
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
 			goto repeat;
@@ -1921,9 +1932,9 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 
 	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
 	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
-	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
 	vfs_dq_release_reservation_block(inode, release);
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
@@ -5433,7 +5444,11 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
+		/* i_block_reservation must being held in order to avoid races
+		 * with concurent block reservation. */
+		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 		if (error) {
 			ext4_journal_stop(handle);
 			return error;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bba1282..d4c52db 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2750,12 +2750,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
 		/* release all the reserved blocks if non delalloc */
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
-	else {
-		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
-						ac->ac_b_ex.fe_len);
-		/* convert reserved quota blocks to real quota blocks */
-		vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
-	}
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
-- 
1.6.0.4

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux