Here is another overreservation which results in premature ENOSPC when using bigalloc and delalloc, case 1: $ xfs_io -f -c "pwrite 0 2" -c "fsync" -c "pwrite 16K 2" -c "fsync" /mnt/dir/foo or case 2: $ xfs_io -f -c "falloc 0 2" -c "pwrite 16K 2" -c "fsync" /mnt/dir/foo Currently 'du' reports 128K instead of 64K because of overreservation. What was happending is, take case 1 as an example, a) in ext4_da_write_begin(), extent [0,2) will reserve one cluster for itself, b) forced by fsync, extent [0,2) allocates a 64K cluster and uses the reserved cluster, c) in ext4_da_write_begin(), extent [16k,16k+2) will reserve one cluster for itself, d) forced by fsync, extent [16k, 16k+2) finds out that it can use the new 'allocated' block, i.e. @map_from_cluster is true, such that it doesn't update the reservation accounting, and no one will. Currently ext4_da_write_begin() only checks if there is a delalloc extent which has reserved a cluster, but a written extent also implies an available cluster. With this, ext4_da_write_begin() now checks delalloc, written and unwritten extents. Signed-off-by: Liu Bo <bo.liu@xxxxxxxxxxxxxxxxx> --- fs/ext4/ext4.h | 3 ++- fs/ext4/extents.c | 24 ++++++++++++++++++++++-- fs/ext4/extents_status.c | 49 ++++++++++++++++++++++++++++++++---------------- fs/ext4/extents_status.h | 12 ++++++++++++ fs/ext4/inode.c | 5 +++-- 5 files changed, 72 insertions(+), 21 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a42e71203e53..b07d30b1c635 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3121,7 +3121,8 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, extern int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t lblk_start, ext4_lblk_t lblk_end); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_find_delalloc_or_allocated_cluster(struct inode *inode, + ext4_lblk_t lblk); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1b3b24a53e1f..acc8b5c213b1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3836,14 +3836,34 @@ int ext4_find_delalloc_range(struct inode *inode, return 0; } -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) +static int ext4_find_own_rsv_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end) +{ + struct extent_status es; + + ext4_es_find_own_rsv_extent_range(inode, lblk_start, + lblk_end, &es); + if (es.es_len == 0) + return 0; /* there is no delay extent in this tree */ + else if (es.es_lblk <= lblk_start && + lblk_start < es.es_lblk + es.es_len) + return 1; + else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) + return 1; + else + return 0; +} + +int ext4_find_delalloc_or_allocated_cluster(struct inode *inode, + ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - return ext4_find_delalloc_range(inode, lblk_start, lblk_end); + return ext4_find_own_rsv_range(inode, lblk_start, lblk_end); } /** diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 763ef185dd17..c828c04c947d 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -233,18 +233,10 @@ static struct extent_status *__es_tree_search(struct rb_root *root, return NULL; } -/* - * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering - * @es->lblk if it exists, otherwise, the next extent after @es->lblk. - * - * @inode: the inode which owns delayed extents - * @lblk: the offset where we start to search - * @end: the offset where we stop to search - * @es: delayed extent that we found - */ -void ext4_es_find_delayed_extent_range(struct inode *inode, - ext4_lblk_t lblk, ext4_lblk_t end, - struct extent_status *es) +void __ext4_es_find_range_type(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es, + ext4_es_istype_t *ext4_es_istype) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; @@ -252,7 +244,6 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, BUG_ON(es == NULL); BUG_ON(end < lblk); - trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; @@ -272,19 +263,19 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, es1 = __es_tree_search(&tree->root, lblk); out: - if (es1 && !ext4_es_is_delayed(es1)) { + if (es1 && !ext4_es_istype(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); if (es1->es_lblk > end) { es1 = NULL; break; } - if (ext4_es_is_delayed(es1)) + if (ext4_es_istype(es1)) break; } } - if (es1 && ext4_es_is_delayed(es1)) { + if (es1 && ext4_es_istype(es1)) { tree->cache_es = es1; es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; @@ -292,10 +283,36 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, } read_unlock(&EXT4_I(inode)->i_es_lock); +} + +/* + * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering + * @es->lblk if it exists, otherwise, the next extent after @es->lblk. + * + * @inode: the inode which owns delayed extents + * @lblk: the offset where we start to search + * @end: the offset where we stop to search + * @es: delayed extent that we found + */ +void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); + + __ext4_es_find_range_type(inode, lblk, end, es, ext4_es_is_delayed); trace_ext4_es_find_delayed_extent_range_exit(inode, es); } +void ext4_es_find_own_rsv_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + __ext4_es_find_range_type(inode, lblk, end, es, + ext4_es_is_own_rsv); +} + static void ext4_es_list_add(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8efdeb903d6b..90ce7fd1cfe7 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -93,9 +93,14 @@ extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, extern void ext4_es_find_delayed_extent_range(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); +extern void ext4_es_find_own_rsv_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +typedef int (ext4_es_istype_t)(struct extent_status *es); + static inline unsigned int ext4_es_status(struct extent_status *es) { return es->es_pblk >> ES_SHIFT; @@ -121,6 +126,13 @@ static inline int ext4_es_is_delayed(struct extent_status *es) return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } +static inline int ext4_es_is_own_rsv(struct extent_status *es) +{ + return ext4_es_is_delayed(es) || + ext4_es_is_written(es) || + ext4_es_is_unwritten(es); +} + static inline int ext4_es_is_hole(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1e50c5efae67..86f9545fd941 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1679,7 +1679,7 @@ static void ext4_da_page_release_reservation(struct page *page, lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk)) + !ext4_find_delalloc_or_allocated_cluster(inode, lblk)) ext4_da_release_space(inode, 1); num_clusters--; @@ -1867,7 +1867,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * to reserve metadata for every block we're going to write. */ if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, map->m_lblk)) { + !ext4_find_delalloc_or_allocated_cluster(inode, + map->m_lblk)) { ret = ext4_da_reserve_space(inode); if (ret) { /* not enough space to reserve */ -- 1.8.3.1