It is ridiculous practice to scan an inode block by block; this technique is applicable only to old indirect files. This takes a significant amount of time for really large files. Let's reuse ext4_fiemap, which already traverses the inode tree in the most optimal manner. TESTCASE: ftruncate64(fd, 0); ftruncate64(fd, 1ULL << 40); /* lseek will spin for a very long time */ lseek64(fd, 0, SEEK_DATA); lseek64(fd, 0, SEEK_HOLE); Original report: https://lkml.org/lkml/2014/10/16/620 ################################## BTW: Why do we need i_mutex here? --- fs/ext4/extents.c | 4 +- fs/ext4/file.c | 220 +++++++++++++++++++++++++--------------------------- 2 files changed, 108 insertions(+), 116 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bed4308..e5d3ead 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5166,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return generic_block_fiemap(inode, fieinfo, start, len, - ext4_get_block); + return __generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) return -EBADR; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8131be8..513c12c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * we determine this extent as a data or a hole according to whether the * page cache has data or not. 
*/ -static int ext4_find_unwritten_pgoff(struct inode *inode, - int whence, - struct ext4_map_blocks *map, - loff_t *offset) +static int ext4_find_unwritten_pgoff(struct inode *inode, int whence, + loff_t endoff, loff_t *offset) { struct pagevec pvec; - unsigned int blkbits; pgoff_t index; pgoff_t end; - loff_t endoff; loff_t startoff; loff_t lastoff; int found = 0; - blkbits = inode->i_sb->s_blocksize_bits; startoff = *offset; lastoff = startoff; - endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + index = startoff >> PAGE_CACHE_SHIFT; end = endoff >> PAGE_CACHE_SHIFT; @@ -408,147 +403,144 @@ out: static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t dataoff, isize; - int blkbits; - int ret = 0; + struct fiemap_extent_info fie; + struct fiemap_extent ext[2]; + loff_t next; + int i, ret = 0; mutex_lock(&inode->i_mutex); - - isize = i_size_read(inode); - if (offset >= isize) { + if (offset >= inode->i_size) { mutex_unlock(&inode->i_mutex); return -ENXIO; } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - dataoff = offset; - - do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - if (last != start) - dataoff = (loff_t)last << blkbits; + fie.fi_flags = 0; + fie.fi_extents_max = 2; + fie.fi_extents_start = (struct fiemap_extent __user *) &ext; + while (1) { + mm_segment_t old_fs = get_fs(); + + fie.fi_extents_mapped = 0; + memset(ext, 0, sizeof(*ext) * fie.fi_extents_max); + + set_fs(get_ds()); + ret = ext4_fiemap(inode, &fie, offset, maxsize - offset); + set_fs(old_fs); + if (ret) break; - } - /* - * If there is a delay extent at this offset, - * it will be as a data. 
- */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - if (last != start) - dataoff = (loff_t)last << blkbits; + /* No extents found, EOF */ + if (!fie.fi_extents_mapped) { + ret = -ENXIO; break; } + for (i = 0; i < fie.fi_extents_mapped; i++) { + next = (loff_t)(ext[i].fe_length + ext[i].fe_logical); - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, - &map, &dataoff); - if (unwritten) - break; - } + if (offset < (loff_t)ext[i].fe_logical) + offset = (loff_t)ext[i].fe_logical; + /* + * If extent is not unwritten, then it contains valid + * data, mapped or delayed. + */ + if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) + goto out; - last++; - dataoff = (loff_t)last << blkbits; - } while (last <= end); + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, + next, &offset)) + goto out; + if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) { + ret = -ENXIO; + goto out; + } + offset = next; + } + } + if (offset > inode->i_size) + offset = inode->i_size; +out: mutex_unlock(&inode->i_mutex); + if (ret) + return ret; - if (dataoff > isize) - return -ENXIO; - - return vfs_setpos(file, dataoff, maxsize); + return vfs_setpos(file, offset, maxsize); } /* - * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE */ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t holeoff, isize; - int blkbits; - int ret = 0; + struct fiemap_extent_info fie; + struct fiemap_extent ext[2]; + loff_t next; + int i, ret = 0; mutex_lock(&inode->i_mutex); - - isize = i_size_read(inode); - if (offset >= isize) { + if (offset >= inode->i_size) { mutex_unlock(&inode->i_mutex); return -ENXIO; } - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - holeoff = offset; + fie.fi_flags = 0; + fie.fi_extents_max = 2; + fie.fi_extents_start = (struct fiemap_extent __user *)&ext; + while (1) { + mm_segment_t old_fs = get_fs(); - do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; - } + fie.fi_extents_mapped = 0; + memset(ext, 0, sizeof(*ext)); - /* - * If there is a delay extent at this offset, - * we will skip this extent. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - last = es.es_lblk + es.es_len; - holeoff = (loff_t)last << blkbits; - continue; - } + set_fs(get_ds()); + ret = ext4_fiemap(inode, &fie, offset, maxsize - offset); + set_fs(old_fs); + if (ret) + break; - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. 
- */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - &map, &holeoff); - if (!unwritten) { - last += ret; - holeoff = (loff_t)last << blkbits; + /* No extents found */ + if (!fie.fi_extents_mapped) + break; + + for (i = 0; i < fie.fi_extents_mapped; i++) { + next = (loff_t)(ext[i].fe_logical + ext[i].fe_length); + /* + * If extent is not unwritten, then it contains valid + * data, mapped or delayed. + */ + if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) { + if (offset < (loff_t)ext[i].fe_logical) + goto out; + offset = next; continue; } - } - - /* find a hole */ - break; - } while (last <= end); + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + next, &offset)) + goto out; + offset = next; + if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) + goto out; + } + } + if (offset > inode->i_size) + offset = inode->i_size; +out: mutex_unlock(&inode->i_mutex); + if (ret) + return ret; - if (holeoff > isize) - holeoff = isize; - - return vfs_setpos(file, holeoff, maxsize); + return vfs_setpos(file, offset, maxsize); } /* -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html