For very large ext4 filesystems (128TB and larger) kmalloc() of some per-group structures can fail at mount time due to memory fragmentation. If kmalloc() fails, fall back to vmalloc() for the s_group_info and s_group_desc arrays. Signed-off-by: Yu Jian <yujian@xxxxxxxxxxxxx> Signed-off-by: Andreas Dilger <adilger@xxxxxxxxxxxxx> --- fs/ext4/mballoc.c | 49 +++++++++++++++++++++++++++++++++---------------- fs/ext4/super.c | 29 +++++++++++++++++++++++------ 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6ed859d..72c5796 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2325,25 +2325,37 @@ static int ext4_mb_init_backend(struct super_block *sb) while (array_size < sizeof(*sbi->s_group_info) * num_meta_group_infos_max) array_size = array_size << 1; - /* An 8TB filesystem with 64-bit pointers requires a 4096 byte - * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. - * So a two level scheme suffices for now. */ + /* A 16TB filesystem with 64-bit pointers requires an 8192 byte + * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally) + * have group descriptors at least twice as large (64 bytes or + * more vs. 32 bytes for traditional ext3 filesystems), so a 128TB + * filesystem needs a 128kB allocation, which may need vmalloc(). 
*/ sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); - return -ENOMEM; + sbi->s_group_info = vmalloc(array_size); + if (sbi->s_group_info != NULL) { + memset(sbi->s_group_info, 0, array_size); + } else { + ext4_msg(sb, KERN_ERR, "no memory for groupinfo (%u)\n", + array_size); + return -ENOMEM; + } } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { - printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + ext4_msg(sb, KERN_ERR, "can't get new inode\n"); goto err_freesgi; } - sbi->s_buddy_cache->i_ino = get_next_ino(); + /* To avoid potentially colliding with a valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { - printk(KERN_ERR + ext4_msg(sb, KERN_ERR, "EXT4-fs: can't read descriptor %u\n", i); goto err_freebuddy; } @@ -2362,7 +2374,10 @@ err_freebuddy: kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - kfree(sbi->s_group_info); + if (is_vmalloc_addr(sbi->s_group_info)) + vfree(sbi->s_group_info); + else + kfree(sbi->s_group_info); return -ENOMEM; } @@ -2457,12 +2472,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i++; } while (i <= sb->s_blocksize_bits + 1); - /* init file for buddy data */ - ret = ext4_mb_init_backend(sb); - if (ret != 0) { - goto out; - } - spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); @@ -2487,6 +2496,11 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) + goto out; + if 
(sbi->s_proc) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); @@ -2544,7 +2558,10 @@ int ext4_mb_release(struct super_block *sb) EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - kfree(sbi->s_group_info); + if (is_vmalloc_addr(sbi->s_group_info)) + vfree(sbi->s_group_info); + else + kfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa..556084b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -789,7 +789,12 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + + if (is_vmalloc_addr(sbi->s_group_desc)) + vfree(sbi->s_group_desc); + else + kfree(sbi->s_group_desc); + if (is_vmalloc_addr(sbi->s_flex_groups)) vfree(sbi->s_flex_groups); else @@ -3059,6 +3064,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int ret = -ENOMEM; int blocksize; unsigned int db_count; + size_t size; unsigned int i; int needs_recovery, has_huge_files; __u64 blocks_count; @@ -3408,11 +3414,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), - GFP_KERNEL); + size = (size_t)db_count * sizeof(struct buffer_head *); + sbi->s_group_desc = kzalloc(size, GFP_KERNEL); if (sbi->s_group_desc == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory"); - goto failed_mount; + sbi->s_group_desc = vmalloc(size); + if (sbi->s_group_desc != NULL) { + memset(sbi->s_group_desc, 0, size); + } else { + ext4_msg(sb, KERN_ERR, "no memory for %u groups (%u)\n", + sbi->s_groups_count, (unsigned int)size); + ret = -ENOMEM; + goto failed_mount; + } } #ifdef CONFIG_PROC_FS @@ 
-3756,7 +3769,11 @@ failed_mount3: failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + + if (is_vmalloc_addr(sbi->s_group_desc)) + vfree(sbi->s_group_desc); + else + kfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { remove_proc_entry(sb->s_id, ext4_proc_root); -- 1.7.3.4 -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html