From: Ben Chociej <bchociej@xxxxxxxxx> Modified mkfs.btrfs to add hot data relocation option (-h) which preallocates BTRFS_BLOCK_GROUP_DATA_SSD and BTRFS_BLOCK_GROUP_METADATA_SSD at mkfs time for future use by hot data relocation code. Also added a userspace function to detect whether a block device is an SSD by reading the sysfs block queue rotational flag. Signed-off-by: Ben Chociej <bchociej@xxxxxxxxx> Signed-off-by: Matt Lupfer <mlupfer@xxxxxxxxx> Tested-by: Conor Scott <conscott@xxxxxx> --- ctree.h | 2 + extent-tree.c | 2 +- mkfs.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++-------- utils.c | 1 + volumes.c | 73 +++++++++++++++++++++++++++++++- volumes.h | 3 +- 6 files changed, 190 insertions(+), 22 deletions(-) diff --git a/ctree.h b/ctree.h index 64ecf12..8c29122 100644 --- a/ctree.h +++ b/ctree.h @@ -640,6 +640,8 @@ struct btrfs_csum_item { #define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) #define BTRFS_BLOCK_GROUP_DUP (1 << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) +#define BTRFS_BLOCK_GROUP_DATA_SSD (1 << 7) +#define BTRFS_BLOCK_GROUP_METADATA_SSD (1 << 8) struct btrfs_block_group_item { __le64 used; diff --git a/extent-tree.c b/extent-tree.c index b2f9bb2..a6b2beb 100644 --- a/extent-tree.c +++ b/extent-tree.c @@ -1812,7 +1812,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, thresh) return 0; - ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags); + ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags, 0); if (ret == -ENOSPC) { space_info->full = 1; return 0; diff --git a/mkfs.c b/mkfs.c index 2e99b95..f45cfc3 100644 --- a/mkfs.c +++ b/mkfs.c @@ -69,7 +69,61 @@ static u64 parse_size(char *s) return atol(s) * mult; } -static int make_root_dir(struct btrfs_root *root) +static int make_root_dir2(struct btrfs_root *root, int hotdata) +{ + struct btrfs_trans_handle *trans; + u64 chunk_start = 0; + u64 chunk_size = 0; + int ret; + + trans = btrfs_start_transaction(root, 1); + + /* + * If hotdata option is 
set, preallocate a metadata SSD block group + * (not currently used) + */ + if (hotdata) { + ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root, + &chunk_start, &chunk_size, + BTRFS_BLOCK_GROUP_METADATA_SSD, hotdata); + BUG_ON(ret); + ret = btrfs_make_block_group(trans, root, 0, + BTRFS_BLOCK_GROUP_METADATA_SSD, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + chunk_start, chunk_size); + BUG_ON(ret); + } + + ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root, + &chunk_start, &chunk_size, + BTRFS_BLOCK_GROUP_DATA, hotdata); + BUG_ON(ret); + ret = btrfs_make_block_group(trans, root, 0, + BTRFS_BLOCK_GROUP_DATA, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + chunk_start, chunk_size); + BUG_ON(ret); + + /* + * If hotdata option is set, preallocate a data SSD block group + */ + if (hotdata) { + ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root, + &chunk_start, &chunk_size, + BTRFS_BLOCK_GROUP_DATA_SSD, hotdata); + BUG_ON(ret); + ret = btrfs_make_block_group(trans, root, 0, + BTRFS_BLOCK_GROUP_DATA_SSD, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + chunk_start, chunk_size); + BUG_ON(ret); + } + + btrfs_commit_transaction(trans, root); + return ret; +} + +static int make_root_dir(struct btrfs_root *root, int hotdata) { struct btrfs_trans_handle *trans; struct btrfs_key location; @@ -90,7 +144,7 @@ static int make_root_dir(struct btrfs_root *root) ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root, &chunk_start, &chunk_size, - BTRFS_BLOCK_GROUP_METADATA); + BTRFS_BLOCK_GROUP_METADATA, hotdata); BUG_ON(ret); ret = btrfs_make_block_group(trans, root, 0, BTRFS_BLOCK_GROUP_METADATA, @@ -103,16 +157,6 @@ static int make_root_dir(struct btrfs_root *root) trans = btrfs_start_transaction(root, 1); BUG_ON(!trans); - ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root, - &chunk_start, &chunk_size, - BTRFS_BLOCK_GROUP_DATA); - BUG_ON(ret); - ret = btrfs_make_block_group(trans, root, 0, - BTRFS_BLOCK_GROUP_DATA, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - chunk_start, chunk_size); - 
BUG_ON(ret); - ret = btrfs_make_root_dir(trans, root->fs_info->tree_root, BTRFS_ROOT_TREE_DIR_OBJECTID); if (ret) @@ -189,7 +233,7 @@ static int create_one_raid_group(struct btrfs_trans_handle *trans, int ret; ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root, - &chunk_start, &chunk_size, type); + &chunk_start, &chunk_size, type, 0); BUG_ON(ret); ret = btrfs_make_block_group(trans, root->fs_info->extent_root, 0, type, BTRFS_FIRST_CHUNK_TREE_OBJECTID, @@ -198,14 +242,24 @@ static int create_one_raid_group(struct btrfs_trans_handle *trans, return ret; } +/* + * counters for SSD and HDD devices to determine which block group types are + * allowed when hotdata is enabled + */ +static int ssd_devices = 0; +static int hdd_devices = 0; + static int create_raid_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 data_profile, - u64 metadata_profile) + u64 metadata_profile, int hotdata) { u64 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy); u64 allowed; int ret; + if (hotdata) + num_devices = hdd_devices; + if (num_devices == 1) allowed = BTRFS_BLOCK_GROUP_DUP; else if (num_devices >= 4) { @@ -271,6 +325,7 @@ static void print_usage(void) fprintf(stderr, "\t -A --alloc-start the offset to start the FS\n"); fprintf(stderr, "\t -b --byte-count total number of bytes in the FS\n"); fprintf(stderr, "\t -d --data data profile, raid0, raid1, raid10 or single\n"); + fprintf(stderr, "\t -h --hotdata allocate hot data block groups to SSD\n"); fprintf(stderr, "\t -l --leafsize size of btree leaves\n"); fprintf(stderr, "\t -L --label set a label\n"); fprintf(stderr, "\t -m --metadata metadata profile, values like data profile\n"); @@ -325,6 +380,7 @@ static char *parse_label(char *input) static struct option long_options[] = { { "alloc-start", 1, NULL, 'A'}, { "byte-count", 1, NULL, 'b' }, + { "hotdata", 0, NULL, 'h' }, { "leafsize", 1, NULL, 'l' }, { "label", 1, NULL, 'L'}, { "metadata", 1, NULL, 'm' }, @@ -358,10 +414,11 @@ int main(int ac, 
char **av) int first_fd; int ret; int i; + int hotdata = 0; while(1) { int c; - c = getopt_long(ac, av, "A:b:l:n:s:m:d:L:V", long_options, + c = getopt_long(ac, av, "A:b:l:n:s:m:d:L:hV", long_options, &option_index); if (c < 0) break; @@ -398,6 +455,9 @@ int main(int ac, char **av) } zero_end = 0; break; + case 'h': + hotdata = 1; + break; case 'V': print_version(); break; @@ -405,6 +465,7 @@ int main(int ac, char **av) print_usage(); } } + sectorsize = max(sectorsize, (u32)getpagesize()); if (leafsize < sectorsize || (leafsize & (sectorsize - 1))) { fprintf(stderr, "Illegal leafsize %u\n", leafsize); @@ -414,7 +475,9 @@ int main(int ac, char **av) fprintf(stderr, "Illegal nodesize %u\n", nodesize); exit(1); } + ac = ac - optind; + if (ac == 0) print_usage(); @@ -422,6 +485,20 @@ int main(int ac, char **av) printf("WARNING! - see http://btrfs.wiki.kernel.org before using\n\n"); file = av[optind++]; + + /* + * Setup for hot data relocation + */ + if (hotdata) { + if (btrfs_is_dev_ssd(file)) { + fprintf(stderr, "Hot data relocation mode requires " + "the first listed device NOT be a SSD (%s)\n", + file); + exit(1); + } + hdd_devices++; + } + ret = check_mounted(file); if (ret < 0) { fprintf(stderr, "error checking %s mount status\n", file); @@ -459,7 +536,7 @@ int main(int ac, char **av) root = open_ctree(file, 0, O_RDWR); root->fs_info->alloc_start = alloc_start; - ret = make_root_dir(root); + ret = make_root_dir(root, hotdata); if (ret) { fprintf(stderr, "failed to setup the root directory\n"); exit(1); @@ -479,6 +556,15 @@ int main(int ac, char **av) zero_end = 1; while(ac-- > 0) { file = av[optind++]; + + if (hotdata) { + if (btrfs_is_dev_ssd(file)) { + ssd_devices++; + } else { + hdd_devices++; + } + } + ret = check_mounted(file); if (ret < 0) { fprintf(stderr, "error checking %s mount status\n", @@ -504,7 +590,6 @@ int main(int ac, char **av) } ret = btrfs_prepare_device(fd, file, zero_end, &dev_block_count); - BUG_ON(ret); ret = btrfs_add_to_fsid(trans, root, 
fd, file, dev_block_count, @@ -514,8 +599,18 @@ int main(int ac, char **av) } raid_groups: + btrfs_commit_transaction(trans, root); + + ret = make_root_dir2(root, hotdata); + if (ret) { + fprintf(stderr, "failed to setup the root directory\n"); + exit(1); + } + + trans = btrfs_start_transaction(root, 1); + ret = create_raid_groups(trans, root, data_profile, - metadata_profile); + metadata_profile, hotdata); BUG_ON(ret); ret = create_data_reloc_tree(trans, root); diff --git a/utils.c b/utils.c index 2f4c6e1..852c5d6 100644 --- a/utils.c +++ b/utils.c @@ -473,6 +473,7 @@ int btrfs_add_to_fsid(struct btrfs_trans_handle *trans, device->bytes_used = 0; device->total_ios = 0; device->dev_root = root->fs_info->dev_root; + device->name = path; ret = btrfs_add_device(trans, root, device); BUG_ON(ret); diff --git a/volumes.c b/volumes.c index 7671855..79d3871 100644 --- a/volumes.c +++ b/volumes.c @@ -19,6 +19,7 @@ #define __USE_XOPEN2K #include <stdio.h> #include <stdlib.h> +#include <ctype.h> #include <sys/types.h> #include <sys/stat.h> #include <uuid/uuid.h> @@ -630,7 +631,7 @@ static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes, int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 *start, - u64 *num_bytes, u64 type) + u64 *num_bytes, u64 type, int hotdata) { u64 dev_offset; struct btrfs_fs_info *info = extent_root->fs_info; @@ -733,8 +734,24 @@ again: /* build a private list of devices we will allocate from */ while(index < num_stripes) { device = list_entry(cur, struct btrfs_device, dev_list); - avail = device->total_bytes - device->bytes_used; cur = cur->next; + int is_ssd = btrfs_is_dev_ssd(device->name); + + if (hotdata) { + if (type & BTRFS_BLOCK_GROUP_DATA && + is_ssd) + goto skip_device; + if (type & BTRFS_BLOCK_GROUP_METADATA && + is_ssd) + goto skip_device; + if (type & BTRFS_BLOCK_GROUP_DATA_SSD && + !is_ssd) + goto skip_device; + if (type & BTRFS_BLOCK_GROUP_METADATA_SSD && + !is_ssd) + goto 
skip_device; + } + avail = device->total_bytes - device->bytes_used; if (avail >= min_free) { list_move_tail(&device->dev_list, &private_devs); index++; @@ -742,6 +759,7 @@ again: index++; } else if (avail > max_avail) max_avail = avail; +skip_device: if (cur == dev_list) break; } @@ -853,6 +871,7 @@ again: BUG_ON(ret); } + kfree(chunk); return ret; } @@ -1448,3 +1467,53 @@ struct list_head *btrfs_scanned_uuids(void) { return &fs_uuids; }
+
+/*
+ * Userspace check for whether device_path ("/dev/<name>[<partition>]") is an
+ * SSD, based on /sys/block/<name>/queue/rotational.  Returns 1 when the flag
+ * reads 0 and 0 otherwise, including when the path is not under /dev/ or the
+ * flag cannot be read.
+ */
+int btrfs_is_dev_ssd(char *device_path)
+{
+	static const char prefix[] = "/dev/";
+	char dev[256];
+	char dev_string[512];
+	char rot_flag = '1';
+	size_t len;
+	ssize_t nread;
+	int fd;
+
+	/* never read past a short path or assume an unchecked prefix */
+	if (!device_path || strncmp(device_path, prefix, sizeof(prefix) - 1))
+		return 0;
+
+	len = strlen(device_path + sizeof(prefix) - 1);
+	if (len == 0 || len >= sizeof(dev))
+		return 0;
+	memcpy(dev, device_path + sizeof(prefix) - 1, len + 1);
+
+	/* remove partition numbers, but never strip the whole name */
+	while (len > 1 && isdigit((unsigned char)dev[len - 1]))
+		dev[--len] = '\0';
+
+	snprintf(dev_string, sizeof(dev_string),
+		 "/sys/block/%s/queue/rotational", dev);
+
+	fd = open(dev_string, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "unable to open %s\n", dev_string);
+		return 0;
+	}
+
+	nread = read(fd, &rot_flag, 1);
+	close(fd);	/* close on every path, including a failed read */
+	if (nread < 1) {
+		fprintf(stderr, "unable to read rotational flag for %s\n",
+			device_path);
+		return 0;
+	}
+
+	return rot_flag == '0';
+}
+
diff --git a/volumes.h b/volumes.h index bb78751..bb26580 100644 --- a/volumes.h +++ b/volumes.h @@ -106,7 +106,7 @@ int btrfs_read_sys_array(struct btrfs_root *root); int btrfs_read_chunk_tree(struct btrfs_root *root); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 *start, - u64 *num_bytes, u64 type); + u64 *num_bytes, u64 type, int hotdata); int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -130,4 +130,5 @@ int
btrfs_add_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int btrfs_is_dev_ssd(char *device_path); #endif -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html