Add the --write-zeros option for Create which will send a write zeros request to all the disks before assembling the array. After zeroing the array, the disks will be in a known clean state and the initial sync may be skipped. Writing zeroes is best used when there is a hardware offload method to zero the data. But even still, zeroing can take several minutes on a large device. Because of this, all disks are zeroed in parallel using their own forked process and a message is printed to the user. The main process will proceed only after all the zeroing processes have completed successfully. Signed-off-by: Logan Gunthorpe <logang@xxxxxxxxxxxx> fixup! mdadm: Add --write-zeros option for Create --- Create.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ReadMe.c | 2 ++ mdadm.c | 9 ++++++ mdadm.h | 7 +++++ 4 files changed, 101 insertions(+) diff --git a/Create.c b/Create.c index 1e94b90b96bd..31bed3a37a90 100644 --- a/Create.c +++ b/Create.c @@ -26,6 +26,8 @@ #include "md_u.h" #include "md_p.h" #include <ctype.h> +#include <fcntl.h> +#include <sys/wait.h> static int round_size_and_verify(unsigned long long *size, int chunk) { @@ -91,6 +93,73 @@ int default_layout(struct supertype *st, int level, int verbose) return layout; } +static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st, + struct mddev_dev *dv) + +{ + unsigned long long offset_bytes, size_bytes; + int ret = 0; + pid_t pid; + + size_bytes = KIB_TO_BYTES(s->size); + + /* + * If size_bytes is zero, this is a zoned raid array where + * each disk is of a different size and uses its full + * disk. Thus zero the entire disk. + */ + if (!size_bytes && !get_dev_size(fd, dv->devname, &size_bytes)) + return -1; + + if (dv->data_offset != INVALID_SECTORS) + offset_bytes = SEC_TO_BYTES(dv->data_offset); + else + offset_bytes = SEC_TO_BYTES(st->data_offset); + + pr_info("zeroing data from %lld to %lld on: %s\n", + offset_bytes, size_bytes, dv->devname); + + pid = fork(); + if (pid < 0) { + pr_err("Could not fork to zero disks: %m\n"); + return pid; + } else if (pid != 0) { + return pid; + } + + if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, + offset_bytes, size_bytes)) { + pr_err("zeroing %s failed: %m\n", dv->devname); + ret = 1; + } + + exit(ret); +} + +static int wait_for_zero_forks(struct mdinfo *info, int count) +{ + int wstatus, ret = 0, i; + bool waited = false; + + for (i = 0; i < count; i++) { + if (!info[i].zero_pid) + continue; + + waited = true; + waitpid(info[i].zero_pid, &wstatus, 0); + + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus)) + ret = 1; + } + + if (ret) + pr_err("zeroing failed!\n"); + else if (waited) + pr_info("zeroing finished\n"); + + return ret; +} + static int add_disk_to_super(int mdfd, struct shape *s, struct context *c, struct supertype *st, struct mddev_dev *dv, struct mdinfo *info, int have_container, int major_num) @@ -148,6 +217,14 @@ static int add_disk_to_super(int mdfd, struct shape *s, struct context *c, } st->ss->getinfo_super(st, info, NULL); + if (s->write_zeroes) { + info->zero_pid = write_zeroes_fork(fd, s, st, dv); + if (info->zero_pid <= 0) { + ioctl(mdfd, STOP_ARRAY, NULL); + return 1; + } + } + if (have_container && c->verbose > 0) pr_err("Using %s for device %d\n", map_dev(info->disk.major, info->disk.minor, 0), @@ -287,6 +364,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s, } if (pass == 1) { + ret = wait_for_zero_forks(infos, total_slots); + if (ret) + goto out; + ret = update_metadata(mdfd, s, st, map, info, chosen_name); if (ret) @@ -295,6 +376,8 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s, } out: + if (ret) + wait_for_zero_forks(infos, total_slots); free(infos); return ret; } diff --git a/ReadMe.c b/ReadMe.c index 7f94847e9769..50913dabca40 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -138,6 +138,7 @@ struct option long_options[] = { {"size", 1, 0, 'z'}, {"auto", 1, 0, Auto}, /* also for --assemble */ {"assume-clean",0,0, AssumeClean }, + {"write-zeroes",0,0, WriteZeroes }, {"metadata", 1, 0, 'e'}, /* superblock format */ {"bitmap", 1, 0, Bitmap}, {"bitmap-chunk", 1, 0, BitmapChunk}, @@ -390,6 +391,7 @@ char Help_create[] = " --write-journal= : Specify journal device for RAID-4/5/6 array\n" " --consistency-policy= : Specify the policy that determines how the array\n" " -k : maintains consistency in case of unexpected shutdown.\n" +" --write-zeroes : Write zeroes to the disks before creating. This will bypass initial sync.\n" "\n" ; diff --git a/mdadm.c b/mdadm.c index 972adb524dfb..141838bd394f 100644 --- a/mdadm.c +++ b/mdadm.c @@ -602,6 +602,10 @@ int main(int argc, char *argv[]) s.assume_clean = 1; continue; + case O(CREATE, WriteZeroes): + s.write_zeroes = 1; + continue; + case O(GROW,'n'): case O(CREATE,'n'): case O(BUILD,'n'): /* number of raid disks */ @@ -1306,6 +1310,11 @@ int main(int argc, char *argv[]) } } + if (s.write_zeroes && !s.assume_clean) { + pr_info("Disk zeroing requested, setting --assume-clean to skip resync\n"); + s.assume_clean = 1; + } + if (!mode && devs_found) { mode = MISC; devmode = 'Q'; diff --git a/mdadm.h b/mdadm.h index 1ab31564efef..c7e00195d8c8 100644 --- a/mdadm.h +++ b/mdadm.h @@ -273,6 +273,9 @@ static inline void __put_unaligned32(__u32 val, void *p) #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) +#define KIB_TO_BYTES(x) ((x) << 10) +#define SEC_TO_BYTES(x) ((x) << 9) + extern const char Name[]; struct md_bb_entry { @@ -387,6 +390,8 @@ struct mdinfo { ARRAY_UNKNOWN_STATE, } array_state; struct md_bb bb; + + pid_t zero_pid; }; struct createinfo { @@ -433,6 +438,7 @@ extern char Version[], Usage[], Help[], OptionHelp[], */ enum special_options { AssumeClean = 300, + WriteZeroes, BitmapChunk, WriteBehind, ReAdd, @@ -593,6 +599,7 @@ struct shape { int bitmap_chunk; char *bitmap_file; int assume_clean; + bool write_zeroes; int write_behind; unsigned long long size; unsigned long long data_offset; -- 2.30.2