Enable creating and assembling IMSM raid5 arrays with PPL. Update the IMSM metadata format to include new fields used for PPL. Add structures for PPL metadata. They are used also by super1 and shared with the kernel, so put them in md_p.h. Write the initial empty PPL header when creating an array. When assembling an array with PPL, validate the PPL header and in case it is not correct allow to overwrite it if --force was provided. Write the PPL location and size for a device to the new rdev sysfs attributes 'ppl_sector' and 'ppl_size'. Enable PPL in the kernel by writing to 'consistency_policy' before the array is activated. Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@xxxxxxxxx> --- Assemble.c | 49 +++++++++++ Makefile | 5 +- md_p.h | 25 ++++++ mdadm.h | 6 ++ super-intel.c | 274 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- sysfs.c | 14 +++ 6 files changed, 349 insertions(+), 24 deletions(-) diff --git a/Assemble.c b/Assemble.c index 3da09033..8e55b49f 100644 --- a/Assemble.c +++ b/Assemble.c @@ -1942,6 +1942,55 @@ int assemble_container_content(struct supertype *st, int mdfd, map_update(NULL, fd2devnm(mdfd), content->text_version, content->uuid, chosen_name); + if (content->consistency_policy == CONSISTENCY_POLICY_PPL && + st->ss->validate_ppl) { + content->array.state |= 1; + err = 0; + + for (dev = content->devs; dev; dev = dev->next) { + int dfd; + char *devpath; + int ret; + + ret = st->ss->validate_ppl(st, content, dev); + if (ret == 0) + continue; + + if (ret < 0) { + err = 1; + break; + } + + if (!c->force) { + pr_err("%s contains invalid PPL - consider --force or --update-subarray with --update=no-ppl\n", + chosen_name); + content->array.state &= ~1; + avail[dev->disk.raid_disk] = 0; + break; + } + + /* have --force - overwrite the invalid ppl */ + devpath = map_dev(dev->disk.major, dev->disk.minor, 0); + dfd = dev_open(devpath, O_RDWR); + if (dfd < 0) { + pr_err("Failed to open %s\n", devpath); + err = 1; + break; + } + + err = st->ss->write_init_ppl(st, content, dfd); + close(dfd); + + if (err) + break; + } + + if (err) { + free(avail); + return err; + } + } + if (enough(content->array.level, content->array.raid_disks, content->array.layout, content->array.state & 1, avail) == 0) { if (c->export && result) diff --git a/Makefile b/Makefile index d1a6ac46..5ff6cc09 100644 --- a/Makefile +++ b/Makefile @@ -151,7 +151,7 @@ MON_OBJS = mdmon.o monitor.o managemon.o util.o maps.o mdstat.o sysfs.o \ Kill.o sg_io.o dlink.o ReadMe.o super-intel.o \ super-mbr.o super-gpt.o \ super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \ - platform-intel.o probe_roms.o + platform-intel.o probe_roms.o crc32c.o MON_SRCS = $(patsubst %.o,%.c,$(MON_OBJS)) @@ -161,7 +161,8 @@ STATICOBJS = pwgr.o ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c policy.c dlink.c util.c \ maps.c lib.c xmalloc.c \ super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \ - platform-intel.c probe_roms.c sysfs.c super-mbr.c super-gpt.c mapfile.c + platform-intel.c probe_roms.c sysfs.c super-mbr.c super-gpt.c mapfile.c \ + crc32c.c ASSEMBLE_AUTO_SRCS := mdopen.c ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE ifdef MDASSEMBLE_AUTO diff --git a/md_p.h b/md_p.h index dc9fec16..358a28ce 100644 --- a/md_p.h +++ b/md_p.h @@ -267,4 +267,29 @@ struct r5l_meta_block { #define R5LOG_VERSION 0x1 #define R5LOG_MAGIC 0x6433c509 +struct ppl_header_entry { + __u64 data_sector; /* raid sector of the new data */ + __u32 pp_size; /* length of partial parity */ + __u32 data_size; /* length of data */ + __u32 parity_disk; /* member disk containing parity */ + __u32 checksum; /* checksum of this entry's partial parity */ +} __attribute__ ((__packed__)); + +#define PPL_HEADER_SIZE 4096 +#define PPL_HDR_RESERVED 512 +#define PPL_HDR_ENTRY_SPACE \ + (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(__u32) - sizeof(__u64)) +#define PPL_HDR_MAX_ENTRIES \ + (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry)) + +struct ppl_header { + __u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */ + __u32 signature; /* signature (family number of volume) */ + __u32 padding; /* zero pad */ + __u64 generation; /* generation number of the header */ + __u32 entries_count; /* number of entries in entry array */ + __u32 checksum; /* checksum of the header */ + struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES]; +} __attribute__ ((__packed__)); + #endif diff --git a/mdadm.h b/mdadm.h index b52d4d30..d222cc3b 100644 --- a/mdadm.h +++ b/mdadm.h @@ -300,6 +300,8 @@ struct mdinfo { #define MaxSector (~0ULL) /* resync/recovery complete position */ }; long bitmap_offset; /* 0 == none, 1 == a file */ + unsigned int ppl_size; + unsigned long long ppl_sector; unsigned long safe_mode_delay; /* ms delay to mark clean */ int new_level, delta_disks, new_layout, new_chunk; int errors; @@ -1074,6 +1076,10 @@ extern struct superswitch { /* write initial empty PPL on device */ int (*write_init_ppl)(struct supertype *st, struct mdinfo *info, int fd); + /* validate ppl before assemble */ + int (*validate_ppl)(struct supertype *st, struct mdinfo *info, + struct mdinfo *disk); + /* records new bad block in metadata */ int (*record_bad_block)(struct active_array *a, int n, unsigned long long sector, int length); diff --git a/super-intel.c b/super-intel.c index 2d92c8e3..87fec8bc 100644 --- a/super-intel.c +++ b/super-intel.c @@ -102,6 +102,7 @@ struct imsm_disk { #define SPARE_DISK __cpu_to_le32(0x01) /* Spare */ #define CONFIGURED_DISK __cpu_to_le32(0x02) /* Member of some RaidDev */ #define FAILED_DISK __cpu_to_le32(0x04) /* Permanent failure */ +#define JOURNAL_DISK __cpu_to_le32(0x2000000) /* Device marked as Journaling Drive */ __u32 status; /* 0xF0 - 0xF3 */ __u32 owner_cfg_num; /* which config 0,1,2... owns this disk */ __u32 total_blocks_hi; /* 0xF4 - 0xF5 total blocks hi */ @@ -155,6 +156,9 @@ struct imsm_vol { #define MIGR_STATE_CHANGE 4 #define MIGR_REPAIR 5 __u8 migr_type; /* Initializing, Rebuilding, ... */ +#define RAIDVOL_CLEAN 0 +#define RAIDVOL_DIRTY 1 +#define RAIDVOL_DSRECORD_VALID 2 __u8 dirty; __u8 fs_state; /* fast-sync state for CnG (0xff == disabled) */ __u16 verify_errors; /* number of mismatches */ @@ -190,7 +194,24 @@ struct imsm_dev { __u16 cache_policy; __u8 cng_state; __u8 cng_sub_state; -#define IMSM_DEV_FILLERS 10 + __u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */ + + /* NVM_EN */ + __u8 nv_cache_mode; + __u8 nv_cache_flags; + + /* Unique Volume Id of the NvCache Volume associated with this volume */ + __u32 nvc_vol_orig_family_num; + __u16 nvc_vol_raid_dev_num; + +#define RWH_OFF 0 +#define RWH_DISTRIBUTED 1 +#define RWH_JOURNALING_DRIVE 2 + __u8 rwh_policy; /* Raid Write Hole Policy */ + __u8 jd_serial[MAX_RAID_SERIAL_LEN]; /* Journal Drive serial number */ + __u8 filler1; + +#define IMSM_DEV_FILLERS 3 __u32 filler[IMSM_DEV_FILLERS]; struct imsm_vol vol; } __attribute__ ((packed)); @@ -257,6 +278,9 @@ static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" #define UNIT_SRC_IN_CP_AREA 1 /* Source data for curr_migr_unit has * already been migrated and must * be recovered from checkpoint area */ + +#define PPL_ENTRY_SPACE (128 * 1024) /* Size of the PPL, without the header */ + struct migr_record { __u32 rec_status; /* Status used to determine how to restart * migration in case it aborts @@ -1288,6 +1312,11 @@ static int is_failed(struct imsm_disk *disk) return (disk->status & FAILED_DISK) == FAILED_DISK; } +static int is_journal(struct imsm_disk *disk) +{ + return (disk->status & JOURNAL_DISK) == JOURNAL_DISK; +} + /* try to determine how much space is reserved for metadata from * the last get_extents() entry on the smallest active disk, * otherwise fallback to the default @@ -1477,7 +1506,17 @@ static void print_imsm_dev(struct intel_super *super, blocks_per_migr_unit(super, dev)); } printf("\n"); - printf(" Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean"); + printf(" Dirty State : %s\n", (dev->vol.dirty & RAIDVOL_DIRTY) ? + "dirty" : "clean"); + printf(" RWH Policy : "); + if (dev->rwh_policy == RWH_OFF) + printf("off\n"); + else if (dev->rwh_policy == RWH_DISTRIBUTED) + printf("PPL distributed\n"); + else if (dev->rwh_policy == RWH_JOURNALING_DRIVE) + printf("PPL journaling drive\n"); + else + printf("<unknown:%d>\n", dev->rwh_policy); } static void print_imsm_disk(struct imsm_disk *disk, @@ -1496,9 +1535,10 @@ static void print_imsm_disk(struct imsm_disk *disk, printf(" Disk%02d Serial : %s\n", index, str); else printf(" Disk Serial : %s\n", str); - printf(" State :%s%s%s\n", is_spare(disk) ? " spare" : "", - is_configured(disk) ? " active" : "", - is_failed(disk) ? " failed" : ""); + printf(" State :%s%s%s%s\n", is_spare(disk) ? " spare" : "", + is_configured(disk) ? " active" : "", + is_failed(disk) ? " failed" : "", + is_journal(disk) ? " journal" : ""); printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id)); sz = total_blocks(disk) - reserved; printf(" Usable Size : %llu%s\n", @@ -3114,6 +3154,15 @@ static unsigned long long imsm_component_size_aligment_check(int level, return component_size; } +static unsigned long long get_ppl_sector(struct intel_super *super, int dev_idx) +{ + struct imsm_dev *dev = get_imsm_dev(super, dev_idx); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + + return pba_of_lba0(map) + + (num_data_stripes(map) * map->blocks_per_strip); +} + static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, char *dmap) { struct intel_super *super = st->sb; @@ -3140,7 +3189,7 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, info->array.utime = 0; info->array.chunk_size = __le16_to_cpu(map_to_analyse->blocks_per_strip) << 9; - info->array.state = !dev->vol.dirty; + info->array.state = !(dev->vol.dirty & RAIDVOL_DIRTY); info->custom_array_size = __le32_to_cpu(dev->size_high); info->custom_array_size <<= 32; info->custom_array_size |= __le32_to_cpu(dev->size_low); @@ -3221,10 +3270,20 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, memset(info->uuid, 0, sizeof(info->uuid)); info->recovery_start = MaxSector; + if (info->array.level == 5 && dev->rwh_policy == RWH_DISTRIBUTED) { + info->consistency_policy = CONSISTENCY_POLICY_PPL; + info->ppl_sector = get_ppl_sector(super, super->current_vol); + info->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE) >> 9; + } else if (info->array.level <= 0) { + info->consistency_policy = CONSISTENCY_POLICY_NONE; + } else { + info->consistency_policy = CONSISTENCY_POLICY_RESYNC; + } + info->reshape_progress = 0; info->resync_start = MaxSector; if ((map_to_analyse->map_state == IMSM_T_STATE_UNINITIALIZED || - dev->vol.dirty) && + !(info->array.state & 1)) && imsm_reshape_blocks_arrays_changes(super) == 0) { info->resync_start = 0; } @@ -3451,7 +3510,8 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char * * found the 'most fresh' version of the metadata */ info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; - info->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); + info->disk.state |= (is_spare(disk) || is_journal(disk)) ? + 0 : (1 << MD_DISK_SYNC); } /* only call uuid_from_super_imsm when this disk is part of a populated container, @@ -3906,7 +3966,7 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) */ if (is_failed(&dl->disk)) dl->index = -2; - else if (is_spare(&dl->disk)) + else if (is_spare(&dl->disk) || is_journal(&dl->disk)) dl->index = -1; } @@ -5303,6 +5363,20 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, } mpb->num_raid_devs++; + if (s->consistency_policy == UnSet || + s->consistency_policy == CONSISTENCY_POLICY_RESYNC || + s->consistency_policy == CONSISTENCY_POLICY_NONE) { + dev->rwh_policy = RWH_OFF; + } else if (s->consistency_policy == CONSISTENCY_POLICY_PPL) { + dev->rwh_policy = RWH_DISTRIBUTED; + } else { + free(dev); + free(dv); + pr_err("imsm does not support consistency policy %s\n", + map_num(consistency_policies, s->consistency_policy)); + return 0; + } + dv->dev = dev; dv->index = super->current_vol; dv->next = super->devlist; @@ -5927,11 +6001,146 @@ static int mgmt_disk(struct supertype *st) return 0; } +#endif + +__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len); + +static int write_init_ppl_imsm(struct supertype *st, struct mdinfo *info, int fd) +{ + struct intel_super *super = st->sb; + void *buf; + struct ppl_header *ppl_hdr; + int ret; + + ret = posix_memalign(&buf, 4096, PPL_HEADER_SIZE); + if (ret) { + pr_err("Failed to allocate PPL header buffer\n"); + return ret; + } + + memset(buf, 0, PPL_HEADER_SIZE); + ppl_hdr = buf; + memset(ppl_hdr->reserved, 0xff, PPL_HDR_RESERVED); + ppl_hdr->signature = __cpu_to_le32(super->anchor->orig_family_num); + ppl_hdr->checksum = __cpu_to_le32(~crc32c_le(~0, buf, PPL_HEADER_SIZE)); + + if (lseek64(fd, info->ppl_sector * 512, SEEK_SET) < 0) { + ret = errno; + perror("Failed to seek to PPL header location"); + } + + if (!ret && write(fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) { + ret = errno; + perror("Write PPL header failed"); + } + + if (!ret) + fsync(fd); + + free(buf); + return ret; +} + +static int validate_ppl_imsm(struct supertype *st, struct mdinfo *info, + struct mdinfo *disk) +{ + struct intel_super *super = st->sb; + struct dl *d; + void *buf; + int ret = 0; + struct ppl_header *ppl_hdr; + __u32 crc; + struct imsm_dev *dev; + struct imsm_map *map; + __u32 idx; + + if (disk->disk.raid_disk < 0) + return 0; + + if (posix_memalign(&buf, 4096, PPL_HEADER_SIZE)) { + pr_err("Failed to allocate PPL header buffer\n"); + return -1; + } + + dev = get_imsm_dev(super, info->container_member); + map = get_imsm_map(dev, MAP_X); + idx = get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_X); + d = get_imsm_dl_disk(super, idx); + + if (!d || d->index < 0 || is_failed(&d->disk)) + goto out; + + if (lseek64(d->fd, info->ppl_sector * 512, SEEK_SET) < 0) { + perror("Failed to seek to PPL header location"); + ret = -1; + goto out; + } + + if (read(d->fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) { + perror("Read PPL header failed"); + ret = -1; + goto out; + } + + ppl_hdr = buf; + + crc = __le32_to_cpu(ppl_hdr->checksum); + ppl_hdr->checksum = 0; + + if (crc != ~crc32c_le(~0, buf, PPL_HEADER_SIZE)) { + dprintf("Wrong PPL header checksum on %s\n", + d->devname); + ret = 1; + } + + if (!ret && (__le32_to_cpu(ppl_hdr->signature) != + super->anchor->orig_family_num)) { + dprintf("Wrong PPL header signature on %s\n", + d->devname); + ret = 1; + } + +out: + free(buf); + + if (ret == 1 && map->map_state == IMSM_T_STATE_UNINITIALIZED) + return st->ss->write_init_ppl(st, info, d->fd); + + return ret; +} + +#ifndef MDASSEMBLE + +static int write_init_ppl_imsm_all(struct supertype *st, struct mdinfo *info) +{ + struct intel_super *super = st->sb; + struct dl *d; + int ret = 0; + + if (info->consistency_policy != CONSISTENCY_POLICY_PPL || + info->array.level != 5) + return 0; + + for (d = super->disks; d ; d = d->next) { + if (d->index < 0 || is_failed(&d->disk)) + continue; + + ret = st->ss->write_init_ppl(st, info, d->fd); + if (ret) + break; + } + + return ret; +} static int write_init_super_imsm(struct supertype *st) { struct intel_super *super = st->sb; int current_vol = super->current_vol; + int rv = 0; + struct mdinfo info; + + getinfo_super_imsm(st, &info, NULL); /* we are done with current_vol reset it to point st at the container */ super->current_vol = -1; @@ -5939,24 +6148,29 @@ static int write_init_super_imsm(struct supertype *st) if (st->update_tail) { /* queue the recently created array / added disk * as a metadata update */ - int rv; /* determine if we are creating a volume or adding a disk */ if (current_vol < 0) { /* in the mgmt (add/remove) disk case we are running * in mdmon context, so don't close fd's */ - return mgmt_disk(st); - } else - rv = create_array(st, current_vol); - - return rv; + rv = mgmt_disk(st); + } else { + rv = write_init_ppl_imsm_all(st, &info); + if (!rv) + rv = create_array(st, current_vol); + } } else { struct dl *d; for (d = super->disks; d; d = d->next) Kill(d->devname, NULL, 0, -1, 1); - return write_super_imsm(st, 1); + if (current_vol >= 0) + rv = write_init_ppl_imsm_all(st, &info); + if (!rv) + rv = write_super_imsm(st, 1); } + + return rv; } #endif @@ -7375,7 +7589,8 @@ static struct mdinfo *container_content_imsm(struct supertype *st, char *subarra * * FIXME handle dirty degraded */ - if ((skip || recovery_start == 0) && !dev->vol.dirty) + if ((skip || recovery_start == 0) && + !(dev->vol.dirty & RAIDVOL_DIRTY)) this->resync_start = MaxSector; if (skip) continue; @@ -7410,9 +7625,12 @@ static struct mdinfo *container_content_imsm(struct supertype *st, char *subarra info_d->component_size = num_data_stripes(map) * map->blocks_per_strip; + info_d->ppl_sector = this->ppl_sector; + info_d->ppl_size = this->ppl_size; } else { info_d->component_size = blocks_per_member(map); } + info_d->consistency_policy = this->consistency_policy; info_d->bb.supported = 1; get_volume_badblocks(super->bbm_log, ord_to_idx(ord), @@ -7928,12 +8146,16 @@ mark_checkpoint: skip_mark_checkpoint: /* mark dirty / clean */ - if (dev->vol.dirty != !consistent) { + if (((dev->vol.dirty & RAIDVOL_DIRTY) && consistent) || + (!(dev->vol.dirty & RAIDVOL_DIRTY) && !consistent)) { dprintf("imsm: mark '%s'\n", consistent ? "clean" : "dirty"); - if (consistent) - dev->vol.dirty = 0; - else - dev->vol.dirty = 1; + if (consistent) { + dev->vol.dirty = RAIDVOL_CLEAN; + } else { + dev->vol.dirty = RAIDVOL_DIRTY; + if (dev->rwh_policy == RWH_DISTRIBUTED) + dev->vol.dirty |= RAIDVOL_DSRECORD_VALID; + } super->updates_pending++; } @@ -8445,6 +8667,11 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a, di->component_size = a->info.component_size; di->container_member = inst; di->bb.supported = 1; + if (dev->rwh_policy == RWH_DISTRIBUTED) { + di->consistency_policy = CONSISTENCY_POLICY_PPL; + di->ppl_sector = get_ppl_sector(super, inst); + di->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE) >> 9; + } super->random = random32(); di->next = rv; rv = di; @@ -11600,6 +11827,9 @@ struct superswitch super_imsm = { .container_content = container_content_imsm, .validate_container = validate_container_imsm, + .write_init_ppl = write_init_ppl_imsm, + .validate_ppl = validate_ppl_imsm, + .external = 1, .name = "imsm", diff --git a/sysfs.c b/sysfs.c index 53589a76..2a91ba0a 100644 --- a/sysfs.c +++ b/sysfs.c @@ -689,6 +689,16 @@ int sysfs_set_array(struct mdinfo *info, int vers) * once the reshape completes. */ } + + if (info->consistency_policy == CONSISTENCY_POLICY_PPL) { + if (sysfs_set_str(info, NULL, "consistency_policy", + map_num(consistency_policies, + info->consistency_policy))) { + pr_err("This kernel does not support PPL\n"); + return 1; + } + } + return rv; } @@ -720,6 +730,10 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume) rv = sysfs_set_num(sra, sd, "offset", sd->data_offset); rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2); if (sra->array.level != LEVEL_CONTAINER) { + if (sd->consistency_policy == CONSISTENCY_POLICY_PPL) { + rv |= sysfs_set_num(sra, sd, "ppl_sector", sd->ppl_sector); + rv |= sysfs_set_num(sra, sd, "ppl_size", sd->ppl_size); + } if (sd->recovery_start == MaxSector) /* This can correctly fail if array isn't started, * yet, so just ignore status for now. -- 2.11.0 -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html