[PATCH] md - 12 of 12 - Add new superblock format for md

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



### Comments for ChangeSet

Superblock format '1' resolves a number of issues with
superblock format '0'.
It is more dense and can support many more sub-devices.
It does not contains un-needed redundancy.
It adds a few new useful fields

 ----------- Diffstat output ------------
 ./drivers/md/md.c           |  213 +++++++++++++++++++++++++++++++++++++++++++-
 ./include/linux/raid/md_p.h |   53 ++++++++++
 2 files changed, 264 insertions(+), 2 deletions(-)

diff ./drivers/md/md.c~current~ ./drivers/md/md.c
--- ./drivers/md/md.c~current~	2003-03-12 13:46:37.000000000 +1100
+++ ./drivers/md/md.c	2003-03-12 13:46:39.000000000 +1100
@@ -763,6 +763,210 @@ static void super_90_sync(mddev_t *mddev
 	sb->sb_csum = calc_sb_csum(sb);
 }
 
+/*
+ * version 1 superblock
+ */
+
+static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+{
+	unsigned int disk_csum, csum;
+	int size = 256 + sb->max_dev*2;
+
+	disk_csum = sb->sb_csum;
+	sb->sb_csum = 0;
+	csum = csum_partial((void *)sb, size, 0);
+	sb->sb_csum = disk_csum;
+	return csum;
+}
+
+static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+	struct mdp_superblock_1 *sb;
+	int ret;
+	sector_t sb_offset;
+
+	/*
+	 * Calculate the position of the superblock.
+	 * It is always aligned to a 4K boundary and
+	 * depeding on minor_version, it can be:
+	 * 0: At least 8K, but less than 12K, from end of device
+	 * 1: At start of device
+	 * 2: 4K from start of device.
+	 */
+	switch(minor_version) {
+	case 0:
+		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+		sb_offset -= 8*2;
+		sb_offset &= ~(4*2);
+		/* convert from sectors to K */
+		sb_offset /= 2;
+		break;
+	case 1:
+		sb_offset = 0;
+		break;
+	case 2:
+		sb_offset = 4;
+		break;
+	default:
+		return -EINVAL;
+	}
+	rdev->sb_offset = sb_offset;
+
+	ret = read_disk_sb(rdev);
+	if (ret) return ret;
+
+
+	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+	    sb->major_version != cpu_to_le32(1) ||
+	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+	    sb->feature_map != 0)
+		return -EINVAL;
+
+	if (calc_sb_1_csum(sb) != sb->sb_csum) {
+		printk(BAD_CSUM, bdev_partition_name(rdev->bdev));
+		return -EINVAL;
+	}
+	rdev->preferred_minor = 0xffff;
+	rdev->data_offset = le64_to_cpu(sb->data_offset);
+
+	if (refdev == 0)
+		return 1;
+	else {
+		__u64 ev1, ev2;
+		struct mdp_superblock_1 *refsb = 
+			(struct mdp_superblock_1*)page_address(refdev->sb_page);
+
+		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+		    sb->level != refsb->level ||
+		    sb->layout != refsb->layout ||
+		    sb->chunksize != refsb->chunksize) {
+			printk(KERN_WARNING "md: %s has strangely different superblock to %s\n",
+			       bdev_partition_name(rdev->bdev),
+			       bdev_partition_name(refdev->bdev));
+			return -EINVAL;
+		}
+		ev1 = le64_to_cpu(sb->events);
+		ev2 = le64_to_cpu(refsb->events);
+
+		if (ev1 > ev2)
+			return 1;
+	}
+	if (minor_version) 
+		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+	else
+		rdev->size = rdev->sb_offset;
+	if (rdev->size < le64_to_cpu(sb->data_size)/2)
+		return -EINVAL;
+	rdev->size = le64_to_cpu(sb->data_size)/2;
+	if (le32_to_cpu(sb->chunksize))
+		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+	return 0;
+}
+
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	if (mddev->raid_disks == 0) {
+		mddev->major_version = 1;
+		mddev->minor_version = 0;
+		mddev->patch_version = 0;
+		mddev->persistent = 1;
+		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+		mddev->level = le32_to_cpu(sb->level);
+		mddev->layout = le32_to_cpu(sb->layout);
+		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+		mddev->size = (u32)le64_to_cpu(sb->size);
+		mddev->events = le64_to_cpu(sb->events);
+		
+		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+		memcpy(mddev->uuid, sb->set_uuid, 16);
+
+		mddev->max_disks =  (4096-256)/2;
+	} else {
+		__u64 ev1;
+		ev1 = le64_to_cpu(sb->events);
+		++ev1;
+		if (ev1 < mddev->events)
+			return -EINVAL;
+	}
+
+	if (mddev->level != LEVEL_MULTIPATH) {
+		int role;
+		rdev->desc_nr = le32_to_cpu(sb->dev_number);
+		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+		switch(role) {
+		case 0xffff: /* spare */
+			rdev->in_sync = 0;
+			rdev->faulty = 0;
+			rdev->raid_disk = -1;
+			break;
+		case 0xfffe: /* faulty */
+			rdev->in_sync = 0;
+			rdev->faulty = 1;
+			rdev->raid_disk = -1;
+			break;
+		default:
+			rdev->in_sync = 1;
+			rdev->faulty = 0;
+			rdev->raid_disk = role;
+			break;
+		}
+	}
+	return 0;
+}
+
+static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct mdp_superblock_1 *sb;
+	struct list_head *tmp;
+	mdk_rdev_t *rdev2;
+	int max_dev, i;
+	/* make rdev->sb match mddev and rdev data. */
+
+	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+	sb->feature_map = 0;
+	sb->pad0 = 0;
+	memset(sb->pad1, 0, sizeof(sb->pad1));
+	memset(sb->pad2, 0, sizeof(sb->pad2));
+	memset(sb->pad3, 0, sizeof(sb->pad3));
+
+	sb->utime = cpu_to_le64((__u64)mddev->utime);
+	sb->events = cpu_to_le64(mddev->events);
+	if (mddev->in_sync)
+		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+	else
+		sb->resync_offset = cpu_to_le64(0);
+
+	max_dev = 0;
+	ITERATE_RDEV(mddev,rdev2,tmp)
+		if (rdev2->desc_nr > max_dev)
+			max_dev = rdev2->desc_nr;
+	
+	sb->max_dev = max_dev;
+	for (i=0; i<max_dev;i++)
+		sb->dev_roles[max_dev] = cpu_to_le16(0xfffe);
+	
+	ITERATE_RDEV(mddev,rdev2,tmp) {
+		i = rdev2->desc_nr;
+		if (rdev2->faulty)
+			sb->dev_roles[i] = cpu_to_le16(0xfffe);
+		else if (rdev2->in_sync)
+			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+		else
+			sb->dev_roles[i] = cpu_to_le16(0xffff);
+	}
+
+	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
+}
+
+
 struct super_type super_types[] = {
 	[0] = {
 		.name	= "0.90.0",
@@ -771,9 +975,14 @@ struct super_type super_types[] = {
 		.validate_super	= super_90_validate,
 		.sync_super	= super_90_sync,
 	},
+	[1] = {
+		.name	= "md-1",
+		.owner	= THIS_MODULE,
+		.load_super	= super_1_load,
+		.validate_super	= super_1_validate,
+		.sync_super	= super_1_sync,
+	},
 };
-
-
 	
 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
 {

diff ./include/linux/raid/md_p.h~current~ ./include/linux/raid/md_p.h
--- ./include/linux/raid/md_p.h~current~	2003-03-12 13:46:37.000000000 +1100
+++ ./include/linux/raid/md_p.h	2003-03-12 13:46:39.000000000 +1100
@@ -173,5 +173,58 @@ static inline __u64 md_event(mdp_super_t
 	return (ev<<32)| sb->events_lo;
 }
 
+/*
+ * The version-1 superblock :
+ * All numeric fields are little-endian.
+ *
+ * total size: 256 bytes plus 2 per device.
+ *  1K allows 384 devices.
+ */
+struct mdp_superblock_1 {
+	/* constant array information - 128 bytes */
+	__u32	magic;		/* MD_SB_MAGIC: 0xa92b4efc - little endian */
+	__u32	major_version;	/* 1 */
+	__u32	feature_map;	/* 0 for now */
+	__u32	pad0;		/* always set to 0 when writing */
+
+	__u8	set_uuid[16];	/* user-space generated. */
+	char	set_name[32];	/* set and interpreted by user-space */
+
+	__u64	ctime;		/* lo 40 bits are seconds, top 24 are microseconds or 0*/
+	__u32	level;		/* -4 (multipath), -1 (linear), 0,1,4,5 */
+	__u32	layout;		/* only for raid5 currently */
+	__u64	size;		/* used size of component devices, in 512byte sectors */
+
+	__u32	chunksize;	/* in 512byte sectors */
+	__u32	raid_disks;
+	__u8	pad1[128-92];	/* set to 0 when written */
+
+	/* constant this-device information - 64 bytes */
+	__u64	data_offset;	/* sector start of data, often 0 */
+	__u64	data_size;	/* sectors in this device that can be used for data */
+	__u64	super_offset;	/* sector start of this superblock */
+	__u64	recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
+	__u32	dev_number;	/* permanent identifier of this  device - not role in raid */
+	__u32	cnt_corrected_read; /* number of read errors that were corrected by re-writing */
+	__u8	device_uuid[16]; /* user-space setable, ignored by kernel */
+	__u8	pad2[64-56];	/* set to 0 when writing */
+
+	/* array state information - 64 bytes */
+	__u64	utime;		/* 40 bits second, 24 btes microseconds */
+	__u64	events;		/* incremented when superblock updated */
+	__u64	resync_offset;	/* data before this offset (from data_offset) known to be in sync */
+	__u32	sb_csum;	/* checksum upto devs[max_dev] */
+	__u32	max_dev;	/* size of devs[] array to consider */
+	__u8	pad3[64-40];	/* set to 0 when writing */
+
+	/* device state information. Indexed by dev_number.
+	 * 2 bytes per device
+	 * Note there are no per-device state flags. State information is rolled
+	 * into the 'roles' value.  If a device is spare or faulty, then it doesn't
+	 * have a meaningful role.
+	 */
+	__u16	dev_roles[0];	/* role in array, or 0xffff for a spare, or 0xfffe for faulty */
+};
+
 #endif 
 
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux