[PATCH 3/6] Enable create array with write journal (--write-journal DEVICE).

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Specify the write journal device with --write-journal DEVICE:

./mdadm --create -f /dev/md0 --assume-clean -c 32 --raid-devices=4 --level=5 /dev/sd[c-f] --write-journal /dev/sdb1
mdadm: Defaulting to version 1.2 metadata
mdadm: array /dev/md0 started.

Only one journal device is allowed. If multiple --write-journal options
are given, mdadm will use the first one and ignore the others.

./mdadm --create -f /dev/md0 --assume-clean -c 32 --raid-devices=4 --level=5 /dev/sd[c-f] --write-journal /dev/sdb1 --write-journal /dev/sdx
mdadm: Please specify only one journal device for the array.
mdadm: Ignoring --write-journal /dev/sdx...
mdadm: Defaulting to version 1.2 metadata
mdadm: array /dev/md0 started.

Signed-off-by: Shaohua Li <shli@xxxxxx>
Signed-off-by: Song Liu <songliubraving@xxxxxx>
---
 Create.c | 20 ++++++++++++++------
 ReadMe.c |  1 +
 md_p.h   | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mdadm.c  | 24 ++++++++++++++++++++++++
 mdadm.h  |  2 ++
 super1.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 161 insertions(+), 7 deletions(-)

diff --git a/Create.c b/Create.c
index b62d8d4..6a2b307 100644
--- a/Create.c
+++ b/Create.c
@@ -87,7 +87,7 @@ int Create(struct supertype *st, char *mddev,
 	unsigned long long minsize=0, maxsize=0;
 	char *mindisc = NULL;
 	char *maxdisc = NULL;
-	int dnum;
+	int dnum, raid_disk_num;
 	struct mddev_dev *dv;
 	int fail=0, warn=0;
 	struct stat stb;
@@ -180,11 +180,11 @@ int Create(struct supertype *st, char *mddev,
 		pr_err("This metadata type does not support spare disks at create time\n");
 		return 1;
 	}
-	if (subdevs > s->raiddisks+s->sparedisks) {
+	if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) {
 		pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks);
 		return 1;
 	}
-	if (!have_container && subdevs < s->raiddisks+s->sparedisks) {
+	if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) {
 		pr_err("You haven't given enough devices (real or missing) to create this array\n");
 		return 1;
 	}
@@ -397,6 +397,9 @@ int Create(struct supertype *st, char *mddev,
 			}
 		}
 
+		if (dv->disposition == 'j')
+			continue;  /* skip write journal for size check */
+
 		freesize /= 2; /* convert to K */
 		if (s->chunk && s->chunk != UnSet) {
 			/* round to chunk size */
@@ -837,7 +840,7 @@ int Create(struct supertype *st, char *mddev,
 	for (pass=1; pass <=2 ; pass++) {
 		struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
 
-		for (dnum=0, dv = devlist ; dv ;
+		for (dnum=0, raid_disk_num=0, dv = devlist ; dv ;
 		     dv=(dv->next)?(dv->next):moved_disk, dnum++) {
 			int fd;
 			struct stat stb;
@@ -862,8 +865,13 @@ int Create(struct supertype *st, char *mddev,
 				*inf = info;
 
 				inf->disk.number = dnum;
-				inf->disk.raid_disk = dnum;
-				if (inf->disk.raid_disk < s->raiddisks)
+				inf->disk.raid_disk = raid_disk_num++;
+
+				if (dv->disposition == 'j') {
+					inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL;
+					inf->disk.state = (1<<MD_DISK_JOURNAL);
+					raid_disk_num--;
+				} else if (inf->disk.raid_disk < s->raiddisks)
 					inf->disk.state = (1<<MD_DISK_ACTIVE) |
 						(1<<MD_DISK_SYNC);
 				else
diff --git a/ReadMe.c b/ReadMe.c
index c242319..10921e3 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -142,6 +142,7 @@ struct option long_options[] = {
     {"data-offset",1, 0, DataOffset},
     {"nodes",1, 0, Nodes}, /* also for --assemble */
     {"home-cluster",1, 0, ClusterName},
+    {"write-journal",1, 0, WriteJournal},
 
     /* For assemble */
     {"uuid",      1, 0, 'u'},
diff --git a/md_p.h b/md_p.h
index fae73ba..0d691fb 100644
--- a/md_p.h
+++ b/md_p.h
@@ -208,4 +208,62 @@ static inline __u64 md_event(mdp_super_t *sb) {
 	return (ev<<32)| sb->events_lo;
 }
 
+struct r5l_payload_header {	/* leading member of r5l_payload_data_parity and r5l_payload_flush */
+	__u16 type;	/* NOTE(review): presumably an enum r5l_payload_type value - confirm */
+	__u16 flags;	/* NOTE(review): presumably one of the *_flag enums - confirm */
+} __attribute__ ((__packed__));
+
+enum r5l_payload_type {	/* NOTE(review): presumably the values of r5l_payload_header.type - confirm */
+	R5LOG_PAYLOAD_DATA = 0,
+	R5LOG_PAYLOAD_PARITY = 1,
+	R5LOG_PAYLOAD_FLUSH = 2,
+};
+
+struct r5l_payload_data_parity {	/* packed record: header, size, location, then checksums */
+	struct r5l_payload_header header;
+	__u32 size; /* in sectors: data/parity size; each 4k has a checksum */
+	__u64 location; /* in sectors: for data it is the raid sector, for
+				parity it is the stripe sector */
+	__u32 checksum[]; /* flexible array; one entry per 4k according to the size note above */
+} __attribute__ ((__packed__));
+
+enum r5l_payload_data_parity_flag {
+	R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */
+	/*
+	 * RESHAPED/RESHAPING is only set when there is reshape activity. Note,
+	 * both data/parity of a stripe should have the same flag set
+	 *
+	 * RESHAPED: reshape is running, and this stripe finished reshape
+	 * RESHAPING: reshape is running, and this stripe isn't reshaped
+	 */
+	R5LOG_PAYLOAD_FLAG_RESHAPED = 2,
+	R5LOG_PAYLOAD_FLAG_RESHAPING = 3, /* NOTE(review): 3 == DISCARD|RESHAPED if these are bit flags - confirm */
+};
+
+struct r5l_payload_flush {	/* packed record: header, size, then the flush_stripes array */
+	struct r5l_payload_header header;
+	__u32 size; /* size of flush_stripes[], in bytes */
+	__u64 flush_stripes[]; /* flexible array; its byte length is given by size */
+} __attribute__ ((__packed__));
+
+enum r5l_payload_flush_flag {	/* NOTE(review): presumably header.flags values for flush payloads - confirm */
+	R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */
+};
+
+struct r5l_meta_block {	/* packed header of a journal meta block; payloads[] trails it */
+	__u32 magic;	/* R5LOG_MAGIC */
+	__u32 checksum;	/* NOTE(review): writer must agree with the kernel on the CRC variant - confirm */
+	__u8 version;	/* R5LOG_VERSION */
+	__u8 __zero_pading_1;	/* sic ("pading"); identifier left unchanged */
+	__u16 __zero_pading_2;
+	__u32 meta_size; /* whole size of the block */
+
+	__u64 seq;
+	__u64 position; /* in sectors, counted from rdev->data_offset: current position */
+	struct r5l_payload_header payloads[]; /* flexible array member */
+} __attribute__ ((__packed__));
+
+#define R5LOG_VERSION 0x1	/* value for r5l_meta_block.version */
+#define R5LOG_MAGIC 0x6433c509	/* value for r5l_meta_block.magic */
+
 #endif
diff --git a/mdadm.c b/mdadm.c
index 5d5a1b8..412c03c 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -74,6 +74,7 @@ int main(int argc, char *argv[])
 		.require_homehost = 1,
 	};
 	struct shape s = {
+		.journaldisks	= 0,
 		.level		= UnSet,
 		.layout		= UnSet,
 		.bitmap_chunk	= UnSet,
@@ -1170,6 +1171,24 @@ int main(int argc, char *argv[])
 		case O(INCREMENTAL, IncrementalPath):
 			remove_path = optarg;
 			continue;
+		case O(CREATE, WriteJournal):
+			if (s.journaldisks) {
+				pr_err("Please specify only one journal device for the array.\n");
+				pr_err("Ignoring --write-journal %s...\n", optarg);
+				continue;
+			}
+			dv = xmalloc(sizeof(*dv));
+			dv->devname = optarg;
+			dv->disposition = 'j';  /* WriteJournal */
+			dv->writemostly = writemostly;
+			dv->used = 0;
+			dv->next = NULL;
+			*devlistend = dv;
+			devlistend = &dv->next;
+			devs_found++;
+
+			s.journaldisks = 1;
+			continue;
 		}
 		/* We have now processed all the valid options. Anything else is
 		 * an error
@@ -1197,6 +1216,11 @@ int main(int argc, char *argv[])
 		exit(0);
 	}
 
+	if (s.journaldisks && (s.level < 4 || s.level > 6)) {
+		pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
+		exit(2);
+	}
+
 	if (!mode && devs_found) {
 		mode = MISC;
 		devmode = 'Q';
diff --git a/mdadm.h b/mdadm.h
index 6bdaa37..3cc1532 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -347,6 +347,7 @@ enum special_options {
 	Nodes,
 	ClusterName,
 	ClusterConfirm,
+	WriteJournal,
 };
 
 enum prefix_standard {
@@ -434,6 +435,7 @@ struct context {
 struct shape {
 	int	raiddisks;
 	int	sparedisks;
+	int	journaldisks;
 	int	level;
 	int	layout;
 	char	*layout_str;
diff --git a/super1.c b/super1.c
index 4558783..799c86c 100644
--- a/super1.c
+++ b/super1.c
@@ -68,7 +68,10 @@ struct mdp_superblock_1 {
 	__u64	data_offset;	/* sector start of data, often 0 */
 	__u64	data_size;	/* sectors in this device that can be used for data */
 	__u64	super_offset;	/* sector start of this superblock */
-	__u64	recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
+	union {
+		__u64	recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
+		__u64	journal_tail;/* journal tail of journal device (from data_offset) */
+	};
 	__u32	dev_number;	/* permanent identifier of this  device - not role in raid */
 	__u32	cnt_corrected_read; /* number of read errors that were corrected by re-writing */
 	__u8	device_uuid[16]; /* user-space setable, ignored by kernel */
@@ -1568,6 +1571,57 @@ static unsigned long choose_bm_space(unsigned long devsize)
 
 static void free_super1(struct supertype *st);
 
+#define META_BLOCK_SIZE 4096	/* bytes: allocation, checksum and write size of the meta block */
+unsigned long crc32(	/* prototype only; NOTE(review): consider declaring this in a shared header */
+	unsigned long crc,
+	const unsigned char *buf,
+	unsigned len);
+
+/*
+ * Write an empty, checksummed r5l meta block at the device's data_offset
+ * (taken from st->sb) through fd.  Returns 0 on success, 1 on failure.
+ */
+static int write_empty_r5l_meta_block(struct supertype *st, int fd)
+{
+	struct r5l_meta_block *mb;
+	struct mdp_superblock_1 *sb = st->sb;
+	struct align_fd afd;
+	__u32 crc;
+	int rv = 1;
+
+	init_afd(&afd, fd);
+
+	if (posix_memalign((void**)&mb, 4096, META_BLOCK_SIZE) != 0) {
+		pr_err("Could not allocate memory for the meta block.\n");
+		return 1;
+	}
+	memset(mb, 0, META_BLOCK_SIZE);
+
+	mb->magic = __cpu_to_le32(R5LOG_MAGIC);
+	mb->version = R5LOG_VERSION;
+	mb->meta_size = __cpu_to_le32(sizeof(struct r5l_meta_block));
+	mb->seq = __cpu_to_le64(random32());
+	mb->position = __cpu_to_le64(0);
+
+	/* CRC runs over set_uuid, then the whole block with checksum still 0 */
+	crc = crc32(0xffffffff, sb->set_uuid, sizeof(sb->set_uuid));
+	crc = crc32(crc, (void *)mb, META_BLOCK_SIZE);
+	mb->checksum = __cpu_to_le32(crc);
+
+	/* superblock fields are little-endian; convert before scaling to bytes */
+	if (lseek64(fd, __le64_to_cpu(sb->data_offset) * 512, 0) < 0LL)
+		pr_err("cannot seek to offset of the meta block\n");
+	else if (awrite(&afd, mb, META_BLOCK_SIZE) != META_BLOCK_SIZE)
+		pr_err("failed to store write the meta block \n");
+	else {
+		fsync(fd);
+		rv = 0;
+	}
+
+	free(mb);
+	return rv;
+}
+
 #ifndef MDASSEMBLE
 static int write_init_super1(struct supertype *st)
 {
@@ -1724,6 +1778,13 @@ static int write_init_super1(struct supertype *st)
 
 		sb->sb_csum = calc_sb_1_csum(sb);
 		rv = store_super1(st, di->fd);
+
+		if (rv == 0 && (di->disk.state & (1 << MD_DISK_JOURNAL))) {
+			rv = write_empty_r5l_meta_block(st, di->fd);
+			if (rv)
+				goto error_out;
+		}
+
 		if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
 			rv = st->ss->write_bitmap(st, di->fd, NoUpdate);
 		close(di->fd);
-- 
1.8.1

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux