[PATCH v2] hfsplus: add journal replay

v1->v2 changes:
* Use WRITE_SYNC instead of WRITE to replay sectors.
* Return the more appropriate -ENOTSUPP instead of -EIO on unsupported journal
  flags.
* Updated the "Advantages over" section in the description.
* Fixed typos in comments and patch description.
* The journal_swab field has been removed from hfsplus_sb_info; a more local
  hfs_replay_data::need_swab is used instead. Suggested by Hin-Tak Leung.
* Removed a line of code that had no effect: "rd.header = rd.header;".

Add journal replay functionality. If the filesystem is mounted for read-write
and a non-empty journal is found, the journal is replayed. The default behavior
of switching to read-only mode in the presence of a journal is retained, since
the driver does not support journaled metadata modification anyway. After the
replay attempt, the mount procedure reloads the volume header to pick up its
updated state. With the journal replayed we no longer risk stumbling on an
inconsistency while working with the volume, so this functionality adds
stability without changing any existing behavior. The mount logic in super.c is
changed as conservatively as possible.
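
The resulting flow in hfsplus_fill_super() becomes, in outline (a simplified
sketch of the super.c hunk below, not the literal code):

	if (journaled && !(sb->s_flags & MS_RDONLY)) {
		sb->s_flags |= MS_RDONLY;	/* unchanged default: stay read-only */
		hfsplus_jrnl_replay(sb, jrnl_info_block, &journal_was_empty);
		/* drop the cached volume header and read it again after replay */
		goto reread_vhdr;
	}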

The replay code checks the integrity of the whole journal before actually
replaying sectors. If any error (e.g. checksum mismatch, invalid/misaligned
offsets) is detected, the volume will remain unchanged and the journal will not
be cleared.
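
In pseudocode the replay path is (the helper names here are illustrative only;
the actual steps live in hfsplus_jrnl_replay() in journal.c):

	read_journal_info();			/* placement flags, offset, size */
	read_and_verify_journal_header();	/* magic, endianness, checksum, sizes */
	if (journal_is_empty())
		return;
	while (journal_data_remains())
		read_and_verify_next_list();	/* list checksum, run bounds, data checksums */
	write_collected_sectors();		/* reached only if everything verified */
	clear_journal_header();			/* WRITE_FLUSH_FUA, marks the journal empty */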

The code supports checksums for replay data in addition to checksums for the
header and lists. Data checksums are a relatively recent feature, present on
volumes modified by modern OS X versions.
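
For reference, the checksum used for the header, the lists and (when present)
the run data is the simple byte-wise function below, exactly as implemented by
hfsplus_jrnl_checksum() in this patch:

	static u32 hfsplus_jrnl_checksum(const void *data, unsigned len)
	{
		unsigned i;
		u32 checksum = 0;
		const u8 *ptr = data;

		for (i = 0; i < len; i++, ptr++)
			checksum = (checksum << 8) ^ (checksum + *ptr);
		return ~checksum;
	}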

Tested with:
 drives with 512 and 4K sector sizes
 default (8 MB) and custom (1 MB, 500 MB) journal sizes
 big-endian (PowerPC) and little-endian (Intel) journal byte sexes

Advantages over a previously presented
   "[PATCH v4 00/15] hfsplus: introduce journal replay functionality"
   by a different author:
* 12 times faster mount with replay (0.5s versus 6s on my test volume)
* supports 4K sector devices
* no redundant stuff like journal initialization
* no fanatical use of macros with parameters
* misaligned fields are read through get_unaligned_be64()
* less intrusive changes
* shorter, easier to review

Signed-off-by: Sergei Antonov <saproj@xxxxxxxxx>
CC: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
CC: Christoph Hellwig <hch@xxxxxxxxxxxxx>
CC: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
CC: Vyacheslav Dubeyko <slava@xxxxxxxxxxx>
CC: Hin-Tak Leung <htl10@xxxxxxxxxxxxxxxxxxxxx>
Acked-by: Hin-Tak Leung <htl10@xxxxxxxxxxxxxxxxxxxxx>
---
 fs/hfsplus/Makefile      |   3 +-
 fs/hfsplus/hfsplus_fs.h  |   8 +
 fs/hfsplus/hfsplus_raw.h |  65 +++-
 fs/hfsplus/journal.c     | 909 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/hfsplus/super.c       |  30 +-
 5 files changed, 1009 insertions(+), 6 deletions(-)
 create mode 100644 fs/hfsplus/journal.c

diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile
index 683fca2..927e89c 100644
--- a/fs/hfsplus/Makefile
+++ b/fs/hfsplus/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o
 
 hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \
 		bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \
-		attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o
+		attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o \
+		journal.o
 
 hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL)	+= posix_acl.o
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 08846425b..7a45d45 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -31,6 +31,7 @@
 #define DBG_BITMAP	0x00000040
 #define DBG_ATTR_MOD	0x00000080
 #define DBG_ACL_MOD	0x00000100
+#define DBG_JOURNAL	0x00000200
 
 #if 0
 #define DBG_MASK	(DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
@@ -166,6 +167,10 @@ struct hfsplus_sb_info {
 	u32 total_blocks;
 	u32 data_clump_blocks, rsrc_clump_blocks;
 
+	/* immutable data on the placement of the journal (if there is one) */
+	u64 journal_offset;	/* relative to volume start */
+	u64 journal_size;	/* size in bytes */
+
 	/* mutable data from the volume header, protected by alloc_mutex */
 	u32 free_blocks;
 	struct mutex alloc_mutex;
@@ -516,6 +521,9 @@ int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
 		void *buf, void **data, int rw);
 
+/* journal.c */
+int hfsplus_jrnl_replay(struct super_block *, u32, bool *);
+
 /* time macros */
 #define __hfsp_mt2ut(t)		(be32_to_cpu(t) - 2082844800U)
 #define __hfsp_ut2mt(t)		(cpu_to_be32(t + 2082844800U))
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 8ffb3a8..73ba1b6 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -105,7 +105,7 @@ struct hfsplus_vh {
 	__be16 version;
 	__be32 attributes;
 	__be32 last_mount_vers;
-	u32 reserved;
+	__be32 jrnl_info_block;
 
 	__be32 create_date;
 	__be32 modify_date;
@@ -145,6 +145,69 @@ struct hfsplus_vh {
 #define HFSPLUS_VOL_JOURNALED		(1 << 13)
 #define HFSPLUS_VOL_SOFTLOCK		(1 << 15)
 
+/* Journal info */
+struct hfs_jrnl_info {
+	__be32 flags;
+	u32 unused[8];
+	__be64 journal_offset;	/* Misaligned! relative to volume start */
+	__be64 journal_size;	/* Misaligned! size in bytes */
+} __packed;
+
+/* Journal info flags */
+#define HFSPLUS_JRNL_IN_FS		1
+#define HFSPLUS_JRNL_ON_OTHER_DEVICE	2
+#define HFSPLUS_JRNL_NEED_INIT		4
+
+/* Journal header */
+struct hfs_jrnl_header {
+	u32 magic;
+	u32 endian;
+	u64 data_begin;		/* relative to journal start */
+	u64 data_end;		/* relative to journal start */
+	u64 journal_size;
+	u32 list_size;		/* size of hfs_jrnl_list */
+	u32 checksum;
+	u32 hdr_size;		/* Header size. Tells us where journal buffer
+				 * begins. Also works as journal block size. */
+	u32 last_seq_num;		/* transaction sequence number */
+};
+
+#define HFSPLUS_JRNL_HDR_MAGIC	0x4a4e4c78
+#define HFSPLUS_JRNL_HDR_ENDIAN	0x12345678
+
+/* Journal data run. Describes sequential data stored in the journal. */
+struct hfs_jrnl_run {
+	u64 first_block;	/* Block number the run begins at. Note that it
+				 * is journal blocks, not filesystem blocks.
+				 * Value ~0 means the run should be skipped. */
+	u32 byte_len;		/* Data size in bytes. Multiple of
+				 * journal block size. */
+	u32 checksum;		/* Data checksum. */
+};
+
+/*
+ * Journal list. Describes one or more journaled data runs.
+ * It is normally a single transaction.
+ */
+struct hfs_jrnl_list {
+	u16 reserved;
+	u16 count;			/* number of runs plus 1 */
+	u32 length_with_data;		/* length of the list and data */
+	u32 checksum;			/* checksum of the first 32 bytes */
+	u32 flags;			/* see possible flags below */
+	u64 reserved1;			/* unused part of 1st fake run */
+	u32 reserved2;			/* unused part of 1st fake run */
+	u32 tr_end_or_seq_num;		/* Before sequence numbers introduction:
+					   zero value means end of transaction.
+					   After sequence numbers introduction:
+					   a non-zero sequence number. */
+	struct hfs_jrnl_run runs[0];	/* number of elements is count-1 */
+};
+
+/* Journal run list flags */
+#define HFSPLUS_JRNL_LIST_RUN_CHECKSUMS		1
+#define HFSPLUS_JRNL_LIST_TRANSACTION_STARTS	2
+
 /* HFS+ BTree node descriptor */
 struct hfs_bnode_desc {
 	__be32 next;
diff --git a/fs/hfsplus/journal.c b/fs/hfsplus/journal.c
new file mode 100644
index 0000000..3490297
--- /dev/null
+++ b/fs/hfsplus/journal.c
@@ -0,0 +1,909 @@
+/*
+ *  linux/fs/hfsplus/journal.c
+ *
+ * Handling of the HFS+ journal
+ */
+
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/bug.h>
+#include <linux/log2.h>
+#include <linux/swab.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/blkdev.h>
+#include <linux/printk.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <asm/unaligned.h>
+#include <linux/interval_tree_generic.h>
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+/*
+ * An interval tree node.
+ * Holds data to replay to sequential sectors.
+ */
+struct hfs_replay_interval {
+	/* tree node data */
+	struct rb_node rb;
+	u64 subtree_last;
+
+	/* replay data */
+	u64 start, last;	/* 512 byte sectors relative to volume start */
+	void *data;		/* the bytes to write to the above sectors */
+
+	/*
+	 * Before the interval is read and checked it is marked so and
+	 * kept in a singly-linked list with head in struct hfs_replay_data.
+	 */
+	struct hfs_replay_interval *next_to_read;
+
+	/*
+	 * Works as read and check barrier. True means do all pending reads and
+	 * checks before overwriting the data with data from later transactions.
+	 */
+	bool in_read_list;
+
+	/*
+	 * Data needed to check a checksum. Valid while in the read list.
+	 * Normally checksum_ptr is equal to data, they will only be different
+	 * in a weird case of one run included in the other. HFS+ journal
+	 * format does not exclude that. See hfsplus_replay_data_add() for
+	 * a code processing intersecting runs.
+	 */
+	bool need_check_checksum;
+	u32 checksum;
+	void *checksum_ptr;
+};
+
+/* Used to read, verify and collect complete replay data from the journal */
+struct hfs_replay_data {
+	/* volume information */
+	const struct hfsplus_sb_info *sbi;
+	struct hfs_jrnl_header *header;
+	u64 volume_jrnl_blocks;
+	bool need_swab;			/* need swap bytes in journal */
+
+	/* IO stuff */
+	struct block_device *bdev;
+	struct bio *bio;		/* to keep pending read requests */
+	unsigned sectors_queued;	/* how many pending sectors to read */
+	unsigned max_vecs, max_sectors;	/* bio limits */
+
+	/* the root of an interval tree with hfs_jrnl_interval as nodes */
+	struct rb_root intervals_root;
+
+	/* head of the list of intervals to read and possibly check checksum */
+	struct hfs_replay_interval *first_to_read;
+
+	/*
+	 * The last transaction sequence number from journal list. Zero value
+	 * means we have a pre-sequence-number version of journal.
+	 */
+	u32 last_list_seq_num;
+
+	/* number of 512 byte sectors read and replayed from journal */
+	u64 sectors_read;
+	u64 sectors_replayed;
+};
+
+#define START(interval) ((interval)->start)
+#define LAST(interval) ((interval)->last)
+
+INTERVAL_TREE_DEFINE(struct hfs_replay_interval, rb, u64, subtree_last,
+		     START, LAST, static, hfsplus_interval);
+
+enum {
+	LIST_CHECKSUM_LENGTH = 32,
+	HEADER_CHECKSUM_LENGTH = 44
+};
+
+/* frees memory buffers, removes interval from the interval tree */
+static void hfsplus_jrnl_free_interval(struct rb_root *root,
+				       struct hfs_replay_interval *interval)
+{
+	kfree(interval->data);
+	hfsplus_interval_remove(interval, root);
+	kfree(interval);
+}
+
+/* to be called after we finish working with replay data */
+static void hfsplus_replay_data_free(struct hfs_replay_data *rd)
+{
+	struct hfs_replay_interval *interval;
+
+	/* free bio */
+	if (rd->bio)
+		bio_put(rd->bio);
+	rd->bio = NULL;
+
+	/* free all replay intervals */
+	while ((interval = hfsplus_interval_iter_first(
+			&rd->intervals_root, 0, U64_MAX))) {
+		hfsplus_jrnl_free_interval(&rd->intervals_root, interval);
+	}
+	rd->intervals_root = RB_ROOT;
+}
+
+static int hfsplus_jrnl_process_info(struct super_block *sb,
+				     const struct hfs_jrnl_info *info)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	const u64 volume_length = (u64)sbi->total_blocks <<
+		sbi->alloc_blksz_shift;
+
+	/*
+	 * Check the flags and proceed only if the journal is initialized
+	 * and resides on this volume.
+	 */
+	if (be32_to_cpu(info->flags) != HFSPLUS_JRNL_IN_FS) {
+		pr_err("unsupported journal info flags 0x%x\n",
+		       be32_to_cpu(info->flags));
+		return -ENOTSUPP;
+	}
+
+	/*
+	 * Extract journal placement data from unaligned fields
+	 * and store them in superblock info.
+	 */
+	sbi->journal_offset = get_unaligned_be64(&info->journal_offset);
+	sbi->journal_size = get_unaligned_be64(&info->journal_size);
+
+	/*
+	 * Validate journal placement. It is determined by
+	 * offset and size in bytes relative to the start of the volume.
+	 * These values are sane when sector-aligned and not outside the volume.
+	 */
+	if (!sbi->journal_size ||
+	    (sbi->journal_offset & (hfsplus_min_io_size(sb) - 1)) ||
+	    (sbi->journal_size & (hfsplus_min_io_size(sb) - 1)) ||
+	    sbi->journal_offset >= volume_length ||
+	    sbi->journal_size > volume_length - sbi->journal_offset) {
+		pr_err("bad journal placement: offset=0x%llx size=0x%llx\n",
+		       sbi->journal_offset, sbi->journal_size);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/* check if byte offset is inside journal buffer */
+static inline bool is_inside_buffer(const struct hfs_jrnl_header *header,
+				    u64 offset /* relative to journal start */)
+{
+	/*
+	 * The journal buffer begins right after the header,
+	 * ends at the end of the journal.
+	 */
+	return offset >= header->hdr_size && offset <= header->journal_size;
+}
+
+static u32 hfsplus_jrnl_checksum(const void *data, unsigned len)
+{
+	unsigned i;
+	u32 checksum = 0;
+	const u8 *ptr = data;
+
+	for (i = 0; i < len; i++, ptr++)
+		checksum = (checksum << 8) ^ (checksum + *ptr);
+	return ~checksum;
+}
+
+static int hfsplus_jrnl_process_header(struct super_block *sb,
+				       struct hfs_jrnl_header *header,
+				       bool *need_swab)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	u32 header_checksum, calculated_checksum;
+
+	/*
+	 * Deal with checksums.
+	 * Save the original checksum value, replace it with zero, recalculate.
+	 */
+	header_checksum = header->checksum;
+	header->checksum = 0;
+	calculated_checksum = hfsplus_jrnl_checksum(header,
+						    HEADER_CHECKSUM_LENGTH);
+
+	/*
+	 * Check magic fields, determine if journal needs byte swapping,
+	 * and convert fields accordingly.
+	 */
+	if (HFSPLUS_JRNL_HDR_MAGIC == header->magic &&
+	    HFSPLUS_JRNL_HDR_ENDIAN == header->endian) {
+		/* The journal uses the same endianness as CPU. */
+		*need_swab = false;
+	} else if (HFSPLUS_JRNL_HDR_MAGIC == swab32(header->magic) &&
+		   HFSPLUS_JRNL_HDR_ENDIAN == swab32(header->endian)) {
+		/* The journal needs byte swapping. */
+		*need_swab = true;
+		header->data_begin = swab64(header->data_begin);
+		header->data_end = swab64(header->data_end);
+		header->journal_size = swab64(header->journal_size);
+		header->list_size = swab32(header->list_size);
+		header_checksum = swab32(header_checksum);
+		header->hdr_size = swab32(header->hdr_size);
+		header->last_seq_num = swab32(header->last_seq_num);
+	} else {
+		pr_err("unrecognized journal header\n");
+		return -EIO;
+	}
+
+	/* validate fields */
+	if (calculated_checksum != header_checksum) {
+		pr_err("journal header checksum mismatch (0x%x != 0x%x)\n",
+		       calculated_checksum, header_checksum);
+		return -EIO;
+	}
+	if (header->journal_size != sbi->journal_size) {
+		pr_err("different journal sizes in journal info and header\n");
+		return -EIO;
+	}
+	if (!header->hdr_size ||
+	    header->hdr_size % hfsplus_min_io_size(sb) ||
+	    !is_power_of_2(header->hdr_size) ||
+	    header->hdr_size >= sbi->journal_size) {
+		pr_err("invalid journal header size %d\n", header->hdr_size);
+		return -EIO;
+	}
+	if (!header->list_size || header->list_size % hfsplus_min_io_size(sb)) {
+		pr_err("journal list size %d is not a multiple of minimum io size %d\n",
+		       header->list_size, hfsplus_min_io_size(sb));
+		return -EIO;
+	}
+	if (!is_inside_buffer(header, header->data_begin) ||
+	    !is_inside_buffer(header, header->data_end)) {
+		pr_err("bad journal data offsets\n");
+		return -EIO;
+	}
+
+	hfs_dbg(JOURNAL,
+		"journal header: need swap bytes = %s, last transaction sequence number 0x%x\n",
+		*need_swab ? "true" : "false",
+		header->last_seq_num);
+	return 0;
+}
+
+static u64 hfsplus_jrnl_data_length(const struct hfs_jrnl_header *header)
+{
+	u64 buffer_size;
+
+	/*
+	 * Journal data is stored in a circular journal buffer.
+	 * If data does not wrap around the end of the buffer, simply subtract.
+	 * Begin == end means zero data length; the subtraction yields that.
+	 */
+	if (header->data_begin <= header->data_end)
+		return header->data_end - header->data_begin;
+
+	/* data wraps around, so it is divided into two pieces */
+	buffer_size = header->journal_size - header->hdr_size;
+	return buffer_size - (header->data_begin - header->data_end);
+}
+
+/* executes all pending reads and checks checksums */
+static int hfsplus_jrnl_read_now(struct hfs_replay_data *rd)
+{
+	int error;
+	struct hfs_replay_interval *interval;
+
+	if (rd->bio && rd->bio->bi_vcnt) {
+		/* read synchronously */
+		hfs_dbg(JOURNAL, "submitting bio read nr_vecs %u nr_sectors %u\n",
+			rd->bio->bi_vcnt, rd->sectors_queued);
+		error = submit_bio_wait(READ, rd->bio);
+		if (error)
+			return error;
+
+		/*
+		 * We're done with the current bio.
+		 * Next call to hfsplus_jrnl_queue_read will create a new one.
+		 */
+		bio_put(rd->bio);
+		rd->bio = NULL;
+	}
+
+	/* process a list of read intervals */
+	for (interval = rd->first_to_read;
+	     interval;
+	     interval = interval->next_to_read) {
+		u32 calculated_checksum;
+
+		/* cast is ok - intervals are limited by a 32-bit run length */
+		const u32 byte_len =
+			(u32)(interval->last - interval->start + 1) << 9;
+
+		interval->in_read_list = false; /* read list will be emptied */
+
+		if (!interval->need_check_checksum)
+			continue;
+
+		/* calculate and compare checksum */
+		calculated_checksum = hfsplus_jrnl_checksum(
+			interval->checksum_ptr, byte_len);
+		if (interval->checksum != calculated_checksum) {
+			pr_err("invalid journal run 0x%llx - 0x%llx checksum (0x%x != 0x%x)\n",
+			       interval->start, interval->last,
+			       interval->checksum, calculated_checksum);
+			return -EIO;
+		}
+	}
+	rd->first_to_read = NULL;	/* empty read list */
+
+	return 0;
+}
+
+/*
+ * Adds an IO vector to read sequential bytes from the device.
+ * Does IO if bio is full.
+ *
+ * Isolates callers from bio limitations and IO vector handling.
+ */
+static int hfsplus_jrnl_queue_bio(struct hfs_replay_data *rd,
+				  char *buf, u32 len)
+{
+	int error = 0;
+	u32 len_sectors = len >> 9;
+
+	/* split too long request */
+	while (len_sectors > rd->max_sectors) {
+		const u32 split_len = rd->max_sectors << 9;
+
+		error = hfsplus_jrnl_queue_bio(rd, buf, split_len);
+		if (error)
+			return error;
+		buf += split_len;
+		len -= split_len;
+		len_sectors = len >> 9;
+	}
+
+	/* do IO if bio is full by a sector limit */
+	if (rd->max_sectors - rd->sectors_queued < len_sectors) {
+		error = hfsplus_jrnl_read_now(rd);
+		if (error)
+			return error;
+	}
+
+	/* create and initialize a bio structure if it does not exist */
+	if (!rd->bio) {
+		const sector_t sector = rd->sbi->part_start +
+			((rd->sbi->journal_offset + rd->header->data_begin) >>
+			 9);
+
+		rd->bio = bio_alloc(GFP_NOIO, rd->max_vecs);
+		if (!rd->bio)
+			return -ENOMEM;
+		rd->bio->bi_bdev = rd->bdev;
+		rd->bio->bi_vcnt = 0;
+		rd->bio->bi_iter.bi_size = 0;
+		rd->bio->bi_iter.bi_sector = sector;
+		rd->sectors_queued = 0;
+	}
+
+	/* add a new IO vector */
+	rd->bio->bi_io_vec[rd->bio->bi_vcnt].bv_page
+		= virt_to_page(buf);
+	rd->bio->bi_io_vec[rd->bio->bi_vcnt].bv_offset
+		= offset_in_page(buf);
+	rd->bio->bi_io_vec[rd->bio->bi_vcnt].bv_len = len;
+	rd->bio->bi_vcnt++;
+	rd->bio->bi_iter.bi_size += len;
+	rd->sectors_queued += len_sectors;
+
+	/* do IO if bio is full by an iovecs limit */
+	if (rd->bio->bi_vcnt == rd->bio->bi_max_vecs)
+		error = hfsplus_jrnl_read_now(rd);
+
+	return error;
+}
+
+/*
+ * Queue read request for journal data. Requests are placed at sequentially
+ * increasing offsets (header->data_begin is increased every time) so that
+ * the whole journal data from data_begin to data_end will be eventually read.
+ *
+ * Isolates callers from the buffer's circular nature and IO.
+ */
+static int hfsplus_jrnl_queue_read(struct hfs_replay_data *rd,
+				   char *buf, u32 len)
+{
+	int error = 0;
+	const u64 data_length = hfsplus_jrnl_data_length(rd->header);
+	const u64 len_to_buffer_end =
+		rd->sbi->journal_size - rd->header->data_begin;
+
+	/* make sure we can read the whole buffer */
+	if (data_length < len) {
+		pr_err("journal data ends unexpectedly: want to read 0x%x bytes, available 0x%llx bytes\n",
+		       len, data_length);
+		return -EIO;
+	}
+
+	if (len >= len_to_buffer_end) {
+		/* the requested data is split into two pieces */
+		error = hfsplus_jrnl_queue_bio(rd, buf, len_to_buffer_end);
+		buf += len_to_buffer_end;
+		len -= len_to_buffer_end;
+
+		/* finish current bio, we are jumping to another sector */
+		if (!error)
+			error = hfsplus_jrnl_read_now(rd);
+
+		/* wrap around the journal data pointer */
+		rd->header->data_begin = rd->header->hdr_size;
+	}
+
+	if (!error && len) {
+		error = hfsplus_jrnl_queue_bio(rd, buf, len);
+
+		/* advance the journal data pointer */
+		rd->header->data_begin += len;
+	}
+
+	return error;
+}
+
+/* reads data from the journal circular buffer at header->begin */
+static int hfsplus_jrnl_read_buf(struct hfs_replay_data *rd,
+				 void *buf, u32 len)
+{
+	/* queue a new read iovec */
+	int error = hfsplus_jrnl_queue_read(rd, buf, len);
+
+	/* execute all queued iovecs */
+	if (!error)
+		error = hfsplus_jrnl_read_now(rd);
+	return error;
+}
+
+/*
+ * Receives information on where the next byte_len bytes of journal data are
+ * supposed to be replayed to and what their checksum is (if available).
+ * Finds the interval the data belongs to or creates a new interval.
+ * Then queues read and check operations.
+ * byte_len is a multiple of journal block size.
+ */
+static int hfsplus_replay_data_add(struct hfs_replay_data *rd,
+				   u64 jrnl_block, u32 byte_len,
+				   bool has_checksum, u32 checksum)
+{
+	/* turn journal blocks into 512 byte bio sectors */
+	const unsigned shift = ilog2(rd->header->hdr_size) - 9;
+	const u64 start = jrnl_block << shift;
+	const u64 last = start + (byte_len >> 9) - 1;
+
+	int error;
+	struct hfs_replay_interval *interval;
+	char *read_ptr;
+
+	/* check for intersections with existing intervals */
+	interval = hfsplus_interval_iter_first(&rd->intervals_root,
+					       start, last);
+	while (interval) {
+		/*
+		 * Intersection is found. We are going to overwrite its
+		 * memory, so check the barrier.
+		 */
+		if (interval->in_read_list) {
+			error = hfsplus_jrnl_read_now(rd);
+			if (error)
+				return error;
+		}
+
+		/* new data is completely included in the found interval */
+		if (interval->start <= start && interval->last >= last) {
+			hfs_dbg(JOURNAL,
+				"adding to an existing interval 0x%llx - 0x%llx\n",
+				interval->start, interval->last);
+			read_ptr = interval->data;
+			read_ptr += (start - interval->start) << 9;
+			goto queue_and_return;
+		}
+
+		/* new data completely overrides the found interval */
+		if (interval->start >= start && interval->last <= last) {
+			hfs_dbg(JOURNAL,
+				"removing an existing interval 0x%llx - 0x%llx\n",
+				interval->start, interval->last);
+			hfsplus_jrnl_free_interval(&rd->intervals_root,
+						   interval);
+		} else if (interval->start < start) {
+			/*
+			 * New data overrides the found interval at the end.
+			 */
+			hfs_dbg(JOURNAL,
+				"left intersection with 0x%llx - 0x%llx\n",
+				interval->start, interval->last);
+
+			/* shorten interval and reinsert */
+			hfsplus_interval_remove(interval, &rd->intervals_root);
+			interval->last = start - 1;
+			hfsplus_interval_insert(interval, &rd->intervals_root);
+		} else if (interval->last > last) {
+			/*
+			 * New data overrides the found interval at the beginning.
+			 */
+			const u32 intersection_len =
+				(last - interval->start + 1) << 9;
+			hfs_dbg(JOURNAL,
+				"right intersection with 0x%llx - 0x%llx\n",
+				interval->start, interval->last);
+
+			/* shorten interval, move data and reinsert */
+			hfsplus_interval_remove(interval, &rd->intervals_root);
+			memmove(interval->data,
+				interval->data + intersection_len,
+				(interval->last - last) << 9);
+			interval->start = last + 1;
+			hfsplus_interval_insert(interval, &rd->intervals_root);
+		}
+
+		/* start searching anew after we deleted or reinserted */
+		interval = hfsplus_interval_iter_first(&rd->intervals_root,
+						       start, last);
+	}
+
+	/* allocate a new interval and its data buffer */
+	interval = kmalloc(sizeof(*interval), GFP_KERNEL);
+	if (!interval)
+		return -ENOMEM;
+	interval->data = kmalloc(byte_len, GFP_KERNEL);
+	if (!interval->data) {
+		kfree(interval);
+		return -ENOMEM;
+	}
+
+	interval->start = start;
+	interval->last = last;
+	read_ptr = interval->data;
+	hfs_dbg(JOURNAL, "inserting a new interval 0x%llx - 0x%llx\n",
+		start, last);
+	hfsplus_interval_insert(interval, &rd->intervals_root);
+
+queue_and_return:
+	error = hfsplus_jrnl_queue_read(rd, read_ptr, byte_len);
+	if (error)
+		return error;
+	interval->in_read_list = true;
+	interval->need_check_checksum = has_checksum;
+	interval->checksum = checksum;
+	interval->checksum_ptr = read_ptr;
+	interval->next_to_read = rd->first_to_read;
+	rd->first_to_read = interval;
+	return 0;
+}
+
+/*
+ * Receives one journal run, converts it, validates it,
+ * queues read and check operations for replay data found in it.
+ */
+static int hfsplus_jrnl_run(struct hfs_replay_data *rd,
+			    struct hfs_jrnl_run *run,
+			    u32 *list_data_len, bool has_checksum)
+{
+	const u64 SKIP_RUN = ~0;
+
+	/* skip the run if it is marked by all 64 bits set */
+	if (run->first_block == SKIP_RUN)
+		return 0;
+
+	/* swap bytes if needed */
+	if (rd->need_swab) {
+		run->first_block = swab64(run->first_block);
+		run->byte_len = swab32(run->byte_len);
+		run->checksum = swab32(run->checksum);
+	}
+
+	/*
+	 * Validate length: it has to be a multiple of journal block size,
+	 * it cannot go beyond data length specified in the list.
+	 */
+	if (run->byte_len % rd->header->hdr_size) {
+		pr_err("journal run length 0x%x is not a multiple of journal block size 0x%x\n",
+		       run->byte_len, rd->header->hdr_size);
+		return -EIO;
+	}
+	if (!run->byte_len || run->byte_len > *list_data_len) {
+		pr_err("journal run length 0x%x is out of bounds\n",
+		       run->byte_len);
+		return -EIO;
+	}
+
+	/* it has to be inside volume bounds of course */
+	if (run->first_block >= rd->volume_jrnl_blocks ||
+	    rd->volume_jrnl_blocks - run->first_block <
+	    run->byte_len / rd->header->hdr_size) {
+		pr_err("invalid journal run block (0x%llx, %x)\n",
+		       run->first_block, run->byte_len);
+		return -EIO;
+	}
+
+	*list_data_len -= run->byte_len;
+	rd->sectors_read += run->byte_len >> 9;
+	return hfsplus_replay_data_add(rd, run->first_block, run->byte_len,
+				       has_checksum, run->checksum);
+}
+
+/* used during journal list validation */
+static inline unsigned max_list_count(const struct hfs_jrnl_header *header)
+{
+	const unsigned space_for_runs = header->list_size -
+		offsetof(struct hfs_jrnl_list, runs);
+
+	/* Remember to add 1 for one fake run. See structure definition. */
+	return space_for_runs / sizeof(struct hfs_jrnl_run) + 1;
+}
+
+/*
+ * Receives one journal list, converts it, validates it,
+ * queues read and check operations for replay data found in it.
+ */
+static int hfsplus_jrnl_list(struct hfs_replay_data *rd,
+			     struct hfs_jrnl_list *list)
+{
+	int error, i;
+	u32 list_checksum, calculated_checksum;
+	bool transaction_starts, run_checksums;
+	u32 list_data_len;
+
+	/*
+	 * Deal with checksums.
+	 * Save the original checksum value, replace it with zero, recalculate.
+	 */
+	list_checksum = list->checksum;
+	list->checksum = 0;
+	calculated_checksum = hfsplus_jrnl_checksum(list, LIST_CHECKSUM_LENGTH);
+
+	/* swap bytes if needed */
+	if (rd->need_swab) {
+		list->count = swab16(list->count);
+		list->length_with_data = swab32(list->length_with_data);
+		list_checksum = swab32(list_checksum);
+		list->flags = swab32(list->flags);
+		list->tr_end_or_seq_num = swab32(list->tr_end_or_seq_num);
+	}
+
+	/* validate fields */
+	if (calculated_checksum != list_checksum) {
+		pr_err("journal list checksum mismatch (0x%x != 0x%x)\n",
+		       calculated_checksum, list_checksum);
+		return -EIO;
+	}
+	if (!list->count || list->count > max_list_count(rd->header)) {
+		pr_err("journal list is invalid: count=0x%x\n", list->count);
+		return -EIO;
+	}
+	if (list->length_with_data < rd->header->list_size) {
+		pr_err("invalid journal list length (0x%x < 0x%x)\n",
+		       list->length_with_data, rd->header->list_size);
+		return -EIO;
+	}
+	list_data_len = list->length_with_data - rd->header->list_size;
+	if (list_data_len > hfsplus_jrnl_data_length(rd->header)) {
+		pr_err("journal list data length is longer than remaining data\n");
+		return -EIO;
+	}
+	if (list_data_len % rd->header->hdr_size) {
+		pr_err("journal list data length 0x%x is not a multiple of journal block size 0x%x\n",
+		       list_data_len, rd->header->hdr_size);
+		return  -EIO;
+	}
+
+	rd->last_list_seq_num = list->tr_end_or_seq_num;
+	list->count--;
+	transaction_starts =
+		!!(list->flags & HFSPLUS_JRNL_LIST_TRANSACTION_STARTS);
+	run_checksums = !!(list->flags & HFSPLUS_JRNL_LIST_RUN_CHECKSUMS);
+
+	hfs_dbg(JOURNAL, "processing journal list with %u runs, 0x%x bytes of data, trstart = %d, run checksums = %d, seq_num=0x%x\n",
+		list->count, list_data_len,
+		transaction_starts, run_checksums,
+		list->tr_end_or_seq_num);
+
+	/* iterate over runs */
+	for (i = 0, error = 0; !error && i < list->count; i++) {
+		error = hfsplus_jrnl_run(rd, list->runs+i, &list_data_len,
+					 run_checksums);
+	}
+
+	/* if all went right we have zero unprocessed data left */
+	if (!error && list_data_len) {
+		pr_err("journal list data length exceeds a sum of run lengths in the list\n");
+		error = -EIO;
+	}
+
+	return error;
+}
+
+/* iterates over all journal lists, collects and checks replay data */
+static int hfsplus_jrnl_read_lists(struct hfs_replay_data *rd)
+{
+	int error;
+	void *buf;
+
+	buf = kmalloc(rd->header->list_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/* iterate over all journal lists from older transactions to newer */
+	do {
+		error = hfsplus_jrnl_read_buf(rd, buf, rd->header->list_size);
+		if (!error)
+			error = hfsplus_jrnl_list(rd, buf);
+	} while (!error && hfsplus_jrnl_data_length(rd->header));
+
+	/* do all pending reads and checks */
+	if (!error)
+		error = hfsplus_jrnl_read_now(rd);
+
+	hfs_dbg(JOURNAL,
+		"last seq num in journal data 0x%x, last seq num in header 0x%x\n",
+		rd->last_list_seq_num, rd->header->last_seq_num);
+
+	kfree(buf);
+	return error;
+}
+
+/* replay sectors read from journal */
+static int hfsplus_jrnl_write_intervals(struct hfs_replay_data *rd)
+{
+	int error = 0;
+	struct hfs_replay_interval *interval;
+
+	for (interval = hfsplus_interval_iter_first(&rd->intervals_root,
+						    0, U64_MAX);
+	     interval && !error;
+	     interval = hfsplus_interval_iter_next(interval, 0, U64_MAX)) {
+		struct bio *bio;
+
+		/* cast is ok - intervals are limited by a 32-bit run length */
+		const u32 len_sectors =
+			(u32)(interval->last - interval->start + 1);
+
+		hfs_dbg(JOURNAL, "replaying 0x%llx - 0x%llx\n",
+			interval->start, interval->last);
+
+		bio = bio_alloc(GFP_NOIO, 1);
+		if (!bio) {
+			error = -ENOMEM;
+			break;
+		}
+		bio->bi_bdev = rd->bdev;
+		bio->bi_vcnt = 1;
+		bio->bi_iter.bi_size = len_sectors << 9;
+		bio->bi_iter.bi_sector = rd->sbi->part_start + interval->start;
+		bio->bi_io_vec[0].bv_page = virt_to_page(interval->data);
+		bio->bi_io_vec[0].bv_offset = offset_in_page(interval->data);
+		bio->bi_io_vec[0].bv_len = len_sectors << 9;
+		error = submit_bio_wait(WRITE_SYNC, bio);
+		if (error)
+			pr_err("journal replay write error %d\n", error);
+		else
+			rd->sectors_replayed += len_sectors;
+		bio_put(bio);
+	}
+
+	return error;
+}
+
+/*
+ * After writing all sectors to the volume this function updates
+ * journal header. The header will be written in CPU endianness.
+ * This function assumes header->data_begin == header->data_end.
+ * Write mode is WRITE_FLUSH_FUA to flush previous writes first.
+ */
+static int hfsplus_jrnl_clear_header(struct super_block *sb,
+				     struct hfs_jrnl_header *header,
+				     sector_t journal_hdr_sector)
+{
+	int error;
+
+	header->magic = HFSPLUS_JRNL_HDR_MAGIC;
+	header->endian = HFSPLUS_JRNL_HDR_ENDIAN;
+	header->checksum = 0;
+	header->checksum =
+		hfsplus_jrnl_checksum(header, HEADER_CHECKSUM_LENGTH);
+	error = hfsplus_submit_bio(sb, journal_hdr_sector,
+				   header, 0, WRITE_FLUSH_FUA);
+	if (error)
+		pr_err("journal header write error %d\n", error);
+	return error;
+}
+
+/*
+ * This function is called during mount operation.
+ * Finds a journal, replays it if it is not empty.
+ */
+int hfsplus_jrnl_replay(struct super_block *sb, u32 jrnl_info_block,
+			bool *journal_was_empty)
+{
+	int error;
+	const struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	const unsigned long block_shift =
+		sbi->alloc_blksz_shift - HFSPLUS_SECTOR_SHIFT;
+	struct hfs_jrnl_info *info;
+	void *buf;
+	sector_t journal_hdr_sector;
+	struct hfs_replay_data rd = {};
+
+	buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/* step 1: journal info */
+	error = hfsplus_submit_bio(sb, sbi->part_start +
+				   ((sector_t)jrnl_info_block << block_shift),
+				   buf, (void **)&info, READ);
+	if (error) {
+		pr_err("journal info read error %d\n", error);
+		goto out;
+	}
+	error = hfsplus_jrnl_process_info(sb, info);
+	if (error)
+		goto out;
+	journal_hdr_sector = sbi->part_start +
+		(sbi->journal_offset >> HFSPLUS_SECTOR_SHIFT);
+
+	/* step 2: journal header */
+	error = hfsplus_submit_bio(sb, journal_hdr_sector,
+				   buf, (void **)&rd.header, READ);
+	if (error) {
+		pr_err("journal header read error %d\n", error);
+		goto out;
+	}
+	error = hfsplus_jrnl_process_header(sb, rd.header, &rd.need_swab);
+	if (error)
+		goto out;
+
+	*journal_was_empty = rd.header->data_begin == rd.header->data_end;
+	if (*journal_was_empty) {
+		hfs_dbg(JOURNAL, "journal is empty\n");
+		goto out;
+	}
+
+	/*
+	 * Initialize replay data structure.
+	 * It will change the header to advance data pointer as it reads data.
+	 * It needs to know the number of journal blocks on the volume.
+	 * It needs to know bio limitations.
+	 */
+	rd.sbi = sbi;
+	rd.volume_jrnl_blocks =
+		((u64)sbi->total_blocks << sbi->alloc_blksz_shift) >>
+		ilog2(rd.header->hdr_size);
+	rd.bdev = sb->s_bdev;
+	/* get max nr_vecs, limiting at 100 in case it is very generous */
+	rd.max_vecs = min_t(unsigned, 100, bio_get_nr_vecs(sb->s_bdev));
+	if (!rd.max_vecs) {
+		error = -EIO;
+		goto out;
+	}
+	rd.max_sectors = queue_max_sectors(bdev_get_queue(sb->s_bdev));
+	rd.intervals_root = RB_ROOT;
+
+	/* step 3: journal lists */
+	error = hfsplus_jrnl_read_lists(&rd);
+	if (error)
+		goto out;
+
+	/* step 4: write transactions to the volume */
+	error = hfsplus_jrnl_write_intervals(&rd);
+	if (error)
+		goto out;
+
+	/* step 5: flush written data and update journal header */
+	error = hfsplus_jrnl_clear_header(sb, rd.header, journal_hdr_sector);
+	if (error)
+		goto out;
+
+	hfs_dbg(JOURNAL, "%llu sectors have been read from journal, %llu sectors have been written to volume\n",
+		rd.sectors_read, rd.sectors_replayed);
+	pr_info("journal has been replayed\n");
+out:
+	hfsplus_replay_data_free(&rd);
+	kfree(buf);
+	return error;
+}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 80875aa..84b832e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -403,6 +403,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_unload_nls;
 	}
 
+reread_vhdr:
 	/* Grab the volume header */
 	if (hfsplus_read_wrapper(sb)) {
 		if (!silent)
@@ -447,6 +448,31 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op = &hfsplus_sops;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
+	if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
+	    !(sb->s_flags & MS_RDONLY)) {
+		bool journal_was_empty;
+		int replay_error;
+
+		pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n");
+		sb->s_flags |= MS_RDONLY;
+
+		replay_error = hfsplus_jrnl_replay(sb,
+			be32_to_cpu(vhdr->jrnl_info_block), &journal_was_empty);
+		if (replay_error)
+			pr_err("journal replay error %d\n", replay_error);
+
+		/*
+	 * Reread the volume header after replay, whether or not it
+	 * succeeded. We have just set the read-only flag, so no further
+	 * replay attempts will happen.
+		 */
+		kfree(sbi->s_vhdr_buf);
+		kfree(sbi->s_backup_vhdr_buf);
+		sbi->s_vhdr_buf = NULL;
+		sbi->s_backup_vhdr_buf = NULL;
+		goto reread_vhdr;
+	}
+
 	if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
 		pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended.  mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
@@ -455,10 +481,6 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 		pr_warn("Filesystem is marked locked, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
-	} else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
-			!(sb->s_flags & MS_RDONLY)) {
-		pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n");
-		sb->s_flags |= MS_RDONLY;
 	}
 
 	err = -EINVAL;
-- 
1.9.0
