Adds support for declare blocks, used by ext3's journal-guided resync (declared mode). A declare block is added to the journal to list blocks to be written during the current transaction. During journal replay, we perform a RAID resync of only these blocks and skip the rest of the resync. We also set the fs_raidsync flag on buffers being submitted when declared mode is active to inform MD that the filesystem is responsible for resyncing the stripe parity in the event of a system crash. Signed-off-by: Jody McIntyre <scjody@xxxxxxx> Index: linux-2.6.18-128.7.1/fs/jbd/checkpoint.c =================================================================== --- linux-2.6.18-128.7.1.orig/fs/jbd/checkpoint.c +++ linux-2.6.18-128.7.1/fs/jbd/checkpoint.c @@ -712,6 +712,8 @@ void __journal_drop_transaction(journal_ J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); + J_ASSERT(transaction->t_declare_root.rnode == NULL); + J_ASSERT(transaction->t_declare_done_root.rnode == NULL); J_ASSERT(transaction->t_sync_datalist == NULL); J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_iobuf_list == NULL); Index: linux-2.6.18-128.7.1/fs/jbd/commit.c =================================================================== --- linux-2.6.18-128.7.1.orig/fs/jbd/commit.c +++ linux-2.6.18-128.7.1/fs/jbd/commit.c @@ -372,6 +372,270 @@ static inline __u32 jbd_checksum_data(__ return checksum; } +int wait_for_descriptors(journal_t *journal, transaction_t *trans) +{ + struct journal_head *jh; + struct buffer_head *bh; + int err = 0; + +wait_for_ctlbuf: + while (trans->t_log_list != NULL) { + jh = trans->t_log_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + wait_on_buffer(bh); + goto wait_for_ctlbuf; + } + if (cond_resched()) + goto wait_for_ctlbuf; + + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_buffer_jwrite(bh); + journal_unfile_buffer(journal, jh); + 
journal_put_journal_head(jh); + __brelse(bh); /* One for getblk */ + } + + return err; +} + +struct journal_head *get_descriptor(journal_t *journal, transaction_t *trans, + int blocktype, char **tagp, int *space_left) +{ + struct journal_head *descriptor; + struct buffer_head *dbh; + journal_header_t *header; + + jbd_debug(4, "JBD: get descriptor\n"); + + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) + return NULL; + + dbh = jh2bh(descriptor); + jbd_debug(4, "JBD: got buffer %llu (%p)\n", + (unsigned long long)dbh->b_blocknr, dbh->b_data); + header = (journal_header_t *)&dbh->b_data[0]; + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(blocktype); + header->h_sequence = cpu_to_be32(trans->t_tid); + + *tagp = &dbh->b_data[sizeof(journal_header_t)]; + *space_left = dbh->b_size - sizeof(journal_header_t); + + set_buffer_jwrite(dbh); + set_buffer_dirty(dbh); + + /* Record it so that we can wait for it later */ + BUFFER_TRACE(dbh, "ph3: file as descriptor"); + journal_file_buffer(descriptor, trans, BJ_LogCtl); + + return descriptor; +} + +/* + * Write declare blocks containing a list of the data blocks that will be + * written out + */ +void journal_write_declare_blocks(journal_t *journal, + transaction_t *transaction, + int committing) +{ + struct journal_head *jh, *descriptor = NULL; + struct buffer_head *bh; + int i; + int bufs = 0; + int err; + unsigned int n; + unsigned int count = 0; + unsigned int to_write; + unsigned long nextblock = 0; + char *tagp = NULL; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + struct radix_tree_root *root; + + root = &transaction->t_declare_root; + + spin_lock(&journal->j_list_lock); + to_write = transaction->t_declare_request; + transaction->t_declare_request = 0; + spin_unlock(&journal->j_list_lock); + + if (to_write == UINT_MAX) + jbd_debug(1, "jbd: tid %d write declare request for ALL " + "blocks\n", transaction->t_tid); 
+ else + jbd_debug(1, "jbd: tid %d write declare request for %u " + "blocks\n", transaction->t_tid, to_write); +write_declare: + cond_resched(); + spin_lock(&journal->j_list_lock); + + n = radix_tree_gang_lookup(root, journal->j_declare_jhs, nextblock, 1); + while (n) { + if (!descriptor) { + J_ASSERT(bufs == 0); + + spin_unlock(&journal->j_list_lock); + + descriptor = get_descriptor(journal, transaction, + JFS_DECLARE_BLOCK, + &tagp, &space_left); + + if (!descriptor) { + journal_abort(journal, -EIO); + return; + } + + first_tag = 1; + journal->j_declare_bhs[bufs++] = jh2bh(descriptor); + + goto write_declare; + } + + jh = (struct journal_head *)journal->j_declare_jhs[0]; + bh = jh2bh(jh); + + /* refile the buffer as having been declared */ + if (!inverted_lock(journal, bh)) + goto write_declare; + __journal_unfile_buffer(jh); + __journal_file_buffer(jh, transaction, BJ_DeclareDone); + + jbd_unlock_bh_state(bh); + + /* record the block's tag in the current descriptor buffer */ + tag_flag = 0; + if (!first_tag) + tag_flag |= JFS_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *)tagp; + tag->t_blocknr = cpu_to_be32(bh->b_blocknr); + tag->t_flags = cpu_to_be32(tag_flag); + tagp += sizeof(journal_block_tag_t); + space_left -= sizeof(journal_block_tag_t); + + if (first_tag) { + memcpy(tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + count++; + + /* advance to the next journal head and buffer */ + nextblock = bh->b_blocknr + 1; + n = radix_tree_gang_lookup(root, journal->j_declare_jhs, + nextblock, 1); + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == ARRAY_SIZE(journal->j_declare_bhs) || n == 0 || + count == to_write || + space_left < sizeof(journal_block_tag_t) + 16) { + + jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + * submitting the IOs. "tag" still points to + * the last tag we set up. 
+ */ + + tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); + + spin_unlock(&journal->j_list_lock); + + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = + journal->j_declare_bhs[i]; + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + submit_bh(WRITE, bh); + } + + cond_resched(); + spin_lock(&journal->j_list_lock); + + /* force a new descriptor to be generated next time */ + descriptor = NULL; + bufs = 0; + + /* need to redo tree lookup since we lost the lock, + but that will happen after we get a new descriptor */ + } + + if (count == to_write) + break; + } + spin_unlock(&journal->j_list_lock); + + jbd_debug(2, "jbd: tid %d wrote declarations for %u blocks\n", + transaction->t_tid, count); + if (to_write == UINT_MAX) + J_ASSERT(transaction->t_declare_root.rnode == NULL); + + /* wait for the declare blocks to be written */ + err = wait_for_descriptors(journal, transaction); + + /* move the declared buffers to the sync data list */ + + root = &transaction->t_declare_done_root; + count = 0; + nextblock = 0; + +move_declare: + cond_resched(); + spin_lock(&journal->j_list_lock); + + while (n = radix_tree_gang_lookup(root, journal->j_declare_jhs, + nextblock, + ARRAY_SIZE(journal->j_declare_jhs))) { + /* loop and move the journal heads */ + for (i = 0; i < n; i++) { + jh = journal->j_declare_jhs[i]; + bh = jh2bh(jh); + + if (!inverted_lock(journal, bh)) + goto move_declare; + __journal_unfile_buffer(jh); + + if (committing) + /* set buffer dirty for writing below */ + set_buffer_dirty(bh); + else + /* set page dirty for virtual memory */ + mark_buffer_dirty(bh); + + __journal_file_buffer(jh, transaction, BJ_SyncData); + + count++; + + nextblock = bh->b_blocknr + 1; + + jbd_unlock_bh_state(bh); + + if (lock_need_resched(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); + goto move_declare; + } + } + } + spin_unlock(&journal->j_list_lock); + + jbd_debug(2, "jbd: tid %d moved %u declare 
blocks\n", + transaction->t_tid, count); +} + /* * journal_commit_transaction * @@ -389,7 +653,6 @@ void journal_commit_transaction(journal_ int err; unsigned long blocknr; char *tagp = NULL; - journal_header_t *header; journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; @@ -516,6 +779,11 @@ void journal_commit_transaction(journal_ jbd_debug (3, "JBD: commit phase 2\n"); + if (journal->j_flags & JFS_DECLARE) { + commit_transaction->t_declare_request = UINT_MAX; + journal_write_declare_blocks(journal, commit_transaction, 1); + } + /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. @@ -541,12 +809,15 @@ void journal_commit_transaction(journal_ jbd_debug(3, "JBD: commit phase 2\n"); /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. + * If we found any dirty or locked buffers, then we should have looped + * back up to the write_out_data label. If there weren't any then + * journal_clean_data_list should have wiped the list clean by now, so + * check that it is in fact empty. Also check declared mode trees - + * journal_write_declare_blocks() should have left them empty. */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); + J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(commit_transaction->t_declare_root.rnode == NULL); + J_ASSERT(commit_transaction->t_declare_done_root.rnode == NULL); jbd_debug (3, "JBD: commit phase 3\n"); @@ -595,38 +866,20 @@ void journal_commit_transaction(journal_ record the metadata buffer. 
*/ if (!descriptor) { - struct buffer_head *bh; - J_ASSERT (bufs == 0); - jbd_debug(4, "JBD: get descriptor\n"); + descriptor = get_descriptor(journal, + commit_transaction, + JFS_DESCRIPTOR_BLOCK, + &tagp, &space_left); - descriptor = journal_get_descriptor_buffer(journal); if (!descriptor) { journal_abort(journal, -EIO); continue; } - bh = jh2bh(descriptor); - jbd_debug(4, "JBD: got buffer %llu (%p)\n", - (unsigned long long)bh->b_blocknr, bh->b_data); - header = (journal_header_t *)&bh->b_data[0]; - header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); - header->h_sequence = cpu_to_be32(commit_transaction->t_tid); - - tagp = &bh->b_data[sizeof(journal_header_t)]; - space_left = bh->b_size - sizeof(journal_header_t); first_tag = 1; - set_buffer_jwrite(bh); - set_buffer_dirty(bh); - wbuf[bufs++] = bh; - - /* Record it so that we can wait for IO - completion later */ - BUFFER_TRACE(bh, "ph3: file as descriptor"); - journal_file_buffer(descriptor, commit_transaction, - BJ_LogCtl); + wbuf[bufs++] = jh2bh(descriptor); } /* Where is the buffer to be written? 
*/ @@ -825,29 +1078,7 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 5\n"); /* Here we wait for the revoke record and descriptor record buffers */ - wait_for_ctlbuf: - while (commit_transaction->t_log_list != NULL) { - struct buffer_head *bh; - - jh = commit_transaction->t_log_list->b_tprev; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - wait_on_buffer(bh); - goto wait_for_ctlbuf; - } - if (cond_resched()) - goto wait_for_ctlbuf; - - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; - - BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); - clear_buffer_jwrite(bh); - journal_unfile_buffer(journal, jh); - journal_put_journal_head(jh); - __brelse(bh); /* One for getblk */ - /* AKPM: bforget here */ - } + err = wait_for_descriptors(journal, commit_transaction); if (err) journal_abort(journal, err); @@ -903,6 +1134,8 @@ wait_for_iobuf: J_ASSERT(commit_transaction->t_iobuf_list == NULL); J_ASSERT(commit_transaction->t_shadow_list == NULL); J_ASSERT(commit_transaction->t_log_list == NULL); + J_ASSERT(commit_transaction->t_declare_root.rnode == NULL); + J_ASSERT(commit_transaction->t_declare_done_root.rnode == NULL); restart_loop: /* Index: linux-2.6.18-128.7.1/fs/jbd/journal.c =================================================================== --- linux-2.6.18-128.7.1.orig/fs/jbd/journal.c +++ linux-2.6.18-128.7.1/fs/jbd/journal.c @@ -156,6 +156,16 @@ loop: journal_commit_transaction(journal); spin_lock(&journal->j_state_lock); goto loop; + } else if (journal->j_flags & JFS_DECLARE && + (transaction == journal->j_running_transaction) && + transaction->t_declare_request) { + jbd_debug(2, "early declare\n"); + spin_unlock(&journal->j_state_lock); + journal_write_declare_blocks(journal, transaction, 0); + spin_lock(&journal->j_state_lock); + + wake_up(&journal->j_wait_declare); + goto loop; } wake_up(&journal->j_wait_done_commit); @@ -299,6 +309,8 @@ int journal_write_metadata_buffer(transa */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); + 
set_buffer_fs_raidsync(bh_in); + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* @@ -376,6 +388,9 @@ repeat: new_bh->b_blocknr = blocknr; set_buffer_mapped(new_bh); set_buffer_dirty(new_bh); + if (transaction->t_journal->j_flags & JFS_DECLARE && + transaction->t_journal->j_fs_dev == transaction->t_journal->j_dev) + set_buffer_fs_raidsync(new_bh); *jh_out = new_jh; @@ -635,6 +650,9 @@ struct journal_head *journal_get_descrip lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); set_buffer_uptodate(bh); + if (journal->j_flags & JFS_DECLARE && + journal->j_fs_dev == journal->j_dev) + set_buffer_fs_raidsync(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "return this buffer"); return journal_add_journal_head(bh); @@ -959,6 +977,7 @@ static journal_t * journal_init_common ( init_waitqueue_head(&journal->j_wait_checkpoint); init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); + init_waitqueue_head(&journal->j_wait_declare); mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); @@ -1259,6 +1278,9 @@ void journal_update_superblock(journal_t BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); + if (journal->j_flags & JFS_DECLARE && + journal->j_fs_dev == journal->j_dev) + set_buffer_fs_raidsync(bh); if (wait) sync_dirty_buffer(bh); else @@ -1292,6 +1314,8 @@ static int journal_get_superblock(journa J_ASSERT(bh != NULL); if (!buffer_uptodate(bh)) { + set_buffer_syncraid(bh); + ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { Index: linux-2.6.18-128.7.1/fs/jbd/recovery.c =================================================================== --- linux-2.6.18-128.7.1.orig/fs/jbd/recovery.c +++ linux-2.6.18-128.7.1/fs/jbd/recovery.c @@ -22,6 +22,7 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/crc32.h> +#include <linux/raid/md.h> #endif /* @@ -36,6 +37,7 @@ struct recovery_info int nr_replays; int nr_revokes; int 
nr_revoke_hits; + int nr_declared; }; enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; @@ -43,6 +45,7 @@ static int do_one_pass(journal_t *journa struct recovery_info *info, enum passtype pass); static int scan_revoke_records(journal_t *, struct buffer_head *, tid_t, struct recovery_info *); +static void journal_syncraid(journal_t *, unsigned long); #ifdef __KERNEL__ @@ -53,7 +56,6 @@ void journal_brelse_array(struct buffer_ brelse (b[n]); } - /* * When reading from the journal, we are going through the block device * layer directly and so there is no readahead being done for us. We @@ -67,7 +69,7 @@ void journal_brelse_array(struct buffer_ */ #define MAXBUF 8 -static int do_readahead(journal_t *journal, unsigned int start) +static int do_readahead(journal_t *journal, unsigned int start, int raid_sync) { int err; unsigned int max, nbufs, next; @@ -103,6 +105,15 @@ static int do_readahead(journal_t *journ if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; + + /* For declared mode: perform a raid synchronization + * for the journal blocks; this will resync all of the + * journal blocks read, which is more than strictly + * necessary. + */ + if (raid_sync) + set_buffer_syncraid(bh); + if (nbufs == MAXBUF) { ll_rw_block(READ, nbufs, bufs); journal_brelse_array(bufs, nbufs); @@ -130,7 +141,7 @@ failed: */ static int jread(struct buffer_head **bhp, journal_t *journal, - unsigned int offset) + unsigned int offset, int sync_raid) { int err; unsigned long blocknr; @@ -159,7 +170,7 @@ static int jread(struct buffer_head **bh /* If this is a brand new buffer, start readahead. Otherwise, we assume we are already reading it. 
*/ if (!buffer_req(bh)) - do_readahead(journal, offset); + do_readahead(journal, offset, sync_raid); wait_on_buffer(bh); } @@ -257,6 +268,14 @@ int journal_recover(journal_t *journal) jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", info.nr_replays, info.nr_revoke_hits, info.nr_revokes); + if (!err && JFS_HAS_INCOMPAT_FEATURE(journal, + JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)) + jbd_debug(0, "JBD: Resynced %d declared blocks\n", + info.nr_declared); + + journal_clear_features(journal, 0, 0, + JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS); + /* Restart the log at the next transaction ID, thus invalidating * any existing commit records in the log. */ journal->j_transaction_sequence = ++info.end_transaction; @@ -329,7 +348,7 @@ static int calc_chksums(journal_t *journ for (i = 0; i < num_blks; i++) { io_block = (*next_log_block)++; wrap(journal, *next_log_block); - err = jread(&obh, journal, io_block); + err = jread(&obh, journal, io_block, 0); if (err) { printk(KERN_ERR "JBD: IO error %d recovering block " "%lu in log\n", err, io_block); @@ -355,6 +374,8 @@ static int do_one_pass(journal_t *journa unsigned int sequence; int blocktype; __u32 crc32_sum = ~0; /* Transactional Checksums */ + int raid_sync_journal = 0; + int raid_sync_data = 0; /* Precompute the maximum metadata descriptors in a descriptor block */ int MAX_BLOCKS_PER_DESC; @@ -397,9 +418,33 @@ static int do_one_pass(journal_t *journa * check right now that we haven't gone past the end of * the log. */ - if (pass != PASS_SCAN) - if (tid_geq(next_commit_ID, info->end_transaction)) - break; + if (pass != PASS_SCAN) { + if (tid_geq(next_commit_ID, info->end_transaction)) { + /* For declared mode resync, move ahead past + * the last committed transaction to deal with + * raid sync for declare blocks and the head + * of the journal. 
+ */ + if (pass == PASS_REPLAY && + JFS_HAS_INCOMPAT_FEATURE(journal, + JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)) { + if (journal->j_fs_dev == journal->j_dev) + raid_sync_journal = 1; + if (!raid_sync_data) + jbd_debug(1, "Declared mode " + "was used; " + "performing raid " + "sync %s\n", + raid_sync_journal ? + "of journal and " + "data" : + "of data"); + raid_sync_data = 1; + } else { + break; + } + } + } jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", next_commit_ID, next_log_block, journal->j_last); @@ -409,7 +454,7 @@ static int do_one_pass(journal_t *journa * record. */ jbd_debug(3, "JBD: checking block %ld\n", next_log_block); - err = jread(&bh, journal, next_log_block); + err = jread(&bh, journal, next_log_block, raid_sync_journal); if (err) goto failed; @@ -426,6 +471,10 @@ static int do_one_pass(journal_t *journa if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { brelse(bh); + + /* raid sync the head of the journal */ + if (raid_sync_journal) + journal_syncraid(journal, next_log_block); break; } @@ -436,6 +485,10 @@ static int do_one_pass(journal_t *journa if (sequence != next_commit_ID) { brelse(bh); + + /* raid sync the head of the journal */ + if (raid_sync_journal) + journal_syncraid(journal, next_log_block); break; } @@ -485,7 +538,8 @@ static int do_one_pass(journal_t *journa io_block = next_log_block++; wrap(journal, next_log_block); - err = jread(&obh, journal, io_block); + err = jread(&obh, journal, io_block, + raid_sync_journal); if (err) { /* Recover what we can, but * report failure at the end. */ @@ -526,6 +580,16 @@ static int do_one_pass(journal_t *journa goto failed; } + /* We must raid sync the home location + * when replaying the write in case the + * crash occurred during the checkpoint + * write. 
+ */ + + if (raid_sync_journal && + !buffer_uptodate(nbh)) + set_buffer_syncraid(nbh); + lock_buffer(nbh); memcpy(nbh->b_data, obh->b_data, journal->j_blocksize); @@ -668,6 +732,57 @@ static int do_one_pass(journal_t *journa goto failed; continue; + case JFS_DECLARE_BLOCK: + if (!raid_sync_data) { + brelse(bh); + continue; + } + + /* This is a declare block for an uncommitted + * transaction, so raid sync all of the blocks it + * describes. + */ + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) + <= journal->j_blocksize) { + + unsigned long blocknr; + + tag = (journal_block_tag_t *) tagp; + flags = be32_to_cpu(tag->t_flags); + blocknr = be32_to_cpu(tag->t_blocknr); + + nbh = __getblk(journal->j_fs_dev, blocknr, + journal->j_blocksize); + + if (nbh == NULL) { + printk(KERN_ERR "JBD: Out of memory " + "during recovery.\n"); + err = -ENOMEM; + brelse(bh); + goto failed; + } + + set_buffer_syncraid(nbh); + ll_rw_block(READ, 1, &nbh); + wait_on_buffer(nbh); + + brelse(nbh); + + ++info->nr_declared; + + tagp += sizeof(journal_block_tag_t); + if (!(flags & JFS_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JFS_FLAG_LAST_TAG) + break; + } + + brelse(bh); + continue; + default: jbd_debug(3, "Unrecognised magic %d, end of scan.\n", blocktype); @@ -705,6 +820,31 @@ static int do_one_pass(journal_t *journa return err; } +/* RAID sync the next one quarter of the journal. This is called once at the + * end of recovery if declare blocks are present since that part of the journal + * was likely undergoing writes before the crash. 
+ */ +static void journal_syncraid(journal_t *journal, unsigned long next_log_block) +{ + struct buffer_head *bh; + int i, err; + + jbd_debug(2, "RAID resync of 1/4 of the journal starting at %lu\n", + next_log_block); + + for (i = 0; i < journal->j_maxlen / 4; i++) { + err = jread(&bh, journal, next_log_block, 1); + brelse(bh); + + if (err) { + printk(KERN_ERR "JBD: bad block at offset %lu\n", + next_log_block); + } + + next_log_block++; + wrap(journal, next_log_block); + } +} /* Scan a revoke record, marking all blocks mentioned as revoked. */ Index: linux-2.6.18-128.7.1/fs/jbd/transaction.c =================================================================== --- linux-2.6.18-128.7.1.orig/fs/jbd/transaction.c +++ linux-2.6.18-128.7.1/fs/jbd/transaction.c @@ -58,6 +58,10 @@ get_transaction(journal_t *journal, tran journal->j_commit_timer.expires = transaction->t_expires; add_timer(&journal->j_commit_timer); + /* Initialize the declare radix tree */ + INIT_RADIX_TREE(&transaction->t_declare_root, GFP_ATOMIC); + INIT_RADIX_TREE(&transaction->t_declare_done_root, GFP_ATOMIC); + J_ASSERT(journal->j_running_transaction == NULL); journal->j_running_transaction = transaction; transaction->t_max_wait = 0; @@ -956,6 +960,7 @@ int journal_dirty_data(handle_t *handle, journal_t *journal = handle->h_transaction->t_journal; int need_brelse = 0; struct journal_head *jh; + int jdatalist; if (is_handle_aborted(handle)) return 0; @@ -999,6 +1004,8 @@ int journal_dirty_data(handle_t *handle, goto no_journal; } + jdatalist = journal->j_flags & JFS_DECLARE ? 
BJ_Declare : BJ_SyncData; + if (jh->b_transaction) { JBUFFER_TRACE(jh, "has transaction"); if (jh->b_transaction != handle->h_transaction) { @@ -1041,6 +1048,8 @@ int journal_dirty_data(handle_t *handle, */ if (jh->b_jlist != BJ_None && jh->b_jlist != BJ_SyncData && + jh->b_jlist != BJ_Declare && + jh->b_jlist != BJ_DeclareDone && jh->b_jlist != BJ_Locked) { JBUFFER_TRACE(jh, "Not stealing"); goto no_journal; @@ -1088,18 +1097,19 @@ int journal_dirty_data(handle_t *handle, * committing transaction, so might still be left on that * transaction's metadata lists. */ - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { + if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Declare && + jh->b_jlist != BJ_DeclareDone && jh->b_jlist != BJ_Locked) { JBUFFER_TRACE(jh, "not on correct data list: unfile"); J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); __journal_temp_unlink_buffer(jh); jh->b_transaction = handle->h_transaction; JBUFFER_TRACE(jh, "file as data"); __journal_file_buffer(jh, handle->h_transaction, - BJ_SyncData); + jdatalist); } } else { JBUFFER_TRACE(jh, "not on a transaction"); - __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); + __journal_file_buffer(jh, handle->h_transaction, jdatalist); } no_journal: spin_unlock(&journal->j_list_lock); @@ -1578,6 +1588,7 @@ void __journal_temp_unlink_buffer(struct struct journal_head **list = NULL; transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); + struct radix_tree_root *root = NULL; J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); transaction = jh->b_transaction; @@ -1617,9 +1628,25 @@ void __journal_temp_unlink_buffer(struct case BJ_Locked: list = &transaction->t_locked_list; break; + case BJ_Declare: + root = &transaction->t_declare_root; + transaction->t_declare_count--; + break; + case BJ_DeclareDone: + root = &transaction->t_declare_done_root; + break; + } + + if (jh->b_jlist == BJ_Declare || jh->b_jlist == BJ_DeclareDone) { + if ((radix_tree_delete(root, bh->b_blocknr)) != jh) { + 
printk(KERN_ERR + "jbd: ERROR radix tree delete block %8llu\n", + (unsigned long long)bh->b_blocknr); + } + } else { + __blist_del_buffer(list, jh); } - __blist_del_buffer(list, jh); jh->b_jlist = BJ_None; if (test_clear_buffer_jbddirty(bh)) mark_buffer_dirty(bh); /* Expose it to the VM */ @@ -1660,7 +1687,8 @@ __journal_try_to_free_buffer(journal_t * spin_lock(&journal->j_list_lock); if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { + if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Declare || + jh->b_jlist == BJ_DeclareDone || jh->b_jlist == BJ_Locked) { /* A written-back ordered data buffer */ JBUFFER_TRACE(jh, "release data"); __journal_unfile_buffer(jh); @@ -2072,6 +2100,8 @@ void __journal_file_buffer(struct journa struct journal_head **list = NULL; int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); + struct radix_tree_root *root = NULL; + int declare_per_block; J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -2126,15 +2156,44 @@ void __journal_file_buffer(struct journa list = &transaction->t_reserved_list; break; case BJ_Locked: - list = &transaction->t_locked_list; + list = &transaction->t_locked_list; + break; + case BJ_Declare: + root = &transaction->t_declare_root; + transaction->t_declare_count++; break; + case BJ_DeclareDone: + root = &transaction->t_declare_done_root; + break; + } + + if (jlist == BJ_Declare || jlist == BJ_DeclareDone) { + if ((radix_tree_insert(root, bh->b_blocknr, jh)) != 0) { + printk(KERN_ERR + "jbd: ERROR radix tree insert block %8lu\n", + (long unsigned)bh->b_blocknr); + } + } else { + __blist_add_buffer(list, jh); } - __blist_add_buffer(list, jh); jh->b_jlist = jlist; if (was_dirty) set_buffer_jbddirty(bh); + + declare_per_block = (bh->b_size - (sizeof(journal_header_t) + 32)) / + sizeof(journal_block_tag_t); + + /* wake up the commit thread to perform early declarations */ + 
assert_spin_locked(&transaction->t_journal->j_list_lock); + if (transaction->t_journal->j_flags & JFS_DECLARE && + jlist == BJ_Declare && + transaction->t_declare_count >= declare_per_block) { + transaction->t_declare_request = transaction->t_declare_count / + declare_per_block * declare_per_block; + wake_up(&transaction->t_journal->j_wait_commit); + } } void journal_file_buffer(struct journal_head *jh, Index: linux-2.6.18-128.7.1/include/linux/jbd.h =================================================================== --- linux-2.6.18-128.7.1.orig/include/linux/jbd.h +++ linux-2.6.18-128.7.1/include/linux/jbd.h @@ -26,6 +26,7 @@ #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/journal-head.h> +#include <linux/radix-tree.h> #include <linux/stddef.h> #include <linux/bit_spinlock.h> #include <linux/mutex.h> @@ -137,6 +138,7 @@ typedef struct journal_s journal_t; /* J #define JFS_SUPERBLOCK_V1 3 #define JFS_SUPERBLOCK_V2 4 #define JFS_REVOKE_BLOCK 5 +#define JFS_DECLARE_BLOCK 6 /* * Standard header for all descriptor blocks: @@ -261,12 +263,14 @@ typedef struct journal_superblock_s #define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001 #define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 +#define JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS 0x00000008 /* Features known to this kernel version: */ #define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM #define JFS_KNOWN_ROCOMPAT_FEATURES 0 #define JFS_KNOWN_INCOMPAT_FEATURES (JFS_FEATURE_INCOMPAT_REVOKE | \ - JFS_FEATURE_INCOMPAT_ASYNC_COMMIT) + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT | \ + JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS) #ifdef __KERNEL__ @@ -559,6 +563,15 @@ struct transaction_s struct journal_head *t_sync_datalist; /* + * Radix tree of all data buffers that must be declared before being + * written, declare mode counters [j_list_lock] + */ + struct radix_tree_root t_declare_root; + struct radix_tree_root t_declare_done_root; + unsigned int t_declare_count; + unsigned int t_declare_request; + + /* * 
Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) * [j_list_lock] @@ -730,6 +743,7 @@ jbd_time_diff(unsigned int start, unsign * @j_wait_checkpoint: Wait queue to trigger checkpointing * @j_wait_commit: Wait queue to trigger commit * @j_wait_updates: Wait queue to wait for updates to complete + * @j_wait_declare: Wait queue to wait for declarations to complete * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints * @j_head: Journal head - identifies the first unused block in the journal * @j_tail: Journal tail - identifies the oldest still-used block in the @@ -768,6 +782,8 @@ jbd_time_diff(unsigned int start, unsign * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the * number that will fit in j_blocksize * @j_last_sync_writer: most recent pid which did a synchronous write + * @j_declare_jhs: array of journal_heads for journal_write_declare_blocks + * @j_declare_bhs: array of buffer_heads for journal_write_declare_blocks * @j_private: An opaque pointer to fs-private information. */ @@ -841,6 +857,9 @@ struct journal_s /* Wait queue to wait for updates to complete */ wait_queue_head_t j_wait_updates; + /* Wait queue to wait for declarations to complete */ + wait_queue_head_t j_wait_declare; + /* Semaphore for locking against concurrent checkpoints */ struct mutex j_checkpoint_mutex; @@ -970,6 +989,13 @@ struct journal_s struct transaction_stats_s j_stats; /* + * Arrays of jhs and bhs for journal_write_declare_blocks, to avoid + * having to allocate them each time. + */ + void *j_declare_jhs[64]; + struct buffer_head *j_declare_bhs[64]; + + /* * An opaque pointer to fs-private information. 
ext3 puts its * superblock pointer here */ @@ -985,6 +1011,7 @@ struct journal_s #define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JFS_LOADED 0x010 /* The journal superblock has been loaded */ #define JFS_BARRIER 0x020 /* Use IDE barriers */ +#define JFS_DECLARE 0x040 /* Declare data blocks before writing */ /* * Function declarations for the journaling transaction and buffer @@ -1008,6 +1035,9 @@ int journal_next_log_block(journal_t *, /* Commit management */ extern void journal_commit_transaction(journal_t *); +extern void journal_write_declare_blocks(journal_t *journal, + transaction_t *commit_transaction, + int committing); /* Checkpoint list management */ int __journal_clean_checkpoint_list(journal_t *journal); @@ -1100,6 +1130,7 @@ extern void journal_ack_err (journ extern int journal_clear_err (journal_t *); extern int journal_bmap(journal_t *, unsigned long, unsigned long *); extern int journal_force_commit(journal_t *); +extern int journal_write_declare(journal_t *); /* * journal_head management @@ -1244,7 +1275,9 @@ static inline int jbd_space_needed(journ #define BJ_LogCtl 6 /* Buffer contains log descriptors */ #define BJ_Reserved 7 /* Buffer is reserved for access by journal */ #define BJ_Locked 8 /* Locked for I/O during commit */ -#define BJ_Types 9 +#define BJ_Declare 9 /* Needs to be declared first */ +#define BJ_DeclareDone 10 /* Has been declared */ +#define BJ_Types 11 extern int jbd_blocks_per_page(struct inode *inode); -- -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html