This patch implements the actual commit path for fast commit. Based on inodes tracked and their respective logical ranges remembered, this patch adds code to create a fast commit block that stores extents added to the inode. We use new JBD2 interfaces added in previous patches in this series. The fast commit blocks that are created have extents that _should_ be present in the file. It doesn't yet support removing of extents, making operations such as truncate, delete fast commit incompatible. Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@xxxxxxxxx> --- fs/ext4/ext4.h | 26 ++++++ fs/ext4/extents.c | 8 +- fs/ext4/fsync.c | 2 +- fs/ext4/inode.c | 5 +- fs/ext4/super.c | 213 +++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 246 insertions(+), 8 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 92dc4432c7ed..5d92a2e4f0af 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2276,6 +2276,32 @@ struct mmpd_data { */ #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL +/* Magic of fast commit header */ +#define EXT4_FC_MAGIC 0xE2540090 + +struct ext4_fc_commit_hdr { + __le32 fc_magic; + /* JBD2 tid after which this fast commit should be applied */ + __le32 fc_tid; + /* Sub transaction ID */ + __le32 fc_subtid; + /* Length of this partial commit in terms of num blocks */ + __le32 fc_len; + /* Inode number */ + __le32 fc_ino; + /* ext4 inode on disk copy */ + struct ext4_inode inode; + /* Csum(hdr+contents) */ + __le32 fc_csum; +}; + +#define EXT4_FC_TAG_EXT 0x1 /* Extent */ + +struct ext4_fc_tl { + __le16 fc_tag; + __le16 fc_len; +}; + /* * Function prototypes */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index eb77e306a82b..66f7f4fb1612 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4899,10 +4899,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) goto out; - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { - ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, - EXT4_I(inode)->i_sync_tid); - } + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) + ret = jbd2_fc_complete_commit( + EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid, + journal_current_handle()->h_journal->j_subtid); out: inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5508baa11bb6..4f783f9723c5 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -151,7 +151,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(journal, commit_tid)) needs_barrier = true; - ret = jbd2_complete_transaction(journal, commit_tid); + ret = jbd2_fc_complete_commit(journal, commit_tid, journal->j_subtid); if (needs_barrier) { issue_flush: err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f79b185c013e..dd5d39a48363 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5476,8 +5476,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; - err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, - EXT4_I(inode)->i_sync_tid); + err = jbd2_fc_complete_commit( + EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid, + EXT4_SB(inode->i_sb)->s_journal->j_subtid); } else { struct ext4_iloc iloc; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f6e820384ee0..a291d41b91de 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -437,6 +437,214 @@ static bool system_going_down(void) || system_state == SYSTEM_RESTART; } +static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + struct buffer_head *orig_bh = bh->b_private; + + BUFFER_TRACE(bh, ""); + if (uptodate) { + ext4_debug("%s: Block %lld up-to-date", + __func__, bh->b_blocknr); + set_buffer_uptodate(bh); + } else { + ext4_debug("%s: Block %lld not up-to-date", + __func__, bh->b_blocknr); + clear_buffer_uptodate(bh); + } + if (orig_bh) { + clear_bit_unlock(BH_Shadow, &orig_bh->b_state); + /* Protect BH_Shadow bit in b_state */ + smp_mb__after_atomic(); + wake_up_bit(&orig_bh->b_state, BH_Shadow); + } + unlock_buffer(bh); +} + +static int ext4_fc_write_inode(journal_t *journal, struct inode *inode, + tid_t tid, tid_t subtid) +{ + loff_t old_blk_size, cur_lblk_off, new_blk_size; + struct super_block *sb = journal->j_private; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_fc_commit_hdr *fc_hdr; + struct ext4_map_blocks map; + struct ext4_iloc iloc; + struct ext4_fc_tl tl; + struct ext4_extent extent; + struct buffer_head *bh; + __u8 *cur, *end; + int ret; + + if (tid != ei->i_fc.fc_tid || subtid != ei->i_fc.fc_subtid) { + jbd_debug(3, + "File not modified. Modified %d:%d, expected %d:%d", + ei->i_fc.fc_tid, ei->i_fc.fc_subtid, tid, subtid); + return 0; + } + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -ECANCELED; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + ret = jbd2_map_fc_buf(journal, &bh); + if (ret) + return -ENOMEM; + + end = (__u8 *)bh->b_data + journal->j_blocksize; + + old_blk_size = (ei->i_fc.fc_lblk_start + sb->s_blocksize - 1) >> + inode->i_blkbits; + new_blk_size = ei->i_fc.fc_lblk_end >> inode->i_blkbits; + + jbd_debug(3, "Committing as tid = %d, subtid = %d on buffer %lld\n", + tid, subtid, bh->b_blocknr); + + ei->i_fc.fc_lblk_start = ei->i_fc.fc_lblk_end; + + fc_hdr = (struct ext4_fc_commit_hdr *) + ((__u8 *)bh->b_data + sizeof(journal_header_t)); + fc_hdr->fc_magic = cpu_to_le32(EXT4_FC_MAGIC); + fc_hdr->fc_tid = cpu_to_le32(tid); + fc_hdr->fc_subtid = cpu_to_le32(subtid); + fc_hdr->fc_len = cpu_to_le32(0x1); + fc_hdr->fc_ino = cpu_to_le32(inode->i_ino); + + memcpy(&fc_hdr->inode, ext4_raw_inode(&iloc), EXT4_INODE_SIZE(sb)); + cur = (__u8 *)(fc_hdr + 1); + + cur_lblk_off = old_blk_size; + while (cur_lblk_off <= new_blk_size) { + map.m_lblk = cur_lblk_off; + map.m_len = new_blk_size - cur_lblk_off + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (!ret) { + cur_lblk_off += map.m_len; + continue; + } + + if (map.m_flags & EXT4_MAP_UNWRITTEN) + return -ECANCELED; + extent.ee_block = cpu_to_le32(map.m_lblk); + cur_lblk_off += map.m_len; + if (cur + sizeof(struct ext4_extent) + + sizeof(struct ext4_fc_tl) >= end) + return -ENOSPC; + + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_EXT); + tl.fc_len = cpu_to_le16(sizeof(struct ext4_extent)); + extent.ee_len = cpu_to_le16(map.m_len); + ext4_ext_store_pblock(&extent, map.m_pblk); + if (map.m_flags & EXT4_MAP_UNWRITTEN) + ext4_ext_mark_unwritten(&extent); + else + ext4_ext_mark_initialized(&extent); + memcpy(cur, &tl, sizeof(struct ext4_fc_tl)); + cur += sizeof(struct ext4_fc_tl); + memcpy(cur, &extent, sizeof(struct ext4_extent)); + cur += sizeof(struct ext4_extent); + } + + jbd_debug(3, "Created FC block for inode %ld with [%d, %d]", + inode->i_ino, tid, subtid); + + return 1; +} + +static void ext4_journal_fc_cleanup_cb(journal_t *journal) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *iter; + struct inode *inode; + + mutex_lock(&sbi->s_fc_lock); + while (!list_empty(&sbi->s_fc_q)) { + iter = list_first_entry(&sbi->s_fc_q, + struct ext4_inode_info, i_fc_list); + list_del_init(&iter->i_fc_list); + inode = &iter->vfs_inode; + } + INIT_LIST_HEAD(&sbi->s_fc_q); + sbi->s_fc_q_cnt = 0; + mutex_unlock(&sbi->s_fc_lock); +} + +/* + * Fast-commit commit callback. There is contention between sbi->s_fc_lock and + * i_data_sem. Locking order is - i_data_sem then s_fc_lock + */ +static int ext4_journal_fc_commit_cb(journal_t *journal, tid_t tid, + tid_t subtid) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct list_head *pos, *tmp; + struct ext4_inode_info *iter; + struct jbd2_inode *jinode; + int num_bufs = 0, ret; + + sbi = sbi; + mutex_lock(&sbi->s_fc_lock); + if (!sbi->s_fc_eligible) { + sbi->s_fc_eligible = true; + mutex_unlock(&sbi->s_fc_lock); + return -ECANCELED; + } + + list_for_each_safe(pos, tmp, &sbi->s_fc_q) { + struct inode *inode; + + iter = list_entry(pos, struct ext4_inode_info, i_fc_list); + inode = &iter->vfs_inode; + + mutex_unlock(&sbi->s_fc_lock); + /* + * Release s_fc_lock here since fc_write_inode calls + * ext4_map_blocks which needs i_data_sem. + */ + ret = ext4_fc_write_inode(journal, inode, tid, subtid); + if (ret < 0) + return ret; + mutex_lock(&sbi->s_fc_lock); + + num_bufs += ret; + } + + /* Submit data buffers first */ + list_for_each(pos, &sbi->s_fc_q) { + iter = list_entry(pos, struct ext4_inode_info, i_fc_list); + jinode = iter->jinode; + ret = jbd2_submit_inode_data(journal, jinode); + if (ret) { + mutex_unlock(&sbi->s_fc_lock); + return ret; + } + } + + if (num_bufs == 0) { + mutex_unlock(&sbi->s_fc_lock); + return 0; + } + + /* + * Before returning, check if s_fc_eligible was modified since we + * started. + */ + if (!sbi->s_fc_eligible) { + mutex_unlock(&sbi->s_fc_lock); + return -ECANCELED; + } + + mutex_unlock(&sbi->s_fc_lock); + + jbd_debug(3, "%s: Journal blocks ready for fast commit\n", __func__); + + return jbd2_submit_fc_bufs(journal, ext4_end_buffer_io_sync); +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -4723,7 +4931,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_commit_interval = sbi->s_commit_interval; journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; - + if (ext4_should_fast_commit(sb)) { + journal->j_fc_commit_callback = ext4_journal_fc_commit_cb; + journal->j_fc_cleanup_callback = ext4_journal_fc_cleanup_cb; + } write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; -- 2.22.0.657.g960e92d24f-goog