Re: Documentation for jbd2

"Peter Teoh" <htmldeveloper@xxxxxxxxx> · Sat, 29 Mar 2008 01:25:55 +0800

On Wed, Mar 26, 2008 at 3:31 AM, Manish Katiyar <mkatiyar@xxxxxxxxx> wrote:
> Other than the source code, are there any links/resources for the new
>  jbd2 design ?
>
>  --
>  Thanks & Regards,
>  ********************************************
>  Manish Katiyar ( http://mkatiyar.googlepages.com )

As jbd2 is still under development and works in conjunction with ext4
(which is currently ext4dev, if you look at the .config file), you will
have difficulty finding official documentation - though some may be
available in the Linux Symposium papers:

http://www.google.com/search?q=jbd2+linux+symposium&btnG=Google+Search

So here are some questions you can use to build your FAQ on jbd2 (or ext4
FAQ) - hopefully everyone can contribute MORE QUESTIONS and we can
compile them together to update the ext4 wiki:

What is ext4/jbd2?   How is ext4 different from ext3/ext2?   Similarly
how is jbd2 different from jbd?   Performance/delays/async vs sync
mechanism/size of logging/configurability of logging/periodicity of
logging/recovery of corrupted data and its mechanics - any differences
etc.

But pertaining exactly to your question, I did a diff between jbd/*.c and
the corresponding files in jbd2/*.c; here is a summary of my observations:

a.   checksumming on journal logs (commit.c): as shown below, jbd2 has
some checksum mechanism on journal, whereas jbd does not.

125,135c120,123
<       tmp = (struct commit_header *)bh->b_data;
<       tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
<       tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
<       tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
<
<       if (JBD2_HAS_COMPAT_FEATURE(journal,
<                                   JBD2_FEATURE_COMPAT_CHECKSUM)) {
<               tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
<               tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
<               tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
<       }
---
>       header = (journal_header_t *)(bh->b_data);
>       header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
>       header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
>       header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
137,139c125

b.   async vs sync mechanism of logging (commit.c):   because it operates
asynchronously, jbd2 explicitly waits for the I/O to complete only after
it has done other things (if it is busy), whereas in jbd the wait is
implicit - the I/O must necessarily complete before it can do anything
else.

<       *cbh = bh;
<       return ret;
< }
<
< /*
<  * This function along with journal_submit_commit_record
<  * allows to write the commit record asynchronously.
<  */
< static int journal_wait_on_commit_record(struct buffer_head *bh)
< {
<       int ret = 0;
<
<       clear_buffer_dirty(bh);
<       wait_on_buffer(bh);
<
<       if (unlikely(!buffer_uptodate(bh)))
<               ret = -EIO;
<       put_bh(bh);            /* One for getblk() */
<       jbd2_journal_put_journal_head(bh2jh(bh));
---
>       put_bh(bh);             /* One for getblk() */
>       journal_put_journal_head(descriptor);

And this block is new in jbd2:

< /*
<  * Wait for all submitted IO to complete.
<  */
< static int journal_wait_on_locked_list(journal_t *journal,
<                                      transaction_t *commit_transaction)
< {
<       int ret = 0;
<       struct journal_head *jh;
<
<       while (commit_transaction->t_locked_list) {
<               struct buffer_head *bh;
<
<               jh = commit_transaction->t_locked_list->b_tprev;
<               bh = jh2bh(jh);
<               get_bh(bh);
<               if (buffer_locked(bh)) {
<                       spin_unlock(&journal->j_list_lock);
<                       wait_on_buffer(bh);
<                       if (unlikely(!buffer_uptodate(bh)))
<                               ret = -EIO;
<                       spin_lock(&journal->j_list_lock);
<               }
<               if (!inverted_lock(journal, bh)) {
<                       put_bh(bh);
<                       spin_lock(&journal->j_list_lock);
<                       continue;
<               }
<               if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
<                       __jbd2_journal_unfile_buffer(jh);
<                       jbd_unlock_bh_state(bh);
<                       jbd2_journal_remove_journal_head(bh);
<                       put_bh(bh);
<               } else {
<                       jbd_unlock_bh_state(bh);
<               }
<               put_bh(bh);
<               cond_resched_lock(&journal->j_list_lock);
<       }

c.   This diff I cannot explain:

<       jbd2_journal_switch_revoke_table(journal);
<
<       stats.u.run.rs_flushing = jiffies;
<       stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
<                                              stats.u.run.rs_flushing);
---
>       journal_switch_revoke_table(journal);

And similar to the above - the following block is new in jbd2:

<       stats.u.run.rs_logging = jiffies;
<       stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
<                                                stats.u.run.rs_logging);
<       stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
<       stats.u.run.rs_blocks_logged = 0;
<

d.   This block is removed in jbd2:

>               jh = commit_transaction->t_locked_list->b_tprev;
>               bh = jh2bh(jh);
>               get_bh(bh);
>               if (buffer_locked(bh)) {
>                       spin_unlock(&journal->j_list_lock);
>                       wait_on_buffer(bh);
>                       if (unlikely(!buffer_uptodate(bh)))
>                               err = -EIO;
>                       spin_lock(&journal->j_list_lock);
>               }
>               if (!inverted_lock(journal, bh)) {
>                       put_bh(bh);
>                       spin_lock(&journal->j_list_lock);
>                       continue;
>               }
>               if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
>                       __journal_unfile_buffer(jh);
>                       jbd_unlock_bh_state(bh);
>                       journal_remove_journal_head(bh);
>                       put_bh(bh);
>               } else {
>                       jbd_unlock_bh_state(bh);
>               }
>               put_bh(bh);
>               cond_resched_lock(&journal->j_list_lock);
>       }

e.   checksum again:

<                               /*
<                                * Compute checksum.
<                                */
<                               if (JBD2_HAS_COMPAT_FEATURE(journal,
<                                       JBD2_FEATURE_COMPAT_CHECKSUM)) {
<                                       crc32_sum =
<                                           jbd2_checksum_data(crc32_sum, bh);
<                               }
<

f.   async commit for logging again:

<       /* Done it all: now write the commit record asynchronously. */
<
<       if (JBD2_HAS_INCOMPAT_FEATURE(journal,
<               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
<               err = journal_submit_commit_record(journal, commit_transaction,
<                                                &cbh, crc32_sum);
<               if (err)
<                       __jbd2_journal_abort_hard(journal);
<
<               spin_lock(&journal->j_list_lock);
<               err = journal_wait_on_locked_list(journal,
<                                               commit_transaction);
<               spin_unlock(&journal->j_list_lock);
<               if (err)
<                       __jbd2_journal_abort_hard(journal);
<       }
<

and this:

<       if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
<               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
<               err = journal_submit_commit_record(journal, commit_transaction,
<                                               &cbh, crc32_sum);
<               if (err)
<                       __jbd2_journal_abort_hard(journal);
<       }
<       if (!err && !is_journal_aborted(journal))
<               err = journal_wait_on_commit_record(cbh);
---
>       if (journal_write_commit_record(journal, commit_transaction))
>               err = -EIO;

g.   other than statistics calculation, what is this history thing:

<       commit_transaction->t_start = jiffies;
<       stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
<                                               commit_transaction->t_start);
<
<       /*
<        * File the transaction for history
<        */
<       stats.ts_type = JBD2_STATS_RUN;
<       stats.ts_tid = commit_transaction->t_tid;
<       stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
<       spin_lock(&journal->j_history_lock);
<       memcpy(journal->j_history + journal->j_history_cur, &stats,
<                       sizeof(stats));
<       if (++journal->j_history_cur == journal->j_history_max)
<               journal->j_history_cur = 0;
<
<       /*
<        * Calculate overall stats
<        */
<       journal->j_stats.ts_tid++;
<       journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
<       journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
<       journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
<       journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
<       journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
<       journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
<       journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
<       journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
<       spin_unlock(&journal->j_history_lock);
<

My question:   Since it is async-based, does it mean that a timeout is
needed?   Can it ever happen that the CPU is kept so busy that it
never comes back to complete the I/O operation for logging
(thus necessitating a timeout)?

h.   Why does it provide a /proc entry for querying internal
information?   (The listing below is highly truncated because it is
voluminous.)   These functions / components are new in jbd2:

< static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
< static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
< static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
< static int jbd2_seq_history_show(struct seq_file *seq, void *v)
< static void jbd2_seq_history_stop(struct seq_file *seq, void *v)

< static struct seq_operations jbd2_seq_history_ops = {
<       .start  = jbd2_seq_history_start,
<       .next   = jbd2_seq_history_next,
<       .stop   = jbd2_seq_history_stop,
<       .show   = jbd2_seq_history_show,
< };
<
< static int jbd2_seq_history_open(struct inode *inode, struct file *file)
< static int jbd2_seq_history_release(struct inode *inode, struct file *file)
< static struct file_operations jbd2_seq_history_fops = {
<       .owner          = THIS_MODULE,
<       .open           = jbd2_seq_history_open,
<       .read           = seq_read,
<       .llseek         = seq_lseek,
<       .release        = jbd2_seq_history_release,
< };
<
< static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
< static int jbd2_seq_info_show(struct seq_file *seq, void *v)
< static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
< static struct seq_operations jbd2_seq_info_ops = {
<       .start  = jbd2_seq_info_start,
<       .next   = jbd2_seq_info_next,
<       .stop   = jbd2_seq_info_stop,
<       .show   = jbd2_seq_info_show,
< };
<
< static int jbd2_seq_info_open(struct inode *inode, struct file *file)
< static int jbd2_seq_info_release(struct inode *inode, struct file *file)
< static struct file_operations jbd2_seq_info_fops = {
<       .owner          = THIS_MODULE,
<       .open           = jbd2_seq_info_open,
<       .read           = seq_read,
<       .llseek         = seq_lseek,
<       .release        = jbd2_seq_info_release,
< };
<
< static struct proc_dir_entry *proc_jbd2_stats;
<
< static void jbd2_stats_proc_init(journal_t *journal)
< static void jbd2_stats_proc_exit(journal_t *journal)
< static void journal_init_stats(journal_t *journal)

/proc/fs/ext4/sdb2>d
group_prealloc  max_to_scan  mb_groups  mb_history  min_to_scan  order2_req
stats           stream_req   ./

Every time a new file is copied, a new entry is added here
(and it is added asynchronously - not immediately, but after a short
while):

/proc/fs/jbd2/sdb2>cat history
R/C  tid   wait  run   lock  flush log   hndls  block inlog ctime write drop  close
R    2     0     107692 0     0     0     1      1     2
R    3     0     5471  0     0     0     1      1     2
R    4     0     11    0     0     16    1      6     7

I am tired..... There are several more differences, but only a few -
the majority of the others are just name changes.

-- 
Regards,
Peter Teoh

--
To unsubscribe from this list: send an email with
"unsubscribe kernelnewbies" to ecartis@xxxxxxxxxxxx
Please read the FAQ at http://kernelnewbies.org/FAQ