Re: [PATCH v8] md/r5cache: handle SYNC and FUA

On Fri, Nov 18, 2016 at 04:46:50PM -0800, Song Liu wrote:
> With the raid5 cache (write-back mode), data is committed from the
> journal device to the raid disks lazily. When there is a flush
> request, we therefore need to flush the journal device's cache. This
> was not needed with the plain raid5 journal, because there we flush
> the journal before committing data to the raid disks.
> 
> FUA is handled similarly, except that for FUA we also need to flush
> the journal: otherwise, corruption in earlier metadata would stop
> recovery from reaching the FUA data.

Looks good, applied!
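
For readers following the thread, the four cases the new
r5l_handle_flush_request() distinguishes can be summarized in a small
userspace sketch (plain C, not kernel code; the enum and function names
below are invented for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    enum journal_mode { WRITE_THROUGH, WRITE_BACK };

    enum flush_action {
    	COMPLETE_BIO,      /* flush already satisfied, end the bio now */
    	STRIP_PREFLUSH,    /* clear REQ_PREFLUSH, handle as a plain write */
    	ATTACH_TO_IO_UNIT, /* empty flush rides on the current io_unit */
    	FLUSH_JOURNAL,     /* keep PREFLUSH; stripe gets STRIPE_R5C_PREFLUSH */
    };

    static enum flush_action handle_flush(enum journal_mode mode, bool empty_bio)
    {
    	if (mode == WRITE_THROUGH)
    		/* the journal is flushed before data reaches the raid
    		 * disks, so an explicit flush adds nothing */
    		return empty_bio ? COMPLETE_BIO : STRIP_PREFLUSH;
    	/* write back: the journal device cache must really be flushed */
    	return empty_bio ? ATTACH_TO_IO_UNIT : FLUSH_JOURNAL;
    }

    int main(void)
    {
    	printf("write-through, empty flush -> %d\n",
    	       handle_flush(WRITE_THROUGH, true));
    	printf("write-back,    empty flush -> %d\n",
    	       handle_flush(WRITE_BACK, true));
    	return 0;
    }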

> 
> Signed-off-by: Song Liu <songliubraving@xxxxxx>
> ---
>  drivers/md/raid5-cache.c | 163 +++++++++++++++++++++++++++++++++++++++++------
>  drivers/md/raid5.c       |  12 ++++
>  drivers/md/raid5.h       |   1 +
>  3 files changed, 158 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 6b99570..a904268 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -19,6 +19,7 @@
>  #include <linux/raid/md_p.h>
>  #include <linux/crc32c.h>
>  #include <linux/random.h>
> +#include <trace/events/block.h>
>  #include "md.h"
>  #include "raid5.h"
>  #include "bitmap.h"
> @@ -159,6 +160,9 @@ struct r5l_log {
>  
>  	spinlock_t stripe_in_journal_lock;
>  	atomic_t stripe_in_journal_count;
> +
> +	/* to submit async io_units, to fulfill ordering of flush */
> +	struct work_struct deferred_io_work;
>  };
>  
>  /*
> @@ -185,6 +189,18 @@ struct r5l_io_unit {
>  
>  	int state;
>  	bool need_split_bio;
> +	struct bio *split_bio;
> +
> +	unsigned int has_flush:1;      /* include flush request */
> +	unsigned int has_fua:1;        /* include fua request */
> +	unsigned int has_null_flush:1; /* include empty flush request */
> +	/*
> +	 * io isn't sent yet; a flush/fua request can only be submitted once
> +	 * it is the first IO in the running_ios list
> +	 */
> +	unsigned int io_deferred:1;
> +
> +	struct bio_list flush_barriers;   /* size == 0 flush bios */
>  };
>  
>  /* r5l_io_unit state */
> @@ -494,9 +510,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
>  	}
>  }
>  
> +static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
>  static void r5l_log_endio(struct bio *bio)
>  {
>  	struct r5l_io_unit *io = bio->bi_private;
> +	struct r5l_io_unit *io_deferred;
>  	struct r5l_log *log = io->log;
>  	unsigned long flags;
>  
> @@ -512,18 +530,89 @@ static void r5l_log_endio(struct bio *bio)
>  		r5l_move_to_end_ios(log);
>  	else
>  		r5l_log_run_stripes(log);
> +	if (!list_empty(&log->running_ios)) {
> +		/*
> +		 * FLUSH/FUA io_unit is deferred because of ordering, now we
> +		 * can dispatch it
> +		 */
> +		io_deferred = list_first_entry(&log->running_ios,
> +					       struct r5l_io_unit, log_sibling);
> +		if (io_deferred->io_deferred)
> +			schedule_work(&log->deferred_io_work);
> +	}
> +
>  	spin_unlock_irqrestore(&log->io_list_lock, flags);
>  
>  	if (log->need_cache_flush)
>  		md_wakeup_thread(log->rdev->mddev->thread);
> +
> +	if (io->has_null_flush) {
> +		struct bio *bi;
> +
> +		WARN_ON(bio_list_empty(&io->flush_barriers));
> +		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
> +			bio_endio(bi);
> +			atomic_dec(&io->pending_stripe);
> +		}
> +		if (atomic_read(&io->pending_stripe) == 0)
> +			__r5l_stripe_write_finished(io);
> +	}
> +}
> +
> +static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&log->io_list_lock, flags);
> +	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
> +	spin_unlock_irqrestore(&log->io_list_lock, flags);
> +
> +	if (io->has_flush)
> +		bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH);
> +	if (io->has_fua)
> +		bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA);
> +	submit_bio(io->current_bio);
> +
> +	if (!io->split_bio)
> +		return;
> +
> +	if (io->has_flush)
> +		bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH);
> +	if (io->has_fua)
> +		bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA);
> +	submit_bio(io->split_bio);
> +}
> +
> +/* deferred io_unit will be dispatched here */
> +static void r5l_submit_io_async(struct work_struct *work)
> +{
> +	struct r5l_log *log = container_of(work, struct r5l_log,
> +					   deferred_io_work);
> +	struct r5l_io_unit *io = NULL;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&log->io_list_lock, flags);
> +	if (!list_empty(&log->running_ios)) {
> +		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
> +				      log_sibling);
> +		if (!io->io_deferred)
> +			io = NULL;
> +		else
> +			io->io_deferred = 0;
> +	}
> +	spin_unlock_irqrestore(&log->io_list_lock, flags);
> +	if (io)
> +		r5l_do_submit_io(log, io);
>  }
>  
>  static void r5l_submit_current_io(struct r5l_log *log)
>  {
>  	struct r5l_io_unit *io = log->current_io;
> +	struct bio *bio;
>  	struct r5l_meta_block *block;
>  	unsigned long flags;
>  	u32 crc;
> +	bool do_submit = true;
>  
>  	if (!io)
>  		return;
> @@ -532,13 +621,20 @@ static void r5l_submit_current_io(struct r5l_log *log)
>  	block->meta_size = cpu_to_le32(io->meta_offset);
>  	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
>  	block->checksum = cpu_to_le32(crc);
> +	bio = io->current_bio;
>  
>  	log->current_io = NULL;
>  	spin_lock_irqsave(&log->io_list_lock, flags);
> -	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
> +	if (io->has_flush || io->has_fua) {
> +		if (io != list_first_entry(&log->running_ios,
> +					   struct r5l_io_unit, log_sibling)) {
> +			io->io_deferred = 1;
> +			do_submit = false;
> +		}
> +	}
>  	spin_unlock_irqrestore(&log->io_list_lock, flags);
> -
> -	submit_bio(io->current_bio);
> +	if (do_submit)
> +		r5l_do_submit_io(log, io);
>  }
>  
>  static struct bio *r5l_bio_alloc(struct r5l_log *log)
> @@ -583,6 +679,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
>  	io->log = log;
>  	INIT_LIST_HEAD(&io->log_sibling);
>  	INIT_LIST_HEAD(&io->stripe_list);
> +	bio_list_init(&io->flush_barriers);
>  	io->state = IO_UNIT_RUNNING;
>  
>  	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
> @@ -653,12 +750,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
>  	struct r5l_io_unit *io = log->current_io;
>  
>  	if (io->need_split_bio) {
> -		struct bio *prev = io->current_bio;
> -
> +		BUG_ON(io->split_bio);
> +		io->split_bio = io->current_bio;
>  		io->current_bio = r5l_bio_alloc(log);
> -		bio_chain(io->current_bio, prev);
> -
> -		submit_bio(prev);
> +		bio_chain(io->current_bio, io->split_bio);
> +		io->need_split_bio = false;
>  	}
>  
>  	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
> @@ -687,12 +783,24 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
>  
>  	io = log->current_io;
>  
> +	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
> +		io->has_flush = 1;
> +
>  	for (i = 0; i < sh->disks; i++) {
>  		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
>  		    test_bit(R5_InJournal, &sh->dev[i].flags))
>  			continue;
>  		if (i == sh->pd_idx || i == sh->qd_idx)
>  			continue;
> +		if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
> +		    log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
> +			io->has_fua = 1;
> +			/*
> +			 * we need to flush journal to make sure recovery can
> +			 * reach the data with fua flag
> +			 */
> +			io->has_flush = 1;
> +		}
>  		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
>  					raid5_compute_blocknr(sh, i, 0),
>  					sh->dev[i].log_checksum, 0, false);
> @@ -856,17 +964,34 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
>  {
>  	if (!log)
>  		return -ENODEV;
> -	/*
> -	 * we flush log disk cache first, then write stripe data to raid disks.
> -	 * So if bio is finished, the log disk cache is flushed already. The
> -	 * recovery guarantees we can recovery the bio from log disk, so we
> -	 * don't need to flush again
> -	 */
> -	if (bio->bi_iter.bi_size == 0) {
> -		bio_endio(bio);
> -		return 0;
> +
> +	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
> +		/*
> +		 * in write through (journal only)
> +		 * we flush log disk cache first, then write stripe data to
> +		 * raid disks. So if bio is finished, the log disk cache is
> +		 * flushed already. The recovery guarantees we can recover
> +		 * the bio from the log disk, so we don't need to flush again
> +		 */
> +		if (bio->bi_iter.bi_size == 0) {
> +			bio_endio(bio);
> +			return 0;
> +		}
> +		bio->bi_opf &= ~REQ_PREFLUSH;
> +	} else {
> +		/* write back (with cache) */
> +		if (bio->bi_iter.bi_size == 0) {
> +			mutex_lock(&log->io_mutex);
> +			r5l_get_meta(log, 0);
> +			bio_list_add(&log->current_io->flush_barriers, bio);
> +			log->current_io->has_flush = 1;
> +			log->current_io->has_null_flush = 1;
> +			atomic_inc(&log->current_io->pending_stripe);
> +			r5l_submit_current_io(log);
> +			mutex_unlock(&log->io_mutex);
> +			return 0;
> +		}
>  	}
> -	bio->bi_opf &= ~REQ_PREFLUSH;
>  	return -EAGAIN;
>  }
>  
> @@ -2470,6 +2595,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>  	INIT_LIST_HEAD(&log->no_space_stripes);
>  	spin_lock_init(&log->no_space_stripes_lock);
>  
> +	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
> +
>  	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
>  	INIT_LIST_HEAD(&log->stripe_in_journal_list);
>  	spin_lock_init(&log->stripe_in_journal_lock);
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index aa4968c..a850663 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -5248,6 +5248,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
>  	int remaining;
>  	DEFINE_WAIT(w);
>  	bool do_prepare;
> +	bool do_flush = false;
>  
>  	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
>  		int ret = r5l_handle_flush_request(conf->log, bi);
> @@ -5259,6 +5260,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
>  			return;
>  		}
>  		/* ret == -EAGAIN, fallback */
> +		/*
> +		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
> +		 * we need to flush journal device
> +		 */
> +		do_flush = (bi->bi_opf & REQ_PREFLUSH) != 0;
>  	}
>  
>  	md_write_start(mddev, bi);
> @@ -5398,6 +5404,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
>  				do_prepare = true;
>  				goto retry;
>  			}
> +			if (do_flush) {
> +				set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
> +				/* we only need flush for one stripe */
> +				do_flush = false;
> +			}
> +
>  			set_bit(STRIPE_HANDLE, &sh->state);
>  			clear_bit(STRIPE_DELAYED, &sh->state);
>  			if ((!sh->batch_head || sh == sh->batch_head) &&
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index a698113..d13fe45 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -376,6 +376,7 @@ enum {
>  	STRIPE_R5C_FULL_STRIPE,	/* in r5c cache (to-be/being handled or
>  				 * in conf->r5c_full_stripe_list)
>  				 */
> +	STRIPE_R5C_PREFLUSH,	/* need to flush journal device */
>  };
>  
>  #define STRIPE_EXPAND_SYNC_FLAGS \
> -- 
> 2.9.3
> 
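
One more note on the ordering logic in the log path: an io_unit that
carries PREFLUSH/FUA may only be issued once it is the first entry on
running_ios, because a preflush only covers writes that have already
completed on the journal device. A minimal userspace sketch of that
rule (simplified singly-linked list and invented names; the real code
serializes with io_list_lock and dispatches from a work item):

    #include <stdbool.h>
    #include <stdio.h>

    struct io_unit {
    	int seq;
    	bool has_flush_or_fua;
    	bool io_deferred;	/* queued, waiting to become the list head */
    	struct io_unit *next;	/* running_ios, oldest first */
    };

    static struct io_unit *running_ios;

    static void do_submit(struct io_unit *io)
    {
    	printf("submit io %d%s\n", io->seq,
    	       io->has_flush_or_fua ? " (FLUSH/FUA)" : "");
    }

    /* mirrors the check added to r5l_submit_current_io() */
    static void submit_io(struct io_unit *io)
    {
    	if (io->has_flush_or_fua && io != running_ios) {
    		io->io_deferred = true;	/* dispatched later from endio */
    		return;
    	}
    	do_submit(io);
    }

    /* mirrors the dispatch added to r5l_log_endio() */
    static void log_endio(struct io_unit *done)
    {
    	running_ios = done->next;	/* oldest io_unit completed */
    	if (running_ios && running_ios->io_deferred) {
    		running_ios->io_deferred = false;
    		do_submit(running_ios);
    	}
    }

    int main(void)
    {
    	struct io_unit a = { .seq = 1 };
    	struct io_unit b = { .seq = 2, .has_flush_or_fua = true };

    	a.next = &b;
    	running_ios = &a;

    	submit_io(&a);	/* issued immediately */
    	submit_io(&b);	/* deferred: not yet head of running_ios */
    	log_endio(&a);	/* io 1 done -> io 2 is now safe to issue */
    	return 0;
    }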