Re: [PATCH v6 05/11] md/r5cache: write-out mode and reclaim support

NeilBrown <neilb@xxxxxxxx> · Thu, 17 Nov 2016 11:28:57 +1100

On Fri, Nov 11 2016, Song Liu wrote:

> +/*
> + * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
> + *
> + * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
> + * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
> + * device is less than 2x of reclaim_required_space.
> + */
> +static inline void r5c_update_log_state(struct r5l_log *log)
> +{
> +	struct r5conf *conf = log->rdev->mddev->private;
> +	sector_t free_space;
> +	sector_t reclaim_space;
> +
> +	if (!r5c_is_writeback(log))
> +		return;
> +
> +	free_space = r5l_ring_distance(log, log->log_start,
> +				       log->last_checkpoint);
> +	reclaim_space = r5c_log_required_to_flush_cache(conf);
> +	if (free_space < 2 * reclaim_space)
> +		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
> +	else
> +		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
> +	if (free_space < 3 * reclaim_space)
> +		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
> +	else
> +		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
> +}

This code, which you rewrote as I requested (thanks), behaves slightly
differently from the previous version.
Maybe that is intentional, but I thought I would mention it anyway.
The previous version would set TIGHT when free_space dropped below
3*reclaim_space, and would only clear it when free_space went above
4*reclaim_space.  This provided some hysteresis.
Now it is cleared as soon as free_space reaches 3*reclaim_space.

Maybe this is what you want, but as the hysteresis seemed like it might
be sensible, it is worth asking.

>  
> +/*
> + * calculate new last_checkpoint
> + * for write through mode, returns log->next_checkpoint
> + * for write back, returns log_start of first sh in stripe_in_cache_list
> + */
> +static sector_t r5c_calculate_new_cp(struct r5conf *conf)
> +{
> +	struct stripe_head *sh;
> +	struct r5l_log *log = conf->log;
> +	sector_t end = MaxSector;

The value assigned here is never used.

> +
> +	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
> +		return log->next_checkpoint;
> +
> +	spin_lock(&log->stripe_in_cache_lock);
> +	if (list_empty(&conf->log->stripe_in_cache_list)) {
> +		/* all stripes flushed */
> +		spin_unlock(&log->stripe_in_cache_lock);
> +		return log->next_checkpoint;
> +	}
> +	sh = list_first_entry(&conf->log->stripe_in_cache_list,
> +			      struct stripe_head, r5c);
> +	end = sh->log_start;
> +	spin_unlock(&log->stripe_in_cache_lock);
> +	return end;

Given that we only assign "log_start" to the variable "end", it is
strange that it is called "end".
"new_cp" would make sense, or "log_start", but why "end" ??

> +}
> +
>  static sector_t r5l_reclaimable_space(struct r5l_log *log)
>  {
> +	struct r5conf *conf = log->rdev->mddev->private;
> +
>  	return r5l_ring_distance(log, log->last_checkpoint,
> -				 log->next_checkpoint);
> +				 r5c_calculate_new_cp(conf));
>  }
>  
>  static void r5l_run_no_mem_stripe(struct r5l_log *log)
> @@ -776,6 +966,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
>  static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
>  {
>  	struct r5l_log *log = io->log;
> +	struct r5conf *conf = log->rdev->mddev->private;
>  	unsigned long flags;
>  
>  	spin_lock_irqsave(&log->io_list_lock, flags);
> @@ -786,7 +977,8 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
>  		return;
>  	}
>  
> -	if (r5l_reclaimable_space(log) > log->max_free_space)
> +	if (r5l_reclaimable_space(log) > log->max_free_space ||
> +	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
>  		r5l_wake_reclaim(log, 0);
>  
>  	spin_unlock_irqrestore(&log->io_list_lock, flags);
> @@ -907,14 +1099,140 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
>  	}
>  }
>  
> +/*
> + * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
> + * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
> + *
> + * must hold conf->device_lock
> + */
> +static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
> +{
> +	BUG_ON(list_empty(&sh->lru));
> +	BUG_ON(test_bit(STRIPE_R5C_WRITE_OUT, &sh->state));
> +	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
> +	assert_spin_locked(&conf->device_lock);
> +
> +	list_del_init(&sh->lru);
> +	atomic_inc(&sh->count);
> +
> +	set_bit(STRIPE_HANDLE, &sh->state);
> +	atomic_inc(&conf->active_stripes);
> +	r5c_make_stripe_write_out(sh);
> +
> +	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
> +		atomic_inc(&conf->preread_active_stripes);
> +	raid5_release_stripe(sh);

This looks wrong.  raid5_release_stripe() can try to take
conf->device_lock but this function is called with ->device_lock
held. This would cause a deadlock.

It presumably doesn't deadlock because you just incremented sh->count,
so raid5_release_stripe() will probably just decrement sh->count and
that count will remain > 0.
So why are you incrementing ->count for a few instructions and then
releasing the stripe?  Either that isn't necessary, or it could
deadlock.

I guess that if we are certain that STRIPE_ON_RELEASE_LIST is clear,
then it won't deadlock as it will do a lock-less add to
conf->release_stripes.
But if that is the case, it needs to be documented, and probably there
needs to be a WARN_ON(test_bit(STRIPE_ON_RELEASE_LIST.....));

Thanks,
NeilBrown
Attachment:
signature.asc

Description: PGP signature