Re: [PATCH] md/raid5: fix 'out of memory' during raid cache recovery

Song Liu <liu.song.a23@xxxxxxxxx> · Fri, 30 Mar 2018 13:26:09 -0700

On Tue, Mar 27, 2018 at 4:54 PM, Alexei Naberezhnov <anaberezhnov@xxxxxx> wrote:
> This fixes the case when md array assembly fails because of raid cache recovery
> unable to allocate a stripe, despite attempts to replay stripes and increase
> cache size. This happens because stripes released by r5c_recovery_replay_stripes
> and raid5_set_cache_size don't become available for allocation immediately.
> Released stripes first are placed on conf->released_stripes list and require
> md thread to merge them on conf->inactive_list before they can be allocated.
>
> Patch allows final allocation attempt during cache recovery to wait for
> new stripes to become availabe for allocation.
>
> CC: linux-raid@xxxxxxxxxxxxxxx
> CC: Shaohua Li <shli@xxxxxxxxxx>
> Signed-off-by: Alexei Naberezhnov <anaberezhnov@xxxxxx>
> ---
>  drivers/md/raid5-cache.c | 33 ++++++++++++++++++++++-----------
>  drivers/md/raid5.c       |  8 ++++++--
>  2 files changed, 28 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 3c65f52..bb1ebbf 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -1937,12 +1937,14 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf,
>  }
>
>  static struct stripe_head *
> -r5c_recovery_alloc_stripe(struct r5conf *conf,
> -                         sector_t stripe_sect)
> +r5c_recovery_alloc_stripe(
> +               struct r5conf *conf,
> +               sector_t stripe_sect,
> +               int noblock)
>  {
>         struct stripe_head *sh;
>
> -       sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
> +       sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0);
>         if (!sh)
>                 return NULL;  /* no more stripe available */
>
> @@ -2152,7 +2154,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
>                                                 stripe_sect);
>
>                 if (!sh) {
> -                       sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
> +                       sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1);
>                         /*
>                          * cannot get stripe from raid5_get_active_stripe
>                          * try replay some stripes
> @@ -2161,20 +2163,29 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
>                                 r5c_recovery_replay_stripes(
>                                         cached_stripe_list, ctx);
>                                 sh = r5c_recovery_alloc_stripe(
> -                                       conf, stripe_sect);
> +                                       conf, stripe_sect, 1);
>                         }
>                         if (!sh) {
> +                               int new_size = conf->min_nr_stripes * 2;
>                                 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
>                                         mdname(mddev),
> -                                       conf->min_nr_stripes * 2);
> -                               raid5_set_cache_size(mddev,
> -                                                    conf->min_nr_stripes * 2);
> -                               sh = r5c_recovery_alloc_stripe(conf,
> -                                                              stripe_sect);
> +                                       new_size);
> +                               ret = raid5_set_cache_size(mddev, new_size);
> +                               if (conf->min_nr_stripes <= new_size / 2) {
> +                                       pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n",
> +                                               mdname(mddev),
> +                                               ret,
> +                                               new_size,
> +                                               conf->min_nr_stripes,
> +                                               conf->max_nr_stripes);
> +                                       return -ENOMEM;
> +                               }
> +                               sh = r5c_recovery_alloc_stripe(
> +                                       conf, stripe_sect, 0);
>                         }
>                         if (!sh) {
>                                 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
> -                                      mdname(mddev));
> +                                       mdname(mddev));
>                                 return -ENOMEM;
>                         }
>                         list_add_tail(&sh->lru, cached_stripe_list);
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index b5d2601..76c6720 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -6347,6 +6347,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
>  int
>  raid5_set_cache_size(struct mddev *mddev, int size)
>  {
> +       int result = 0;
>         struct r5conf *conf = mddev->private;
>
>         if (size <= 16 || size > 32768)
> @@ -6363,11 +6364,14 @@ raid5_set_cache_size(struct mddev *mddev, int size)
>
>         mutex_lock(&conf->cache_size_mutex);
>         while (size > conf->max_nr_stripes)
> -               if (!grow_one_stripe(conf, GFP_KERNEL))
> +               if (!grow_one_stripe(conf, GFP_KERNEL)) {
> +                       conf->min_nr_stripes = conf->max_nr_stripes;
> +                       result = -ENOMEM;
>                         break;
> +               }
>         mutex_unlock(&conf->cache_size_mutex);
>
> -       return 0;
> +       return result;
>  }
>  EXPORT_SYMBOL(raid5_set_cache_size);
>
> --
> 2.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

This fix looks good to me, as a fix. However, we still need to wake up
md_thread during recovery. A better solution would be a new way to
allocate stripe that doesn't require md_thread. I think we can ship this
fix first, and introduce the new stripe allocation later.

What do you think Shaohua?

Thanks,
Song

Reviewed-by: Song Liu <songliubraving@xxxxxx>
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html