On Sat, Oct 15, 2016 at 8:05 AM, Vitaly Wool <vitalywool@xxxxxxxxx> wrote:
> This patch implements shrinker for z3fold. This shrinker
> implementation does not free up any pages directly but it allows
> for a denser placement of compressed objects which results in
> less actual pages consumed and higher compression ratio therefore.
>
> This update removes z3fold page compaction from the freeing path
> since we can rely on shrinker to do the job. Also, a new flag
> UNDER_COMPACTION is introduced to protect against two threads
> trying to compact the same page.

I'm completely unconvinced that this should be a shrinker. The
alloc/free paths are much, much better suited to compacting a page
than a shrinker that must scan through all the unbuddied pages. Why
not just improve compaction for the alloc/free paths?

> This patch has been checked with the latest Linus's tree.
>
> Signed-off-by: Vitaly Wool <vitalywool@xxxxxxxxx>
> ---
>  mm/z3fold.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 119 insertions(+), 25 deletions(-)
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 10513b5..8f84d3c 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -27,6 +27,7 @@
>  #include <linux/mm.h>
>  #include <linux/module.h>
>  #include <linux/preempt.h>
> +#include <linux/shrinker.h>
>  #include <linux/slab.h>
>  #include <linux/spinlock.h>
>  #include <linux/zpool.h>
> @@ -72,6 +73,7 @@ struct z3fold_ops {
>   * @unbuddied_nr:	number of unbuddied z3fold pages in the pool.
>   * @ops:	pointer to a structure of user defined operations specified at
>   *	pool creation time.
> + * @shrinker:	shrinker structure to optimize page layout in background
>   *
>   * This structure is allocated at pool creation time and maintains metadata
>   * pertaining to a particular z3fold pool.
> @@ -86,6 +88,7 @@ struct z3fold_pool {
> 	const struct z3fold_ops *ops;
> 	struct zpool *zpool;
> 	const struct zpool_ops *zpool_ops;
> +	struct shrinker shrinker;
>  };
>
>  enum buddy {
> @@ -121,6 +124,7 @@ enum z3fold_page_flags {
> 	UNDER_RECLAIM = 0,
> 	PAGE_HEADLESS,
> 	MIDDLE_CHUNK_MAPPED,
> +	UNDER_COMPACTION,
>  };
>
>  /*****************
> @@ -136,6 +140,9 @@ static int size_to_chunks(size_t size)
>  #define for_each_unbuddied_list(_iter, _begin) \
> 	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
>
> +#define for_each_unbuddied_list_down(_iter, _end) \
> +	for ((_iter) = (_end); (_iter) > 0; (_iter)--)
> +

Bikeshed: the conventional suffix is _reverse, not _down, i.e.
for_each_unbuddied_list_reverse().

>  /* Initializes the z3fold header of a newly allocated z3fold page */
>  static struct z3fold_header *init_z3fold_page(struct page *page)
>  {
> @@ -145,6 +152,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
> 	clear_bit(UNDER_RECLAIM, &page->private);
> 	clear_bit(PAGE_HEADLESS, &page->private);
> 	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
> +	clear_bit(UNDER_COMPACTION, &page->private);
>
> 	zhdr->first_chunks = 0;
> 	zhdr->middle_chunks = 0;
> @@ -211,6 +219,103 @@ static int num_free_chunks(struct z3fold_header *zhdr)
> 	return nfree;
>  }
>
> +/* Has to be called with lock held */
> +static int z3fold_compact_page(struct z3fold_header *zhdr, bool sync)
> +{
> +	struct page *page = virt_to_page(zhdr);
> +	void *beg = zhdr;
> +
> +
> +	if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
> +	    !test_bit(UNDER_RECLAIM, &page->private) &&
> +	    !test_bit(UNDER_COMPACTION, &page->private)) {
> +		set_bit(UNDER_COMPACTION, &page->private);

I assume the reclaim check and the new compaction bit check are due to
the removal of the spinlock, which was incorrect. Changing to a
per-page spinlock may be the best way to handle this, but flags are
absolutely not appropriate - they don't provide the needed locking.
Even if the compaction bit was the only locking needed (which it
isn't), it still isn't correct here - while extremely unlikely, it's
still possible for multiple threads to race between checking the
compaction bit and setting it. That's what test_and_set_bit() is for.
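i.e. something like this at the top of the function (a sketch only,
against this patch's flags; it closes the check-then-set race but does
not address the larger locking problem above):

	/*
	 * test_and_set_bit() atomically observes and sets the bit, so
	 * only one of the racing threads can enter the compaction path.
	 */
	if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private) ||
	    test_bit(UNDER_RECLAIM, &page->private) ||
	    test_and_set_bit(UNDER_COMPACTION, &page->private))
		return 0;	/* page busy, or someone else won the race */

Note the short-circuit evaluation also means we never set the
compaction bit on a page that's mapped or under reclaim.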
> +		if (zhdr->middle_chunks != 0 &&

You only need to check whether the middle bud is in use once; if it
isn't, there's no compaction to do.

> +		    zhdr->first_chunks == 0 &&
> +		    zhdr->last_chunks == 0) {
> +			memmove(beg + ZHDR_SIZE_ALIGNED,
> +				beg + (zhdr->start_middle << CHUNK_SHIFT),
> +				zhdr->middle_chunks << CHUNK_SHIFT);
> +			zhdr->first_chunks = zhdr->middle_chunks;
> +			zhdr->middle_chunks = 0;
> +			zhdr->start_middle = 0;
> +			zhdr->first_num++;
> +			clear_bit(UNDER_COMPACTION, &page->private);
> +			return 1;
> +		}
> +		if (sync)
> +			goto out;

I don't get it - why is that compaction synchronous while the ones
below aren't?

> +
> +		/* moving data is expensive, so let's only do that if
> +		 * there's substantial gain (2+ chunks)

"At least 2 chunks" feels arbitrary... it should be a #define instead
of a magic number.
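E.g. (BIG_CHUNK_GAP is just a name made up for illustration):

#define BIG_CHUNK_GAP	2	/* min gain, in chunks, to justify a memmove */

so the condition below becomes

	zhdr->start_middle > zhdr->first_chunks + BIG_CHUNK_GAP

and the "- 2" in the second branch can use it too.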
> +		 */
> +		if (zhdr->middle_chunks != 0 && zhdr->first_chunks != 0 &&
> +		    zhdr->last_chunks == 0 &&
> +		    zhdr->start_middle > zhdr->first_chunks + 2) {
> +			unsigned short new_start = zhdr->first_chunks + 1;
> +			memmove(beg + (new_start << CHUNK_SHIFT),
> +				beg + (zhdr->start_middle << CHUNK_SHIFT),
> +				zhdr->middle_chunks << CHUNK_SHIFT);
> +			zhdr->start_middle = new_start;
> +			clear_bit(UNDER_COMPACTION, &page->private);
> +			return 1;
> +		}
> +		if (zhdr->middle_chunks != 0 && zhdr->last_chunks != 0 &&
> +		    zhdr->first_chunks == 0 &&
> +		    zhdr->middle_chunks + zhdr->last_chunks <=
> +		    NCHUNKS - zhdr->start_middle - 2) {
> +			unsigned short new_start = NCHUNKS - zhdr->last_chunks -
> +				zhdr->middle_chunks;
> +			memmove(beg + (new_start << CHUNK_SHIFT),
> +				beg + (zhdr->start_middle << CHUNK_SHIFT),
> +				zhdr->middle_chunks << CHUNK_SHIFT);

These if clauses, and the memmoves, aren't very readable; could they
be made any better with a separate function?
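For instance (purely illustrative; name and signature made up):

static inline void mchunk_memmove(struct z3fold_header *zhdr,
				  unsigned short dst_chunk)
{
	void *beg = zhdr;

	/* slide the middle buddy's data so it starts at dst_chunk */
	memmove(beg + (dst_chunk << CHUNK_SHIFT),
		beg + (zhdr->start_middle << CHUNK_SHIFT),
		zhdr->middle_chunks << CHUNK_SHIFT);
}

Then each branch just computes its destination chunk, calls that, and
updates start_middle.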
> +			zhdr->start_middle = new_start;
> +			clear_bit(UNDER_COMPACTION, &page->private);
> +			return 1;
> +		}
> +	}
> +out:
> +	clear_bit(UNDER_COMPACTION, &page->private);
> +	return 0;
> +}
> +
> +static unsigned long z3fold_shrink_count(struct shrinker *shrink,
> +				struct shrink_control *sc)
> +{
> +	struct z3fold_pool *pool = container_of(shrink, struct z3fold_pool,
> +						shrinker);
> +
> +	return atomic64_read(&pool->unbuddied_nr);
> +}
> +
> +static unsigned long z3fold_shrink_scan(struct shrinker *shrink,
> +				struct shrink_control *sc)
> +{
> +	struct z3fold_pool *pool = container_of(shrink, struct z3fold_pool,
> +						shrinker);
> +	struct z3fold_header *zhdr;
> +	int i, nr_to_scan = sc->nr_to_scan, nr_shrunk = 0;
> +
> +	spin_lock(&pool->lock);
> +	for_each_unbuddied_list_down(i, NCHUNKS - 3) {
> +		if (!list_empty(&pool->unbuddied[i])) {
> +			zhdr = list_first_entry(&pool->unbuddied[i],
> +						struct z3fold_header, buddy);
> +			list_del(&zhdr->buddy);
> +			spin_unlock(&pool->lock);
> +			nr_shrunk += z3fold_compact_page(zhdr, false);
> +			spin_lock(&pool->lock);
> +			list_add(&zhdr->buddy,
> +				 &pool->unbuddied[num_free_chunks(zhdr)]);

Use list_add_tail(); we just compacted this page, and putting it at
the head of the new unbuddied list will cause it to be unnecessarily
scanned first later.

> +			if (!--nr_to_scan)
> +				break;
> +		}
> +	}
> +	spin_unlock(&pool->lock);
> +	return nr_shrunk;
> +}
> +
> +
>  /*****************
>  * API Functions
>  *****************/
> @@ -230,7 +335,7 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
>
> 	pool = kzalloc(sizeof(struct z3fold_pool), gfp);
> 	if (!pool)
> -		return NULL;
> +		goto out;
> 	spin_lock_init(&pool->lock);
> 	for_each_unbuddied_list(i, 0)
> 		INIT_LIST_HEAD(&pool->unbuddied[i]);
> @@ -238,8 +343,19 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
> 	INIT_LIST_HEAD(&pool->lru);
> 	atomic64_set(&pool->pages_nr, 0);
> 	atomic64_set(&pool->unbuddied_nr, 0);
> +	pool->shrinker.count_objects = z3fold_shrink_count;
> +	pool->shrinker.scan_objects = z3fold_shrink_scan;
> +	pool->shrinker.seeks = DEFAULT_SEEKS;
> +	pool->shrinker.batch = NCHUNKS - 4;
> +	if (register_shrinker(&pool->shrinker))
> +		goto out_free;
> 	pool->ops = ops;
> 	return pool;
> +
> +out_free:
> +	kfree(pool);
> +out:
> +	return NULL;
>  }
>
>  /**
> @@ -250,31 +366,10 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
>   */
>  static void z3fold_destroy_pool(struct z3fold_pool *pool)
>  {
> +	unregister_shrinker(&pool->shrinker);
> 	kfree(pool);
>  }
>
> -/* Has to be called with lock held */
> -static int z3fold_compact_page(struct z3fold_header *zhdr)
> -{
> -	struct page *page = virt_to_page(zhdr);
> -	void *beg = zhdr;
> -
> -
> -	if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
> -	    zhdr->middle_chunks != 0 &&
> -	    zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
> -		memmove(beg + ZHDR_SIZE_ALIGNED,
> -			beg + (zhdr->start_middle << CHUNK_SHIFT),
> -			zhdr->middle_chunks << CHUNK_SHIFT);
> -		zhdr->first_chunks = zhdr->middle_chunks;
> -		zhdr->middle_chunks = 0;
> -		zhdr->start_middle = 0;
> -		zhdr->first_num++;
> -		return 1;
> -	}
> -	return 0;
> -}
> -
>  /**
>   * z3fold_alloc() - allocates a region of a given size
>   * @pool:	z3fold pool from which to allocate
> @@ -464,7 +559,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
> 		free_z3fold_page(zhdr);
> 		atomic64_dec(&pool->pages_nr);
> 	} else {
> -		z3fold_compact_page(zhdr);

Why remove this?

> 		/* Add to the unbuddied list */
> 		freechunks = num_free_chunks(zhdr);
> 		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
> @@ -596,7 +690,7 @@ next:
> 		/* Full, add to buddied list */
> 		list_add(&zhdr->buddy, &pool->buddied);
> 	} else {
> -		z3fold_compact_page(zhdr);
> +		z3fold_compact_page(zhdr, true);
> 		/* add to unbuddied list */
> 		freechunks = num_free_chunks(zhdr);
> 		list_add(&zhdr->buddy,
> --
> 2.4.2
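To be concrete about the "why remove this?" above: before this patch
the free path did the cheap compaction inline, i.e. roughly (sketch,
adapted to this patch's new two-argument signature):

	} else {
		z3fold_compact_page(zhdr, true);
		/* Add to the unbuddied list */
		freechunks = num_free_chunks(zhdr);
		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
	}

That is exactly the kind of alloc/free-path compaction I'd rather see
improved, instead of moving the work into a shrinker.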