On Mon, Oct 17, 2016 at 10:45 PM, Vitaly Wool <vitalywool@xxxxxxxxx> wrote:
> Hi Dan,
>
> On Tue, Oct 18, 2016 at 4:06 AM, Dan Streetman <ddstreet@xxxxxxxx> wrote:
>> On Sat, Oct 15, 2016 at 8:05 AM, Vitaly Wool <vitalywool@xxxxxxxxx> wrote:
>>> This patch implements shrinker for z3fold. This shrinker
>>> implementation does not free up any pages directly but it allows
>>> for a denser placement of compressed objects which results in
>>> less actual pages consumed and higher compression ratio therefore.
>>>
>>> This update removes z3fold page compaction from the freeing path
>>> since we can rely on shrinker to do the job. Also, a new flag
>>> UNDER_COMPACTION is introduced to protect against two threads
>>> trying to compact the same page.
>>
>> i'm completely unconvinced that this should be a shrinker. The
>> alloc/free paths are much, much better suited to compacting a page
>> than a shrinker that must scan through all the unbuddied pages. Why
>> not just improve compaction for the alloc/free paths?
>
> Basically the main reason is performance, I want to avoid compaction on hot
> paths as much as possible. This patchset brings both performance and
> compression ratio gain, I'm not sure how to achieve that with improving
> compaction on alloc/free paths.

It seems like a tradeoff of a slight improvement in the hot paths for a
significant decrease in performance from adding a shrinker, which will do
a lot of unnecessary scanning. The alloc/free/unmap functions are working
directly with the page at exactly the point where compaction is needed -
when adding or removing a bud from the page.

Sorry if I missed it in earlier emails, but have you done any performance
measurements comparing with/without the shrinker? The compression ratio
gains may be possible with only the z3fold_compact_page() improvements,
and performance may be stable (or better) with only a per-z3fold-page
lock, instead of adding the shrinker...?

If a shrinker really is needed, it seems like it would be better suited
to coalescing separate z3fold pages via migration, like zsmalloc does
(although that's a significant amount of work).
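
To make the per-page lock idea a bit more concrete, here is a rough,
untested sketch of the free-path side (z3fold_free_compact() and the
page_lock field are made up for illustration only, and this assumes the
original single-argument z3fold_compact_page()):

/*
 * Sketch only: compact in the free path, serialized by a hypothetical
 * per-page spinlock instead of page flags, then put the page back on
 * the unbuddied list that matches its new free space.  Assumes the
 * caller has already cleared the freed bud and taken the page off its
 * old unbuddied list.
 */
static void z3fold_free_compact(struct z3fold_pool *pool,
                                struct z3fold_header *zhdr)
{
        spin_lock(&zhdr->page_lock);    /* assumed new field in z3fold_header */
        z3fold_compact_page(zhdr);
        spin_unlock(&zhdr->page_lock);

        spin_lock(&pool->lock);
        list_add_tail(&zhdr->buddy, &pool->unbuddied[num_free_chunks(zhdr)]);
        spin_unlock(&pool->lock);
}

That keeps compaction right at the point where a bud was just removed,
without holding pool->lock across the memmove.
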
>
>>
>>>
>>> This patch has been checked with the latest Linus's tree.
>>>
>>> Signed-off-by: Vitaly Wool <vitalywool@xxxxxxxxx>
>>> ---
>>>  mm/z3fold.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++-----------
>>>  1 file changed, 119 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/mm/z3fold.c b/mm/z3fold.c
>>> index 10513b5..8f84d3c 100644
>>> --- a/mm/z3fold.c
>>> +++ b/mm/z3fold.c
>>> @@ -27,6 +27,7 @@
>>>  #include <linux/mm.h>
>>>  #include <linux/module.h>
>>>  #include <linux/preempt.h>
>>> +#include <linux/shrinker.h>
>>>  #include <linux/slab.h>
>>>  #include <linux/spinlock.h>
>>>  #include <linux/zpool.h>
>>> @@ -72,6 +73,7 @@ struct z3fold_ops {
>>>   * @unbuddied_nr:   number of unbuddied z3fold pages in the pool.
>>>   * @ops:    pointer to a structure of user defined operations specified at
>>>   *          pool creation time.
>>> + * @shrinker:       shrinker structure to optimize page layout in background
>>>   *
>>>   * This structure is allocated at pool creation time and maintains metadata
>>>   * pertaining to a particular z3fold pool.
>>> @@ -86,6 +88,7 @@ struct z3fold_pool {
>>>         const struct z3fold_ops *ops;
>>>         struct zpool *zpool;
>>>         const struct zpool_ops *zpool_ops;
>>> +       struct shrinker shrinker;
>>>  };
>>>
>>>  enum buddy {
>>> @@ -121,6 +124,7 @@ enum z3fold_page_flags {
>>>         UNDER_RECLAIM = 0,
>>>         PAGE_HEADLESS,
>>>         MIDDLE_CHUNK_MAPPED,
>>> +       UNDER_COMPACTION,
>>>  };
>>>
>>>  /*****************
>>> @@ -136,6 +140,9 @@ static int size_to_chunks(size_t size)
>>>  #define for_each_unbuddied_list(_iter, _begin) \
>>>         for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
>>>
>>> +#define for_each_unbuddied_list_down(_iter, _end) \
>>> +       for ((_iter) = (_end); (_iter) > 0; (_iter)--)
>>> +
>>
>> bikeshed: the conventional suffix is _reverse, not _down, i.e.
>> for_each_unbuddied_list_reverse()
>
> Ok :)
>
>>>  /* Initializes the z3fold header of a newly allocated z3fold page */
>>>  static struct z3fold_header *init_z3fold_page(struct page *page)
>>>  {
>>> @@ -145,6 +152,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
>>>         clear_bit(UNDER_RECLAIM, &page->private);
>>>         clear_bit(PAGE_HEADLESS, &page->private);
>>>         clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
>>> +       clear_bit(UNDER_COMPACTION, &page->private);
>>>
>>>         zhdr->first_chunks = 0;
>>>         zhdr->middle_chunks = 0;
>>> @@ -211,6 +219,103 @@ static int num_free_chunks(struct z3fold_header *zhdr)
>>>         return nfree;
>>>  }
>>>
>>> +/* Has to be called with lock held */
>>> +static int z3fold_compact_page(struct z3fold_header *zhdr, bool sync)
>>> +{
>>> +       struct page *page = virt_to_page(zhdr);
>>> +       void *beg = zhdr;
>>> +
>>> +
>>> +       if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
>>> +           !test_bit(UNDER_RECLAIM, &page->private) &&
>>> +           !test_bit(UNDER_COMPACTION, &page->private)) {
>>> +               set_bit(UNDER_COMPACTION, &page->private);
>>
>> i assume the reclaim check, and new compaction bit check, is due to
>> the removal of the spinlock, which was incorrect. changing to a
>> per-page spinlock may be the best way to handle this, but flags are
>> absolutely not appropriate - they don't provide the needed locking.
>> Even if the compaction bit was the only locking needed (which it
>> isn't), it still isn't correct here - while extremely unlikely, it's
>> still possible for multiple threads to race between checking the
>> compaction bit, and setting it. That's what test_and_set_bit() is
>> for.
>
> Yep, thanks, will fix that.
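
Just to spell out the test_and_set_bit() pattern (a sketch only, and as
above I don't think a flag alone gives you the locking you actually need
here):

        /* atomically claim the page; back off if another thread got here first */
        if (test_and_set_bit(UNDER_COMPACTION, &page->private))
                return 0;

        if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private) ||
            test_bit(UNDER_RECLAIM, &page->private)) {
                clear_bit(UNDER_COMPACTION, &page->private);
                return 0;
        }
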
>
>>
>>> +               if (zhdr->middle_chunks != 0 &&
>>
>> only need to check if the middle bud is in use once; if it isn't,
>> there's no compaction to do.
>>
>>> +                   zhdr->first_chunks == 0 &&
>>> +                   zhdr->last_chunks == 0) {
>>> +                       memmove(beg + ZHDR_SIZE_ALIGNED,
>>> +                               beg + (zhdr->start_middle << CHUNK_SHIFT),
>>> +                               zhdr->middle_chunks << CHUNK_SHIFT);
>>> +                       zhdr->first_chunks = zhdr->middle_chunks;
>>> +                       zhdr->middle_chunks = 0;
>>> +                       zhdr->start_middle = 0;
>>> +                       zhdr->first_num++;
>>> +                       clear_bit(UNDER_COMPACTION, &page->private);
>>> +                       return 1;
>>> +               }
>>> +               if (sync)
>>> +                       goto out;
>>
>> i don't get it, why is that compaction synchronous while the others
>> below aren't?
>
> That is the "important" compaction which changes first_num and brings the
> biggest benefit. Moving middle object closer to another existing one is not
> so important ratio wise, so the idea was to leave that for the shrinker.
>
>>
>>> +
>>> +               /* moving data is expensive, so let's only do that if
>>> +                * there's substantial gain (2+ chunks)
>>
>> "at least 2 chunks" feels arbitrary...it should be a #define instead
>> of a magic number...
>
> ...or the module parameter but that could be added later.
>
>>> +                */
>>> +               if (zhdr->middle_chunks != 0 && zhdr->first_chunks != 0 &&
>>> +                   zhdr->last_chunks == 0 &&
>>> +                   zhdr->start_middle > zhdr->first_chunks + 2) {
>>> +                       unsigned short new_start = zhdr->first_chunks + 1;
>>> +                       memmove(beg + (new_start << CHUNK_SHIFT),
>>> +                               beg + (zhdr->start_middle << CHUNK_SHIFT),
>>> +                               zhdr->middle_chunks << CHUNK_SHIFT);
>>> +                       zhdr->start_middle = new_start;
>>> +                       clear_bit(UNDER_COMPACTION, &page->private);
>>> +                       return 1;
>>> +               }
>>> +               if (zhdr->middle_chunks != 0 && zhdr->last_chunks != 0 &&
>>> +                   zhdr->first_chunks == 0 &&
>>> +                   zhdr->middle_chunks + zhdr->last_chunks <=
>>> +                   NCHUNKS - zhdr->start_middle - 2) {
>>> +                       unsigned short new_start = NCHUNKS - zhdr->last_chunks -
>>> +                               zhdr->middle_chunks;
>>> +                       memmove(beg + (new_start << CHUNK_SHIFT),
>>> +                               beg + (zhdr->start_middle << CHUNK_SHIFT),
>>> +                               zhdr->middle_chunks << CHUNK_SHIFT);
>>
>> these if clauses, and the memmoves, aren't very readable, could it be
>> made any better with a separate function?
>
> Yep, look somewhat scary, I'll think what I can do about it.
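
e.g. something like the following (untested, helper name made up) would
let each of these branches collapse to a condition plus a single call:

/* sketch: move the middle bud to new_start (in chunks) and update the header */
static void z3fold_move_middle(struct z3fold_header *zhdr,
                               unsigned short new_start)
{
        void *beg = zhdr;

        memmove(beg + (new_start << CHUNK_SHIFT),
                beg + (zhdr->start_middle << CHUNK_SHIFT),
                zhdr->middle_chunks << CHUNK_SHIFT);
        zhdr->start_middle = new_start;
}

The first case is a little different since it also turns the middle bud
into the first bud, but at least these last two branches would read much
better.
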
>
>>> +                       zhdr->start_middle = new_start;
>>> +                       clear_bit(UNDER_COMPACTION, &page->private);
>>> +                       return 1;
>>> +               }
>>> +       }
>>> +out:
>>> +       clear_bit(UNDER_COMPACTION, &page->private);
>>> +       return 0;
>>> +}
>>> +
>>> +static unsigned long z3fold_shrink_count(struct shrinker *shrink,
>>> +                               struct shrink_control *sc)
>>> +{
>>> +       struct z3fold_pool *pool = container_of(shrink, struct z3fold_pool,
>>> +                                               shrinker);
>>> +
>>> +       return atomic64_read(&pool->unbuddied_nr);
>>> +}
>>> +
>>> +static unsigned long z3fold_shrink_scan(struct shrinker *shrink,
>>> +                               struct shrink_control *sc)
>>> +{
>>> +       struct z3fold_pool *pool = container_of(shrink, struct z3fold_pool,
>>> +                                               shrinker);
>>> +       struct z3fold_header *zhdr;
>>> +       int i, nr_to_scan = sc->nr_to_scan, nr_shrunk = 0;
>>> +
>>> +       spin_lock(&pool->lock);
>>> +       for_each_unbuddied_list_down(i, NCHUNKS - 3) {
>>> +               if (!list_empty(&pool->unbuddied[i])) {
>>> +                       zhdr = list_first_entry(&pool->unbuddied[i],
>>> +                                               struct z3fold_header, buddy);
>>> +                       list_del(&zhdr->buddy);
>>> +                       spin_unlock(&pool->lock);
>>> +                       nr_shrunk += z3fold_compact_page(zhdr, false);
>>> +                       spin_lock(&pool->lock);
>>> +                       list_add(&zhdr->buddy,
>>> +                                &pool->unbuddied[num_free_chunks(zhdr)]);
>>
>> use list_add_tail(), we just compacted it and putting it at the head
>> of the new unbuddied list will cause it to be unnecessarily scanned
>> first later.
>>
>>> +                       if (!--nr_to_scan)
>>> +                               break;
>>> +               }
>>> +       }
>>> +       spin_unlock(&pool->lock);
>>> +       return nr_shrunk;
>>> +}
>>> +
>>> +
>>>  /*****************
>>>   * API Functions
>>>  *****************/
>>> @@ -230,7 +335,7 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
>>>
>>>         pool = kzalloc(sizeof(struct z3fold_pool), gfp);
>>>         if (!pool)
>>> -               return NULL;
>>> +               goto out;
>>>         spin_lock_init(&pool->lock);
>>>         for_each_unbuddied_list(i, 0)
>>>                 INIT_LIST_HEAD(&pool->unbuddied[i]);
>>> @@ -238,8 +343,19 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
>>>         INIT_LIST_HEAD(&pool->lru);
>>>         atomic64_set(&pool->pages_nr, 0);
>>>         atomic64_set(&pool->unbuddied_nr, 0);
>>> +       pool->shrinker.count_objects = z3fold_shrink_count;
>>> +       pool->shrinker.scan_objects = z3fold_shrink_scan;
>>> +       pool->shrinker.seeks = DEFAULT_SEEKS;
>>> +       pool->shrinker.batch = NCHUNKS - 4;
>>> +       if (register_shrinker(&pool->shrinker))
>>> +               goto out_free;
>>>         pool->ops = ops;
>>>         return pool;
>>> +
>>> +out_free:
>>> +       kfree(pool);
>>> +out:
>>> +       return NULL;
>>>  }
>>>
>>>  /**
>>> @@ -250,31 +366,10 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
>>>   */
>>>  static void z3fold_destroy_pool(struct z3fold_pool *pool)
>>>  {
>>> +       unregister_shrinker(&pool->shrinker);
>>>         kfree(pool);
>>>  }
>>>
>>> -/* Has to be called with lock held */
>>> -static int z3fold_compact_page(struct z3fold_header *zhdr)
>>> -{
>>> -       struct page *page = virt_to_page(zhdr);
>>> -       void *beg = zhdr;
>>> -
>>> -
>>> -       if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
>>> -           zhdr->middle_chunks != 0 &&
>>> -           zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
>>> -               memmove(beg + ZHDR_SIZE_ALIGNED,
>>> -                       beg + (zhdr->start_middle << CHUNK_SHIFT),
>>> -                       zhdr->middle_chunks << CHUNK_SHIFT);
>>> -               zhdr->first_chunks = zhdr->middle_chunks;
>>> -               zhdr->middle_chunks = 0;
>>> -               zhdr->start_middle = 0;
>>> -               zhdr->first_num++;
>>> -               return 1;
>>> -       }
>>> -       return 0;
>>> -}
>>> -
>>>  /**
>>>   * z3fold_alloc() - allocates a region of a given size
>>>   * @pool:       z3fold pool from which to allocate
>>> @@ -464,7 +559,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
>>>                         free_z3fold_page(zhdr);
>>>                         atomic64_dec(&pool->pages_nr);
>>>                 } else {
>>> -                       z3fold_compact_page(zhdr);
>>
>> why remove this?
>
> As I've said above, that gives some performance improvement.
>
> ~vitaly

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>