On Thu, Oct 27, 2016 at 7:13 AM, Vitaly Wool <vitalywool@xxxxxxxxx> wrote: > This patch implements compaction worker thread for z3fold. This > worker does not free up any pages directly but it allows for a > denser placement of compressed objects which results in less > actual pages consumed and higher compression ratio therefore. > > This patch has been checked with the latest Linus's tree. > > Signed-off-by: Vitaly Wool <vitalywool@xxxxxxxxx> > --- > mm/z3fold.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++---------- > 1 file changed, 140 insertions(+), 26 deletions(-) > > diff --git a/mm/z3fold.c b/mm/z3fold.c > index 014d84f..cc26ff5 100644 > --- a/mm/z3fold.c > +++ b/mm/z3fold.c > @@ -27,6 +27,7 @@ > #include <linux/mm.h> > #include <linux/module.h> > #include <linux/preempt.h> > +#include <linux/workqueue.h> > #include <linux/slab.h> > #include <linux/spinlock.h> > #include <linux/zpool.h> > @@ -59,6 +60,7 @@ struct z3fold_ops { > > /** > * struct z3fold_pool - stores metadata for each z3fold pool > + * @name: pool name > * @lock: protects all pool fields and first|last_chunk fields of any > * z3fold page in the pool > * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; > @@ -72,11 +74,14 @@ struct z3fold_ops { > * @unbuddied_nr: number of unbuddied z3fold pages in the pool. > * @ops: pointer to a structure of user defined operations specified at > * pool creation time. > + * @compact_wq: workqueue for page layout background optimization > + * @work: compaction work item > * > * This structure is allocated at pool creation time and maintains metadata > * pertaining to a particular z3fold pool. > */ > struct z3fold_pool { > + const char *name; > rwlock_t lock; > struct list_head unbuddied[NCHUNKS]; > struct list_head buddied; > @@ -86,6 +91,8 @@ struct z3fold_pool { > const struct z3fold_ops *ops; > struct zpool *zpool; > const struct zpool_ops *zpool_ops; > + struct workqueue_struct *compact_wq; > + struct delayed_work work; > }; > > enum buddy { > @@ -121,6 +128,7 @@ enum z3fold_page_flags { > UNDER_RECLAIM = 0, > PAGE_HEADLESS, > MIDDLE_CHUNK_MAPPED, > + COMPACTION_DEFERRED, > }; > > /***************** > @@ -136,6 +144,9 @@ static int size_to_chunks(size_t size) > #define for_each_unbuddied_list(_iter, _begin) \ > for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) > > +#define for_each_unbuddied_list_reverse(_iter, _end) \ > + for ((_iter) = (_end); (_iter) > 0; (_iter)--) > + > /* Initializes the z3fold header of a newly allocated z3fold page */ > static struct z3fold_header *init_z3fold_page(struct page *page) > { > @@ -145,6 +156,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page) > clear_bit(UNDER_RECLAIM, &page->private); > clear_bit(PAGE_HEADLESS, &page->private); > clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); > + clear_bit(COMPACTION_DEFERRED, &page->private); > > zhdr->first_chunks = 0; > zhdr->middle_chunks = 0; > @@ -211,6 +223,116 @@ static int num_free_chunks(struct z3fold_header *zhdr) > return nfree; > } > > +static inline void *mchunk_memmove(struct z3fold_header *zhdr, > + unsigned short dst_chunk) > +{ > + void *beg = zhdr; > + return memmove(beg + (dst_chunk << CHUNK_SHIFT), > + beg + (zhdr->start_middle << CHUNK_SHIFT), > + zhdr->middle_chunks << CHUNK_SHIFT); > +} > + > +static int z3fold_compact_page(struct z3fold_header *zhdr, bool sync) > +{ > + struct page *page = virt_to_page(zhdr); > + int ret = 0; > + > + if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) { > + set_bit(COMPACTION_DEFERRED, &page->private); > + ret = -1; > + goto out; > + } > + > + clear_bit(COMPACTION_DEFERRED, &page->private); > + if (zhdr->middle_chunks != 0) { > + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { > + /* > + * If we are here, no one can access this page > + * except for z3fold_map or z3fold_free. Both > + * will wait for page_lock to become free. > + */ > + mchunk_memmove(zhdr, 1); /* move to the beginning */ > + zhdr->first_chunks = zhdr->middle_chunks; > + zhdr->middle_chunks = 0; > + zhdr->start_middle = 0; > + zhdr->first_num++; > + ret = 1; > + goto out; > + } > + if (sync) > + goto out; > + > + /* > + * These are more complicated cases: first or last object > + * may be mapped. Luckily we don't touch these anyway. > + * > + * NB: moving data is expensive, so let's only do that if > + * there's substantial gain (2+ chunks) > + */ > + if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 && > + zhdr->start_middle > zhdr->first_chunks + 2) { > + mchunk_memmove(zhdr, zhdr->first_chunks + 1); > + zhdr->start_middle = zhdr->first_chunks + 1; > + ret = 1; > + goto out; > + } > + if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 && > + zhdr->middle_chunks + zhdr->last_chunks <= > + NCHUNKS - zhdr->start_middle - 2) { > + unsigned short new_start = NCHUNKS - zhdr->last_chunks - > + zhdr->middle_chunks; > + mchunk_memmove(zhdr, new_start); > + zhdr->start_middle = new_start; > + ret = 1; > + goto out; > + } > + } > +out: > + return ret; > +} > + > +#define COMPACTION_BATCH (NCHUNKS/2) > +static void z3fold_compact_work(struct work_struct *w) > +{ > + struct z3fold_pool *pool = container_of(to_delayed_work(w), > + struct z3fold_pool, work); > + struct z3fold_header *zhdr; > + struct page *page; > + int i, ret, compacted = 0; > + bool requeue = false; > + > + write_lock(&pool->lock); > + for_each_unbuddied_list_reverse(i, NCHUNKS - 3) { do we still need to scan each unbuddied list? This is a lot of overhead when we already knew exactly what page(s) we marked as compaction deferred. also, this is only scanning the last entry in each unbuddied list - but that's possibly missing a whole lot of entries assuming the unbuddied lists are larger than 1 entry each. Can't we add one more list_head element to the zhdr, and add a compaction_needed list to the pool? Then the workqueue work can just quickly process all the entries in that list. > + zhdr = list_empty(&pool->unbuddied[i]) ? > + NULL : list_last_entry(&pool->unbuddied[i], > + struct z3fold_header, buddy); > + if (!zhdr) > + continue; > + page = virt_to_page(zhdr); > + if (likely(!test_bit(COMPACTION_DEFERRED, &page->private))) > + continue; > + list_del(&zhdr->buddy); > + ret = z3fold_compact_page(zhdr, false); > + if (ret < 0) > + requeue = true; > + else > + compacted += ret; > + write_unlock(&pool->lock); > + cond_resched(); > + write_lock(&pool->lock); > + list_add(&zhdr->buddy, > + &pool->unbuddied[num_free_chunks(zhdr)]); > + if (compacted >= COMPACTION_BATCH) { instead of doing batches, if we have a dedicated list of entries that we know need compaction, just process the entire list. call cond_resched() between compacting entries. > + requeue = true; > + break; > + } > + } > + write_unlock(&pool->lock); > + if (requeue && !delayed_work_pending(&pool->work)) > + queue_delayed_work(pool->compact_wq, &pool->work, HZ); > +} > + > + > /***************** > * API Functions > *****************/ > @@ -230,7 +352,7 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, > > pool = kzalloc(sizeof(struct z3fold_pool), gfp); > if (!pool) > - return NULL; > + goto out; > rwlock_init(&pool->lock); > for_each_unbuddied_list(i, 0) > INIT_LIST_HEAD(&pool->unbuddied[i]); > @@ -238,8 +360,17 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, > INIT_LIST_HEAD(&pool->lru); > atomic64_set(&pool->pages_nr, 0); > atomic64_set(&pool->unbuddied_nr, 0); > + pool->compact_wq = create_singlethread_workqueue(pool->name); > + if (!pool->compact_wq) > + goto out; > + INIT_DELAYED_WORK(&pool->work, z3fold_compact_work); > pool->ops = ops; > return pool; > + > +out: > + if (pool) > + kfree(pool); > + return NULL; > } > > /** > @@ -250,31 +381,11 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, > */ > static void z3fold_destroy_pool(struct z3fold_pool *pool) > { > + if (pool->compact_wq) > + destroy_workqueue(pool->compact_wq); > kfree(pool); > } > > -/* Has to be called with lock held */ > -static int z3fold_compact_page(struct z3fold_header *zhdr) > -{ > - struct page *page = virt_to_page(zhdr); > - void *beg = zhdr; > - > - > - if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) && > - zhdr->middle_chunks != 0 && > - zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { > - memmove(beg + ZHDR_SIZE_ALIGNED, > - beg + (zhdr->start_middle << CHUNK_SHIFT), > - zhdr->middle_chunks << CHUNK_SHIFT); > - zhdr->first_chunks = zhdr->middle_chunks; > - zhdr->middle_chunks = 0; > - zhdr->start_middle = 0; > - zhdr->first_num++; > - return 1; > - } > - return 0; > -} > - > /** > * z3fold_alloc() - allocates a region of a given size > * @pool: z3fold pool from which to allocate > @@ -464,11 +575,13 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) > free_z3fold_page(zhdr); > atomic64_dec(&pool->pages_nr); > } else { > - z3fold_compact_page(zhdr); > + set_bit(COMPACTION_DEFERRED, &page->private); can't we at least try to compact inline? at minimum, we should throw this to the compact function to mark for compaction, if needed - we can certainly at least do the zhdr placement calculations to see if compaction is needed, even if we don't want to actually do the compaction right now. > /* Add to the unbuddied list */ > freechunks = num_free_chunks(zhdr); > - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); > + list_add_tail(&zhdr->buddy, &pool->unbuddied[freechunks]); > atomic64_inc(&pool->unbuddied_nr); > + if (!delayed_work_pending(&pool->work)) > + queue_delayed_work(pool->compact_wq, &pool->work, HZ); > } > > write_unlock(&pool->lock); > @@ -596,7 +709,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) > /* Full, add to buddied list */ > list_add(&zhdr->buddy, &pool->buddied); > } else { > - z3fold_compact_page(zhdr); > + z3fold_compact_page(zhdr, true); > /* add to unbuddied list */ > freechunks = num_free_chunks(zhdr); > list_add(&zhdr->buddy, > @@ -725,6 +838,7 @@ static void *z3fold_zpool_create(const char *name, gfp_t gfp, > if (pool) { > pool->zpool = zpool; > pool->zpool_ops = zpool_ops; > + pool->name = name; > } > return pool; > } > -- > 2.4.2 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>