On Thu, Jul 30, 2020 at 5:48 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>
> When doing some testing recently, I hit some page allocation failures
> on mount, when creating the wb_pagevec_pool for the mount. That
> requires 128k (32 contiguous pages), and after thrashing the memory
> during an xfstests run, sometimes that would fail.
>
> 128k for each mount seems like a lot to hold in reserve for a rainy
> day, so let's change this to a global mempool that gets allocated
> when the module is plugged in.
>
> Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> ---
>  fs/ceph/addr.c               | 23 +++++++++++------------
>  fs/ceph/super.c              | 22 ++++++++--------------
>  fs/ceph/super.h              |  2 --
>  include/linux/ceph/libceph.h |  1 +
>  4 files changed, 20 insertions(+), 28 deletions(-)
>
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 01ad09733ac7..6ea761c84494 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -862,8 +862,7 @@ static void writepages_finish(struct ceph_osd_request *req)
>
>          osd_data = osd_req_op_extent_osd_data(req, 0);
>          if (osd_data->pages_from_pool)
> -                mempool_free(osd_data->pages,
> -                             ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
> +                mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
>          else
>                  kfree(osd_data->pages);
>          ceph_osdc_put_request(req);
> @@ -955,10 +954,10 @@ static int ceph_writepages_start(struct address_space *mapping,
>          int num_ops = 0, op_idx;
>          unsigned i, pvec_pages, max_pages, locked_pages = 0;
>          struct page **pages = NULL, **data_pages;
> -        mempool_t *pool = NULL; /* Becomes non-null if mempool used */
>          struct page *page;
>          pgoff_t strip_unit_end = 0;
>          u64 offset = 0, len = 0;
> +        bool from_pool = false;
>
>          max_pages = wsize >> PAGE_SHIFT;
>
> @@ -1057,16 +1056,16 @@ static int ceph_writepages_start(struct address_space *mapping,
>                                                        sizeof(*pages),
>                                                        GFP_NOFS);
>                          if (!pages) {
> -                                pool = fsc->wb_pagevec_pool;
> -                                pages = mempool_alloc(pool, GFP_NOFS);
> +                                from_pool = true;
> +                                pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
>                                  BUG_ON(!pages);
>                          }
>
>                          len = 0;
>                  } else if (page->index !=
>                             (offset + len) >> PAGE_SHIFT) {
> -                        if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
> -                                        CEPH_OSD_MAX_OPS)) {
> +                        if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS :
> +                                        CEPH_OSD_MAX_OPS)) {
>                                  redirty_page_for_writepage(wbc, page);
>                                  unlock_page(page);
>                                  break;
> @@ -1161,7 +1160,7 @@ static int ceph_writepages_start(struct address_space *mapping,
>                                                          offset, len);
>                          osd_req_op_extent_osd_data_pages(req, op_idx,
>                                                           data_pages, len, 0,
> -                                                         !!pool, false);
> +                                                         from_pool, false);
>                          osd_req_op_extent_update(req, op_idx, len);
>
>                          len = 0;
> @@ -1188,12 +1187,12 @@ static int ceph_writepages_start(struct address_space *mapping,
>                  dout("writepages got pages at %llu~%llu\n", offset, len);
>
>                  osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
> -                                                 0, !!pool, false);
> +                                                 0, from_pool, false);
>                  osd_req_op_extent_update(req, op_idx, len);
>
>                  BUG_ON(op_idx + 1 != req->r_num_ops);
>
> -                pool = NULL;
> +                from_pool = false;
>                  if (i < locked_pages) {
>                          BUG_ON(num_ops <= req->r_num_ops);
>                          num_ops -= req->r_num_ops;
> @@ -1204,8 +1203,8 @@ static int ceph_writepages_start(struct address_space *mapping,
>                          pages = kmalloc_array(locked_pages, sizeof(*pages),
>                                                GFP_NOFS);
>                          if (!pages) {
> -                                pool = fsc->wb_pagevec_pool;
> -                                pages = mempool_alloc(pool, GFP_NOFS);
> +                                from_pool = true;
> +                                pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
>                                  BUG_ON(!pages);
>                          }
>                          memcpy(pages, data_pages + i,
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 585aecea5cad..7ec0e6d03d10 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -637,8 +637,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>                                          struct ceph_options *opt)
>  {
>          struct ceph_fs_client *fsc;
> -        int page_count;
> -        size_t size;
>          int err;
>
>          fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
> @@ -686,22 +684,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>          if (!fsc->cap_wq)
>                  goto fail_inode_wq;
>
> -        /* set up mempools */
> -        err = -ENOMEM;
> -        page_count = fsc->mount_options->wsize >> PAGE_SHIFT;
> -        size = sizeof (struct page *) * (page_count ? page_count : 1);
> -        fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
> -        if (!fsc->wb_pagevec_pool)
> -                goto fail_cap_wq;
> -
>          spin_lock(&ceph_fsc_lock);
>          list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
>          spin_unlock(&ceph_fsc_lock);
>
>          return fsc;
>
> -fail_cap_wq:
> -        destroy_workqueue(fsc->cap_wq);
>  fail_inode_wq:
>          destroy_workqueue(fsc->inode_wq);
>  fail_client:
> @@ -732,8 +720,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
>          destroy_workqueue(fsc->inode_wq);
>          destroy_workqueue(fsc->cap_wq);
>
> -        mempool_destroy(fsc->wb_pagevec_pool);
> -
>          destroy_mount_options(fsc->mount_options);
>
>          ceph_destroy_client(fsc->client);
> @@ -752,6 +738,7 @@ struct kmem_cache *ceph_dentry_cachep;
>  struct kmem_cache *ceph_file_cachep;
>  struct kmem_cache *ceph_dir_file_cachep;
>  struct kmem_cache *ceph_mds_request_cachep;
> +mempool_t *ceph_wb_pagevec_pool;
>
>  static void ceph_inode_init_once(void *foo)
>  {
> @@ -796,6 +783,10 @@ static int __init init_caches(void)
>          if (!ceph_mds_request_cachep)
>                  goto bad_mds_req;
>
> +        ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT);
> +        if (!ceph_wb_pagevec_pool)
> +                goto bad_pagevec_pool;
> +
>          error = ceph_fscache_register();
>          if (error)
>                  goto bad_fscache;
> @@ -804,6 +795,8 @@ static int __init init_caches(void)
>
>  bad_fscache:
>          kmem_cache_destroy(ceph_mds_request_cachep);
> +bad_pagevec_pool:
> +        mempool_destroy(ceph_wb_pagevec_pool);
>  bad_mds_req:
>          kmem_cache_destroy(ceph_dir_file_cachep);
>  bad_dir_file:
> @@ -834,6 +827,7 @@ static void destroy_caches(void)
>          kmem_cache_destroy(ceph_file_cachep);
>          kmem_cache_destroy(ceph_dir_file_cachep);
>          kmem_cache_destroy(ceph_mds_request_cachep);
> +        mempool_destroy(ceph_wb_pagevec_pool);
>
>          ceph_fscache_unregister();
>  }
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 9001a896ae8c..4c3c964b1c54 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -118,8 +118,6 @@ struct ceph_fs_client {
>
>          struct ceph_mds_client *mdsc;
>
> -        /* writeback */
> -        mempool_t *wb_pagevec_pool;
>          atomic_long_t writeback_count;
>
>          struct workqueue_struct *inode_wq;
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index e5ed1c541e7f..c8645f0b797d 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -282,6 +282,7 @@ extern struct kmem_cache *ceph_dentry_cachep;
>  extern struct kmem_cache *ceph_file_cachep;
>  extern struct kmem_cache *ceph_dir_file_cachep;
>  extern struct kmem_cache *ceph_mds_request_cachep;
> +extern mempool_t *ceph_wb_pagevec_pool;
>
>  /* ceph_common.c */
>  extern bool libceph_compatible(void *data);

Looks fine to me.  I think it used to be just a single page per mount
before 95cca2b44e54 ("ceph: limit osd write size") because fsopt->wsize
defaulted to 0 which meant "no limit".  And CEPH_MSG_MAX_DATA_LEN got
increased from 16M to 64M as well...

Thanks,

                Ilya
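
For readers following the thread, here is a minimal sketch of the pattern
the patch adopts: one module-global mempool sized for the largest write,
with callers trying a regular kmalloc_array() first and dipping into the
reserve only when that fails.  The names example_pagevec_pool, alloc_pagevec
and free_pagevec are hypothetical, SZ_64M stands in for CEPH_MAX_WRITE_SIZE,
and the element sizing follows the old per-mount computation (pointer count
times pointer size) assuming 4 KiB pages and 8-byte pointers; this is not
the exact fs/ceph code.

/*
 * Sketch only: a module-wide mempool with a kmalloc_array() fast path.
 * Names are hypothetical; sizing assumes 4 KiB pages and 8-byte pointers.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>

static mempool_t *example_pagevec_pool;  /* one reserve for the whole module */

static int __init example_init(void)
{
        /*
         * One element holds SZ_64M >> PAGE_SHIFT = 16384 page pointers,
         * i.e. 128 KiB -- the "32 contiguous pages" from the changelog.
         * Ten such elements are reserved once, at module load, instead of
         * per mount.
         */
        example_pagevec_pool = mempool_create_kmalloc_pool(10,
                        (SZ_64M >> PAGE_SHIFT) * sizeof(struct page *));
        return example_pagevec_pool ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
        mempool_destroy(example_pagevec_pool);
}

/* Try a normal allocation first; fall back to the reserve only on failure. */
static struct page **alloc_pagevec(unsigned int nr_pages, bool *from_pool)
{
        struct page **pages;

        pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_NOFS);
        if (pages) {
                *from_pool = false;
                return pages;
        }
        *from_pool = true;
        return mempool_alloc(example_pagevec_pool, GFP_NOFS);
}

/* Free with the routine that matches how the array was allocated. */
static void free_pagevec(struct page **pages, bool from_pool)
{
        if (from_pool)
                mempool_free(pages, example_pagevec_pool);
        else
                kfree(pages);
}

Reserving the pool once at module load means the high-order (32-page)
allocations happen before memory has been fragmented by a long run, and the
amount held in reserve no longer scales with the number of mounts.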