Re: a patch to improve cephfs direct io performance

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi, zhucaifeng

The patch generally looks good. But I think there is minor flaw, see
my comment below. Besides, could you write a patch for the newest
kernel (it's easier because there is iov_iter_get_pages() helper).


On Fri, Sep 25, 2015 at 11:52 AM, zhucaifeng <zhucaifeng@xxxxxxxxxxxxxxx> wrote:
> Hi, all
>
> When using cephfs, we find that cephfs direct io is very slow.
> For example, installing Windows 7 takes more than 1 hour on a
> virtual machine whose disk is a file in cephfs.
>
> The cause is that when doing direct io, both ceph_sync_direct_write
> and ceph_sync_read iterate iov elements one by one, and send one
> osd_op request for each iov that covers about 4K~8K bytes data. The
> result is lots of small requests and replies shuttled among the kernel
> client and OSDs.
>
> The fix tries to combine as many iovs as possible into one page vector, and
> then send one request for this page vector. In this way, small requests and
> replies are reduced; the OSD can serve a large read/write request one time.
>
> We observe that the performance is improved by about 5~15 times, dependent
> on
> different application workload.
>
> The fix is below. Some change notes are as follows:
>     - function ceph_get_direct_page_vetor and ceph_put_page_vector
>       are moved from net/ceph/pagevec.c to fs/ceph/file.c, and are
>       renamed as dio_grab_pages and dio_free_pagev respectively.
>
>     - function dio_get_pagevlen and dio_alloc_pagev are newly introduced.
>
>       dio_get_pagevlen is to calculate the page vector length in bytes
>       from the io vectors. Except for the first and last one, an io vector
>       can be merged into a page vector iff its base and tail are page
> aligned.
>       for the first vector, only its tail should be page aligned; for the
> last
>       vector, only its head should be page aligned.
>
>       dio_alloc_pagev allocates pages for each io vector and put these pages
>       together in one page pointer array.
>
> Best Regards!
>
> --- /home/linux-3.10.0-229.7.2.el7/fs/ceph/file.c    2015-05-16
> 09:05:28.000000000 +0800
> +++ ./fs/ceph/file.c    2015-09-17 10:43:44.798591846 +0800
> @@ -34,6 +34,115 @@
>   * need to wait for MDS acknowledgement.
>   */
>
> +/*
> + * Calculate the length sum of direct io vectors that can
> + * be combined into one page vector.
> + */
> +static int
> +dio_get_pagevlen(const struct iov_iter *it)
> +{
> +    const struct iovec *iov = it->iov;
> +    const struct iovec *iovend = iov + it->nr_segs;
> +    int pagevlen;
> +
> +    pagevlen = iov->iov_len - it->iov_offset;
> +    /*
> +     * An iov can be page vectored when both the current tail
> +     * and the next base are page aligned.
> +     */
> +    while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
> +           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
> +        pagevlen += iov->iov_len;
> +    }
> +    dout("dio_get_pagevlen len = %d\n", pagevlen);
> +    return pagevlen;
> +}
> +
> +/*
> + * Grab @num_pages from the process vm space. These pages are
> + * continuous and start from @data.
> + */
> +static int
> +dio_grab_pages(const void *data, int num_pages, bool write_page,
> +    struct page **pages)
> +{
> +    int got = 0;
> +    int rc = 0;
> +
> +    down_read(&current->mm->mmap_sem);
> +    while (got < num_pages) {
> +        rc = get_user_pages(current, current->mm,
> +            (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
> +            num_pages - got, write_page, 0, pages + got, NULL);
> +        if (rc < 0)
> +            break;
> +        BUG_ON(rc == 0);
> +        got += rc;
> +    }
> +    up_read(&current->mm->mmap_sem);
> +    return got;
> +}
> +
> +static void
> +dio_free_pagev(struct page **pages, int num_pages, bool dirty)
> +{
> +    int i;
> +
> +    for (i = 0; i < num_pages; i++) {
> +        if (dirty)
> +            set_page_dirty_lock(pages[i]);
> +        put_page(pages[i]);
> +    }
> +    kfree(pages);
> +}
> +
> +/*
> + * Allocate a page vector based on (@it, @pagevlen).
> + * The return value is the tuple describing a page vector,
> + * that is (@pages, @pagevlen, @page_align, @num_pages).
> + */
> +static struct page **
> +dio_alloc_pagev(const struct iov_iter *it, int pagevlen, bool write_page,
> +    int *page_align, int *num_pages)
> +{
> +    const struct iovec *iov = it->iov;
> +    struct page **pages;
> +    int n, m, k, npages;
> +    int align;
> +    int len;
> +    void *data;
> +
> +    data = iov->iov_base + it->iov_offset;
> +    len = iov->iov_len - it->iov_offset;
> +    align = ((ulong)data) & ~PAGE_MASK;
> +    npages = calc_pages_for((ulong)data, pagevlen);
> +    pages = kmalloc(sizeof(*pages) * npages, GFP_NOFS);
> +    if (!pages)
> +        return ERR_PTR(-ENOMEM);
> +    for (n = 0; n < npages; n += m) {
> +        m = calc_pages_for((ulong)data, len);
> +        if (n + m > npages)
> +            m = npages - n;
> +        k = dio_grab_pages(data, m, write_page, pages + n);
> +        if (k < m) {
> +            n += k;
> +            goto failed;
> +        }

if (align + len) is not page aligned, the loop should stop.

> +
> +        iov++;
> +        data = iov->iov_base;

if (unsigned long)data is not page aligned, the loop should stop


Regards
Yan, Zheng

> +        len = iov->iov_len;
> +    }
> +    *num_pages = npages;
> +    *page_align = align;
> +    dout("dio_alloc_pagev: alloc pages pages[0:%d], page align %d\n",
> +        npages, align);
> +    return pages;
> +
> +failed:
> +    dio_free_pagev(pages, n, false);
> +    return ERR_PTR(-ENOMEM);
> +}
>
>  /*
>   * Prepare an open request.  Preallocate ceph_cap to avoid an
> @@ -354,13 +463,14 @@ more:
>      if (ret >= 0) {
>          int didpages;
>          if (was_short && (pos + ret < inode->i_size)) {
> -            u64 tmp = min(this_len - ret,
> +            u64 zlen = min(this_len - ret,
>                      inode->i_size - pos - ret);
> +            int zoff = (o_direct ? buf_align : io_align) +
> +                   read + ret;
>              dout(" zero gap %llu to %llu\n",
> -                pos + ret, pos + ret + tmp);
> -            ceph_zero_page_vector_range(page_align + read + ret,
> -                            tmp, pages);
> -            ret += tmp;
> +                pos + ret, pos + ret + zlen);
> +            ceph_zero_page_vector_range(zoff, zlen, pages);
> +            ret += zlen;
>          }
>
>          didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
> @@ -421,19 +531,19 @@ static ssize_t ceph_sync_read(struct kio
>
>      if (file->f_flags & O_DIRECT) {
>          while (iov_iter_count(i)) {
> -            void __user *data = i->iov[0].iov_base + i->iov_offset;
> -            size_t len = i->iov[0].iov_len - i->iov_offset;
> -
> -            num_pages = calc_pages_for((unsigned long)data, len);
> -            pages = ceph_get_direct_page_vector(data,
> -                                num_pages, true);
> +            int page_align;
> +            size_t len;
> +
> +            len = dio_get_pagevlen(i);
> +            pages = dio_alloc_pagev(i, len, true, &page_align,
> +                &num_pages);
>              if (IS_ERR(pages))
>                  return PTR_ERR(pages);
>
>              ret = striped_read(inode, off, len,
>                         pages, num_pages, checkeof,
> -                       1, (unsigned long)data & ~PAGE_MASK);
> -            ceph_put_page_vector(pages, num_pages, true);
> +                       1, page_align);
> +            dio_free_pagev(pages, num_pages, true);
>
>              if (ret <= 0)
>                  break;
> @@ -570,10 +680,7 @@ ceph_sync_direct_write(struct kiocb *ioc
>      iov_iter_init(&i, iov, nr_segs, count, 0);
>
>      while (iov_iter_count(&i) > 0) {
> -        void __user *data = i.iov->iov_base + i.iov_offset;
> -        u64 len = i.iov->iov_len - i.iov_offset;
> -
> -        page_align = (unsigned long)data & ~PAGE_MASK;
> +        u64 len = dio_get_pagevlen(&i);
>
>          snapc = ci->i_snap_realm->cached_context;
>          vino = ceph_vino(inode);
> @@ -589,8 +696,7 @@ ceph_sync_direct_write(struct kiocb *ioc
>              break;
>          }
>
> -        num_pages = calc_pages_for(page_align, len);
> -        pages = ceph_get_direct_page_vector(data, num_pages, false);
> +        pages = dio_alloc_pagev(&i, len, false, &page_align, &num_pages);
>          if (IS_ERR(pages)) {
>              ret = PTR_ERR(pages);
>              goto out;
> @@ -612,7 +718,7 @@ ceph_sync_direct_write(struct kiocb *ioc
>          if (!ret)
>              ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
>
> -        ceph_put_page_vector(pages, num_pages, false);
> +        dio_free_pagev(pages, num_pages, false);
>
>  out:
>          ceph_osdc_put_request(req);
>
>
>
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux