[PATCH] aio: fix the increment of aio-nr and counting against aio-max-nr

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Currently, aio-nr is incremented in steps of 'num_possible_cpus() * 8'
for io_setup(nr_events, ..) with 'nr_events < num_possible_cpus() * 4':

    ioctx_alloc()
    ...
        nr_events = max(nr_events, num_possible_cpus() * 4);
        nr_events *= 2;
    ...
        ctx->max_reqs = nr_events;
    ...
        aio_nr += ctx->max_reqs;
    ....

This limits the number of aio contexts actually available to much less
than aio-max-nr, and it gets increasingly worse with a greater number of CPUs.

For example, with 64 CPUs, only 256 aio contexts are actually available
(with aio-max-nr = 65536) because the increment is 512 in that scenario.

Note: 65536 [aio-max-nr] / (2*4*64) [increment per aio context]
is 128, but the effective limit is 256 (double) because the count is
checked against 'aio-max-nr * 2':

    ioctx_alloc()
    ...
        if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
        ...
            goto err_ctx;
    ...

This patch uses the original value of nr_events (from userspace) to
increment aio-nr and count against aio-max-nr, which resolves both issues
(the oversized increment and the doubled limit).

The test-case and test-suite validation steps are included later in
this patch message, for documentation purposes.

Example on a system with 64 CPUs:

    # cat /sys/devices/system/cpu/possible
    0-63

    # grep . /proc/sys/fs/aio-*
    /proc/sys/fs/aio-max-nr:65536
    /proc/sys/fs/aio-nr:0

    test 1)  number of aio contexts available with nr_events == 1
    -------------------------------------------------------------

    This test calls io_setup(1, ..) up to 65536 times, exiting on error.

    - original kernel:

    Only 256 aio contexts could be created successfully,
    quickly falling into the aio-max-nr exceeded error path (-EAGAIN).

    # ./io_setup 1 65536 | grep -m1 . - /proc/sys/fs/aio-nr
    (standard input):io_setup(1, ): 256 calls with rc 0, last call with rc -11.
    /proc/sys/fs/aio-nr:131072

    One might notice the aio-nr value is twice the aio-max-nr limit,
    an effect of how the current code handles that 'nr_events *= 2'.

    - patched kernel:

    Almost all of the limit of aio contexts could be allocated,
    eventually falling into the insufficient resources error path (-ENOMEM):

    # ./io_setup 1 65536 | grep -m1 . - /proc/sys/fs/aio-nr
    (standard input):io_setup(1, ): 65516 calls with rc 0, last call with rc -12.
    /proc/sys/fs/aio-nr:65516

    Notice the aio-nr value is now _under_ the aio-max-nr limit.

    test 2)  increment value for nr_events == 1
    -------------------------------------------

    This test calls io_setup(1, ..) only 1 time, to show the increment:

    - original kernel:

    # ./io_setup 1 1 | grep -m1 . - /proc/sys/fs/aio-nr
    (standard input):io_setup(1, ): 1 calls with rc 0, last call with rc 0.
    /proc/sys/fs/aio-nr:512

    Notice the increment is 'num_possible_cpus() * 8'.

    - patched kernel:

    # ./io_setup 1 1 | grep -m1 . - /proc/sys/fs/aio-nr
    (standard input):io_setup(1, ): 1 calls with rc 0, last call with rc 0.
    /proc/sys/fs/aio-nr:1

    Notice the increment is exactly 1 (matches nr_events from userspace).

    test 3)  more aio contexts available with great-enough nr_events
    ----------------------------------------------------------------

    The full aio-max-nr limit (65536) is available for greater nr_events.
    This test calls io_setup(1024, ) exactly 64 times, without error.

    - original kernel:

    # ./io_setup 1024 64 | grep -m1 . - /proc/sys/fs/aio-nr
    (standard input):io_setup(1024, ): 64 calls with rc 0, last call with rc 0.
    /proc/sys/fs/aio-nr:131072

    Notice the aio-nr value is twice the aio-max-nr limit.

    - patched kernel:

    # ./io_setup 1024 64 | grep -m1 . - /proc/sys/fs/aio-nr
    (standard input):io_setup(1024, ): 64 calls with rc 0, last call with rc 0.
    /proc/sys/fs/aio-nr:65536

    Notice the aio-nr value is now _exactly_ the aio-max-nr limit.

Test-case: io_setup.c # gcc -o io_setup io_setup.c -laio

    """
    #include <libaio.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /*
     * Call io_setup(nr_events, ..) up to nr_calls times, stopping at the
     * first call that returns non-zero, then print how many calls succeeded
     * and the return code of the last call.
     *
     * usage: io_setup <nr_events for io_setup()> <max calls to io_setup()>
     *
     * Exit status: -1 on wrong argument count, -2 if the array of contexts
     * cannot be allocated, 0 otherwise (even if io_setup() failed; the
     * failure is reported in the printed message).
     */
    int main(int argc, char *argv[]) {

            int nr_events, nr_calls, i;
            /*
             * Start at 0 so the value printed below is well-defined even
             * when the loop body never runs (e.g. nr_calls == 0).
             */
            int rc = 0;
            io_context_t *ioctx;

            if (argc != 3)
                    return -1;

            nr_events = atoi(argv[1]);
            nr_calls = atoi(argv[2]);

            /* calloc() zeroes the array that io_setup() is handed below. */
            ioctx = calloc(nr_calls, sizeof(*ioctx));
            if (!ioctx)
                    return -2;

            /* On exit from the loop, i is the number of successful calls. */
            for (i = 0; i < nr_calls; i++) {
                    rc = io_setup(nr_events, &ioctx[i]);
                    if (rc != 0)
                            break;
            }

            printf("io_setup(%d, ): %d calls with rc 0, last call with rc %d.\n",
                   nr_events, i, rc);
            fflush(stdout);

            /*
             * NOTE(review): the contexts are intentionally not destroyed and
             * the process lingers for a second -- presumably so that
             * /proc/sys/fs/aio-nr can be read while they are still alive.
             */
            sleep(1);
            return 0;
    }
    """

Test-suite: libaio

    # curl https://kojipkgs.fedoraproject.org//packages/libaio/0.3.110/7.fc26/src/libaio-0.3.110-7.fc26.src.rpm \
        | rpm2cpio | cpio -mid

    # tar xf libaio-0.3.110.tar.gz
    # cd libaio-0.3.110

    # make
    # make check 2>&1 | grep '^test cases'
    test cases/2.t completed PASSED.
    test cases/3.t completed PASSED.
    test cases/4.t completed PASSED.
    test cases/5.t completed PASSED.
    test cases/6.t completed PASSED.
    test cases/7.t completed PASSED.
    test cases/11.t completed PASSED.
    test cases/12.t completed PASSED.
    test cases/13.t completed PASSED.
    test cases/14.t completed PASSED.
    test cases/15.t completed PASSED.
    test cases/16.t completed PASSED.
    test cases/10.t completed PASSED.
    test cases/8.t completed PASSED.

Signed-off-by: Mauricio Faria de Oliveira <mauricfo@xxxxxxxxxxxxxxxxxx>
Reported-by: Lekshmi C. Pillai <lekshmi.cpillai@xxxxxxxxxx>
Tested-by: Lekshmi C. Pillai <lekshmi.cpillai@xxxxxxxxxx>
Tested-by: Paul Nguyen <nguyenp@xxxxxxxxxx>
---
 fs/aio.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index f52d925ee259..3908480d7ccd 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -441,10 +441,9 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 #endif
 };
 
-static int aio_setup_ring(struct kioctx *ctx)
+static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 {
 	struct aio_ring *ring;
-	unsigned nr_events = ctx->max_reqs;
 	struct mm_struct *mm = current->mm;
 	unsigned long size, unused;
 	int nr_pages;
@@ -707,6 +706,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	int err = -ENOMEM;
 
 	/*
+	 * Store the original nr_events -- what userspace passed to io_setup(),
+	 * for counting against the global limit -- before it changes.
+	 */
+	unsigned int max_reqs = nr_events;
+
+	/*
 	 * We keep track of the number of available ringbuffer slots, to prevent
 	 * overflow (reqs_available), and we also use percpu counters for this.
 	 *
@@ -724,14 +729,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
+	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
 	if (!ctx)
 		return ERR_PTR(-ENOMEM);
 
-	ctx->max_reqs = nr_events;
+	ctx->max_reqs = max_reqs;
 
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
@@ -753,7 +758,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (!ctx->cpu)
 		goto err;
 
-	err = aio_setup_ring(ctx);
+	err = aio_setup_ring(ctx, nr_events);
 	if (err < 0)
 		goto err;
 
@@ -764,8 +769,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	/* limit the number of system wide aios */
 	spin_lock(&aio_nr_lock);
-	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
-	    aio_nr + nr_events < aio_nr) {
+	if (aio_nr + ctx->max_reqs > aio_max_nr ||
+	    aio_nr + ctx->max_reqs < aio_nr) {
 		spin_unlock(&aio_nr_lock);
 		err = -EAGAIN;
 		goto err_ctx;
-- 
1.8.3.1




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux