audit: io_uring openat triggers audit reference count underflow in worker thread

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This discussion is about how to fix an audit reference count decrement
race between two io_uring threads.

Original discussion link:
https : / / github . com / axboe / liburing / issues / 958

Details:

The test program below hangs indefinitely waiting for an openat cqe.  The
reproduction is with a distro kernel Ubuntu-azure-6.2-6.2.0-1012.12_22.04.1.
However, the bug seems possible with an upstream kernel.  An experiment of
changing the reference count in struct filename from int to refcount_t allows
the test program to complete.

The bug did not occur with this test program until a kernel containing
commit 5bd2182d58e9 was used.  I have not found a matching reported issue
or upstream commit yet.

The dmseg log shows an audit related path:

[27883.992550] kernel BUG at fs/namei.c:262!
[27883.994051] invalid opcode: 0000 [#15] SMP PTI
[27883.995719] CPU: 3 PID: 84988 Comm: iou-wrk-84835 Tainted: G      D
               6.2.0-1012-azure #12~22.04.1-Ubuntu
[27883.999064] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine,
               BIOS Hyper-V UEFI Release v4.1 05/09/2022
[27884.002734] RIP: 0010:putname+0x68/0x70
...
[27884.032893] Call Trace:
[27884.034032]  <TASK>
[27884.035117]  ? show_regs+0x6a/0x80
[27884.036763]  ? die+0x38/0xa0
[27884.038023]  ? do_trap+0xd0/0xf0
[27884.039359]  ? do_error_trap+0x70/0x90
[27884.040861]  ? putname+0x68/0x70
[27884.042201]  ? exc_invalid_op+0x53/0x70
[27884.043698]  ? putname+0x68/0x70
[27884.045076]  ? asm_exc_invalid_op+0x1b/0x20
[27884.047051]  ? putname+0x68/0x70
[27884.048415]  audit_reset_context.part.0.constprop.0+0xe1/0x300
[27884.050511]  __audit_uring_exit+0xda/0x1c0
[27884.052100]  io_issue_sqe+0x1f3/0x450
[27884.053702]  ? lock_timer_base+0x3b/0xd0
[27884.055283]  io_wq_submit_work+0x8d/0x2b0
[27884.056848]  ? __try_to_del_timer_sync+0x67/0xa0
[27884.058577]  io_worker_handle_work+0x17c/0x2b0
[27884.060267]  io_wqe_worker+0x10a/0x350
[27884.061714]  ? raw_spin_rq_unlock+0x10/0x30
[27884.063295]  ? finish_task_switch.isra.0+0x8b/0x2c0
[27884.065537]  ? __pfx_io_wqe_worker+0x10/0x10
[27884.067215]  ret_from_fork+0x2c/0x50
[27884.068733] RIP: 0033:0x0
...

Test program usage:

./io_uring_open_close_audit_hang --directory /tmp/deleteme --count 10000

Test program source:

// Note: The test program is C++ but could be converted to C.
#include <cassert>
#include <fcntl.h>
#include <filesystem>
#include <getopt.h>
#include <iostream>
#include <liburing.h>

// open and close a file.  the file is created if it does not exist.

void
openClose(struct io_uring& ring, std::string fileName)
{
    int ret;
    struct io_uring_cqe* cqe {};
    struct io_uring_sqe* sqe {};
    int fd {};
    int flags {O_RDWR | O_CREAT};
    mode_t mode {0666};

    // openat2

    sqe = io_uring_get_sqe(&ring);
    assert(sqe != nullptr);

    io_uring_prep_openat(sqe, AT_FDCWD, fileName.data(), flags, mode);
    io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);

    ret = io_uring_submit(&ring);
    assert(ret == 1);

    ret = io_uring_wait_cqe(&ring, &cqe);
    assert(ret == 0);

    fd = cqe->res;
    assert(fd > 0);

    io_uring_cqe_seen(&ring, cqe);

    // close

    sqe = io_uring_get_sqe(&ring);
    assert(sqe != nullptr);

    io_uring_prep_close(sqe, fd);
    io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);

    ret = io_uring_submit(&ring);
    assert(ret == 1);

    // wait for the close to complete.
    ret = io_uring_wait_cqe(&ring, &cqe);
    assert(ret == 0);

    // verify that close succeeded.
    assert(cqe->res == 0);

    io_uring_cqe_seen(&ring, cqe);
}

// create 100 files and then open each file twice.

void
openCloseHang(std::string filePath)
{
    int ret;
    struct io_uring ring;

    ret = io_uring_queue_init(8, &ring, 0);
    assert(0 == ret);

    int repeat {3};
    int numFiles {100};

    std::filesystem::create_directory(filePath);

    // files of length 0 are created in the j==0 iteration below.
    // those files are opened and closed during the j>0 iteraions.
    // a repeat of 3 results in a fairly reliable reproduction.

    for (int j = 0; j < repeat; j += 1) {
        for (int i = 0; i < numFiles; i += 1) {
            std::string fileName(filePath + "/file" + std::to_string(i));
            openClose(ring, fileName);
        }
    }

    std::filesystem::remove_all(filePath);

    io_uring_queue_exit(&ring);
}

int
main(int argc, char** argv)
{
    std::string filePath {};
    int iterations {};

    struct option options[]
    {
        {"help", no_argument, 0, 'h'}, {"directory", required_argument, 0, 'd'},
            {"count", required_argument, 0, 'c'},
        {
            0, 0, 0, 0
        }
    };
    bool printUsage {false};
    int val {};

    while ((val = getopt_long_only(argc, argv, "", options, nullptr)) != -1) {
        if (val == 'h') {
            printUsage = true;
        } else if (val == 'd') {
            filePath = optarg;
            if (std::filesystem::exists(filePath)) {
                printUsage = true;
                std::cerr << "directory must not exist" << std::endl;
            }
        } else if (val == 'c') {
            iterations = atoi(optarg);
            if (0 == iterations) {
                printUsage = true;
            }
        } else {
            printUsage = true;
        }
    }

    if ((0 == iterations) || (filePath.empty())) {
        printUsage = true;
    }

    if (printUsage || (optind < argc)) {
        std::cerr << "io_uring_open_close_audit_hang.cc --directory DIR --count COUNT" << std::endl;
        exit(1);
    }

    for (int i = 0; i < iterations; i += 1) {
        if (0 == (i % 100)) {
            std::cout << "i=" << std::to_string(i) << std::endl;
        }
        openCloseHang(filePath);
    }
    return 0;
}

Changing the reference count from int to refcount_t allows the test program
to complete using the v6.2 distro kernel.  The patch applies and builds on the
upstream v6.1.55 kernel.

Signed-off-by: Dan Clash <dan.clash@xxxxxxxxxxxxx>
---
diff --git a/fs/namei.c b/fs/namei.c
index 2a8baa6ce3e8..4f7ac131c9d1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -187,7 +187,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
 		}
 	}

-	result->refcnt = 1;
+	refcount_set(&result->refcnt, 1);
 	/* The empty path is special. */
 	if (unlikely(!len)) {
 		if (empty)
@@ -248,7 +248,7 @@ getname_kernel(const char * filename)
 	memcpy((char *)result->name, filename, len);
 	result->uptr = NULL;
 	result->aname = NULL;
-	result->refcnt = 1;
+	refcount_set(&result->refcnt, 1);
 	audit_getname(result);

 	return result;
@@ -259,9 +259,10 @@ void putname(struct filename *name)
 	if (IS_ERR(name))
 		return;

-	BUG_ON(name->refcnt <= 0);
+	BUG_ON(refcount_read(&name->refcnt) == 0);
+	BUG_ON(refcount_read(&name->refcnt) == REFCOUNT_SATURATED);

-	if (--name->refcnt > 0)
+	if (!refcount_dec_and_test(&name->refcnt))
 		return;

 	if (name->name != name->iname) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d0a54e9aac7a..8217e07726d4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2719,7 +2719,7 @@ struct audit_names;
 struct filename {
 	const char		*name;	/* pointer to actual string */
 	const __user char	*uptr;	/* original userland pointer */
-	int			refcnt;
+	refcount_t		refcnt;
 	struct audit_names	*aname;
 	const char		iname[];
 };
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 37cded22497e..232e0be9f6d9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2188,7 +2188,7 @@ __audit_reusename(const __user char *uptr)
 		if (!n->name)
 			continue;
 		if (n->name->uptr == uptr) {
-			n->name->refcnt++;
+			refcount_inc(&n->name->refcnt);
 			return n->name;
 		}
 	}
@@ -2217,7 +2217,7 @@ void __audit_getname(struct filename *name)
 	n->name = name;
 	n->name_len = AUDIT_NAME_FULL;
 	name->aname = n;
-	name->refcnt++;
+	refcount_inc(&name->refcnt);
 }

 static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2349,7 +2349,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 		return;
 	if (name) {
 		n->name = name;
-		name->refcnt++;
+		refcount_inc(&name->refcnt);
 	}

 out:
@@ -2474,7 +2474,7 @@ void __audit_inode_child(struct inode *parent,
 		if (found_parent) {
 			found_child->name = found_parent->name;
 			found_child->name_len = AUDIT_NAME_FULL;
-			found_child->name->refcnt++;
+			refcount_inc(&found_child->name->refcnt);
 		}
 	}





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux