This discussion is about how to fix an audit reference count decrement race between two io_uring threads. Original discussion link: https : / / github . com / axboe / liburing / issues / 958 Details: The test program below hangs indefinitely waiting for an openat cqe. The reproduction is with a distro kernel Ubuntu-azure-6.2-6.2.0-1012.12_22.04.1. However, the bug seems possible with an upstream kernel. An experiment of changing the reference count in struct filename from int to refcount_t allows the test program to complete. The bug did not occur with this test program until a kernel containing commit 5bd2182d58e9 was used. I have not found a matching reported issue or upstream commit yet. The dmseg log shows an audit related path: [27883.992550] kernel BUG at fs/namei.c:262! [27883.994051] invalid opcode: 0000 [#15] SMP PTI [27883.995719] CPU: 3 PID: 84988 Comm: iou-wrk-84835 Tainted: G D 6.2.0-1012-azure #12~22.04.1-Ubuntu [27883.999064] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 05/09/2022 [27884.002734] RIP: 0010:putname+0x68/0x70 ... [27884.032893] Call Trace: [27884.034032] <TASK> [27884.035117] ? show_regs+0x6a/0x80 [27884.036763] ? die+0x38/0xa0 [27884.038023] ? do_trap+0xd0/0xf0 [27884.039359] ? do_error_trap+0x70/0x90 [27884.040861] ? putname+0x68/0x70 [27884.042201] ? exc_invalid_op+0x53/0x70 [27884.043698] ? putname+0x68/0x70 [27884.045076] ? asm_exc_invalid_op+0x1b/0x20 [27884.047051] ? putname+0x68/0x70 [27884.048415] audit_reset_context.part.0.constprop.0+0xe1/0x300 [27884.050511] __audit_uring_exit+0xda/0x1c0 [27884.052100] io_issue_sqe+0x1f3/0x450 [27884.053702] ? lock_timer_base+0x3b/0xd0 [27884.055283] io_wq_submit_work+0x8d/0x2b0 [27884.056848] ? __try_to_del_timer_sync+0x67/0xa0 [27884.058577] io_worker_handle_work+0x17c/0x2b0 [27884.060267] io_wqe_worker+0x10a/0x350 [27884.061714] ? raw_spin_rq_unlock+0x10/0x30 [27884.063295] ? finish_task_switch.isra.0+0x8b/0x2c0 [27884.065537] ? __pfx_io_wqe_worker+0x10/0x10 [27884.067215] ret_from_fork+0x2c/0x50 [27884.068733] RIP: 0033:0x0 ... Test program usage: ./io_uring_open_close_audit_hang --directory /tmp/deleteme --count 10000 Test program source: // Note: The test program is C++ but could be converted to C. #include <cassert> #include <fcntl.h> #include <filesystem> #include <getopt.h> #include <iostream> #include <liburing.h> // open and close a file. the file is created if it does not exist. void openClose(struct io_uring& ring, std::string fileName) { int ret; struct io_uring_cqe* cqe {}; struct io_uring_sqe* sqe {}; int fd {}; int flags {O_RDWR | O_CREAT}; mode_t mode {0666}; // openat2 sqe = io_uring_get_sqe(&ring); assert(sqe != nullptr); io_uring_prep_openat(sqe, AT_FDCWD, fileName.data(), flags, mode); io_uring_sqe_set_flags(sqe, IOSQE_ASYNC); ret = io_uring_submit(&ring); assert(ret == 1); ret = io_uring_wait_cqe(&ring, &cqe); assert(ret == 0); fd = cqe->res; assert(fd > 0); io_uring_cqe_seen(&ring, cqe); // close sqe = io_uring_get_sqe(&ring); assert(sqe != nullptr); io_uring_prep_close(sqe, fd); io_uring_sqe_set_flags(sqe, IOSQE_ASYNC); ret = io_uring_submit(&ring); assert(ret == 1); // wait for the close to complete. ret = io_uring_wait_cqe(&ring, &cqe); assert(ret == 0); // verify that close succeeded. assert(cqe->res == 0); io_uring_cqe_seen(&ring, cqe); } // create 100 files and then open each file twice. void openCloseHang(std::string filePath) { int ret; struct io_uring ring; ret = io_uring_queue_init(8, &ring, 0); assert(0 == ret); int repeat {3}; int numFiles {100}; std::filesystem::create_directory(filePath); // files of length 0 are created in the j==0 iteration below. // those files are opened and closed during the j>0 iteraions. // a repeat of 3 results in a fairly reliable reproduction. for (int j = 0; j < repeat; j += 1) { for (int i = 0; i < numFiles; i += 1) { std::string fileName(filePath + "/file" + std::to_string(i)); openClose(ring, fileName); } } std::filesystem::remove_all(filePath); io_uring_queue_exit(&ring); } int main(int argc, char** argv) { std::string filePath {}; int iterations {}; struct option options[] { {"help", no_argument, 0, 'h'}, {"directory", required_argument, 0, 'd'}, {"count", required_argument, 0, 'c'}, { 0, 0, 0, 0 } }; bool printUsage {false}; int val {}; while ((val = getopt_long_only(argc, argv, "", options, nullptr)) != -1) { if (val == 'h') { printUsage = true; } else if (val == 'd') { filePath = optarg; if (std::filesystem::exists(filePath)) { printUsage = true; std::cerr << "directory must not exist" << std::endl; } } else if (val == 'c') { iterations = atoi(optarg); if (0 == iterations) { printUsage = true; } } else { printUsage = true; } } if ((0 == iterations) || (filePath.empty())) { printUsage = true; } if (printUsage || (optind < argc)) { std::cerr << "io_uring_open_close_audit_hang.cc --directory DIR --count COUNT" << std::endl; exit(1); } for (int i = 0; i < iterations; i += 1) { if (0 == (i % 100)) { std::cout << "i=" << std::to_string(i) << std::endl; } openCloseHang(filePath); } return 0; } Changing the reference count from int to refcount_t allows the test program to complete using the v6.2 distro kernel. The patch applies and builds on the upstream v6.1.55 kernel. Signed-off-by: Dan Clash <dan.clash@xxxxxxxxxxxxx> --- diff --git a/fs/namei.c b/fs/namei.c index 2a8baa6ce3e8..4f7ac131c9d1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -187,7 +187,7 @@ getname_flags(const char __user *filename, int flags, int *empty) } } - result->refcnt = 1; + refcount_set(&result->refcnt, 1); /* The empty path is special. */ if (unlikely(!len)) { if (empty) @@ -248,7 +248,7 @@ getname_kernel(const char * filename) memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; - result->refcnt = 1; + refcount_set(&result->refcnt, 1); audit_getname(result); return result; @@ -259,9 +259,10 @@ void putname(struct filename *name) if (IS_ERR(name)) return; - BUG_ON(name->refcnt <= 0); + BUG_ON(refcount_read(&name->refcnt) == 0); + BUG_ON(refcount_read(&name->refcnt) == REFCOUNT_SATURATED); - if (--name->refcnt > 0) + if (!refcount_dec_and_test(&name->refcnt)) return; if (name->name != name->iname) { diff --git a/include/linux/fs.h b/include/linux/fs.h index d0a54e9aac7a..8217e07726d4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2719,7 +2719,7 @@ struct audit_names; struct filename { const char *name; /* pointer to actual string */ const __user char *uptr; /* original userland pointer */ - int refcnt; + refcount_t refcnt; struct audit_names *aname; const char iname[]; }; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 37cded22497e..232e0be9f6d9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2188,7 +2188,7 @@ __audit_reusename(const __user char *uptr) if (!n->name) continue; if (n->name->uptr == uptr) { - n->name->refcnt++; + refcount_inc(&n->name->refcnt); return n->name; } } @@ -2217,7 +2217,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - name->refcnt++; + refcount_inc(&name->refcnt); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2349,7 +2349,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, return; if (name) { n->name = name; - name->refcnt++; + refcount_inc(&name->refcnt); } out: @@ -2474,7 +2474,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - found_child->name->refcnt++; + refcount_inc(&found_child->name->refcnt); } }