hi,
On Jul 7, 2020, at 9:25 PM, Xiaoguang Wang <xiaoguang.wang@xxxxxxxxxxxxxxxxx> wrote:
hi,
On 7/7/20 8:28 AM, Jens Axboe wrote:
On 7/7/20 7:24 AM, Xiaoguang Wang wrote:
For those applications which are not willing to use io_uring_enter()
to reap and handle cqes, they may completely rely on liburing's
io_uring_peek_cqe(), but if cq ring has overflowed, currently because
io_uring_peek_cqe() is not aware of this overflow, it won't enter
kernel to flush cqes, below test program can reveal this bug:
static void test_cq_overflow(struct io_uring *ring)
{
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
int issued = 0;
int ret = 0;
do {
sqe = io_uring_get_sqe(ring);
if (!sqe) {
fprintf(stderr, "get sqe failed\n");
break;;
}
ret = io_uring_submit(ring);
if (ret <= 0) {
if (ret != -EBUSY)
fprintf(stderr, "sqe submit failed: %d\n", ret);
break;
}
issued++;
} while (ret > 0);
assert(ret == -EBUSY);
printf("issued requests: %d\n", issued);
while (issued) {
ret = io_uring_peek_cqe(ring, &cqe);
if (ret) {
if (ret != -EAGAIN) {
fprintf(stderr, "peek completion failed: %s\n",
strerror(ret));
break;
}
printf("left requets: %d\n", issued);
continue;
}
io_uring_cqe_seen(ring, cqe);
issued--;
printf("left requets: %d\n", issued);
}
}
int main(int argc, char *argv[])
{
int ret;
struct io_uring ring;
ret = io_uring_queue_init(16, &ring, 0);
if (ret) {
fprintf(stderr, "ring setup failed: %d\n", ret);
return 1;
}
test_cq_overflow(&ring);
return 0;
}
To fix this issue, export cq overflow status to userspace, then
helper functions() in liburing, such as io_uring_peek_cqe, can be
aware of this cq overflow and do flush accordingly.
Is there any way we can accomplish the same without exporting
another set of flags? Would it be enough for the SQPOLl thread to set
IORING_SQ_NEED_WAKEUP if we're in overflow condition? That should
result in the app entering the kernel when it's flushed the user CQ
side, and then the sqthread could attempt to flush the pending
events as well.
Something like this, totally untested...
OK, took a closer look at this, it's a generic thing, not just
SQPOLL related. My bad!
Anyway, my suggestion would be to add IORING_SQ_CQ_OVERFLOW to the
existing flags, and then make a liburing change almost identical to
what you had.
Hence kernel side:
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d37d7ea5ebe5..af9fd5cefc51 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1234,11 +1234,12 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
struct io_uring_cqe *cqe;
struct io_kiocb *req;
unsigned long flags;
+ bool ret = true;
LIST_HEAD(list);
if (!force) {
if (list_empty_careful(&ctx->cq_overflow_list))
- return true;
+ goto done;
if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
rings->cq_ring_entries))
return false;
@@ -1284,7 +1285,11 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
io_put_req(req);
}
- return cqe != NULL;
+ ret = cqe != NULL;
+done:
+ if (ret)
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ return ret;
}
static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
@@ -5933,10 +5938,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
int i, submitted = 0;
/* if we have a backlog and couldn't flush it all, return BUSY */
- if (test_bit(0, &ctx->sq_check_overflow)) {
+ if (unlikely(test_bit(0, &ctx->sq_check_overflow))) {
if (!list_empty(&ctx->cq_overflow_list) &&
- !io_cqring_overflow_flush(ctx, false))
+ !io_cqring_overflow_flush(ctx, false)) {
+ ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+ smp_mb();
return -EBUSY;
+ }
}
/* make sure SQ entry isn't read before tail */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c22699a5a7..9c7e028beda5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -197,6 +197,7 @@ struct io_sqring_offsets {
* sq_ring->flags
*/
#define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */
+#define IORING_SQ_CQ_OVERFLOW (1U << 1) /* app needs to enter kernel */
struct io_cqring_offsets {
__u32 head;
I think above codes are still not correct, you only set IORING_SQ_CQ_OVERFLOW in
io_submit_sqes, but if cq ring has been overflowed and applications don't do io
submit anymore, just calling io_uring_peek_cqe continuously, then applications
still won't be aware of the cq ring overflow.
With the associated liburing change in that same email, the peek will enter the kernel just to flush the overflow. So not sure I see your concern?
I modify above test program a bit:
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <assert.h>
#include "liburing.h"
static void test_cq_overflow(struct io_uring *ring)
{
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
int issued = 0;
int ret = 0;
int i;
for (i = 0; i < 33; i++) {
sqe = io_uring_get_sqe(ring);
if (!sqe) {
fprintf(stderr, "get sqe failed\n");
break;;
}
ret = io_uring_submit(ring);
if (ret <= 0) {
if (ret != -EBUSY)
fprintf(stderr, "sqe submit failed: %d\n", ret);
break;
}
issued++;
}
printf("issued requests: %d\n", issued);
while (issued) {
ret = io_uring_peek_cqe(ring, &cqe);
if (ret) {
if (ret != -EAGAIN) {
fprintf(stderr, "peek completion failed: %s\n",
strerror(ret));
break;
}
printf("left requets: %d %d\n", issued, IO_URING_READ_ONCE(*ring->sq.kflags));
continue;
}
io_uring_cqe_seen(ring, cqe);
issued--;
printf("left requets: %d\n", issued);
}
}
int main(int argc, char *argv[])
{
int ret;
struct io_uring ring;
ret = io_uring_queue_init(16, &ring, 0);
if (ret) {
fprintf(stderr, "ring setup failed: %d\n", ret);
return 1;
}
test_cq_overflow(&ring);
return 0;
}
Though with your patches applied, we still can not peek the last cqe.
This test program will only issue 33 sqes, so it won't get EBUSY error.
Regards,
Xiaoguang Wang
We can put the IORING_SQ_CQ_OVERFLOW set in __io_cqring_fill_event() when setting
cq_check_overflow. In non-sqpoll, this will be safe, but in sqpoll mode, there maybe
concurrent modifications to sq_flags, which is a race condition and may need extral
lock to protect IORING_SQ_NEED_WAKEP and IORING_SQ_CQ_OVERFLOW.
That’s why I wanted to keep it in the shared submission path, as that’s properly synchronized.