On 1/6/2024 17:05, Jens Axboe wrote:
On 6/1/24 8:19 AM, Jens Axboe wrote:
On 6/1/24 3:43 AM, Stefan wrote:
io_uring uses the __u32 len field in order to pass the length to
madvise and fadvise, but these calls use an off_t, which is 64bit on
64bit platforms.
When using liburing, the length is silently truncated to 32bits (so
8GB length would become zero, which has a different meaning of "until
the end of the file" for fadvise).
If my understanding is correct, we could fix this by introducing new
operations MADVISE64 and FADVISE64, which use the addr3 field instead
of the length field for length.
We probably just want to introduce a flag and ensure that older stable
kernels check it, and then use a 64-bit field for it when the flag is
set.
I think this should do it on the kernel side, as we already check these
fields and return -EINVAL as needed. Should also be trivial to backport.
Totally untested... Might want a FEAT flag for this, or something where
it's detectable, to make the liburing change straight forward.
diff --git a/io_uring/advise.c b/io_uring/advise.c
index 7085804c513c..cb7b881665e5 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -17,14 +17,14 @@
struct io_fadvise {
struct file *file;
u64 offset;
- u32 len;
+ u64 len;
u32 advice;
};
struct io_madvise {
struct file *file;
u64 addr;
- u32 len;
+ u64 len;
u32 advice;
};
@@ -33,11 +33,13 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
- if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
ma->addr = READ_ONCE(sqe->addr);
- ma->len = READ_ONCE(sqe->len);
+ ma->len = READ_ONCE(sqe->off);
+ if (!ma->len)
+ ma->len = READ_ONCE(sqe->len);
ma->advice = READ_ONCE(sqe->fadvise_advice);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
@@ -78,11 +80,13 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
- if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
fa->offset = READ_ONCE(sqe->off);
- fa->len = READ_ONCE(sqe->len);
+ fa->len = READ_ONCE(sqe->addr);
+ if (!fa->len)
+ fa->len = READ_ONCE(sqe->len);
fa->advice = READ_ONCE(sqe->fadvise_advice);
if (io_fadvise_force_async(fa))
req->flags |= REQ_F_FORCE_ASYNC;
If we want to have the length in the same field in both *ADVISE
operations, we can put a flag in splice_fd_in/optlen.
Maybe the explicit flag is a bit clearer for users of the API
compared to the implicit flag when setting sqe->len to zero?
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 994bf7af0efe..70794ac1471e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -415,6 +415,14 @@ enum io_uring_msg_ring_flags {
*/
#define IORING_NOP_INJECT_RESULT (1U << 0)
+/*
+ * IORING_OP_FADVISE and IORING_OP_MADVISE flags (stored in sqe->optlen)
+ *
+ * IORING_ADVISE_LEN64 Use 64-bit length stored in sqe->addr3
+ *
+ */
+#define IORING_ADVISE_LEN64 (1U << 0)
+
/*
* IO completion data structure (Completion Queue Entry)
*/
diff --git a/io_uring/advise.c b/io_uring/advise.c
index 7085804c513c..f229c751adbc 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -17,14 +17,14 @@
struct io_fadvise {
struct file *file;
u64 offset;
- u32 len;
+ u64 len;
u32 advice;
};
struct io_madvise {
struct file *file;
u64 addr;
- u32 len;
+ u64 len;
u32 advice;
};
@@ -32,12 +32,26 @@ int io_madvise_prep(struct io_kiocb *req, const
struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
+ u32 flags;
- if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->off)
return -EINVAL;
+ flags = READ_ONCE(sqe->optlen);
+
+ if (flags & ~IORING_ADVISE_LEN64)
+ return -EINVAL;
+
+ if (flags & IORING_ADVISE_LEN64) {
+ if (sqe->len)
+ return -EINVAL;
+
+ ma->len = READ_ONCE(sqe->addr3);
+ } else {
+ ma->len = READ_ONCE(sqe->len);
+ }
+
ma->addr = READ_ONCE(sqe->addr);
- ma->len = READ_ONCE(sqe->len);
ma->advice = READ_ONCE(sqe->fadvise_advice);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
@@ -77,12 +91,26 @@ static bool io_fadvise_force_async(struct io_fadvise
*fa)
int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
+ u32 flags;
- if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->addr)
return -EINVAL;
+ flags = READ_ONCE(sqe->optlen);
+
+ if (flags & ~IORING_ADVISE_LEN64)
+ return -EINVAL;
+
+ if (flags & IORING_ADVISE_LEN64) {
+ if (sqe->len)
+ return -EINVAL;
+
+ fa->len = READ_ONCE(sqe->addr3);
+ } else {
+ fa->len = READ_ONCE(sqe->len);
+ }
+
fa->offset = READ_ONCE(sqe->off);
- fa->len = READ_ONCE(sqe->len);
fa->advice = READ_ONCE(sqe->fadvise_advice);
if (io_fadvise_force_async(fa))
req->flags |= REQ_F_FORCE_ASYNC;