Re: [PATCH 0/5] add initial io_uring_cmd support for sockets

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 5/2/23 10:21, Adrien Delorme wrote:
 From Adrien Delorme

From: David Ahern
Sent: 12 April 2023 7:39
Sent: 11 April 2023 16:28
....
Christoph's patch set a few years back that removed set_fs broke the
ability to do in-kernel ioctl and {s,g}setsockopt calls. I did not
follow that change; was it a deliberate intent to not allow these
in-kernel calls vs wanting to remove the set_fs? e.g., can we add a
kioctl variant for in-kernel use of the APIs?

I think that was a side effect, and with no in-tree in-kernel
users (apart from limited calls in bpf) it was deemed acceptable.
(It is a PITA for any code trying to use SCTP in kernel.)

One problem is that not all sockopt calls pass the correct length.
And some of them can have very long buffers.
Not to mention the ones that are read-modify-write.

A plausible solution is to pass a 'fat pointer' that contains
some, or all, of:
       - A userspace buffer pointer.
       - A kernel buffer pointer.
       - The length supplied by the user.
       - The length of the kernel buffer.
       = The number of bytes to copy on completion.
For simple user requests the syscall entry/exit code
would copy the data to a short on-stack buffer.
Kernel users just pass the kernel address.
Odd requests can just use the user pointer.

Probably needs accessors that add in an offset.

It might also be that some of the problematic sockopt
were in decnet - now removed.

Hello everyone,

I'm currently working on an implementation of {get,set} sockopt.
Since this thread is already talking about it, I hope that I replying at the correct place.

Hi Adrien, I believe Breno is working on set/getsockopt as well
and had similar patches for awhile, but that would need for some
problems to be solved first, e.g. try and decide whether it copies
to a ptr as the syscall versions or would get/return optval
directly in sqe/cqe. And also where to store bits that you pass
in struct args_setsockopt_uring, and whether to rely on SQE128
or not.


My implementation is rather simple using a struct that will be used to pass the necessary info throught sqe->cmd.

Here is my implementation based of kernel version 6.3 :

Signed-off-by: Adrien Delorme <delorme.ade@xxxxxxxxxxx>

diff -uprN a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
--- a/include/uapi/linux/io_uring.h     2023-04-23 15:02:52.000000000 -0400
+++ b/include/uapi/linux/io_uring.h     2023-04-24 07:55:21.406981696 -0400
@@ -235,6 +235,25 @@ enum io_uring_op {
   */
#define IORING_URING_CMD_FIXED (1U << 0)

+/* struct io_uring_cmd->cmd_op flags for socket operations */
+#define IO_URING_CMD_OP_GETSOCKOPT 0x0
+#define IO_URING_CMD_OP_SETSOCKOPT 0x1
+
+/* Struct to pass args for IO_URING_CMD_OP_GETSOCKOPT and IO_URING_CMD_OP_SETSOCKOPT operations */
+struct args_setsockopt_uring{

The name of the structure is quite inconsistent with the
rest. It's better to be io_[uring_]_sockopt_arg or some
variation.

+       int                             level;
+       int                     optname;
+       char __user *   user_optval;
+       int                     optlen;

That's uapi, there should not be __user, and field sizes
should be more portable, i.e. use __u32, __u64, etc, look
through the file.

Would need to look into the get/setsockopt implementation
before saying anything about uring_cmd_{set,get}sockopt.


+};
+
+struct args_getsockopt_uring{
+       int                             level;
+       int                     optname;
+       char __user *   user_optval;
+       int      __user *       optlen;
+};
+

/*
   * sqe->fsync_flags
diff -uprN a/net/socket.c b/net/socket.c
--- a/net/socket.c      2023-04-23 15:02:52.000000000 -0400
+++ b/net/socket.c      2023-04-24 08:06:44.800981696 -0400
@@ -108,6 +108,11 @@
#include <linux/ptp_clock_kernel.h>
#include <trace/events/sock.h>

+#ifdef CONFIG_IO_URING
+#include <uapi/linux/io_uring.h>
+#include <linux/io_uring.h>
+#endif
+
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int sysctl_net_busy_read __read_mostly;
unsigned int sysctl_net_busy_poll __read_mostly;
@@ -132,6 +137,11 @@ static ssize_t sock_splice_read(struct f
                                 struct pipe_inode_info *pipe, size_t len,
                                 unsigned int flags);

+
+#ifdef CONFIG_IO_URING
+int socket_uring_cmd_handler(struct io_uring_cmd *cmd, unsigned int flags);
+#endif
+
#ifdef CONFIG_PROC_FS
static void sock_show_fdinfo(struct seq_file *m, struct file *f)
{
@@ -166,6 +176,9 @@ static const struct file_operations sock
         .splice_write = generic_splice_sendpage,
         .splice_read =  sock_splice_read,
         .show_fdinfo =  sock_show_fdinfo,
+#ifdef CONFIG_IO_URING
+       .uring_cmd = socket_uring_cmd_handler,
+#endif
};

static const char * const pf_family_names[] = {
@@ -2330,6 +2343,126 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int
         return __sys_getsockopt(fd, level, optname, optval, optlen);
}

+#ifdef CONFIG_IO_URING
+
+/*
+ * Make getsockopt operation with io_uring.
+ * This fonction is based of the __sys_getsockopt without sockfd_lookup_light
+ * since io_uring retrieves it for us.
+ */
+int uring_cmd_getsockopt(struct socket *sock, int level, int optname, char __user *optval,
+               int __user *optlen)
+{
+       int err;
+       int max_optlen;
+
+       err = security_socket_getsockopt(sock, level, optname);
+       if (err)
+               goto out_put;
+
+       if (!in_compat_syscall())
+               max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+
+       if (level == SOL_SOCKET)
+               err = sock_getsockopt(sock, level, optname, optval, optlen);
+       else if (unlikely(!sock->ops->getsockopt))
+               err = -EOPNOTSUPP;
+       else
+               err = sock->ops->getsockopt(sock, level, optname, optval,
+                                           optlen);
+
+       if (!in_compat_syscall())
+               err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
+                                                    optval, optlen, max_optlen,
+                                                    err);
+out_put:
+       return err;
+}
+
+/*
+ * Make setsockopt operation with io_uring.
+ * This fonction is based of the __sys_setsockopt without sockfd_lookup_light
+ * since io_uring retrieves it for us.
+ */
+int uring_cmd_setsockopt(struct socket *sock, int level, int optname, char *user_optval,
+               int optlen)
+{
+       sockptr_t optval = USER_SOCKPTR(user_optval);
+       char *kernel_optval = NULL;
+       int err;
+
+       if (optlen < 0)
+               return -EINVAL;
+
+       err = security_socket_setsockopt(sock, level, optname);
+       if (err)
+               goto out_put;
+
+       if (!in_compat_syscall())
+               err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname,
+                                                    user_optval, &optlen,
+                                                    &kernel_optval);
+       if (err < 0)
+               goto out_put;
+       if (err > 0) {
+               err = 0;
+               goto out_put;
+       }
+
+       if (kernel_optval)
+               optval = KERNEL_SOCKPTR(kernel_optval);
+       if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock))
+               err = sock_setsockopt(sock, level, optname, optval, optlen);
+       else if (unlikely(!sock->ops->setsockopt))
+               err = -EOPNOTSUPP;
+       else
+               err = sock->ops->setsockopt(sock, level, optname, optval,
+                                           optlen);
+       kfree(kernel_optval);
+out_put:
+       return err;
+}
+
+/*
+ * Handler uring_cmd socket file_operations.
+ *
+ * Operation code and struct are defined in /include/uapi/linux/io_uring.h
+ * The io_uring ring needs to be set with the flags : IORING_SETUP_SQE128 and IORING_SETUP_CQE32
+ *
+ */
+int socket_uring_cmd_handler(struct io_uring_cmd *cmd, unsigned int flags){
+
+       /* Retrieve socket */
+       struct socket *sock = sock_from_file(cmd->file);
+
+       if (!sock)
+               return -EINVAL;
+
+       /* Does the requested operation */
+       switch (cmd->cmd_op) {
+               case IO_URING_CMD_OP_GETSOCKOPT:
+                       struct args_getsockopt_uring *values_get = (struct args_getsockopt_uring *) cmd->cmd;
+                       return uring_cmd_getsockopt(sock,
+                                                                               values_get->level,
+                                                                               values_get->optname,
+                                                                               values_get->user_optval,
+                                                                               values_get->optlen);
+
+               case IO_URING_CMD_OP_SETSOCKOPT:
+                       struct args_setsockopt_uring *values_set = (struct args_setsockopt_uring *) cmd->cmd;
+                       return uring_cmd_setsockopt(sock,
+                                                                               values_set->level,
+                                                                               values_set->optname,
+                                                                               values_set->user_optval,
+                                                                               values_set->optlen);
+               default:
+                       break;
+
+       }
+       return -EINVAL;
+}
+#endif
+
/*
   *     Shutdown a socket.
   */

I would appreciate any feedback or advice you may have on this work. Hopefully it will be of some kind of help. Thank you for your time and consideration.

Adrien

--
Pavel Begunkov



[Index of Archives]     [Linux Kernel]     [IETF DCCP]     [Linux Networking]     [Git]     [Security]     [Linux Assembly]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]

  Powered by Linux