On 11/12/24 7:40 AM, John Ousterhout wrote:
homa_plumbing.c contains functions that connect Homa to the rest of
the Linux kernel, such as dispatch tables used by Linux and the
top-level functions that Linux invokes from those dispatch tables.
homa_utils.c contains a few odds and ends, such as code to initialize
and destroy struct homa's.
Signed-off-by: John Ousterhout <ouster@xxxxxxxxxxxxxxx>
---
net/homa/homa_plumbing.c | 965 +++++++++++++++++++++++++++++++++++++++
net/homa/homa_utils.c | 150 ++++++
2 files changed, 1115 insertions(+)
create mode 100644 net/homa/homa_plumbing.c
create mode 100644 net/homa/homa_utils.c
diff --git a/net/homa/homa_plumbing.c b/net/homa/homa_plumbing.c
new file mode 100644
index 000000000000..afd3a9cc97ba
--- /dev/null
+++ b/net/homa/homa_plumbing.c
@@ -0,0 +1,965 @@
+// SPDX-License-Identifier: BSD-2-Clause
+
+/* This file consists mostly of "glue" that hooks Homa into the rest of
+ * the Linux kernel. The guts of the protocol are in other files.
+ */
+
+#include "homa_impl.h"
+#include "homa_peer.h"
+#include "homa_pool.h"
+
+/* NOTE(review): the SPDX tag at the top of this file says BSD-2-Clause
+ * while the module license below is "Dual MIT/GPL" -- confirm which one
+ * is intended.
+ */
+MODULE_LICENSE("Dual MIT/GPL");
+MODULE_AUTHOR("John Ousterhout");
+MODULE_DESCRIPTION("Homa transport protocol");
+MODULE_VERSION("0.01");
+
+/* Storage referenced by the .sysctl_mem, .sysctl_wmem and .sysctl_rmem
+ * fields of homa_prot and homav6_prot. Nothing in this file writes to
+ * these variables.
+ */
+static long sysctl_homa_mem[3] __read_mostly;
+static int sysctl_homa_rmem_min __read_mostly;
+static int sysctl_homa_wmem_min __read_mostly;
+
+/* Global data for Homa. Never reference homa_data directly. Always use
+ * the homa variable instead; this allows overriding during unit tests.
+ */
+static struct homa homa_data;
+struct homa *homa = &homa_data;
+
+/* True means that the Homa module is in the process of unloading itself,
+ * so everyone should clean up. Set by homa_unload() and polled by
+ * homa_timer_main().
+ */
+static bool exiting;
+
+/* Thread that runs timer code to detect lost packets and crashed peers.
+ * Created by homa_load(); woken by homa_hrtimer() and by homa_unload().
+ */
+static struct task_struct *timer_kthread;
+
+/* This structure defines functions that handle various operations on
+ * Homa sockets. These functions are relatively generic: they are called
+ * to implement top-level system calls. Most entries are delegated to
+ * generic kernel functions that are independent of the Homa protocol;
+ * only bind, poll and shutdown have Homa-specific handlers here.
+ * This table is the PF_INET variant.
+ */
+static const struct proto_ops homa_proto_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = homa_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = homa_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = homa_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .set_peek_off = sk_set_peek_off,
+};
+
+/* PF_INET6 variant of the socket-operation table. It differs from the
+ * PF_INET table only in .family and in the .release, .getname and
+ * .ioctl handlers, which are the inet6_* versions.
+ */
+static const struct proto_ops homav6_proto_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = homa_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet6_getname,
+ .poll = homa_poll,
+ .ioctl = inet6_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = homa_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .set_peek_off = sk_set_peek_off,
+};
+
+/* This structure also defines functions that handle various operations
+ * on Homa sockets. However, these functions are lower-level than those
+ * in homa_proto_ops: they are specific to the PF_INET or PF_INET6
+ * protocol family, and in many cases they are invoked by functions in
+ * homa_proto_ops. Most of these functions have Homa-specific implementations.
+ * This instance is the one referenced by homa_protosw; it is registered
+ * and unregistered in homa_load() and homa_unload().
+ */
+static struct proto homa_prot = {
+ .name = "HOMA",
+ .owner = THIS_MODULE,
+ .close = homa_close,
+ .connect = ip4_datagram_connect,
+ .disconnect = homa_disconnect,
+ .ioctl = homa_ioctl,
+ .init = homa_socket,
+ .destroy = NULL,
+ .setsockopt = homa_setsockopt,
+ .getsockopt = homa_getsockopt,
+ .sendmsg = homa_sendmsg,
+ .recvmsg = homa_recvmsg,
+ .backlog_rcv = homa_backlog_rcv,
+ .hash = homa_hash,
+ .unhash = homa_unhash,
+ .get_port = homa_get_port,
+ .sysctl_mem = sysctl_homa_mem,
+ .sysctl_wmem = &sysctl_homa_wmem_min,
+ .sysctl_rmem = &sysctl_homa_rmem_min,
+ /* Per-socket allocation size: one struct homa_sock. */
+ .obj_size = sizeof(struct homa_sock),
+ .no_autobind = 1,
+};
+
+/* Protocol-level operations for Homa sockets created through
+ * homav6_protosw. Every handler is shared with homa_prot except
+ * .connect; the object size additionally reserves room for a
+ * struct ipv6_pinfo behind the struct homa_sock.
+ */
+static struct proto homav6_prot = {
+	.name = "HOMAv6",
+	.owner = THIS_MODULE,
+	.close = homa_close,
+	.connect = ip6_datagram_connect,
+	.disconnect = homa_disconnect,
+	.ioctl = homa_ioctl,
+	.init = homa_socket,
+	.destroy = NULL,
+	.setsockopt = homa_setsockopt,
+	.getsockopt = homa_getsockopt,
+	.sendmsg = homa_sendmsg,
+	.recvmsg = homa_recvmsg,
+	.backlog_rcv = homa_backlog_rcv,
+	.hash = homa_hash,
+	.unhash = homa_unhash,
+	.get_port = homa_get_port,
+	.sysctl_mem = sysctl_homa_mem,
+	.sysctl_wmem = &sysctl_homa_wmem_min,
+	.sysctl_rmem = &sysctl_homa_rmem_min,
+
+	/* IPv6 data comes *after* Homa's data, and isn't included in
+	 * struct homa_sock.
+	 */
+	.obj_size = sizeof(struct homa_sock) + sizeof(struct ipv6_pinfo),
The implementation of inet6_sk_generic() has already changed, you should set
.ipv6_pinfo_offset.
+
+	/* Tell the IPv6 stack where the ipv6_pinfo lives inside the
+	 * socket object. Given the layout described above it starts
+	 * immediately after the struct homa_sock.
+	 */
+	.ipv6_pinfo_offset = sizeof(struct homa_sock),
+	.no_autobind = 1,
+};
+
+/* Top-level structure describing the Homa protocol. This entry ties
+ * (SOCK_DGRAM, IPPROTO_HOMA) to homa_prot and homa_proto_ops; it is
+ * registered with inet_register_protosw() in homa_load().
+ */
+static struct inet_protosw homa_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_HOMA,
+ .prot = &homa_prot,
+ .ops = &homa_proto_ops,
+ .flags = INET_PROTOSW_REUSE,
+};
+
+/* Same as homa_protosw but bound to homav6_prot and homav6_proto_ops;
+ * registered with inet6_register_protosw() in homa_load().
+ */
+static struct inet_protosw homav6_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_HOMA,
+ .prot = &homav6_prot,
+ .ops = &homav6_proto_ops,
+ .flags = INET_PROTOSW_REUSE,
+};
+
+/* This structure is used by IP to deliver incoming Homa packets to us.
+ * Installed for IPPROTO_HOMA by inet_add_protocol() in homa_load().
+ */
+static struct net_protocol homa_protocol = {
+ .handler = homa_softirq,
+ .err_handler = homa_err_handler_v4,
+ .no_policy = 1,
+};
+
+/* IPv6 counterpart of homa_protocol: same packet handler, with the
+ * IPv6-specific error handler. Installed for IPPROTO_HOMA by
+ * inet6_add_protocol() in homa_load().
+ */
+static struct inet6_protocol homav6_protocol = {
+ .handler = homa_softirq,
+ .err_handler = homa_err_handler_v6,
+ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+};
+
+/* Sizes of the headers for each Homa packet type, in bytes. The table
+ * is indexed by (packet type - DATA); homa_softirq() uses it to reject
+ * packets shorter than the header for their type.
+ * NOTE(review): the zero entries presumably stand for packet types that
+ * are not part of this patch -- confirm; a zero makes the length check
+ * pass for any packet of that type.
+ */
+static __u16 header_lengths[] = {
+ sizeof32(struct data_header),
+ 0,
+ sizeof32(struct resend_header),
+ sizeof32(struct unknown_header),
+ sizeof32(struct busy_header),
+ 0,
+ sizeof32(struct common_header),
+ sizeof32(struct need_ack_header),
+ sizeof32(struct ack_header)
+};
+
+/* Signalled by homa_timer_main() when the timer thread exits;
+ * homa_unload() waits on it before tearing Homa down.
+ */
+static DECLARE_COMPLETION(timer_thread_done);
+
+/**
+ * homa_load() - invoked when this module is loaded into the Linux kernel
+ *
+ * Registers both struct proto instances, both protosw entries and both
+ * packet handlers, initializes the global struct homa, and starts the
+ * timer thread. If any step fails, exactly the steps that already
+ * succeeded are undone, in reverse order.
+ *
+ * Return: 0 on success, otherwise a negative errno.
+ */
+static int __init homa_load(void)
+{
+	int status;
+
+	pr_notice("Homa module loading\n");
+	pr_notice("Homa structure sizes: data_header %u, seg_header %u, ack %u, peer %u, ip_hdr %u flowi %u ipv6_hdr %u, flowi6 %u tcp_sock %u homa_rpc %u sk_buff %u rcvmsg_control %u union sockaddr_in_union %u HOMA_MAX_BPAGES %u NR_CPUS %u nr_cpu_ids %u, MAX_NUMNODES %d\n",
+		  sizeof32(struct data_header),
+		  sizeof32(struct seg_header),
+		  sizeof32(struct homa_ack),
+		  sizeof32(struct homa_peer),
+		  sizeof32(struct iphdr),
+		  sizeof32(struct flowi),
+		  sizeof32(struct ipv6hdr),
+		  sizeof32(struct flowi6),
+		  sizeof32(struct tcp_sock),
+		  sizeof32(struct homa_rpc),
+		  sizeof32(struct sk_buff),
+		  sizeof32(struct homa_recvmsg_args),
+		  sizeof32(union sockaddr_in_union),
+		  HOMA_MAX_BPAGES,
+		  NR_CPUS,
+		  nr_cpu_ids,
+		  MAX_NUMNODES);
+	status = proto_register(&homa_prot, 1);
+	if (status != 0) {
+		pr_err("proto_register failed for homa_prot: %d\n", status);
+		goto out;
+	}
+	status = proto_register(&homav6_prot, 1);
+	if (status != 0) {
+		pr_err("proto_register failed for homav6_prot: %d\n", status);
+		goto out_unregister_prot;
+	}
+	inet_register_protosw(&homa_protosw);
+	status = inet6_register_protosw(&homav6_protosw);
better to check the retval of inet6_register_protosw().
+	if (status != 0) {
+		pr_err("inet6_register_protosw failed in %s: %d\n", __func__,
+		       status);
+		goto out_unregister_protosw;
+	}
+	status = inet_add_protocol(&homa_protocol, IPPROTO_HOMA);
+	if (status != 0) {
+		pr_err("inet_add_protocol failed in %s: %d\n", __func__,
+		       status);
+		goto out_unregister_protosw6;
+	}
+	status = inet6_add_protocol(&homav6_protocol, IPPROTO_HOMA);
+	if (status != 0) {
+		pr_err("inet6_add_protocol failed in %s: %d\n", __func__,
+		       status);
+		goto out_del_protocol;
+	}
+
+	/* homa_init() documents that homa_destroy() must be called even
+	 * when it fails, so both failures below unwind via out_destroy.
+	 */
+	status = homa_init(homa);
+	if (status)
+		goto out_destroy;
+
+	timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer");
+	if (IS_ERR(timer_kthread)) {
+		status = PTR_ERR(timer_kthread);
+		pr_err("couldn't create homa timer thread: error %d\n",
+		       status);
+		timer_kthread = NULL;
+		goto out_destroy;
+	}
+
+	return 0;
+
+out_destroy:
+	homa_destroy(homa);
+	inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA);
+out_del_protocol:
+	inet_del_protocol(&homa_protocol, IPPROTO_HOMA);
+out_unregister_protosw6:
+	inet6_unregister_protosw(&homav6_protosw);
+out_unregister_protosw:
+	inet_unregister_protosw(&homa_protosw);
+	proto_unregister(&homav6_prot);
+out_unregister_prot:
+	proto_unregister(&homa_prot);
It's a bit strange for me that this relies on a premise: that every reverse operation can correctly
identify whether the corresponding forward operation has been executed. Currently, perhaps every
function includes this capability. It's up to you, I don't insist.
+out:
+	return status;
+}
+
+/**
+ * homa_unload() - invoked when this module is unloaded from the Linux kernel.
+ *
+ * Stops the timer thread, then undoes the registrations made by
+ * homa_load() in reverse order. The packet handlers and protosw entries
+ * are removed before homa_destroy() so that Homa's tables are not torn
+ * down while the kernel can still hand packets or new sockets to Homa.
+ */
+static void __exit homa_unload(void)
+{
+	pr_notice("Homa module unloading\n");
+	exiting = true;
+
+	/* Only wait for the thread if it exists; otherwise nothing would
+	 * ever signal the completion.
+	 */
+	if (timer_kthread) {
+		wake_up_process(timer_kthread);
+		wait_for_completion(&timer_thread_done);
+	}
+	inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA);
+	inet_del_protocol(&homa_protocol, IPPROTO_HOMA);
+	inet6_unregister_protosw(&homav6_protosw);
+	inet_unregister_protosw(&homa_protosw);
+	homa_destroy(homa);
+	proto_unregister(&homav6_prot);
+	proto_unregister(&homa_prot);
+}
+
+/* Register homa_load() and homa_unload() as the module's entry and
+ * exit points.
+ */
+module_init(homa_load);
+module_exit(homa_unload);
Perhaps you can try adding MODULE_ALIAS_NET_PF_PROTO_TYPE so that the kernel will automatically load
the module when an IPPROTO_HOMA socket is created. This is only a functional suggestion; it's up to you.
+
+/**
+ * homa_bind() - Implements the bind system call for Homa sockets: associates
+ * a well-known service port with a socket. Unlike other AF_INET6 protocols,
+ * there is no need to invoke this system call for sockets that are only
+ * used as clients.
+ * @sock:     Socket on which the system call was invoked.
+ * @addr:     Contains the desired port number.
+ * @addr_len: Number of bytes in @addr.
+ * Return: 0 on success, otherwise a negative errno: -EINVAL if @addr is
+ *         too short for its address family, -EAFNOSUPPORT if the family
+ *         doesn't match the socket's, or whatever homa_sock_bind returns.
+ */
+int homa_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+	union sockaddr_in_union *addr_in = (union sockaddr_in_union *)addr;
+	struct homa_sock *hsk = homa_sk(sock->sk);
+	int port = 0;
+
+	/* Don't look at the family field unless the caller supplied it. */
+	if (unlikely(addr_len < (int)sizeof(addr->sa_family)))
+		return -EINVAL;
+	if (unlikely(addr->sa_family != sock->sk->sk_family))
+		return -EAFNOSUPPORT;
+	if (addr_in->in6.sin6_family == AF_INET6) {
+		if (addr_len < sizeof(struct sockaddr_in6))
+			return -EINVAL;
+		port = ntohs(addr_in->in6.sin6_port);
+	} else if (addr_in->in4.sin_family == AF_INET) {
+		if (addr_len < sizeof(struct sockaddr_in))
+			return -EINVAL;
+		port = ntohs(addr_in->in4.sin_port);
+	}
+	return homa_sock_bind(homa->port_map, hsk, port);
+}
Is binding multiple times legal? For example, bind 80 first and then bind 8080. If not, I think
you might need to check the inet_num.
+
+/**
+ * homa_close() - Called when the close system call is applied to a
+ * Homa socket. Destroys the Homa-specific socket state and then
+ * releases the socket.
+ * @sk:      Socket being closed.
+ * @timeout: Not used by this function.
+ */
+void homa_close(struct sock *sk, long timeout)
+{
+	homa_sock_destroy(homa_sk(sk));
+	sk_common_release(sk);
+}
+
+/**
+ * homa_shutdown() - Handler for the shutdown system call on Homa sockets.
+ * @sock: Socket to shut down.
+ * @how:  Ignored. Other socket types can shut down sending and
+ *        receiving independently; for Homa any shutdown shuts down
+ *        everything.
+ *
+ * Return: always 0.
+ */
+int homa_shutdown(struct socket *sock, int how)
+{
+	struct homa_sock *hsk = homa_sk(sock->sk);
+
+	homa_sock_shutdown(hsk);
+	return 0;
+}
+
+/**
+ * homa_disconnect() - Handler for the disconnect operation on a Homa
+ * socket. Disconnect is not implemented: a warning is logged and the
+ * call fails.
+ * @sk:    Socket to disconnect; not used.
+ * @flags: Not used.
+ *
+ * Return: always -EINVAL.
+ */
+int homa_disconnect(struct sock *sk, int flags)
+{
+	pr_warn("unimplemented disconnect invoked on Homa socket\n");
+
+	return -EINVAL;
+}
+
+/**
+ * homa_ioctl() - Handler for the ioctl system call on Homa sockets.
+ * No ioctl operations are supported, so every request is rejected.
+ * @sk:   Socket on which the system call was invoked; not used.
+ * @cmd:  Identifier for a particular ioctl operation; not used.
+ * @karg: Operation-specific argument; not used.
+ *
+ * Return: always -EINVAL.
+ */
+int homa_ioctl(struct sock *sk, int cmd, int *karg)
+{
+	return -EINVAL;
+}
+
+/**
+ * homa_socket() - Implements the socket(2) system call for Homa sockets
+ * (installed as the .init handler of homa_prot and homav6_prot).
+ * @sk:    Socket on which the system call was invoked. The non-Homa
+ *         parts have already been initialized.
+ *
+ * Return: always 0 (success).
+ */
+int homa_socket(struct sock *sk)
+{
+	struct homa_sock *hsk = homa_sk(sk);
+
+	/* NOTE(review): any failure inside homa_sock_init() is not
+	 * reported to the caller from here -- see the remark below.
+	 */
+	homa_sock_init(hsk, homa);
I noticed that homa_sock_init() contains a memory allocation action, perhaps you should add a return
value check.
-> hsk->buffer_pool = kzalloc(sizeof(*hsk->buffer_pool), GFP_KERNEL);
+	return 0;
+}
+
+/**
+ * homa_setsockopt() - Implements the setsockopt system call for Homa
+ * sockets. The only supported option is SO_HOMA_SET_BUF, which hands
+ * Homa a region of user memory via homa_pool_init().
+ * @sk:      Socket on which the system call was invoked.
+ * @level:   Level at which the operation should be handled; must be
+ *           IPPROTO_HOMA.
+ * @optname: Identifies a particular setsockopt operation; must be
+ *           SO_HOMA_SET_BUF.
+ * @optval:  Address of a struct homa_set_buf_args describing the region.
+ * @optlen:  Number of bytes of data at @optval; must equal
+ *           sizeof(struct homa_set_buf_args).
+ * Return:   -EINVAL for an unsupported level, option or length, -EFAULT
+ *           if the arguments can't be read or the region can't be
+ *           written, otherwise the result of homa_pool_init().
+ */
+int homa_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+		    unsigned int optlen)
+{
+	struct homa_set_buf_args args;
+	struct homa_sock *hsk;
+	void __user *region;
+	int ret;
+
+	if (level != IPPROTO_HOMA)
+		return -EINVAL;
+	if (optname != SO_HOMA_SET_BUF)
+		return -EINVAL;
+	if (optlen != sizeof(struct homa_set_buf_args))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&args, optval, optlen))
+		return -EFAULT;
+	region = (__force void __user *)args.start;
+
+	/* Cheap sanity check: verify that at least the beginning of the
+	 * region is writable by storing the arguments there.
+	 */
+	if (copy_to_user(region, &args, sizeof(args)))
+		return -EFAULT;
+
+	hsk = homa_sk(sk);
+	homa_sock_lock(hsk, "homa_setsockopt SO_HOMA_SET_BUF");
+	ret = homa_pool_init(hsk, region, args.length);
+	homa_sock_unlock(hsk);
+	return ret;
+}
+
+/**
+ * homa_getsockopt() - Handler for the getsockopt system call on Homa
+ * sockets. No options can be read: a warning is logged and the call
+ * fails.
+ * @sk:      Socket on which the system call was invoked; not used.
+ * @level:   Only used in the log message.
+ * @optname: Only used in the log message.
+ * @optval:  Not used.
+ * @option:  Not used.
+ * Return:   always -EINVAL.
+ */
+int homa_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *option)
+{
+	pr_warn("unimplemented getsockopt invoked on Homa socket: level %d, optname %d\n",
+		level, optname);
+
+	return -EINVAL;
+}
+
+/**
+ * homa_sendmsg() - Send a request or response message on a Homa socket.
+ * @sk:     Socket on which the system call was invoked.
+ * @msg:    Structure describing the message to send; the msg_control
+ *          field points to additional information (a struct
+ *          homa_sendmsg_args in user space) and msg_name must hold the
+ *          destination address.
+ * @length: Number of bytes of the message.
+ *
+ * If the id in the control arguments is zero the message is a new
+ * request: a client RPC is created and its id is written back to the
+ * control block. Otherwise the message is the response for the server
+ * RPC with that id.
+ *
+ * Return: 0 on success, otherwise a negative errno.
+ */
+int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
+{
+	struct homa_sock *hsk = homa_sk(sk);
+	struct homa_sendmsg_args args;
+	int result = 0;
+	struct homa_rpc *rpc = NULL;
+	union sockaddr_in_union *addr = (union sockaddr_in_union *)msg->msg_name;
+
+	if (unlikely(!msg->msg_control_is_user)) {
+		result = -EINVAL;
+		goto error;
+	}
+	if (unlikely(copy_from_user(&args,
+				    (__force void __user *)msg->msg_control,
+				    sizeof(args)))) {
+		result = -EFAULT;
+		goto error;
+	}
+
+	/* Make sure an address was supplied and is long enough to hold
+	 * a family field before looking inside it.
+	 */
+	if (unlikely(!addr || msg->msg_namelen < sizeof(struct sockaddr_in))) {
+		result = -EINVAL;
+		goto error;
+	}
+	if (addr->in6.sin6_family != sk->sk_family) {
+		result = -EAFNOSUPPORT;
+		goto error;
+	}
+	if (addr->in6.sin6_family == AF_INET6 &&
+	    msg->msg_namelen < sizeof(struct sockaddr_in6)) {
+		result = -EINVAL;
+		goto error;
+	}
+
+	if (!args.id) {
+		/* This is a request message. */
+		rpc = homa_rpc_new_client(hsk, addr);
+		if (IS_ERR(rpc)) {
+			result = PTR_ERR(rpc);
+			rpc = NULL;
+			goto error;
+		}
+		rpc->completion_cookie = args.completion_cookie;
+		result = homa_message_out_fill(rpc, &msg->msg_iter, 1);
+		if (result)
+			goto error;
+		args.id = rpc->id;
+		homa_rpc_unlock(rpc);
+		rpc = NULL;
+
+		if (unlikely(copy_to_user((__force void __user *)msg->msg_control,
+					  &args, sizeof(args)))) {
+			/* Look the RPC up again so that the error path
+			 * below can free it (if it still exists).
+			 */
+			rpc = homa_find_client_rpc(hsk, args.id);
+			result = -EFAULT;
+			goto error;
+		}
+	} else {
+		/* This is a response message. */
+		struct in6_addr canonical_dest;
+
+		if (args.completion_cookie != 0) {
+			result = -EINVAL;
+			goto error;
+		}
+		canonical_dest = canonical_ipv6_addr(addr);
+
+		rpc = homa_find_server_rpc(hsk, &canonical_dest,
+					   ntohs(addr->in6.sin6_port), args.id);
+		if (!rpc)
+			/* Return without an error if the RPC doesn't exist;
+			 * this could be totally valid (e.g. client is
+			 * no longer interested in it).
+			 */
+			return 0;
+		if (rpc->error) {
+			result = rpc->error;
+			goto error;
+		}
+		if (rpc->state != RPC_IN_SERVICE) {
+			/* Leave the RPC alone: just unlock it and fail. */
+			homa_rpc_unlock(rpc);
+			rpc = NULL;
+			result = -EINVAL;
+			goto error;
+		}
+		rpc->state = RPC_OUTGOING;
+
+		result = homa_message_out_fill(rpc, &msg->msg_iter, 1);
+		if (result && rpc->state != RPC_DEAD)
+			goto error;
+		homa_rpc_unlock(rpc);
+	}
+	return 0;
+
+error:
+	if (rpc) {
+		homa_rpc_free(rpc);
+		homa_rpc_unlock(rpc);
+	}
+	return result;
+}
+
+/**
+ * homa_recvmsg() - Receive a message from a Homa socket.
+ * @sk:       Socket on which the system call was invoked.
+ * @msg:      Controlling information for the receive; msg_control must
+ *            point to a struct homa_recvmsg_args in user space, which is
+ *            both read and updated.
+ * @len:      Total bytes of space available in msg->msg_iov; not used.
+ * @flags:    Flags from system call; only MSG_DONTWAIT is used.
+ * @addr_len: Store the length of the sender address here
+ * Return:    The length of the message on success, otherwise a negative
+ *            errno.
+ */
+int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+		 int *addr_len)
+{
+	struct homa_sock *hsk = homa_sk(sk);
+	struct homa_recvmsg_args control;
+	struct homa_rpc *rpc;
+	int result;
+
+	if (unlikely(!msg->msg_control)) {
+		/* This test isn't strictly necessary, but it provides a
+		 * hook for testing kernel call times.
+		 */
+		return -EINVAL;
+	}
+
+	/* Until the control block has been read successfully there is
+	 * nothing valid to copy back to the application, so these two
+	 * failures return directly instead of going through "done".
+	 */
+	if (msg->msg_controllen != sizeof(control))
+		return -EINVAL;
+	if (unlikely(copy_from_user(&control,
+				    (__force void __user *)msg->msg_control,
+				    sizeof(control))))
+		return -EFAULT;
+	control.completion_cookie = 0;
+
+	if (control.num_bpages > HOMA_MAX_BPAGES ||
+	    (control.flags & ~HOMA_RECVMSG_VALID_FLAGS)) {
+		result = -EINVAL;
+		goto done;
+	}
+	homa_pool_release_buffers(hsk->buffer_pool, control.num_bpages,
+				  control.bpage_offsets);
+	control.num_bpages = 0;
+
+	rpc = homa_wait_for_message(hsk, (flags & MSG_DONTWAIT)
+			? (control.flags | HOMA_RECVMSG_NONBLOCKING)
+			: control.flags, control.id);
+	if (IS_ERR(rpc)) {
+		/* If we get here, it means there was an error that prevented
+		 * us from finding an RPC to return. If there's an error in
+		 * the RPC itself we won't get here.
+		 */
+		result = PTR_ERR(rpc);
+		goto done;
+	}
+	result = rpc->error ? rpc->error : rpc->msgin.length;
+
+	/* Collect result information. */
+	control.id = rpc->id;
+	control.completion_cookie = rpc->completion_cookie;
+	if (likely(rpc->msgin.length >= 0)) {
+		control.num_bpages = rpc->msgin.num_bpages;
+		memcpy(control.bpage_offsets, rpc->msgin.bpage_offsets,
+		       sizeof(control.bpage_offsets));
+	}
+
+	/* Report the peer's address, but only if the caller supplied a
+	 * place to put it.
+	 */
+	if (msg->msg_name) {
+		if (sk->sk_family == AF_INET6) {
+			struct sockaddr_in6 *in6 = msg->msg_name;
+
+			in6->sin6_family = AF_INET6;
+			in6->sin6_port = htons(rpc->dport);
+			in6->sin6_addr = rpc->peer->addr;
+			*addr_len = sizeof(*in6);
+		} else {
+			struct sockaddr_in *in4 = msg->msg_name;
+
+			in4->sin_family = AF_INET;
+			in4->sin_port = htons(rpc->dport);
+			in4->sin_addr.s_addr = ipv6_to_ipv4(rpc->peer->addr);
+			*addr_len = sizeof(*in4);
+		}
+	}
+
+	/* This indicates that the application now owns the buffers, so
+	 * we won't free them in homa_rpc_free.
+	 */
+	rpc->msgin.num_bpages = 0;
+
+	/* Must release the RPC lock (and potentially free the RPC) before
+	 * copying the results back to user space.
+	 */
+	if (homa_is_client(rpc->id)) {
+		homa_peer_add_ack(rpc);
+		homa_rpc_free(rpc);
+	} else {
+		if (result < 0)
+			homa_rpc_free(rpc);
+		else
+			rpc->state = RPC_IN_SERVICE;
+	}
+	homa_rpc_unlock(rpc);
+
+done:
+	if (unlikely(copy_to_user((__force void __user *)msg->msg_control,
+				  &control, sizeof(control)))) {
+		/* Note: in this case the message's buffers will be leaked. */
+		pr_notice("%s couldn't copy back args\n", __func__);
+		result = -EFAULT;
+	}
+
+	return result;
+}
+
+/**
+ * homa_hash() - Installed as the .hash handler of homa_prot and
+ * homav6_prot; Homa has nothing to do here.
+ * @sk: Socket for the operation; not used.
+ * Return: always 0.
+ */
+int homa_hash(struct sock *sk)
+{
+	return 0;
+}
+
+/**
+ * homa_unhash() - Installed as the .unhash handler of homa_prot and
+ * homav6_prot; intentionally does nothing.
+ * @sk: Socket for the operation; not used.
+ */
+void homa_unhash(struct sock *sk)
+{
+}
+
+/**
+ * homa_get_port() - Installed as the .get_port handler of homa_prot and
+ * homav6_prot. Homa assigns a port as soon as a socket is created, so
+ * there is no work left for this hook.
+ * @sk:   Socket for the operation; not used.
+ * @snum: Not used.
+ * Return: always 0.
+ */
+int homa_get_port(struct sock *sk, unsigned short snum)
+{
+	/* Ports are assigned at socket creation; nothing to do. */
+	return 0;
+}
+
+/**
+ * homa_softirq() - This function is invoked at SoftIRQ level to handle
+ * incoming packets. It is installed as the .handler of homa_protocol
+ * and homav6_protocol.
+ * @skb: The incoming packet, possibly with further packets chained on
+ * skb_shinfo(skb)->frag_list. All packets are either passed to
+ * homa_dispatch_pkts() or freed here.
+ * Return: Always 0
+ */
+int homa_softirq(struct sk_buff *skb)
+{
+ struct sk_buff *packets, *other_pkts, *next;
+ struct sk_buff **prev_link, **other_link;
+ struct common_header *h;
+ int first_packet = 1;
+ int header_offset;
+ int pull_length;
+
+ /* skb may actually contain many distinct packets, linked through
+ * skb_shinfo(skb)->frag_list by the Homa GRO mechanism. Make a
+ * pass through the list to process all of the short packets,
+ * leaving the longer packets in the list. Also, perform various
+ * prep/cleanup/error checking functions.
+ */
+ skb->next = skb_shinfo(skb)->frag_list;
+ skb_shinfo(skb)->frag_list = NULL;
+ packets = skb;
+ /* prev_link always points at the link that refers to the current
+ * packet, so a packet can be unlinked with *prev_link = skb->next.
+ */
+ prev_link = &packets;
+ for (skb = packets; skb; skb = next) {
+ next = skb->next;
+
+ /* Make the header available at skb->data, even if the packet
+ * is fragmented. One complication: it's possible that the IP
+ * header hasn't yet been removed (this happens for GRO packets
+ * on the frag_list, since they aren't handled explicitly by IP).
+ */
+ header_offset = skb_transport_header(skb) - skb->data;
+ pull_length = HOMA_MAX_HEADER + header_offset;
+ if (pull_length > skb->len)
+ pull_length = skb->len;
+ if (!pskb_may_pull(skb, pull_length))
+ goto discard;
+ if (header_offset)
+ __skb_pull(skb, header_offset);
+
+ /* Reject packets that are too short or have bogus types. */
+ h = (struct common_header *)skb->data;
+ if (unlikely(skb->len < sizeof(struct common_header) ||
+ h->type < DATA || h->type >= BOGUS ||
+ skb->len < header_lengths[h->type - DATA]))
+ goto discard;
+
+ /* NOTE(review): first_packet is cleared here but never read
+ * again in this function.
+ */
+ if (first_packet)
+ first_packet = 0;
+
+ /* Process the packet now if it is a control packet or
+ * if it contains an entire short message.
+ * NOTE(review): 1400 is a hard-coded limit for "short"; it
+ * presumably approximates one packet's worth of data -- confirm.
+ */
+ if (h->type != DATA || ntohl(((struct data_header *)h)
+ ->message_length) < 1400) {
+ *prev_link = skb->next;
+ skb->next = NULL;
+ homa_dispatch_pkts(skb, homa);
+ } else {
+ prev_link = &skb->next;
+ }
+ continue;
+
+discard:
+ *prev_link = skb->next;
+ kfree_skb(skb);
+ }
+
+ /* Now process the longer packets. Each iteration of this loop
+ * collects all of the packets for a particular RPC and dispatches
+ * them. Packets are grouped with the head of the list when both
+ * sender_id and the canonical source address match; everything
+ * else is moved, in order, to other_pkts for a later iteration.
+ */
+ while (packets) {
+ struct in6_addr saddr, saddr2;
+ struct common_header *h2;
+ struct sk_buff *skb2;
+
+ skb = packets;
+ prev_link = &skb->next;
+ saddr = skb_canonical_ipv6_saddr(skb);
+ other_pkts = NULL;
+ other_link = &other_pkts;
+ h = (struct common_header *)skb->data;
+ for (skb2 = skb->next; skb2; skb2 = next) {
+ next = skb2->next;
+ h2 = (struct common_header *)skb2->data;
+ if (h2->sender_id == h->sender_id) {
+ saddr2 = skb_canonical_ipv6_saddr(skb2);
+ if (ipv6_addr_equal(&saddr, &saddr2)) {
+ *prev_link = skb2;
+ prev_link = &skb2->next;
+ continue;
+ }
+ }
+ *other_link = skb2;
+ other_link = &skb2->next;
+ }
+ /* Terminate both lists before handing off the matching one. */
+ *prev_link = NULL;
+ *other_link = NULL;
+ homa_dispatch_pkts(packets, homa);
+ packets = other_pkts;
+ }
+
+ return 0;
+}
+
+/**
+ * homa_backlog_rcv() - Installed as the .backlog_rcv handler of
+ * homa_prot and homav6_prot. Backlog processing is not implemented:
+ * the packet is dropped and a warning is logged.
+ * @sk:  Homa socket that owns the packet's destination port; not used.
+ * @skb: The incoming packet. This function takes ownership of the packet
+ *       and frees it.
+ *
+ * Return: Always returns 0.
+ */
+int homa_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	pr_warn("unimplemented backlog_rcv invoked on Homa socket\n");
+
+	kfree_skb(skb);
+	return 0;
+}
+
+/**
+ * homa_err_handler_v4() - Invoked by IP to handle an incoming error
+ * packet, such as ICMP UNREACHABLE. Destination-unreachable errors
+ * abort the affected RPCs through homa_abort_rpcs(); any other ICMP
+ * type is only logged.
+ * @skb:   The incoming packet.
+ * @info:  Only used in the log message for unhandled ICMP types.
+ *
+ * Return: always 0.
+ */
+int homa_err_handler_v4(struct sk_buff *skb, u32 info)
+{
+	const struct in6_addr src = skb_canonical_ipv6_saddr(skb);
+	struct icmphdr *icmph = icmp_hdr(skb);
+	int type = icmph->type;
+	int code = icmph->code;
+	int port = 0;
+	int error;
+
+	if (type != ICMP_DEST_UNREACH) {
+		pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n",
+			  __func__, info, type, code);
+		return 0;
+	}
+
+	if (code == ICMP_PORT_UNREACH) {
+		/* The IP header that follows the ICMP header gives the
+		 * offset of the Homa header, which holds the port.
+		 */
+		char *inner = (char *)icmph + sizeof(struct icmphdr);
+		const struct iphdr *inner_ip = (struct iphdr *)inner;
+		struct common_header *h;
+
+		h = (struct common_header *)(inner + inner_ip->ihl * 4);
+		port = ntohs(h->dport);
+		error = -ENOTCONN;
+	} else if (code == ICMP_PROT_UNREACH) {
+		error = -EPROTONOSUPPORT;
+	} else {
+		error = -EHOSTUNREACH;
+	}
+	homa_abort_rpcs(homa, &src, port, error);
+	return 0;
+}
+
+/**
+ * homa_err_handler_v6() - Invoked by IP to handle an incoming error
+ * packet, such as ICMP UNREACHABLE.
+ * @skb:    The incoming packet.
+ * @opt:    Not used.
+ * @type:   Type of ICMP packet.
+ * @code:   Additional information about the error.
+ * @offset: Not used.
+ * @info:   Not used.
+ *
+ * RPCs are aborted with -ENOTCONN for "port unreachable", with
+ * -EHOSTUNREACH for any other "destination unreachable" code, and with
+ * -EPROTONOSUPPORT for a "parameter problem / unrecognized next header"
+ * error, which is how ICMPv6 reports that the peer doesn't support the
+ * protocol. All other errors are ignored.
+ *
+ * Return: always 0.
+ */
+int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt,
+			u8 type, u8 code, int offset, __be32 info)
+{
+	const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+
+	if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_PORT_UNREACH) {
+		char *icmp = (char *)icmp_hdr(skb);
+		struct common_header *h;
+
+		iph = (struct ipv6hdr *)(icmp + sizeof(struct icmphdr));
+		h = (struct common_header *)(icmp + sizeof(struct icmphdr)
+				+ HOMA_IPV6_HEADER_LENGTH);
+		homa_abort_rpcs(homa, &iph->daddr, ntohs(h->dport), -ENOTCONN);
+	} else if (type == ICMPV6_DEST_UNREACH) {
+		homa_abort_rpcs(homa, &iph->daddr, 0, -EHOSTUNREACH);
+	} else if (type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_NEXTHDR) {
+		homa_abort_rpcs(homa, &iph->daddr, 0, -EPROTONOSUPPORT);
+	}
+	return 0;
+}
+
+/**
+ * homa_poll() - Invoked by Linux as part of implementing select, poll,
+ * epoll, etc.
+ * @file:  Open file that is participating in a poll, select, etc.
+ * @sock:  A Homa socket, associated with @file.
+ * @wait:  This table will be registered with the socket, so that it
+ *         is notified when the socket's ready state changes.
+ *
+ * Return: A mask of bits such as EPOLLIN, which indicate the current
+ *         state of the socket. The socket is always reported writable;
+ *         it is reported readable when either ready list is non-empty.
+ */
+__poll_t homa_poll(struct file *file, struct socket *sock,
+		   struct poll_table_struct *wait)
+{
+	struct homa_sock *hsk = homa_sk(sock->sk);
+	__u32 mask = POLLOUT | POLLWRNORM;
+
+	/* The socket lock is deliberately not taken here; this follows
+	 * what appears to be standard practice for poll functions.
+	 */
+	sock_poll_wait(file, sock, wait);
+
+	if (!(list_empty(&hsk->ready_requests) &&
+	      list_empty(&hsk->ready_responses)))
+		mask |= POLLIN | POLLRDNORM;
+	return (__force __poll_t)mask;
+}
+
+/**
+ * homa_hrtimer() - This function is invoked by the hrtimer mechanism to
+ * wake up the timer thread. Runs at IRQ level.
+ * @timer:   The timer that triggered; not used.
+ *
+ * Return:   Always HRTIMER_NORESTART; the timer thread re-arms the
+ *           timer itself each time around its loop.
+ */
+enum hrtimer_restart homa_hrtimer(struct hrtimer *timer)
+{
+	wake_up_process(timer_kthread);
+
+	return HRTIMER_NORESTART;
+}
+
+/**
+ * homa_timer_main() - Top-level function for the timer thread. Calls
+ * homa_timer() once per wakeup until the module-level "exiting" flag is
+ * set, then signals timer_thread_done and exits.
+ * @transport: Pointer to struct homa.
+ *
+ * Return: Always 0.
+ */
+int homa_timer_main(void *transport)
+{
+ /* Note: this local shadows the file-level "homa" pointer. */
+ struct homa *homa = (struct homa *)transport;
+ struct hrtimer hrtimer;
+ ktime_t tick_interval;
+ u64 nsec;
+
+ hrtimer_init(&hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer.function = &homa_hrtimer;
+ nsec = 1000000; /* 1 ms */
+ tick_interval = ns_to_ktime(nsec);
+ while (1) {
+ /* Mark ourselves as sleeping *before* checking "exiting" and
+ * arming the timer, so that a wakeup from homa_hrtimer() or
+ * homa_unload() arriving in between is not lost.
+ */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!exiting) {
+ hrtimer_start(&hrtimer, tick_interval, HRTIMER_MODE_REL);
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ if (exiting)
+ break;
+ homa_timer(homa);
+ }
+ hrtimer_cancel(&hrtimer);
+ kthread_complete_and_exit(&timer_thread_done, 0);
+ /* NOTE(review): presumably not reached, since the call above exits
+ * the thread -- confirm.
+ */
+ return 0;
+}
diff --git a/net/homa/homa_utils.c b/net/homa/homa_utils.c
new file mode 100644
index 000000000000..905d00c836bd
--- /dev/null
+++ b/net/homa/homa_utils.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: BSD-2-Clause
+
+/* This file contains miscellaneous utility functions for Homa, such
+ * as initializing and destroying homa structs.
+ */
+
+#include "homa_impl.h"
+#include "homa_peer.h"
+#include "homa_rpc.h"
+#include "homa_stub.h"
+
+/* Initialized in homa_init(); homa_destroy() waits on it after asking
+ * the pacer thread to stop.
+ */
+struct completion homa_pacer_kthread_done;
+
+/**
+ * homa_init() - Constructor for homa objects.
+ * @homa:   Object to initialize.
+ *
+ * Allocates the port map and peer table, fills in default configuration
+ * values, and finally starts the pacer thread. The thread is started
+ * last so that it never sees a partially initialized structure.
+ *
+ * Return:  0 on success, or a negative errno if there was an error. Even
+ *          if an error occurs, it is safe (and necessary) to call
+ *          homa_destroy at some point.
+ */
+int homa_init(struct homa *homa)
+{
+	int err;
+
+	_Static_assert(HOMA_MAX_PRIORITIES >= 8,
+		       "homa_init assumes at least 8 priority levels");
+
+	/* Clear every field that homa_destroy examines, so that it can
+	 * be called safely no matter where this function fails.
+	 */
+	homa->pacer_kthread = NULL;
+	homa->port_map = NULL;
+	homa->peers = NULL;
+	init_completion(&homa_pacer_kthread_done);
+	atomic64_set(&homa->next_outgoing_id, 2);
+	atomic64_set(&homa->link_idle_time, sched_clock());
+	spin_lock_init(&homa->pacer_mutex);
+	homa->pacer_fifo_fraction = 50;
+	homa->pacer_fifo_count = 1;
+	homa->pacer_wake_time = 0;
+	spin_lock_init(&homa->throttle_lock);
+	INIT_LIST_HEAD_RCU(&homa->throttled_rpcs);
+	homa->throttle_add = 0;
+	homa->throttle_min_bytes = 200;
+	homa->next_client_port = HOMA_MIN_DEFAULT_PORT;
+	homa->port_map = kmalloc(sizeof(*homa->port_map), GFP_KERNEL);
+	if (!homa->port_map) {
+		pr_err("%s couldn't create port_map: kmalloc failure\n",
+		       __func__);
+		return -ENOMEM;
+	}
+	homa_socktab_init(homa->port_map);
+	homa->peers = kmalloc(sizeof(*homa->peers), GFP_KERNEL);
+	if (!homa->peers) {
+		pr_err("%s couldn't create peers: kmalloc failure\n",
+		       __func__);
+		return -ENOMEM;
+	}
+	err = homa_peertab_init(homa->peers);
+	if (err) {
+		pr_err("%s couldn't initialize peer table (errno %d)\n",
+		       __func__, -err);
+		return err;
+	}
+
+	/* Wild guesses to initialize configuration values... */
+	homa->unsched_bytes = 40000;
+	homa->window_param = 100000;
+	homa->link_mbps = 25000;
+	homa->fifo_grant_increment = 10000;
+	homa->grant_fifo_fraction = 50;
+	homa->max_overcommit = 8;
+	homa->max_incoming = 400000;
+	homa->max_rpcs_per_peer = 1;
+	homa->resend_ticks = 5;
+	homa->resend_interval = 5;
+	homa->timeout_ticks = 100;
+	homa->timeout_resends = 5;
+	homa->request_ack_ticks = 2;
+	homa->reap_limit = 10;
+	homa->dead_buffs_limit = 5000;
+	homa->max_dead_buffs = 0;
+	homa->pacer_exit = false;
+	homa->max_nic_queue_ns = 5000;
+	homa->ns_per_mbyte = 0;
+	homa->max_gso_size = 10000;
+	homa->gso_force_software = 0;
+	homa->max_gro_skbs = 20;
+	homa->gro_policy = HOMA_GRO_NORMAL;
+	homa->timer_ticks = 0;
+	homa->flags = 0;
+	homa->bpage_lease_usecs = 10000;
+	homa->next_id = 0;
+	homa_outgoing_sysctl_changed(homa);
+	homa_incoming_sysctl_changed(homa);
+
+	/* Everything is set up; now it is safe to let the pacer run. */
+	homa->pacer_kthread = kthread_run(homa_pacer_main, homa,
+					  "homa_pacer");
+	if (IS_ERR(homa->pacer_kthread)) {
+		err = PTR_ERR(homa->pacer_kthread);
+		homa->pacer_kthread = NULL;
+		pr_err("couldn't create homa pacer thread: error %d\n", err);
+		return err;
+	}
+	return 0;
+}
+
+/**
+ * homa_destroy() -  Destructor for homa objects. Stops the pacer thread
+ * (if one was started) and releases the port map and peer table. Each
+ * resource is cleared after it is released, so calling this function
+ * again on the same object is harmless.
+ * @homa:      Object to destroy.
+ */
+void homa_destroy(struct homa *homa)
+{
+	if (homa->pacer_kthread) {
+		homa_pacer_stop(homa);
+		wait_for_completion(&homa_pacer_kthread_done);
+
+		/* Don't wait for the thread again on a repeated call. */
+		homa->pacer_kthread = NULL;
+	}
+
+	/* The order of the following statements matters! */
+	if (homa->port_map) {
+		homa_socktab_destroy(homa->port_map);
+		kfree(homa->port_map);
+		homa->port_map = NULL;
+	}
+	if (homa->peers) {
+		homa_peertab_destroy(homa->peers);
+		kfree(homa->peers);
+		homa->peers = NULL;
+	}
+}
+
+/**
+ * homa_spin() - Busy-wait (without sleeping) until sched_clock() has
+ * advanced by a given amount.
+ * @ns:   How long to delay (in nanoseconds)
+ */
+void homa_spin(int ns)
+{
+	__u64 deadline = sched_clock() + ns;
+
+	while (sched_clock() < deadline) {
+		/* Nothing to do but wait. */
+	}
+}
+
+/**
+ * homa_throttle_lock_slow() - Slow path for acquiring the throttle lock.
+ * In this version it simply takes the lock with bottom halves disabled;
+ * no statistics are recorded here.
+ * @homa:    Overall data about the Homa protocol implementation.
+ */
+void homa_throttle_lock_slow(struct homa *homa)
+	__acquires(&homa->throttle_lock)
+{
+	spin_lock_bh(&homa->throttle_lock);
+}