On Tue, Oct 8, 2024 at 2:55 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote: > > RSS is a receive steering algorithm that can be negotiated to use with > virtio_net. Conventionally the hash calculation was done by the VMM. > However, computing the hash after the queue was chosen defeats the > purpose of RSS. > > Another approach is to use eBPF steering program. This approach has > another downside: it cannot report the calculated hash due to the > restrictive nature of eBPF steering program. > > Introduce the code to perform RSS to the kernel in order to overcome > thse challenges. An alternative solution is to extend the eBPF steering > program so that it will be able to report to the userspace, but I didn't > opt for it because extending the current mechanism of eBPF steering > program as is because it relies on legacy context rewriting, and > introducing kfunc-based eBPF will result in non-UAPI dependency while > the other relevant virtualization APIs such as KVM and vhost_net are > UAPIs. > > Signed-off-by: Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> > --- > drivers/net/tap.c | 11 +++++- > drivers/net/tun.c | 57 ++++++++++++++++++++------- > drivers/net/tun_vnet.h | 96 +++++++++++++++++++++++++++++++++++++++++---- > include/linux/if_tap.h | 4 +- > include/uapi/linux/if_tun.h | 27 +++++++++++++ > 5 files changed, 169 insertions(+), 26 deletions(-) > > diff --git a/drivers/net/tap.c b/drivers/net/tap.c > index 5e2fbe63ca47..a58b83285af4 100644 > --- a/drivers/net/tap.c > +++ b/drivers/net/tap.c > @@ -207,6 +207,7 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, > * racing against queue removal. > */ > int numvtaps = READ_ONCE(tap->numvtaps); > + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash); > __u32 rxq; > > *tap_add_hash(skb) = (struct virtio_net_hash) { .report = VIRTIO_NET_HASH_REPORT_NONE }; > @@ -217,6 +218,12 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, > if (numvtaps == 1) > goto single; > > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { > + rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash); > + queue = rcu_dereference(tap->taps[rxq]); > + goto out; > + } > + > if (!skb->l4_hash && !skb->sw_hash) { > struct flow_keys keys; > > @@ -234,7 +241,7 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap, > > /* Check if we can use flow to select a queue */ > if (rxq) { > - tun_vnet_hash_report(&tap->vnet_hash, skb, &keys_basic, rxq, tap_add_hash); > + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash); > queue = rcu_dereference(tap->taps[rxq % numvtaps]); > goto out; > } > @@ -1058,7 +1065,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd, > tap = rtnl_dereference(q->tap); > ret = tun_vnet_ioctl(&q->vnet_hdr_sz, &q->flags, > tap ? &tap->vnet_hash : NULL, -EINVAL, > - cmd, sp); > + true, cmd, sp); > rtnl_unlock(); > return ret; > } > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 27308417b834..18528568aed7 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -209,7 +209,7 @@ struct tun_struct { > struct bpf_prog __rcu *xdp_prog; > struct tun_prog __rcu *steering_prog; > struct tun_prog __rcu *filter_prog; > - struct tun_vnet_hash vnet_hash; > + struct tun_vnet_hash_container __rcu *vnet_hash; > struct ethtool_link_ksettings link_ksettings; > /* init args */ > struct file *file; > @@ -468,7 +468,9 @@ static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb) > * the userspace application move between processors, we may get a > * different rxq no. here. > */ > -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) > +static u16 tun_automq_select_queue(struct tun_struct *tun, > + const struct tun_vnet_hash_container *vnet_hash, > + struct sk_buff *skb) > { > struct flow_keys keys; > struct flow_keys_basic keys_basic; > @@ -493,7 +495,7 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) > .control = keys.control, > .basic = keys.basic > }; > - tun_vnet_hash_report(&tun->vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq, > + tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq, > tun_add_hash); > > return txq; > @@ -523,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > u16 ret; > > rcu_read_lock(); > - if (rcu_dereference(tun->steering_prog)) > + if (rcu_dereference(tun->steering_prog)) { > ret = tun_ebpf_select_queue(tun, skb); > - else > - ret = tun_automq_select_queue(tun, skb); > + } else { > + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); > + > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) > + ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash, > + skb, tun_add_hash); > + else > + ret = tun_automq_select_queue(tun, vnet_hash, skb); > + } > rcu_read_unlock(); > > return ret; > @@ -2248,6 +2257,9 @@ static void tun_free_netdev(struct net_device *dev) > security_tun_dev_free_security(tun->security); > __tun_set_ebpf(tun, &tun->steering_prog, NULL); > __tun_set_ebpf(tun, &tun->filter_prog, NULL); > + rtnl_lock(); > + kfree_rcu_mightsleep(rtnl_dereference(tun->vnet_hash)); > + rtnl_unlock(); > } > > static void tun_setup(struct net_device *dev) > @@ -2946,13 +2958,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) > } > > static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, > - void __user *data) > + int fd) > { > struct bpf_prog *prog; > - int fd; > - > - if (copy_from_user(&fd, data, sizeof(fd))) > - return -EFAULT; > > if (fd == -1) { > prog = NULL; > @@ -3019,6 +3027,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > int sndbuf; > int ret; > bool do_notify = false; > + struct tun_vnet_hash_container *vnet_hash; > > if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || > (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { > @@ -3078,7 +3087,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > } > > if (!tun) { > - ret = tun_vnet_ioctl(NULL, NULL, NULL, -EBADFD, cmd, argp); > + ret = tun_vnet_ioctl(NULL, NULL, NULL, -EBADFD, true, cmd, argp); > goto unlock; > } > > @@ -3256,11 +3265,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > break; > > case TUNSETSTEERINGEBPF: > - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); > + if (get_user(ret, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + vnet_hash = rtnl_dereference(tun->vnet_hash); > + if (ret != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { > + ret = -EBUSY; > + break; > + } > + > + ret = tun_set_ebpf(tun, &tun->steering_prog, ret); > break; > > case TUNSETFILTEREBPF: > - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); > + if (get_user(ret, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + ret = tun_set_ebpf(tun, &tun->filter_prog, ret); > break; > > case TUNSETCARRIER: > @@ -3280,7 +3305,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > > default: > ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, > - &tun->vnet_hash, -EINVAL, cmd, argp); > + &tun->vnet_hash, -EINVAL, > + !rtnl_dereference(tun->steering_prog), > + cmd, argp); > } > > if (do_notify) > diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h > index 589a97dd7d02..f5de4fe9d14e 100644 > --- a/drivers/net/tun_vnet.h > +++ b/drivers/net/tun_vnet.h > @@ -9,6 +9,13 @@ > typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *); > typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *); > > +struct tun_vnet_hash_container { > + struct tun_vnet_hash common; > + struct tun_vnet_hash_rss rss; > + u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > + u16 rss_indirection_table[]; > +}; > + > static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) > { > return !(IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && (flags & TUN_VNET_BE)) && > @@ -62,14 +69,16 @@ static inline __virtio16 cpu_to_tun_vnet16(unsigned int flags, u16 val) > } > > static inline long tun_vnet_ioctl(int *sz, unsigned int *flags, > - struct tun_vnet_hash *hash, long fallback, > + struct tun_vnet_hash_container __rcu **hashp, > + long fallback, bool can_rss, > unsigned int cmd, void __user *argp) > { > static const struct tun_vnet_hash cap = { > - .flags = TUN_VNET_HASH_REPORT, > + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, > .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > }; > struct tun_vnet_hash hash_buf; > + struct tun_vnet_hash_container *hash; > int __user *sp = argp; > int s; > > @@ -132,13 +141,57 @@ static inline long tun_vnet_ioctl(int *sz, unsigned int *flags, > return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0; > > case TUNSETVNETHASH: > - if (!hash) > + if (!hashp) > return -EBADFD; > > if (copy_from_user(&hash_buf, argp, sizeof(hash_buf))) > return -EFAULT; > + argp = (struct tun_vnet_hash __user *)argp + 1; > + > + if (hash_buf.flags & TUN_VNET_HASH_RSS) { > + struct tun_vnet_hash_rss rss; > + size_t indirection_table_size; > + size_t key_size; > + size_t size; > + > + if (!can_rss) > + return -EBUSY; > + > + if (copy_from_user(&rss, argp, sizeof(rss))) This seems to be a change of the uAPI of TUNSETVNETHASH. > + return -EFAULT; > + argp = (struct tun_vnet_hash_rss __user *)argp + 1; > + > + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; > + key_size = virtio_net_hash_key_length(hash_buf.types); > + size = struct_size(hash, rss_indirection_table, > + (size_t)rss.indirection_table_mask + 1); > + > + hash = kmalloc(size, GFP_KERNEL); > + if (!hash) > + return -ENOMEM; > + > + if (copy_from_user(hash->rss_indirection_table, > + argp, indirection_table_size)) { > + kfree(hash); > + return -EFAULT; > + } > + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; > + > + if (copy_from_user(hash->rss_key, argp, key_size)) { > + kfree(hash); > + return -EFAULT; > + } > + > + virtio_net_toeplitz_convert_key(hash->rss_key, key_size); > + hash->rss = rss; > + } else { > + hash = kmalloc(sizeof(hash->common), GFP_KERNEL); > + if (!hash) > + return -ENOMEM; > + } > > - *hash = hash_buf; > + hash->common = hash_buf; > + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash)); > return 0; > > default: > @@ -146,7 +199,7 @@ static inline long tun_vnet_ioctl(int *sz, unsigned int *flags, > } > } > > -static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash, > +static inline void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash, > struct sk_buff *skb, > const struct flow_keys_basic *keys, > u32 value, > @@ -154,7 +207,7 @@ static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash, > { > struct virtio_net_hash *report; > > - if (!(hash->flags & TUN_VNET_HASH_REPORT)) > + if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT)) > return; > > report = vnet_hash_add(skb); > @@ -162,11 +215,40 @@ static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash, > return; > > *report = (struct virtio_net_hash) { > - .report = virtio_net_hash_report(hash->types, keys), > + .report = virtio_net_hash_report(hash->common.types, keys), > .value = value > }; > } > > +static inline u16 tun_vnet_rss_select_queue(u32 numqueues, > + const struct tun_vnet_hash_container *hash, > + struct sk_buff *skb, > + tun_vnet_hash_add vnet_hash_add) > +{ > + struct virtio_net_hash *report; > + struct virtio_net_hash ret; > + u16 txq, index; > + > + if (!numqueues) > + return 0; > + > + virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret); > + > + if (!ret.report) > + return hash->rss.unclassified_queue % numqueues; > + > + if (hash->common.flags & TUN_VNET_HASH_REPORT) { > + report = vnet_hash_add(skb); > + if (report) > + *report = ret; > + } > + > + index = ret.value & hash->rss.indirection_table_mask; > + txq = READ_ONCE(hash->rss_indirection_table[index]); > + > + return txq % numqueues; > +} > + > static inline int tun_vnet_hdr_get(int sz, unsigned int flags, > struct iov_iter *from, > struct virtio_net_hdr *hdr) > diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h > index 5bbb343a6dba..7334c46a3f10 100644 > --- a/include/linux/if_tap.h > +++ b/include/linux/if_tap.h > @@ -4,7 +4,6 @@ > > #include <net/sock.h> > #include <linux/skb_array.h> > -#include <uapi/linux/if_tun.h> > > struct file; > struct socket; > @@ -32,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f) > #define MAX_TAP_QUEUES 256 > > struct tap_queue; > +struct tun_vnet_hash_container; > > struct tap_dev { > struct net_device *dev; > @@ -44,7 +44,7 @@ struct tap_dev { > int numqueues; > netdev_features_t tap_features; > int minor; > - struct tun_vnet_hash vnet_hash; > + struct tun_vnet_hash_container __rcu *vnet_hash; > > void (*update_features)(struct tap_dev *tap, netdev_features_t features); > void (*count_tx_dropped)(struct tap_dev *tap); > diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > index d11e79b4e0dc..4887f97500a8 100644 > --- a/include/uapi/linux/if_tun.h > +++ b/include/uapi/linux/if_tun.h > @@ -75,6 +75,14 @@ > * > * The argument is a pointer to &struct tun_vnet_hash. > * > + * The argument is a pointer to the compound of the following in order if > + * %TUN_VNET_HASH_RSS is set: > + * > + * 1. &struct tun_vnet_hash > + * 2. &struct tun_vnet_hash_rss > + * 3. Indirection table > + * 4. Key > + * Let's try not modify uAPI. We can introduce new ioctl if necessary. > * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only > * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal > * to the size of &struct virtio_net_hdr_v1_hash. > @@ -148,6 +156,13 @@ struct tun_filter { > */ > #define TUN_VNET_HASH_REPORT 0x0001 > > +/** > + * define TUN_VNET_HASH_RSS - Request virtio_net RSS > + * > + * This is mutually exclusive with eBPF steering program. > + */ > +#define TUN_VNET_HASH_RSS 0x0002 > + > /** > * struct tun_vnet_hash - virtio_net hashing configuration > * @flags: > @@ -163,4 +178,16 @@ struct tun_vnet_hash { > __u32 types; > }; > > +/** > + * struct tun_vnet_hash_rss - virtio_net RSS configuration > + * @indirection_table_mask: > + * Bitmask to be applied to the indirection table index > + * @unclassified_queue: > + * The index of the queue to place unclassified packets in > + */ > +struct tun_vnet_hash_rss { > + __u16 indirection_table_mask; > + __u16 unclassified_queue; > +}; > + > #endif /* _UAPI__IF_TUN_H */ > > -- > 2.46.2 > Thanks