Akihiko Odaki wrote: > RSS is a receive steering algorithm that can be negotiated to use with > virtio_net. Conventionally the hash calculation was done by the VMM. > However, computing the hash after the queue was chosen defeats the > purpose of RSS. > > Another approach is to use eBPF steering program. This approach has > another downside: it cannot report the calculated hash due to the > restrictive nature of eBPF steering program. > > Introduce the code to perform RSS to the kernel in order to overcome > thse challenges. An alternative solution is to extend the eBPF steering > program so that it will be able to report to the userspace, but I didn't > opt for it because extending the current mechanism of eBPF steering > program as is because it relies on legacy context rewriting, and > introducing kfunc-based eBPF will result in non-UAPI dependency while > the other relevant virtualization APIs such as KVM and vhost_net are > UAPIs. > > Signed-off-by: Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> > --- > drivers/net/tun.c | 119 +++++++++++++++++++++++++++++++++++++++----- > include/uapi/linux/if_tun.h | 27 ++++++++++ > 2 files changed, 133 insertions(+), 13 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index b8fcd71becac..5a429b391144 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -175,6 +175,9 @@ struct tun_prog { > > struct tun_vnet_hash_container { > struct tun_vnet_hash common; > + struct tun_vnet_hash_rss rss; > + __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > + u16 rss_indirection_table[]; > }; > > /* Since the socket were moved to tun_file, to preserve the behavior of persist > @@ -227,7 +230,7 @@ struct veth { > }; > > static const struct tun_vnet_hash tun_vnet_hash_cap = { > - .flags = TUN_VNET_HASH_REPORT, > + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, > .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > }; > > @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) > return ret % numqueues; > } > > +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun, > + struct sk_buff *skb, > + const struct tun_vnet_hash_container *vnet_hash) > +{ > + struct tun_vnet_hash_ext *ext; > + struct virtio_net_hash hash; > + u32 numqueues = READ_ONCE(tun->numqueues); > + u16 txq, index; > + > + if (!numqueues) > + return 0; > + > + if (!virtio_net_hash_rss(skb, vnet_hash->common.types, vnet_hash->rss_key, > + &hash)) > + return vnet_hash->rss.unclassified_queue % numqueues; > + > + if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) { > + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); > + if (ext) { > + ext->value = hash.value; > + ext->report = hash.report; > + } > + } > + > + index = hash.value & vnet_hash->rss.indirection_table_mask; > + txq = READ_ONCE(vnet_hash->rss_indirection_table[index]); > + > + return txq % numqueues; > +} > + > static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > struct net_device *sb_dev) > { > @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > } else { > struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash); > > - ret = tun_automq_select_queue(tun, skb, vnet_hash); > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) > + ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash); > + else > + ret = tun_automq_select_queue(tun, skb, vnet_hash); > } > rcu_read_unlock(); > > @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) > } > > static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, > - void __user *data) > + int fd) > { > struct bpf_prog *prog; > - int fd; > - > - if (copy_from_user(&fd, data, sizeof(fd))) > - return -EFAULT; > > if (fd == -1) { > prog = NULL; > @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > int ifindex; > int sndbuf; > int vnet_hdr_sz; > + int fd; > int le; > int ret; > bool do_notify = false; > @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > break; > > case TUNSETSTEERINGEBPF: > - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); > + if (get_user(fd, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + vnet_hash = rtnl_dereference(tun->vnet_hash); > + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) { > + ret = -EBUSY; > + break; > + } > + > + ret = tun_set_ebpf(tun, &tun->steering_prog, fd); > break; > > case TUNSETFILTEREBPF: > - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); > + if (get_user(fd, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + ret = tun_set_ebpf(tun, &tun->filter_prog, fd); > break; > > case TUNSETCARRIER: > @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > break; > } > > - vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); > - if (!vnet_hash) { > - ret = -ENOMEM; > - break; > + if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) { > + struct tun_vnet_hash_rss rss; > + size_t indirection_table_size; > + size_t key_size; > + size_t size; > + > + if (tun->steering_prog) { > + ret = -EBUSY; > + break; > + } > + > + if (copy_from_user(&rss, argp, sizeof(rss))) { > + ret = -EFAULT; > + break; > + } > + argp = (struct tun_vnet_hash_rss __user *)argp + 1; > + > + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2; Why make uapi a mask rather than a length? Also is there a upper length bounds sanity check for this input from userspace? > + key_size = virtio_net_hash_key_length(vnet_hash_common.types); > + size = sizeof(*vnet_hash) + indirection_table_size + key_size; key_size is included in sizeof(*vnet_hash), always VIRTIO_NET_RSS_MAX_KEY_SIZE. > + > + vnet_hash = kmalloc(size, GFP_KERNEL); > + if (!vnet_hash) { > + ret = -ENOMEM; > + break; > + } > + > + if (copy_from_user(vnet_hash->rss_indirection_table, > + argp, indirection_table_size)) { > + kfree(vnet_hash); > + ret = -EFAULT; > + break; > + } > + argp = (u16 __user *)argp + rss.indirection_table_mask + 1; > + > + if (copy_from_user(vnet_hash->rss_key, argp, key_size)) { > + kfree(vnet_hash); > + ret = -EFAULT; > + break; > + } > + > + vnet_hash->rss = rss; > + } else { > + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL); > + if (!vnet_hash) { > + ret = -ENOMEM; > + break; > + } > } > > vnet_hash->common = vnet_hash_common; > diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > index 1561e8ce0a0a..1c130409db5d 100644 > --- a/include/uapi/linux/if_tun.h > +++ b/include/uapi/linux/if_tun.h > @@ -75,6 +75,14 @@ > * > * The argument is a pointer to &struct tun_vnet_hash. > * > + * The argument is a pointer to the compound of the following in order if > + * %TUN_VNET_HASH_RSS is set: > + * > + * 1. &struct tun_vnet_hash > + * 2. &struct tun_vnet_hash_rss > + * 3. Indirection table > + * 4. Key > + * > * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to > * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with > * %TUN_VNET_HASH_REPORT. > @@ -144,6 +152,13 @@ struct tun_filter { > */ > #define TUN_VNET_HASH_REPORT 0x0001 > > +/** > + * define TUN_VNET_HASH_RSS - Request virtio_net RSS > + * > + * This is mutually exclusive with eBPF steering program. > + */ > +#define TUN_VNET_HASH_RSS 0x0002 > + > /** > * struct tun_vnet_hash - virtio_net hashing configuration > * @flags: > @@ -159,4 +174,16 @@ struct tun_vnet_hash { > __u32 types; > }; > > +/** > + * struct tun_vnet_hash_rss - virtio_net RSS configuration > + * @indirection_table_mask: > + * Bitmask to be applied to the indirection table index > + * @unclassified_queue: > + * The index of the queue to place unclassified packets in > + */ > +struct tun_vnet_hash_rss { > + __u16 indirection_table_mask; > + __u16 unclassified_queue; > +}; > + > #endif /* _UAPI__IF_TUN_H */ > > -- > 2.46.0 >