Akihiko Odaki sorry about reposts. Having an email with two "@" in the CC list: xuanzhuo@linux.alibaba.comshuah@xxxxxxxxxx tripped up mutt's reply-all for me and made it send to you only. I am guessing you meant two addresses: xuanzhuo@xxxxxxxxxxxxxxxxx and shuah@xxxxxxxxxx. On Sun, Oct 08, 2023 at 02:20:49PM +0900, Akihiko Odaki wrote: > virtio-net have two usage of hashes: one is RSS and another is hash > reporting. Conventionally the hash calculation was done by the VMM. > However, computing the hash after the queue was chosen defeats the > purpose of RSS. > > Another approach is to use eBPF steering program. This approach has > another downside: it cannot report the calculated hash due to the > restrictive nature of eBPF. > > Introduce the code to compute hashes to the kernel in order to overcome > thse challenges. An alternative solution is to extend the eBPF steering > program so that it will be able to report to the userspace, but it makes > little sense to allow to implement different hashing algorithms with > eBPF since the hash value reported by virtio-net is strictly defined by > the specification. > > The hash value already stored in sk_buff is not used and computed > independently since it may have been computed in a way not conformant > with the specification. > > Signed-off-by: Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> > --- > drivers/net/tun.c | 187 ++++++++++++++++++++++++++++++++---- > include/uapi/linux/if_tun.h | 16 +++ > 2 files changed, 182 insertions(+), 21 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 89ab9efe522c..561a573cd008 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -171,6 +171,9 @@ struct tun_prog { > struct bpf_prog *prog; > }; > > +#define TUN_VNET_HASH_MAX_KEY_SIZE 40 > +#define TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH 128 > + where do these come from? > /* Since the socket were moved to tun_file, to preserve the behavior of persist > * device, socket filter, sndbuf and vnet header size were restore when the > * file were attached to a persist device. > @@ -209,6 +212,9 @@ struct tun_struct { > struct tun_prog __rcu *steering_prog; > struct tun_prog __rcu *filter_prog; > struct ethtool_link_ksettings link_ksettings; > + struct tun_vnet_hash vnet_hash; > + u16 vnet_hash_indirection_table[TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH]; > + u32 vnet_hash_key[TUN_VNET_HASH_MAX_KEY_SIZE / 4]; That's quite a lot of data to add in this struct, and will be used by a small minority of users. Are you pushing any hot data out of cache with this? Why not allocate these as needed? > /* init args */ > struct file *file; > struct ifreq *ifr; > @@ -219,6 +225,13 @@ struct veth { > __be16 h_vlan_TCI; > }; > > +static const struct tun_vnet_hash_cap tun_vnet_hash_cap = { > + .max_indirection_table_length = > + TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH, > + > + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > +}; > + > static void tun_flow_init(struct tun_struct *tun); > static void tun_flow_uninit(struct tun_struct *tun); > > @@ -320,10 +333,16 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) > if (get_user(be, argp)) > return -EFAULT; > > - if (be) > + if (be) { > + if (!(tun->flags & TUN_VNET_LE) && > + (tun->vnet_hash.flags & TUN_VNET_HASH_REPORT)) { > + return -EINVAL; > + } > + > tun->flags |= TUN_VNET_BE; > - else > + } else { > tun->flags &= ~TUN_VNET_BE; > + } > > return 0; > } > @@ -558,15 +577,47 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) > return ret % numqueues; > } > > +static u16 tun_vnet_select_queue(struct tun_struct *tun, struct sk_buff *skb) > +{ > + u32 value = qdisc_skb_cb(skb)->tun_vnet_hash_value; > + u32 numqueues; > + u32 index; > + u16 queue; > + > + numqueues = READ_ONCE(tun->numqueues); > + if (!numqueues) > + return 0; > + > + index = value & READ_ONCE(tun->vnet_hash.indirection_table_mask); > + queue = READ_ONCE(tun->vnet_hash_indirection_table[index]); > + if (!queue) > + queue = READ_ONCE(tun->vnet_hash.unclassified_queue); Apparently 0 is an illegal queue value? You are making this part of UAPI better document things like this. > + > + return queue % numqueues; > +} > + > static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > struct net_device *sb_dev) > { > struct tun_struct *tun = netdev_priv(dev); > + u8 vnet_hash_flags = READ_ONCE(tun->vnet_hash.flags); > + struct virtio_net_hash hash; > u16 ret; > > + if (vnet_hash_flags & (TUN_VNET_HASH_RSS | TUN_VNET_HASH_REPORT)) { > + virtio_net_hash(skb, READ_ONCE(tun->vnet_hash.types), > + tun->vnet_hash_key, &hash); > + What are all these READ_ONCE things doing? E.g. you seem to be very content to have tun->vnet_hash.types read twice apparently? This is volatile which basically breaks all compiler's attempts to optimize code - needs to be used judiciously. > + skb->tun_vnet_hash = true; > + qdisc_skb_cb(skb)->tun_vnet_hash_value = hash.value; > + qdisc_skb_cb(skb)->tun_vnet_hash_report = hash.report; > + } > + > rcu_read_lock(); > if (rcu_dereference(tun->steering_prog)) > ret = tun_ebpf_select_queue(tun, skb); > + else if (vnet_hash_flags & TUN_VNET_HASH_RSS) > + ret = tun_vnet_select_queue(tun, skb); > else > ret = tun_automq_select_queue(tun, skb); > rcu_read_unlock(); > @@ -2088,10 +2139,15 @@ static ssize_t tun_put_user(struct tun_struct *tun, > struct iov_iter *iter) > { > struct tun_pi pi = { 0, skb->protocol }; > + struct virtio_net_hash vnet_hash = { > + .value = qdisc_skb_cb(skb)->tun_vnet_hash_value, > + .report = qdisc_skb_cb(skb)->tun_vnet_hash_report > + }; > ssize_t total; > int vlan_offset = 0; > int vlan_hlen = 0; > int vnet_hdr_sz = 0; > + size_t vnet_hdr_content_sz; > > if (skb_vlan_tag_present(skb)) > vlan_hlen = VLAN_HLEN; > @@ -2116,31 +2172,49 @@ static ssize_t tun_put_user(struct tun_struct *tun, > } > > if (vnet_hdr_sz) { > - struct virtio_net_hdr gso; > + union { > + struct virtio_net_hdr hdr; > + struct virtio_net_hdr_v1_hash v1_hash_hdr; > + } hdr; > + int ret; > > if (iov_iter_count(iter) < vnet_hdr_sz) > return -EINVAL; > > - if (virtio_net_hdr_from_skb(skb, &gso, > - tun_is_little_endian(tun), true, > - vlan_hlen)) { > + if ((READ_ONCE(tun->vnet_hash.flags) & TUN_VNET_HASH_REPORT) && > + vnet_hdr_sz >= sizeof(hdr.v1_hash_hdr) && > + skb->tun_vnet_hash) { > + vnet_hdr_content_sz = sizeof(hdr.v1_hash_hdr); > + ret = virtio_net_hdr_v1_hash_from_skb(skb, > + &hdr.v1_hash_hdr, > + true, > + vlan_hlen, > + &vnet_hash); > + } else { > + vnet_hdr_content_sz = sizeof(hdr.hdr); > + ret = virtio_net_hdr_from_skb(skb, &hdr.hdr, > + tun_is_little_endian(tun), > + true, vlan_hlen); > + } > + > + if (ret) { > struct skb_shared_info *sinfo = skb_shinfo(skb); > pr_err("unexpected GSO type: " > "0x%x, gso_size %d, hdr_len %d\n", > - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), > - tun16_to_cpu(tun, gso.hdr_len)); > + sinfo->gso_type, tun16_to_cpu(tun, hdr.hdr.gso_size), > + tun16_to_cpu(tun, hdr.hdr.hdr_len)); > print_hex_dump(KERN_ERR, "tun: ", > DUMP_PREFIX_NONE, > 16, 1, skb->head, > - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); > + min((int)tun16_to_cpu(tun, hdr.hdr.hdr_len), 64), true); > WARN_ON_ONCE(1); > return -EINVAL; > } > > - if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) > + if (copy_to_iter(&hdr, vnet_hdr_content_sz, iter) != vnet_hdr_content_sz) > return -EFAULT; > > - iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); > + iov_iter_advance(iter, vnet_hdr_sz - vnet_hdr_content_sz); > } > > if (vlan_hlen) { > @@ -3007,24 +3081,27 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) > return ret; > } > > -static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, > - void __user *data) > +static struct bpf_prog *tun_set_ebpf(struct tun_struct *tun, > + struct tun_prog __rcu **prog_p, > + void __user *data) > { > struct bpf_prog *prog; > int fd; > + int ret; > > if (copy_from_user(&fd, data, sizeof(fd))) > - return -EFAULT; > + return ERR_PTR(-EFAULT); > > if (fd == -1) { > prog = NULL; > } else { > prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); > if (IS_ERR(prog)) > - return PTR_ERR(prog); > + return prog; > } > > - return __tun_set_ebpf(tun, prog_p, prog); > + ret = __tun_set_ebpf(tun, prog_p, prog); > + return ret ? ERR_PTR(ret) : prog; > } > > /* Return correct value for tun->dev->addr_len based on tun->dev->type. */ > @@ -3082,6 +3159,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > int le; > int ret; > bool do_notify = false; > + struct bpf_prog *bpf_ret; > + struct tun_vnet_hash vnet_hash; > + u16 vnet_hash_indirection_table[TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH]; > + u8 vnet_hash_key[TUN_VNET_HASH_MAX_KEY_SIZE]; > + size_t len; > > if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || > (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { > @@ -3295,7 +3377,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > ret = -EFAULT; > break; > } > - if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) { > + if (vnet_hdr_sz < > + (int)((tun->vnet_hash.flags & TUN_VNET_HASH_REPORT) ? > + sizeof(struct virtio_net_hdr_v1_hash) : > + sizeof(struct virtio_net_hdr))) { > ret = -EINVAL; > break; > } > @@ -3314,10 +3399,16 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > ret = -EFAULT; > break; > } > - if (le) > + if (le) { > tun->flags |= TUN_VNET_LE; > - else > + } else { > + if (!tun_legacy_is_little_endian(tun)) { > + ret = -EINVAL; > + break; > + } > + > tun->flags &= ~TUN_VNET_LE; > + } > break; > > case TUNGETVNETBE: > @@ -3360,11 +3451,17 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > break; > > case TUNSETSTEERINGEBPF: > - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); > + bpf_ret = tun_set_ebpf(tun, &tun->steering_prog, argp); > + if (IS_ERR(bpf_ret)) > + ret = PTR_ERR(bpf_ret); > + else if (bpf_ret) > + tun->vnet_hash.flags &= ~TUN_VNET_HASH_RSS; what is this doing? > break; > > case TUNSETFILTEREBPF: > - ret = tun_set_ebpf(tun, &tun->filter_prog, argp); > + bpf_ret = tun_set_ebpf(tun, &tun->filter_prog, argp); > + if (IS_ERR(bpf_ret)) > + ret = PTR_ERR(bpf_ret); > break; > > case TUNSETCARRIER: > @@ -3382,6 +3479,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > ret = open_related_ns(&net->ns, get_net_ns); > break; > > + case TUNGETVNETHASHCAP: > + if (copy_to_user(argp, &tun_vnet_hash_cap, > + sizeof(tun_vnet_hash_cap))) > + ret = -EFAULT; > + break; > + > + case TUNSETVNETHASH: > + len = sizeof(vnet_hash); > + if (copy_from_user(&vnet_hash, argp, len)) { > + ret = -EFAULT; > + break; > + } what if flags has some bits set you don't know how to handle? should these be ignored as now or cause a failure? these decisions all affect uapi. > + > + if (((vnet_hash.flags & TUN_VNET_HASH_REPORT) && > + (tun->vnet_hdr_sz < sizeof(struct virtio_net_hdr_v1_hash) || > + !tun_is_little_endian(tun))) || > + vnet_hash.indirection_table_mask >= > + TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH) { > + ret = -EINVAL; > + break; > + } Given this is later used to index within an array one has to be very careful about spectre things here, which this code isn't. > + > + argp = (u8 __user *)argp + len; > + len = (vnet_hash.indirection_table_mask + 1) * 2; comment pointer math tricks like this extensively please. > + if (copy_from_user(vnet_hash_indirection_table, argp, len)) { > + ret = -EFAULT; > + break; > + } > + > + argp = (u8 __user *)argp + len; > + len = virtio_net_hash_key_length(vnet_hash.types); > + > + if (copy_from_user(vnet_hash_key, argp, len)) { > + ret = -EFAULT; > + break; > + } > + > + tun->vnet_hash = vnet_hash; > + memcpy(tun->vnet_hash_indirection_table, > + vnet_hash_indirection_table, > + (vnet_hash.indirection_table_mask + 1) * 2); > + memcpy(tun->vnet_hash_key, vnet_hash_key, len); > + > + if (vnet_hash.flags & TUN_VNET_HASH_RSS) > + __tun_set_ebpf(tun, &tun->steering_prog, NULL); > + > + break; > + > default: > ret = -EINVAL; > break; > diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > index 287cdc81c939..dc591cd897c8 100644 > --- a/include/uapi/linux/if_tun.h > +++ b/include/uapi/linux/if_tun.h > @@ -61,6 +61,8 @@ > #define TUNSETFILTEREBPF _IOR('T', 225, int) > #define TUNSETCARRIER _IOW('T', 226, int) > #define TUNGETDEVNETNS _IO('T', 227) > +#define TUNGETVNETHASHCAP _IO('T', 228) > +#define TUNSETVNETHASH _IOW('T', 229, unsigned int) > > /* TUNSETIFF ifr flags */ > #define IFF_TUN 0x0001 > @@ -115,4 +117,18 @@ struct tun_filter { > __u8 addr[][ETH_ALEN]; > }; > > +struct tun_vnet_hash_cap { > + __u16 max_indirection_table_length; > + __u32 types; > +}; > + There's hidden padding in this struct - not good, copy will leak kernel info out. > +#define TUN_VNET_HASH_RSS 0x01 > +#define TUN_VNET_HASH_REPORT 0x02 Do you intend to add more flags down the road? How will userspace know what is supported? > +struct tun_vnet_hash { > + __u8 flags; > + __u32 types; > + __u16 indirection_table_mask; > + __u16 unclassified_queue; > +}; > + Padding here too. Best avoided. In any case, document UAPI please. > #endif /* _UAPI__IF_TUN_H */ > -- > 2.42.0