Re: [PATCH net-next v9 3/6] tun: Introduce virtio-net hash feature

Jason Wang <jasowang@xxxxxxxxxx> · Tue, 11 Mar 2025 08:38:13 +0800

On Mon, Mar 10, 2025 at 3:45 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote:
>
> On 2025/03/10 12:55, Jason Wang wrote:
> > On Fri, Mar 7, 2025 at 7:01 PM Akihiko Odaki <akihiko.odaki@xxxxxxxxxx> wrote:
> >>
> >> Hash reporting
> >> ==============
> >>
> >> Allow the guest to reuse the hash value to make receive steering
> >> consistent between the host and guest, and to save hash computation.
> >>
> >> RSS
> >> ===
> >>
> >> RSS is a receive steering algorithm that can be negotiated to use with
> >> virtio_net. Conventionally the hash calculation was done by the VMM.
> >> However, computing the hash after the queue was chosen defeats the
> >> purpose of RSS.
> >>
> >> Another approach is to use eBPF steering program. This approach has
> >> another downside: it cannot report the calculated hash due to the
> >> restrictive nature of eBPF steering program.
> >>
> >> Introduce the code to perform RSS to the kernel in order to overcome
> >> thse challenges. An alternative solution is to extend the eBPF steering
> >> program so that it will be able to report to the userspace, but I didn't
> >> opt for it because extending the current mechanism of eBPF steering
> >> program as is because it relies on legacy context rewriting, and
> >> introducing kfunc-based eBPF will result in non-UAPI dependency while
> >> the other relevant virtualization APIs such as KVM and vhost_net are
> >> UAPIs.
> >>
> >> Signed-off-by: Akihiko Odaki <akihiko.odaki@xxxxxxxxxx>
> >> Tested-by: Lei Yang <leiyang@xxxxxxxxxx>
> >> ---
> >>   Documentation/networking/tuntap.rst |   7 ++
> >>   drivers/net/Kconfig                 |   1 +
> >>   drivers/net/tap.c                   |  68 ++++++++++++++-
> >>   drivers/net/tun.c                   |  98 +++++++++++++++++-----
> >>   drivers/net/tun_vnet.h              | 159 ++++++++++++++++++++++++++++++++++--
> >>   include/linux/if_tap.h              |   2 +
> >>   include/linux/skbuff.h              |   3 +
> >>   include/uapi/linux/if_tun.h         |  75 +++++++++++++++++
> >>   net/core/skbuff.c                   |   4 +
> >>   9 files changed, 386 insertions(+), 31 deletions(-)
> >>
> >> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst
> >> index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644
> >> --- a/Documentation/networking/tuntap.rst
> >> +++ b/Documentation/networking/tuntap.rst
> >> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it::
> >>         return ioctl(fd, TUNSETQUEUE, (void *)&ifr);
> >>     }
> >>
> >> +3.4 Reference
> >> +-------------
> >> +
> >> +``linux/if_tun.h`` defines the interface described below:
> >> +
> >> +.. kernel-doc:: include/uapi/linux/if_tun.h
> >> +
> >>   Universal TUN/TAP device driver Frequently Asked Question
> >>   =========================================================
> >>
> >> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
> >> index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644
> >> --- a/drivers/net/Kconfig
> >> +++ b/drivers/net/Kconfig
> >> @@ -395,6 +395,7 @@ config TUN
> >>          tristate "Universal TUN/TAP device driver support"
> >>          depends on INET
> >>          select CRC32
> >> +       select SKB_EXTENSIONS
> >>          help
> >>            TUN/TAP provides packet reception and transmission for user space
> >>            programs.  It can be viewed as a simple Point-to-Point or Ethernet
> >> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> >> index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644
> >> --- a/drivers/net/tap.c
> >> +++ b/drivers/net/tap.c
> >> @@ -49,6 +49,10 @@ struct major_info {
> >>          struct list_head next;
> >>   };
> >>
> >> +struct tap_skb_cb {
> >> +       struct virtio_net_hash hash;
> >> +};
> >> +
> >>   #define GOODCOPY_LEN 128
> >>
> >>   static const struct proto_ops tap_socket_ops;
> >> @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q)
> >>          sock_put(&q->sk);
> >>   }
> >>
> >> +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb)
> >> +{
> >> +       BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb));
> >> +       return (struct tap_skb_cb *)skb->cb;
> >> +}
> >> +
> >> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb)
> >> +{
> >> +       return &tap_skb_cb(skb)->hash;
> >> +}
> >> +
> >> +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb)
> >> +{
> >> +       return &tap_skb_cb(skb)->hash;
> >> +}
> >> +
> >>   /*
> >>    * Select a queue based on the rxq of the device on which this packet
> >>    * arrived. If the incoming device is not mq, calculate a flow hash
> >> @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q)
> >>   static struct tap_queue *tap_get_queue(struct tap_dev *tap,
> >>                                         struct sk_buff *skb)
> >>   {
> >> +       struct flow_keys_basic keys_basic;
> >>          struct tap_queue *queue = NULL;
> >>          /* Access to taps array is protected by rcu, but access to numvtaps
> >>           * isn't. Below we use it to lookup a queue, but treat it as a hint
> >> @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap,
> >>           * racing against queue removal.
> >>           */
> >>          int numvtaps = READ_ONCE(tap->numvtaps);
> >> +       struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash);
> >>          __u32 rxq;
> >>
> >> +       *tap_skb_cb(skb) = (struct tap_skb_cb) {
> >> +               .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE }
> >> +       };
> >> +
> >>          if (!numvtaps)
> >>                  goto out;
> >>
> >>          if (numvtaps == 1)
> >>                  goto single;
> >>
> >> +       if (vnet_hash) {
> >> +               if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
> >> +                       rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash);
> >> +                       queue = rcu_dereference(tap->taps[rxq]);
> >> +                       goto out;
> >> +               }
> >> +
> >> +               if (!skb->l4_hash && !skb->sw_hash) {
> >> +                       struct flow_keys keys;
> >> +
> >> +                       skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
> >> +                       rxq = flow_hash_from_keys(&keys);
> >> +                       keys_basic = (struct flow_keys_basic) {
> >> +                               .control = keys.control,
> >> +                               .basic = keys.basic
> >> +                       };
> >> +               } else {
> >> +                       skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0,
> >> +                                                        FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
> >> +                       rxq = skb->hash;
> >> +               }
> >> +       } else {
> >> +               rxq = skb_get_hash(skb);
> >> +       }
> >> +
> >>          /* Check if we can use flow to select a queue */
> >> -       rxq = skb_get_hash(skb);
> >>          if (rxq) {
> >> +               tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash);
> >>                  queue = rcu_dereference(tap->taps[rxq % numvtaps]);
> >>                  goto out;
> >>          }
> >> @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q,
> >>          int total;
> >>
> >>          if (q->flags & IFF_VNET_HDR) {
> >> -               struct virtio_net_hdr vnet_hdr;
> >> +               struct virtio_net_hdr_v1_hash vnet_hdr;
> >>
> >>                  vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
> >>
> >> -               ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr);
> >> +               ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb,
> >> +                                           tap_find_hash, &vnet_hdr);
> >>                  if (ret)
> >>                          return ret;
> >>
> >> @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
> >>                  rtnl_unlock();
> >>                  return ret;
> >>
> >> +       case TUNGETVNETHASHCAP:
> >> +               return tun_vnet_ioctl_gethashcap(argp);
> >> +
> >> +       case TUNSETVNETHASH:
> >> +               rtnl_lock();
> >> +               tap = rtnl_dereference(q->tap);
> >> +               ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD;
> >> +               rtnl_unlock();
> >> +               return ret;
> >> +
> >>          case SIOCGIFHWADDR:
> >>                  rtnl_lock();
> >>                  tap = tap_get_tap_dev(q);
> >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> >> index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644
> >> --- a/drivers/net/tun.c
> >> +++ b/drivers/net/tun.c
> >> @@ -209,6 +209,7 @@ struct tun_struct {
> >>          struct bpf_prog __rcu *xdp_prog;
> >>          struct tun_prog __rcu *steering_prog;
> >>          struct tun_prog __rcu *filter_prog;
> >> +       struct tun_vnet_hash_container __rcu *vnet_hash;
> >>          struct ethtool_link_ksettings link_ksettings;
> >>          /* init args */
> >>          struct file *file;
> >> @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
> >>                  e->rps_rxhash = hash;
> >>   }
> >>
> >> +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb)
> >> +{
> >> +       return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
> >> +}
> >> +
> >> +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb)
> >> +{
> >> +       return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH);
> >> +}
> >> +
> >>   /* We try to identify a flow through its rxhash. The reason that
> >>    * we do not check rxq no. is because some cards(e.g 82599), chooses
> >>    * the rxq based on the txq where the last packet of the flow comes. As
> >>    * the userspace application move between processors, we may get a
> >>    * different rxq no. here.
> >>    */
> >> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
> >> +static u16 tun_automq_select_queue(struct tun_struct *tun,
> >> +                                  const struct tun_vnet_hash_container *vnet_hash,
> >> +                                  struct sk_buff *skb)
> >>   {
> >> +       struct flow_keys keys;
> >> +       struct flow_keys_basic keys_basic;
> >>          struct tun_flow_entry *e;
> >>          u32 txq, numqueues;
> >>
> >>          numqueues = READ_ONCE(tun->numqueues);
> >>
> >> -       txq = __skb_get_hash_symmetric(skb);
> >> +       memset(&keys, 0, sizeof(keys));
> >> +       skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0);
> >> +
> >> +       txq = flow_hash_from_keys(&keys);
> >>          e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
> >>          if (e) {
> >>                  tun_flow_save_rps_rxhash(e, txq);
> >> @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
> >>                  txq = reciprocal_scale(txq, numqueues);
> >>          }
> >>
> >> +       keys_basic = (struct flow_keys_basic) {
> >> +               .control = keys.control,
> >> +               .basic = keys.basic
> >> +       };
> >> +       tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq,
> >> +                            tun_add_hash);
> >> +
> >>          return txq;
> >>   }
> >>
> >> @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
> >>          u16 ret;
> >>
> >>          rcu_read_lock();
> >> -       if (rcu_dereference(tun->steering_prog))
> >> +       if (rcu_dereference(tun->steering_prog)) {
> >>                  ret = tun_ebpf_select_queue(tun, skb);
> >> -       else
> >> -               ret = tun_automq_select_queue(tun, skb);
> >> +       } else {
> >> +               struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash);
> >> +
> >> +               if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
> >> +                       ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash,
> >> +                                                       skb, tun_add_hash);
> >> +               else
> >> +                       ret = tun_automq_select_queue(tun, vnet_hash, skb);
> >> +       }
> >>          rcu_read_unlock();
> >>
> >>          return ret;
> >> @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
> >>          ssize_t ret;
> >>
> >>          if (tun->flags & IFF_VNET_HDR) {
> >> -               struct virtio_net_hdr gso = { 0 };
> >> +               struct virtio_net_hdr_v1_hash gso = { 0 };
> >>
> >>                  vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
> >>                  ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);
> >> @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun,
> >>          }
> >>
> >>          if (vnet_hdr_sz) {
> >> -               struct virtio_net_hdr gso;
> >> +               struct virtio_net_hdr_v1_hash gso;
> >>
> >> -               ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso);
> >> +               ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev,
> >> +                                           skb, tun_find_hash, &gso);
> >>                  if (ret)
> >>                          return ret;
> >>
> >> @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev)
> >>          security_tun_dev_free_security(tun->security);
> >>          __tun_set_ebpf(tun, &tun->steering_prog, NULL);
> >>          __tun_set_ebpf(tun, &tun->filter_prog, NULL);
> >> +       kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash));
> >>   }
> >>
> >>   static void tun_setup(struct net_device *dev)
> >> @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
> >>   }
> >>
> >>   static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
> >> -                       void __user *data)
> >> +                       int fd)
> >>   {
> >>          struct bpf_prog *prog;
> >> -       int fd;
> >> -
> >> -       if (copy_from_user(&fd, data, sizeof(fd)))
> >> -               return -EFAULT;
> >>
> >>          if (fd == -1) {
> >>                  prog = NULL;
> >> @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>          int ifindex;
> >>          int sndbuf;
> >>          int ret;
> >> +       int fd;
> >>          bool do_notify = false;
> >> +       struct tun_vnet_hash_container *vnet_hash;
> >>
> >>          if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
> >>              (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
> >> @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>          rtnl_lock();
> >>
> >>          tun = tun_get(tfile);
> >> -       if (cmd == TUNSETIFF) {
> >> +       switch (cmd) {
> >> +       case TUNSETIFF:
> >>                  ret = -EEXIST;
> >>                  if (tun)
> >>                          goto unlock;
> >> @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>                  if (copy_to_user(argp, &ifr, ifreq_len))
> >>                          ret = -EFAULT;
> >>                  goto unlock;
> >> -       }
> >> -       if (cmd == TUNSETIFINDEX) {
> >> +
> >> +       case TUNSETIFINDEX:
> >>                  ret = -EPERM;
> >>                  if (tun)
> >>                          goto unlock;
> >> @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>                  ret = 0;
> >>                  tfile->ifindex = ifindex;
> >>                  goto unlock;
> >> +
> >> +       case TUNGETVNETHASHCAP:
> >> +               ret = tun_vnet_ioctl_gethashcap(argp);
> >> +               goto unlock;
> >>          }
> >>
> >>          ret = -EBADFD;
> >> @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>                  break;
> >>
> >>          case TUNSETSTEERINGEBPF:
> >> -               ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
> >> +               if (get_user(fd, (int __user *)argp)) {
> >> +                       ret = -EFAULT;
> >> +                       break;
> >> +               }
> >> +
> >> +               vnet_hash = rtnl_dereference(tun->vnet_hash);
> >> +               if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
> >> +                       ret = -EBUSY;
> >> +                       break;
> >> +               }
> >> +
> >> +               ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
> >>                  break;
> >>
> >>          case TUNSETFILTEREBPF:
> >> -               ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
> >> +               if (get_user(fd, (int __user *)argp)) {
> >> +                       ret = -EFAULT;
> >> +                       break;
> >> +               }
> >> +
> >> +               ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
> >>                  break;
> >>
> >>          case TUNSETCARRIER:
> >> @@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>                  ret = open_related_ns(&net->ns, get_net_ns);
> >>                  break;
> >>
> >> +       case TUNSETVNETHASH:
> >> +               ret = tun_vnet_ioctl_sethash(&tun->vnet_hash,
> >> +                                            !rtnl_dereference(tun->steering_prog),
> >> +                                            argp);
> >> +               break;
> >> +
> >>          default:
> >> -               ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp);
> >> +               ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags,
> >> +                                    cmd, argp);
> >>                  break;
> >>          }
> >>
> >> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
> >> index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644
> >> --- a/drivers/net/tun_vnet.h
> >> +++ b/drivers/net/tun_vnet.h
> >> @@ -6,6 +6,16 @@
> >>   #define TUN_VNET_LE     0x80000000
> >>   #define TUN_VNET_BE     0x40000000
> >>
> >> +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *);
> >> +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *);
> >> +
> >> +struct tun_vnet_hash_container {
> >> +       struct tun_vnet_hash common;
> >
> > I'd rename this as hash.
> >
> >> +       struct tun_vnet_hash_rss rss;
> >> +       u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
> >> +       u16 rss_indirection_table[];
> >> +};
> >
> > Besides the separated ioctl, I'd split this structure into rss and
> > hash part as well.
> >
> >> +
> >>   static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags)
> >>   {
> >>          bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) &&
> >> @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags,
> >>          }
> >>   }
> >>
> >> +static inline long tun_vnet_ioctl_gethashcap(void __user *argp)
> >> +{
> >> +       static const struct tun_vnet_hash cap = {
> >> +               .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
> >> +               .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
> >> +       };
> >> +
> >> +       return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0;
> >
> > Let's has a consistent name for this and the uapi to be consistent
> > with TUNSETIFF/TUNGETIFF. Probably TUNSETVNETHASH and
> > tun_vnet_ioctl_gethash().
>
> They have different semantics so they should have different names.
> TUNGETIFF reports the value currently set while TUNGETVNETHASHCAP
> reports the value that can be set later.

I'm not sure I will get here. I meant a symmetric name

TUNSETVNETHASH and TUNVETVNETHASH.

>
> >
> >> +}
> >> +
> >> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container __rcu **hashp,
> >> +                                         bool can_rss, void __user *argp)
> >
> > So again, can_rss seems to be tricky. Looking at its caller, it tires
> > to make eBPF and RSS mutually exclusive. I still don't understand why
> > we need this. Allow eBPF program to override some of the path seems to
> > be common practice.
> >
> > What's more, we didn't try (or even can't) to make automq and eBPF to
> > be mutually exclusive. So I still didn't see what we gain from this
> > and it complicates the codes and may lead to ambiguous uAPI/behaviour.
>
> automq and eBPF are mutually exclusive; automq is disabled when an eBPF
> steering program is set so I followed the example here.

I meant from the view of uAPI, the kernel doesn't or can't reject eBPF
while using automq.

>
> We don't even have an interface for eBPF to let it fall back to another
> alogirhtm.

It doesn't even need this, e.g XDP overrides the default receiving path.

> I could make it fall back to RSS if the eBPF steeering
> program is designed to fall back to automq when it returns e.g., -1. But
> such an interface is currently not defined and defining one is out of
> scope of this patch series.

Just to make sure we are on the same page, I meant we just need to
make the behaviour consistent: allow eBPF to override the behaviour of
both automq and rss.

>
> >
> >> +{
> >> +       struct tun_vnet_hash hash_buf;
> >> +       struct tun_vnet_hash_container *hash;
> >> +
> >> +       if (copy_from_user(&hash_buf, argp, sizeof(hash_buf)))
> >> +               return -EFAULT;
> >> +       argp = (struct tun_vnet_hash __user *)argp + 1;
> >> +
> >> +       if (hash_buf.flags & TUN_VNET_HASH_RSS) {
> >> +               struct tun_vnet_hash_rss rss;
> >> +               size_t indirection_table_size;
> >> +               size_t key_size;
> >> +               size_t size;
> >> +
> >> +               if (!can_rss)
> >> +                       return -EBUSY;
> >> +
> >> +               if (copy_from_user(&rss, argp, sizeof(rss)))
> >> +                       return -EFAULT;
> >> +               argp = (struct tun_vnet_hash_rss __user *)argp + 1;
> >> +
> >> +               indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2;
> >> +               key_size = virtio_net_hash_key_length(hash_buf.types);
> >> +               size = struct_size(hash, rss_indirection_table,
> >> +                                  (size_t)rss.indirection_table_mask + 1);
> >> +
> >> +               hash = kmalloc(size, GFP_KERNEL);
> >> +               if (!hash)
> >> +                       return -ENOMEM;
> >> +
> >> +               if (copy_from_user(hash->rss_indirection_table,
> >> +                                  argp, indirection_table_size)) {
> >> +                       kfree(hash);
> >> +                       return -EFAULT;
> >> +               }
> >> +               argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
> >> +
> >> +               if (copy_from_user(hash->rss_key, argp, key_size)) {
> >> +                       kfree(hash);
> >> +                       return -EFAULT;
> >> +               }
> >> +
> >> +               virtio_net_toeplitz_convert_key(hash->rss_key, key_size);
> >> +               hash->rss = rss;
> >> +       } else {
> >> +               hash = kmalloc(sizeof(hash->common), GFP_KERNEL);
> >> +               if (!hash)
> >> +                       return -ENOMEM;
> >
> > Do we need to validate the hash here (at least against the types we supported?)
> >
> >> +       }
> >> +
> >> +       hash->common = hash_buf;
> >> +       kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash));
> >
> > I still didn't understand the trick here. E.g we use very simple
> > primitives in synchronizing ebpf program through RCU in
> > __tun_set_ebpf().
>
> It is even simpler than __tun_set_ebpf(). The differences from
> __tun_set_ebpf() are:
> 1. This uses the rtnl lock instead of TUN-specific one. It makes the
> code simpler as the rtnl lock is already taken in __tun_chr_ioctl().

It can be tweaked to use rtnl as well.

> 2. This does not add rcu_head and uses blocking APIs for simplicity.

Right.

>
> >
> >> +       return 0;
> >> +}
> >> +
> >> +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash,
> >> +                                struct sk_buff *skb,
> >> +                                const struct flow_keys_basic *keys,
> >> +                                u32 value,
> >> +                                tun_vnet_hash_add vnet_hash_add)
> >> +{
> >> +       struct virtio_net_hash *report;
> >> +
> >> +       if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT))
> >> +               return;
> >> +
> >> +       report = vnet_hash_add(skb);
> >> +       if (!report)
> >> +               return;
> >> +
> >> +       *report = (struct virtio_net_hash) {
> >> +               .report = virtio_net_hash_report(hash->common.types, keys),
> >> +               .value = value
> >> +       };
> >
> > What's the advantage of using Designated Initializers here? Simple
> > assignment can save two lines of code.
>
> It automatically fills other fileds with zero. Simple assignments will
> need more tokens for zeroing.

Ok.

>
> >
> >> +}
> >> +
> >> +static u16 tun_vnet_rss_select_queue(u32 numqueues,
> >> +                                    const struct tun_vnet_hash_container *hash,
> >> +                                    struct sk_buff *skb,
> >> +                                    tun_vnet_hash_add vnet_hash_add)
> >> +{
> >> +       struct virtio_net_hash *report;
> >> +       struct virtio_net_hash ret;
> >> +       u16 txq, index;
> >> +
> >> +       if (!numqueues)
> >> +               return 0;
> >> +
> >> +       virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret);
> >> +
> >> +       if (!ret.report)
> >> +               return hash->rss.unclassified_queue % numqueues;
> >> +
> >> +       if (hash->common.flags & TUN_VNET_HASH_REPORT) {
> >> +               report = vnet_hash_add(skb);
> >> +               if (report)
> >> +                       *report = ret;
> >> +       }
> >
> > Is there a chance that we can reach here without TUN_VNET_HASH_REPORT?
> > If yes, it should be a bug.
>
> It is possible to use RSS without TUN_VNET_HASH_REPORT.

Another call to separate the ioctls then.

> It is more of a
> feature instead of a bug; it behaves like QEMU's eBPF program but
> requires no privilege and is more optimized with native code and ffs().
>
> >
> >> +
> >> +       index = ret.value & hash->rss.indirection_table_mask;
> >> +       txq = READ_ONCE(hash->rss_indirection_table[index]);
> >
> > So vnet_hash is accessed via rcu_dereference(), I don't get any reason
> > we need READ_ONCE here, is this paired with something? If yes, let's
> > add a comment here. If rss_indirection_table need why
> > indirection_table_mask doesn't need this?
>
> I'll drop it. I think it's just a left-over of previous versions without
> RCU.
>
> >
> >> +
> >> +       return txq % numqueues;
> >> +}
> >> +
> >>   static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
> >>                                     struct iov_iter *from,
> >>                                     struct virtio_net_hdr *hdr)
> >> @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
> >>   }
> >>
> >>   static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
> >> -                                  const struct virtio_net_hdr *hdr)
> >> +                                  const struct virtio_net_hdr_v1_hash *hdr)
> >>   {
> >
> > To be more robust, we can tweak the function to accept a vnet_hdr_len
> > parameter then we can avoid touching this every time when we need to
> > extend vnet hdr in the future?
>
> I think you meant vnet_hdr_sz instead of vnet_hdr_len. It is already
> passed just as "sz" here as the function name already says it's about
> the header.
>
> It is possible to add another parameter for sizeof(*hdr) and convert the
> hdr parameter to void * to avoid future changes. But I rather keep it as
> is because the current form ensures the hdr is large enough and
> statically avoids buffer overrun.

Right.

>
> >
> >> +       int content_sz = MIN(sizeof(*hdr), sz);
> >> +
> >>          if (unlikely(iov_iter_count(iter) < sz))
> >>                  return -EINVAL;
> >>
> >> -       if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)))
> >> +       if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz))
> >>                  return -EFAULT;
> >>
> >> -       if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
> >> +       if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz)
> >>                  return -EFAULT;
> >>
> >>          return 0;
> >> @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb,
> >>          return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags));
> >>   }
> >>
> >> -static inline int tun_vnet_hdr_from_skb(unsigned int flags,
> >> +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags,
> >>                                          const struct net_device *dev,
> >>                                          const struct sk_buff *skb,
> >> -                                       struct virtio_net_hdr *hdr)
> >> +                                       tun_vnet_hash_find vnet_hash_find,
> >> +                                       struct virtio_net_hdr_v1_hash *hdr)
> >>   {
> >>          int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
> >> +       const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ?
> >> +                                              NULL : vnet_hash_find(skb);
> >> +
> >> +       *hdr = (struct virtio_net_hdr_v1_hash) {
> >> +               .hash_report = VIRTIO_NET_HASH_REPORT_NONE
> >> +       };
> >> +
> >> +       if (report) {
> >> +               hdr->hash_value = cpu_to_le32(report->value);
> >> +               hdr->hash_report = cpu_to_le16(report->report);
> >> +       }
> >>
> >> -       if (virtio_net_hdr_from_skb(skb, hdr,
> >> +       if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr,
> >>                                      tun_vnet_is_little_endian(flags), true,
> >>                                      vlan_hlen)) {
> >>                  struct skb_shared_info *sinfo = skb_shinfo(skb);
> >>
> >>                  if (net_ratelimit()) {
> >>                          netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
> >> -                                  sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size),
> >> -                                  tun_vnet16_to_cpu(flags, hdr->hdr_len));
> >> +                                  sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size),
> >> +                                  tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len));
> >>                          print_hex_dump(KERN_ERR, "tun: ",
> >>                                         DUMP_PREFIX_NONE,
> >>                                         16, 1, skb->head,
> >> -                                      min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true);
> >> +                                      min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true);
> >>                  }
> >>                  WARN_ON_ONCE(1);
> >>                  return -EINVAL;
> >> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
> >> index 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f 100644
> >> --- a/include/linux/if_tap.h
> >> +++ b/include/linux/if_tap.h
> >> @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
> >>   #define MAX_TAP_QUEUES 256
> >>
> >>   struct tap_queue;
> >> +struct tun_vnet_hash_container;
> >>
> >>   struct tap_dev {
> >>          struct net_device       *dev;
> >> @@ -43,6 +44,7 @@ struct tap_dev {
> >>          int                     numqueues;
> >>          netdev_features_t       tap_features;
> >>          int                     minor;
> >> +       struct tun_vnet_hash_container __rcu *vnet_hash;
> >>
> >>          void (*update_features)(struct tap_dev *tap, netdev_features_t features);
> >>          void (*count_tx_dropped)(struct tap_dev *tap);
> >> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> >> index bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f 100644
> >> --- a/include/linux/skbuff.h
> >> +++ b/include/linux/skbuff.h
> >> @@ -4842,6 +4842,9 @@ enum skb_ext_id {
> >>   #endif
> >>   #if IS_ENABLED(CONFIG_MCTP_FLOWS)
> >>          SKB_EXT_MCTP,
> >> +#endif
> >> +#if IS_ENABLED(CONFIG_TUN)
> >> +       SKB_EXT_TUN_VNET_HASH,
> >>   #endif
> >>          SKB_EXT_NUM, /* must be last */
> >>   };
> >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> >> index 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040 100644
> >> --- a/include/uapi/linux/if_tun.h
> >> +++ b/include/uapi/linux/if_tun.h
> >> @@ -62,6 +62,42 @@
> >>   #define TUNSETCARRIER _IOW('T', 226, int)
> >>   #define TUNGETDEVNETNS _IO('T', 227)
> >>
> >> +/**
> >> + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability.
> >> + *
> >> + * The argument is a pointer to &struct tun_vnet_hash which will store the
> >> + * maximal virtio_net hashing configuration.
> >> + */
> >> +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash)
> >> +
> >> +/**
> >> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing
> >> + *
> >> + * The argument is a pointer to &struct tun_vnet_hash.
> >> + *
> >> + * The argument is a pointer to the compound of the following in order if
> >> + * %TUN_VNET_HASH_RSS is set:
> >> + *
> >> + * 1. &struct tun_vnet_hash
> >> + * 2. &struct tun_vnet_hash_rss
> >> + * 3. Indirection table
> >> + * 4. Key
> >> + *
> >> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only
> >> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal
> >> + * to the size of &struct virtio_net_hdr_v1_hash.
> >
> > So you had a dependency check already for vnet hdr len. I'd still
> > suggest to split this into rss and hash as they are separated
> > features. Then we can use separate data structure for them instead of
> > a container struct.
>
> I added a dependency check and found it is complicating the code and
> requires additional tests. I need a reason to justify the complexity if
> we are going to split it.

As we discussed above: They don't dedpend on each other.

>
> Regards,
> Akihiko Odaki
>
> >
> >> + *
> >> + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will
> >> + * always be little-endian.
> >> + *
> >> + * This ioctl results in %EBADFD if the underlying device is deleted. It affects
> >> + * all queues attached to the same device.
> >> + *
> >> + * This ioctl currently has no effect on XDP packets and packets with
> >> + * queue_mapping set by TC.
> >> + */
> >> +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash)
> >> +
> >>   /* TUNSETIFF ifr flags */
> >>   #define IFF_TUN                0x0001
> >>   #define IFF_TAP                0x0002
> >> @@ -115,4 +151,43 @@ struct tun_filter {
> >>          __u8   addr[][ETH_ALEN];
> >>   };
> >>
> >> +/**
> >> + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost
> >> + */
> >> +#define TUN_VNET_HASH_REPORT   0x0001
> >> +
> >> +/**
> >> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
> >> + *
> >> + * This is mutually exclusive with eBPF steering program.
> >> + */
> >> +#define TUN_VNET_HASH_RSS      0x0002
> >> +
> >> +/**
> >> + * struct tun_vnet_hash - virtio_net hashing configuration
> >> + * @flags:
> >> + *             Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS
> >> + * @pad:
> >> + *             Should be filled with zero before passing to %TUNSETVNETHASH
> >> + * @types:
> >> + *             Bitmask of allowed hash types
> >> + */
> >> +struct tun_vnet_hash {
> >> +       __u16 flags;
> >> +       __u8 pad[2];
> >> +       __u32 types;
> >> +};
> >
> > Padding in the middle of the structure is not elegant. Any reason for this?
> >
> > And hash->types seems never used.
> >
> >> +
> >> +/**
> >> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
> >> + * @indirection_table_mask:
> >> + *             Bitmask to be applied to the indirection table index
> >> + * @unclassified_queue:
> >> + *             The index of the queue to place unclassified packets in
> >> + */
> >> +struct tun_vnet_hash_rss {
> >> +       __u16 indirection_table_mask;
> >> +       __u16 unclassified_queue;
> >> +};
> >> +
> >>   #endif /* _UAPI__IF_TUN_H */
> >> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >> index 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb 100644
> >> --- a/net/core/skbuff.c
> >> +++ b/net/core/skbuff.c
> >> @@ -64,6 +64,7 @@
> >>   #include <linux/mpls.h>
> >>   #include <linux/kcov.h>
> >>   #include <linux/iov_iter.h>
> >> +#include <linux/virtio_net.h>
> >>
> >>   #include <net/protocol.h>
> >>   #include <net/dst.h>
> >> @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = {
> >>   #if IS_ENABLED(CONFIG_MCTP_FLOWS)
> >>          [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
> >>   #endif
> >> +#if IS_ENABLED(CONFIG_TUN)
> >> +       [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash),
> >> +#endif
> >>   };
> >>
> >>   static __always_inline unsigned int skb_ext_total_length(void)
> >>
> >> --
> >> 2.48.1
> >>
> >
> > Thanks
> >
>