Hash reporting
==============
Allow the guest to reuse the hash value to make receive steering
consistent between the host and guest, and to save hash computation.
RSS
===
RSS is a receive steering algorithm that can be negotiated for use with
virtio_net. Conventionally the hash calculation was done by the VMM.
However, computing the hash after the queue was chosen defeats the
purpose of RSS.
Another approach is to use an eBPF steering program. This approach has
another downside: it cannot report the calculated hash due to the
restrictive nature of eBPF steering programs.
Introduce code to perform RSS into the kernel in order to overcome
these challenges. An alternative solution is to extend the eBPF steering
program so that it will be able to report to the userspace, but I didn't
opt for it because it is undesirable to extend the current eBPF steering
program mechanism as is, since it relies on legacy context rewriting,
and introducing kfunc-based eBPF would result in a non-UAPI dependency
while the other relevant virtualization APIs such as KVM and vhost_net
are UAPIs.
Signed-off-by: Akihiko Odaki <akihiko.odaki@xxxxxxxxxx>
Tested-by: Lei Yang <leiyang@xxxxxxxxxx>
---
Documentation/networking/tuntap.rst | 7 ++
drivers/net/Kconfig | 1 +
drivers/net/tap.c | 68 ++++++++++++++-
drivers/net/tun.c | 98 +++++++++++++++++-----
drivers/net/tun_vnet.h | 159 ++++++++++++++++++++++++++++++++++--
include/linux/if_tap.h | 2 +
include/linux/skbuff.h | 3 +
include/uapi/linux/if_tun.h | 75 +++++++++++++++++
net/core/skbuff.c | 4 +
9 files changed, 386 insertions(+), 31 deletions(-)
diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst
index 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465 100644
--- a/Documentation/networking/tuntap.rst
+++ b/Documentation/networking/tuntap.rst
@@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it::
return ioctl(fd, TUNSETQUEUE, (void *)&ifr);
}
+3.4 Reference
+-------------
+
+``linux/if_tun.h`` defines the interface described below:
+
+.. kernel-doc:: include/uapi/linux/if_tun.h
+
Universal TUN/TAP device driver Frequently Asked Question
=========================================================
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -395,6 +395,7 @@ config TUN
tristate "Universal TUN/TAP device driver support"
depends on INET
select CRC32
+ select SKB_EXTENSIONS
help
TUN/TAP provides packet reception and transmission for user space
programs. It can be viewed as a simple Point-to-Point or Ethernet
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -49,6 +49,10 @@ struct major_info {
struct list_head next;
};
+struct tap_skb_cb {
+ struct virtio_net_hash hash;
+};
+
#define GOODCOPY_LEN 128
static const struct proto_ops tap_socket_ops;
@@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q)
sock_put(&q->sk);
}
+static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb));
+ return (struct tap_skb_cb *)skb->cb;
+}
+
+static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb)
+{
+ return &tap_skb_cb(skb)->hash;
+}
+
+static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb)
+{
+ return &tap_skb_cb(skb)->hash;
+}
+
/*
* Select a queue based on the rxq of the device on which this packet
* arrived. If the incoming device is not mq, calculate a flow hash
@@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q)
static struct tap_queue *tap_get_queue(struct tap_dev *tap,
struct sk_buff *skb)
{
+ struct flow_keys_basic keys_basic;
struct tap_queue *queue = NULL;
/* Access to taps array is protected by rcu, but access to numvtaps
* isn't. Below we use it to lookup a queue, but treat it as a hint
@@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap,
* racing against queue removal.
*/
int numvtaps = READ_ONCE(tap->numvtaps);
+ struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash);
__u32 rxq;
+ *tap_skb_cb(skb) = (struct tap_skb_cb) {
+ .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE }
+ };
+
if (!numvtaps)
goto out;
if (numvtaps == 1)
goto single;
+ if (vnet_hash) {
+ if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
+ rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash);
+ queue = rcu_dereference(tap->taps[rxq]);
+ goto out;
+ }
+
+ if (!skb->l4_hash && !skb->sw_hash) {
+ struct flow_keys keys;
+
+ skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+ rxq = flow_hash_from_keys(&keys);
+ keys_basic = (struct flow_keys_basic) {
+ .control = keys.control,
+ .basic = keys.basic
+ };
+ } else {
+ skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+ rxq = skb->hash;
+ }
+ } else {
+ rxq = skb_get_hash(skb);
+ }
+
/* Check if we can use flow to select a queue */
- rxq = skb_get_hash(skb);
if (rxq) {
+ tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash);
queue = rcu_dereference(tap->taps[rxq % numvtaps]);
goto out;
}
@@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q,
int total;
if (q->flags & IFF_VNET_HDR) {
- struct virtio_net_hdr vnet_hdr;
+ struct virtio_net_hdr_v1_hash vnet_hdr;
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
- ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr);
+ ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb,
+ tap_find_hash, &vnet_hdr);
if (ret)
return ret;
@@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
rtnl_unlock();
return ret;
+ case TUNGETVNETHASHCAP:
+ return tun_vnet_ioctl_gethashcap(argp);
+
+ case TUNSETVNETHASH:
+ rtnl_lock();
+ tap = rtnl_dereference(q->tap);
+ ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) : -EBADFD;
+ rtnl_unlock();
+ return ret;
+
case SIOCGIFHWADDR:
rtnl_lock();
tap = tap_get_tap_dev(q);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -209,6 +209,7 @@ struct tun_struct {
struct bpf_prog __rcu *xdp_prog;
struct tun_prog __rcu *steering_prog;
struct tun_prog __rcu *filter_prog;
+ struct tun_vnet_hash_container __rcu *vnet_hash;
struct ethtool_link_ksettings link_ksettings;
/* init args */
struct file *file;
@@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
e->rps_rxhash = hash;
}
+static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb)
+{
+ return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
+}
+
+static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb)
+{
+ return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH);
+}
+
/* We try to identify a flow through its rxhash. The reason that
* we do not check rxq no. is because some cards(e.g 82599), chooses
* the rxq based on the txq where the last packet of the flow comes. As
* the userspace application move between processors, we may get a
* different rxq no. here.
*/
-static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
+static u16 tun_automq_select_queue(struct tun_struct *tun,
+ const struct tun_vnet_hash_container *vnet_hash,
+ struct sk_buff *skb)
{
+ struct flow_keys keys;
+ struct flow_keys_basic keys_basic;
struct tun_flow_entry *e;
u32 txq, numqueues;
numqueues = READ_ONCE(tun->numqueues);
- txq = __skb_get_hash_symmetric(skb);
+ memset(&keys, 0, sizeof(keys));
+ skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0);
+
+ txq = flow_hash_from_keys(&keys);
e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
if (e) {
tun_flow_save_rps_rxhash(e, txq);
@@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
txq = reciprocal_scale(txq, numqueues);
}
+ keys_basic = (struct flow_keys_basic) {
+ .control = keys.control,
+ .basic = keys.basic
+ };
+ tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq,
+ tun_add_hash);
+
return txq;
}
@@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
u16 ret;
rcu_read_lock();
- if (rcu_dereference(tun->steering_prog))
+ if (rcu_dereference(tun->steering_prog)) {
ret = tun_ebpf_select_queue(tun, skb);
- else
- ret = tun_automq_select_queue(tun, skb);
+ } else {
+ struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash);
+
+ if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
+ ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash,
+ skb, tun_add_hash);
+ else
+ ret = tun_automq_select_queue(tun, vnet_hash, skb);
+ }
rcu_read_unlock();
return ret;
@@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
ssize_t ret;
if (tun->flags & IFF_VNET_HDR) {
- struct virtio_net_hdr gso = { 0 };
+ struct virtio_net_hdr_v1_hash gso = { 0 };
vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);
@@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun,
}
if (vnet_hdr_sz) {
- struct virtio_net_hdr gso;
+ struct virtio_net_hdr_v1_hash gso;
- ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso);
+ ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev,
+ skb, tun_find_hash, &gso);
if (ret)
return ret;
@@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev)
security_tun_dev_free_security(tun->security);
__tun_set_ebpf(tun, &tun->steering_prog, NULL);
__tun_set_ebpf(tun, &tun->filter_prog, NULL);
+ kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash));
}
static void tun_setup(struct net_device *dev)
@@ -2921,13 +2955,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
}
static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
- void __user *data)
+ int fd)
{
struct bpf_prog *prog;
- int fd;
-
- if (copy_from_user(&fd, data, sizeof(fd)))
- return -EFAULT;
if (fd == -1) {
prog = NULL;
@@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
int ifindex;
int sndbuf;
int ret;
+ int fd;
bool do_notify = false;
+ struct tun_vnet_hash_container *vnet_hash;
if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
(_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
@@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
rtnl_lock();
tun = tun_get(tfile);
- if (cmd == TUNSETIFF) {
+ switch (cmd) {
+ case TUNSETIFF:
ret = -EEXIST;
if (tun)
goto unlock;
@@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
if (copy_to_user(argp, &ifr, ifreq_len))
ret = -EFAULT;
goto unlock;
- }
- if (cmd == TUNSETIFINDEX) {
+
+ case TUNSETIFINDEX:
ret = -EPERM;
if (tun)
goto unlock;
@@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = 0;
tfile->ifindex = ifindex;
goto unlock;
+
+ case TUNGETVNETHASHCAP:
+ ret = tun_vnet_ioctl_gethashcap(argp);
+ goto unlock;
}
ret = -EBADFD;
@@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
break;
case TUNSETSTEERINGEBPF:
- ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
+ if (get_user(fd, (int __user *)argp)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ vnet_hash = rtnl_dereference(tun->vnet_hash);
+ if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
+ ret = -EBUSY;
+ break;
+ }
+
+ ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
break;
case TUNSETFILTEREBPF:
- ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
+ if (get_user(fd, (int __user *)argp)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
break;
case TUNSETCARRIER:
@@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = open_related_ns(&net->ns, get_net_ns);
break;
+ case TUNSETVNETHASH:
+ ret = tun_vnet_ioctl_sethash(&tun->vnet_hash,
+ !rtnl_dereference(tun->steering_prog),
+ argp);
+ break;
+
default:
- ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp);
+ ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags,
+ cmd, argp);
break;
}
diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
index 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed 100644
--- a/drivers/net/tun_vnet.h
+++ b/drivers/net/tun_vnet.h
@@ -6,6 +6,16 @@
#define TUN_VNET_LE 0x80000000
#define TUN_VNET_BE 0x40000000
+typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *);
+typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *);
+
+struct tun_vnet_hash_container {
+ struct tun_vnet_hash common;