On 25/08/2022 16:46, Eyal Birger wrote: > This commit adds support for 'collect_md' mode on xfrm interfaces. > > Each net can have one collect_md device, created by providing the > IFLA_XFRM_COLLECT_METADATA flag at creation. This device cannot be > altered and has no if_id or link device attributes. > > On transmit to this device, the if_id is fetched from the attached dst > metadata on the skb. If exists, the link property is also fetched from > the metadata. The dst metadata type used is METADATA_XFRM which holds > these properties. > > On the receive side, xfrmi_rcv_cb() populates a dst metadata for each > packet received and attaches it to the skb. The if_id used in this case is > fetched from the xfrm state, and the link is fetched from the incoming > device. This information can later be used by upper layers such as tc, > ebpf, and ip rules. > > Because the skb is scrubed in xfrmi_rcv_cb(), the attachment of the dst > metadata is postponed until after scrubing. Similarly, xfrm_input() is > adapted to avoid dropping metadata dsts by only dropping 'valid' > (skb_valid_dst(skb) == true) dsts. > > Policy matching on packets arriving from collect_md xfrmi devices is > done by using the xfrm state existing in the skb's sec_path. > The xfrm_if_cb.decode_cb() interface implemented by xfrmi_decode_session() > is changed to keep the details of the if_id extraction tucked away > in xfrm_interface.c. > > Signed-off-by: Eyal Birger <eyal.birger@xxxxxxxxx> > > ---- > > v2: > - add "link" property as suggested by Nicolas Dichtel > - rename xfrm_if_decode_session_params to xfrm_if_decode_session_result > --- (+CC Daniel) Hi, Generally I really like the idea, but I missed to comment the first round. :) A few comments below.. > include/net/xfrm.h | 11 +++- > include/uapi/linux/if_link.h | 1 + > net/xfrm/xfrm_input.c | 7 +- > net/xfrm/xfrm_interface.c | 120 +++++++++++++++++++++++++++++------ > net/xfrm/xfrm_policy.c | 10 +-- > 5 files changed, 121 insertions(+), 28 deletions(-) > > diff --git a/include/net/xfrm.h b/include/net/xfrm.h > index 6e8fa98f786f..28b988577ed2 100644 > --- a/include/net/xfrm.h > +++ b/include/net/xfrm.h > @@ -312,9 +312,15 @@ struct km_event { > struct net *net; > }; > > +struct xfrm_if_decode_session_result { > + struct net *net; > + u32 if_id; > +}; > + > struct xfrm_if_cb { > - struct xfrm_if *(*decode_session)(struct sk_buff *skb, > - unsigned short family); > + bool (*decode_session)(struct sk_buff *skb, > + unsigned short family, > + struct xfrm_if_decode_session_result *res); > }; > > void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); > @@ -985,6 +991,7 @@ void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); > struct xfrm_if_parms { > int link; /* ifindex of underlying L2 interface */ > u32 if_id; /* interface identifyer */ > + bool collect_md; > }; > > struct xfrm_if { > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > index e36d9d2c65a7..d96f13a42589 100644 > --- a/include/uapi/linux/if_link.h > +++ b/include/uapi/linux/if_link.h > @@ -694,6 +694,7 @@ enum { > IFLA_XFRM_UNSPEC, > IFLA_XFRM_LINK, > IFLA_XFRM_IF_ID, > + IFLA_XFRM_COLLECT_METADATA, > __IFLA_XFRM_MAX > }; > > diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c > index 144238a50f3d..25e822fb5771 100644 > --- a/net/xfrm/xfrm_input.c > +++ b/net/xfrm/xfrm_input.c > @@ -20,6 +20,7 @@ > #include <net/xfrm.h> > #include <net/ip_tunnels.h> > #include <net/ip6_tunnel.h> > +#include <net/dst_metadata.h> > > #include "xfrm_inout.h" > > @@ -720,7 +721,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) > sp = skb_sec_path(skb); > if (sp) > sp->olen = 0; > - skb_dst_drop(skb); > + if (skb_valid_dst(skb)) > + skb_dst_drop(skb); > gro_cells_receive(&gro_cells, skb); > return 0; > } else { > @@ -738,7 +740,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) > sp = skb_sec_path(skb); > if (sp) > sp->olen = 0; > - skb_dst_drop(skb); > + if (skb_valid_dst(skb)) > + skb_dst_drop(skb); > gro_cells_receive(&gro_cells, skb); > return err; > } > diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c > index 5113fa0fbcee..389d8be12801 100644 > --- a/net/xfrm/xfrm_interface.c > +++ b/net/xfrm/xfrm_interface.c > @@ -41,6 +41,7 @@ > #include <net/addrconf.h> > #include <net/xfrm.h> > #include <net/net_namespace.h> > +#include <net/dst_metadata.h> > #include <net/netns/generic.h> > #include <linux/etherdevice.h> > > @@ -56,6 +57,7 @@ static const struct net_device_ops xfrmi_netdev_ops; > struct xfrmi_net { > /* lists for storing interfaces in use */ > struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; > + struct xfrm_if __rcu *collect_md_xfrmi; > }; > > #define for_each_xfrmi_rcu(start, xi) \ > @@ -77,17 +79,23 @@ static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) > return xi; > } > > + xi = rcu_dereference(xfrmn->collect_md_xfrmi); > + if (xi && xi->dev->flags & IFF_UP) minor nit: xi && (xi->dev->flags & IFF_UP) > + return xi; > + > return NULL; > } > > -static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, > - unsigned short family) > +static bool xfrmi_decode_session(struct sk_buff *skb, > + unsigned short family, > + struct xfrm_if_decode_session_result *res) > { > struct net_device *dev; > + struct xfrm_if *xi; > int ifindex = 0; > > if (!secpath_exists(skb) || !skb->dev) > - return NULL; > + return false; > > switch (family) { > case AF_INET6: > @@ -107,11 +115,18 @@ static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, > } > > if (!dev || !(dev->flags & IFF_UP)) > - return NULL; > + return false; > if (dev->netdev_ops != &xfrmi_netdev_ops) > - return NULL; > + return false; > > - return netdev_priv(dev); > + xi = netdev_priv(dev); > + res->net = xi->net; > + > + if (xi->p.collect_md) > + res->if_id = xfrm_input_state(skb)->if_id; > + else > + res->if_id = xi->p.if_id; > + return true; > } > > static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) > @@ -157,7 +172,10 @@ static int xfrmi_create(struct net_device *dev) > if (err < 0) > goto out; > > - xfrmi_link(xfrmn, xi); > + if (xi->p.collect_md) > + rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); > + else > + xfrmi_link(xfrmn, xi); > > return 0; > > @@ -185,7 +203,10 @@ static void xfrmi_dev_uninit(struct net_device *dev) > struct xfrm_if *xi = netdev_priv(dev); > struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); > > - xfrmi_unlink(xfrmn, xi); > + if (xi->p.collect_md) > + rcu_assign_pointer(xfrmn->collect_md_xfrmi, NULL); RCU_INIT_POINTER() > + else > + xfrmi_unlink(xfrmn, xi); > } > > static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) > @@ -214,6 +235,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) > struct xfrm_state *x; > struct xfrm_if *xi; > bool xnet; > + int link; > > if (err && !secpath_exists(skb)) > return 0; > @@ -224,6 +246,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) > if (!xi) > return 1; > > + link = skb->dev->ifindex; > dev = xi->dev; > skb->dev = dev; > > @@ -254,6 +277,17 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) > } > > xfrmi_scrub_packet(skb, xnet); > + if (xi->p.collect_md) { > + struct metadata_dst *md_dst; > + > + md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); > + if (!md_dst) > + return -ENOMEM; > + > + md_dst->u.xfrm_info.if_id = x->if_id; > + md_dst->u.xfrm_info.link = link; > + skb_dst_set(skb, (struct dst_entry *)md_dst); > + } > dev_sw_netstats_rx_add(dev, skb->len); > > return 0; > @@ -269,10 +303,24 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) > struct net_device *tdev; > struct xfrm_state *x; > int err = -1; > + u32 if_id; > int mtu; > > + if (xi->p.collect_md) { > + struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); > + > + if (unlikely(!md_info)) > + return -EINVAL; > + > + if_id = md_info->if_id; > + if (md_info->link) > + fl->flowi_oif = md_info->link; > + } else { > + if_id = xi->p.if_id; > + } > + > dst_hold(dst); > - dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id); > + dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); > if (IS_ERR(dst)) { > err = PTR_ERR(dst); > dst = NULL; > @@ -283,7 +331,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) > if (!x) > goto tx_err_link_failure; > > - if (x->if_id != xi->p.if_id) > + if (x->if_id != if_id) > goto tx_err_link_failure; > > tdev = dst->dev; > @@ -633,6 +681,9 @@ static void xfrmi_netlink_parms(struct nlattr *data[], > > if (data[IFLA_XFRM_IF_ID]) > parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); > + > + if (data[IFLA_XFRM_COLLECT_METADATA]) > + parms->collect_md = true; > } > > static int xfrmi_newlink(struct net *src_net, struct net_device *dev, > @@ -645,14 +696,27 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, > int err; > > xfrmi_netlink_parms(data, &p); > - if (!p.if_id) { > - NL_SET_ERR_MSG(extack, "if_id must be non zero"); > - return -EINVAL; > - } > + if (p.collect_md) { > + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); > > - xi = xfrmi_locate(net, &p); > - if (xi) > - return -EEXIST; > + if (p.link || p.if_id) { > + NL_SET_ERR_MSG(extack, "link and if_id must be zero"); > + return -EINVAL; > + } > + > + if (rtnl_dereference(xfrmn->collect_md_xfrmi)) > + return -EEXIST; > + > + } else { > + if (!p.if_id) { > + NL_SET_ERR_MSG(extack, "if_id must be non zero"); > + return -EINVAL; > + } > + > + xi = xfrmi_locate(net, &p); > + if (xi) > + return -EEXIST; > + } > > xi = netdev_priv(dev); > xi->p = p; > @@ -682,12 +746,19 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], > return -EINVAL; > } > > + if (p.collect_md) { > + NL_SET_ERR_MSG(extack, "collect_md can't be changed"); > + return -EINVAL; > + } > + > xi = xfrmi_locate(net, &p); > if (!xi) { > xi = netdev_priv(dev); > } else { > if (xi->dev != dev) > return -EEXIST; > + if (xi->p.collect_md) > + return -EINVAL; > } > > return xfrmi_update(xi, &p); > @@ -700,6 +771,8 @@ static size_t xfrmi_get_size(const struct net_device *dev) > nla_total_size(4) + > /* IFLA_XFRM_IF_ID */ > nla_total_size(4) + > + /* IFLA_XFRM_COLLECT_METADATA */ > + nla_total_size(0) + > 0; > } > > @@ -711,6 +784,10 @@ static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) > if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || > nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id)) > goto nla_put_failure; > + if (xi->p.collect_md) { > + if (nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA)) minor: these can be combined > + goto nla_put_failure; > + } > return 0; > > nla_put_failure: > @@ -725,8 +802,10 @@ static struct net *xfrmi_get_link_net(const struct net_device *dev) > } > > static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { > - [IFLA_XFRM_LINK] = { .type = NLA_U32 }, > - [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, > + [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, > + [IFLA_XFRM_LINK] = { .type = NLA_U32 }, link is signed, so s32 > + [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, > + [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, > }; > > static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { > @@ -762,6 +841,9 @@ static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) > xip = &xi->next) > unregister_netdevice_queue(xi->dev, &list); > } > + xi = rcu_dereference(xfrmn->collect_md_xfrmi); rtnl_dereference() > + if (xi) > + unregister_netdevice_queue(xi->dev, &list); > } > unregister_netdevice_many(&list); > rtnl_unlock(); > diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c > index f1a0bab920a5..70376f3fe84a 100644 > --- a/net/xfrm/xfrm_policy.c > +++ b/net/xfrm/xfrm_policy.c > @@ -3516,17 +3516,17 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, > int xerr_idx = -1; > const struct xfrm_if_cb *ifcb; > struct sec_path *sp; > - struct xfrm_if *xi; > u32 if_id = 0; > > rcu_read_lock(); > ifcb = xfrm_if_get_cb(); > > if (ifcb) { > - xi = ifcb->decode_session(skb, family); > - if (xi) { > - if_id = xi->p.if_id; > - net = xi->net; > + struct xfrm_if_decode_session_result r; > + > + if (ifcb->decode_session(skb, family, &r)) { > + if_id = r.if_id; > + net = r.net; > } > } > rcu_read_unlock();