The NetDIM library, currently used by a range of NIC drivers, delivers excellent acceleration benefits. Nevertheless, NICs vary significantly in their DIM profile list requirements. Specifically, virtio-net backends may be implemented by diverse software or hardware devices, making a one-size-fits-all parameter list impractical. On Alibaba Cloud, the virtio DPU's performance under the default DIM profile falls short of expectations, partly due to a mismatch in parameter configuration. I also noticed that ice/idpf/ena and other NICs use customized profile lists or place some restrictions on DIM capabilities. Motivated by this, this patch adds a new sysfs attribute, dim_profs, that provides per-device control to query and modify a device's interrupt moderation parameters (its DIM profile list). Usage ======== 1. Query the currently customized list of the device $ cat dim_profs The profiles of (RX, EQE): {.usec = 1, .pkts = 256, .comps = 0,}, {.usec = 8, .pkts = 256, .comps = 0,}, {.usec = 64, .pkts = 256, .comps = 0,}, {.usec = 128, .pkts = 256, .comps = 0,}, {.usec = 256, .pkts = 256, .comps = 0,} The profiles of (TX, EQE): {.usec = 1, .pkts = 256, .comps = 0,}, {.usec = 2, .pkts = 256, .comps = 0,}, {.usec = 3, .pkts = 256, .comps = 0,}, {.usec = 4, .pkts = 256, .comps = 0,}, {.usec = 5, .pkts = 256, .comps = 0,} 2. Tune $ echo "RX EQE 8,8,0 16,16,0 32,32,0 64,64,0 128,128,0" > dim_profs $ echo " TX EQE 0,2,0 1,3,0 2,4,0 3,5,0 4,6,0 " > dim_profs $ cat dim_profs The profiles of (RX, EQE): {.usec = 8, .pkts = 8, .comps = 0,}, {.usec = 16, .pkts = 16, .comps = 0,}, {.usec = 32, .pkts = 32, .comps = 0,}, {.usec = 64, .pkts = 64, .comps = 0,}, {.usec = 128, .pkts = 128, .comps = 0,} The profiles of (TX, EQE): {.usec = 0, .pkts = 2, .comps = 0,}, {.usec = 1, .pkts = 3, .comps = 0,}, {.usec = 2, .pkts = 4, .comps = 0,}, {.usec = 3, .pkts = 5, .comps = 0,}, {.usec = 4, .pkts = 6, .comps = 0,} 3. 
Warn If the device does not implement .ndo_dim_moder_get, reading dim_profs returns the following message: "profile is default and not customized by the device." If the device does not implement .ndo_dim_moder_set, writing to dim_profs fails with -EINVAL. Signed-off-by: Heng Qi <hengqi@xxxxxxxxxxxxxxxxx> --- Documentation/ABI/testing/sysfs-class-net | 17 +++ include/linux/dim.h | 7 ++ include/linux/netdevice.h | 35 ++++++ lib/dim/net_dim.c | 6 -- net/core/net-sysfs.c | 172 ++++++++++++++++++++++++++++++ 5 files changed, 231 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index ebf21be..1e4faa8 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -352,3 +352,20 @@ Description: 0 threaded mode disabled for this dev 1 threaded mode enabled for this dev == ================================== + +What: /sys/class/net/<iface>/dim_profs +Date: Mar 2024 +KernelVersion: 6.8 +Contact: netdev@xxxxxxxxxxxxxxx +Description: + String value to control the profile list of DIM per device. Users can + set this value to tune the profile list for RX/TX direction and EQE/CQE + mode respectively. + + Possible values: + ================================================ ========================== + RX EQE 1,1,0 2,2,0 3,3,0 4,4,0 5,5,0 tune RX + EQE profile list + RX CQE 8,8,0 16,16,0 32,32,0 64,64,0 128,128,0 tune RX + CQE profile list + TX EQE 16,8,0 2,16,0 16,8,0 32,64,0 128,64,0 tune TX + EQE profile list + TX CQE 8,5,0 8,16,0 32,12,0 128,64,0 256,128,0 tune TX + CQE profile list + ================================================ ========================== diff --git a/include/linux/dim.h b/include/linux/dim.h index f343bc9..43398f5 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -10,6 +10,13 @@ #include <linux/types.h> #include <linux/workqueue.h> +/* Number of DIM profiles and period mode. 
*/ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + /* * Number of events between DIM iterations. * Causes a moderation of the algorithm run. diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c6f6ac7..bc2f3ac 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -49,6 +49,7 @@ #include <uapi/linux/netdev.h> #include <linux/hashtable.h> #include <linux/rbtree.h> +#include <linux/dim.h> #include <net/net_trackers.h> #include <net/net_debug.h> #include <net/dropreason-core.h> @@ -998,6 +999,27 @@ struct netdev_net_notifier { struct notifier_block *nb; }; +enum dim_direction { + DIM_RX_DIRECTION = 0x0, + DIM_TX_DIRECTION = 0x1, + DIM_NUM_DIRECTIONS +}; +/** + * struct dim_profs_list - Structure for dim sysfs configuration. + * Used to exchange profile list between the sysfs and the driver. + * + * @direction: RX or TX dim information + * @mode: CQ period count mode (from CQE/EQE) + * @num: the number of profs array + * @profs: dim profile list + */ +struct dim_profs_list { + u8 direction; + u8 mode; + u8 num; + struct dim_cq_moder profs[]; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1351,6 +1373,14 @@ struct netdev_net_notifier { * struct kernel_hwtstamp_config *kernel_config, * struct netlink_ext_ack *extack); * Change the hardware timestamping parameters for NIC device. + * + * int (*ndo_dim_moder_get)(struct net_device *dev, + * struct dim_profs_list *list); + * Get dim profiles list from the NIC device. + * + * int (*ndo_dim_moder_set)(struct net_device *dev, + * struct dim_profs_list *list); + * Configure dim profiles list for the NIC device. 
*/ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1595,6 +1625,11 @@ struct net_device_ops { int (*ndo_hwtstamp_set)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config, struct netlink_ext_ack *extack); + int (*ndo_dim_moder_get)(struct net_device *dev, + struct dim_profs_list *list); + + int (*ndo_dim_moder_set)(struct net_device *dev, + struct dim_profs_list *list); }; /** diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c index 4e32f7a..67d5beb 100644 --- a/lib/dim/net_dim.c +++ b/lib/dim/net_dim.c @@ -11,12 +11,6 @@ * There are different set of profiles for RX/TX CQs. * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES */ -#define NET_DIM_PARAMS_NUM_PROFILES 5 -#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256 -#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128 -#define NET_DIM_DEF_PROFILE_CQE 1 -#define NET_DIM_DEF_PROFILE_EQE 1 - #define NET_DIM_RX_EQE_PROFILES { \ {.usec = 1, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \ {.usec = 8, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e3d7a8c..801cb07 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -23,6 +23,7 @@ #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> +#include <linux/dim.h> #include <net/netdev_rx_queue.h> #include <net/rps.h> @@ -638,6 +639,176 @@ static ssize_t threaded_store(struct device *dev, } static DEVICE_ATTR_RW(threaded); +static struct dim_profs_list *parse_dim_profs(const char *buf, ssize_t len) +{ + int i, ret, size, totlen = 0, retlen = 0; + char direction[3], period_mode[4]; + struct dim_profs_list *list; + + size = sizeof(*list) + NET_DIM_PARAMS_NUM_PROFILES * sizeof(struct dim_cq_moder); + list = kzalloc(size, GFP_KERNEL); + if (!list) + goto err_list; + + list->num = NET_DIM_PARAMS_NUM_PROFILES; + + ret = sscanf(buf, "%2s %3s%n", direction, period_mode, &retlen); + if (ret != 2) + goto err_parse; + + if (!strcasecmp(direction, "RX")) + 
list->direction = DIM_RX_DIRECTION; + else if (!strcasecmp(direction, "TX")) + list->direction = DIM_TX_DIRECTION; + else + goto err_parse; + + if (!strcasecmp(period_mode, "EQE")) + list->mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; + else if (!strcasecmp(period_mode, "CQE")) + list->mode = DIM_CQ_PERIOD_MODE_START_FROM_CQE; + else + goto err_parse; + + totlen += retlen; + if (totlen > len) + goto err_parse; + + buf += retlen; + if (!buf) + goto err_parse; + + for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) { + ret = sscanf(buf, "%hu,%hu,%hu%n", &list->profs[i].usec, + &list->profs[i].pkts, &list->profs[i].comps, &retlen); + if (ret != 3) + goto err_parse; + + totlen += retlen; + if (totlen > len) + goto err_parse; + + buf += retlen; + if (i == NET_DIM_PARAMS_NUM_PROFILES - 1) + break; + } + + return list; + +err_parse: + kfree(list); +err_list: + return NULL; +} + +static ssize_t dim_profs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct net_device *netdev = to_net_dev(dev); + const struct net_device_ops *ops = netdev->netdev_ops; + struct net *net = dev_net(netdev); + struct dim_profs_list *list; + int ret = 0; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + list = parse_dim_profs(buf, len); + if (!list) + return -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + if (!ops->ndo_dim_moder_set) + ret = -EINVAL; + else + ret = ops->ndo_dim_moder_set(netdev, list) ? 
: len; + } + + kfree(list); + rtnl_unlock(); + + return ret; +} + +static ssize_t dim_profs_show_one(struct device *dev, + struct device_attribute *attr, + char *buf, u8 direct, u8 mode, + size_t *len_) +{ + static const char fmt_body[] = "{.usec = %3hu, .pkts = %3hu, .comps = %3hu,}%s"; + static const char fmt_hdr[] = "The profiles of (%2s, %3s):\n"; + const char *direction[2] = {"RX", "TX"}, *period_mode[2] = {"EQE", "CQE"}; + struct net_device *netdev = to_net_dev(dev); + const struct net_device_ops *ops = netdev->netdev_ops; + struct dim_profs_list *list; + size_t size, len = *len_; + ssize_t i; + + size = sizeof(*list) + NET_DIM_PARAMS_NUM_PROFILES * sizeof(struct dim_cq_moder); + list = kzalloc(size, GFP_KERNEL); + if (!list) + return -ENOMEM; + + list->num = NET_DIM_PARAMS_NUM_PROFILES; + list->direction = direct; + list->mode = mode; + if (ops->ndo_dim_moder_get(netdev, list)) + goto ret_; + + len += scnprintf(buf + len, PAGE_SIZE - len, + fmt_hdr, direction[direct], period_mode[mode]); + for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) { + len += scnprintf(buf + len, PAGE_SIZE - len, fmt_body, + list->profs[i].usec, list->profs[i].pkts, + list->profs[i].comps, + (i == NET_DIM_PARAMS_NUM_PROFILES - 1) ? 
"\n" : ",\n"); + } + *len_ = len; +ret_: + kfree(list); + return 0; +} + +static ssize_t dim_profs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + static const char out[] = "profile is default and not customized by the device."; + struct net_device *netdev = to_net_dev(dev); + const struct net_device_ops *ops = netdev->netdev_ops; + ssize_t i, j, ret = 0; + size_t len = 0; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (!ops->ndo_dim_moder_get) { + ret = sysfs_emit(buf, "%s\n", out); + goto ret_; + } + + for (i = 0; i < DIM_NUM_DIRECTIONS; i++) { + for (j = 0; j < DIM_CQ_PERIOD_NUM_MODES; j++) { + ret = dim_profs_show_one(dev, attr, buf, i, j, &len); + if (ret) + goto ret_; + } + } + + rtnl_unlock(); + return len; + +ret_: + rtnl_unlock(); + return ret; +} + +static DEVICE_ATTR_RW(dim_profs); + static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, @@ -671,6 +842,7 @@ static ssize_t threaded_store(struct device *dev, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, &dev_attr_threaded.attr, + &dev_attr_dim_profs.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); -- 1.8.3.1