On 4/15/2021 2:43 PM, Hans Westgaard Ry wrote:
Handling comm_channel_event in mlx4_master_comm_channel uses a double
loop to determine which slaves have requested work. The search is
always started at lowest slave. This leads to unfairness; lower VFs
tends to be prioritized over higher VFs.
The patch uses find_next_bit to determine which slaves to handle.
Fairness is implemented by always starting at the next to the last
start.
An MPI program has been used to measure improvements. It runs 500
ibv_reg_mr, synchronizes with all other instances and then runs 500
ibv_dereg_mr.
The results running 500 processes, time reported is for running 500
calls:
ibv_reg_mr:
Mod. Org.
mlx4_1 403.356ms 424.674ms
mlx4_2 403.355ms 424.674ms
mlx4_3 403.354ms 424.674ms
mlx4_4 403.355ms 424.674ms
mlx4_5 403.357ms 424.677ms
mlx4_6 403.354ms 424.676ms
mlx4_7 403.357ms 424.675ms
mlx4_8 403.355ms 424.675ms
ibv_dereg_mr:
Mod. Org.
mlx4_1 116.408ms 142.818ms
mlx4_2 116.434ms 142.793ms
mlx4_3 116.488ms 143.247ms
mlx4_4 116.679ms 143.230ms
mlx4_5 112.017ms 107.204ms
mlx4_6 112.032ms 107.516ms
mlx4_7 112.083ms 184.195ms
mlx4_8 115.089ms 190.618ms
Suggested-by: Håkon Bugge <haakon.bugge@xxxxxxxxxx>
Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@xxxxxxxxxx>
---
drivers/net/ethernet/mellanox/mlx4/cmd.c | 75 +++++++++++++++++--------------
drivers/net/ethernet/mellanox/mlx4/mlx4.h | 1 +
2 files changed, 43 insertions(+), 33 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index c678344d22a2..24989e96ab9d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -51,7 +51,6 @@
#include "fw.h"
#include "fw_qos.h"
#include "mlx4_stats.h"
-
#define CMD_POLL_TOKEN 0xffff
#define INBOX_MASK 0xffffffffffffff00ULL
Unrelated change. Please revert it.
@@ -2241,48 +2240,58 @@ void mlx4_master_comm_channel(struct work_struct *work)
struct mlx4_priv *priv =
container_of(mfunc, struct mlx4_priv, mfunc);
struct mlx4_dev *dev = &priv->dev;
- __be32 *bit_vec;
u32 comm_cmd;
- u32 vec;
- int i, j, slave;
+ int i, slave;
int toggle;
int served = 0;
int reported = 0;
u32 slt;
-
- bit_vec = master->comm_arm_bit_vector;
- for (i = 0; i < COMM_CHANNEL_BIT_ARRAY_SIZE; i++) {
- vec = be32_to_cpu(bit_vec[i]);
- for (j = 0; j < 32; j++) {
- if (!(vec & (1 << j)))
- continue;
- ++reported;
- slave = (i * 32) + j;
- comm_cmd = swab32(readl(
- &mfunc->comm[slave].slave_write));
- slt = swab32(readl(&mfunc->comm[slave].slave_read))
- >> 31;
- toggle = comm_cmd >> 31;
- if (toggle != slt) {
- if (master->slave_state[slave].comm_toggle
- != slt) {
- pr_info("slave %d out of sync. read toggle %d, state toggle %d. Resynching.\n",
- slave, slt,
- master->slave_state[slave].comm_toggle);
- master->slave_state[slave].comm_toggle =
- slt;
- }
- mlx4_master_do_cmd(dev, slave,
- comm_cmd >> 16 & 0xff,
- comm_cmd & 0xffff, toggle);
- ++served;
+ u32 lbit_vec[COMM_CHANNEL_BIT_ARRAY_SIZE];
+ u32 nmbr_bits;
+ u32 prev_slave;
+ bool first = true;
Please maintain reversed Christmas tree when possible.
Split declaration from init if needed.
+
+ for (i = 0; i < COMM_CHANNEL_BIT_ARRAY_SIZE; i++)
+ lbit_vec[i] = be32_to_cpu(master->comm_arm_bit_vector[i]);
+ nmbr_bits = dev->persist->num_vfs + 1;
+ if (++priv->next_slave >= nmbr_bits)
+ priv->next_slave = 0;
+ slave = priv->next_slave;
+ while (true) {
+ slave = find_next_bit((const unsigned long *)&lbit_vec, nmbr_bits, slave);
+ if (!first && slave >= priv->next_slave) {
+ break;
+ } else if (slave == nmbr_bits) {
Unnecessary else, as if block breaks.
+ if (!first)
+ break;
+ first = false;
+ slave = 0;
+ continue;
+ }
+ ++reported;
+ comm_cmd = swab32(readl(&mfunc->comm[slave].slave_write));
+ slt = swab32(readl(&mfunc->comm[slave].slave_read)) >> 31;
+ toggle = comm_cmd >> 31;
+ if (toggle != slt) {
+ if (master->slave_state[slave].comm_toggle
+ != slt) {
+ pr_info("slave %d out of sync. read toggle %d, state toggle %d. Resynching.\n",
+ slave, slt,
+ master->slave_state[slave].comm_toggle);
+ master->slave_state[slave].comm_toggle =
+ slt;
}
+ mlx4_master_do_cmd(dev, slave,
+ comm_cmd >> 16 & 0xff,
+ comm_cmd & 0xffff, toggle);
+ ++served;
}
+ prev_slave = slave++;
}
if (reported && reported != served)
- mlx4_warn(dev, "Got command event with bitmask from %d slaves but %d were served\n",
- reported, served);
+ mlx4_warn(dev, "Got command event with bitmask from %d slaves but %d were served %x %d\n",
Not clear from the string what these newly printed values are.
Improve the string/description.
+ reported, served, lbit_vec[0], priv->next_slave);
if (mlx4_ARM_COMM_CHANNEL(dev))
mlx4_warn(dev, "Failed to arm comm channel events\n");
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 64bed7ac3836..cd6ba80f4c90 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -924,6 +924,7 @@ struct mlx4_priv {
atomic_t opreq_count;
struct work_struct opreq_task;
+ u32 next_slave; /* mlx4_master_comm_channel */
Move it to a more specific struct, consider struct mlx4_mfunc_master_ctx.
};
static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
Please submit to netdev mailing list, and CC needed maintainers.