XDP setup and PF reset code access the same resources in the following sections: * ice_vsi_close() in ice_prepare_for_reset() - already rtnl-locked * ice_vsi_rebuild() for the PF VSI - not protected With an unfortunate timing, such accesses can result in a crash such as the one below: [ +1.999878] ice 0000:b1:00.0: Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring 14 [ +2.002992] ice 0000:b1:00.0: Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring 18 [Mar15 18:17] ice 0000:b1:00.0 ens801f0np0: NETDEV WATCHDOG: CPU: 38: transmit queue 14 timed out 80692736 ms [ +0.000093] ice 0000:b1:00.0 ens801f0np0: tx_timeout: VSI_num: 6, Q 14, NTC: 0x0, HW_HEAD: 0x0, NTU: 0x0, INT: 0x4000001 [ +0.000012] ice 0000:b1:00.0 ens801f0np0: tx_timeout recovery level 1, txqueue 14 [ +0.394718] ice 0000:b1:00.0: PTP reset successful [ +0.006184] BUG: kernel NULL pointer dereference, address: 0000000000000098 [ +0.000045] #PF: supervisor read access in kernel mode [ +0.000023] #PF: error_code(0x0000) - not-present page [ +0.000023] PGD 0 P4D 0 [ +0.000018] Oops: 0000 [#1] PREEMPT SMP NOPTI [ +0.000023] CPU: 38 PID: 7540 Comm: kworker/38:1 Not tainted 6.8.0-rc7 #1 [ +0.000031] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0014.082620210524 08/26/2021 [ +0.000036] Workqueue: ice ice_service_task [ice] [ +0.000183] RIP: 0010:ice_clean_tx_ring+0xa/0xd0 [ice] [...] [ +0.000013] Call Trace: [ +0.000016] <TASK> [ +0.000014] ? __die+0x1f/0x70 [ +0.000029] ? page_fault_oops+0x171/0x4f0 [ +0.000029] ? schedule+0x3b/0xd0 [ +0.000027] ? exc_page_fault+0x7b/0x180 [ +0.000022] ? asm_exc_page_fault+0x22/0x30 [ +0.000031] ? ice_clean_tx_ring+0xa/0xd0 [ice] [ +0.000194] ice_free_tx_ring+0xe/0x60 [ice] [ +0.000186] ice_destroy_xdp_rings+0x157/0x310 [ice] [ +0.000151] ice_vsi_decfg+0x53/0xe0 [ice] [ +0.000180] ice_vsi_rebuild+0x239/0x540 [ice] [ +0.000186] ice_vsi_rebuild_by_type+0x76/0x180 [ice] [ +0.000145] ice_rebuild+0x18c/0x840 [ice] [ +0.000145] ? delay_tsc+0x4a/0xc0 [ +0.000022] ? delay_tsc+0x92/0xc0 [ +0.000020] ice_do_reset+0x140/0x180 [ice] [ +0.000886] ice_service_task+0x404/0x1030 [ice] [ +0.000824] process_one_work+0x171/0x340 [ +0.000685] worker_thread+0x277/0x3a0 [ +0.000675] ? preempt_count_add+0x6a/0xa0 [ +0.000677] ? _raw_spin_lock_irqsave+0x23/0x50 [ +0.000679] ? __pfx_worker_thread+0x10/0x10 [ +0.000653] kthread+0xf0/0x120 [ +0.000635] ? __pfx_kthread+0x10/0x10 [ +0.000616] ret_from_fork+0x2d/0x50 [ +0.000612] ? __pfx_kthread+0x10/0x10 [ +0.000604] ret_from_fork_asm+0x1b/0x30 [ +0.000604] </TASK> The previous way of handling this through returning -EBUSY is not viable, particularly when destroying AF_XDP socket, because the kernel proceeds with removal anyway. There is plenty of code between those calls and there is no need to create a large critical section that covers them both, same as there is no need to protect ice_vsi_rebuild() with rtnl_lock(). Leaving an unprotexted section in between would result in a state, when the VSI is closed, but not yet rebuild, such situation can be handled pretty easily. Lock ice_vsi_rebuild() with ICE_CFG_BUSY flag. Particularly, to prevent system crash, when tx_timeout and .ndo_bpf() happen at the same time. Also, handle the state between critical sections by skipping XDP ring configuration. Fixes: efc2214b6047 ("ice: Add support for XDP") Reviewed-by: Igor Bagnucki <igor.bagnucki@xxxxxxxxx> Signed-off-by: Larysa Zaremba <larysa.zaremba@xxxxxxxxx> --- drivers/net/ethernet/intel/ice/ice_lib.c | 5 +++- drivers/net/ethernet/intel/ice/ice_main.c | 36 +++++++++++++++++++---- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 7629b0190578..4774bcc4d5a8 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2426,7 +2426,10 @@ void ice_vsi_decfg(struct ice_vsi *vsi) dev_err(ice_pf_to_dev(pf), "Failed to remove RDMA scheduler config for VSI %u, err %d\n", vsi->vsi_num, err); - if (ice_is_xdp_ena_vsi(vsi)) + /* xdp_rings can be absent, if program was attached amid reset, + * VSI rebuild is supposed to create them later + */ + if (ice_is_xdp_ena_vsi(vsi) && vsi->xdp_rings) /* return value check can be skipped here, it always returns * 0 if reset is in progress */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 15a6805ac2a1..dc60d816a345 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2986,6 +2986,20 @@ static int ice_max_xdp_frame_size(struct ice_vsi *vsi) return ICE_RXBUF_3072; } +/** + * ice_rebuild_pending - ice_vsi_rebuild will be performed, when locks are released + * @vsi: VSI to setup XDP for + * + * ice_vsi_close() in the reset path is called under rtnl_lock(), + * so it happens strictly before or after .ndo_bpf(). + * In case it has happened before, we do not have anything attached to rings + */ +static bool ice_rebuild_pending(struct ice_vsi *vsi) +{ + return ice_is_reset_in_progress(vsi->back->state) && + !vsi->rx_rings[0]->desc; +} + /** * ice_xdp_setup_prog - Add or remove XDP eBPF program * @vsi: VSI to setup XDP for @@ -3009,7 +3023,7 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, } /* hot swap progs and avoid toggling link */ - if (ice_is_xdp_ena_vsi(vsi) == !!prog) { + if (ice_is_xdp_ena_vsi(vsi) == !!prog || ice_rebuild_pending(vsi)) { ice_vsi_assign_bpf_prog(vsi, prog); return 0; } @@ -3081,21 +3095,30 @@ static int ice_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct ice_netdev_priv *np = netdev_priv(dev); struct ice_vsi *vsi = np->vsi; + struct ice_pf *pf = vsi->back; + int ret; if (vsi->type != ICE_VSI_PF) { NL_SET_ERR_MSG_MOD(xdp->extack, "XDP can be loaded only on PF VSI"); return -EINVAL; } + while (test_and_set_bit(ICE_CFG_BUSY, pf->state)) + usleep_range(1000, 2000); + switch (xdp->command) { case XDP_SETUP_PROG: - return ice_xdp_setup_prog(vsi, xdp->prog, xdp->extack); + ret = ice_xdp_setup_prog(vsi, xdp->prog, xdp->extack); + break; case XDP_SETUP_XSK_POOL: - return ice_xsk_pool_setup(vsi, xdp->xsk.pool, - xdp->xsk.queue_id); + ret = ice_xsk_pool_setup(vsi, xdp->xsk.pool, xdp->xsk.queue_id); + break; default: - return -EINVAL; + ret = -EINVAL; } + + clear_bit(ICE_CFG_BUSY, pf->state); + return ret; } /** @@ -7672,7 +7695,10 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) ice_gnss_init(pf); /* rebuild PF VSI */ + while (test_and_set_bit(ICE_CFG_BUSY, pf->state)) + usleep_range(1000, 2000); err = ice_vsi_rebuild_by_type(pf, ICE_VSI_PF); + clear_bit(ICE_CFG_BUSY, pf->state); if (err) { dev_err(dev, "PF VSI rebuild failed: %d\n", err); goto err_vsi_rebuild; -- 2.43.0