On Mon, Feb 04, 2019 at 10:13:47AM +0200, Leon Romanovsky wrote:

> @@ -1664,20 +1673,41 @@ static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags,
>  				    struct ib_sge *sg_list, u32 num_sge)
>  {
>  	int i;
> +	int ret = 0;
>
>  	for (i = 0; i < num_sge; ++i) {
>  		struct ib_sge *sg = &sg_list[i];
>  		int bytes_committed = 0;
> -		int ret;
>
>  		ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr,
>  						    sg->length,
>  						    &bytes_committed, NULL,
>  						    pf_flags);
>  		if (ret < 0)
> -			return ret;
> +			break;
>  	}
> -	return 0;
> +
> +	if (ret < 0 &&
> +	    pf_flags & MLX5_PF_FLAGS_PREFETCH_DEFERRED) {
> +		int srcu_key;
> +		int j;
> +
> +		srcu_key = srcu_read_lock(&dev->mr_srcu);
> +		for (j = i + 1; j < num_sge ; ++j) {
> +			struct mlx5_core_mkey *mmkey;
> +			struct mlx5_ib_mr *mr;
> +
> +			mmkey = __mlx5_mr_lookup(dev->mdev,
> +						 mlx5_base_mkey(sg_list[j].lkey));
> +			mr = container_of(mmkey,
> +					  struct mlx5_ib_mr,
> +					  mmkey);
> +			atomic_dec(&mr->num_pending_prefetch);
> +		}
> +		srcu_read_unlock(&dev->mr_srcu, srcu_key);
> +	}

.. so if we are now relying on the work queue flush to make this
correct, I think mlx5_ib_prefetch_mr_work() could be written a bit
more clearly:

	if (WARN_ON(!ib_device_try_get(&w->dev->ib_dev)))
		goto out;
	mlx5_ib_prefetch_sg_list(w->dev, w->pf_flags, w->sg_list,
				 w->num_sge);
	ib_device_put(&w->dev->ib_dev);
out:

Since it is no longer acceptable for work to execute on an
unregistered ib device, hitting that WARN_ON would mean the MRs were
not flushed before all the clients exited, which should not be
possible.

> @@ -1696,9 +1726,13 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
>  			       enum ib_uverbs_advise_mr_advice advice,
>  			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
>  {
> +	u32 pf_flags = (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) ?
> +		MLX5_PF_FLAGS_PREFETCH : MLX5_PF_FLAGS_PREFETCH_DEFERRED;
>  	struct mlx5_ib_dev *dev = to_mdev(pd->device);
> -	u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
>  	struct prefetch_mr_work *work;
> +	bool valid_req = true;
> +	int srcu_key;
> +	int i;
>
>  	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
>  		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
> @@ -1717,10 +1751,58 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
>  	memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
>
>  	work->dev = dev;
> -	work->pf_flags = pf_flags;
> +	work->pf_flags = pf_flags | MLX5_PF_FLAGS_PREFETCH_DEFERRED;
>  	work->num_sge = num_sge;
>
>  	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
> -	schedule_work(&work->work);
> -	return 0;
> +
> +	srcu_key = srcu_read_lock(&dev->mr_srcu);
> +	for (i = 0; i < num_sge; ++i) {

'num_sge' is read directly from userspace and it is a u32, but 'i'
here is an 'int', so userspace can trigger an infinite loop in the
kernel. I noticed this type mistake in other similar places here as
well. Using 'int' for a for-loop variable is almost always the wrong
thing to do in some subtle way like this.

> +		struct mlx5_core_mkey *mmkey;
> +		struct mlx5_ib_mr *mr;
> +
> +		mmkey = __mlx5_mr_lookup(dev->mdev,
> +					 mlx5_base_mkey(sg_list[i].lkey));
> +		if (!mmkey || mmkey->key != sg_list[i].lkey) {
> +			valid_req = false;
> +			break;
> +		}
> +
> +		if (mmkey->type != MLX5_MKEY_MR) {
> +			valid_req = false;
> +			break;
> +		}
> +
> +		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
> +
> +		if (mr->ibmr.pd != pd) {
> +			valid_req = false;
> +			break;
> +		}
> +
> +		if (!mr->live) {
> +			valid_req = false;
> +			break;
> +		}
> +
> +		atomic_inc(&mr->num_pending_prefetch);
> +	}

This would be nicer in a function, 'incr_num_pending_prefetch' or
some such.
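Spelled out, the whole work function would then be something like
this (sketch, untested -- the container_of()/kvfree() boilerplate is
the usual workqueue pattern, assumed rather than taken from this
patch):

	static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
	{
		struct prefetch_mr_work *w =
			container_of(work, struct prefetch_mr_work, work);

		/* Unregistration flushes all queued prefetch work before
		 * the last device reference is dropped, so a failure here
		 * points at a reference counting bug elsewhere.
		 */
		if (WARN_ON(!ib_device_try_get(&w->dev->ib_dev)))
			goto out;

		mlx5_ib_prefetch_sg_list(w->dev, w->pf_flags, w->sg_list,
					 w->num_sge);
		ib_device_put(&w->dev->ib_dev);
	out:
		kvfree(w);
	}

The WARN_ON documents the invariant instead of trying to recover from
it.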
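To show why the 'int' counter is a problem, here is a tiny standalone
demo (hypothetical, not from the patch) of what happens when a signed
'i' is compared against a u32 the user controls:

	#include <limits.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t num_sge = UINT32_MAX;	/* as if copied from userspace */

		/* In 'i < num_sge' the signed 'i' is converted to u32, so
		 * every non-negative value up to INT_MAX passes the test...
		 */
		printf("INT_MAX < num_sge: %d\n",
		       (uint32_t)INT_MAX < num_sge);

		/* ...and the next increment is signed overflow (undefined
		 * behaviour).  On 2's-complement targets it wraps to
		 * INT_MIN, which converts to 2147483648u, still passes the
		 * test, and the loop keeps running with a negative index
		 * into sg_list.
		 */
		printf("INT_MIN as u32: %u, still < num_sge: %d\n",
		       (uint32_t)INT_MIN, (uint32_t)INT_MIN < num_sge);
		return 0;
	}

Declaring the counter as 'u32 i' to match num_sge makes the
comparison exact and keeps the index non-negative.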
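Factored out, the validation loop might look like the below (sketch,
untested; the name is my suggestion above, the unwind-on-failure half
is an assumption about what the final version needs, and the caller
is assumed to hold dev->mr_srcu):

	static bool incr_num_pending_prefetch(struct mlx5_ib_dev *dev,
					      struct ib_pd *pd,
					      struct ib_sge *sg_list,
					      u32 num_sge)
	{
		u32 i;

		for (i = 0; i < num_sge; ++i) {
			struct mlx5_core_mkey *mmkey;
			struct mlx5_ib_mr *mr;

			mmkey = __mlx5_mr_lookup(dev->mdev,
						 mlx5_base_mkey(sg_list[i].lkey));

			/* Reject stale keys, non-MR mkeys, MRs from another
			 * PD and MRs that are no longer live.
			 */
			if (!mmkey || mmkey->key != sg_list[i].lkey ||
			    mmkey->type != MLX5_MKEY_MR)
				goto unwind;

			mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
			if (mr->ibmr.pd != pd || !mr->live)
				goto unwind;

			atomic_inc(&mr->num_pending_prefetch);
		}
		return true;

	unwind:
		/* Drop the references already taken so a rejected request
		 * leaves every MR's pending count unchanged.
		 */
		while (i--) {
			struct mlx5_core_mkey *mmkey;

			mmkey = __mlx5_mr_lookup(dev->mdev,
						 mlx5_base_mkey(sg_list[i].lkey));
			atomic_dec(&container_of(mmkey, struct mlx5_ib_mr,
						 mmkey)->num_pending_prefetch);
		}
		return false;
	}

Jason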