On Mon, Feb 04, 2019 at 11:43:07PM +0000, Jason Gunthorpe wrote: > On Mon, Feb 04, 2019 at 10:13:47AM +0200, Leon Romanovsky wrote: > > @@ -1664,20 +1673,41 @@ static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags, > > struct ib_sge *sg_list, u32 num_sge) > > { > > int i; > > + int ret = 0; > > > > for (i = 0; i < num_sge; ++i) { > > struct ib_sge *sg = &sg_list[i]; > > int bytes_committed = 0; > > - int ret; > > > > ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr, > > sg->length, > > &bytes_committed, NULL, > > pf_flags); > > if (ret < 0) > > - return ret; > > + break; > > } > > - return 0; > > + > > + if (ret < 0 && > > + pf_flags & MLX5_PF_FLAGS_PREFETCH_DEFERRED) { > > + int srcu_key; > > + int j; > > + > > + srcu_key = srcu_read_lock(&dev->mr_srcu); > > + for (j = i + 1; j < num_sge ; ++j) { > > + struct mlx5_core_mkey *mmkey; > > + struct mlx5_ib_mr *mr; > > + > > + mmkey = __mlx5_mr_lookup(dev->mdev, > > + mlx5_base_mkey(sg_list[j].lkey)); > > + mr = container_of(mmkey, > > + struct mlx5_ib_mr, > > + mmkey); > > + atomic_dec(&mr->num_pending_prefetch); > > + } > > + srcu_read_unlock(&dev->mr_srcu, srcu_key); > > + } > > .. so if we are now relying on the work queue flush to make this > correct, I think the mlx5_ib_prefetch_mr_work could be written a bit > more clearly: > > if (WARN_ON(!ib_device_try_get(&w->dev->ib_dev))) > goto out; > > mlx5_ib_prefetch_sg_list(w->dev, w->pf_flags, w->sg_list, > w->num_sge); > ib_device_put(&w->dev->ib_dev); > out: > > As it is no longer acceptable for work to be executed on an > unregistered ib device, this means that the MR's were not flushed > before all the clients exited, which should not be possible. Why should we do ib_device get/put? We are already operating on such device and it should be "with us" all the time we are executing mlx5_ib_prefetch_sg_list(). > > > @@ -1696,9 +1726,13 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, > > enum ib_uverbs_advise_mr_advice advice, > > u32 flags, struct ib_sge *sg_list, u32 num_sge) > > { > > + u32 pf_flags = (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) ? > > + MLX5_PF_FLAGS_PREFETCH : MLX5_PF_FLAGS_PREFETCH_DEFERRED; > > struct mlx5_ib_dev *dev = to_mdev(pd->device); > > - u32 pf_flags = MLX5_PF_FLAGS_PREFETCH; > > struct prefetch_mr_work *work; > > + bool valid_req = true; > > + int srcu_key; > > + int i; > > > > if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) > > pf_flags |= MLX5_PF_FLAGS_DOWNGRADE; > > @@ -1717,10 +1751,58 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, > > memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge)); > > > > work->dev = dev; > > - work->pf_flags = pf_flags; > > + work->pf_flags = pf_flags | MLX5_PF_FLAGS_PREFETCH_DEFERRED; > > work->num_sge = num_sge; > > > > INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); > > - schedule_work(&work->work); > > - return 0; > > + > > + srcu_key = srcu_read_lock(&dev->mr_srcu); > > + for (i = 0; i < num_sge; ++i) { > > 'num_sge' is read directly from userspace and it is a u32, but 'i' > here is an 'int', so userspace can trigger and infinite loop in the > kernel. I noticed this type mistake in other similar places here as > well. > > Using 'int' for a for loop variable is almost always the wrong thing > to do in some subtle way like this.. It is true for 32bit systems where "int" is 32 bits length. Those systems are under radar for us anyway. > > > + struct mlx5_core_mkey *mmkey; > > + struct mlx5_ib_mr *mr; > > + > > + mmkey = __mlx5_mr_lookup(dev->mdev, > > + mlx5_base_mkey(sg_list[i].lkey)); > > + if (!mmkey || mmkey->key != sg_list[i].lkey) { > > + valid_req = false; > > + break; > > + } > > + > > + if (mmkey->type != MLX5_MKEY_MR) { > > + valid_req = false; > > + break; > > + } > > + > > + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); > > + > > + if (mr->ibmr.pd != pd) { > > + valid_req = false; > > + break; > > + } > > + > > + if (!mr->live) { > > + valid_req = false; > > + break; > > + } > > + > > + atomic_inc(&mr->num_pending_prefetch); > > + } > > This would be nicer in a function 'incr_num_pending_prefetch' or some > such Maybe, I'm not fun of one-time called functions. > > Jason
Attachment:
signature.asc
Description: PGP signature