On Mon, Jul 1, 2019 at 7:29 AM Dongsheng Yang <dongsheng.yang@xxxxxxxxxxxx> wrote:
>
>
>
> On 06/25/2019 10:41 PM, Ilya Dryomov wrote:
> > rbd_wait_state_locked() is built around rbd_dev->lock_waitq and blocks
> > rbd worker threads while waiting for the lock, potentially impacting
> > other rbd devices. There is no good way to pass an error code into
> > image request state machines when acquisition fails, hence the use of
> > RBD_DEV_FLAG_BLACKLISTED for everything and various other issues.
> >
> > Introduce rbd_dev->acquiring_list and move acquisition into image
> > request state machine. Use rbd_img_schedule() for kicking and passing
> > error codes. No blocking occurs while waiting for the lock, but
> > rbd_dev->lock_rwsem is still held across lock, unlock and set_cookie
> > calls.
> >
> > Always acquire the lock on "rbd map" to avoid associating the latency
> > of acquiring the lock with the first I/O request.
> >
> > A slight regression is that lock_timeout is now respected only if lock
> > acquisition is triggered by "rbd map" and not by I/O. This is somewhat
> > compensated by the fact that we no longer block if the peer refuses to
> > release lock -- I/O is failed with EROFS right away.
> >
> > Signed-off-by: Ilya Dryomov <idryomov@xxxxxxxxx>
>
> Great, that's really what we need. Just two small questions inline.

Yeah, long overdue! The next step is an exclusive lock state machine and
no lock_rwsem held across lock, unlock and set_cookie calls.

>
> Reviewed-by: Dongsheng Yang <dongsheng.yang@xxxxxxxxxxxx>
> > ---
> >  drivers/block/rbd.c | 325 +++++++++++++++++++++++++-------------------
> >  1 file changed, 182 insertions(+), 143 deletions(-)
> >
> > diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> > index 59d1fef35663..fd3f248ba9c2 100644
> > --- a/drivers/block/rbd.c
> > +++ b/drivers/block/rbd.c
> > @@ -312,6 +312,7 @@ enum img_req_flags {
> >
> >  enum rbd_img_state {
> >          RBD_IMG_START = 1,
> > +        RBD_IMG_EXCLUSIVE_LOCK,
> >          __RBD_IMG_OBJECT_REQUESTS,
> >          RBD_IMG_OBJECT_REQUESTS,
> >  };
> > @@ -412,9 +413,11 @@ struct rbd_device {
> >          struct delayed_work lock_dwork;
> >          struct work_struct unlock_work;
> >          spinlock_t lock_lists_lock;
> > +        struct list_head acquiring_list;
> >          struct list_head running_list;
> > +        struct completion acquire_wait;
> > +        int acquire_err;
> >          struct completion releasing_wait;
> > -        wait_queue_head_t lock_waitq;
> >
> >          struct workqueue_struct *task_wq;
> >
> > @@ -442,12 +445,10 @@ struct rbd_device {
> >   * Flag bits for rbd_dev->flags:
> >   * - REMOVING (which is coupled with rbd_dev->open_count) is protected
> >   *   by rbd_dev->lock
> > - * - BLACKLISTED is protected by rbd_dev->lock_rwsem
> >   */
> >  enum rbd_dev_flags {
> >          RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
> >          RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
> > -        RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
> >  };
> >
> >  static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
> > @@ -500,6 +501,8 @@ static int minor_to_rbd_dev_id(int minor)
> >
> >  static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
> >  {
> > +        lockdep_assert_held(&rbd_dev->lock_rwsem);
> > +
> >          return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
> >                 rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
> >  }
> > @@ -2895,15 +2898,21 @@ static bool need_exclusive_lock(struct rbd_img_request *img_req)
> >          return rbd_img_is_write(img_req);
> >  }
> >
> > -static void rbd_lock_add_request(struct rbd_img_request *img_req)
> > +static bool rbd_lock_add_request(struct rbd_img_request *img_req)
> >  {
> >          struct rbd_device *rbd_dev = img_req->rbd_dev;
> > +        bool locked;
> >
> >          lockdep_assert_held(&rbd_dev->lock_rwsem);
> > +        locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
> >          spin_lock(&rbd_dev->lock_lists_lock);
> >          rbd_assert(list_empty(&img_req->lock_item));
> > -        list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
> > +        if (!locked)
> > +                list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
> > +        else
> > +                list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
> >          spin_unlock(&rbd_dev->lock_lists_lock);
> > +        return locked;
> >  }
> >
> >  static void rbd_lock_del_request(struct rbd_img_request *img_req)
> > @@ -2922,6 +2931,30 @@ static void rbd_lock_del_request(struct rbd_img_request *img_req)
> >                  complete(&rbd_dev->releasing_wait);
> >  }
> >
> > +static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
> > +{
> > +        struct rbd_device *rbd_dev = img_req->rbd_dev;
> > +
> > +        if (!need_exclusive_lock(img_req))
> > +                return 1;
> > +
> > +        if (rbd_lock_add_request(img_req))
> > +                return 1;
> > +
> > +        if (rbd_dev->opts->exclusive) {
> > +                WARN_ON(1); /* lock got released? */
> > +                return -EROFS;
> > +        }
> > +
> > +        /*
> > +         * Note the use of mod_delayed_work() in rbd_acquire_lock()
> > +         * and cancel_delayed_work() in wake_requests().
> > +         */
> > +        dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
> > +        queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
> > +        return 0;
> > +}
> > +
> >  static void rbd_img_object_requests(struct rbd_img_request *img_req)
> >  {
> >          struct rbd_obj_request *obj_req;
> > @@ -2944,11 +2977,30 @@ static void rbd_img_object_requests(struct rbd_img_request *img_req)
> >
> >  static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
> >  {
> > +        struct rbd_device *rbd_dev = img_req->rbd_dev;
> > +        int ret;
> > +
> >  again:
> >          switch (img_req->state) {
> >          case RBD_IMG_START:
> >                  rbd_assert(!*result);
> >
> > +                ret = rbd_img_exclusive_lock(img_req);
> > +                if (ret < 0) {
> > +                        *result = ret;
> > +                        return true;
> > +                }
> > +                img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
> > +                if (ret > 0)
> > +                        goto again;
> > +                return false;
> > +        case RBD_IMG_EXCLUSIVE_LOCK:
> > +                if (*result)
> > +                        return true;
> > +
> > +                rbd_assert(!need_exclusive_lock(img_req) ||
> > +                           __rbd_is_lock_owner(rbd_dev));
> > +
> >                  rbd_img_object_requests(img_req);
> >                  if (!img_req->pending.num_pending) {
> >                          *result = img_req->pending.result;
> > @@ -3107,7 +3159,7 @@ static void rbd_unlock(struct rbd_device *rbd_dev)
> >          ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
> >                                RBD_LOCK_NAME, rbd_dev->lock_cookie);
> >          if (ret && ret != -ENOENT)
> > -                rbd_warn(rbd_dev, "failed to unlock: %d", ret);
> > +                rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
> >
> >          /* treat errors as the image is unlocked */
> >          rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
> > @@ -3234,15 +3286,30 @@ static int rbd_request_lock(struct rbd_device *rbd_dev)
> >          goto out;
> >  }
> >
> > -static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
> > +static void wake_requests(struct rbd_device *rbd_dev, int result)
> This code covers two cases, I think:
> (1) an rbd mapping waiting on acquire_wait,
> (2) img_request state machines in RBD_IMG_EXCLUSIVE_LOCK.
>
> So case (1) is not actually waking *requests* :). Can you add a
> comment here to make it clear?
> Maybe add some comment about that.

I'll rename this function to wake_lock_waiters() and add a comment.
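
Something along these lines, perhaps -- an untested sketch that just
consolidates the new wake_requests() body from the hunk quoted here under
the new name, with a comment on top (exact wording to be decided):

/*
 * Either wake the "rbd map" process sleeping on acquire_wait or kick
 * the image requests parked on acquiring_list in RBD_IMG_EXCLUSIVE_LOCK
 * state.  In other words, wake whoever is waiting for the lock, not
 * just in-flight requests.
 */
static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
{
        struct rbd_img_request *img_req;

        dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
        lockdep_assert_held_exclusive(&rbd_dev->lock_rwsem);

        cancel_delayed_work(&rbd_dev->lock_dwork);
        if (!completion_done(&rbd_dev->acquire_wait)) {
                rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
                           list_empty(&rbd_dev->running_list));
                rbd_dev->acquire_err = result;
                complete_all(&rbd_dev->acquire_wait);
                return;
        }

        list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
                mutex_lock(&img_req->state_mutex);
                rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
                rbd_img_schedule(img_req, result);
                mutex_unlock(&img_req->state_mutex);
        }

        list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
}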

> >  {
> > -        dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
> > +        struct rbd_img_request *img_req;
> > +
> > +        dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
> > +        lockdep_assert_held_exclusive(&rbd_dev->lock_rwsem);
> >
> >          cancel_delayed_work(&rbd_dev->lock_dwork);
> > -        if (wake_all)
> > -                wake_up_all(&rbd_dev->lock_waitq);
> > -        else
> > -                wake_up(&rbd_dev->lock_waitq);
> > +        if (!completion_done(&rbd_dev->acquire_wait)) {
> > +                rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
> > +                           list_empty(&rbd_dev->running_list));
> > +                rbd_dev->acquire_err = result;
> > +                complete_all(&rbd_dev->acquire_wait);
> > +                return;
> > +        }
> > +
> > +        list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
> > +                mutex_lock(&img_req->state_mutex);
> > +                rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
> > +                rbd_img_schedule(img_req, result);
> > +                mutex_unlock(&img_req->state_mutex);
> > +        }
> > +
> > +        list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
> >  }
> From the code above, it seems there could be a window where an img_req
> is *running* but is not yet on running_list.
>
> But that can't happen because lock_rwsem protects it, right?

Right. wake_requests() is called with lock_rwsem held for write, and the
whole image request state machine runs under lock_rwsem for read if
need_exclusive_lock() says so.

Thanks,

                Ilya
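
To make that ordering concrete, a minimal sketch of the two sides being
described, using only names that appear in the patch (wake_requests(),
rbd_img_advance(), need_exclusive_lock(), state_mutex) plus the plain
rwsem calls -- a simplified illustration, not the literal calling code:

        /*
         * Waker side: runs with lock_rwsem held for write, so the splice
         * of acquiring_list onto running_list is never observable
         * half-done.
         */
        down_write(&rbd_dev->lock_rwsem);
        wake_requests(rbd_dev, 0);      /* to become wake_lock_waiters() */
        up_write(&rbd_dev->lock_rwsem);

        /*
         * Image request side: when need_exclusive_lock() says the request
         * depends on the lock, the whole state machine step runs with
         * lock_rwsem held for read, so it sees the request either still
         * on acquiring_list or already on running_list, never in between.
         */
        down_read(&rbd_dev->lock_rwsem);
        mutex_lock(&img_req->state_mutex);
        done = rbd_img_advance(img_req, &result);
        mutex_unlock(&img_req->state_mutex);
        up_read(&rbd_dev->lock_rwsem);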