On Thu, 2020-01-16 at 15:46 +0100, Ilya Dryomov wrote: > On Thu, Jan 16, 2020 at 3:14 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote: > > On Tue, 2020-01-14 at 22:44 -0500, xiubli@xxxxxxxxxx wrote: > > > From: Xiubo Li <xiubli@xxxxxxxxxx> > > > > > > item total sum_lat(us) avg_lat(us) > > > ----------------------------------------------------- > > > write 222 5287750000 23818693 > > > > > > URL: https://tracker.ceph.com/issues/43215 > > > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > > > --- > > > fs/ceph/addr.c | 23 +++++++++++++++++++++-- > > > fs/ceph/debugfs.c | 8 ++++++++ > > > fs/ceph/file.c | 9 +++++++++ > > > fs/ceph/mds_client.c | 20 ++++++++++++++++++++ > > > fs/ceph/mds_client.h | 6 ++++++ > > > include/linux/ceph/osd_client.h | 3 ++- > > > net/ceph/osd_client.c | 9 ++++++++- > > > 7 files changed, 74 insertions(+), 4 deletions(-) > > > > > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > > > index 2a32f731f92a..b667ddaa6623 100644 > > > --- a/fs/ceph/addr.c > > > +++ b/fs/ceph/addr.c > > > @@ -598,12 +598,15 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) > > > loff_t page_off = page_offset(page); > > > int err, len = PAGE_SIZE; > > > struct ceph_writeback_ctl ceph_wbc; > > > + struct ceph_client_metric *metric; > > > + s64 latency; > > > > > > dout("writepage %p idx %lu\n", page, page->index); > > > > > > inode = page->mapping->host; > > > ci = ceph_inode(inode); > > > fsc = ceph_inode_to_client(inode); > > > + metric = &fsc->mdsc->metric; > > > > > > /* verify this is a writeable snap context */ > > > snapc = page_snap_context(page); > > > @@ -645,7 +648,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) > > > &ci->i_layout, snapc, page_off, len, > > > ceph_wbc.truncate_seq, > > > ceph_wbc.truncate_size, > > > - &inode->i_mtime, &page, 1); > > > + &inode->i_mtime, &page, 1, > > > + &latency); > > > + if (latency) > > > + ceph_mdsc_update_write_latency(metric, latency); > > > + > > > if (err < 0) { > > > struct writeback_control tmp_wbc; > > > if (!wbc) > > > @@ -707,6 +714,8 @@ static void writepages_finish(struct ceph_osd_request *req) > > > { > > > struct inode *inode = req->r_inode; > > > struct ceph_inode_info *ci = ceph_inode(inode); > > > + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > > + struct ceph_client_metric *metric = &fsc->mdsc->metric; > > > struct ceph_osd_data *osd_data; > > > struct page *page; > > > int num_pages, total_pages = 0; > > > @@ -714,7 +723,6 @@ static void writepages_finish(struct ceph_osd_request *req) > > > int rc = req->r_result; > > > struct ceph_snap_context *snapc = req->r_snapc; > > > struct address_space *mapping = inode->i_mapping; > > > - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > > bool remove_page; > > > > > > dout("writepages_finish %p rc %d\n", inode, rc); > > > @@ -783,6 +791,11 @@ static void writepages_finish(struct ceph_osd_request *req) > > > ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); > > > else > > > kfree(osd_data->pages); > > > + > > > + if (!rc) { > > > + s64 latency = jiffies - req->r_start_stamp; > > > + ceph_mdsc_update_write_latency(metric, latency); > > > + } > > > ceph_osdc_put_request(req); > > > } > > > > > > @@ -1675,6 +1688,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) > > > struct inode *inode = file_inode(filp); > > > struct ceph_inode_info *ci = ceph_inode(inode); > > > struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > > + struct ceph_client_metric *metric = &fsc->mdsc->metric; > > > struct ceph_osd_request *req; > > > struct page *page = NULL; > > > u64 len, inline_version; > > > @@ -1786,6 +1800,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) > > > err = ceph_osdc_start_request(&fsc->client->osdc, req, false); > > > if (!err) > > > err = ceph_osdc_wait_request(&fsc->client->osdc, req); > > > + > > > + if (!err || err == -ETIMEDOUT) { > > > + s64 latency = jiffies - req->r_start_stamp; > > > + ceph_mdsc_update_write_latency(metric, latency); > > > + } > > > out_put: > > > ceph_osdc_put_request(req); > > > if (err == -ECANCELED) > > > diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c > > > index 8200bf025ccd..3fdb15af0a83 100644 > > > --- a/fs/ceph/debugfs.c > > > +++ b/fs/ceph/debugfs.c > > > @@ -142,6 +142,14 @@ static int metric_show(struct seq_file *s, void *p) > > > seq_printf(s, "%-14s%-12lld%-16lld%lld\n", "read", > > > total, sum / NSEC_PER_USEC, avg / NSEC_PER_USEC); > > > > > > + spin_lock(&mdsc->metric.write_lock); > > > + total = atomic64_read(&mdsc->metric.total_writes), > > > + sum = timespec64_to_ns(&mdsc->metric.write_latency_sum); > > > + spin_unlock(&mdsc->metric.write_lock); > > > + avg = total ? sum / total : 0; > > > + seq_printf(s, "%-14s%-12lld%-16lld%lld\n", "write", > > > + total, sum / NSEC_PER_USEC, avg / NSEC_PER_USEC); > > > + > > > seq_printf(s, "\n"); > > > seq_printf(s, "item total miss hit\n"); > > > seq_printf(s, "-------------------------------------------------\n"); > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > > > index 797d4d224223..70530ac798ac 100644 > > > --- a/fs/ceph/file.c > > > +++ b/fs/ceph/file.c > > > @@ -822,6 +822,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) > > > op = &req->r_ops[i]; > > > if (op->op == CEPH_OSD_OP_READ) > > > ceph_mdsc_update_read_latency(metric, latency); > > > + else if (op->op == CEPH_OSD_OP_WRITE && rc != -ENOENT) > > > + ceph_mdsc_update_write_latency(metric, latency); > > > } > > > } > > > > > > @@ -1075,6 +1077,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, > > > > > > if (!write) > > > ceph_mdsc_update_read_latency(metric, latency); > > > + else if (write && ret != -ENOENT) > > > + ceph_mdsc_update_write_latency(metric, latency); > > > } > > > > > > size = i_size_read(inode); > > > @@ -1163,6 +1167,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, > > > struct inode *inode = file_inode(file); > > > struct ceph_inode_info *ci = ceph_inode(inode); > > > struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > > + struct ceph_client_metric *metric = &fsc->mdsc->metric; > > > struct ceph_vino vino; > > > struct ceph_osd_request *req; > > > struct page **pages; > > > @@ -1248,6 +1253,10 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, > > > if (!ret) > > > ret = ceph_osdc_wait_request(&fsc->client->osdc, req); > > > > > > + if (!ret || ret == -ETIMEDOUT) { > > > + s64 latency = jiffies - req->r_start_stamp; > > > + ceph_mdsc_update_write_latency(metric, latency); > > > + } > > > out: > > > ceph_osdc_put_request(req); > > > if (ret != 0) { > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > > index dc2cda55a5a5..2569f9303c0c 100644 > > > --- a/fs/ceph/mds_client.c > > > +++ b/fs/ceph/mds_client.c > > > @@ -4112,6 +4112,22 @@ void ceph_mdsc_update_read_latency(struct ceph_client_metric *m, > > > spin_unlock(&m->read_lock); > > > } > > > > > > +void ceph_mdsc_update_write_latency(struct ceph_client_metric *m, > > > + s64 latency) > > > +{ > > > + struct timespec64 ts; > > > + > > > + if (!m) > > > + return; > > > + > > > + jiffies_to_timespec64(latency, &ts); > > > + > > > + spin_lock(&m->write_lock); > > > + atomic64_inc(&m->total_writes); > > > + m->write_latency_sum = timespec64_add(m->write_latency_sum, ts); > > > + spin_unlock(&m->write_lock); > > > +} > > > + > > > /* > > > * delayed work -- periodically trim expired leases, renew caps with mds > > > */ > > > @@ -4212,6 +4228,10 @@ static int ceph_mdsc_metric_init(struct ceph_client_metric *metric) > > > memset(&metric->read_latency_sum, 0, sizeof(struct timespec64)); > > > atomic64_set(&metric->total_reads, 0); > > > > > > + spin_lock_init(&metric->write_lock); > > > + memset(&metric->write_latency_sum, 0, sizeof(struct timespec64)); > > > + atomic64_set(&metric->total_writes, 0); > > > + > > > return 0; > > > } > > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > > > index fee25b999c7c..0120357e7549 100644 > > > --- a/fs/ceph/mds_client.h > > > +++ b/fs/ceph/mds_client.h > > > @@ -370,6 +370,10 @@ struct ceph_client_metric { > > > spinlock_t read_lock; > > > atomic64_t total_reads; > > > struct timespec64 read_latency_sum; > > > + > > > + spinlock_t write_lock; > > > + atomic64_t total_writes; > > > + struct timespec64 write_latency_sum; > > > > Would percpu counters be better here? I mean it's not a _super_ high > > performance codepath, but it's nice when stats gathering has very little > > overhead. It'd take up a bit more space, but it's not that much, and > > there'd be no serialization between CPUs operating on different inodes. > > > > > > To be clear, the latency you're measuring is request time to the OSD? > > > > > }; > > > > > > /* > > > @@ -556,4 +560,6 @@ extern int ceph_trim_caps(struct ceph_mds_client *mdsc, > > > > > > extern void ceph_mdsc_update_read_latency(struct ceph_client_metric *m, > > > s64 latency); > > > +extern void ceph_mdsc_update_write_latency(struct ceph_client_metric *m, > > > + s64 latency); > > > #endif > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > > > index 43e4240d88e7..e73439d18f28 100644 > > > --- a/include/linux/ceph/osd_client.h > > > +++ b/include/linux/ceph/osd_client.h > > > @@ -524,7 +524,8 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, > > > u64 off, u64 len, > > > u32 truncate_seq, u64 truncate_size, > > > struct timespec64 *mtime, > > > - struct page **pages, int nr_pages); > > > + struct page **pages, int nr_pages, > > > + s64 *latency); > > > > > > int ceph_osdc_copy_from(struct ceph_osd_client *osdc, > > > u64 src_snapid, u64 src_version, > > > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c > > > index 62eb758f2474..9f6833ab733c 100644 > > > --- a/net/ceph/osd_client.c > > > +++ b/net/ceph/osd_client.c > > > @@ -5285,12 +5285,16 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, > > > u64 off, u64 len, > > > u32 truncate_seq, u64 truncate_size, > > > struct timespec64 *mtime, > > > - struct page **pages, int num_pages) > > > + struct page **pages, int num_pages, > > > + s64 *latency) > > > { > > > struct ceph_osd_request *req; > > > int rc = 0; > > > int page_align = off & ~PAGE_MASK; > > > > > > + if (latency) > > > + *latency = 0; > > > + > > > req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, > > > CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, > > > snapc, truncate_seq, truncate_size, > > > @@ -5308,6 +5312,9 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, > > > if (!rc) > > > rc = ceph_osdc_wait_request(osdc, req); > > > > > > + if (latency && (!rc || rc == -ETIMEDOUT)) > > > + *latency = jiffies - req->r_start_stamp; > > > + > > > > Are you concerned at all with scheduling delays? Note that you're doing > > the latency calculation here which occurs in the task that is woken by > > __complete_request. That won't necessarily wake up immediately on a busy > > machine, so this measurement will include that delay as well. > > > > I wonder if we ought to add a r_end_stamp field to the req instead, and > > grab jiffies in (e.g.) __complete_request. Then you could just fetch > > that out and do the math. > > __complete_request() is a bit of a special case, putting it in > finish_request() would work better. It will still include some delays, > but then measuring the OSD service time on the client side is pretty > much impossible to do precisely. > Yeah, that sounds better. This is a best-effort sort of thing, but let's do make our best effort. > > > ceph_osdc_put_request(req); > > > if (rc == 0) > > > rc = len; > > > > Ditto here on my earlier comment in the earlier email. Let's just turn > > this into a ceph_osdc_writepages_start function and open-code the wait > > and latency calculation in writepage_nounlock(). > > That's a good idea, but let's keep the existing name. The non-blocking > behavior should be the default -- I have most of the remaining blocking > methods in libceph converted in a private branch for rbd exclusive-lock > rewrite. > As a general rule, I like changing the name when a function's behavior changes significantly like this, as that makes it harder to screw things up when selectively backporting patches. In this case, there's only a single caller of each, so I don't have a strong objection, but I have been bitten by this before. It also might not hurt to move both ceph_osdc_readpages and ceph_osdc_writepages into ceph.ko. Given that they're only called from cephfs they probably belong there anyway. -- Jeff Layton <jlayton@xxxxxxxxxx>