On Tue, 2020-01-14 at 22:44 -0500, xiubli@xxxxxxxxxx wrote: > From: Xiubo Li <xiubli@xxxxxxxxxx> > > item total sum_lat(us) avg_lat(us) > ----------------------------------------------------- > write 222 5287750000 23818693 > > URL: https://tracker.ceph.com/issues/43215 > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > --- > fs/ceph/addr.c | 23 +++++++++++++++++++++-- > fs/ceph/debugfs.c | 8 ++++++++ > fs/ceph/file.c | 9 +++++++++ > fs/ceph/mds_client.c | 20 ++++++++++++++++++++ > fs/ceph/mds_client.h | 6 ++++++ > include/linux/ceph/osd_client.h | 3 ++- > net/ceph/osd_client.c | 9 ++++++++- > 7 files changed, 74 insertions(+), 4 deletions(-) > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > index 2a32f731f92a..b667ddaa6623 100644 > --- a/fs/ceph/addr.c > +++ b/fs/ceph/addr.c > @@ -598,12 +598,15 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) > loff_t page_off = page_offset(page); > int err, len = PAGE_SIZE; > struct ceph_writeback_ctl ceph_wbc; > + struct ceph_client_metric *metric; > + s64 latency; > > dout("writepage %p idx %lu\n", page, page->index); > > inode = page->mapping->host; > ci = ceph_inode(inode); > fsc = ceph_inode_to_client(inode); > + metric = &fsc->mdsc->metric; > > /* verify this is a writeable snap context */ > snapc = page_snap_context(page); > @@ -645,7 +648,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) > &ci->i_layout, snapc, page_off, len, > ceph_wbc.truncate_seq, > ceph_wbc.truncate_size, > - &inode->i_mtime, &page, 1); > + &inode->i_mtime, &page, 1, > + &latency); > + if (latency) > + ceph_mdsc_update_write_latency(metric, latency); > + > if (err < 0) { > struct writeback_control tmp_wbc; > if (!wbc) > @@ -707,6 +714,8 @@ static void writepages_finish(struct ceph_osd_request *req) > { > struct inode *inode = req->r_inode; > struct ceph_inode_info *ci = ceph_inode(inode); > + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > + struct ceph_client_metric *metric = &fsc->mdsc->metric; > struct ceph_osd_data *osd_data; > struct page *page; > int num_pages, total_pages = 0; > @@ -714,7 +723,6 @@ static void writepages_finish(struct ceph_osd_request *req) > int rc = req->r_result; > struct ceph_snap_context *snapc = req->r_snapc; > struct address_space *mapping = inode->i_mapping; > - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > bool remove_page; > > dout("writepages_finish %p rc %d\n", inode, rc); > @@ -783,6 +791,11 @@ static void writepages_finish(struct ceph_osd_request *req) > ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); > else > kfree(osd_data->pages); > + > + if (!rc) { > + s64 latency = jiffies - req->r_start_stamp; > + ceph_mdsc_update_write_latency(metric, latency); > + } > ceph_osdc_put_request(req); > } > > @@ -1675,6 +1688,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) > struct inode *inode = file_inode(filp); > struct ceph_inode_info *ci = ceph_inode(inode); > struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > + struct ceph_client_metric *metric = &fsc->mdsc->metric; > struct ceph_osd_request *req; > struct page *page = NULL; > u64 len, inline_version; > @@ -1786,6 +1800,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) > err = ceph_osdc_start_request(&fsc->client->osdc, req, false); > if (!err) > err = ceph_osdc_wait_request(&fsc->client->osdc, req); > + > + if (!err || err == -ETIMEDOUT) { > + s64 latency = jiffies - req->r_start_stamp; > + ceph_mdsc_update_write_latency(metric, latency); > + } > out_put: > ceph_osdc_put_request(req); > if (err == -ECANCELED) > diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c > index 8200bf025ccd..3fdb15af0a83 100644 > --- a/fs/ceph/debugfs.c > +++ b/fs/ceph/debugfs.c > @@ -142,6 +142,14 @@ static int metric_show(struct seq_file *s, void *p) > seq_printf(s, "%-14s%-12lld%-16lld%lld\n", "read", > total, sum / NSEC_PER_USEC, avg / NSEC_PER_USEC); > > + spin_lock(&mdsc->metric.write_lock); > + total = atomic64_read(&mdsc->metric.total_writes), > + sum = timespec64_to_ns(&mdsc->metric.write_latency_sum); > + spin_unlock(&mdsc->metric.write_lock); > + avg = total ? sum / total : 0; > + seq_printf(s, "%-14s%-12lld%-16lld%lld\n", "write", > + total, sum / NSEC_PER_USEC, avg / NSEC_PER_USEC); > + > seq_printf(s, "\n"); > seq_printf(s, "item total miss hit\n"); > seq_printf(s, "-------------------------------------------------\n"); > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index 797d4d224223..70530ac798ac 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -822,6 +822,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) > op = &req->r_ops[i]; > if (op->op == CEPH_OSD_OP_READ) > ceph_mdsc_update_read_latency(metric, latency); > + else if (op->op == CEPH_OSD_OP_WRITE && rc != -ENOENT) > + ceph_mdsc_update_write_latency(metric, latency); > } > } > > @@ -1075,6 +1077,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, > > if (!write) > ceph_mdsc_update_read_latency(metric, latency); > + else if (write && ret != -ENOENT) > + ceph_mdsc_update_write_latency(metric, latency); > } > > size = i_size_read(inode); > @@ -1163,6 +1167,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, > struct inode *inode = file_inode(file); > struct ceph_inode_info *ci = ceph_inode(inode); > struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > + struct ceph_client_metric *metric = &fsc->mdsc->metric; > struct ceph_vino vino; > struct ceph_osd_request *req; > struct page **pages; > @@ -1248,6 +1253,10 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, > if (!ret) > ret = ceph_osdc_wait_request(&fsc->client->osdc, req); > > + if (!ret || ret == -ETIMEDOUT) { > + s64 latency = jiffies - req->r_start_stamp; > + ceph_mdsc_update_write_latency(metric, latency); > + } > out: > ceph_osdc_put_request(req); > if (ret != 0) { > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index dc2cda55a5a5..2569f9303c0c 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -4112,6 +4112,22 @@ void ceph_mdsc_update_read_latency(struct ceph_client_metric *m, > spin_unlock(&m->read_lock); > } > > +void ceph_mdsc_update_write_latency(struct ceph_client_metric *m, > + s64 latency) > +{ > + struct timespec64 ts; > + > + if (!m) > + return; > + > + jiffies_to_timespec64(latency, &ts); > + > + spin_lock(&m->write_lock); > + atomic64_inc(&m->total_writes); > + m->write_latency_sum = timespec64_add(m->write_latency_sum, ts); > + spin_unlock(&m->write_lock); > +} > + > /* > * delayed work -- periodically trim expired leases, renew caps with mds > */ > @@ -4212,6 +4228,10 @@ static int ceph_mdsc_metric_init(struct ceph_client_metric *metric) > memset(&metric->read_latency_sum, 0, sizeof(struct timespec64)); > atomic64_set(&metric->total_reads, 0); > > + spin_lock_init(&metric->write_lock); > + memset(&metric->write_latency_sum, 0, sizeof(struct timespec64)); > + atomic64_set(&metric->total_writes, 0); > + > return 0; > } > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > index fee25b999c7c..0120357e7549 100644 > --- a/fs/ceph/mds_client.h > +++ b/fs/ceph/mds_client.h > @@ -370,6 +370,10 @@ struct ceph_client_metric { > spinlock_t read_lock; > atomic64_t total_reads; > struct timespec64 read_latency_sum; > + > + spinlock_t write_lock; > + atomic64_t total_writes; > + struct timespec64 write_latency_sum; Would percpu counters be better here? I mean it's not a _super_ high performance codepath, but it's nice when stats gathering has very little overhead. It'd take up a bit more space, but it's not that much, and there'd be no serialization between CPUs operating on different inodes. To be clear, the latency you're measuring is request time to the OSD? > }; > > /* > @@ -556,4 +560,6 @@ extern int ceph_trim_caps(struct ceph_mds_client *mdsc, > > extern void ceph_mdsc_update_read_latency(struct ceph_client_metric *m, > s64 latency); > +extern void ceph_mdsc_update_write_latency(struct ceph_client_metric *m, > + s64 latency); > #endif > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > index 43e4240d88e7..e73439d18f28 100644 > --- a/include/linux/ceph/osd_client.h > +++ b/include/linux/ceph/osd_client.h > @@ -524,7 +524,8 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, > u64 off, u64 len, > u32 truncate_seq, u64 truncate_size, > struct timespec64 *mtime, > - struct page **pages, int nr_pages); > + struct page **pages, int nr_pages, > + s64 *latency); > > int ceph_osdc_copy_from(struct ceph_osd_client *osdc, > u64 src_snapid, u64 src_version, > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c > index 62eb758f2474..9f6833ab733c 100644 > --- a/net/ceph/osd_client.c > +++ b/net/ceph/osd_client.c > @@ -5285,12 +5285,16 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, > u64 off, u64 len, > u32 truncate_seq, u64 truncate_size, > struct timespec64 *mtime, > - struct page **pages, int num_pages) > + struct page **pages, int num_pages, > + s64 *latency) > { > struct ceph_osd_request *req; > int rc = 0; > int page_align = off & ~PAGE_MASK; > > + if (latency) > + *latency = 0; > + > req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, > CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, > snapc, truncate_seq, truncate_size, > @@ -5308,6 +5312,9 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, > if (!rc) > rc = ceph_osdc_wait_request(osdc, req); > > + if (latency && (!rc || rc == -ETIMEDOUT)) > + *latency = jiffies - req->r_start_stamp; > + Are you concerned at all with scheduling delays? Note that you're doing the latency calculation here which occurs in the task that is woken by __complete_request. That won't necessarily wake up immediately on a busy machine, so this measurement will include that delay as well. I wonder if we ought to add a r_end_stamp field to the req instead, and grab jiffies in (e.g.) __complete_request. Then you could just fetch that out and do the math. > ceph_osdc_put_request(req); > if (rc == 0) > rc = len; Ditto here on my earlier comment in the earlier email. Let's just turn this into a ceph_osdc_writepages_start function and open-code the wait and latency calculation in writepage_nounlock(). -- Jeff Layton <jlayton@xxxxxxxxxx>