Re: Instrument librbd+qemu IO from hypervisor

Self-follow-up:

The Ceph version in the cluster I'm working on is 0.80.11, so quite old.

Adding:
  admin socket = /var/run/ceph/$cluster-$type.$id.$pid.$cctid.asok
  log file = /var/log/ceph/

to /etc/ceph/ceph.conf, and then in my case tweaking AppArmor (disabling
it for testing):
  service apparmor teardown
  service apparmor stop

Then shutting down a qemu VM:
  virsh shutdown $instance

Then restarting libvirt-bin:
  service libvirt-bin restart

Then starting the VM again:
  virsh start $instance
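
The sockets can then be queried along these lines (the exact socket name
follows the $cluster-$type.$id.$pid.$cctid template above, so check
/var/run/ceph/ for it; <socket-name> below is just a placeholder):
  ls /var/run/ceph/*.asok
  ceph --admin-daemon /var/run/ceph/<socket-name>.asok perf dump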

This allowed me to get at the perf dump data, which seems to contain
basically what I need for the moment:
{ "librbd--compute/a43efe1b-461a-4b54-923e-09c2e95da1ba_disk": { "rd": 0,
      "rd_bytes": 0,
      "rd_latency": { "avgcount": 0,
          "sum": 0.000000000},
      "wr": 0,
      "wr_bytes": 0,
      "wr_latency": { "avgcount": 0,
          "sum": 0.000000000},
      "discard": 0,
      "discard_bytes": 0,
      "discard_latency": { "avgcount": 0,
          "sum": 0.000000000},
      "flush": 9,
      "aio_rd": 4596,
      "aio_rd_bytes": 88915968,
      "aio_rd_latency": { "avgcount": 4596,
          "sum": 7.335787000},
      "aio_wr": 114,
      "aio_wr_bytes": 1438720,
      "aio_wr_latency": { "avgcount": 114,
          "sum": 0.011218000},
      "aio_discard": 0,
      "aio_discard_bytes": 0,
      "aio_discard_latency": { "avgcount": 0,
          "sum": 0.000000000},
      "aio_flush": 0,
      "aio_flush_latency": { "avgcount": 0,
          "sum": 0.000000000},
      "snap_create": 0,
      "snap_remove": 0,
      "snap_rollback": 0,
      "notify": 0,
      "resize": 0},
  "objectcacher-librbd--compute/a43efe1b-461a-4b54-923e-09c2e95da1ba_disk": { "cache_ops_hit": 114,
      "cache_ops_miss": 4458,
      "cache_bytes_hit": 24985600,
      "cache_bytes_miss": 88279552,
      "data_read": 88764416,
      "data_written": 1438720,
      "data_flushed": 1438720,
      "data_overwritten_while_flushing": 0,
      "write_ops_blocked": 0,
      "write_bytes_blocked": 0,
      "write_time_blocked": 0.000000000},
  "objecter": { "op_active": 0,
      "op_laggy": 0,
      "op_send": 4553,
      "op_send_bytes": 0,
      "op_resend": 0,
      "op_ack": 4552,
      "op_commit": 89,
      "op": 4553,
      "op_r": 4464,
      "op_w": 88,
      "op_rmw": 1,
      "op_pg": 0,
      "osdop_stat": 2,
      "osdop_create": 0,
      "osdop_read": 4458,
      "osdop_write": 88,
      "osdop_writefull": 0,
      "osdop_append": 0,
      "osdop_zero": 0,
      "osdop_truncate": 0,
      "osdop_delete": 0,
      "osdop_mapext": 0,
      "osdop_sparse_read": 0,
      "osdop_clonerange": 0,
      "osdop_getxattr": 0,
      "osdop_setxattr": 0,
      "osdop_cmpxattr": 0,
      "osdop_rmxattr": 0,
      "osdop_resetxattrs": 0,
      "osdop_tmap_up": 0,
      "osdop_tmap_put": 0,
      "osdop_tmap_get": 0,
      "osdop_call": 9,
      "osdop_watch": 1,
      "osdop_notify": 0,
      "osdop_src_cmpxattr": 0,
      "osdop_pgls": 0,
      "osdop_pgls_filter": 0,
      "osdop_other": 88,
      "linger_active": 1,
      "linger_send": 1,
      "linger_resend": 0,
      "poolop_active": 0,
      "poolop_send": 0,
      "poolop_resend": 0,
      "poolstat_active": 0,
      "poolstat_send": 0,
      "poolstat_resend": 0,
      "statfs_active": 0,
      "statfs_send": 0,
      "statfs_resend": 0,
      "command_active": 0,
      "command_send": 0,
      "command_resend": 0,
      "map_epoch": 0,
      "map_full": 0,
      "map_inc": 0,
      "osd_sessions": 7140,
      "osd_session_open": 119,
      "osd_session_close": 0,
      "osd_laggy": 1},
  "throttle-msgr_dispatch_throttler-radosclient": { "val": 0,
      "max": 104857600,
      "get": 4643,
      "get_sum": 89851514,
      "get_or_fail_fail": 0,
      "get_or_fail_success": 0,
      "take": 0,
      "take_sum": 0,
      "put": 4643,
      "put_sum": 89851514,
      "wait": { "avgcount": 0,
          "sum": 0.000000000}},
  "throttle-objecter_bytes": { "val": 0,
      "max": 104857600,
      "get": 4553,
      "get_sum": 89718272,
      "get_or_fail_fail": 0,
      "get_or_fail_success": 4553,
      "take": 0,
      "take_sum": 0,
      "put": 4546,
      "put_sum": 89718272,
      "wait": { "avgcount": 0,
          "sum": 0.000000000}},
  "throttle-objecter_ops": { "val": 0,
      "max": 1024,
      "get": 4553,
      "get_sum": 4553,
      "get_or_fail_fail": 0,
      "get_or_fail_success": 4553,
      "take": 0,
      "take_sum": 0,
      "put": 4553,
      "put_sum": 4553,
      "wait": { "avgcount": 0,
          "sum": 0.000000000}}}

Am I missing something here?

One thing I still need to figure out is how to adjust the AppArmor
profiles to allow this in enforcing mode.
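
My current (completely untested) idea is to add rules along these lines
to the libvirt-qemu abstraction (e.g. /etc/apparmor.d/abstractions/libvirt-qemu)
so the qemu processes may create and use the admin sockets and log files:
  # allow the librbd client inside qemu to write its admin sockets and logs
  /var/run/ceph/ rw,
  /var/run/ceph/** rw,
  /var/log/ceph/ rw,
  /var/log/ceph/** rw,
and then reload the profiles with something like:
  service apparmor reload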

Best,
Martin

On Thu, Mar 15, 2018 at 10:53:51AM +0100, Martin Millnert wrote:
> Dear fellow cephalopods,
> 
> does anyone have any pointers on how to instrument librbd as-driven-by
> qemu IO performance from a hypervisor?
> 
> Are there less intrusive ways than perf or equivalent? Can librbd be
> told to dump statistics somewhere (per volume) - clientside?
> 
> This would come in real handy whilst debugging potential performance
> issues troubling me.
> 
> Ideally I'd like to get per-volume metrics out that I can submit to
> InfluxDB for presentation in Grafana. But I'll take anything.
> 
> Best,
> Martin


_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
