Below are the performance counters. Some of our scientific workflows trigger
this; parts of them are quite data intensive, processing thousands of files
over many hours to days. The 200GB ceph-fuse instance got there in about
3 days. I'm keeping the node alive for now in case we can extract some more
definitive info on what is happening there.
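In the meantime, something along these lines could be left running on the node
to line up the RSS growth with the buffer_anon pool and whatever jobs are
active at the time (just a rough sketch; it assumes jq is installed and uses
the same admin socket path as in the dumps below):

# Sample ceph-fuse RSS and the buffer_anon mempool every 10 minutes.
# Sketch only: jq is assumed to be available; asok path matches the dumps below.
while true; do
    date
    ps -C ceph-fuse -o rss=        # resident set size in KB
    ceph daemon /var/run/ceph/ceph-client.admin.asok dump_mempools \
        | jq '.buffer_anon'
    sleep 600
done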
Andras
# ceph daemon /var/run/ceph/ceph-client.admin.asok perf dump
{
    "AsyncMessenger::Worker-0": {
        "msgr_recv_messages": 37730,
        "msgr_send_messages": 37731,
        "msgr_recv_bytes": 1121379127,
        "msgr_send_bytes": 11913693154,
        "msgr_created_connections": 75333,
        "msgr_active_connections": 730,
        "msgr_running_total_time": 642.152166956,
        "msgr_running_send_time": 536.723862752,
        "msgr_running_recv_time": 25.429112242,
        "msgr_running_fast_dispatch_time": 63.814291954
    },
    "AsyncMessenger::Worker-1": {
        "msgr_recv_messages": 38507,
        "msgr_send_messages": 38467,
        "msgr_recv_bytes": 1240174043,
        "msgr_send_bytes": 11673685736,
        "msgr_created_connections": 75479,
        "msgr_active_connections": 729,
        "msgr_running_total_time": 628.670562086,
        "msgr_running_send_time": 523.772820969,
        "msgr_running_recv_time": 25.902871268,
        "msgr_running_fast_dispatch_time": 62.375965165
    },
    "AsyncMessenger::Worker-2": {
        "msgr_recv_messages": 597697,
        "msgr_send_messages": 504640,
        "msgr_recv_bytes": 1314713236,
        "msgr_send_bytes": 11880445442,
        "msgr_created_connections": 75338,
        "msgr_active_connections": 728,
        "msgr_running_total_time": 711.909282325,
        "msgr_running_send_time": 556.195748166,
        "msgr_running_recv_time": 127.267332682,
        "msgr_running_fast_dispatch_time": 62.209721085
    },
    "client": {
        "reply": {
            "avgcount": 236795,
            "sum": 6177.205536940,
            "avgtime": 0.026086722
        },
        "lat": {
            "avgcount": 236795,
            "sum": 6177.205536940,
            "avgtime": 0.026086722
        },
        "wrlat": {
            "avgcount": 857828153,
            "sum": 8413.835066735,
            "avgtime": 0.000009808
        }
    },
    "objectcacher-libcephfs": {
        "cache_ops_hit": 4160412,
        "cache_ops_miss": 4887,
        "cache_bytes_hit": 3247294145494,
        "cache_bytes_miss": 12914144260,
        "data_read": 48923557765,
        "data_written": 35292875783,
        "data_flushed": 35292681606,
        "data_overwritten_while_flushing": 0,
        "write_ops_blocked": 0,
        "write_bytes_blocked": 0,
        "write_time_blocked": 0.000000000
    },
    "objecter": {
        "op_active": 0,
        "op_laggy": 0,
        "op_send": 111268,
        "op_send_bytes": 35292681606,
        "op_resend": 0,
        "op_reply": 111268,
        "op": 111268,
        "op_r": 2193,
        "op_w": 109075,
        "op_rmw": 0,
        "op_pg": 0,
        "osdop_stat": 2,
        "osdop_create": 2,
        "osdop_read": 2193,
        "osdop_write": 109071,
        "osdop_writefull": 0,
        "osdop_writesame": 0,
        "osdop_append": 0,
        "osdop_zero": 0,
        "osdop_truncate": 0,
        "osdop_delete": 0,
        "osdop_mapext": 0,
        "osdop_sparse_read": 0,
        "osdop_clonerange": 0,
        "osdop_getxattr": 0,
        "osdop_setxattr": 0,
        "osdop_cmpxattr": 0,
        "osdop_rmxattr": 0,
        "osdop_resetxattrs": 0,
        "osdop_tmap_up": 0,
        "osdop_tmap_put": 0,
        "osdop_tmap_get": 0,
        "osdop_call": 0,
        "osdop_watch": 0,
        "osdop_notify": 0,
        "osdop_src_cmpxattr": 0,
        "osdop_pgls": 0,
        "osdop_pgls_filter": 0,
        "osdop_other": 0,
        "linger_active": 0,
        "linger_send": 0,
        "linger_resend": 0,
        "linger_ping": 0,
        "poolop_active": 0,
        "poolop_send": 0,
        "poolop_resend": 0,
        "poolstat_active": 0,
        "poolstat_send": 0,
        "poolstat_resend": 0,
        "statfs_active": 0,
        "statfs_send": 1348,
        "statfs_resend": 0,
        "command_active": 0,
        "command_send": 0,
        "command_resend": 0,
        "map_epoch": 1079783,
        "map_full": 0,
        "map_inc": 632,
        "osd_sessions": 2160,
        "osd_session_open": 226144,
        "osd_session_close": 223984,
        "osd_laggy": 0,
        "omap_wr": 0,
        "omap_rd": 0,
        "omap_del": 0
    },
    "throttle-msgr_dispatch_throttler-client": {
        "val": 0,
        "max": 104857600,
        "get_started": 0,
        "get": 673934,
        "get_sum": 3626395290,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 673934,
        "take": 0,
        "take_sum": 0,
        "put": 673934,
        "put_sum": 3626395290,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "throttle-objecter_bytes": {
        "val": 0,
        "max": 104857600,
        "get_started": 0,
        "get": 0,
        "get_sum": 0,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 0,
        "take": 111268,
        "take_sum": 38456168409,
        "put": 111264,
        "put_sum": 38456168409,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "throttle-objecter_ops": {
        "val": 0,
        "max": 1024,
        "get_started": 0,
        "get": 0,
        "get_sum": 0,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 0,
        "take": 111268,
        "take_sum": 111268,
        "put": 111268,
        "put_sum": 111268,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    }
}
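For convenience, the sections that look most relevant to the caching behaviour
can be pulled out of that dump with something like the following (again
assuming jq is available; the key names are exactly as in the JSON above):

# Extract just the ObjectCacher and client latency sections from the perf dump.
ceph daemon /var/run/ceph/ceph-client.admin.asok perf dump \
    | jq '{objectcacher: ."objectcacher-libcephfs", client: .client}'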
On 09/05/2018 10:00 AM, Sage Weil wrote:
On Wed, 5 Sep 2018, Andras Pataki wrote:
Hi cephers,
Every so often we have a ceph-fuse process that grows to a rather large size
(up to the point of eating up the whole memory of the machine). Here is an
example of a ceph-fuse instance with a 200GB RSS:
# ceph daemon /var/run/ceph/ceph-client.admin.asok dump_mempools
{
    "bloom_filter": {
        "items": 0,
        "bytes": 0
    },
    "bluestore_alloc": {
        "items": 0,
        "bytes": 0
    },
    "bluestore_cache_data": {
        "items": 0,
        "bytes": 0
    },
    "bluestore_cache_onode": {
        "items": 0,
        "bytes": 0
    },
    "bluestore_cache_other": {
        "items": 0,
        "bytes": 0
    },
    "bluestore_fsck": {
        "items": 0,
        "bytes": 0
    },
    "bluestore_txc": {
        "items": 0,
        "bytes": 0
    },
    "bluestore_writing_deferred": {
        "items": 0,
        "bytes": 0
    },
    "bluestore_writing": {
        "items": 0,
        "bytes": 0
    },
    "bluefs": {
        "items": 0,
        "bytes": 0
    },
    "buffer_anon": {
        "items": 51534897,
        "bytes": 207321872398
    },
    "buffer_meta": {
        "items": 64,
        "bytes": 5632
    },
    "osd": {
        "items": 0,
        "bytes": 0
    },
    "osd_mapbl": {
        "items": 0,
        "bytes": 0
    },
    "osd_pglog": {
        "items": 0,
        "bytes": 0
    },
    "osdmap": {
        "items": 28593,
        "bytes": 431872
    },
    "osdmap_mapping": {
        "items": 0,
        "bytes": 0
    },
    "pgmap": {
        "items": 0,
        "bytes": 0
    },
    "mds_co": {
        "items": 0,
        "bytes": 0
    },
    "unittest_1": {
        "items": 0,
        "bytes": 0
    },
    "unittest_2": {
        "items": 0,
        "bytes": 0
    },
    "total": {
        "items": 51563554,
        "bytes": 207322309902
    }
}
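As a quick sanity check on those numbers, buffer_anon works out to roughly 4KB
per item on average, i.e. about 50 million small buffers being held (this is
plain arithmetic on the two values above, nothing re-queried):

# Average buffer size in the buffer_anon pool, from the dump above:
awk 'BEGIN { printf "%.0f bytes per item\n", 207321872398 / 51534897 }'
# prints: 4023 bytes per item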
The general cache size looks like this (if it is helpful I can put a whole
cache dump somewhere):
# ceph daemon /var/run/ceph/ceph-client.admin.asok dump_cache | grep path | wc -l
84085
# ceph daemon /var/run/ceph/ceph-client.admin.asok dump_cache | grep name | wc -l
168186
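If I remember correctly, the client admin socket also has a 'status' command
that reports inode and dentry counts directly, which would be a lighter-weight
check than grepping a full cache dump (the command name and its output fields
are from memory, so treat this as a guess):

# Lighter-weight look at the client's metadata cache size (command from memory;
# I believe it reports inode_count and dentry_count among other fields).
ceph daemon /var/run/ceph/ceph-client.admin.asok status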
Any ideas what 'buffer_anon' is and what could be eating up the 200GB of
RAM?
buffer_anon is memory consumed by the bufferlist class that hasn't been
explicitly put into a separate mempool category. The question is
where/why are buffers getting pinned in memory. Can you dump the
perfcounters? That might give some hint.
My guess is a leak, or a problem with the ObjectCacher code that is
preventing it from trimming older buffers.
How reproducible is the situation? Any idea what workloads trigger it?
Thanks!
sage
We are running with a few ceph-fuse-specific parameters increased in ceph.conf:
# Description: Set the number of inodes that the client keeps in the metadata cache.
# Default: 16384
client_cache_size = 262144
# Description: Set the maximum number of dirty bytes in the object cache.
# Default: 104857600 (100MB)
client_oc_max_dirty = 536870912
# Description: Set the maximum number of objects in the object cache.
# Default: 1000
client_oc_max_objects = 8192
# Description: Set how many bytes of data the client will cache.
# Default: 209715200 (200 MB)
client_oc_size = 2147483640
# Description: Set the maximum number of bytes that the kernel reads ahead for
# future read operations. Overridden by the client_readahead_max_periods setting.
# Default: 0 (unlimited)
#client_readahead_max_bytes = 67108864
# Description: Set the number of file layout periods (object size * number of
# stripes) that the kernel reads ahead. Overrides the client_readahead_max_bytes setting.
# Default: 4
client_readahead_max_periods = 64
# Description: Set the minimum number of bytes that the kernel reads ahead.
# Default: 131072 (128KB)
client_readahead_min = 4194304
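For what it's worth, the values the long-running client actually picked up can
be double-checked over the admin socket with the generic 'config get' command
(same socket path as in the dumps above):

# Confirm the running ceph-fuse's view of the tuned options.
for opt in client_cache_size client_oc_size client_oc_max_dirty \
           client_oc_max_objects client_readahead_max_periods client_readahead_min; do
    ceph daemon /var/run/ceph/ceph-client.admin.asok config get "$opt"
done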
We are running a 12.2.7 ceph cluster, and the cluster is otherwise healthy.
Any hints would be appreciated. Thanks,
Andras