On 14.2.5 (but also present in Luminous), buffer_anon memory use spirals out of control when scanning many thousands of files. The use case is more or less "look up this file and, if it exists, append this chunk to it; otherwise create it with this chunk." The memory is recovered as soon as the workload stops, and at most only 20-100 files are ever open at one time. The cache gets oversized, but that's more or less expected; it's pretty much always/immediately in some warn state, which makes me wonder whether a much larger cache might help buffer_anon use. I'm looking for advice there. This is on a deeply-hashed directory, but overall very little data (<20GB), lots of tiny files. As I typed this post the pool went from ~60GB to ~110GB. I've resorted to a cronjob that restarts the active MDS when it reaches swap, just to keep the cluster alive. ~$ ceph daemon mds.mds1 dump_mempools { "mempool": { "by_pool": { "bloom_filter": { "items": 4631659, "bytes": 4631659 }, "bluestore_alloc": { "items": 0, "bytes": 0 }, "bluestore_cache_data": { "items": 0, "bytes": 0 }, "bluestore_cache_onode": { "items": 0, "bytes": 0 }, "bluestore_cache_other": { "items": 0, "bytes": 0 }, "bluestore_fsck": { "items": 0, "bytes": 0 }, "bluestore_txc": { "items": 0, "bytes": 0 }, "bluestore_writing_deferred": { "items": 0, "bytes": 0 }, "bluestore_writing": { "items": 0, "bytes": 0 }, "bluefs": { "items": 0, "bytes": 0 }, "buffer_anon": { "items": 67791, "bytes": 85598497506 }, "buffer_meta": { "items": 57987, "bytes": 5102856 }, "osd": { "items": 0, "bytes": 0 }, "osd_mapbl": { "items": 0, "bytes": 0 }, "osd_pglog": { "items": 0, "bytes": 0 }, "osdmap": { "items": 582, "bytes": 12248 }, "osdmap_mapping": { "items": 0, "bytes": 0 }, "pgmap": { "items": 0, "bytes": 0 }, "mds_co": { "items": 284739975, "bytes": 6883426437 }, "unittest_1": { "items": 0, "bytes": 0 }, "unittest_2": { "items": 0, "bytes": 0 } }, "total": { "items": 289497994, "bytes": 92491670706 } } } ~$ ceph daemon mds.mds0 perf dump { 
"AsyncMessenger::Worker-0": { "msgr_recv_messages": 1360700, "msgr_send_messages": 2298283, "msgr_recv_bytes": 17915475859, "msgr_send_bytes": 2024853049, "msgr_created_connections": 2031, "msgr_active_connections": 18446744073709552000, "msgr_running_total_time": 96.2125937, "msgr_running_send_time": 38.268843421, "msgr_running_recv_time": 44.299468018, "msgr_running_fast_dispatch_time": 17.303765523 }, "AsyncMessenger::Worker-1": { "msgr_recv_messages": 971844, "msgr_send_messages": 1266589, "msgr_recv_bytes": 14435001275, "msgr_send_bytes": 1755800874, "msgr_created_connections": 213, "msgr_active_connections": 18446744073709552000, "msgr_running_total_time": 60.745883284, "msgr_running_send_time": 17.694164502, "msgr_running_recv_time": 24.300171049, "msgr_running_fast_dispatch_time": 14.947038849 }, "AsyncMessenger::Worker-2": { "msgr_recv_messages": 1742305, "msgr_send_messages": 2163916, "msgr_recv_bytes": 30829094382, "msgr_send_bytes": 2915900257, "msgr_created_connections": 233, "msgr_active_connections": 18446744073709552000, "msgr_running_total_time": 137.913631549, "msgr_running_send_time": 41.234654308, "msgr_running_recv_time": 40.918463152, "msgr_running_fast_dispatch_time": 36.512891479 }, "cct": { "total_workers": 1, "unhealthy_workers": 0 }, "finisher-PurgeQueue": { "queue_len": 0, "complete_latency": { "avgcount": 47756, "sum": 217.373554326, "avgtime": 0.004551753 } }, "mds": { "request": 1178430, "reply": 1178373, "reply_latency": { "avgcount": 1178373, "sum": 60810.239426392, "avgtime": 0.051605255 }, "forward": 0, "dir_fetch": 49751, "dir_commit": 44312, "dir_split": 0, "dir_merge": 0, "inode_max": 100000, "inodes": 2759030, "inodes_top": 1919408, "inodes_bottom": 836395, "inodes_pin_tail": 3227, "inodes_pinned": 17019, "inodes_expired": 42387174, "inodes_with_caps": 5485, "caps": 11773, "subtrees": 2, "traverse": 1878329, "traverse_hit": 1675078, "traverse_forward": 0, "traverse_discover": 0, "traverse_dir_fetch": 42538, 
"traverse_remote_ino": 0, "traverse_lock": 25, "load_cent": 1294614, "q": 29, "exported": 0, "exported_inodes": 0, "imported": 0, "imported_inodes": 0, "openino_dir_fetch": 7277, "openino_backtrace_fetch": 1, "openino_peer_discover": 0, "root_rfiles": 31043731, "root_rbytes": 5791840170135, "root_rsnaps": 0 }, "mds_cache": { "num_strays": 400, "num_strays_delayed": 8, "num_strays_enqueuing": 0, "strays_created": 49534, "strays_enqueued": 49638, "strays_reintegrated": 0, "strays_migrated": 0, "num_recovering_processing": 0, "num_recovering_enqueued": 0, "num_recovering_prioritized": 0, "recovery_started": 1194, "recovery_completed": 1194, "ireq_enqueue_scrub": 0, "ireq_exportdir": 0, "ireq_flush": 0, "ireq_fragmentdir": 0, "ireq_fragstats": 0, "ireq_inodestats": 0 }, "mds_log": { "evadd": 1811605, "evex": 1809564, "evtrm": 1809564, "ev": 106369, "evexg": 0, "evexd": 2865, "segadd": 2244, "segex": 2244, "segtrm": 2244, "seg": 130, "segexg": 0, "segexd": 3, "expos": 4457957952634, "wrpos": 4458177431234, "rdpos": 4454374251644, "jlat": { "avgcount": 1041651, "sum": 65486.869073583, "avgtime": 0.062868339 }, "replayed": 104328 }, "mds_mem": { "ino": 2759033, "ino+": 45094333, "ino-": 42335300, "dir": 6317, "dir+": 8300, "dir-": 1983, "dn": 2761351, "dn+": 45205560, "dn-": 42444209, "cap": 11773, "cap+": 1191966, "cap-": 1180193, "rss": 67989140, "heap": 330432 }, "mds_server": { "dispatch_client_request": 1816885, "dispatch_server_request": 0, "handle_client_request": 1178430, "handle_client_session": 11175, "handle_slave_request": 0, "req_create_latency": { "avgcount": 52013, "sum": 1098.436606927, "avgtime": 0.021118501 }, "req_getattr_latency": { "avgcount": 48725, "sum": 1867.479634967, "avgtime": 0.038326929 }, "req_getfilelock_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_link_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_lookup_latency": { "avgcount": 397795, "sum": 8410.812821606, "avgtime": 0.021143586 }, "req_lookuphash_latency": { 
"avgcount": 0, "sum": 0, "avgtime": 0 }, "req_lookupino_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_lookupname_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_lookupparent_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_lookupsnap_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_lssnap_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_mkdir_latency": { "avgcount": 108, "sum": 6.164358676, "avgtime": 0.057077395 }, "req_mknod_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_mksnap_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_open_latency": { "avgcount": 287701, "sum": 8356.203112022, "avgtime": 0.029044748 }, "req_readdir_latency": { "avgcount": 7727, "sum": 158.295126355, "avgtime": 0.020485974 }, "req_rename_latency": { "avgcount": 11832, "sum": 354.415798014, "avgtime": 0.029954005 }, "req_renamesnap_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_rmdir_latency": { "avgcount": 151, "sum": 11.192303283, "avgtime": 0.074121213 }, "req_rmsnap_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_rmxattr_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_setattr_latency": { "avgcount": 278323, "sum": 39304.420644246, "avgtime": 0.14121873 }, "req_setdirlayout_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_setfilelock_latency": { "avgcount": 44572, "sum": 292.346143916, "avgtime": 0.006558964 }, "req_setlayout_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_setxattr_latency": { "avgcount": 7, "sum": 0.015024808, "avgtime": 0.002146401 }, "req_symlink_latency": { "avgcount": 0, "sum": 0, "avgtime": 0 }, "req_unlink_latency": { "avgcount": 49419, "sum": 950.457851572, "avgtime": 0.01923264 }, "cap_revoke_eviction": 0 }, "mds_sessions": { "session_count": 221, "session_add": 221, "session_remove": 0, "sessions_open": 0, "sessions_stale": 0, "total_load": 116764, "average_load": 528, "avg_session_uptime": 3708284 }, "mempool": { "bloom_filter_bytes": 4120998, 
"bloom_filter_items": 4120998, "bluestore_alloc_bytes": 0, "bluestore_alloc_items": 0, "bluestore_cache_data_bytes": 0, "bluestore_cache_data_items": 0, "bluestore_cache_onode_bytes": 0, "bluestore_cache_onode_items": 0, "bluestore_cache_other_bytes": 0, "bluestore_cache_other_items": 0, "bluestore_fsck_bytes": 0, "bluestore_fsck_items": 0, "bluestore_txc_bytes": 0, "bluestore_txc_items": 0, "bluestore_writing_deferred_bytes": 0, "bluestore_writing_deferred_items": 0, "bluestore_writing_bytes": 0, "bluestore_writing_items": 0, "bluefs_bytes": 0, "bluefs_items": 0, "buffer_anon_bytes": 61380965872, "buffer_anon_items": 50480, "buffer_meta_bytes": 3808640, "buffer_meta_items": 43280, "osd_bytes": 0, "osd_items": 0, "osd_mapbl_bytes": 0, "osd_mapbl_items": 0, "osd_pglog_bytes": 0, "osd_pglog_items": 0, "osdmap_bytes": 12248, "osdmap_items": 582, "osdmap_mapping_bytes": 0, "osdmap_mapping_items": 0, "pgmap_bytes": 0, "pgmap_items": 0, "mds_co_bytes": 6996083297, "mds_co_items": 288521188, "unittest_1_bytes": 0, "unittest_1_items": 0, "unittest_2_bytes": 0, "unittest_2_items": 0 }, "objecter": { "op_active": 72, "op_laggy": 0, "op_send": 1490006, "op_send_bytes": 4115392592, "op_resend": 0, "op_reply": 1489934, "op": 1490006, "op_r": 52212, "op_w": 1437794, "op_rmw": 0, "op_pg": 0, "osdop_stat": 43160, "osdop_create": 14114, "osdop_read": 1127, "osdop_write": 1042786, "osdop_writefull": 1366, "osdop_writesame": 0, "osdop_append": 0, "osdop_zero": 2, "osdop_truncate": 0, "osdop_delete": 55542, "osdop_mapext": 0, "osdop_sparse_read": 0, "osdop_clonerange": 0, "osdop_getxattr": 37965, "osdop_setxattr": 28228, "osdop_cmpxattr": 0, "osdop_rmxattr": 0, "osdop_resetxattrs": 0, "osdop_call": 0, "osdop_watch": 0, "osdop_notify": 0, "osdop_src_cmpxattr": 0, "osdop_pgls": 0, "osdop_pgls_filter": 0, "osdop_other": 278315, "linger_active": 0, "linger_send": 0, "linger_resend": 0, "linger_ping": 0, "poolop_active": 0, "poolop_send": 0, "poolop_resend": 0, "poolstat_active": 0, 
"poolstat_send": 0, "poolstat_resend": 0, "statfs_active": 0, "statfs_send": 0, "statfs_resend": 0, "command_active": 0, "command_send": 0, "command_resend": 0, "map_epoch": 0, "map_full": 0, "map_inc": 0, "osd_sessions": 33, "osd_session_open": 33, "osd_session_close": 0, "osd_laggy": 0, "omap_wr": 73583, "omap_rd": 99506, "omap_del": 20167 }, "purge_queue": { "pq_executing_ops": 0, "pq_executing": 0, "pq_executed": 49638 }, "throttle-msgr_dispatch_throttler-mds": { "val": 1054, "max": 104857600, "get_started": 0, "get": 4074846, "get_sum": 62878031734, "get_or_fail_fail": 0, "get_or_fail_success": 4074846, "take": 0, "take_sum": 0, "put": 4074840, "put_sum": 62878030680, "wait": { "avgcount": 0, "sum": 0, "avgtime": 0 } }, "throttle-objecter_bytes": { "val": 332944, "max": 104857600, "get_started": 0, "get": 0, "get_sum": 0, "get_or_fail_fail": 0, "get_or_fail_success": 0, "take": 1490006, "take_sum": 4341973304, "put": 1142949, "put_sum": 4341640360, "wait": { "avgcount": 0, "sum": 0, "avgtime": 0 } }, "throttle-objecter_ops": { "val": 72, "max": 1024, "get_started": 0, "get": 0, "get_sum": 0, "get_or_fail_fail": 0, "get_or_fail_success": 0, "take": 1490006, "take_sum": 1490006, "put": 1489934, "put_sum": 1489934, "wait": { "avgcount": 0, "sum": 0, "avgtime": 0 } }, "throttle-write_buf_throttle": { "val": 0, "max": 3758096384, "get_started": 0, "get": 49638, "get_sum": 5013438, "get_or_fail_fail": 0, "get_or_fail_success": 49638, "take": 0, "take_sum": 0, "put": 1066, "put_sum": 5013438, "wait": { "avgcount": 0, "sum": 0, "avgtime": 0 } }, "throttle-write_buf_throttle-0x2c2df40": { "val": 0, "max": 3758096384, "get_started": 0, "get": 1811605, "get_sum": 3803177386, "get_or_fail_fail": 0, "get_or_fail_success": 1811605, "take": 0, "take_sum": 0, "put": 1041720, "put_sum": 3803177386, "wait": { "avgcount": 0, "sum": 0, "avgtime": 0 } } } _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx 
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com