On Sat, Apr 14, 2018 at 9:23 PM, Alexandre DERUMIER <aderumier@xxxxxxxxx> wrote: > Hi, > > Still leaking again after update to 12.2.4, around 17G after 9 days > > > > > USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND > > ceph 629903 50.7 25.9 17473680 17082432 ? Ssl avril05 6498:21 /usr/bin/ceph-mds -f --cluster ceph --id ceph4-1.odiso.net --setuser ceph --setgroup ceph > > > > > > ~# ceph daemon mds.ceph4-1.odiso.net cache status > { > "pool": { > "items": 16019302, > "bytes": 5100941968 > } > } > > > > > > # ceph daemon mds.ceph4-1.odiso.net perf dump > { > "AsyncMessenger::Worker-0": { > "msgr_recv_messages": 648541059, > "msgr_send_messages": 666102301, > "msgr_recv_bytes": 4943336751206, > "msgr_send_bytes": 868468165048, > "msgr_created_connections": 167, > "msgr_active_connections": 166, > "msgr_running_total_time": 33884.943400671, > "msgr_running_send_time": 12229.226645264, > "msgr_running_recv_time": 26234.680757843, > "msgr_running_fast_dispatch_time": 4650.248980986 > }, > "AsyncMessenger::Worker-1": { > "msgr_recv_messages": 732301444, > "msgr_send_messages": 750526966, > "msgr_recv_bytes": 4248782228635, > "msgr_send_bytes": 2379403291660, > "msgr_created_connections": 172, > "msgr_active_connections": 171, > "msgr_running_total_time": 38490.093448635, > "msgr_running_send_time": 14692.222019414, > "msgr_running_recv_time": 31000.304091618, > "msgr_running_fast_dispatch_time": 3945.573521893 > }, > "AsyncMessenger::Worker-2": { > "msgr_recv_messages": 503228767, > "msgr_send_messages": 485729577, > "msgr_recv_bytes": 3644656184942, > "msgr_send_bytes": 526380645708, > "msgr_created_connections": 156, > "msgr_active_connections": 156, > "msgr_running_total_time": 26566.051442840, > "msgr_running_send_time": 9335.249687474, > "msgr_running_recv_time": 22643.927960456, > "msgr_running_fast_dispatch_time": 3426.566334706 > }, > "finisher-PurgeQueue": { > "queue_len": 0, > "complete_latency": { > "avgcount": 2077128, > "sum": 10029.468276512, > 
"avgtime": 0.004828526 > } > }, > "mds": { > "request": 1320419754, > "reply": 1320418963, > "reply_latency": { > "avgcount": 1320418963, > "sum": 3567340.917522550, > "avgtime": 0.002701673 > }, > "forward": 0, > "dir_fetch": 95955541, > "dir_commit": 5380286, > "dir_split": 29080, > "dir_merge": 28453, > "inode_max": 2147483647, > "inodes": 2049324, > "inodes_top": 55759, > "inodes_bottom": 118910, > "inodes_pin_tail": 1874655, > "inodes_pinned": 1969667, > "inodes_expired": 14225864524, > "inodes_with_caps": 1969030, > "caps": 3010600, > "subtrees": 2, > "traverse": 1433042396, > "traverse_hit": 855810795, > "traverse_forward": 0, > "traverse_discover": 0, > "traverse_dir_fetch": 75553963, > "traverse_remote_ino": 5462, > "traverse_lock": 217, > "load_cent": 132079451933, > "q": 41, > "exported": 0, > "exported_inodes": 0, > "imported": 0, > "imported_inodes": 0 > }, > "mds_cache": { > "num_strays": 150, > "num_strays_delayed": 0, > "num_strays_enqueuing": 0, > "strays_created": 2317004, > "strays_enqueued": 2316671, > "strays_reintegrated": 288, > "strays_migrated": 0, > "num_recovering_processing": 0, > "num_recovering_enqueued": 0, > "num_recovering_prioritized": 0, > "recovery_started": 0, > "recovery_completed": 0, > "ireq_enqueue_scrub": 0, > "ireq_exportdir": 0, > "ireq_flush": 0, > "ireq_fragmentdir": 57533, > "ireq_fragstats": 0, > "ireq_inodestats": 0 > }, > "mds_log": { > "evadd": 293928039, > "evex": 293928281, > "evtrm": 293926233, > "ev": 26595, > "evexg": 0, > "evexd": 2048, > "segadd": 365381, > "segex": 365382, > "segtrm": 365380, > "seg": 32, > "segexg": 0, > "segexd": 2, > "expos": 4997676796422, > "wrpos": 4997732797135, > "rdpos": 4232612352311, > "jlat": { > "avgcount": 62629276, > "sum": 260619.838247062, > "avgtime": 0.004161310 > }, > "replayed": 24789 > }, > "mds_mem": { > "ino": 2048405, > "ino+": 14160488289, > "ino-": 14158439884, > "dir": 377882, > "dir+": 15421679, > "dir-": 15043797, > "dn": 2049614, > "dn+": 14231703198, > "dn-": 
14229653584, > "cap": 3010600, > "cap+": 1555206662, > "cap-": 1552196062, > "rss": 17082432, > "heap": 313916, > "buf": 0 > }, > "mds_server": { > "dispatch_client_request": 1437033326, > "dispatch_server_request": 0, > "handle_client_request": 1320419754, > "handle_client_session": 11542297, > "handle_slave_request": 0, > "req_create": 18618128, > "req_getattr": 11195570, > "req_getfilelock": 0, > "req_link": 411, > "req_lookup": 1005844421, > "req_lookuphash": 0, > "req_lookupino": 0, > "req_lookupname": 37344, > "req_lookupparent": 0, > "req_lookupsnap": 0, > "req_lssnap": 0, > "req_mkdir": 691747, > "req_mknod": 18, > "req_mksnap": 0, > "req_open": 230213054, > "req_readdir": 50618109, > "req_rename": 17377032, > "req_renamesnap": 0, > "req_rmdir": 463707, > "req_rmsnap": 0, > "req_rmxattr": 0, > "req_setattr": 1963949, > "req_setdirlayout": 0, > "req_setfilelock": 210187, > "req_setlayout": 0, > "req_setxattr": 8, > "req_symlink": 1971, > "req_unlink": 1801435 > }, > "mds_sessions": { > "session_count": 305, > "session_add": 473, > "session_remove": 168 > }, > "objecter": { > "op_active": 0, > "op_laggy": 0, > "op_send": 197270397, > "op_send_bytes": 796275884964, > "op_resend": 7, > "op_reply": 197270390, > "op": 197270390, > "op_r": 96075672, > "op_w": 101194718, > "op_rmw": 0, > "op_pg": 0, > "osdop_stat": 4428036, > "osdop_create": 19400797, > "osdop_read": 31288, > "osdop_write": 62709547, > "osdop_writefull": 165583, > "osdop_writesame": 0, > "osdop_append": 0, > "osdop_zero": 2, > "osdop_truncate": 13280, > "osdop_delete": 3185444, > "osdop_mapext": 0, > "osdop_sparse_read": 0, > "osdop_clonerange": 0, > "osdop_getxattr": 27007173, > "osdop_setxattr": 38801594, > "osdop_cmpxattr": 0, > "osdop_rmxattr": 0, > "osdop_resetxattrs": 0, > "osdop_tmap_up": 0, > "osdop_tmap_put": 0, > "osdop_tmap_get": 0, > "osdop_call": 0, > "osdop_watch": 0, > "osdop_notify": 0, > "osdop_src_cmpxattr": 0, > "osdop_pgls": 0, > "osdop_pgls_filter": 0, > "osdop_other": 
10143158, > "linger_active": 0, > "linger_send": 0, > "linger_resend": 0, > "linger_ping": 0, > "poolop_active": 0, > "poolop_send": 0, > "poolop_resend": 0, > "poolstat_active": 0, > "poolstat_send": 0, > "poolstat_resend": 0, > "statfs_active": 0, > "statfs_send": 0, > "statfs_resend": 0, > "command_active": 0, > "command_send": 0, > "command_resend": 0, > "map_epoch": 3044, > "map_full": 0, > "map_inc": 160, > "osd_sessions": 18, > "osd_session_open": 20, > "osd_session_close": 2, > "osd_laggy": 0, > "omap_wr": 9743114, > "omap_rd": 191911089, > "omap_del": 684272 > }, > "purge_queue": { > "pq_executing_ops": 0, > "pq_executing": 0, > "pq_executed": 2316671 > }, > "throttle-msgr_dispatch_throttler-mds": { > "val": 0, > "max": 104857600, > "get_started": 0, > "get": 1884071270, > "get_sum": 12697353890803, > "get_or_fail_fail": 0, > "get_or_fail_success": 1884071270, > "take": 0, > "take_sum": 0, > "put": 1884071270, > "put_sum": 12697353890803, > "wait": { > "avgcount": 0, > "sum": 0.000000000, > "avgtime": 0.000000000 > } > }, > "throttle-objecter_bytes": { > "val": 0, > "max": 104857600, > "get_started": 0, > "get": 0, > "get_sum": 0, > "get_or_fail_fail": 0, > "get_or_fail_success": 0, > "take": 197270390, > "take_sum": 796529593788, > "put": 183928495, > "put_sum": 796529593788, > "wait": { > "avgcount": 0, > "sum": 0.000000000, > "avgtime": 0.000000000 > } > }, > "throttle-objecter_ops": { > "val": 0, > "max": 1024, > "get_started": 0, > "get": 0, > "get_sum": 0, > "get_or_fail_fail": 0, > "get_or_fail_success": 0, > "take": 197270390, > "take_sum": 197270390, > "put": 197270390, > "put_sum": 197270390, > "wait": { > "avgcount": 0, > "sum": 0.000000000, > "avgtime": 0.000000000 > } > }, > "throttle-write_buf_throttle": { > "val": 0, > "max": 3758096384, > "get_started": 0, > "get": 2316671, > "get_sum": 215451035, > "get_or_fail_fail": 0, > "get_or_fail_success": 2316671, > "take": 0, > "take_sum": 0, > "put": 31223, > "put_sum": 215451035, > "wait": { > 
"avgcount": 0, > "sum": 0.000000000, > "avgtime": 0.000000000 > } > }, > "throttle-write_buf_throttle-0x563c33bea220": { > "val": 29763, > "max": 3758096384, > "get_started": 0, > "get": 293928039, > "get_sum": 765120443785, > "get_or_fail_fail": 0, > "get_or_fail_success": 293928039, > "take": 0, > "take_sum": 0, > "put": 62629276, > "put_sum": 765120414022, > "wait": { > "avgcount": 0, > "sum": 0.000000000, > "avgtime": 0.000000000 > } > } > } > I don't find any clue. Next time it happens, could you please try "ceph tell mds.xxx heap release" > > > # ceph status > cluster: > id: e22b8e83-3036-4fe5-8fd5-5ce9d539beca > health: HEALTH_OK > > services: > mon: 3 daemons, quorum ceph4-1,ceph4-2,ceph4-3 > mgr: ceph4-2.odiso.net(active), standbys: ceph4-3.odiso.net, ceph4-1.odiso.net > mds: cephfs4-1/1/1 up {0=ceph4-1.odiso.net=up:active}, 2 up:standby > osd: 18 osds: 18 up, 18 in > > data: > pools: 11 pools, 1992 pgs > objects: 72258k objects, 5918 GB > usage: 20088 GB used, 6737 GB / 26825 GB avail > pgs: 1992 active+clean > > io: > client: 3099 kB/s rd, 6412 kB/s wr, 108 op/s rd, 481 op/s wr > > > ----- Mail original ----- > De: "Patrick Donnelly" <pdonnell@xxxxxxxxxx> > À: "aderumier" <aderumier@xxxxxxxxx> > Cc: "ceph-users" <ceph-users@xxxxxxxxxxxxxx> > Envoyé: Mardi 27 Mars 2018 20:35:08 > Objet: Re: ceph mds memory usage 20GB : is it normal ? > > Hello Alexandre, > > On Thu, Mar 22, 2018 at 2:29 AM, Alexandre DERUMIER <aderumier@xxxxxxxxx> wrote: >> Hi, >> >> I've been running cephfs for 2 months now, >> >> and my active mds memory usage is around 20G now (still growing). >> >> ceph 1521539 10.8 31.2 20929836 20534868 ? Ssl janv.26 8573:34 /usr/bin/ceph-mds -f --cluster ceph --id 2 --setuser ceph --setgroup ceph >> USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND >> >> >> this is on luminous 12.2.2 >> >> only tuning done is: >> >> mds_cache_memory_limit = 5368709120 >> >> >> (5GB). I know it's a soft limit, but 20G seems quite huge vs 5GB .... 
>> >> >> Is it normal ? > > No, that's definitely not normal! > > >> # ceph daemon mds.2 perf dump mds >> { >> "mds": { >> "request": 1444009197, >> "reply": 1443999870, >> "reply_latency": { >> "avgcount": 1443999870, >> "sum": 1657849.656122933, >> "avgtime": 0.001148095 >> }, >> "forward": 0, >> "dir_fetch": 51740910, >> "dir_commit": 9069568, >> "dir_split": 64367, >> "dir_merge": 58016, >> "inode_max": 2147483647, >> "inodes": 2042975, >> "inodes_top": 152783, >> "inodes_bottom": 138781, >> "inodes_pin_tail": 1751411, >> "inodes_pinned": 1824714, >> "inodes_expired": 7258145573, >> "inodes_with_caps": 1812018, >> "caps": 2538233, >> "subtrees": 2, >> "traverse": 1591668547, >> "traverse_hit": 1259482170, >> "traverse_forward": 0, >> "traverse_discover": 0, >> "traverse_dir_fetch": 30827836, >> "traverse_remote_ino": 7510, >> "traverse_lock": 86236, >> "load_cent": 144401980319, >> "q": 49, >> "exported": 0, >> "exported_inodes": 0, >> "imported": 0, >> "imported_inodes": 0 >> } >> } > > Can you also share `ceph daemon mds.2 cache status`, the full `ceph > daemon mds.2 perf dump`, and `ceph status`? > > Note [1] will be in 12.2.5 and may help with your issue. > > [1] https://github.com/ceph/ceph/pull/20527 > > -- > Patrick Donnelly > > _______________________________________________ > ceph-users mailing list > ceph-users@xxxxxxxxxxxxxx > http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com