Hi there, maybe you could be so kind as to help me with the following issue: We are running Ceph FS, but there is a recurring problem with the MDS. Sometimes the following error occurs: "mds0: Client 701782 failing to respond to capability release". Listing the session information shows that "num_caps" on that client is much higher than on the other clients (see also the attachment). The problem is that the load on one of the servers increases to a really high value (80 to 100), regardless of which client is being reported. I guess part of my problem is also that I don't really understand the meaning of those "capabilities". Here are the facts (let me know if you need more):
|
##### CEPH FS ERROR 09:33:30 PROD root@ceph01:~# ceph -s cluster xxxxxxxxxxxxxxxxxxxxxxx health HEALTH_WARN mds0: Client 701782 failing to respond to capability release monmap e1: 3 mons at {ceph01=xx.xx.xx.114:6789/0,ceph02=xx.xx.xx.115:6789/0,ceph03=xx.xx.xx.116:6789/0} election epoch 106, quorum 0,1,2 ceph01,ceph02,ceph03 mdsmap e260: 1/1/1 up {0=ceph01=up:active}, 2 up:standby ..... -> Load raises immedtiatly 09:33:32 PROD root@ceph01:~# ceph daemon mds.ceph01 session ls [ { "id": 701782, "num_leases": 16, "num_caps": 221397, "state": "open", "replay_requests": 0, "reconnecting": false, "inst": "client.701782 xx.xx.xx.114:0\/1344307356", "client_metadata": {} }, { "id": 692103, "num_leases": 1, "num_caps": 50115, "state": "open", "replay_requests": 0, "reconnecting": false, "inst": "client.692103 xx.xx.xx.117:0\/3600471798", "client_metadata": {} }, { "id": 691995, "num_leases": 2, "num_caps": 53227, "state": "open", "replay_requests": 0, "reconnecting": false, "inst": "client.691995 xx.xx.xx.115:0\/1220606159", "client_metadata": {} }, { "id": 692058, "num_leases": 8, "num_caps": 49722, "state": "open", "replay_requests": 0, "reconnecting": false, "inst": "client.692058 xx.xx.xx.116:0\/4048537076", "client_metadata": {} } ] 09:38:18 PROD root@ceph01:~# ceph daemon mds.ceph01 perf dump { "mds": { "request": 1387754, "reply": 1387696, "reply_latency": { "avgcount": 1387696, "sum": 6439.991891758 }, "forward": 0, "dir_fetch": 57946, "dir_commit": 35053, "dir_split": 0, "inode_max": 5000000, "inodes": 1116643, "inodes_top": 837156, "inodes_bottom": 279487, "inodes_pin_tail": 0, "inodes_pinned": 292936, "inodes_expired": 0, "inodes_with_caps": 269668, "caps": 374718, "subtrees": 2, "traverse": 2591500, "traverse_hit": 2492810, "traverse_forward": 0, "traverse_discover": 0, "traverse_dir_fetch": 19330, "traverse_remote_ino": 0, "traverse_lock": 2350, "load_cent": 138774897, "q": 0, "exported": 0, "exported_inodes": 0, "imported": 0, "imported_inodes": 0 }, 
"mds_cache": { "num_strays": 56, "num_strays_purging": 0, "num_strays_delayed": 0, "strays_created": 2835, "strays_purged": 2802, "num_recovering_processing": 0, "num_recovering_enqueued": 0, "num_recovering_prioritized": 0, "recovery_started": 0, "recovery_completed": 0 }, "mds_log": { "evadd": 376174, "evex": 377829, "evtrm": 377829, "ev": 13815, "evexg": 0, "evexd": 1024, "segadd": 738, "segex": 738, "segtrm": 738, "seg": 31, "segexg": 0, "segexd": 1, "expos": 6882857746, "wrpos": 6991387600, "rdpos": 4859818564, "jlat": 0 }, "mds_mem": { "ino": 1112733, "ino+": 1115537, "ino-": 2804, "dir": 66813, "dir+": 67017, "dir-": 204, "dn": 1116643, "dn+": 1121224, "dn-": 4581, "cap": 374718, "cap+": 1005845, "cap-": 631127, "rss": 6992420, "heap": 49060, "malloc": 18446744073708021059, "buf": 0 }, "mds_server": { "handle_client_request": 1387754, "handle_slave_request": 0, "handle_client_session": 80950, "dispatch_client_request": 2526245, "dispatch_server_request": 0 }, "objecter": { "op_active": 0, "op_laggy": 0, "op_send": 567467, "op_send_bytes": 0, "op_resend": 0, "op_ack": 283387, "op_commit": 284080, "op": 567467, "op_r": 283387, "op_w": 284080, "op_rmw": 0, "op_pg": 0, "osdop_stat": 24703, "osdop_create": 40923, "osdop_read": 24, "osdop_write": 186035, "osdop_writefull": 17341, "osdop_append": 0, "osdop_zero": 1, "osdop_truncate": 0, "osdop_delete": 4721, "osdop_mapext": 0, "osdop_sparse_read": 0, "osdop_clonerange": 0, "osdop_getxattr": 283361, "osdop_setxattr": 40923, "osdop_cmpxattr": 0, "osdop_rmxattr": 0, "osdop_resetxattrs": 0, "osdop_tmap_up": 0, "osdop_tmap_put": 0, "osdop_tmap_get": 0, "osdop_call": 0, "osdop_watch": 0, "osdop_notify": 0, "osdop_src_cmpxattr": 0, "osdop_pgls": 0, "osdop_pgls_filter": 0, "osdop_other": 221016, "linger_active": 0, "linger_send": 0, "linger_resend": 0, "linger_ping": 0, "poolop_active": 0, "poolop_send": 0, "poolop_resend": 0, "poolstat_active": 0, "poolstat_send": 0, "poolstat_resend": 0, "statfs_active": 0, 
"statfs_send": 0, "statfs_resend": 0, "command_active": 0, "command_send": 0, "command_resend": 0, "map_epoch": 1025, "map_full": 0, "map_inc": 2, "osd_sessions": 210, "osd_session_open": 3589, "osd_session_close": 3569, "osd_laggy": 0 }, "throttle-msgr_dispatch_throttler-mds": { "val": 0, "max": 104857600, "get": 6198403, "get_sum": 1661221790, "get_or_fail_fail": 0, "get_or_fail_success": 0, "take": 0, "take_sum": 0, "put": 6198403, "put_sum": 1661221790, "wait": { "avgcount": 0, "sum": 0.000000000 } }, "throttle-objecter_bytes": { "val": 0, "max": 104857600, "get": 0, "get_sum": 0, "get_or_fail_fail": 0, "get_or_fail_success": 0, "take": 567467, "take_sum": 2263639983, "put": 562730, "put_sum": 2263639983, "wait": { "avgcount": 0, "sum": 0.000000000 } }, "throttle-objecter_ops": { "val": 0, "max": 1024, "get": 0, "get_sum": 0, "get_or_fail_fail": 0, "get_or_fail_success": 0, "take": 567467, "take_sum": 567467, "put": 567467, "put_sum": 567467, "wait": { "avgcount": 0, "sum": 0.000000000 } } }
Attachment:
smime.p7s
Description: S/MIME Cryptographic Signature
_______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com