MDS: obscene buffer_anon memory use when scanning lots of files

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 14.2.5 (and also in Luminous), MDS buffer_anon memory use spirals
out of control when scanning many thousands of files. The use case is
more or less "look up this file and if it exists append this chunk to
it, otherwise create it with this chunk." The memory is recovered as
soon as the workload stops, and no more than 20-100 files are ever
open at one time.

The cache gets oversized, but that's more or less expected; it's
pretty much always (and immediately) in some warn state. That makes me
wonder whether a much larger cache might help with buffer_anon use, and
I'm looking for advice there. This is on a deeply-hashed directory, but
with very little data overall (<20GB) and lots of tiny files.

As I typed this post, the pool went from ~60GB to ~110GB. I've resorted
to a cronjob that restarts the active MDS when it starts swapping, just
to keep the cluster alive.

~$ ceph daemon mds.mds1 dump_mempools
{
  "mempool": {
    "by_pool": {
      "bloom_filter": {
        "items": 4631659,
        "bytes": 4631659
      },
      "bluestore_alloc": {
        "items": 0,
        "bytes": 0
      },
      "bluestore_cache_data": {
        "items": 0,
        "bytes": 0
      },
      "bluestore_cache_onode": {
        "items": 0,
        "bytes": 0
      },
      "bluestore_cache_other": {
        "items": 0,
        "bytes": 0
      },
      "bluestore_fsck": {
        "items": 0,
        "bytes": 0
      },
      "bluestore_txc": {
        "items": 0,
        "bytes": 0
      },
      "bluestore_writing_deferred": {
        "items": 0,
        "bytes": 0
      },
      "bluestore_writing": {
        "items": 0,
        "bytes": 0
      },
      "bluefs": {
        "items": 0,
        "bytes": 0
      },
      "buffer_anon": {
        "items": 67791,
        "bytes": 85598497506
      },
      "buffer_meta": {
        "items": 57987,
        "bytes": 5102856
      },
      "osd": {
        "items": 0,
        "bytes": 0
      },
      "osd_mapbl": {
        "items": 0,
        "bytes": 0
      },
      "osd_pglog": {
        "items": 0,
        "bytes": 0
      },
      "osdmap": {
        "items": 582,
        "bytes": 12248
      },
      "osdmap_mapping": {
        "items": 0,
        "bytes": 0
      },
      "pgmap": {
        "items": 0,
        "bytes": 0
      },
      "mds_co": {
        "items": 284739975,
        "bytes": 6883426437
      },
      "unittest_1": {
        "items": 0,
        "bytes": 0
      },
      "unittest_2": {
        "items": 0,
        "bytes": 0
      }
    },
    "total": {
      "items": 289497994,
      "bytes": 92491670706
    }
  }
}


~$ ceph daemon mds.mds0 perf dump
{
  "AsyncMessenger::Worker-0": {
    "msgr_recv_messages": 1360700,
    "msgr_send_messages": 2298283,
    "msgr_recv_bytes": 17915475859,
    "msgr_send_bytes": 2024853049,
    "msgr_created_connections": 2031,
    "msgr_active_connections": 18446744073709552000,
    "msgr_running_total_time": 96.2125937,
    "msgr_running_send_time": 38.268843421,
    "msgr_running_recv_time": 44.299468018,
    "msgr_running_fast_dispatch_time": 17.303765523
  },
  "AsyncMessenger::Worker-1": {
    "msgr_recv_messages": 971844,
    "msgr_send_messages": 1266589,
    "msgr_recv_bytes": 14435001275,
    "msgr_send_bytes": 1755800874,
    "msgr_created_connections": 213,
    "msgr_active_connections": 18446744073709552000,
    "msgr_running_total_time": 60.745883284,
    "msgr_running_send_time": 17.694164502,
    "msgr_running_recv_time": 24.300171049,
    "msgr_running_fast_dispatch_time": 14.947038849
  },
  "AsyncMessenger::Worker-2": {
    "msgr_recv_messages": 1742305,
    "msgr_send_messages": 2163916,
    "msgr_recv_bytes": 30829094382,
    "msgr_send_bytes": 2915900257,
    "msgr_created_connections": 233,
    "msgr_active_connections": 18446744073709552000,
    "msgr_running_total_time": 137.913631549,
    "msgr_running_send_time": 41.234654308,
    "msgr_running_recv_time": 40.918463152,
    "msgr_running_fast_dispatch_time": 36.512891479
  },
  "cct": {
    "total_workers": 1,
    "unhealthy_workers": 0
  },
  "finisher-PurgeQueue": {
    "queue_len": 0,
    "complete_latency": {
      "avgcount": 47756,
      "sum": 217.373554326,
      "avgtime": 0.004551753
    }
  },
  "mds": {
    "request": 1178430,
    "reply": 1178373,
    "reply_latency": {
      "avgcount": 1178373,
      "sum": 60810.239426392,
      "avgtime": 0.051605255
    },
    "forward": 0,
    "dir_fetch": 49751,
    "dir_commit": 44312,
    "dir_split": 0,
    "dir_merge": 0,
    "inode_max": 100000,
    "inodes": 2759030,
    "inodes_top": 1919408,
    "inodes_bottom": 836395,
    "inodes_pin_tail": 3227,
    "inodes_pinned": 17019,
    "inodes_expired": 42387174,
    "inodes_with_caps": 5485,
    "caps": 11773,
    "subtrees": 2,
    "traverse": 1878329,
    "traverse_hit": 1675078,
    "traverse_forward": 0,
    "traverse_discover": 0,
    "traverse_dir_fetch": 42538,
    "traverse_remote_ino": 0,
    "traverse_lock": 25,
    "load_cent": 1294614,
    "q": 29,
    "exported": 0,
    "exported_inodes": 0,
    "imported": 0,
    "imported_inodes": 0,
    "openino_dir_fetch": 7277,
    "openino_backtrace_fetch": 1,
    "openino_peer_discover": 0,
    "root_rfiles": 31043731,
    "root_rbytes": 5791840170135,
    "root_rsnaps": 0
  },
  "mds_cache": {
    "num_strays": 400,
    "num_strays_delayed": 8,
    "num_strays_enqueuing": 0,
    "strays_created": 49534,
    "strays_enqueued": 49638,
    "strays_reintegrated": 0,
    "strays_migrated": 0,
    "num_recovering_processing": 0,
    "num_recovering_enqueued": 0,
    "num_recovering_prioritized": 0,
    "recovery_started": 1194,
    "recovery_completed": 1194,
    "ireq_enqueue_scrub": 0,
    "ireq_exportdir": 0,
    "ireq_flush": 0,
    "ireq_fragmentdir": 0,
    "ireq_fragstats": 0,
    "ireq_inodestats": 0
  },
  "mds_log": {
    "evadd": 1811605,
    "evex": 1809564,
    "evtrm": 1809564,
    "ev": 106369,
    "evexg": 0,
    "evexd": 2865,
    "segadd": 2244,
    "segex": 2244,
    "segtrm": 2244,
    "seg": 130,
    "segexg": 0,
    "segexd": 3,
    "expos": 4457957952634,
    "wrpos": 4458177431234,
    "rdpos": 4454374251644,
    "jlat": {
      "avgcount": 1041651,
      "sum": 65486.869073583,
      "avgtime": 0.062868339
    },
    "replayed": 104328
  },
  "mds_mem": {
    "ino": 2759033,
    "ino+": 45094333,
    "ino-": 42335300,
    "dir": 6317,
    "dir+": 8300,
    "dir-": 1983,
    "dn": 2761351,
    "dn+": 45205560,
    "dn-": 42444209,
    "cap": 11773,
    "cap+": 1191966,
    "cap-": 1180193,
    "rss": 67989140,
    "heap": 330432
  },
  "mds_server": {
    "dispatch_client_request": 1816885,
    "dispatch_server_request": 0,
    "handle_client_request": 1178430,
    "handle_client_session": 11175,
    "handle_slave_request": 0,
    "req_create_latency": {
      "avgcount": 52013,
      "sum": 1098.436606927,
      "avgtime": 0.021118501
    },
    "req_getattr_latency": {
      "avgcount": 48725,
      "sum": 1867.479634967,
      "avgtime": 0.038326929
    },
    "req_getfilelock_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_link_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_lookup_latency": {
      "avgcount": 397795,
      "sum": 8410.812821606,
      "avgtime": 0.021143586
    },
    "req_lookuphash_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_lookupino_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_lookupname_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_lookupparent_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_lookupsnap_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_lssnap_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_mkdir_latency": {
      "avgcount": 108,
      "sum": 6.164358676,
      "avgtime": 0.057077395
    },
    "req_mknod_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_mksnap_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_open_latency": {
      "avgcount": 287701,
      "sum": 8356.203112022,
      "avgtime": 0.029044748
    },
    "req_readdir_latency": {
      "avgcount": 7727,
      "sum": 158.295126355,
      "avgtime": 0.020485974
    },
    "req_rename_latency": {
      "avgcount": 11832,
      "sum": 354.415798014,
      "avgtime": 0.029954005
    },
    "req_renamesnap_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_rmdir_latency": {
      "avgcount": 151,
      "sum": 11.192303283,
      "avgtime": 0.074121213
    },
    "req_rmsnap_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_rmxattr_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_setattr_latency": {
      "avgcount": 278323,
      "sum": 39304.420644246,
      "avgtime": 0.14121873
    },
    "req_setdirlayout_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_setfilelock_latency": {
      "avgcount": 44572,
      "sum": 292.346143916,
      "avgtime": 0.006558964
    },
    "req_setlayout_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_setxattr_latency": {
      "avgcount": 7,
      "sum": 0.015024808,
      "avgtime": 0.002146401
    },
    "req_symlink_latency": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    },
    "req_unlink_latency": {
      "avgcount": 49419,
      "sum": 950.457851572,
      "avgtime": 0.01923264
    },
    "cap_revoke_eviction": 0
  },
  "mds_sessions": {
    "session_count": 221,
    "session_add": 221,
    "session_remove": 0,
    "sessions_open": 0,
    "sessions_stale": 0,
    "total_load": 116764,
    "average_load": 528,
    "avg_session_uptime": 3708284
  },
  "mempool": {
    "bloom_filter_bytes": 4120998,
    "bloom_filter_items": 4120998,
    "bluestore_alloc_bytes": 0,
    "bluestore_alloc_items": 0,
    "bluestore_cache_data_bytes": 0,
    "bluestore_cache_data_items": 0,
    "bluestore_cache_onode_bytes": 0,
    "bluestore_cache_onode_items": 0,
    "bluestore_cache_other_bytes": 0,
    "bluestore_cache_other_items": 0,
    "bluestore_fsck_bytes": 0,
    "bluestore_fsck_items": 0,
    "bluestore_txc_bytes": 0,
    "bluestore_txc_items": 0,
    "bluestore_writing_deferred_bytes": 0,
    "bluestore_writing_deferred_items": 0,
    "bluestore_writing_bytes": 0,
    "bluestore_writing_items": 0,
    "bluefs_bytes": 0,
    "bluefs_items": 0,
    "buffer_anon_bytes": 61380965872,
    "buffer_anon_items": 50480,
    "buffer_meta_bytes": 3808640,
    "buffer_meta_items": 43280,
    "osd_bytes": 0,
    "osd_items": 0,
    "osd_mapbl_bytes": 0,
    "osd_mapbl_items": 0,
    "osd_pglog_bytes": 0,
    "osd_pglog_items": 0,
    "osdmap_bytes": 12248,
    "osdmap_items": 582,
    "osdmap_mapping_bytes": 0,
    "osdmap_mapping_items": 0,
    "pgmap_bytes": 0,
    "pgmap_items": 0,
    "mds_co_bytes": 6996083297,
    "mds_co_items": 288521188,
    "unittest_1_bytes": 0,
    "unittest_1_items": 0,
    "unittest_2_bytes": 0,
    "unittest_2_items": 0
  },
  "objecter": {
    "op_active": 72,
    "op_laggy": 0,
    "op_send": 1490006,
    "op_send_bytes": 4115392592,
    "op_resend": 0,
    "op_reply": 1489934,
    "op": 1490006,
    "op_r": 52212,
    "op_w": 1437794,
    "op_rmw": 0,
    "op_pg": 0,
    "osdop_stat": 43160,
    "osdop_create": 14114,
    "osdop_read": 1127,
    "osdop_write": 1042786,
    "osdop_writefull": 1366,
    "osdop_writesame": 0,
    "osdop_append": 0,
    "osdop_zero": 2,
    "osdop_truncate": 0,
    "osdop_delete": 55542,
    "osdop_mapext": 0,
    "osdop_sparse_read": 0,
    "osdop_clonerange": 0,
    "osdop_getxattr": 37965,
    "osdop_setxattr": 28228,
    "osdop_cmpxattr": 0,
    "osdop_rmxattr": 0,
    "osdop_resetxattrs": 0,
    "osdop_call": 0,
    "osdop_watch": 0,
    "osdop_notify": 0,
    "osdop_src_cmpxattr": 0,
    "osdop_pgls": 0,
    "osdop_pgls_filter": 0,
    "osdop_other": 278315,
    "linger_active": 0,
    "linger_send": 0,
    "linger_resend": 0,
    "linger_ping": 0,
    "poolop_active": 0,
    "poolop_send": 0,
    "poolop_resend": 0,
    "poolstat_active": 0,
    "poolstat_send": 0,
    "poolstat_resend": 0,
    "statfs_active": 0,
    "statfs_send": 0,
    "statfs_resend": 0,
    "command_active": 0,
    "command_send": 0,
    "command_resend": 0,
    "map_epoch": 0,
    "map_full": 0,
    "map_inc": 0,
    "osd_sessions": 33,
    "osd_session_open": 33,
    "osd_session_close": 0,
    "osd_laggy": 0,
    "omap_wr": 73583,
    "omap_rd": 99506,
    "omap_del": 20167
  },
  "purge_queue": {
    "pq_executing_ops": 0,
    "pq_executing": 0,
    "pq_executed": 49638
  },
  "throttle-msgr_dispatch_throttler-mds": {
    "val": 1054,
    "max": 104857600,
    "get_started": 0,
    "get": 4074846,
    "get_sum": 62878031734,
    "get_or_fail_fail": 0,
    "get_or_fail_success": 4074846,
    "take": 0,
    "take_sum": 0,
    "put": 4074840,
    "put_sum": 62878030680,
    "wait": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    }
  },
  "throttle-objecter_bytes": {
    "val": 332944,
    "max": 104857600,
    "get_started": 0,
    "get": 0,
    "get_sum": 0,
    "get_or_fail_fail": 0,
    "get_or_fail_success": 0,
    "take": 1490006,
    "take_sum": 4341973304,
    "put": 1142949,
    "put_sum": 4341640360,
    "wait": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    }
  },
  "throttle-objecter_ops": {
    "val": 72,
    "max": 1024,
    "get_started": 0,
    "get": 0,
    "get_sum": 0,
    "get_or_fail_fail": 0,
    "get_or_fail_success": 0,
    "take": 1490006,
    "take_sum": 1490006,
    "put": 1489934,
    "put_sum": 1489934,
    "wait": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    }
  },
  "throttle-write_buf_throttle": {
    "val": 0,
    "max": 3758096384,
    "get_started": 0,
    "get": 49638,
    "get_sum": 5013438,
    "get_or_fail_fail": 0,
    "get_or_fail_success": 49638,
    "take": 0,
    "take_sum": 0,
    "put": 1066,
    "put_sum": 5013438,
    "wait": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    }
  },
  "throttle-write_buf_throttle-0x2c2df40": {
    "val": 0,
    "max": 3758096384,
    "get_started": 0,
    "get": 1811605,
    "get_sum": 3803177386,
    "get_or_fail_fail": 0,
    "get_or_fail_success": 1811605,
    "take": 0,
    "take_sum": 0,
    "put": 1041720,
    "put_sum": 3803177386,
    "wait": {
      "avgcount": 0,
      "sum": 0,
      "avgtime": 0
    }
  }
}
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com



[Index of Archives]     [Information on CEPH]     [Linux Filesystem Development]     [Ceph Development]     [Ceph Large]     [Ceph Dev]     [Linux USB Development]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [xfs]


  Powered by Linux