I'm just seeing StupidAllocator::_aligned_len and btree::btree_iterator<btree::btree_node<btree::btree_map_params<unsigned long, unsigned long, std::less<unsigned long>, mempoo on 1 osd, both 10%. here the dump_mempools { "mempool": { "by_pool": { "bloom_filter": { "items": 0, "bytes": 0 }, "bluestore_alloc": { "items": 210243456, "bytes": 210243456 }, "bluestore_cache_data": { "items": 54, "bytes": 643072 }, "bluestore_cache_onode": { "items": 105637, "bytes": 70988064 }, "bluestore_cache_other": { "items": 48661920, "bytes": 1539544228 }, "bluestore_fsck": { "items": 0, "bytes": 0 }, "bluestore_txc": { "items": 12, "bytes": 8928 }, "bluestore_writing_deferred": { "items": 406, "bytes": 4792868 }, "bluestore_writing": { "items": 66, "bytes": 1085440 }, "bluefs": { "items": 1882, "bytes": 93600 }, "buffer_anon": { "items": 138986, "bytes": 24983701 }, "buffer_meta": { "items": 544, "bytes": 34816 }, "osd": { "items": 243, "bytes": 3089016 }, "osd_mapbl": { "items": 36, "bytes": 179308 }, "osd_pglog": { "items": 952564, "bytes": 372459684 }, "osdmap": { "items": 3639, "bytes": 224664 }, "osdmap_mapping": { "items": 0, "bytes": 0 }, "pgmap": { "items": 0, "bytes": 0 }, "mds_co": { "items": 0, "bytes": 0 }, "unittest_1": { "items": 0, "bytes": 0 }, "unittest_2": { "items": 0, "bytes": 0 } }, "total": { "items": 260109445, "bytes": 2228370845 } } } and the perf dump root@ceph5-2:~# ceph daemon osd.4 perf dump { "AsyncMessenger::Worker-0": { "msgr_recv_messages": 22948570, "msgr_send_messages": 22561570, "msgr_recv_bytes": 333085080271, "msgr_send_bytes": 261798871204, "msgr_created_connections": 6152, "msgr_active_connections": 2701, "msgr_running_total_time": 1055.197867330, "msgr_running_send_time": 352.764480121, "msgr_running_recv_time": 499.206831955, "msgr_running_fast_dispatch_time": 130.982201607 }, "AsyncMessenger::Worker-1": { "msgr_recv_messages": 18801593, "msgr_send_messages": 18430264, "msgr_recv_bytes": 306871760934, "msgr_send_bytes": 192789048666, "msgr_created_connections": 5773, "msgr_active_connections": 2721, "msgr_running_total_time": 816.821076305, "msgr_running_send_time": 261.353228926, "msgr_running_recv_time": 394.035587911, "msgr_running_fast_dispatch_time": 104.012155720 }, "AsyncMessenger::Worker-2": { "msgr_recv_messages": 18463400, "msgr_send_messages": 18105856, "msgr_recv_bytes": 187425453590, "msgr_send_bytes": 220735102555, "msgr_created_connections": 5897, "msgr_active_connections": 2605, "msgr_running_total_time": 807.186854324, "msgr_running_send_time": 296.834435839, "msgr_running_recv_time": 351.364389691, "msgr_running_fast_dispatch_time": 101.215776792 }, "bluefs": { "gift_bytes": 0, "reclaim_bytes": 0, "db_total_bytes": 256050724864, "db_used_bytes": 12413042688, "wal_total_bytes": 0, "wal_used_bytes": 0, "slow_total_bytes": 0, "slow_used_bytes": 0, "num_files": 209, "log_bytes": 10383360, "log_compactions": 14, "logged_bytes": 336498688, "files_written_wal": 2, "files_written_sst": 4499, "bytes_written_wal": 417989099783, "bytes_written_sst": 213188750209 }, "bluestore": { "kv_flush_lat": { "avgcount": 26371957, "sum": 26.734038497, "avgtime": 0.000001013 }, "kv_commit_lat": { "avgcount": 26371957, "sum": 3397.491150603, "avgtime": 0.000128829 }, "kv_lat": { "avgcount": 26371957, "sum": 3424.225189100, "avgtime": 0.000129843 }, "state_prepare_lat": { "avgcount": 30484924, "sum": 3689.542105337, "avgtime": 0.000121028 }, "state_aio_wait_lat": { "avgcount": 30484924, "sum": 509.864546111, "avgtime": 0.000016725 }, "state_io_done_lat": { "avgcount": 30484924, "sum": 24.534052953, "avgtime": 0.000000804 }, "state_kv_queued_lat": { "avgcount": 30484924, "sum": 3488.338424238, "avgtime": 0.000114428 }, "state_kv_commiting_lat": { "avgcount": 30484924, "sum": 5660.437003432, "avgtime": 0.000185679 }, "state_kv_done_lat": { "avgcount": 30484924, "sum": 7.763511500, "avgtime": 0.000000254 }, "state_deferred_queued_lat": { "avgcount": 26346134, "sum": 666071.296856696, "avgtime": 0.025281557 }, "state_deferred_aio_wait_lat": { "avgcount": 26346134, "sum": 1755.660547071, "avgtime": 0.000066638 }, "state_deferred_cleanup_lat": { "avgcount": 26346134, "sum": 185465.151653703, "avgtime": 0.007039558 }, "state_finishing_lat": { "avgcount": 30484920, "sum": 3.046847481, "avgtime": 0.000000099 }, "state_done_lat": { "avgcount": 30484920, "sum": 13193.362685280, "avgtime": 0.000432783 }, "throttle_lat": { "avgcount": 30484924, "sum": 14.634269979, "avgtime": 0.000000480 }, "submit_lat": { "avgcount": 30484924, "sum": 3873.883076148, "avgtime": 0.000127075 }, "commit_lat": { "avgcount": 30484924, "sum": 13376.492317331, "avgtime": 0.000438790 }, "read_lat": { "avgcount": 5873923, "sum": 1817.167582057, "avgtime": 0.000309361 }, "read_onode_meta_lat": { "avgcount": 19608201, "sum": 146.770464482, "avgtime": 0.000007485 }, "read_wait_aio_lat": { "avgcount": 13734278, "sum": 2532.578077242, "avgtime": 0.000184398 }, "compress_lat": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "decompress_lat": { "avgcount": 1346945, "sum": 26.227575896, "avgtime": 0.000019471 }, "csum_lat": { "avgcount": 28020392, "sum": 149.587819041, "avgtime": 0.000005338 }, "compress_success_count": 0, "compress_rejected_count": 0, "write_pad_bytes": 352923605, "deferred_write_ops": 24373340, "deferred_write_bytes": 216791842816, "write_penalty_read_ops": 8062366, "bluestore_allocated": 3765566013440, "bluestore_stored": 4186255221852, "bluestore_compressed": 39981379040, "bluestore_compressed_allocated": 73748348928, "bluestore_compressed_original": 165041381376, "bluestore_onodes": 104232, "bluestore_onode_hits": 71206874, "bluestore_onode_misses": 1217914, "bluestore_onode_shard_hits": 260183292, "bluestore_onode_shard_misses": 22851573, "bluestore_extents": 3394513, "bluestore_blobs": 2773587, "bluestore_buffers": 0, "bluestore_buffer_bytes": 0, "bluestore_buffer_hit_bytes": 62026011221, "bluestore_buffer_miss_bytes": 995233669922, "bluestore_write_big": 5648815, "bluestore_write_big_bytes": 552502214656, "bluestore_write_big_blobs": 12440992, "bluestore_write_small": 35883770, "bluestore_write_small_bytes": 223436965719, "bluestore_write_small_unused": 408125, "bluestore_write_small_deferred": 34961455, "bluestore_write_small_pre_read": 34961455, "bluestore_write_small_new": 514190, "bluestore_txc": 30484924, "bluestore_onode_reshard": 5144189, "bluestore_blob_split": 60104, "bluestore_extent_compress": 53347252, "bluestore_gc_merged": 21142528, "bluestore_read_eio": 0, "bluestore_fragmentation_micros": 67 }, "finisher-defered_finisher": { "queue_len": 0, "complete_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "finisher-finisher-0": { "queue_len": 0, "complete_latency": { "avgcount": 26625163, "sum": 1057.506990951, "avgtime": 0.000039718 } }, "finisher-objecter-finisher-0": { "queue_len": 0, "complete_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.0::sdata_wait_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.0::shard_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.1::sdata_wait_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.1::shard_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.2::sdata_wait_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.2::shard_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.3::sdata_wait_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.3::shard_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.4::sdata_wait_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.4::shard_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.5::sdata_wait_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.5::shard_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.6::sdata_wait_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.6::shard_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.7::sdata_wait_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "mutex-OSDShard.7::shard_lock": { "wait": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "objecter": { "op_active": 0, "op_laggy": 0, "op_send": 0, "op_send_bytes": 0, "op_resend": 0, "op_reply": 0, "op": 0, "op_r": 0, "op_w": 0, "op_rmw": 0, "op_pg": 0, "osdop_stat": 0, "osdop_create": 0, "osdop_read": 0, "osdop_write": 0, "osdop_writefull": 0, "osdop_writesame": 0, "osdop_append": 0, "osdop_zero": 0, "osdop_truncate": 0, "osdop_delete": 0, "osdop_mapext": 0, "osdop_sparse_read": 0, "osdop_clonerange": 0, "osdop_getxattr": 0, "osdop_setxattr": 0, "osdop_cmpxattr": 0, "osdop_rmxattr": 0, "osdop_resetxattrs": 0, "osdop_tmap_up": 0, "osdop_tmap_put": 0, "osdop_tmap_get": 0, "osdop_call": 0, "osdop_watch": 0, "osdop_notify": 0, "osdop_src_cmpxattr": 0, "osdop_pgls": 0, "osdop_pgls_filter": 0, "osdop_other": 0, "linger_active": 0, "linger_send": 0, "linger_resend": 0, "linger_ping": 0, "poolop_active": 0, "poolop_send": 0, "poolop_resend": 0, "poolstat_active": 0, "poolstat_send": 0, "poolstat_resend": 0, "statfs_active": 0, "statfs_send": 0, "statfs_resend": 0, "command_active": 0, "command_send": 0, "command_resend": 0, "map_epoch": 105913, "map_full": 0, "map_inc": 828, "osd_sessions": 0, "osd_session_open": 0, "osd_session_close": 0, "osd_laggy": 0, "omap_wr": 0, "omap_rd": 0, "omap_del": 0 }, "osd": { "op_wip": 0, "op": 16758102, "op_in_bytes": 238398820586, "op_out_bytes": 165484999463, "op_latency": { "avgcount": 16758102, "sum": 38242.481640842, "avgtime": 0.002282029 }, "op_process_latency": { "avgcount": 16758102, "sum": 28644.906310687, "avgtime": 0.001709316 }, "op_prepare_latency": { "avgcount": 16761367, "sum": 3489.856599934, "avgtime": 0.000208208 }, "op_r": 6188565, "op_r_out_bytes": 165484999463, "op_r_latency": { "avgcount": 6188565, "sum": 4507.365756792, "avgtime": 0.000728337 }, "op_r_process_latency": { "avgcount": 6188565, "sum": 942.363063429, "avgtime": 0.000152274 }, "op_r_prepare_latency": { "avgcount": 6188644, "sum": 982.866710389, "avgtime": 0.000158817 }, "op_w": 10546037, "op_w_in_bytes": 238334329494, "op_w_latency": { "avgcount": 10546037, "sum": 33160.719998316, "avgtime": 0.003144377 }, "op_w_process_latency": { "avgcount": 10546037, "sum": 27668.702029030, "avgtime": 0.002623611 }, "op_w_prepare_latency": { "avgcount": 10548652, "sum": 2499.688609173, "avgtime": 0.000236967 }, "op_rw": 23500, "op_rw_in_bytes": 64491092, "op_rw_out_bytes": 0, "op_rw_latency": { "avgcount": 23500, "sum": 574.395885734, "avgtime": 0.024442378 }, "op_rw_process_latency": { "avgcount": 23500, "sum": 33.841218228, "avgtime": 0.001440051 }, "op_rw_prepare_latency": { "avgcount": 24071, "sum": 7.301280372, "avgtime": 0.000303322 }, "op_before_queue_op_lat": { "avgcount": 57892986, "sum": 1502.117718889, "avgtime": 0.000025946 }, "op_before_dequeue_op_lat": { "avgcount": 58091683, "sum": 45194.453254037, "avgtime": 0.000777984 }, "subop": 19784758, "subop_in_bytes": 547174969754, "subop_latency": { "avgcount": 19784758, "sum": 13019.714424060, "avgtime": 0.000658067 }, "subop_w": 19784758, "subop_w_in_bytes": 547174969754, "subop_w_latency": { "avgcount": 19784758, "sum": 13019.714424060, "avgtime": 0.000658067 }, "subop_pull": 0, "subop_pull_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "subop_push": 0, "subop_push_in_bytes": 0, "subop_push_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "pull": 0, "push": 2003, "push_out_bytes": 5560009728, "recovery_ops": 1940, "loadavg": 118, "buffer_bytes": 0, "history_alloc_Mbytes": 0, "history_alloc_num": 0, "cached_crc": 0, "cached_crc_adjusted": 0, "missed_crc": 0, "numpg": 243, "numpg_primary": 82, "numpg_replica": 161, "numpg_stray": 0, "numpg_removing": 0, "heartbeat_to_peers": 10, "map_messages": 7013, "map_message_epochs": 7143, "map_message_epoch_dups": 6315, "messages_delayed_for_map": 0, "osd_map_cache_hit": 203309, "osd_map_cache_miss": 33, "osd_map_cache_miss_low": 0, "osd_map_cache_miss_low_avg": { "avgcount": 0, "sum": 0 }, "osd_map_bl_cache_hit": 47012, "osd_map_bl_cache_miss": 1681, "stat_bytes": 6401248198656, "stat_bytes_used": 3777979072512, "stat_bytes_avail": 2623269126144, "copyfrom": 0, "tier_promote": 0, "tier_flush": 0, "tier_flush_fail": 0, "tier_try_flush": 0, "tier_try_flush_fail": 0, "tier_evict": 0, "tier_whiteout": 1631, "tier_dirty": 22360, "tier_clean": 0, "tier_delay": 0, "tier_proxy_read": 0, "tier_proxy_write": 0, "agent_wake": 0, "agent_skip": 0, "agent_flush": 0, "agent_evict": 0, "object_ctx_cache_hit": 16311156, "object_ctx_cache_total": 17426393, "op_cache_hit": 0, "osd_tier_flush_lat": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "osd_tier_promote_lat": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "osd_tier_r_lat": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "osd_pg_info": 30483113, "osd_pg_fastinfo": 29619885, "osd_pg_biginfo": 81703 }, "recoverystate_perf": { "initial_latency": { "avgcount": 243, "sum": 6.869296500, "avgtime": 0.028268709 }, "started_latency": { "avgcount": 1125, "sum": 13551384.917335850, "avgtime": 12045.675482076 }, "reset_latency": { "avgcount": 1368, "sum": 1101.727799040, "avgtime": 0.805356578 }, "start_latency": { "avgcount": 1368, "sum": 0.002014799, "avgtime": 0.000001472 }, "primary_latency": { "avgcount": 507, "sum": 4575560.638823428, "avgtime": 9024.774435549 }, "peering_latency": { "avgcount": 550, "sum": 499.372283616, "avgtime": 0.907949606 }, "backfilling_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "waitremotebackfillreserved_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "waitlocalbackfillreserved_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "notbackfilling_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "repnotrecovering_latency": { "avgcount": 1009, "sum": 8975301.082274411, "avgtime": 8895.243887288 }, "repwaitrecoveryreserved_latency": { "avgcount": 420, "sum": 99.846056520, "avgtime": 0.237728706 }, "repwaitbackfillreserved_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "reprecovering_latency": { "avgcount": 420, "sum": 241.682764382, "avgtime": 0.575435153 }, "activating_latency": { "avgcount": 507, "sum": 16.893347339, "avgtime": 0.033320211 }, "waitlocalrecoveryreserved_latency": { "avgcount": 199, "sum": 672.335512769, "avgtime": 3.378570415 }, "waitremoterecoveryreserved_latency": { "avgcount": 199, "sum": 213.536439363, "avgtime": 1.073047433 }, "recovering_latency": { "avgcount": 199, "sum": 79.007696479, "avgtime": 0.397023600 }, "recovered_latency": { "avgcount": 507, "sum": 14.000732748, "avgtime": 0.027614857 }, "clean_latency": { "avgcount": 395, "sum": 4574325.900371083, "avgtime": 11580.571899673 }, "active_latency": { "avgcount": 425, "sum": 4575107.630123680, "avgtime": 10764.959129702 }, "replicaactive_latency": { "avgcount": 589, "sum": 8975184.499049954, "avgtime": 15238.004242869 }, "stray_latency": { "avgcount": 818, "sum": 800.729455666, "avgtime": 0.978886865 }, "getinfo_latency": { "avgcount": 550, "sum": 15.085667048, "avgtime": 0.027428485 }, "getlog_latency": { "avgcount": 546, "sum": 3.482175693, "avgtime": 0.006377611 }, "waitactingchange_latency": { "avgcount": 39, "sum": 35.444551284, "avgtime": 0.908834648 }, "incomplete_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "down_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "getmissing_latency": { "avgcount": 507, "sum": 6.702129624, "avgtime": 0.013219190 }, "waitupthru_latency": { "avgcount": 507, "sum": 474.098261727, "avgtime": 0.935105052 }, "notrecovering_latency": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } }, "rocksdb": { "get": 28320977, "submit_transaction": 30484924, "submit_transaction_sync": 26371957, "get_latency": { "avgcount": 28320977, "sum": 325.900908733, "avgtime": 0.000011507 }, "submit_latency": { "avgcount": 30484924, "sum": 1835.888692371, "avgtime": 0.000060222 }, "submit_sync_latency": { "avgcount": 26371957, "sum": 1431.555230628, "avgtime": 0.000054283 }, "compact": 0, "compact_range": 0, "compact_queue_merge": 0, "compact_queue_len": 0, "rocksdb_write_wal_time": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "rocksdb_write_memtable_time": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "rocksdb_write_delay_time": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 }, "rocksdb_write_pre_and_post_time": { "avgcount": 0, "sum": 0.000000000, "avgtime": 0.000000000 } } } ----- Mail original ----- De: "Igor Fedotov" <ifedotov@xxxxxxx> À: "aderumier" <aderumier@xxxxxxxxx> Cc: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx>, "Mark Nelson" <mnelson@xxxxxxxxxx>, "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> Envoyé: Mardi 5 Février 2019 18:56:51 Objet: Re: ceph osd commit latency increase over time, until restart On 2/4/2019 6:40 PM, Alexandre DERUMIER wrote: >>> but I don't see l_bluestore_fragmentation counter. >>> (but I have bluestore_fragmentation_micros) > ok, this is the same > > b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros", > "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000"); > > > Here a graph on last month, with bluestore_fragmentation_micros and latency, > > http://odisoweb1.odiso.net/latency_vs_fragmentation_micros.png hmm, so fragmentation grows eventually and drops on OSD restarts, isn't it? The same for other OSDs? This proves some issue with the allocator - generally fragmentation might grow but it shouldn't reset on restart. Looks like some intervals aren't properly merged in run-time. On the other side I'm not completely sure that latency degradation is caused by that - fragmentation growth is relatively small - I don't see how this might impact performance that high. Wondering if you have OSD mempool monitoring (dump_mempools command output on admin socket) reports? Do you have any historic data? If not may I have current output and say a couple more samples with 8-12 hours interval? Wrt to backporting bitmap allocator to mimic - we haven't had such plans before that but I'll discuss this at BlueStore meeting shortly. Thanks, Igor > ----- Mail original ----- > De: "Alexandre Derumier" <aderumier@xxxxxxxxx> > À: "Igor Fedotov" <ifedotov@xxxxxxx> > Cc: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx>, "Mark Nelson" <mnelson@xxxxxxxxxx>, "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> > Envoyé: Lundi 4 Février 2019 16:04:38 > Objet: Re: ceph osd commit latency increase over time, until restart > > Thanks Igor, > >>> Could you please collect BlueStore performance counters right after OSD >>> startup and once you get high latency. >>> >>> Specifically 'l_bluestore_fragmentation' parameter is of interest. > I'm already monitoring with > "ceph daemon osd.x perf dump ", (I have 2months history will all counters) > > but I don't see l_bluestore_fragmentation counter. > > (but I have bluestore_fragmentation_micros) > > >>> Also if you're able to rebuild the code I can probably make a simple >>> patch to track latency and some other internal allocator's paramter to >>> make sure it's degraded and learn more details. > Sorry, It's a critical production cluster, I can't test on it :( > But I have a test cluster, maybe I can try to put some load on it, and try to reproduce. > > > >>> More vigorous fix would be to backport bitmap allocator from Nautilus >>> and try the difference... > Any plan to backport it to mimic ? (But I can wait for Nautilus) > perf results of new bitmap allocator seem very promising from what I've seen in PR. > > > > ----- Mail original ----- > De: "Igor Fedotov" <ifedotov@xxxxxxx> > À: "Alexandre Derumier" <aderumier@xxxxxxxxx>, "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx>, "Mark Nelson" <mnelson@xxxxxxxxxx> > Cc: "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> > Envoyé: Lundi 4 Février 2019 15:51:30 > Objet: Re: ceph osd commit latency increase over time, until restart > > Hi Alexandre, > > looks like a bug in StupidAllocator. > > Could you please collect BlueStore performance counters right after OSD > startup and once you get high latency. > > Specifically 'l_bluestore_fragmentation' parameter is of interest. > > Also if you're able to rebuild the code I can probably make a simple > patch to track latency and some other internal allocator's paramter to > make sure it's degraded and learn more details. > > > More vigorous fix would be to backport bitmap allocator from Nautilus > and try the difference... > > > Thanks, > > Igor > > > On 2/4/2019 5:17 PM, Alexandre DERUMIER wrote: >> Hi again, >> >> I speak too fast, the problem has occured again, so it's not tcmalloc cache size related. >> >> >> I have notice something using a simple "perf top", >> >> each time I have this problem (I have seen exactly 4 times the same behaviour), >> >> when latency is bad, perf top give me : >> >> StupidAllocator::_aligned_len >> and >> btree::btree_iterator<btree::btree_node<btree::btree_map_params<unsigned long, unsigned long, std::less<unsigned long>, mempoo >> l::pool_allocator<(mempool::pool_index_t)1, std::pair<unsigned long const, unsigned long> >, 256> >, std::pair<unsigned long const, unsigned long>&, std::pair<unsigned long >> const, unsigned long>*>::increment_slow() >> >> (around 10-20% time for both) >> >> >> when latency is good, I don't see them at all. >> >> >> I have used the Mark wallclock profiler, here the results: >> >> http://odisoweb1.odiso.net/gdbpmp-ok.txt >> >> http://odisoweb1.odiso.net/gdbpmp-bad.txt >> >> >> here an extract of the thread with btree::btree_iterator && StupidAllocator::_aligned_len >> >> >> + 100.00% clone >> + 100.00% start_thread >> + 100.00% ShardedThreadPool::WorkThreadSharded::entry() >> + 100.00% ShardedThreadPool::shardedthreadpool_worker(unsigned int) >> + 100.00% OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*) >> + 70.00% PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&) >> | + 70.00% OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&) >> | + 70.00% PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&) >> | + 68.00% PGBackend::handle_message(boost::intrusive_ptr<OpRequest>) >> | | + 68.00% ReplicatedBackend::_handle_message(boost::intrusive_ptr<OpRequest>) >> | | + 68.00% ReplicatedBackend::do_repop(boost::intrusive_ptr<OpRequest>) >> | | + 67.00% non-virtual thunk to PrimaryLogPG::queue_transactions(std::vector<ObjectStore::Transaction, std::allocator<ObjectStore::Transaction> >&, boost::intrusive_ptr<OpRequest>) >> | | | + 67.00% BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, std::vector<ObjectStore::Transaction, std::allocator<ObjectStore::Transaction> >&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*) >> | | | + 66.00% BlueStore::_txc_add_transaction(BlueStore::TransContext*, ObjectStore::Transaction*) >> | | | | + 66.00% BlueStore::_write(BlueStore::TransContext*, boost::intrusive_ptr<BlueStore::Collection>&, boost::intrusive_ptr<BlueStore::Onode>&, unsigned long, unsigned long, ceph::buffer::list&, unsigned int) >> | | | | + 66.00% BlueStore::_do_write(BlueStore::TransContext*, boost::intrusive_ptr<BlueStore::Collection>&, boost::intrusive_ptr<BlueStore::Onode>, unsigned long, unsigned long, ceph::buffer::list&, unsigned int) >> | | | | + 65.00% BlueStore::_do_alloc_write(BlueStore::TransContext*, boost::intrusive_ptr<BlueStore::Collection>, boost::intrusive_ptr<BlueStore::Onode>, BlueStore::WriteContext*) >> | | | | | + 64.00% StupidAllocator::allocate(unsigned long, unsigned long, unsigned long, long, std::vector<bluestore_pextent_t, mempool::pool_allocator<(mempool::pool_index_t)4, bluestore_pextent_t> >*) >> | | | | | | + 64.00% StupidAllocator::allocate_int(unsigned long, unsigned long, long, unsigned long*, unsigned int*) >> | | | | | | + 34.00% btree::btree_iterator<btree::btree_node<btree::btree_map_params<unsigned long, unsigned long, std::less<unsigned long>, mempool::pool_allocator<(mempool::pool_index_t)1, std::pair<unsigned long const, unsigned long> >, 256> >, std::pair<unsigned long const, unsigned long>&, std::pair<unsigned long const, unsigned long>*>::increment_slow() >> | | | | | | + 26.00% StupidAllocator::_aligned_len(interval_set<unsigned long, btree::btree_map<unsigned long, unsigned long, std::less<unsigned long>, mempool::pool_allocator<(mempool::pool_index_t)1, std::pair<unsigned long const, unsigned long> >, 256> >::iterator, unsigned long) >> >> >> >> ----- Mail original ----- >> De: "Alexandre Derumier" <aderumier@xxxxxxxxx> >> À: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx> >> Cc: "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> >> Envoyé: Lundi 4 Février 2019 09:38:11 >> Objet: Re: ceph osd commit latency increase over time, until restart >> >> Hi, >> >> some news: >> >> I have tried with different transparent hugepage values (madvise, never) : no change >> >> I have tried to increase bluestore_cache_size_ssd to 8G: no change >> >> I have tried to increase TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES to 256mb : it seem to help, after 24h I'm still around 1,5ms. (need to wait some more days to be sure) >> >> >> Note that this behaviour seem to happen really faster (< 2 days) on my big nvme drives (6TB), >> my others clusters user 1,6TB ssd. >> >> Currently I'm using only 1 osd by nvme (I don't have more than 5000iops by osd), but I'll try this week with 2osd by nvme, to see if it's helping. >> >> >> BTW, does somebody have already tested ceph without tcmalloc, with glibc >= 2.26 (which have also thread cache) ? >> >> >> Regards, >> >> Alexandre >> >> >> ----- Mail original ----- >> De: "aderumier" <aderumier@xxxxxxxxx> >> À: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx> >> Cc: "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> >> Envoyé: Mercredi 30 Janvier 2019 19:58:15 >> Objet: Re: ceph osd commit latency increase over time, until restart >> >>>> Thanks. Is there any reason you monitor op_w_latency but not >>>> op_r_latency but instead op_latency? >>>> >>>> Also why do you monitor op_w_process_latency? but not op_r_process_latency? >> I monitor read too. (I have all metrics for osd sockets, and a lot of graphs). >> >> I just don't see latency difference on reads. (or they are very very small vs the write latency increase) >> >> >> >> ----- Mail original ----- >> De: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx> >> À: "aderumier" <aderumier@xxxxxxxxx> >> Cc: "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> >> Envoyé: Mercredi 30 Janvier 2019 19:50:20 >> Objet: Re: ceph osd commit latency increase over time, until restart >> >> Hi, >> >> Am 30.01.19 um 14:59 schrieb Alexandre DERUMIER: >>> Hi Stefan, >>> >>>>> currently i'm in the process of switching back from jemalloc to tcmalloc >>>>> like suggested. This report makes me a little nervous about my change. >>> Well,I'm really not sure that it's a tcmalloc bug. >>> maybe bluestore related (don't have filestore anymore to compare) >>> I need to compare with bigger latencies >>> >>> here an example, when all osd at 20-50ms before restart, then after restart (at 21:15), 1ms >>> http://odisoweb1.odiso.net/latencybad.png >>> >>> I observe the latency in my guest vm too, on disks iowait. >>> >>> http://odisoweb1.odiso.net/latencybadvm.png >>> >>>>> Also i'm currently only monitoring latency for filestore osds. Which >>>>> exact values out of the daemon do you use for bluestore? >>> here my influxdb queries: >>> >>> It take op_latency.sum/op_latency.avgcount on last second. >>> >>> >>> SELECT non_negative_derivative(first("op_latency.sum"), 1s)/non_negative_derivative(first("op_latency.avgcount"),1s) FROM "ceph" WHERE "host" =~ /^([[host]])$/ AND "id" =~ /^([[osd]])$/ AND $timeFilter GROUP BY time($interval), "host", "id" fill(previous) >>> >>> >>> SELECT non_negative_derivative(first("op_w_latency.sum"), 1s)/non_negative_derivative(first("op_w_latency.avgcount"),1s) FROM "ceph" WHERE "host" =~ /^([[host]])$/ AND collection='osd' AND "id" =~ /^([[osd]])$/ AND $timeFilter GROUP BY time($interval), "host", "id" fill(previous) >>> >>> >>> SELECT non_negative_derivative(first("op_w_process_latency.sum"), 1s)/non_negative_derivative(first("op_w_process_latency.avgcount"),1s) FROM "ceph" WHERE "host" =~ /^([[host]])$/ AND collection='osd' AND "id" =~ /^([[osd]])$/ AND $timeFilter GROUP BY time($interval), "host", "id" fill(previous) >> Thanks. Is there any reason you monitor op_w_latency but not >> op_r_latency but instead op_latency? >> >> Also why do you monitor op_w_process_latency? but not op_r_process_latency? >> >> greets, >> Stefan >> >>> >>> >>> >>> ----- Mail original ----- >>> De: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx> >>> À: "aderumier" <aderumier@xxxxxxxxx>, "Sage Weil" <sage@xxxxxxxxxxxx> >>> Cc: "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> >>> Envoyé: Mercredi 30 Janvier 2019 08:45:33 >>> Objet: Re: ceph osd commit latency increase over time, until restart >>> >>> Hi, >>> >>> Am 30.01.19 um 08:33 schrieb Alexandre DERUMIER: >>>> Hi, >>>> >>>> here some new results, >>>> different osd/ different cluster >>>> >>>> before osd restart latency was between 2-5ms >>>> after osd restart is around 1-1.5ms >>>> >>>> http://odisoweb1.odiso.net/cephperf2/bad.txt (2-5ms) >>>> http://odisoweb1.odiso.net/cephperf2/ok.txt (1-1.5ms) >>>> http://odisoweb1.odiso.net/cephperf2/diff.txt >>>> >>>> From what I see in diff, the biggest difference is in tcmalloc, but maybe I'm wrong. >>>> (I'm using tcmalloc 2.5-2.2) >>> currently i'm in the process of switching back from jemalloc to tcmalloc >>> like suggested. This report makes me a little nervous about my change. >>> >>> Also i'm currently only monitoring latency for filestore osds. Which >>> exact values out of the daemon do you use for bluestore? >>> >>> I would like to check if i see the same behaviour. >>> >>> Greets, >>> Stefan >>> >>>> ----- Mail original ----- >>>> De: "Sage Weil" <sage@xxxxxxxxxxxx> >>>> À: "aderumier" <aderumier@xxxxxxxxx> >>>> Cc: "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> >>>> Envoyé: Vendredi 25 Janvier 2019 10:49:02 >>>> Objet: Re: ceph osd commit latency increase over time, until restart >>>> >>>> Can you capture a perf top or perf record to see where teh CPU time is >>>> going on one of the OSDs wth a high latency? >>>> >>>> Thanks! >>>> sage >>>> >>>> >>>> On Fri, 25 Jan 2019, Alexandre DERUMIER wrote: >>>> >>>>> Hi, >>>>> >>>>> I have a strange behaviour of my osd, on multiple clusters, >>>>> >>>>> All cluster are running mimic 13.2.1,bluestore, with ssd or nvme drivers, >>>>> workload is rbd only, with qemu-kvm vms running with librbd + snapshot/rbd export-diff/snapshotdelete each day for backup >>>>> >>>>> When the osd are refreshly started, the commit latency is between 0,5-1ms. >>>>> >>>>> But overtime, this latency increase slowly (maybe around 1ms by day), until reaching crazy >>>>> values like 20-200ms. >>>>> >>>>> Some example graphs: >>>>> >>>>> http://odisoweb1.odiso.net/osdlatency1.png >>>>> http://odisoweb1.odiso.net/osdlatency2.png >>>>> >>>>> All osds have this behaviour, in all clusters. >>>>> >>>>> The latency of physical disks is ok. (Clusters are far to be full loaded) >>>>> >>>>> And if I restart the osd, the latency come back to 0,5-1ms. >>>>> >>>>> That's remember me old tcmalloc bug, but maybe could it be a bluestore memory bug ? >>>>> >>>>> Any Hints for counters/logs to check ? >>>>> >>>>> >>>>> Regards, >>>>> >>>>> Alexandre >>>>> >>>>> >>>> _______________________________________________ >>>> ceph-users mailing list >>>> ceph-users@xxxxxxxxxxxxxx >>>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com >>>> _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com