Re: ceph osd commit latency increase over time, until restart

I'm just seeing

StupidAllocator::_aligned_len
and
btree::btree_iterator<btree::btree_node<btree::btree_map_params<unsigned long, unsigned long, std::less<unsigned long>, mempoo

on 1 OSD, both at around 10%.

Here is the dump_mempools output:

{
    "mempool": {
        "by_pool": {
            "bloom_filter": {
                "items": 0,
                "bytes": 0
            },
            "bluestore_alloc": {
                "items": 210243456,
                "bytes": 210243456
            },
            "bluestore_cache_data": {
                "items": 54,
                "bytes": 643072
            },
            "bluestore_cache_onode": {
                "items": 105637,
                "bytes": 70988064
            },
            "bluestore_cache_other": {
                "items": 48661920,
                "bytes": 1539544228
            },
            "bluestore_fsck": {
                "items": 0,
                "bytes": 0
            },
            "bluestore_txc": {
                "items": 12,
                "bytes": 8928
            },
            "bluestore_writing_deferred": {
                "items": 406,
                "bytes": 4792868
            },
            "bluestore_writing": {
                "items": 66,
                "bytes": 1085440
            },
            "bluefs": {
                "items": 1882,
                "bytes": 93600
            },
            "buffer_anon": {
                "items": 138986,
                "bytes": 24983701
            },
          "buffer_meta": {
                "items": 544,
                "bytes": 34816
            },
            "osd": {
                "items": 243,
                "bytes": 3089016
            },
            "osd_mapbl": {
                "items": 36,
                "bytes": 179308
            },
            "osd_pglog": {
                "items": 952564,
                "bytes": 372459684
            },
            "osdmap": {
                "items": 3639,
                "bytes": 224664
            },
            "osdmap_mapping": {
                "items": 0,
                "bytes": 0
            },
            "pgmap": {
                "items": 0,
                "bytes": 0
            },
            "mds_co": {
                "items": 0,
                "bytes": 0
            },
            "unittest_1": {
                "items": 0,
                "bytes": 0
            },
            "unittest_2": {
                "items": 0,
                "bytes": 0
            }
        },
        "total": {
            "items": 260109445,
            "bytes": 2228370845
        }
    }
}


And here is the perf dump:

root@ceph5-2:~# ceph daemon osd.4 perf dump
{
    "AsyncMessenger::Worker-0": {
        "msgr_recv_messages": 22948570,
        "msgr_send_messages": 22561570,
        "msgr_recv_bytes": 333085080271,
        "msgr_send_bytes": 261798871204,
        "msgr_created_connections": 6152,
        "msgr_active_connections": 2701,
        "msgr_running_total_time": 1055.197867330,
        "msgr_running_send_time": 352.764480121,
        "msgr_running_recv_time": 499.206831955,
        "msgr_running_fast_dispatch_time": 130.982201607
    },
    "AsyncMessenger::Worker-1": {
        "msgr_recv_messages": 18801593,
        "msgr_send_messages": 18430264,
        "msgr_recv_bytes": 306871760934,
        "msgr_send_bytes": 192789048666,
        "msgr_created_connections": 5773,
        "msgr_active_connections": 2721,
        "msgr_running_total_time": 816.821076305,
        "msgr_running_send_time": 261.353228926,
        "msgr_running_recv_time": 394.035587911,
        "msgr_running_fast_dispatch_time": 104.012155720
    },
    "AsyncMessenger::Worker-2": {
        "msgr_recv_messages": 18463400,
        "msgr_send_messages": 18105856,
        "msgr_recv_bytes": 187425453590,
        "msgr_send_bytes": 220735102555,
        "msgr_created_connections": 5897,
        "msgr_active_connections": 2605,
        "msgr_running_total_time": 807.186854324,
        "msgr_running_send_time": 296.834435839,
        "msgr_running_recv_time": 351.364389691,
        "msgr_running_fast_dispatch_time": 101.215776792
    },
    "bluefs": {
        "gift_bytes": 0,
        "reclaim_bytes": 0,
        "db_total_bytes": 256050724864,
        "db_used_bytes": 12413042688,
        "wal_total_bytes": 0,
        "wal_used_bytes": 0,
        "slow_total_bytes": 0,
        "slow_used_bytes": 0,
        "num_files": 209,
        "log_bytes": 10383360,
        "log_compactions": 14,
        "logged_bytes": 336498688,
        "files_written_wal": 2,
        "files_written_sst": 4499,
        "bytes_written_wal": 417989099783,
        "bytes_written_sst": 213188750209
    },
    "bluestore": {
        "kv_flush_lat": {
            "avgcount": 26371957,
            "sum": 26.734038497,
            "avgtime": 0.000001013
        },
        "kv_commit_lat": {
            "avgcount": 26371957,
            "sum": 3397.491150603,
            "avgtime": 0.000128829
        },
        "kv_lat": {
            "avgcount": 26371957,
            "sum": 3424.225189100,
            "avgtime": 0.000129843
        },
        "state_prepare_lat": {
            "avgcount": 30484924,
            "sum": 3689.542105337,
            "avgtime": 0.000121028
        },
        "state_aio_wait_lat": {
            "avgcount": 30484924,
            "sum": 509.864546111,
            "avgtime": 0.000016725
        },
        "state_io_done_lat": {
            "avgcount": 30484924,
            "sum": 24.534052953,
            "avgtime": 0.000000804
        },
        "state_kv_queued_lat": {
            "avgcount": 30484924,
            "sum": 3488.338424238,
            "avgtime": 0.000114428
        },
        "state_kv_commiting_lat": {
            "avgcount": 30484924,
            "sum": 5660.437003432,
            "avgtime": 0.000185679
        },
        "state_kv_done_lat": {
            "avgcount": 30484924,
            "sum": 7.763511500,
            "avgtime": 0.000000254
        },
        "state_deferred_queued_lat": {
            "avgcount": 26346134,
            "sum": 666071.296856696,
            "avgtime": 0.025281557
        },
        "state_deferred_aio_wait_lat": {
            "avgcount": 26346134,
            "sum": 1755.660547071,
            "avgtime": 0.000066638
        },
        "state_deferred_cleanup_lat": {
            "avgcount": 26346134,
            "sum": 185465.151653703,
            "avgtime": 0.007039558
        },
        "state_finishing_lat": {
            "avgcount": 30484920,
            "sum": 3.046847481,
            "avgtime": 0.000000099
        },
        "state_done_lat": {
            "avgcount": 30484920,
            "sum": 13193.362685280,
            "avgtime": 0.000432783
        },
        "throttle_lat": {
            "avgcount": 30484924,
            "sum": 14.634269979,
            "avgtime": 0.000000480
        },
        "submit_lat": {
            "avgcount": 30484924,
            "sum": 3873.883076148,
            "avgtime": 0.000127075
        },
        "commit_lat": {
            "avgcount": 30484924,
            "sum": 13376.492317331,
            "avgtime": 0.000438790
        },
        "read_lat": {
            "avgcount": 5873923,
            "sum": 1817.167582057,
            "avgtime": 0.000309361
        },
        "read_onode_meta_lat": {
            "avgcount": 19608201,
            "sum": 146.770464482,
            "avgtime": 0.000007485
        },
        "read_wait_aio_lat": {
            "avgcount": 13734278,
            "sum": 2532.578077242,
            "avgtime": 0.000184398
        },
        "compress_lat": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "decompress_lat": {
            "avgcount": 1346945,
            "sum": 26.227575896,
            "avgtime": 0.000019471
        },
        "csum_lat": {
            "avgcount": 28020392,
            "sum": 149.587819041,
            "avgtime": 0.000005338
        },
        "compress_success_count": 0,
        "compress_rejected_count": 0,
        "write_pad_bytes": 352923605,
        "deferred_write_ops": 24373340,
        "deferred_write_bytes": 216791842816,
        "write_penalty_read_ops": 8062366,
        "bluestore_allocated": 3765566013440,
        "bluestore_stored": 4186255221852,
        "bluestore_compressed": 39981379040,
        "bluestore_compressed_allocated": 73748348928,
        "bluestore_compressed_original": 165041381376,
        "bluestore_onodes": 104232,
        "bluestore_onode_hits": 71206874,
        "bluestore_onode_misses": 1217914,
        "bluestore_onode_shard_hits": 260183292,
        "bluestore_onode_shard_misses": 22851573,
        "bluestore_extents": 3394513,
        "bluestore_blobs": 2773587,
        "bluestore_buffers": 0,
        "bluestore_buffer_bytes": 0,
        "bluestore_buffer_hit_bytes": 62026011221,
        "bluestore_buffer_miss_bytes": 995233669922,
        "bluestore_write_big": 5648815,
        "bluestore_write_big_bytes": 552502214656,
        "bluestore_write_big_blobs": 12440992,
        "bluestore_write_small": 35883770,
        "bluestore_write_small_bytes": 223436965719,
        "bluestore_write_small_unused": 408125,
        "bluestore_write_small_deferred": 34961455,
        "bluestore_write_small_pre_read": 34961455,
        "bluestore_write_small_new": 514190,
        "bluestore_txc": 30484924,
        "bluestore_onode_reshard": 5144189,
        "bluestore_blob_split": 60104,
        "bluestore_extent_compress": 53347252,
        "bluestore_gc_merged": 21142528,
        "bluestore_read_eio": 0,
        "bluestore_fragmentation_micros": 67
    },
    "finisher-defered_finisher": {
        "queue_len": 0,
        "complete_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "finisher-finisher-0": {
        "queue_len": 0,
        "complete_latency": {
            "avgcount": 26625163,
            "sum": 1057.506990951,
            "avgtime": 0.000039718
        }
    },
    "finisher-objecter-finisher-0": {
        "queue_len": 0,
        "complete_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.0::sdata_wait_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.0::shard_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.1::sdata_wait_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.1::shard_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.2::sdata_wait_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.2::shard_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.3::sdata_wait_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.3::shard_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.4::sdata_wait_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.4::shard_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.5::sdata_wait_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.5::shard_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.6::sdata_wait_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.6::shard_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.7::sdata_wait_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mutex-OSDShard.7::shard_lock": {
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "objecter": {
        "op_active": 0,
        "op_laggy": 0,
        "op_send": 0,
        "op_send_bytes": 0,
        "op_resend": 0,
        "op_reply": 0,
        "op": 0,
        "op_r": 0,
        "op_w": 0,
        "op_rmw": 0,
        "op_pg": 0,
        "osdop_stat": 0,
        "osdop_create": 0,
        "osdop_read": 0,
        "osdop_write": 0,
        "osdop_writefull": 0,
        "osdop_writesame": 0,
        "osdop_append": 0,
        "osdop_zero": 0,
        "osdop_truncate": 0,
        "osdop_delete": 0,
        "osdop_mapext": 0,
        "osdop_sparse_read": 0,
        "osdop_clonerange": 0,
        "osdop_getxattr": 0,
        "osdop_setxattr": 0,
        "osdop_cmpxattr": 0,
        "osdop_rmxattr": 0,
        "osdop_resetxattrs": 0,
        "osdop_tmap_up": 0,
        "osdop_tmap_put": 0,
        "osdop_tmap_get": 0,
        "osdop_call": 0,
        "osdop_watch": 0,
        "osdop_notify": 0,
        "osdop_src_cmpxattr": 0,
        "osdop_pgls": 0,
        "osdop_pgls_filter": 0,
        "osdop_other": 0,
        "linger_active": 0,
        "linger_send": 0,
        "linger_resend": 0,
        "linger_ping": 0,
        "poolop_active": 0,
        "poolop_send": 0,
        "poolop_resend": 0,
        "poolstat_active": 0,
        "poolstat_send": 0,
        "poolstat_resend": 0,
        "statfs_active": 0,
        "statfs_send": 0,
        "statfs_resend": 0,
        "command_active": 0,
        "command_send": 0,
        "command_resend": 0,
        "map_epoch": 105913,
        "map_full": 0,
        "map_inc": 828,
        "osd_sessions": 0,
        "osd_session_open": 0,
        "osd_session_close": 0,
        "osd_laggy": 0,
        "omap_wr": 0,
        "omap_rd": 0,
        "omap_del": 0
    },
    "osd": {
        "op_wip": 0,
        "op": 16758102,
        "op_in_bytes": 238398820586,
        "op_out_bytes": 165484999463,
        "op_latency": {
            "avgcount": 16758102,
            "sum": 38242.481640842,
            "avgtime": 0.002282029
        },
        "op_process_latency": {
            "avgcount": 16758102,
            "sum": 28644.906310687,
            "avgtime": 0.001709316
        },
        "op_prepare_latency": {
            "avgcount": 16761367,
            "sum": 3489.856599934,
            "avgtime": 0.000208208
        },
        "op_r": 6188565,
        "op_r_out_bytes": 165484999463,
        "op_r_latency": {
            "avgcount": 6188565,
            "sum": 4507.365756792,
            "avgtime": 0.000728337
        },
        "op_r_process_latency": {
            "avgcount": 6188565,
            "sum": 942.363063429,
            "avgtime": 0.000152274
        },
        "op_r_prepare_latency": {
            "avgcount": 6188644,
            "sum": 982.866710389,
            "avgtime": 0.000158817
        },
        "op_w": 10546037,
        "op_w_in_bytes": 238334329494,
        "op_w_latency": {
            "avgcount": 10546037,
            "sum": 33160.719998316,
            "avgtime": 0.003144377
        },
        "op_w_process_latency": {
            "avgcount": 10546037,
            "sum": 27668.702029030,
            "avgtime": 0.002623611
        },
        "op_w_prepare_latency": {
            "avgcount": 10548652,
            "sum": 2499.688609173,
            "avgtime": 0.000236967
        },
        "op_rw": 23500,
        "op_rw_in_bytes": 64491092,
        "op_rw_out_bytes": 0,
        "op_rw_latency": {
            "avgcount": 23500,
            "sum": 574.395885734,
            "avgtime": 0.024442378
        },
        "op_rw_process_latency": {
            "avgcount": 23500,
            "sum": 33.841218228,
            "avgtime": 0.001440051
        },
        "op_rw_prepare_latency": {
            "avgcount": 24071,
            "sum": 7.301280372,
            "avgtime": 0.000303322
        },
        "op_before_queue_op_lat": {
            "avgcount": 57892986,
            "sum": 1502.117718889,
            "avgtime": 0.000025946
        },
        "op_before_dequeue_op_lat": {
            "avgcount": 58091683,
            "sum": 45194.453254037,
            "avgtime": 0.000777984
        },
        "subop": 19784758,
        "subop_in_bytes": 547174969754,
        "subop_latency": {
            "avgcount": 19784758,
            "sum": 13019.714424060,
            "avgtime": 0.000658067
        },
        "subop_w": 19784758,
        "subop_w_in_bytes": 547174969754,
        "subop_w_latency": {
            "avgcount": 19784758,
            "sum": 13019.714424060,
            "avgtime": 0.000658067
        },
        "subop_pull": 0,
        "subop_pull_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "subop_push": 0,
        "subop_push_in_bytes": 0,
        "subop_push_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "pull": 0,
        "push": 2003,
        "push_out_bytes": 5560009728,
        "recovery_ops": 1940,
        "loadavg": 118,
        "buffer_bytes": 0,
        "history_alloc_Mbytes": 0,
        "history_alloc_num": 0,
        "cached_crc": 0,
        "cached_crc_adjusted": 0,
        "missed_crc": 0,
        "numpg": 243,
        "numpg_primary": 82,
        "numpg_replica": 161,
        "numpg_stray": 0,
        "numpg_removing": 0,
        "heartbeat_to_peers": 10,
        "map_messages": 7013,
        "map_message_epochs": 7143,
        "map_message_epoch_dups": 6315,
        "messages_delayed_for_map": 0,
        "osd_map_cache_hit": 203309,
        "osd_map_cache_miss": 33,
        "osd_map_cache_miss_low": 0,
        "osd_map_cache_miss_low_avg": {
            "avgcount": 0,
            "sum": 0
        },
        "osd_map_bl_cache_hit": 47012,
        "osd_map_bl_cache_miss": 1681,
        "stat_bytes": 6401248198656,
        "stat_bytes_used": 3777979072512,
        "stat_bytes_avail": 2623269126144,
        "copyfrom": 0,
        "tier_promote": 0,
        "tier_flush": 0,
        "tier_flush_fail": 0,
        "tier_try_flush": 0,
        "tier_try_flush_fail": 0,
        "tier_evict": 0,
        "tier_whiteout": 1631,
        "tier_dirty": 22360,
        "tier_clean": 0,
        "tier_delay": 0,
        "tier_proxy_read": 0,
        "tier_proxy_write": 0,
        "agent_wake": 0,
        "agent_skip": 0,
        "agent_flush": 0,
        "agent_evict": 0,
        "object_ctx_cache_hit": 16311156,
        "object_ctx_cache_total": 17426393,
        "op_cache_hit": 0,
        "osd_tier_flush_lat": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "osd_tier_promote_lat": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "osd_tier_r_lat": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "osd_pg_info": 30483113,
        "osd_pg_fastinfo": 29619885,
        "osd_pg_biginfo": 81703
    },
    "recoverystate_perf": {
        "initial_latency": {
            "avgcount": 243,
            "sum": 6.869296500,
            "avgtime": 0.028268709
        },
        "started_latency": {
            "avgcount": 1125,
            "sum": 13551384.917335850,
            "avgtime": 12045.675482076
        },
        "reset_latency": {
            "avgcount": 1368,
            "sum": 1101.727799040,
            "avgtime": 0.805356578
        },
        "start_latency": {
            "avgcount": 1368,
            "sum": 0.002014799,
            "avgtime": 0.000001472
        },
        "primary_latency": {
            "avgcount": 507,
            "sum": 4575560.638823428,
            "avgtime": 9024.774435549
        },
        "peering_latency": {
            "avgcount": 550,
            "sum": 499.372283616,
            "avgtime": 0.907949606
        },
        "backfilling_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "waitremotebackfillreserved_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "waitlocalbackfillreserved_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "notbackfilling_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "repnotrecovering_latency": {
            "avgcount": 1009,
            "sum": 8975301.082274411,
            "avgtime": 8895.243887288
        },
        "repwaitrecoveryreserved_latency": {
            "avgcount": 420,
            "sum": 99.846056520,
            "avgtime": 0.237728706
        },
        "repwaitbackfillreserved_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "reprecovering_latency": {
            "avgcount": 420,
            "sum": 241.682764382,
            "avgtime": 0.575435153
        },
        "activating_latency": {
            "avgcount": 507,
            "sum": 16.893347339,
            "avgtime": 0.033320211
        },
        "waitlocalrecoveryreserved_latency": {
            "avgcount": 199,
            "sum": 672.335512769,
            "avgtime": 3.378570415
        },
        "waitremoterecoveryreserved_latency": {
            "avgcount": 199,
            "sum": 213.536439363,
            "avgtime": 1.073047433
        },
        "recovering_latency": {
            "avgcount": 199,
            "sum": 79.007696479,
            "avgtime": 0.397023600
        },
        "recovered_latency": {
            "avgcount": 507,
            "sum": 14.000732748,
            "avgtime": 0.027614857
        },
        "clean_latency": {
            "avgcount": 395,
            "sum": 4574325.900371083,
            "avgtime": 11580.571899673
        },
        "active_latency": {
            "avgcount": 425,
            "sum": 4575107.630123680,
            "avgtime": 10764.959129702
        },
        "replicaactive_latency": {
            "avgcount": 589,
            "sum": 8975184.499049954,
            "avgtime": 15238.004242869
        },
        "stray_latency": {
            "avgcount": 818,
            "sum": 800.729455666,
            "avgtime": 0.978886865
        },
        "getinfo_latency": {
            "avgcount": 550,
            "sum": 15.085667048,
            "avgtime": 0.027428485
        },
        "getlog_latency": {
            "avgcount": 546,
            "sum": 3.482175693,
            "avgtime": 0.006377611
        },
        "waitactingchange_latency": {
            "avgcount": 39,
            "sum": 35.444551284,
            "avgtime": 0.908834648
        },
        "incomplete_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "down_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "getmissing_latency": {
            "avgcount": 507,
            "sum": 6.702129624,
            "avgtime": 0.013219190
        },
        "waitupthru_latency": {
            "avgcount": 507,
            "sum": 474.098261727,
            "avgtime": 0.935105052
        },
        "notrecovering_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "rocksdb": {
        "get": 28320977,
        "submit_transaction": 30484924,
        "submit_transaction_sync": 26371957,
        "get_latency": {
            "avgcount": 28320977,
            "sum": 325.900908733,
            "avgtime": 0.000011507
        },
        "submit_latency": {
            "avgcount": 30484924,
            "sum": 1835.888692371,
            "avgtime": 0.000060222
        },
        "submit_sync_latency": {
            "avgcount": 26371957,
            "sum": 1431.555230628,
            "avgtime": 0.000054283
        },
        "compact": 0,
        "compact_range": 0,
        "compact_queue_merge": 0,
        "compact_queue_len": 0,
        "rocksdb_write_wal_time": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "rocksdb_write_memtable_time": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "rocksdb_write_delay_time": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "rocksdb_write_pre_and_post_time": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    }
}
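
For reference, the averages above can be pulled out of such a dump like this (a minimal sketch; "perf-osd4.json" is just a placeholder name for the JSON above saved to a file):

import json

# minimal sketch: print lifetime average latencies from a saved
# "ceph daemon osd.N perf dump" (perf-osd4.json is a placeholder name)
with open("perf-osd4.json") as f:
    perf = json.load(f)

for section, keys in (
        ("bluestore", ("commit_lat", "kv_commit_lat",
                       "state_kv_queued_lat", "state_kv_commiting_lat")),
        ("osd", ("op_w_latency", "op_w_process_latency"))):
    for key in keys:
        c = perf[section][key]
        avg_ms = c["sum"] / c["avgcount"] * 1000 if c["avgcount"] else 0.0
        print("%-35s %9.3f ms  (%d ops)" % (section + "." + key, avg_ms, c["avgcount"]))

Keep in mind these are averages since the OSD started, so the slow drift only shows up when the counters are sampled periodically and differenced (which is what the InfluxDB queries quoted further down do).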

----- Original Message -----
From: "Igor Fedotov" <ifedotov@xxxxxxx>
To: "aderumier" <aderumier@xxxxxxxxx>
Cc: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx>, "Mark Nelson" <mnelson@xxxxxxxxxx>, "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx>
Sent: Tuesday, 5 February 2019 18:56:51
Subject: Re: ceph osd commit latency increase over time, until restart

On 2/4/2019 6:40 PM, Alexandre DERUMIER wrote: 
>>> but I don't see l_bluestore_fragmentation counter. 
>>> (but I have bluestore_fragmentation_micros) 
> ok, this is the same 
> 
> b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros", 
> "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000"); 
> 
> 
> Here a graph on last month, with bluestore_fragmentation_micros and latency, 
> 
> http://odisoweb1.odiso.net/latency_vs_fragmentation_micros.png 

Hmm, so fragmentation grows over time and drops on OSD restart, is that 
right? Is it the same for the other OSDs? 

This indicates some issue with the allocator - in general fragmentation 
might grow, but it shouldn't reset on restart. It looks like some intervals 
aren't being properly merged at run time. 

On the other hand, I'm not completely sure that the latency degradation is 
caused by that - the fragmentation growth is relatively small, and I don't 
see how it could impact performance that much. 
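
For reference, that counter boils down to the ratio described in the quote above. A rough sketch of it (based only on that description, not on the actual BlueStore code, and assuming a 4 KiB allocation unit):

# sketch of the metric per its description:
# (free extents / max possible number of free extents) * 1000
def fragmentation_metric(num_free_extents, free_bytes, alloc_unit=4096):
    max_extents = free_bytes // alloc_unit  # worst case: every free unit is its own extent
    return 0 if max_extents == 0 else num_free_extents * 1000 // max_extents

So a reading of 67, as in the perf dump above, means the free space is split into well under a tenth of the worst-case number of extents, i.e. fairly low fragmentation.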

I'm wondering whether you have OSD mempool monitoring (dump_mempools command 
output on the admin socket) reports? Do you have any historic data? 

If not, may I have the current output, and say a couple more samples at 
8-12 hour intervals? 
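
Something as simple as the following would do for collecting them (a sketch, assuming Python 3 on the OSD host and that the ceph CLI can reach the OSD's admin socket; osd.4 and the 8-hour interval are placeholders to adjust):

import json, subprocess, time
from datetime import datetime

OSD = "osd.4"         # placeholder - the OSD you want to watch
INTERVAL = 8 * 3600   # seconds between samples (8-12h as requested)

while True:
    # dump_mempools via the admin socket, saved with a timestamp
    out = subprocess.check_output(["ceph", "daemon", OSD, "dump_mempools"])
    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    with open("mempools-%s-%s.json" % (OSD, stamp), "wb") as f:
        f.write(out)
    total = json.loads(out)["mempool"]["total"]
    print(stamp, "items:", total["items"], "bytes:", total["bytes"])
    time.sleep(INTERVAL)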


As for backporting the bitmap allocator to Mimic - we haven't had such plans 
so far, but I'll discuss it at the BlueStore meeting shortly. 


Thanks, 

Igor 

> ----- Original Message ----- 
> From: "Alexandre Derumier" <aderumier@xxxxxxxxx> 
> To: "Igor Fedotov" <ifedotov@xxxxxxx> 
> Cc: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx>, "Mark Nelson" <mnelson@xxxxxxxxxx>, "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> 
> Sent: Monday, 4 February 2019 16:04:38 
> Subject: Re: ceph osd commit latency increase over time, until restart 
> 
> Thanks Igor, 
> 
>>> Could you please collect BlueStore performance counters right after OSD 
>>> startup and once you get high latency. 
>>> 
>>> Specifically 'l_bluestore_fragmentation' parameter is of interest. 
> I'm already monitoring with 
> "ceph daemon osd.x perf dump" (I have 2 months of history with all counters), 
> 
> but I don't see l_bluestore_fragmentation counter. 
> 
> (but I have bluestore_fragmentation_micros) 
> 
> 
>>> Also if you're able to rebuild the code I can probably make a simple 
>>> patch to track latency and some other internal allocator's paramter to 
>>> make sure it's degraded and learn more details. 
> Sorry, it's a critical production cluster, so I can't test on it :( 
> But I have a test cluster; maybe I can put some load on it and try to reproduce. 
> 
> 
> 
>>> More vigorous fix would be to backport bitmap allocator from Nautilus 
>>> and try the difference... 
> Any plan to backport it to Mimic? (But I can wait for Nautilus.) 
> The perf results of the new bitmap allocator look very promising from what I've seen in the PR. 
> 
> 
> 
> ----- Original Message ----- 
> From: "Igor Fedotov" <ifedotov@xxxxxxx> 
> To: "Alexandre Derumier" <aderumier@xxxxxxxxx>, "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx>, "Mark Nelson" <mnelson@xxxxxxxxxx> 
> Cc: "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> 
> Sent: Monday, 4 February 2019 15:51:30 
> Subject: Re: ceph osd commit latency increase over time, until restart 
> 
> Hi Alexandre, 
> 
> looks like a bug in StupidAllocator. 
> 
> Could you please collect BlueStore performance counters right after OSD 
> startup and once you get high latency. 
> 
> Specifically 'l_bluestore_fragmentation' parameter is of interest. 
> 
> Also, if you're able to rebuild the code, I can probably make a simple 
> patch to track latency and some other internal allocator parameters to 
> make sure it's degraded and learn more details. 
> 
> 
> More vigorous fix would be to backport bitmap allocator from Nautilus 
> and try the difference... 
> 
> 
> Thanks, 
> 
> Igor 
> 
> 
> On 2/4/2019 5:17 PM, Alexandre DERUMIER wrote: 
>> Hi again, 
>> 
>> I spoke too fast - the problem has occurred again, so it's not related to the tcmalloc cache size. 
>> 
>> 
>> I have noticed something using a simple "perf top": 
>> 
>> each time I have this problem (I have seen exactly the same behaviour 4 times), 
>> 
>> when latency is bad, perf top gives me: 
>> 
>> StupidAllocator::_aligned_len 
>> and 
>> btree::btree_iterator<btree::btree_node<btree::btree_map_params<unsigned long, unsigned long, std::less<unsigned long>, mempoo 
>> l::pool_allocator<(mempool::pool_index_t)1, std::pair<unsigned long const, unsigned long> >, 256> >, std::pair<unsigned long const, unsigned long>&, std::pair<unsigned long 
>> const, unsigned long>*>::increment_slow() 
>> 
>> (around 10-20% of the time for both) 
>> 
>> 
>> when latency is good, I don't see them at all. 
>> 
>> 
>> I have used Mark's wallclock profiler; here are the results: 
>> 
>> http://odisoweb1.odiso.net/gdbpmp-ok.txt 
>> 
>> http://odisoweb1.odiso.net/gdbpmp-bad.txt 
>> 
>> 
>> Here is an extract of the thread with btree::btree_iterator && StupidAllocator::_aligned_len: 
>> 
>> 
>> + 100.00% clone 
>> + 100.00% start_thread 
>> + 100.00% ShardedThreadPool::WorkThreadSharded::entry() 
>> + 100.00% ShardedThreadPool::shardedthreadpool_worker(unsigned int) 
>> + 100.00% OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*) 
>> + 70.00% PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&) 
>> | + 70.00% OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&) 
>> | + 70.00% PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&) 
>> | + 68.00% PGBackend::handle_message(boost::intrusive_ptr<OpRequest>) 
>> | | + 68.00% ReplicatedBackend::_handle_message(boost::intrusive_ptr<OpRequest>) 
>> | | + 68.00% ReplicatedBackend::do_repop(boost::intrusive_ptr<OpRequest>) 
>> | | + 67.00% non-virtual thunk to PrimaryLogPG::queue_transactions(std::vector<ObjectStore::Transaction, std::allocator<ObjectStore::Transaction> >&, boost::intrusive_ptr<OpRequest>) 
>> | | | + 67.00% BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, std::vector<ObjectStore::Transaction, std::allocator<ObjectStore::Transaction> >&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*) 
>> | | | + 66.00% BlueStore::_txc_add_transaction(BlueStore::TransContext*, ObjectStore::Transaction*) 
>> | | | | + 66.00% BlueStore::_write(BlueStore::TransContext*, boost::intrusive_ptr<BlueStore::Collection>&, boost::intrusive_ptr<BlueStore::Onode>&, unsigned long, unsigned long, ceph::buffer::list&, unsigned int) 
>> | | | | + 66.00% BlueStore::_do_write(BlueStore::TransContext*, boost::intrusive_ptr<BlueStore::Collection>&, boost::intrusive_ptr<BlueStore::Onode>, unsigned long, unsigned long, ceph::buffer::list&, unsigned int) 
>> | | | | + 65.00% BlueStore::_do_alloc_write(BlueStore::TransContext*, boost::intrusive_ptr<BlueStore::Collection>, boost::intrusive_ptr<BlueStore::Onode>, BlueStore::WriteContext*) 
>> | | | | | + 64.00% StupidAllocator::allocate(unsigned long, unsigned long, unsigned long, long, std::vector<bluestore_pextent_t, mempool::pool_allocator<(mempool::pool_index_t)4, bluestore_pextent_t> >*) 
>> | | | | | | + 64.00% StupidAllocator::allocate_int(unsigned long, unsigned long, long, unsigned long*, unsigned int*) 
>> | | | | | | + 34.00% btree::btree_iterator<btree::btree_node<btree::btree_map_params<unsigned long, unsigned long, std::less<unsigned long>, mempool::pool_allocator<(mempool::pool_index_t)1, std::pair<unsigned long const, unsigned long> >, 256> >, std::pair<unsigned long const, unsigned long>&, std::pair<unsigned long const, unsigned long>*>::increment_slow() 
>> | | | | | | + 26.00% StupidAllocator::_aligned_len(interval_set<unsigned long, btree::btree_map<unsigned long, unsigned long, std::less<unsigned long>, mempool::pool_allocator<(mempool::pool_index_t)1, std::pair<unsigned long const, unsigned long> >, 256> >::iterator, unsigned long) 
>> 
>> 
>> 
>> ----- Original Message ----- 
>> From: "Alexandre Derumier" <aderumier@xxxxxxxxx> 
>> To: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx> 
>> Cc: "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> 
>> Sent: Monday, 4 February 2019 09:38:11 
>> Subject: Re: ceph osd commit latency increase over time, until restart 
>> 
>> Hi, 
>> 
>> some news: 
>> 
>> I have tried different transparent hugepage values (madvise, never): no change 
>> 
>> I have tried increasing bluestore_cache_size_ssd to 8G: no change 
>> 
>> I have tried increasing TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES to 256MB: it seems to help; after 24h I'm still around 1.5ms (I need to wait a few more days to be sure). 
>> 
>> 
>> Note that this behaviour seems to happen much faster (< 2 days) on my big NVMe drives (6TB); 
>> my other clusters use 1.6TB SSDs. 
>> 
>> Currently I'm using only 1 OSD per NVMe (I don't have more than 5000 iops per OSD), but I'll try 2 OSDs per NVMe this week, to see if it helps. 
>> 
>> 
>> BTW, has somebody already tested Ceph without tcmalloc, with glibc >= 2.26 (which also has a thread cache)? 
>> 
>> 
>> Regards, 
>> 
>> Alexandre 
>> 
>> 
>> ----- Original Message ----- 
>> From: "aderumier" <aderumier@xxxxxxxxx> 
>> To: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx> 
>> Cc: "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> 
>> Sent: Wednesday, 30 January 2019 19:58:15 
>> Subject: Re: ceph osd commit latency increase over time, until restart 
>> 
>>>> Thanks. Is there any reason you monitor op_w_latency and op_latency 
>>>> but not op_r_latency? 
>>>> 
>>>> Also, why do you monitor op_w_process_latency but not op_r_process_latency? 
>> I monitor reads too (I have all metrics from the OSD sockets, and a lot of graphs). 
>> 
>> I just don't see a latency difference on reads (or it is very, very small compared to the write latency increase). 
>> 
>> 
>> 
>> ----- Original Message ----- 
>> From: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx> 
>> To: "aderumier" <aderumier@xxxxxxxxx> 
>> Cc: "Sage Weil" <sage@xxxxxxxxxxxx>, "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> 
>> Sent: Wednesday, 30 January 2019 19:50:20 
>> Subject: Re: ceph osd commit latency increase over time, until restart 
>> 
>> Hi, 
>> 
>> On 30.01.19 at 14:59, Alexandre DERUMIER wrote: 
>>> Hi Stefan, 
>>> 
>>>>> currently i'm in the process of switching back from jemalloc to tcmalloc 
>>>>> like suggested. This report makes me a little nervous about my change. 
>>> Well, I'm really not sure that it's a tcmalloc bug; 
>>> maybe it's bluestore related (I don't have filestore anymore to compare against). 
>>> I need to compare with bigger latencies. 
>>> 
>>> Here is an example: all OSDs at 20-50ms before restart, then 1ms after restart (at 21:15): 
>>> http://odisoweb1.odiso.net/latencybad.png 
>>> 
>>> I observe the latency in my guest VMs too, as disk iowait. 
>>> 
>>> http://odisoweb1.odiso.net/latencybadvm.png 
>>> 
>>>>> Also i'm currently only monitoring latency for filestore osds. Which 
>>>>> exact values out of the daemon do you use for bluestore? 
>>> Here are my InfluxDB queries: 
>>> 
>>> They take op_latency.sum / op_latency.avgcount over the last second. 
>>> 
>>> 
>>> SELECT non_negative_derivative(first("op_latency.sum"), 1s)/non_negative_derivative(first("op_latency.avgcount"),1s) FROM "ceph" WHERE "host" =~ /^([[host]])$/ AND "id" =~ /^([[osd]])$/ AND $timeFilter GROUP BY time($interval), "host", "id" fill(previous) 
>>> 
>>> 
>>> SELECT non_negative_derivative(first("op_w_latency.sum"), 1s)/non_negative_derivative(first("op_w_latency.avgcount"),1s) FROM "ceph" WHERE "host" =~ /^([[host]])$/ AND collection='osd' AND "id" =~ /^([[osd]])$/ AND $timeFilter GROUP BY time($interval), "host", "id" fill(previous) 
>>> 
>>> 
>>> SELECT non_negative_derivative(first("op_w_process_latency.sum"), 1s)/non_negative_derivative(first("op_w_process_latency.avgcount"),1s) FROM "ceph" WHERE "host" =~ /^([[host]])$/ AND collection='osd' AND "id" =~ /^([[osd]])$/ AND $timeFilter GROUP BY time($interval), "host", "id" fill(previous) 
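>>> 
>>> The same computation outside InfluxDB, as a sketch with made-up sample values (sum and avgcount are cumulative, so two consecutive samples have to be differenced): 
>>> 
>>> def interval_latency_ms(prev, cur): 
>>>     # latency over the sampling interval, in ms 
>>>     d_sum = cur["sum"] - prev["sum"] 
>>>     d_cnt = cur["avgcount"] - prev["avgcount"] 
>>>     return 0.0 if d_cnt == 0 else d_sum / d_cnt * 1000.0 
>>> 
>>> # two hypothetical op_w_latency samples, one scrape interval apart 
>>> prev = {"avgcount": 10546000, "sum": 33160.500000000} 
>>> cur = {"avgcount": 10546037, "sum": 33160.719998316} 
>>> print("%.3f ms" % interval_latency_ms(prev, cur)) 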
>> Thanks. Is there any reason you monitor op_w_latency and op_latency 
>> but not op_r_latency? 
>> 
>> Also, why do you monitor op_w_process_latency but not op_r_process_latency? 
>> 
>> greets, 
>> Stefan 
>> 
>>> 
>>> 
>>> 
>>> ----- Original Message ----- 
>>> From: "Stefan Priebe, Profihost AG" <s.priebe@xxxxxxxxxxxx> 
>>> To: "aderumier" <aderumier@xxxxxxxxx>, "Sage Weil" <sage@xxxxxxxxxxxx> 
>>> Cc: "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> 
>>> Sent: Wednesday, 30 January 2019 08:45:33 
>>> Subject: Re: ceph osd commit latency increase over time, until restart 
>>> 
>>> Hi, 
>>> 
>>> On 30.01.19 at 08:33, Alexandre DERUMIER wrote: 
>>>> Hi, 
>>>> 
>>>> Here are some new results, 
>>>> from a different OSD / different cluster: 
>>>> 
>>>> before the OSD restart, latency was between 2-5ms 
>>>> after the OSD restart, it is around 1-1.5ms 
>>>> 
>>>> http://odisoweb1.odiso.net/cephperf2/bad.txt (2-5ms) 
>>>> http://odisoweb1.odiso.net/cephperf2/ok.txt (1-1.5ms) 
>>>> http://odisoweb1.odiso.net/cephperf2/diff.txt 
>>>> 
>>>> From what I see in the diff, the biggest difference is in tcmalloc, but maybe I'm wrong. 
>>>> (I'm using tcmalloc 2.5-2.2) 
>>> Currently I'm in the process of switching back from jemalloc to tcmalloc 
>>> as suggested. This report makes me a little nervous about my change. 
>>> 
>>> Also, I'm currently only monitoring latency for filestore OSDs. Which 
>>> exact values from the daemon do you use for bluestore? 
>>> 
>>> I would like to check whether I see the same behaviour. 
>>> 
>>> Greets, 
>>> Stefan 
>>> 
>>>> ----- Original Message ----- 
>>>> From: "Sage Weil" <sage@xxxxxxxxxxxx> 
>>>> To: "aderumier" <aderumier@xxxxxxxxx> 
>>>> Cc: "ceph-users" <ceph-users@xxxxxxxxxxxxxx>, "ceph-devel" <ceph-devel@xxxxxxxxxxxxxxx> 
>>>> Sent: Friday, 25 January 2019 10:49:02 
>>>> Subject: Re: ceph osd commit latency increase over time, until restart 
>>>> 
>>>> Can you capture a perf top or perf record to see where the CPU time is 
>>>> going on one of the OSDs with a high latency? 
>>>> 
>>>> Thanks! 
>>>> sage 
>>>> 
>>>> 
>>>> On Fri, 25 Jan 2019, Alexandre DERUMIER wrote: 
>>>> 
>>>>> Hi, 
>>>>> 
>>>>> I'm seeing strange behaviour of my OSDs, on multiple clusters. 
>>>>> 
>>>>> All clusters are running Mimic 13.2.1 with bluestore, on SSD or NVMe drives; 
>>>>> the workload is RBD only, with qemu-kvm VMs running librbd + snapshot / rbd export-diff / snapshot delete each day for backups. 
>>>>> 
>>>>> When the OSDs are freshly started, the commit latency is between 0.5-1ms. 
>>>>> 
>>>>> But over time, this latency increases slowly (maybe around 1ms per day), until it reaches crazy 
>>>>> values like 20-200ms. 
>>>>> 
>>>>> Some example graphs: 
>>>>> 
>>>>> http://odisoweb1.odiso.net/osdlatency1.png 
>>>>> http://odisoweb1.odiso.net/osdlatency2.png 
>>>>> 
>>>>> All osds have this behaviour, in all clusters. 
>>>>> 
>>>>> The latency of the physical disks is OK (the clusters are far from fully loaded). 
>>>>> 
>>>>> And if I restart the OSD, the latency comes back to 0.5-1ms. 
>>>>> 
>>>>> That reminds me of the old tcmalloc bug, but could it maybe be a bluestore memory bug? 
>>>>> 
>>>>> Any hints on counters/logs to check? 
>>>>> 
>>>>> 
>>>>> Regards, 
>>>>> 
>>>>> Alexandre 
>>>>> 
>>>>> 
>>>> _______________________________________________ 
>>>> ceph-users mailing list 
>>>> ceph-users@xxxxxxxxxxxxxx 
>>>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com 
>>>> 



_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com




