Bug in MGR related to OSD reweight?

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello

In order to reduce the rebalancing load when removing a disk, we ran a test in which we adjusted the reweight value of the OSD.

When the OSD reweight value was set to 0 during the test, an assertion failure occurred in the MGR while the OSD was not in the down state.
(The MGR went down after three restart attempts.)

Regardless of the OSD status, even if the OSD reweight value is set to 0, MGR should not be affected. Is there anything you would like to share about this?

Below are the log output and the related code.

[Ceph crash info]
------------------------------------------------------------------------------
{
    "archived": "2023-07-25 07:57:04.772224",
    "assert_condition": "osd_weight.count(oid)",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.3/rpm/el8/BUILD/ceph-17.2.3/src/osd/OSDMap.cc",
    "assert_func": "float OSDMap::calc_deviations(ceph::common::CephContext*, const std::map<int, std::set<pg_t> >&, const std::map<int, float>&, float, std::map<int, float>&, std::multimap<float, int>&, float&)",
    "assert_line": 5155,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.3/rpm/el8/BUILD/ceph-17.2.3/src/osd/OSDMap.cc: In function 'float OSDMap::calc_deviations(ceph::common::CephContext*, const std::map<int, std::set<pg_t> >&, const std::map<int, float>&, float, std::map<int, float>&, std::multimap<float, int>&, float&)' thread 7f56bf420700 time 2023-07-25T07:49:30.155196+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.3/rpm/el8/BUILD/ceph-17.2.3/src/osd/OSDMap.cc: 5155: FAILED ceph_assert(osd_weight.count(oid))\n",
    "assert_thread_name": "balancer",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f570b3c3ce0]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1b0) [0x7f570c5a4cd2]",
        "/usr/lib64/ceph/libceph-common.so.2(+0x283e95) [0x7f570c5a4e95]",
        "(OSDMap::calc_deviations(ceph::common::CephContext*, std::map<int, std::set<pg_t, std::less<pg_t>, std::allocator<pg_t> >, std::less<int>, std::allocator<std::pair<int const, std::set<pg_t, std::less<pg_t>, std::allocator<pg_t> > > > > const&, std::map<int, float, std::less<int>, std::allocator<std::pair<int const, float> > > const&, float, std::map<int, float, std::less<int>, std::allocator<std::pair<int const, float> > >&, std::multimap<float, int, std::less<float>, std::allocator<std::pair<float const, int> > >&, float&)+0xe0) [0x7f570ca71490]",
        "(OSDMap::calc_pg_upmaps(ceph::common::CephContext*, unsigned int, int, std::set<long, std::less<long>, std::allocator<long> > const&, OSDMap::Incremental*, unsigned int*)+0x389) [0x7f570ca75829]",
        "/usr/bin/ceph-mgr(+0x299406) [0x555f39368406]",
        "/lib64/libpython3.6m.so.1.0(+0x19d0d7) [0x7f570d4d80d7]",
        "_PyEval_EvalFrameDefault()",
        "/lib64/libpython3.6m.so.1.0(+0xf9984) [0x7f570d434984]",
        "/lib64/libpython3.6m.so.1.0(+0x17a030) [0x7f570d4b5030]",
        "/lib64/libpython3.6m.so.1.0(+0x19d377) [0x7f570d4d8377]",
        "_PyEval_EvalFrameDefault()",
        "/lib64/libpython3.6m.so.1.0(+0x179e48) [0x7f570d4b4e48]",
        "/lib64/libpython3.6m.so.1.0(+0x19d377) [0x7f570d4d8377]",
        "_PyEval_EvalFrameDefault()",
        "/lib64/libpython3.6m.so.1.0(+0x179e48) [0x7f570d4b4e48]",
        "/lib64/libpython3.6m.so.1.0(+0x19d377) [0x7f570d4d8377]",
        "_PyEval_EvalFrameDefault()",
        "/lib64/libpython3.6m.so.1.0(+0xfa2f6) [0x7f570d4352f6]",
        "_PyFunction_FastCallDict()",
        "_PyObject_FastCallDict()",
        "/lib64/libpython3.6m.so.1.0(+0x10db30) [0x7f570d448b30]",
        "_PyObject_FastCallDict()",
        "PyObject_CallMethod()",
        "(PyModuleRunner::serve()+0x66) [0x555f39363e06]",
        "(PyModuleRunner::PyModuleRunnerThread::entry()+0x3e3) [0x555f39365443]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7f570b3b91ca]",
        "clone()"
    ],
    "ceph_version": "17.2.3",
    "crash_id": "2023-07-25T07:49:30.158977Z_893ea7cd-a5d9-4e57-9ea0-86978b1300cf",
    "entity_name": "mgr.eyb-ceph-01.axqecw",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-mgr",
    "stack_sig": "4195797c9f0eefb87056e4b2697fb9367d000d73fc09bdea7f52dbe5b600ccca",
    "timestamp": "2023-07-25T07:49:30.158977Z",
    "utsname_hostname": "eyb-ceph-01",
    "utsname_machine": "x86_64",
    "utsname_release": "5.4.0-125-generic",
    "utsname_sysname": "Linux",
    "utsname_version": "#141-Ubuntu SMP Wed Aug 10 13:42:03 UTC 2022"
}

[MGR Log]
/var/log/ceph/<fsid>/ceph-mgr*.log
------------------------------------------------------------------------------
2023-08-22T11:06:56.402+0000 7fbf43d5e700  0 [balancer INFO root] Optimize plan auto_2023-08-22_11:06:56
2023-08-22T11:06:56.402+0000 7fbf43d5e700  0 [balancer INFO root] Mode upmap, max misplaced 0.050000
2023-08-22T11:06:56.402+0000 7fbf43d5e700  0 [balancer INFO root] do_upmap
2023-08-22T11:06:56.402+0000 7fbf43d5e700  0 [balancer INFO root] pools ['rp3pool', 'ec195-4k-pool', 'default.rgw.log', 'ec93pool-rbd', 'suwon.bucket.index', 'default.rgw.buckets.index', '.rgw.root', 'ec93-non-stripe-pool', 'default.rgw.control', '.mgr', 'default.rgw.meta', 'default.rgw.otp', 'default.rgw.buckets.non-ec', 'ec195-64k-pool', 'ec195-16k-pool']
2023-08-22T11:06:56.450+0000 7fbf43d5e700 -1 /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.3/rpm/el8/BUILD/ceph-17.2.3/src/osd/OSDMap.cc: In function 'float OSDMap::calc_deviations(ceph::common::CephContext*, const std::map<int, std::set<pg_t> >&, const std::map<int, float>&, float, std::map<int, float>&, std::multimap<float, int>&, float&)' thread 7fbf43d5e700 time 2023-08-22T11:06:56.450708+0000
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.3/rpm/el8/BUILD/ceph-17.2.3/src/osd/OSDMap.cc: 5155: FAILED ceph_assert(osd_weight.count(oid))
 
 ceph version 17.2.3 (dff484dfc9e19a9819f375586300b3b79d80034d) quincy (stable)
 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x152) [0x7fbfadff5c74]
 2: /usr/lib64/ceph/libceph-common.so.2(+0x283e95) [0x7fbfadff5e95]
 3: (OSDMap::calc_deviations(ceph::common::CephContext*, std::map<int, std::set<pg_t, std::less<pg_t>, std::allocator<pg_t> >, std::less<int>, std::allocator<std::pair<int const, std::set<pg_t, std::less<pg_t>, std::allocator<pg_t> > > > > const&, std::map<int, float, std::less<int>, std::allocator<std::pair<int const, float> > > const&, float, std::map<int, float, std::less<int>, std::allocator<std::pair<int const, float> > >&, std::multimap<float, int, std::less<float>, std::allocator<std::pair<float const, int> > >&, float&)+0xe0) [0x7fbfae4c2490]
 4: (OSDMap::calc_pg_upmaps(ceph::common::CephContext*, unsigned int, int, std::set<long, std::less<long>, std::allocator<long> > const&, OSDMap::Incremental*, unsigned int*)+0x389) [0x7fbfae4c6829]
 5: /usr/bin/ceph-mgr(+0x299406) [0x55e50358a406]
 6: /lib64/libpython3.6m.so.1.0(+0x19d0d7) [0x7fbfaef290d7]
 7: _PyEval_EvalFrameDefault()
 8: /lib64/libpython3.6m.so.1.0(+0xf9984) [0x7fbfaee85984]
 9: /lib64/libpython3.6m.so.1.0(+0x17a030) [0x7fbfaef06030]
 10: /lib64/libpython3.6m.so.1.0(+0x19d377) [0x7fbfaef29377]
 11: _PyEval_EvalFrameDefault()
 12: /lib64/libpython3.6m.so.1.0(+0x179e48) [0x7fbfaef05e48]
 13: /lib64/libpython3.6m.so.1.0(+0x19d377) [0x7fbfaef29377]
 14: _PyEval_EvalFrameDefault()
 15: /lib64/libpython3.6m.so.1.0(+0x179e48) [0x7fbfaef05e48]
 16: /lib64/libpython3.6m.so.1.0(+0x19d377) [0x7fbfaef29377]
 17: _PyEval_EvalFrameDefault()
 18: /lib64/libpython3.6m.so.1.0(+0xfa2f6) [0x7fbfaee862f6]
 19: _PyFunction_FastCallDict()
 20: _PyObject_FastCallDict()
 21: /lib64/libpython3.6m.so.1.0(+0x10db30) [0x7fbfaee99b30]
 22: _PyObject_FastCallDict()
 23: PyObject_CallMethod()
 24: (PyModuleRunner::serve()+0x66) [0x55e503585e06]
 25: (PyModuleRunner::PyModuleRunnerThread::entry()+0x3e3) [0x55e503587443]
 26: /lib64/libpthread.so.0(+0x81ca) [0x7fbface0a1ca]
 27: clone()
 
2023-08-22T11:06:56.450+0000 7fbf43d5e700 -1 *** Caught signal (Aborted) **
 in thread 7fbf43d5e700 thread_name:balancer
 
 ceph version 17.2.3 (dff484dfc9e19a9819f375586300b3b79d80034d) quincy (stable)
 1: /lib64/libpthread.so.0(+0x12ce0) [0x7fbface14ce0]
 2: gsignal()
 3: abort()
 4: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1b0) [0x7fbfadff5cd2]
 5: /usr/lib64/ceph/libceph-common.so.2(+0x283e95) [0x7fbfadff5e95]
 6: (OSDMap::calc_deviations(ceph::common::CephContext*, std::map<int, std::set<pg_t, std::less<pg_t>, std::allocator<pg_t> >, std::less<int>, std::allocator<std::pair<int const, std::set<pg_t, std::less<pg_t>, std::allocator<pg_t> > > > > const&, std::map<int, float, std::less<int>, std::allocator<std::pair<int const, float> > > const&, float, std::map<int, float, std::less<int>, std::allocator<std::pair<int const, float> > >&, std::multimap<float, int, std::less<float>, std::allocator<std::pair<float const, int> > >&, float&)+0xe0) [0x7fbfae4c2490]
 7: (OSDMap::calc_pg_upmaps(ceph::common::CephContext*, unsigned int, int, std::set<long, std::less<long>, std::allocator<long> > const&, OSDMap::Incremental*, unsigned int*)+0x389) [0x7fbfae4c6829]
 8: /usr/bin/ceph-mgr(+0x299406) [0x55e50358a406]

[Related code]
OSDMap.cc
------------------------------------------------------------------------------
float OSDMap::calc_deviations (
  CephContext *cct,
  const map<int,set<pg_t>>& pgs_by_osd,
  const map<int,float>& osd_weight,
  float pgs_per_weight,
  map<int,float>& osd_deviation,
  multimap<float,int>& deviation_osd,
  float& stddev)  // return current max deviation
{
  //
  // For every OSD listed in pgs_by_osd, compute the deviation between the
  // number of PGs currently mapped to it and its target number
  // (osd_weight[osd] * pgs_per_weight), and record that deviation in both
  // osd_deviation (osd -> deviation) and deviation_osd (deviation -> osd).
  // Neither output map is cleared here; entries are only written or added.
  //
  // stddev is reset to zero and then accumulates the squared deviations.
  // NOTE - this is not exactly a standard deviation (the original author
  //        describes it as stddev^2), but as long as it is monotonic with
  //        stddev it is sufficient for the balancer code.
  //
  // Returns the largest absolute deviation seen (0.0 if pgs_by_osd is empty).
  //
  stddev = 0.0;
  float max_abs_deviation = 0.0;
  for (const auto& osd_entry : pgs_by_osd) {
    const int oid = osd_entry.first;
    const set<pg_t>& osd_pgs = osd_entry.second;

    // make sure osd is still there (belongs to this crush-tree); an OSD that
    // appears in pgs_by_osd but has no entry in osd_weight aborts here
    ceph_assert(osd_weight.count(oid));

    const float target = osd_weight.at(oid) * pgs_per_weight;
    const float deviation = static_cast<float>(osd_pgs.size()) - target;
    ldout(cct, 20) << " osd." << oid
                   << "\tpgs " << osd_pgs.size()
                   << "\ttarget " << target
                   << "\tdeviation " << deviation
                   << dendl;

    // keep both lookup directions in sync
    osd_deviation[oid] = deviation;
    deviation_osd.emplace(deviation, oid);

    stddev += deviation * deviation;

    const float abs_deviation = fabsf(deviation);
    if (abs_deviation > max_abs_deviation) {
      max_abs_deviation = abs_deviation;
    }
  }
  return max_abs_deviation;
}
_______________________________________________
Dev mailing list -- dev@xxxxxxx
To unsubscribe send an email to dev-leave@xxxxxxx



[Index of Archives]     [CEPH Users]     [Ceph Devel]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux