Hi!

I just had time to check again: even after removing the broken OSD, the mgr still crashes. All OSDs are up and in. If I run "ceph balancer on" on a HEALTH_OK cluster, an optimization plan is generated and started, and after a few minutes all MGRs die. This is a major problem for me, as I still have that imbalanced SSD OSD limiting the usable space of the whole pool.

root@adminnode:~# ceph osd tree
ID  CLASS WEIGHT   TYPE NAME                      STATUS REWEIGHT PRI-AFF
 -1       29.91933 root default
-16       29.91933     datacenter dc01
-19       29.91933         pod dc01-agg01
-10       16.52396             rack dc01-rack02
 -4        6.29695                 host node1001
  0   hdd  0.90999                     osd.0          up  1.00000 1.00000
  1   hdd  0.90999                     osd.1          up  1.00000 1.00000
  5   hdd  0.90999                     osd.5          up  1.00000 1.00000
 29   hdd  0.90970                     osd.29         up  1.00000 1.00000
 33   hdd  0.90970                     osd.33         up  1.00000 1.00000
  2   ssd  0.43700                     osd.2          up  1.00000 1.00000
  3   ssd  0.43700                     osd.3          up  1.00000 1.00000
  4   ssd  0.43700                     osd.4          up  1.00000 1.00000
 30   ssd  0.43660                     osd.30         up  1.00000 1.00000
 -7        6.29724                 host node1002
  9   hdd  0.90999                     osd.9          up  1.00000 1.00000
 10   hdd  0.90999                     osd.10         up  1.00000 1.00000
 11   hdd  0.90999                     osd.11         up  1.00000 1.00000
 12   hdd  0.90999                     osd.12         up  1.00000 1.00000
 35   hdd  0.90970                     osd.35         up  1.00000 1.00000
  6   ssd  0.43700                     osd.6          up  1.00000 1.00000
  7   ssd  0.43700                     osd.7          up  1.00000 1.00000
  8   ssd  0.43700                     osd.8          up  1.00000 1.00000
 31   ssd  0.43660                     osd.31         up  1.00000 1.00000
-28        2.18318                 host node1005
 34   ssd  0.43660                     osd.34         up  1.00000 1.00000
 36   ssd  0.87329                     osd.36         up  1.00000 1.00000
 37   ssd  0.87329                     osd.37         up  1.00000 1.00000
-29        1.74658                 host node1006
 42   ssd  0.87329                     osd.42         up  1.00000 1.00000
 43   ssd  0.87329                     osd.43         up  1.00000 1.00000
-11       13.39537             rack dc01-rack03
-22        5.38794                 host node1003
 17   hdd  0.90999                     osd.17         up  1.00000 1.00000
 18   hdd  0.90999                     osd.18         up  1.00000 1.00000
 24   hdd  0.90999                     osd.24         up  1.00000 1.00000
 26   hdd  0.90999                     osd.26         up  1.00000 1.00000
 13   ssd  0.43700                     osd.13         up  1.00000 1.00000
 14   ssd  0.43700                     osd.14         up  1.00000 1.00000
 15   ssd  0.43700                     osd.15         up  1.00000 1.00000
 16   ssd  0.43700                     osd.16         up  1.00000 1.00000
-25        5.38765                 host node1004
 23   hdd  0.90999                     osd.23         up  1.00000 1.00000
 25   hdd  0.90999                     osd.25         up  1.00000 1.00000
 27   hdd  0.90999                     osd.27         up  1.00000 1.00000
 28   hdd  0.90970                     osd.28         up  1.00000 1.00000
 19   ssd  0.43700                     osd.19         up  1.00000 1.00000
 20   ssd  0.43700                     osd.20         up  1.00000 1.00000
 21   ssd  0.43700                     osd.21         up  1.00000 1.00000
 22   ssd  0.43700                     osd.22         up  1.00000 1.00000
-30        2.61978                 host node1007
 38   ssd  0.43660                     osd.38         up  1.00000 1.00000
 39   ssd  0.43660                     osd.39         up  1.00000 1.00000
 40   ssd  0.87329                     osd.40         up  1.00000 1.00000
 41   ssd  0.87329                     osd.41         up  1.00000 1.00000

root@adminnode:~# ceph osd df
ID CLASS WEIGHT  REWEIGHT SIZE   USE     AVAIL   %USE  VAR  PGS
 0   hdd 0.90999  1.00000 932GiB 353GiB   579GiB 37.87 0.83  95
 1   hdd 0.90999  1.00000 932GiB 400GiB   531GiB 42.98 0.94 108
 5   hdd 0.90999  1.00000 932GiB 267GiB   664GiB 28.70 0.63  72
29   hdd 0.90970  1.00000 932GiB 356GiB   576GiB 38.19 0.84  96
33   hdd 0.90970  1.00000 932GiB 344GiB   587GiB 36.94 0.81  93
 2   ssd 0.43700  1.00000 447GiB 273GiB   174GiB 61.09 1.34  52
 3   ssd 0.43700  1.00000 447GiB 252GiB   195GiB 56.38 1.23  61
 4   ssd 0.43700  1.00000 447GiB 308GiB   140GiB 68.78 1.51  59
30   ssd 0.43660  1.00000 447GiB 231GiB   216GiB 51.77 1.13  48
 9   hdd 0.90999  1.00000 932GiB 358GiB   573GiB 38.48 0.84  97
10   hdd 0.90999  1.00000 932GiB 347GiB   585GiB 37.25 0.82  94
11   hdd 0.90999  1.00000 932GiB 335GiB   597GiB 35.96 0.79  91
12   hdd 0.90999  1.00000 932GiB 357GiB   575GiB 38.28 0.84  96
35   hdd 0.90970  1.00000 932GiB 318GiB   614GiB 34.14 0.75  86
 6   ssd 0.43700  1.00000 447GiB 278GiB   170GiB 62.08 1.36  63
 7   ssd 0.43700  1.00000 447GiB 256GiB   191GiB 57.17 1.25  60
 8   ssd 0.43700  1.00000 447GiB 291GiB   156GiB 65.01 1.42  57
31   ssd 0.43660  1.00000 447GiB 246GiB   201GiB 54.96 1.20  51
34   ssd 0.43660  1.00000 447GiB 189GiB   258GiB 42.22 0.92  46
36   ssd 0.87329  1.00000 894GiB 389GiB   506GiB 43.45 0.95  91
37   ssd 0.87329  1.00000 894GiB 390GiB   504GiB 43.63 0.96  85
42   ssd 0.87329  1.00000 894GiB 401GiB   493GiB 44.88 0.98  92
43   ssd 0.87329  1.00000 894GiB 455GiB   439GiB 50.89 1.11  89
17   hdd 0.90999  1.00000 932GiB 368GiB   563GiB 39.55 0.87 100
18   hdd 0.90999  1.00000 932GiB 350GiB   582GiB 37.56 0.82  95
24   hdd 0.90999  1.00000 932GiB 359GiB   572GiB 38.58 0.84  97
26   hdd 0.90999  1.00000 932GiB 388GiB   544GiB 41.62 0.91 105
13   ssd 0.43700  1.00000 447GiB 322GiB   125GiB 72.12 1.58  80
14   ssd 0.43700  1.00000 447GiB 291GiB   156GiB 65.16 1.43  70
15   ssd 0.43700  1.00000 447GiB 350GiB  96.9GiB 78.33 1.72  78  <--
16   ssd 0.43700  1.00000 447GiB 268GiB   179GiB 60.05 1.31  71
23   hdd 0.90999  1.00000 932GiB 364GiB   567GiB 39.08 0.86  98
25   hdd 0.90999  1.00000 932GiB 391GiB   541GiB 41.92 0.92 106
27   hdd 0.90999  1.00000 932GiB 393GiB   538GiB 42.21 0.92 106
28   hdd 0.90970  1.00000 932GiB 467GiB   464GiB 50.14 1.10 126
19   ssd 0.43700  1.00000 447GiB 310GiB   137GiB 69.36 1.52  76
20   ssd 0.43700  1.00000 447GiB 316GiB   131GiB 70.66 1.55  76
21   ssd 0.43700  1.00000 447GiB 323GiB   125GiB 72.13 1.58  80
22   ssd 0.43700  1.00000 447GiB 283GiB   164GiB 63.39 1.39  69
38   ssd 0.43660  1.00000 447GiB 146GiB   302GiB 32.55 0.71  46
39   ssd 0.43660  1.00000 447GiB 142GiB   305GiB 31.84 0.70  43
40   ssd 0.87329  1.00000 894GiB 407GiB   487GiB 45.53 1.00  98
41   ssd 0.87329  1.00000 894GiB 353GiB   541GiB 39.51 0.87 102
                   TOTAL 29.9TiB 13.7TiB  16.3TiB 45.66
MIN/MAX VAR: 0.63/1.72 STDDEV: 13.59
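
In case it helps to narrow this down: the manual equivalent of "ceph balancer on" should be roughly the sequence below (the plan name "myplan" is arbitrary), which lets one look at the proposed plan before anything is executed and watch the mgr while each step runs:

    ceph balancer off                  # make sure automatic balancing is not running
    ceph balancer mode crush-compat    # or "upmap" if all clients are luminous+
    ceph balancer eval                 # score of the current distribution
    ceph balancer optimize myplan      # build a plan without starting it
    ceph balancer show myplan          # inspect the proposed changes
    ceph balancer eval myplan          # expected score after the plan
    ceph balancer execute myplan       # only this step actually moves data

As a stopgap for the nearly full osd.15, something like "ceph osd reweight osd.15 0.85" (the value is only a guess) would push a few PGs off it without involving the balancer at all.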

Kevin

On Sun, 6 Jan 2019 at 07:34, Konstantin Shalygin <k0ste@xxxxxxxx> wrote:
>
> On 1/5/19 4:17 PM, Kevin Olbrich wrote:
> > root@adminnode:~# ceph osd tree
> > ID  CLASS WEIGHT   TYPE NAME                      STATUS REWEIGHT PRI-AFF
> >  -1       30.82903 root default
> > -16       30.82903     datacenter dc01
> > -19       30.82903         pod dc01-agg01
> > -10       17.43365             rack dc01-rack02
> >  -4        7.20665                 host node1001
> >   0   hdd  0.90999                     osd.0          up  1.00000 1.00000
> >   1   hdd  0.90999                     osd.1          up  1.00000 1.00000
> >   5   hdd  0.90999                     osd.5          up  1.00000 1.00000
> >  29   hdd  0.90970                     osd.29         up  1.00000 1.00000
> >  32   hdd  0.90970                     osd.32       down        0 1.00000
> >  33   hdd  0.90970                     osd.33         up  1.00000 1.00000
> >   2   ssd  0.43700                     osd.2          up  1.00000 1.00000
> >   3   ssd  0.43700                     osd.3          up  1.00000 1.00000
> >   4   ssd  0.43700                     osd.4          up  1.00000 1.00000
> >  30   ssd  0.43660                     osd.30         up  1.00000 1.00000
> >  -7        6.29724                 host node1002
> >   9   hdd  0.90999                     osd.9          up  1.00000 1.00000
> >  10   hdd  0.90999                     osd.10         up  1.00000 1.00000
> >  11   hdd  0.90999                     osd.11         up  1.00000 1.00000
> >  12   hdd  0.90999                     osd.12         up  1.00000 1.00000
> >  35   hdd  0.90970                     osd.35         up  1.00000 1.00000
> >   6   ssd  0.43700                     osd.6          up  1.00000 1.00000
> >   7   ssd  0.43700                     osd.7          up  1.00000 1.00000
> >   8   ssd  0.43700                     osd.8          up  1.00000 1.00000
> >  31   ssd  0.43660                     osd.31         up  1.00000 1.00000
> > -28        2.18318                 host node1005
> >  34   ssd  0.43660                     osd.34         up  1.00000 1.00000
> >  36   ssd  0.87329                     osd.36         up  1.00000 1.00000
> >  37   ssd  0.87329                     osd.37         up  1.00000 1.00000
> > -29        1.74658                 host node1006
> >  42   ssd  0.87329                     osd.42         up  1.00000 1.00000
> >  43   ssd  0.87329                     osd.43         up  1.00000 1.00000
> > -11       13.39537             rack dc01-rack03
> > -22        5.38794                 host node1003
> >  17   hdd  0.90999                     osd.17         up  1.00000 1.00000
> >  18   hdd  0.90999                     osd.18         up  1.00000 1.00000
> >  24   hdd  0.90999                     osd.24         up  1.00000 1.00000
> >  26   hdd  0.90999                     osd.26         up  1.00000 1.00000
> >  13   ssd  0.43700                     osd.13         up  1.00000 1.00000
> >  14   ssd  0.43700                     osd.14         up  1.00000 1.00000
> >  15   ssd  0.43700                     osd.15         up  1.00000 1.00000
> >  16   ssd  0.43700                     osd.16         up  1.00000 1.00000
> > -25        5.38765                 host node1004
> >  23   hdd  0.90999                     osd.23         up  1.00000 1.00000
> >  25   hdd  0.90999                     osd.25         up  1.00000 1.00000
> >  27   hdd  0.90999                     osd.27         up  1.00000 1.00000
> >  28   hdd  0.90970                     osd.28         up  1.00000 1.00000
> >  19   ssd  0.43700                     osd.19         up  1.00000 1.00000
> >  20   ssd  0.43700                     osd.20         up  1.00000 1.00000
> >  21   ssd  0.43700                     osd.21         up  1.00000 1.00000
> >  22   ssd  0.43700                     osd.22         up  1.00000 1.00000
> > -30        2.61978                 host node1007
> >  38   ssd  0.43660                     osd.38         up  1.00000 1.00000
> >  39   ssd  0.43660                     osd.39         up  1.00000 1.00000
> >  40   ssd  0.87329                     osd.40         up  1.00000 1.00000
> >  41   ssd  0.87329                     osd.41         up  1.00000 1.00000
> >
> > ========================================================
> > root@adminnode:~# ceph osd df tree
> > ID  CLASS WEIGHT   REWEIGHT SIZE    USE     AVAIL   %USE  VAR  PGS TYPE NAME
> >  -1       30.82903        - 29.9TiB 14.0TiB 16.0TiB 46.65 1.00   - root default
> > -16       30.82903        - 29.9TiB 14.0TiB 16.0TiB 46.65 1.00   -     datacenter dc01
> > -19       30.82903        - 29.9TiB 14.0TiB 16.0TiB 46.65 1.00   -         pod dc01-agg01
> > -10       17.43365        - 16.5TiB 7.31TiB 9.21TiB 44.26 0.95   -             rack dc01-rack02
> >  -4        7.20665        - 6.29TiB 2.76TiB 3.54TiB 43.83 0.94   -                 host node1001
> >   0   hdd  0.90999  1.00000  932GiB  356GiB  575GiB 38.22 0.82  95                     osd.0
> >   1   hdd  0.90999  1.00000  932GiB  397GiB  534GiB 42.66 0.91 106                     osd.1
> >   5   hdd  0.90999  1.00000  932GiB  284GiB  647GiB 30.50 0.65  76                     osd.5
> >  29   hdd  0.90970  1.00000  932GiB  366GiB  566GiB 39.29 0.84  98                     osd.29
> >  32   hdd  0.90970        0      0B      0B      0B     0    0   0                     osd.32
> >  33   hdd  0.90970  1.00000  932GiB  369GiB  563GiB 39.57 0.85  99                     osd.33
> >   2   ssd  0.43700  1.00000  447GiB  271GiB  176GiB 60.67 1.30  50                     osd.2
> >   3   ssd  0.43700  1.00000  447GiB  249GiB  198GiB 55.62 1.19  58                     osd.3
> >   4   ssd  0.43700  1.00000  447GiB  297GiB  150GiB 66.39 1.42  56                     osd.4
> >  30   ssd  0.43660  1.00000  447GiB  236GiB  211GiB 52.85 1.13  48                     osd.30
> >  -7        6.29724        - 6.29TiB 2.74TiB 3.55TiB 43.53 0.93   -                 host node1002
> >   9   hdd  0.90999  1.00000  932GiB  354GiB  578GiB 37.96 0.81  95                     osd.9
> >  10   hdd  0.90999  1.00000  932GiB  357GiB  575GiB 38.28 0.82  96                     osd.10
> >  11   hdd  0.90999  1.00000  932GiB  318GiB  613GiB 34.18 0.73  86                     osd.11
> >  12   hdd  0.90999  1.00000  932GiB  373GiB  558GiB 40.09 0.86 100                     osd.12
> >  35   hdd  0.90970  1.00000  932GiB  343GiB  588GiB 36.83 0.79  92                     osd.35
> >   6   ssd  0.43700  1.00000  447GiB  269GiB  178GiB 60.20 1.29  60                     osd.6
> >   7   ssd  0.43700  1.00000  447GiB  249GiB  198GiB 55.69 1.19  56                     osd.7
> >   8   ssd  0.43700  1.00000  447GiB  286GiB  161GiB 63.95 1.37  56                     osd.8
> >  31   ssd  0.43660  1.00000  447GiB  257GiB  190GiB 57.47 1.23  55                     osd.31
> > -28        2.18318        - 2.18TiB  968GiB 1.24TiB 43.29 0.93   -                 host node1005
> >  34   ssd  0.43660  1.00000  447GiB  202GiB  245GiB 45.14 0.97  47                     osd.34
> >  36   ssd  0.87329  1.00000  894GiB  405GiB  489GiB 45.28 0.97  91                     osd.36
> >  37   ssd  0.87329  1.00000  894GiB  361GiB  533GiB 40.38 0.87  79                     osd.37
> > -29        1.74658        - 1.75TiB  888GiB  900GiB 49.65 1.06   -                 host node1006
> >  42   ssd  0.87329  1.00000  894GiB  417GiB  477GiB 46.68 1.00  92                     osd.42
> >  43   ssd  0.87329  1.00000  894GiB  471GiB  424GiB 52.63 1.13  90                     osd.43
> > -11       13.39537        - 13.4TiB 6.64TiB 6.75TiB 49.60 1.06   -             rack dc01-rack03
> > -22        5.38794        - 5.39TiB 2.70TiB 2.69TiB 50.14 1.07   -                 host node1003
> >  17   hdd  0.90999  1.00000  932GiB  371GiB  560GiB 39.83 0.85 100                     osd.17
> >  18   hdd  0.90999  1.00000  932GiB  390GiB  542GiB 41.82 0.90 105                     osd.18
> >  24   hdd  0.90999  1.00000  932GiB  352GiB  580GiB 37.77 0.81  94                     osd.24
> >  26   hdd  0.90999  1.00000  932GiB  387GiB  545GiB 41.54 0.89 104                     osd.26
> >  13   ssd  0.43700  1.00000  447GiB  319GiB  128GiB 71.32 1.53  77                     osd.13
> >  14   ssd  0.43700  1.00000  447GiB  303GiB  144GiB 67.76 1.45  70                     osd.14
> >  15   ssd  0.43700  1.00000  447GiB  361GiB 86.4GiB 80.67 1.73  77                     osd.15
> >  16   ssd  0.43700  1.00000  447GiB  283GiB  164GiB 63.29 1.36  71                     osd.16
> > -25        5.38765        - 5.39TiB 2.83TiB 2.56TiB 52.55 1.13   -                 host node1004
> >  23   hdd  0.90999  1.00000  932GiB  382GiB  549GiB 41.05 0.88 102                     osd.23
> >  25   hdd  0.90999  1.00000  932GiB  412GiB  520GiB 44.20 0.95 111                     osd.25
> >  27   hdd  0.90999  1.00000  932GiB  385GiB  546GiB 41.36 0.89 103                     osd.27
> >  28   hdd  0.90970  1.00000  932GiB  462GiB  469GiB 49.64 1.06 124                     osd.28
> >  19   ssd  0.43700  1.00000  447GiB  314GiB  133GiB 70.22 1.51  75                     osd.19
> >  20   ssd  0.43700  1.00000  447GiB  327GiB  120GiB 73.06 1.57  76                     osd.20
> >  21   ssd  0.43700  1.00000  447GiB  324GiB  123GiB 72.45 1.55  77                     osd.21
> >  22   ssd  0.43700  1.00000  447GiB  292GiB  156GiB 65.21 1.40  68                     osd.22
> > -30        2.61978        - 2.62TiB 1.11TiB 1.51TiB 42.43 0.91   -                 host node1007
> >  38   ssd  0.43660  1.00000  447GiB  165GiB  283GiB 36.82 0.79  46                     osd.38
> >  39   ssd  0.43660  1.00000  447GiB  156GiB  292GiB 34.79 0.75  42                     osd.39
> >  40   ssd  0.87329  1.00000  894GiB  429GiB  466GiB 47.94 1.03  98                     osd.40
> >  41   ssd  0.87329  1.00000  894GiB  389GiB  505GiB 43.55 0.93 103                     osd.41
> >                       TOTAL 29.9TiB 14.0TiB 16.0TiB 46.65
> > MIN/MAX VAR: 0.65/1.73 STDDEV: 13.30
> >
> > =============================================================
> > root@adminnode:~# ceph df && ceph -v
> > GLOBAL:
> >     SIZE    AVAIL   RAW USED %RAW USED
> >     29.9TiB 16.0TiB 14.0TiB  46.65
> > POOLS:
> >     NAME              ID USED    %USED MAX AVAIL OBJECTS
> >     rbd_vms_ssd       2  986GiB  49.83 993GiB    262606
> >     rbd_vms_hdd       3  3.76TiB 48.94 3.92TiB   992255
> >     rbd_vms_ssd_01    4  372KiB  0     662GiB       148
> >     rbd_vms_ssd_01_ec 6  2.85TiB 68.81 1.29TiB   770506
> >
> > ceph version 12.2.8 (ae699615bac534ea496ee965ac6192cb7e0e07c0) luminous (stable)
>
> Looks good. You should always delete your down OSDs from the crush map
> before replacing them. After deleting the OSD, try the balancer again.
>
>
>
> k
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
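
For reference, "delete the down OSD from the crush map before replacing it" usually means a sequence like the following (shown here for osd.32, the dead OSD in the quoted tree; only run it once the disk is definitely not coming back):

    ceph osd out osd.32            # no-op if the OSD is already out
    ceph osd crush remove osd.32   # remove it from the crush map
    ceph auth del osd.32           # remove its cephx key
    ceph osd rm osd.32             # remove it from the osd map

After that, "ceph osd tree" should no longer list osd.32 at all.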