I'm sorry, that was not the right map. This is the correct one:

# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable straw_calc_version 1

# devices
device 0 osd.0
device 1 osd.1
device 2 osd.2
device 3 osd.3
device 4 osd.4
device 5 osd.5

# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root

# buckets
host ceph1-node {
        id -2           # do not change unnecessarily
        # weight 0.030
        alg straw
        hash 0  # rjenkins1
        item osd.0 weight 0.010
        item osd.1 weight 0.010
        item osd.2 weight 0.010
}
host ceph2-node {
        id -3           # do not change unnecessarily
        # weight 0.030
        alg straw
        hash 0  # rjenkins1
        item osd.3 weight 0.010
        item osd.4 weight 0.010
        item osd.5 weight 0.010
}
root default {
        id -1           # do not change unnecessarily
        # weight 0.060
        alg straw
        hash 0  # rjenkins1
        item ceph1-node weight 0.030
        item ceph2-node weight 0.030
}

# rules
rule replicated_ruleset {
        ruleset 0
        type replicated
        min_size 1
        max_size 10
        step take default
        step chooseleaf firstn 0 type host
        step emit
}
# end crush map
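For reference, the map above is the decompiled text form; the usual round trip for editing and re-injecting such a map looks roughly like this (the file names are only examples):

ceph osd getcrushmap -o crushmap.bin         # dump the compiled map from the cluster
crushtool -d crushmap.bin -o crush-map.txt   # decompile it into the text form shown above
# ... edit crush-map.txt ...
crushtool -c crush-map.txt -o crushmap.new   # recompile the edited text map
ceph osd setcrushmap -i crushmap.new         # inject the new map into the cluster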
> Firefly, but be aware that this version is EoL and no longer receiving
> updates.

I have installed a newer version:

root@ceph1-node:~# ceph --version
ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)

I created a 3 GB file on the pool and then turned off one host. After 10 minutes there are messages like these:

2016-05-24 17:18:23.804162 mon.0 [INF] pgmap v172: 640 pgs: 10 active+recovering+degraded+remapped, 408 active+undersized+degraded, 117 active+remapped, 105 active+degraded+remapped; 2118 MB data, 2315 MB used, 28371 MB / 30686 MB avail; 517/1098 objects degraded (47.086%); 234/1098 objects misplaced (21.311%); 5746 kB/s, 1 objects/s recovering
2016-05-24 17:18:28.268437 mon.0 [INF] pgmap v173: 640 pgs: 11 active+recovering+degraded+remapped, 408 active+undersized+degraded, 120 active+remapped, 101 active+degraded+remapped; 2118 MB data, 2331 MB used, 28355 MB / 30686 MB avail; 513/1098 objects degraded (46.721%); 234/1098 objects misplaced (21.311%); 4507 kB/s, 1 objects/s recovering
2016-05-24 17:18:32.759455 mon.0 [INF] pgmap v174: 640 pgs: 13 active+recovering+degraded+remapped, 408 active+undersized+degraded, 125 active+remapped, 94 active+degraded+remapped; 2118 MB data, 2375 MB used, 28311 MB / 30686 MB avail; 499/1098 objects degraded (45.446%); 234/1098 objects misplaced (21.311%); 9729 kB/s, 2 objects/s recovering
2016-05-24 17:18:35.314436 mon.0 [INF] pgmap v175: 640 pgs: 11 active+recovering+degraded+remapped, 408 active+undersized+degraded, 130 active+remapped, 91 active+degraded+remapped; 2118 MB data, 2395 MB used, 28291 MB / 30686 MB avail; 491/1098 objects degraded (44.718%); 234/1098 objects misplaced (21.311%); 11285 kB/s, 2 objects/s recovering
2016-05-24 17:18:36.634583 mon.0 [INF] pgmap v176: 640 pgs: 12 active+recovering+degraded+remapped, 408 active+undersized+degraded, 130 active+remapped, 90 active+degraded+remapped; 2118 MB data, 2403 MB used, 28283 MB / 30686 MB avail; 489/1098 objects degraded (44.536%); 234/1098 objects misplaced (21.311%); 6608 kB/s, 1 objects/s recovering
2016-05-24 17:18:39.724440 mon.0 [INF] pgmap v177: 640 pgs: 15 active+recovering+degraded+remapped, 408 active+undersized+degraded, 133 active+remapped, 84 active+degraded+remapped; 2118 MB data, 2428 MB used, 28258 MB / 30686 MB avail; 477/1098 objects degraded (43.443%); 234/1098 objects misplaced (21.311%); 13084 kB/s, 3 objects/s recovering
2016-05-24 17:18:44.009854 mon.0 [INF] pgmap v178: 640 pgs: 12 active+recovering+degraded+remapped, 408 active+undersized+degraded, 137 active+remapped, 83 active+degraded+remapped; 2118 MB data, 2447 MB used, 28239 MB / 30686 MB avail; 474/1098 objects degraded (43.169%); 234/1098 objects misplaced (21.311%); 9650 kB/s, 2 objects/s recovering
2016-05-24 17:18:48.822643 mon.0 [INF] pgmap v179: 640 pgs: 10 active+recovering+degraded+remapped, 408 active+undersized+degraded, 142 active+remapped, 80 active+degraded+remapped; 2118 MB data, 2493 MB used, 28193 MB / 30686 MB avail; 469/1098 objects degraded (42.714%); 234/1098 objects misplaced (21.311%); 3857 kB/s, 0 objects/s recovering

root@ceph1-node:~# ceph -s
    cluster 808ee682-c121-4867-9fe4-a347d95bf3f0
     health HEALTH_WARN
            503 pgs degraded
            12 pgs recovering
            408 pgs stuck degraded
            640 pgs stuck unclean
            408 pgs stuck undersized
            408 pgs undersized
            recovery 474/1098 objects degraded (43.169%)
            recovery 234/1098 objects misplaced (21.311%)
            1 mons down, quorum 0,2 ceph1-node,ceph-mon2
     monmap e1: 3 mons at {ceph-mon2=192.168.241.20:6789/0,ceph1-node=192.168.241.2:6789/0,ceph2-node=192.168.241.12:6789/0}
            election epoch 18, quorum 0,2 ceph1-node,ceph-mon2
     osdmap e58: 6 osds: 3 up, 3 in; 232 remapped pgs
      pgmap v178: 640 pgs, 2 pools, 2118 MB data, 549 objects
            2447 MB used, 28239 MB / 30686 MB avail
            474/1098 objects degraded (43.169%)
            234/1098 objects misplaced (21.311%)
                 408 active+undersized+degraded
                 137 active+remapped
                  83 active+degraded+remapped
                  12 active+recovering+degraded+remapped
recovery io 9650 kB/s, 2 objects/s

iostat -x 1

Device:  rrqm/s  wrqm/s    r/s    w/s    rkB/s     wkB/s avgrq-sz avgqu-sz   await r_await w_await  svctm  %util
sda        0.00    7.58   0.00   3.79     0.00     63.64    33.60     0.71  187.20    0.00  187.20 187.20  70.91
sdb        0.00    0.00  15.15   2.27  1745.45      7.20   201.17     3.52  202.26  202.80  198.67  40.17  70.00
sdc        0.00    0.00  28.79   9.85  3781.82   3119.70   357.25     6.26  161.96  125.26  269.23  24.24  93.64
sdd        0.00    2.27  15.91  26.52  1842.42  13575.76   726.86    11.55  287.14  139.43  375.77  22.86  96.97
rbd0       0.00    0.00   0.00   0.00     0.00      0.00     0.00     0.00    0.00    0.00    0.00   0.00   0.00

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           4.17    0.00   95.83    0.00    0.00    0.00

Device:  rrqm/s  wrqm/s    r/s    w/s    rkB/s     wkB/s avgrq-sz avgqu-sz   await r_await w_await  svctm  %util
sda        0.00    6.92   0.00   3.85     0.00     43.08    22.40     0.65  168.00    0.00  168.00 168.00  64.62
sdb        0.00    0.00   1.54   0.77   196.92      3.08   173.33     0.29  122.67  172.00   24.00  66.67  15.38
sdc        0.00    0.77   5.38   6.92   787.69   3156.92   641.00     3.32  263.25  198.29  313.78  47.75  58.77
sdd        0.00   23.85   9.23  51.54  1083.08  16794.62   588.38    15.09  264.71  177.67  280.30  16.56 100.62
rbd0       0.00    0.00   0.00   0.00     0.00      0.00     0.00     0.00    0.00    0.00    0.00   0.00   0.00

> "ceph osd tree" output may help, as well as removing ceph1-node2 from the
> picture.

root@ceph1-node:~# ceph osd tree
ID WEIGHT  TYPE NAME           UP/DOWN REWEIGHT PRIMARY-AFFINITY
-1 0.05997 root default
-2 0.02998     host ceph1-node
 0 0.00999         osd.0            up  1.00000          1.00000
 1 0.00999         osd.1            up  1.00000          1.00000
 2 0.00999         osd.2            up  1.00000          1.00000
-3 0.02998     host ceph2-node
 3 0.00999         osd.3          down        0          1.00000
 4 0.00999         osd.4          down        0          1.00000
 5 0.00999         osd.5          down        0          1.00000
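If I understand it correctly, the remapping only starts once the down OSDs are also marked out (their REWEIGHT is 0 in the tree above), which happens after mon_osd_down_out_interval expires -- I assume that is where the ~10 minute delay comes from. For a planned shutdown of the second node, something like this should keep the PGs merely active+degraded instead of remapped (not yet tested on this cluster):

ceph osd set noout      # do not mark down OSDs out, so no remapping/backfill starts
# ... power off / reboot ceph2-node ...
ceph osd unset noout    # restore normal behaviour once the node is back
ceph daemon mon.ceph1-node config show | grep mon_osd_down_out_interval   # run on the mon host to check the out delay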
> Have you verified (ceph osd pool get <poolname> size / min_size) that all your
> pools are actually set like this?

root@ceph1-node:~# ceph osd pool get hdd size
size: 2
root@ceph1-node:~# ceph osd pool get hdd min_size
2016-05-24 17:22:52.171706 7fe7b787d700  0 -- :/135882111 >> 192.168.241.12:6789/0 pipe(0x7fe7bc059cf0 sd=3 :0 s=1 pgs=0 cs=0 l=1 c=0x7fe7bc05dfe0).fault
min_size: 1
root@ceph1-node:~# ceph osd dump
pool 1 'hdd' replicated size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 512 pgp_num 512 last_change 53 flags hashpspool stripe_width 0

After the remapping has finished:

root@ceph1-node:~# ceph -s
2016-05-24 17:23:10.123542 7f2c001cf700  0 -- :/623268863 >> 192.168.241.12:6789/0 pipe(0x7f2bfc059cd0 sd=3 :0 s=1 pgs=0 cs=0 l=1 c=0x7f2bfc05dfc0).fault
    cluster 808ee682-c121-4867-9fe4-a347d95bf3f0
     health HEALTH_WARN
            408 pgs degraded
            262 pgs stuck degraded
            640 pgs stuck unclean
            262 pgs stuck undersized
            408 pgs undersized
            recovery 315/1098 objects degraded (28.689%)
            recovery 234/1098 objects misplaced (21.311%)
            1 mons down, quorum 0,2 ceph1-node,ceph-mon2
     monmap e1: 3 mons at {ceph-mon2=192.168.241.20:6789/0,ceph1-node=192.168.241.2:6789/0,ceph2-node=192.168.241.12:6789/0}
            election epoch 18, quorum 0,2 ceph1-node,ceph-mon2
     osdmap e63: 6 osds: 3 up, 3 in; 232 remapped pgs
      pgmap v209: 640 pgs, 2 pools, 2118 MB data, 549 objects
            3149 MB used, 27537 MB / 30686 MB avail
            315/1098 objects degraded (28.689%)
            234/1098 objects misplaced (21.311%)
                 408 active+undersized+degraded
                 232 active+remapped

Any idea why Ceph is making redundant copies on the local disks of the one remaining host?
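To see whether both replicas of a PG really end up on OSDs of the same host, I am checking the acting sets along these lines (the pgid is only an example from pool 1):

ceph pg dump_stuck unclean   # list the stuck/remapped PGs
ceph pg map 1.0              # print the up and acting OSD sets for one PG of pool 'hdd'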
24.05.2016, 12:53, "Christian Balzer" <chibi@xxxxxxx>:
> Hello,
>
> On Tue, 24 May 2016 10:28:02 +0700 Никитенко Виталий wrote:
>
>> Hello!
>> I have a cluster of 2 nodes with 3 OSDs each. The cluster is about 80% full.
>
> According to your CRUSH map that's not quite true, namely the ceph1-node2
> entry.
>
> And while that, again according to your CRUSH map, isn't in the default root,
> I wonder WHERE it is and if it confuses Ceph into believing that there is
> actually a third node?
>
> "ceph osd tree" output may help, as well as removing ceph1-node2 from the
> picture.
>
>> df -H
>> /dev/sdc1 27G 24G 3.9G 86% /var/lib/ceph/osd/ceph-1
>> /dev/sdd1 27G 20G 6.9G 75% /var/lib/ceph/osd/ceph-2
>> /dev/sdb1 27G 24G 3.5G 88% /var/lib/ceph/osd/ceph-0
>>
>> When I switch off one server, then after 10 minutes it begins remapping pgs
>
> [snip]
>> As a result, one disk overflows and the cluster falls over. Why does ceph remap
>> pgs? It was supposed to simply mark all pgs as active+degraded while the
>> second node is down.
>
> Yes, I agree, that shouldn't happen with a properly configured 2 node
> cluster.
>
>> ceph version 0.80.11
>
> Not aware of any bugs in there and in fact I did test a 2 node cluster
> with Firefly, but be aware that this version is EoL and no longer receiving
> updates.
>
>> root@ceph1-node:~# cat /etc/ceph/ceph.conf
>> [global]
>> fsid = b66c7daa-d6d8-46c7-9e61-15adbb749ed7
>> mon_initial_members = ceph1-node, ceph2-node, ceph-mon2
>> mon_host = 192.168.241.97,192.168.241.110,192.168.241.123
>> auth_cluster_required = cephx
>> auth_service_required = cephx
>> auth_client_required = cephx
>> filestore_xattr_use_omap = true
>> osd_pool_default_size = 2
>> osd_pool_default_min_size = 1
>
> Have you verified (ceph osd pool get <poolname> size / min_size) that all your
> pools are actually set like this?
>
> Regards,
>
> Christian
>
>> mon_clock_drift_allowed = 2
>>
>> root@ceph1-node:~# cat crush-map.txt
>> # begin crush map
>> tunable choose_local_tries 0
>> tunable choose_local_fallback_tries 0
>> tunable choose_total_tries 50
>> tunable chooseleaf_descend_once 1
>> tunable straw_calc_version 1
>>
>> # devices
>> device 0 osd.0
>> device 1 osd.1
>> device 2 osd.2
>> device 3 osd.3
>> device 4 osd.4
>> device 5 osd.5
>>
>> # types
>> type 0 osd
>> type 1 host
>> type 2 chassis
>> type 3 rack
>> type 4 row
>> type 5 pdu
>> type 6 pod
>> type 7 room
>> type 8 datacenter
>> type 9 region
>> type 10 root
>>
>> # buckets
>> host ceph1-node {
>>         id -2           # do not change unnecessarily
>>         # weight 0.060
>>         alg straw
>>         hash 0  # rjenkins1
>>         item osd.0 weight 0.020
>>         item osd.1 weight 0.020
>>         item osd.2 weight 0.020
>> }
>> host ceph2-node {
>>         id -3           # do not change unnecessarily
>>         # weight 0.060
>>         alg straw
>>         hash 0  # rjenkins1
>>         item osd.3 weight 0.020
>>         item osd.4 weight 0.020
>>         item osd.5 weight 0.020
>> }
>> root default {
>>         id -1           # do not change unnecessarily
>>         # weight 0.120
>>         alg straw
>>         hash 0  # rjenkins1
>>         item ceph1-node weight 0.060
>>         item ceph2-node weight 0.060
>> }
>> host ceph1-node2 {
>>         id -4           # do not change unnecessarily
>>         # weight 3.000
>>         alg straw
>>         hash 0  # rjenkins1
>>         item osd.0 weight 1.000
>>         item osd.1 weight 1.000
>>         item osd.2 weight 1.000
>> }
>>
>> # rules
>> rule replicated_ruleset {
>>         ruleset 0
>>         type replicated
>>         min_size 1
>>         max_size 10
>>         step take default
>>         step chooseleaf firstn 0 type host
>>         step emit
>> }
>> # end crush map
>>
>> _______________________________________________
>> ceph-users mailing list
>> ceph-users@xxxxxxxxxxxxxx
>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>
> --
> Christian Balzer        Network/Systems Engineer
> chibi@xxxxxxx           Global OnLine Japan/Rakuten Communications
> http://www.gol.com/

_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com