On 11/07/12 13:32, Mark Kirkwood wrote:
> I have attached the dump of stuck stale pgs, and the crushmap in use.
...of course I left off the crushmap, so here it is, plus my ceph.conf for good measure.
Mark
# begin crush map

# devices
device 0 osd0
device 1 osd1
device 2 osd2
device 3 osd3

# types
type 0 device
type 1 host
type 2 datacenter
type 3 root

# buckets
host ved1 {
    id -1        # do not change unnecessarily
    # weight 1.000
    alg straw
    hash 0       # rjenkins1
    item osd0 weight 1.000
}
host ved2 {
    id -2        # do not change unnecessarily
    # weight 1.000
    alg straw
    hash 0       # rjenkins1
    item osd1 weight 1.000
}
host ved3 {
    id -3        # do not change unnecessarily
    # weight 1.000
    alg straw
    hash 0       # rjenkins1
    item osd2 weight 1.000
}
host ved4 {
    id -4        # do not change unnecessarily
    # weight 1.000
    alg straw
    hash 0       # rjenkins1
    item osd3 weight 1.000
}
datacenter datacenter0 {
    id -5        # do not change unnecessarily
    # weight 2.000
    alg straw
    hash 0       # rjenkins1
    item ved1 weight 1.000
    item ved2 weight 1.000
}
datacenter datacenter1 {
    id -6        # do not change unnecessarily
    # weight 2.000
    alg straw
    hash 0       # rjenkins1
    item ved3 weight 1.000
    item ved4 weight 1.000
}
root root {
    id -7        # do not change unnecessarily
    # weight 4.000
    alg straw
    hash 0       # rjenkins1
    item datacenter0 weight 2.000
    item datacenter1 weight 2.000
}

# rules
rule data {
    ruleset 1
    type replicated
    min_size 2
    max_size 10
    step take datacenter0
    step chooseleaf firstn -2 type host
    step emit
    step take datacenter1
    step chooseleaf firstn 2 type host
    step emit
}
rule rbd {
    ruleset 2
    type replicated
    min_size 2
    max_size 10
    step take datacenter0
    step chooseleaf firstn -2 type host
    step emit
    step take datacenter1
    step chooseleaf firstn 2 type host
    step emit
}

# end crush map
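For what it's worth, this is roughly how the map was extracted, edited and pushed back, and how the rules can be sanity-checked with crushtool before injecting them (filenames are just placeholders, and the exact --test flags may differ between crushtool versions):

    # grab and decompile the map currently in use
    ceph osd getcrushmap -o crushmap.bin
    crushtool -d crushmap.bin -o crushmap.txt

    # edit crushmap.txt, then recompile and inject it
    crushtool -c crushmap.txt -o crushmap.new
    ceph osd setcrushmap -i crushmap.new

    # dry-run the 'data' rule (ruleset 1) to see which OSDs it would pick
    crushtool --test -i crushmap.new --rule 1 --num-rep 4 --show-mappings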
;
; Sample ceph ceph.conf file.
;
; This file defines cluster membership, the various locations
; that Ceph stores data, and any other runtime options.
;
; Experimental setup for 4 osd, 3 mon and 0 mds.
; Will experiment with crush rules later...
;
; If a 'host' is defined for a daemon, the start/stop script will
; verify that it matches the hostname (or else ignore it).  If it is
; not defined, it is assumed that the daemon is intended to start on
; the current host (e.g., in a setup with a startup.conf on each
; node).

; The variables $type, $id and $name are available to use in paths
; $type = The type of daemon, possible values: mon, mds and osd
; $id = The ID of the daemon, for mon.alpha, $id will be alpha
; $name = $type.$id
; For example:
;  osd.0
;   $type = osd
;   $id = 0
;   $name = osd.0
;  mon.beta
;   $type = mon
;   $id = beta
;   $name = mon.beta

; global
[global]
    ; enable secure authentication
    ;auth supported = cephx
    ;keyring = /etc/ceph/ceph.keyring

    ; allow ourselves to open a lot of files
    max open files = 131072

    ; set log file
    log file = /var/log/ceph/$name.log
    ; log_to_syslog = true        ; uncomment this line to log to syslog

    ; set up pid files
    pid file = /var/run/ceph/$name.pid

    ; If you want to run an IPv6 cluster, set this to true. Dual-stack isn't possible
    ;ms bind ipv6 = true

; monitors
;  You need at least one.  You need at least three if you want to
;  tolerate any node failures.  Always create an odd number.
[mon]
    mon data = /var/data/$name
    ;keyring = /var/data/keyring.$name

    ; If you are using for example the RADOS Gateway and want to have your newly created
    ; pools a higher replication level, you can set a default
    ;osd pool default size = 3

    ; You can also specify a CRUSH rule for new pools
    ; Wiki: http://ceph.newdream.net/wiki/Custom_data_placement_with_CRUSH
    ;osd pool default crush rule = 0

    ; Timing is critical for monitors, but if you want to allow the clocks to drift a
    ; bit more, you can specify the max drift.
    ;mon clock drift allowed = 1

    ; Tell the monitor to backoff from this warning for 30 seconds
    ;mon clock drift warn backoff = 30

    ; logging, for debugging monitor crashes, in order of
    ; their likelihood of being helpful :)
    ;debug ms = 1
    ;debug mon = 20
    ;debug paxos = 20
    ;debug auth = 20

[mon.ved1]
    host = ved1
    mon addr = 192.168.122.11:6789

[mon.ved2]
    host = ved2
    mon addr = 192.168.122.12:6789

[mon.ved3]
    host = ved3
    mon addr = 192.168.122.13:6789

; osd
;  You need at least one.  Two if you want data to be replicated.
;  Define as many as you like.
[osd]
    ; This is where the btrfs volume will be mounted.
    osd data = /var/data/$name
    ;keyring = /var/data/keyring.$name

    ; Ideally, make this a separate disk or partition.  A few
    ; hundred MB should be enough; more if you have fast or many
    ; disks.  You can use a file under the osd data dir if need be
    ; (e.g. /data/$name/journal), but it will be slower than a
    ; separate disk or partition.
    ; This is an example of a file-based journal.
    osd journal = /var/data/$name/journal
    osd journal size = 1000 ; journal size, in megabytes

    ; If you want to run the journal on a tmpfs, disable DirectIO
    ;journal dio = false

    ; You can change the number of recovery operations to speed up recovery
    ; or slow it down if your machines can't handle it
    ; osd recovery max active = 3

    ; osd logging to debug osd issues, in order of likelihood of being
    ; helpful
    ;debug ms = 1
    ;debug osd = 20
    ;debug filestore = 20
    ;debug journal = 20

[osd.0]
    host = ved1

    ; if 'btrfs devs' is not specified, you're responsible for
    ; setting up the 'osd data' dir.  If it is not btrfs, things
    ; will behave up until you try to recover from a crash (which
    ; is usually fine for basic testing).
    ;btrfs devs = /dev/sdx

    ; If you want to specify some other mount options, you can do so.
    ; The default values are rw,noatime
    ;btrfs options = rw,noatime

[osd.1]
    host = ved2

[osd.2]
    host = ved3

[osd.3]
    host = ved4
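And for completeness, the stuck pg dump mentioned above was generated along these lines (command spellings may vary slightly with the ceph version, and the pg id shown is just a placeholder):

    # overall cluster state
    ceph -s
    ceph health detail

    # dump of pgs stuck in the stale state
    ceph pg dump_stuck stale

    # drill into a single pg, e.g. 2.30 (placeholder id)
    ceph pg 2.30 query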