On 11/07/12 13:32, Mark Kirkwood wrote:
> I have attached the dump of stuck stale pgs, and the crushmap in use.
...of course I left off the crushmap, so here it is, plus my ceph.conf for good measure.
Mark
# begin crush map

# devices
device 0 osd0
device 1 osd1
device 2 osd2
device 3 osd3

# types
type 0 device
type 1 host
type 2 datacenter
type 3 root

# buckets
host ved1 {
    id -1        # do not change unnecessarily
    # weight 1.000
    alg straw
    hash 0       # rjenkins1
    item osd0 weight 1.000
}
host ved2 {
    id -2        # do not change unnecessarily
    # weight 1.000
    alg straw
    hash 0       # rjenkins1
    item osd1 weight 1.000
}
host ved3 {
    id -3        # do not change unnecessarily
    # weight 1.000
    alg straw
    hash 0       # rjenkins1
    item osd2 weight 1.000
}
host ved4 {
    id -4        # do not change unnecessarily
    # weight 1.000
    alg straw
    hash 0       # rjenkins1
    item osd3 weight 1.000
}
datacenter datacenter0 {
    id -5        # do not change unnecessarily
    # weight 2.000
    alg straw
    hash 0       # rjenkins1
    item ved1 weight 1.000
    item ved2 weight 1.000
}
datacenter datacenter1 {
    id -6        # do not change unnecessarily
    # weight 2.000
    alg straw
    hash 0       # rjenkins1
    item ved3 weight 1.000
    item ved4 weight 1.000
}
root root {
    id -7        # do not change unnecessarily
    # weight 4.000
    alg straw
    hash 0       # rjenkins1
    item datacenter0 weight 2.000
    item datacenter1 weight 2.000
}

# rules
rule data {
    ruleset 1
    type replicated
    min_size 2
    max_size 10
    step take datacenter0
    step chooseleaf firstn -2 type host
    step emit
    step take datacenter1
    step chooseleaf firstn 2 type host
    step emit
}
rule rbd {
    ruleset 2
    type replicated
    min_size 2
    max_size 10
    step take datacenter0
    step chooseleaf firstn -2 type host
    step emit
    step take datacenter1
    step chooseleaf firstn 2 type host
    step emit
}

# end crush map
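For what it's worth, this is roughly how the map was extracted, edited and pushed back, and how the rules can be sanity-checked with crushtool before injecting them (filenames are just placeholders, and the exact --test flags may differ between crushtool versions):

    # grab and decompile the map currently in use
    ceph osd getcrushmap -o crushmap.bin
    crushtool -d crushmap.bin -o crushmap.txt

    # edit crushmap.txt, then recompile and inject it
    crushtool -c crushmap.txt -o crushmap.new
    ceph osd setcrushmap -i crushmap.new

    # dry-run the 'data' rule (ruleset 1) to see which OSDs it would pick
    crushtool --test -i crushmap.new --rule 1 --num-rep 4 --show-mappings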
;
; Sample ceph ceph.conf file.
;
; This file defines cluster membership, the various locations
; that Ceph stores data, and any other runtime options.
;
; Experimental setup for 4 osd, 3 mon and 0 mds.
; Will experiment with crush rules later...
;
; If a 'host' is defined for a daemon, the start/stop script will
; verify that it matches the hostname (or else ignore it).  If it is
; not defined, it is assumed that the daemon is intended to start on
; the current host (e.g., in a setup with a startup.conf on each
; node).

; The variables $type, $id and $name are available to use in paths
; $type = The type of daemon, possible values: mon, mds and osd
; $id = The ID of the daemon, for mon.alpha, $id will be alpha
; $name = $type.$id
; For example:
;  osd.0
;   $type = osd
;   $id = 0
;   $name = osd.0
;  mon.beta
;   $type = mon
;   $id = beta
;   $name = mon.beta

; global
[global]
    ; enable secure authentication
    ;auth supported = cephx
    ;keyring = /etc/ceph/ceph.keyring

    ; allow ourselves to open a lot of files
    max open files = 131072

    ; set log file
    log file = /var/log/ceph/$name.log
    ; log_to_syslog = true        ; uncomment this line to log to syslog

    ; set up pid files
    pid file = /var/run/ceph/$name.pid

    ; If you want to run an IPv6 cluster, set this to true. Dual-stack isn't possible
    ;ms bind ipv6 = true

; monitors
;  You need at least one.  You need at least three if you want to
;  tolerate any node failures.  Always create an odd number.
[mon]
    mon data = /var/data/$name
    ;keyring = /var/data/keyring.$name

    ; If you are using for example the RADOS Gateway and want to have your newly created
    ; pools a higher replication level, you can set a default
    ;osd pool default size = 3

    ; You can also specify a CRUSH rule for new pools
    ; Wiki: http://ceph.newdream.net/wiki/Custom_data_placement_with_CRUSH
    ;osd pool default crush rule = 0

    ; Timing is critical for monitors, but if you want to allow the clocks to drift a
    ; bit more, you can specify the max drift.
    ;mon clock drift allowed = 1

    ; Tell the monitor to backoff from this warning for 30 seconds
    ;mon clock drift warn backoff = 30

    ; logging, for debugging monitor crashes, in order of
    ; their likelihood of being helpful :)
    ;debug ms = 1
    ;debug mon = 20
    ;debug paxos = 20
    ;debug auth = 20

[mon.ved1]
    host = ved1
    mon addr = 192.168.122.11:6789

[mon.ved2]
    host = ved2
    mon addr = 192.168.122.12:6789

[mon.ved3]
    host = ved3
    mon addr = 192.168.122.13:6789

; osd
;  You need at least one.  Two if you want data to be replicated.
;  Define as many as you like.
[osd]
    ; This is where the btrfs volume will be mounted.
    osd data = /var/data/$name
    ;keyring = /var/data/keyring.$name

    ; Ideally, make this a separate disk or partition.  A few
    ; hundred MB should be enough; more if you have fast or many
    ; disks.  You can use a file under the osd data dir if need be
    ; (e.g. /data/$name/journal), but it will be slower than a
    ; separate disk or partition.
    ; This is an example of a file-based journal.
    osd journal = /var/data/$name/journal
    osd journal size = 1000 ; journal size, in megabytes

    ; If you want to run the journal on a tmpfs, disable DirectIO
    ;journal dio = false

    ; You can change the number of recovery operations to speed up recovery
    ; or slow it down if your machines can't handle it
    ; osd recovery max active = 3

    ; osd logging to debug osd issues, in order of likelihood of being
    ; helpful
    ;debug ms = 1
    ;debug osd = 20
    ;debug filestore = 20
    ;debug journal = 20

[osd.0]
    host = ved1

    ; if 'btrfs devs' is not specified, you're responsible for
    ; setting up the 'osd data' dir.  If it is not btrfs, things
    ; will behave up until you try to recover from a crash (which
    ; is usually fine for basic testing).
    ;btrfs devs = /dev/sdx

    ; If you want to specify some other mount options, you can do so.
    ; The default values are rw,noatime
    ;btrfs options = rw,noatime

[osd.1]
    host = ved2

[osd.2]
    host = ved3

[osd.3]
    host = ved4
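And for completeness, the stuck pg dump mentioned above was generated along these lines (command spellings may vary slightly with the ceph version, and the pg id shown is just a placeholder):

    # overall cluster state
    ceph -s
    ceph health detail

    # dump of pgs stuck in the stale state
    ceph pg dump_stuck stale

    # drill into a single pg, e.g. 2.30 (placeholder id)
    ceph pg 2.30 query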