# ceph -v
ceph version 0.61.2 (fea782543a844bb277ae94d3391788b76c5bee60)

# rpm -qa | grep ceph
ceph-0.61.2-0.el6.x86_64
ceph-radosgw-0.61.2-0.el6.x86_64
ceph-deploy-0.1-31.g7c5f29c.noarch
ceph-release-1-0.el6.noarch
libcephfs1-0.61.2-0.el6.x86_64

thanks
Chris

-----Original Message-----
From: Samuel Just [mailto:sam.just@xxxxxxxxxxx]
Sent: 13 August 2013 20:11
To: Howarth, Chris [CCC-OT_IT]
Cc: ceph-users@xxxxxxxx
Subject: Re: Ceph pgs stuck unclean

Version?
-Sam

On Tue, Aug 13, 2013 at 7:52 AM, Howarth, Chris <chris.howarth@xxxxxxxx> wrote:
> Hi Sam,
> Thanks for your reply here. Unfortunately I didn't capture all this data at the time of the issue. What I do have I've pasted below. FYI the only way I found to fix this issue was to temporarily reduce the number of replicas in the pool to 1. The stuck pgs then disappeared and so I then increased the replicas back to 2 at this point. Obviously this is not a great workaround so I am keen to get to the bottom of the problem here.
>
> Thanks again for your help.
>
> Chris
>
> # ceph health detail
> HEALTH_WARN 7 pgs stuck unclean
> pg 3.5a is stuck unclean for 335339.172516, current state active, last acting [5,4]
> pg 3.54 is stuck unclean for 335339.157608, current state active, last acting [15,7]
> pg 3.55 is stuck unclean for 335339.167154, current state active, last acting [16,9]
> pg 3.1c is stuck unclean for 335339.174150, current state active, last acting [8,16]
> pg 3.a is stuck unclean for 335339.177001, current state active, last acting [0,8]
> pg 3.4 is stuck unclean for 335339.165377, current state active, last acting [17,4]
> pg 3.5 is stuck unclean for 335339.149507, current state active, last acting [2,6]
>
> # ceph pg 3.5a query
> { "state": "active",
>   "epoch": 699,
>   "up": [
>         5,
>         4],
>   "acting": [
>         5,
>         4],
>   "info": { "pgid": "3.5a",
>       "last_update": "413'688",
>       "last_complete": "413'688",
>       "log_tail": "0'0",
>       "last_backfill": "MAX",
>       "purged_snaps": "[]",
>       "history": { "epoch_created": 67,
>           "last_epoch_started": 644,
>           "last_epoch_clean": 644,
>           "last_epoch_split": 0,
>           "same_up_since": 643,
>           "same_interval_since": 643,
>           "same_primary_since": 561,
>           "last_scrub": "0'0",
>           "last_scrub_stamp": "2013-08-01 15:23:29.253783",
>           "last_deep_scrub": "0'0",
>           "last_deep_scrub_stamp": "2013-08-01 15:23:29.253783",
>           "last_clean_scrub_stamp": "2013-08-01 15:23:29.253783"},
>       "stats": { "version": "413'688",
>           "reported": "561'1484",
>           "state": "active",
>           "last_fresh": "2013-08-02 12:25:41.793582",
>           "last_change": "2013-08-02 09:54:08.163758",
>           "last_active": "2013-08-02 12:25:41.793582",
>           "last_clean": "2013-08-02 09:49:34.246621",
>           "last_became_active": "0.000000",
>           "last_unstale": "2013-08-02 12:25:41.793582",
>           "mapping_epoch": 641,
>           "log_start": "0'0",
>           "ondisk_log_start": "0'0",
>           "created": 67,
>           "last_epoch_clean": 67,
>           "parent": "0.0",
>           "parent_split_bits": 0,
>           "last_scrub": "0'0",
>           "last_scrub_stamp": "2013-08-01 15:23:29.253783",
>           "last_deep_scrub": "0'0",
>           "last_deep_scrub_stamp": "2013-08-01 15:23:29.253783",
>           "last_clean_scrub_stamp": "2013-08-01 15:23:29.253783",
>           "log_size": 0,
>           "ondisk_log_size": 0,
>           "stats_invalid": "0",
>           "stat_sum": { "num_bytes": 134217728,
>               "num_objects": 32,
>               "num_object_clones": 0,
>               "num_object_copies": 0,
>               "num_objects_missing_on_primary": 0,
>               "num_objects_degraded": 0,
>               "num_objects_unfound": 0,
>               "num_read": 0,
>               "num_read_kb": 0,
>               "num_write": 688,
>               "num_write_kb": 327680,
>               "num_scrub_errors": 0,
>               "num_shallow_scrub_errors": 0,
>               "num_deep_scrub_errors": 0,
>               "num_objects_recovered": 45,
>               "num_bytes_recovered": 188743680,
>               "num_keys_recovered": 0},
>           "stat_cat_sum": {},
>           "up": [
>                 5,
>                 4],
>           "acting": [
>                 5,
>                 4]},
>       "empty": 0,
>       "dne": 0,
>       "incomplete": 0,
>       "last_epoch_started": 644},
>   "recovery_state": [
>         { "name": "Started\/Primary\/Active",
>           "enter_time": "2013-08-02 09:49:56.504882",
>           "might_have_unfound": [],
>           "recovery_progress": { "backfill_target": -1,
>               "waiting_on_backfill": 0,
>               "backfill_pos": "0\/\/0\/\/-1",
>               "backfill_info": { "begin": "0\/\/0\/\/-1",
>                   "end": "0\/\/0\/\/-1",
>                   "objects": []},
>               "peer_backfill_info": { "begin": "0\/\/0\/\/-1",
>                   "end": "0\/\/0\/\/-1",
>                   "objects": []},
>               "backfills_in_flight": [],
>               "pull_from_peer": [],
>               "pushing": []},
>           "scrub": { "scrubber.epoch_start": "0",
>               "scrubber.active": 0,
>               "scrubber.block_writes": 0,
>               "scrubber.finalizing": 0,
>               "scrubber.waiting_on": 0,
>               "scrubber.waiting_on_whom": []}},
>         { "name": "Started",
>           "enter_time": "2013-08-02 09:49:55.501261"}]}
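
The replica-count workaround described in the quoted message above (dropping the pool to a single replica and then back to two) corresponds to the pool "size" setting; presumably the commands used were along these lines, where <poolname> is a placeholder for the affected pool (the actual pool name is not stated in the thread):

# ceph osd pool set <poolname> size 1
# ceph osd pool set <poolname> size 2
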
> -----Original Message-----
> From: Samuel Just [mailto:sam.just@xxxxxxxxxxx]
> Sent: 12 August 2013 22:52
> To: Howarth, Chris [CCC-OT_IT]
> Cc: ceph-users@xxxxxxxx
> Subject: Re: Ceph pgs stuck unclean
>
> Can you attach the output of:
>
> ceph -s
> ceph pg dump
> ceph osd dump
>
> and run
>
> ceph osd getmap -o /tmp/osdmap
>
> and attach /tmp/osdmap/
> -Sam
>
> On Wed, Aug 7, 2013 at 1:58 AM, Howarth, Chris <chris.howarth@xxxxxxxx> wrote:
>> Hi,
>>
>>     One of our OSD disks failed on a cluster and I replaced it, but when it failed it did not completely recover and I have a number of pgs which are stuck unclean:
>>
>> # ceph health detail
>> HEALTH_WARN 7 pgs stuck unclean
>> pg 3.5a is stuck unclean for 335339.172516, current state active, last acting [5,4]
>> pg 3.54 is stuck unclean for 335339.157608, current state active, last acting [15,7]
>> pg 3.55 is stuck unclean for 335339.167154, current state active, last acting [16,9]
>> pg 3.1c is stuck unclean for 335339.174150, current state active, last acting [8,16]
>> pg 3.a is stuck unclean for 335339.177001, current state active, last acting [0,8]
>> pg 3.4 is stuck unclean for 335339.165377, current state active, last acting [17,4]
>> pg 3.5 is stuck unclean for 335339.149507, current state active, last acting [2,6]
>>
>> Does anyone know how to fix these ? I tried the following, but this does not seem to work:
>>
>> # ceph pg 3.5 mark_unfound_lost revert
>> pg has no unfound objects
>>
>> thanks
>>
>> Chris
>> __________________________
>> Chris Howarth
>> OS Platforms Engineering
>> Citi Architecture & Technology Engineering
>> (e) chris.howarth@xxxxxxxx
>> (t) +44 (0) 20 7508 3848
>> (f) +44 (0) 20 7508 0964
>> (mail-drop) CGC-06-3A
>>
>> _______________________________________________
>> ceph-users mailing list
>> ceph-users@xxxxxxxxxxxxxx
>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com