Search the docs or mailing list archives for a discussion of the CRUSH
tunables; you can see that's the problem here because CRUSH is only
mapping the PG to one OSD instead of two. :) A rough sketch of how to
check and adjust the tunables is included after your quoted message
below.
-Greg
Software Engineer #42 @ http://inktank.com | http://ceph.com

On Thu, Nov 14, 2013 at 4:40 AM, Ugis <ugis22@xxxxxxxxx> wrote:
> Hi,
>
> Got 1 pg which does not recover and I don't know how to fix it.
>
> ceph version 0.67.4 (ad85b8bfafea6232d64cb7ba76a8b6e8252fa0c7)
> # ceph health detail
> HEALTH_WARN 1 pgs stuck unclean; recovery 23/4687006 degraded
> (0.000%); 6 near full osd(s)
> pg 4.2ab is stuck unclean for 172566.764239, current state
> active+remapped, last acting [7,5]
> recovery 23/4687006 degraded (0.000%)
> osd.2 is near full at 87%
> osd.3 is near full at 88%
> osd.4 is near full at 85%
> osd.5 is near full at 86%
> osd.6 is near full at 87%
> osd.9 is near full at 85%
>
> # ceph osd dump
> ...
> pool 4 'temp1' rep size 2 min_size 1 crush_ruleset 3 object_hash
> rjenkins pg_num 1300 pgp_num 1300 last_change 7039 owner 0
> removed_snaps [1~3]
> ...
> pg_temp 4.2ab [7,5]
>
> # ceph pg map 4.2ab
> osdmap e7783 pg 4.2ab (4.2ab) -> up [7] acting [7,5]
>
> # ceph pg 4.2ab query
> { "state": "active+remapped",
>   "epoch": 7783,
>   "up": [
>     7],
>   "acting": [
>     7,
>     5],
>   "info": { "pgid": "4.2ab",
>     "last_update": "7752'114660",
>     "last_complete": "7752'114660",
>     "log_tail": "6874'111659",
>     "last_backfill": "MAX",
>     "purged_snaps": "[1~3]",
>     "history": { "epoch_created": 452,
>       "last_epoch_started": 7651,
>       "last_epoch_clean": 7651,
>       "last_epoch_split": 0,
>       "same_up_since": 7493,
>       "same_interval_since": 7650,
>       "same_primary_since": 7650,
>       "last_scrub": "7387'114067",
>       "last_scrub_stamp": "2013-11-12 01:07:46.610491",
>       "last_deep_scrub": "6874'103266",
>       "last_deep_scrub_stamp": "2013-11-07 16:49:45.483581",
>       "last_clean_scrub_stamp": "2013-11-12 01:07:46.610491"},
>     "stats": { "version": "7752'114660",
>       "reported_seq": "5635118",
>       "reported_epoch": "7783",
>       "state": "active+remapped",
>       "last_fresh": "2013-11-14 10:18:22.181241",
>       "last_change": "2013-11-12 17:14:34.141441",
>       "last_active": "2013-11-14 10:18:22.181241",
>       "last_clean": "2013-11-12 14:01:01.947846",
>       "last_became_active": "0.000000",
>       "last_unstale": "2013-11-14 10:18:22.181241",
>       "mapping_epoch": 7502,
>       "log_start": "6874'111659",
>       "ondisk_log_start": "6874'111659",
>       "created": 452,
>       "last_epoch_clean": 7651,
>       "parent": "0.0",
>       "parent_split_bits": 0,
>       "last_scrub": "7387'114067",
>       "last_scrub_stamp": "2013-11-12 01:07:46.610491",
>       "last_deep_scrub": "6874'103266",
>       "last_deep_scrub_stamp": "2013-11-07 16:49:45.483581",
>       "last_clean_scrub_stamp": "2013-11-12 01:07:46.610491",
>       "log_size": 3001,
>       "ondisk_log_size": 3001,
>       "stats_invalid": "0",
>       "stat_sum": { "num_bytes": 9453961216,
>         "num_objects": 2254,
>         "num_object_clones": 0,
>         "num_object_copies": 0,
>         "num_objects_missing_on_primary": 0,
>         "num_objects_degraded": 0,
>         "num_objects_unfound": 0,
>         "num_read": 456340,
>         "num_read_kb": 1825360,
>         "num_write": 14004,
>         "num_write_kb": 6942720,
>         "num_scrub_errors": 0,
>         "num_shallow_scrub_errors": 0,
>         "num_deep_scrub_errors": 0,
>         "num_objects_recovered": 4732,
>         "num_bytes_recovered": 19847446528,
>         "num_keys_recovered": 0},
>       "stat_cat_sum": {},
>       "up": [
>         7],
>       "acting": [
>         7,
>         5]},
>     "empty": 0,
>     "dne": 0,
>     "incomplete": 0,
>     "last_epoch_started": 7651},
>   "recovery_state": [
>     { "name": "Started\/Primary\/Active",
>       "enter_time": "2013-11-12 17:14:34.001382",
>       "might_have_unfound": [],
"recovery_progress": { "backfill_target": -1, > "waiting_on_backfill": 0, > "backfill_pos": "0\/\/0\/\/-1", > "backfill_info": { "begin": "0\/\/0\/\/-1", > "end": "0\/\/0\/\/-1", > "objects": []}, > "peer_backfill_info": { "begin": "0\/\/0\/\/-1", > "end": "0\/\/0\/\/-1", > "objects": []}, > "backfills_in_flight": [], > "pull_from_peer": [], > "pushing": []}, > "scrub": { "scrubber.epoch_start": "0", > "scrubber.active": 0, > "scrubber.block_writes": 0, > "scrubber.finalizing": 0, > "scrubber.waiting_on": 0, > "scrubber.waiting_on_whom": []}}, > { "name": "Started", > "enter_time": "2013-11-12 17:14:32.905963"}]} > > > I have set all my osds to osd_backfill_full_ratio 90, so they are not > blocking backfill because of this. > Seems that problem is in osd 5 or 7 which should hold the pg, but I > cached no related logs even after "ms 1"(lots of output, may be wrong > grepping). > Tried to scrub/deep-scrub this pg, but there seems no action going on > after I issue commands. > > This state came after I had "full osd" and cluster blocked io. Then I > did some reweighting and cluster now sits at this state for a day > already. > > Any ideas how to trigger fixing of this stuck active+remapped pg 4.2ab? > > Ugis > _______________________________________________ > ceph-users mailing list > ceph-users@xxxxxxxxxxxxxx > http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com