Hi All...
Today we had a warning regarding 8 near
full OSDs. Looking at the OSD occupancy,
3 of them were above 90%. In order to
resolve the situation, I decided to
reweigh those first using:
ceph osd crush reweight osd.1 2.67719
ceph osd crush reweight osd.26 2.67719
ceph osd crush reweight osd.53 2.67719
Please note that I've started with a very
conservative step since the original
weight for all osds was 2.72710.
After some rebalancing (which has now
stopped) I've seen that the cluster is
currently in the following state
# ceph health detail
HEALTH_WARN 4 pgs backfill_toofull; 4 pgs
stuck unclean; recovery 20/39433323
objects degraded (0.000%); recovery
77898/39433323 objects misplaced (0.198%);
8 near full osd(s); crush map has legacy
tunables (require bobtail, min is firefly)
pg 6.e2 is stuck unclean for 9578.920997,
current state
active+remapped+backfill_toofull, last
acting [49,38,11]
pg 6.4 is stuck unclean for 9562.054680,
current state
active+remapped+backfill_toofull, last
acting [53,6,26]
pg 5.24 is stuck unclean for 10292.469037,
current state
active+remapped+backfill_toofull, last
acting [32,13,51]
pg 5.306 is stuck unclean for
10292.448364, current state
active+remapped+backfill_toofull, last
acting [44,7,59]
pg 5.306 is
active+remapped+backfill_toofull, acting
[44,7,59]
pg 5.24 is
active+remapped+backfill_toofull, acting
[32,13,51]
pg 6.4 is
active+remapped+backfill_toofull, acting
[53,6,26]
pg 6.e2 is
active+remapped+backfill_toofull, acting
[49,38,11]
recovery 20/39433323 objects degraded
(0.000%)
recovery 77898/39433323 objects misplaced
(0.198%)
osd.1 is near full at 88%
osd.14 is near full at 87%
osd.24 is near full at 86%
osd.26 is near full at 87%
osd.37 is near full at 87%
osd.53 is near full at 88%
osd.56 is near full at 85%
osd.62 is near full at 87%
crush map has legacy tunables
(require bobtail, min is firefly); see
http://ceph.com/docs/master/rados/operations/crush-map/#tunables
Not sure if it is worthwhile to mention,
but after upgrading to Jewel, our cluster
shows the warnings regarding tunables. We
still have not migrated to the optimal
tunables because the cluster will be very
actively used during the next 3 weeks
(due to one of the main conferences in our
area) and we prefer to do that migration
after this peak period.
I am unsure what happened during the
rebalancing, but the mapping of these 4 stuck
pgs seems strange: the up and acting
osds are different.
# ceph pg dump_stuck unclean
ok
pg_stat state up up_primary acting acting_primary
6.e2 active+remapped+backfill_toofull [8,53,38] 8 [49,38,11] 49
6.4 active+remapped+backfill_toofull [53,24,6] 53 [53,6,26] 53
5.24 active+remapped+backfill_toofull [32,13,56] 32 [32,13,51] 32
5.306 active+remapped+backfill_toofull [44,60,26] 44 [44,7,59] 44
# ceph pg map 6.e2
osdmap e1054 pg 6.e2 (6.e2) -> up
[8,53,38] acting [49,38,11]
# ceph pg map 6.4
osdmap e1054 pg 6.4 (6.4) -> up
[53,24,6] acting [53,6,26]
# ceph pg map 5.24
osdmap e1054 pg 5.24 (5.24) -> up
[32,13,56] acting [32,13,51]
# ceph pg map 5.306
osdmap e1054 pg 5.306 (5.306) -> up
[44,60,26] acting [44,7,59]
To complete this information, I am also
sending the output of pg query for one of
these problematic pgs (ceph pg 5.306 query)
after this email.
What should be the procedure to try to
recover those PGs before continuing with the
reweighing?
Thank you in advance
Goncalo
# ceph pg 5.306 query
{
"state":
"active+remapped+backfill_toofull",
"snap_trimq": "[]",
"epoch": 1054,
"up": [
44,
60,
26
],
"acting": [
44,
7,
59
],
"backfill_targets": [
"26",
"60"
],
"actingbackfill": [
"7",
"26",
"44",
"59",
"60"
],
"info": {
"pgid": "5.306",
"last_update": "1005'55174",
"last_complete": "1005'55174",
"log_tail": "1005'52106",
"last_user_version": 55174,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": "[]",
"history": {
"epoch_created": 339,
"last_epoch_started": 1016,
"last_epoch_clean": 996,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 1015,
"same_interval_since": 1015,
"same_primary_since": 928,
"last_scrub": "1005'55169",
"last_scrub_stamp": "2016-07-19
14:31:45.790871",
"last_deep_scrub": "1005'55169",
"last_deep_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_clean_scrub_stamp":
"2016-07-19 14:31:45.790871"
},
"stats": {
"version": "1005'55174",
"reported_seq": "39726",
"reported_epoch": "1049",
"state":
"active+remapped+backfill_toofull",
"last_fresh": "2016-07-20
01:55:43.224525",
"last_change": "2016-07-20
00:47:02.045700",
"last_active": "2016-07-20
01:55:43.224525",
"last_peered": "2016-07-20
01:55:43.224525",
"last_clean": "2016-07-20
00:34:36.197721",
"last_became_active":
"2016-07-20 00:36:21.224010",
"last_became_peered":
"2016-07-20 00:36:21.224010",
"last_unstale": "2016-07-20
01:55:43.224525",
"last_undegraded": "2016-07-20
01:55:43.224525",
"last_fullsized": "2016-07-20
01:55:43.224525",
"mapping_epoch": 995,
"log_start": "1005'52106",
"ondisk_log_start":
"1005'52106",
"created": 339,
"last_epoch_clean": 996,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "1005'55169",
"last_scrub_stamp": "2016-07-19
14:31:45.790871",
"last_deep_scrub": "1005'55169",
"last_deep_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_clean_scrub_stamp":
"2016-07-19 14:31:45.790871",
"log_size": 3068,
"ondisk_log_size": 3068,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid":
false,
"pin_stats_invalid": true,
"stat_sum": {
"num_bytes": 0,
"num_objects": 230,
"num_object_clones": 0,
"num_object_copies": 1150,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced":
920,
"num_objects_unfound": 0,
"num_objects_dirty": 230,
"num_whiteouts": 0,
"num_read": 12454,
"num_read_kb": 217518,
"num_write": 55524,
"num_write_kb": 228743,
"num_scrub_errors": 0,
"num_shallow_scrub_errors":
0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 230,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive":
0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0
},
"up": [
44,
60,
26
],
"acting": [
44,
7,
59
],
"blocked_by": [],
"up_primary": 44,
"acting_primary": 44
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 1016,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
"peer_info": [
{
"peer": "7",
"pgid": "5.306",
"last_update": "1005'55174",
"last_complete": "1005'55174",
"log_tail": "1005'52106",
"last_user_version": 55174,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": "[]",
"history": {
"epoch_created": 339,
"last_epoch_started": 1016,
"last_epoch_clean": 996,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 1015,
"same_interval_since": 1015,
"same_primary_since": 928,
"last_scrub": "1005'55169",
"last_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_deep_scrub":
"1005'55169",
"last_deep_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_clean_scrub_stamp":
"2016-07-19 14:31:45.790871"
},
"stats": {
"version": "1005'55173",
"reported_seq": "39711",
"reported_epoch": "1005",
"state": "active+clean",
"last_fresh": "2016-07-19
16:44:02.213143",
"last_change": "2016-07-19
14:31:45.791194",
"last_active": "2016-07-19
16:44:02.213143",
"last_peered": "2016-07-19
16:44:02.213143",
"last_clean": "2016-07-19
16:44:02.213143",
"last_became_active":
"2016-06-27 04:57:38.897948",
"last_became_peered":
"2016-06-27 04:57:38.897948",
"last_unstale": "2016-07-19
16:44:02.213143",
"last_undegraded":
"2016-07-19 16:44:02.213143",
"last_fullsized":
"2016-07-19 16:44:02.213143",
"mapping_epoch": 995,
"log_start": "1005'52106",
"ondisk_log_start":
"1005'52106",
"created": 339,
"last_epoch_clean": 996,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "1005'55169",
"last_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_deep_scrub":
"1005'55169",
"last_deep_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_clean_scrub_stamp":
"2016-07-19 14:31:45.790871",
"log_size": 3067,
"ondisk_log_size": 3067,
"stats_invalid": false,
"dirty_stats_invalid":
false,
"omap_stats_invalid": false,
"hitset_stats_invalid":
false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": true,
"stat_sum": {
"num_bytes": 0,
"num_objects": 230,
"num_object_clones": 0,
"num_object_copies":
693,
"num_objects_missing_on_primary": 0,
"num_objects_missing":
0,
"num_objects_degraded":
0,
"num_objects_misplaced":
0,
"num_objects_unfound":
0,
"num_objects_dirty":
230,
"num_whiteouts": 0,
"num_read": 12454,
"num_read_kb": 217518,
"num_write": 55524,
"num_write_kb": 228743,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors":
0,
"num_objects_recovered":
0,
"num_bytes_recovered":
0,
"num_keys_recovered": 0,
"num_objects_omap": 230,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high":
0,
"num_flush_mode_low": 0,
"num_evict_mode_some":
0,
"num_evict_mode_full":
0,
"num_objects_pinned": 0
},
"up": [
44,
60,
26
],
"acting": [
44,
7,
59
],
"blocked_by": [],
"up_primary": 44,
"acting_primary": 44
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 1016,
"hit_set_history": {
"current_last_update":
"0'0",
"history": []
}
},
{
"peer": "26",
"pgid": "5.306",
"last_update": "1005'55174",
"last_complete": "1005'55174",
"log_tail": "1005'52174",
"last_user_version": 0,
"last_backfill": "MIN",
"last_backfill_bitwise": 1,
"purged_snaps": "[]",
"history": {
"epoch_created": 339,
"last_epoch_started": 1016,
"last_epoch_clean": 996,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 1015,
"same_interval_since": 1015,
"same_primary_since": 928,
"last_scrub": "1005'55169",
"last_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_deep_scrub":
"1005'55169",
"last_deep_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_clean_scrub_stamp":
"2016-07-19 14:31:45.790871"
},
"stats": {
"version": "0'0",
"reported_seq": "0",
"reported_epoch": "0",
"state": "inactive",
"last_fresh": "0.000000",
"last_change": "0.000000",
"last_active": "0.000000",
"last_peered": "0.000000",
"last_clean": "0.000000",
"last_became_active":
"0.000000",
"last_became_peered":
"0.000000",
"last_unstale": "0.000000",
"last_undegraded":
"0.000000",
"last_fullsized":
"0.000000",
"mapping_epoch": 0,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 0,
"last_epoch_clean": 0,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "0'0",
"last_scrub_stamp":
"0.000000",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp":
"0.000000",
"last_clean_scrub_stamp":
"0.000000",
"log_size": 0,
"ondisk_log_size": 0,
"stats_invalid": false,
"dirty_stats_invalid":
false,
"omap_stats_invalid": false,
"hitset_stats_invalid":
false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing":
0,
"num_objects_degraded":
0,
"num_objects_misplaced":
0,
"num_objects_unfound":
0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors":
0,
"num_objects_recovered":
0,
"num_bytes_recovered":
0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high":
0,
"num_flush_mode_low": 0,
"num_evict_mode_some":
0,
"num_evict_mode_full":
0,
"num_objects_pinned": 0
},
"up": [],
"acting": [],
"blocked_by": [],
"up_primary": -1,
"acting_primary": -1
},
"empty": 0,
"dne": 0,
"incomplete": 1,
"last_epoch_started": 1016,
"hit_set_history": {
"current_last_update":
"0'0",
"history": []
}
},
{
"peer": "59",
"pgid": "5.306",
"last_update": "1005'55174",
"last_complete": "1005'55174",
"log_tail": "1005'52106",
"last_user_version": 55174,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": "[]",
"history": {
"epoch_created": 339,
"last_epoch_started": 1016,
"last_epoch_clean": 996,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 1015,
"same_interval_since": 1015,
"same_primary_since": 928,
"last_scrub": "1005'55169",
"last_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_deep_scrub":
"1005'55169",
"last_deep_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_clean_scrub_stamp":
"2016-07-19 14:31:45.790871"
},
"stats": {
"version": "1005'55173",
"reported_seq": "39711",
"reported_epoch": "1005",
"state": "active+clean",
"last_fresh": "2016-07-19
16:44:02.213143",
"last_change": "2016-07-19
14:31:45.791194",
"last_active": "2016-07-19
16:44:02.213143",
"last_peered": "2016-07-19
16:44:02.213143",
"last_clean": "2016-07-19
16:44:02.213143",
"last_became_active":
"2016-06-27 04:57:38.897948",
"last_became_peered":
"2016-06-27 04:57:38.897948",
"last_unstale": "2016-07-19
16:44:02.213143",
"last_undegraded":
"2016-07-19 16:44:02.213143",
"last_fullsized":
"2016-07-19 16:44:02.213143",
"mapping_epoch": 995,
"log_start": "1005'52106",
"ondisk_log_start":
"1005'52106",
"created": 339,
"last_epoch_clean": 996,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "1005'55169",
"last_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_deep_scrub":
"1005'55169",
"last_deep_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_clean_scrub_stamp":
"2016-07-19 14:31:45.790871",
"log_size": 3067,
"ondisk_log_size": 3067,
"stats_invalid": false,
"dirty_stats_invalid":
false,
"omap_stats_invalid": false,
"hitset_stats_invalid":
false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": true,
"stat_sum": {
"num_bytes": 0,
"num_objects": 230,
"num_object_clones": 0,
"num_object_copies":
693,
"num_objects_missing_on_primary": 0,
"num_objects_missing":
0,
"num_objects_degraded":
0,
"num_objects_misplaced":
0,
"num_objects_unfound":
0,
"num_objects_dirty":
230,
"num_whiteouts": 0,
"num_read": 12454,
"num_read_kb": 217518,
"num_write": 55524,
"num_write_kb": 228743,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors":
0,
"num_objects_recovered":
0,
"num_bytes_recovered":
0,
"num_keys_recovered": 0,
"num_objects_omap": 230,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high":
0,
"num_flush_mode_low": 0,
"num_evict_mode_some":
0,
"num_evict_mode_full":
0,
"num_objects_pinned": 0
},
"up": [
44,
60,
26
],
"acting": [
44,
7,
59
],
"blocked_by": [],
"up_primary": 44,
"acting_primary": 44
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 1016,
"hit_set_history": {
"current_last_update":
"0'0",
"history": []
}
},
{
"peer": "60",
"pgid": "5.306",
"last_update": "1005'55174",
"last_complete": "1005'55174",
"log_tail": "1005'52174",
"last_user_version": 0,
"last_backfill": "MIN",
"last_backfill_bitwise": 1,
"purged_snaps": "[]",
"history": {
"epoch_created": 339,
"last_epoch_started": 1016,
"last_epoch_clean": 996,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 1015,
"same_interval_since": 1015,
"same_primary_since": 928,
"last_scrub": "1005'55169",
"last_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_deep_scrub":
"1005'55169",
"last_deep_scrub_stamp":
"2016-07-19 14:31:45.790871",
"last_clean_scrub_stamp":
"2016-07-19 14:31:45.790871"
},
"stats": {
"version": "0'0",
"reported_seq": "0",
"reported_epoch": "0",
"state": "inactive",
"last_fresh": "0.000000",
"last_change": "0.000000",
"last_active": "0.000000",
"last_peered": "0.000000",
"last_clean": "0.000000",
"last_became_active":
"0.000000",
"last_became_peered":
"0.000000",
"last_unstale": "0.000000",
"last_undegraded":
"0.000000",
"last_fullsized":
"0.000000",
"mapping_epoch": 0,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 0,
"last_epoch_clean": 0,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "0'0",
"last_scrub_stamp":
"0.000000",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp":
"0.000000",
"last_clean_scrub_stamp":
"0.000000",
"log_size": 0,
"ondisk_log_size": 0,
"stats_invalid": false,
"dirty_stats_invalid":
false,
"omap_stats_invalid": false,
"hitset_stats_invalid":
false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing":
0,
"num_objects_degraded":
0,
"num_objects_misplaced":
0,
"num_objects_unfound":
0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors":
0,
"num_objects_recovered":
0,
"num_bytes_recovered":
0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high":
0,
"num_flush_mode_low": 0,
"num_evict_mode_some":
0,
"num_evict_mode_full":
0,
"num_objects_pinned": 0
},
"up": [],
"acting": [],
"blocked_by": [],
"up_primary": -1,
"acting_primary": -1
},
"empty": 0,
"dne": 0,
"incomplete": 1,
"last_epoch_started": 1016,
"hit_set_history": {
"current_last_update":
"0'0",
"history": []
}
}
],
"recovery_state": [
{
"name":
"Started\/Primary\/Active",
"enter_time": "2016-07-20
00:36:21.091565",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [
"26",
"60"
],
"waiting_on_backfill": [],
"last_backfill_started":
"MIN",
"backfill_info": {
"begin": "MIN",
"end": "MIN",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start":
"995",
"scrubber.active": 0,
"scrubber.state":
"INACTIVE",
"scrubber.start": "MIN",
"scrubber.end": "MIN",
"scrubber.subset_last_update": "0'0",
"scrubber.deep": false,
"scrubber.seed": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom":
[]
}
},
{
"name": "Started",
"enter_time": "2016-07-20
00:36:19.742892"
}
],
"agent_state": {}
}
--
Goncalo Borges
Research Computing
ARC Centre of Excellence for Particle Physics at the Terascale
School of Physics A28 | University of Sydney, NSW 2006
T: +61 2 93511937