Hi all,
I am managing a typical small Ceph cluster that consists of 4 nodes, each one having 7 OSDs (some in the hdd pool, some in the ssd pool).
The cluster was healthy, but following some space issues caused by bad PG management by Ceph, I tried reweighting some specific OSDs. Unfortunately, during the rebalancing after the reweight, one specific PG went into the active+remapped state, and I now have some misplaced
objects (3359/6118678 objects misplaced (0.055%)).
The cluster had the scrub and deep-scrub options disabled (for IOPS reasons). I enabled a simple scrub on all PGs and waited for it to run. It finished completely, without problems as far as I can tell. I would like to avoid a deep-scrub, but if it is going to help I will run it once.
I am thinking of declaring OSD 26 as lost. This way, a new copy of the PG will be created from OSD 6, correct?
Any other less harmful thoughts on how to fix it?
I have attached all the information I could provide. I am also pasting it raw below:
Query for faulty PG:
{
"state": "active+remapped",
"snap_trimq": "[]",
"epoch": 11755,
"up": [
6
],
"acting": [
6,
26
],
"actingbackfill": [
"6",
"26"
],
"info": {
"pgid": "1.11d",
"last_update": "11755'60561210",
"last_complete": "11755'60561210",
"log_tail": "11755'60558123",
"last_user_version": 60561210,
"last_backfill": "MAX",
"purged_snaps": "[1~33,36~22]",
"history": {
"epoch_created": 31,
"last_epoch_started": 11681,
"last_epoch_clean": 11681,
"last_epoch_split": 0,
"same_up_since": 11679,
"same_interval_since": 11680,
"same_primary_since": 11510,
"last_scrub": "449'16483",
"last_scrub_stamp": "2016-09-14 15:25:14.228231",
"last_deep_scrub": "448'16277",
"last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
"last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
},
"stats": {
"version": "11755'60561210",
"reported_seq": "50924585",
"reported_epoch": "11755",
"state": "active+remapped",
"last_fresh": "2018-12-03 12:58:03.289251",
"last_change": "2018-11-09 10:54:06.861873",
"last_active": "2018-12-03 12:58:03.289251",
"last_peered": "2018-12-03 12:58:03.289251",
"last_clean": "2018-11-09 10:54:02.622866",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2018-12-03 12:58:03.289251",
"last_undegraded": "2018-12-03 12:58:03.289251",
"last_fullsized": "2018-12-03 12:58:03.289251",
"mapping_epoch": 11679,
"log_start": "11755'60558123",
"ondisk_log_start": "11755'60558123",
"created": 31,
"last_epoch_clean": 11681,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "449'16483",
"last_scrub_stamp": "2016-09-14 15:25:14.228231",
"last_deep_scrub": "448'16277",
"last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
"last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
"log_size": 3087,
"ondisk_log_size": 3087,
"stats_invalid": "0",
"stat_sum": {
"num_bytes": 14031434258,
"num_objects": 3359,
"num_object_clones": 0,
"num_object_copies": 6718,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 3359,
"num_objects_unfound": 0,
"num_objects_dirty": 3359,
"num_whiteouts": 0,
"num_read": 27359423,
"num_read_kb": 1815932413,
"num_write": 121113356,
"num_write_kb": 2124776643,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 65218,
"num_bytes_recovered": 271765903872,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0
},
"up": [
6
],
"acting": [
6,
26
],
"blocked_by": [],
"up_primary": 6,
"acting_primary": 6
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 11681,
"hit_set_history": {
"current_last_update": "0'0",
"current_last_stamp": "0.000000",
"current_info": {
"begin": "0.000000",
"end": "0.000000",
"version": "0'0",
"using_gmt": "1"
},
"history": []
}
},
"peer_info": [
{
"peer": "26",
"pgid": "1.11d",
"last_update": "11755'60561210",
"last_complete": "11755'60561210",
"log_tail": "11649'58446601",
"last_user_version": 58449647,
"last_backfill": "MAX",
"purged_snaps": "[1~33,36~22]",
"history": {
"epoch_created": 31,
"last_epoch_started": 11681,
"last_epoch_clean": 11681,
"last_epoch_split": 0,
"same_up_since": 11679,
"same_interval_since": 11680,
"same_primary_since": 11510,
"last_scrub": "449'16483",
"last_scrub_stamp": "2016-09-14 15:25:14.228231",
"last_deep_scrub": "448'16277",
"last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
"last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
},
"stats": {
"version": "11678'58449646",
"reported_seq": "48950066",
"reported_epoch": "11678",
"state": "active+clean",
"last_fresh": "2018-11-09 10:54:02.263168",
"last_change": "2018-11-09 08:01:12.116827",
"last_active": "2018-11-09 10:54:02.263168",
"last_peered": "2018-11-09 10:54:02.263168",
"last_clean": "2018-11-09 10:54:02.263168",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2018-11-09 10:54:02.263168",
"last_undegraded": "2018-11-09 10:54:02.263168",
"last_fullsized": "2018-11-09 10:54:02.263168",
"mapping_epoch": 11679,
"log_start": "11649'58446601",
"ondisk_log_start": "11649'58446601",
"created": 31,
"last_epoch_clean": 11610,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "449'16483",
"last_scrub_stamp": "2016-09-14 15:25:14.228231",
"last_deep_scrub": "448'16277",
"last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
"last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
"log_size": 3045,
"ondisk_log_size": 3045,
"stats_invalid": "0",
"stat_sum": {
"num_bytes": 18153595392,
"num_objects": 4344,
"num_object_clones": 0,
"num_object_copies": 8688,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 4344,
"num_whiteouts": 0,
"num_read": 26674601,
"num_read_kb": 1767105243,
"num_write": 116892449,
"num_write_kb": 2073693377,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 65218,
"num_bytes_recovered": 271765903872,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0
},
"up": [
6
],
"acting": [
6,
26
],
"blocked_by": [],
"up_primary": 6,
"acting_primary": 6
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 11681,
"hit_set_history": {
"current_last_update": "0'0",
"current_last_stamp": "0.000000",
"current_info": {
"begin": "0.000000",
"end": "0.000000",
"version": "0'0",
"using_gmt": "1"
},
"history": []
}
}
],
"recovery_state": [
{
"name": "Started\/Primary\/Active",
"enter_time": "2018-11-09 10:54:06.825830",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [],
"waiting_on_backfill": [],
"last_backfill_started": "-1\/0\/\/0",
"backfill_info": {
"begin": "-1\/0\/\/0",
"end": "-1\/0\/\/0",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "0",
"scrubber.active": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom": []
}
},
{
"name": "Started",
"enter_time": "2018-11-09 10:54:05.789621"
}
],
"agent_state": {}
}
Ceph status
health HEALTH_WARN
1 pgs stuck unclean
recovery 3359/6118678 objects misplaced (0.055%)
noout,nodeep-scrub flag(s) set
monmap e3: 3 mons at {0=192.168.1.1:6789/0,1=192.168.1.2:6789/0,2=192.168.1.3:6789/0}
election epoch 4882, quorum 0,1,2 0,1,2
osdmap e11755: 27 osds: 27 up, 27 in; 1 remapped pgs
flags noout,nodeep-scrub
pgmap v62734988: 1024 pgs, 2 pools, 10183 GB data, 2557 kobjects
23768 GB used, 48720 GB / 72488 GB avail
3359/6118678 objects misplaced (0.055%)
1023 active+clean
1 active+remapped
client io 141 kB/s rd, 14068 kB/s wr, 925 op/s
2018-12-03 12:58:52.109913 mon.0 [INF] pgmap v62734987: 1024 pgs: 1 active+remapped, 1023 active+clean; 10183 GB data, 23768 GB used, 48720 GB / 72488 GB avail; 8325 kB/s rd, 16182 kB/s wr, 1704 op/s;
3359/6118678 objects misplaced (0.055%)
OSD tree
ID WEIGHT REWEIGHT SIZE USE AVAIL %USE VAR TYPE NAME
-11 37.19995 - 37204G 10257G 26946G 27.57 0.84 root hdd
-12 9.29999 - 9301G 2531G 6769G 27.22 0.83 host hdd-node1
18 4.64999 1.00000 4650G 1226G 3424G 26.37 0.80 osd.18
19 4.64999 1.00000 4650G 1305G 3345G 28.06 0.86 osd.19
-13 9.29999 - 9301G 2665G 6635G 28.66 0.87 host hdd-node2
20 4.64999 1.00000 4650G 1361G 3289G 29.27 0.89 osd.20
21 4.64999 1.00000 4650G 1304G 3346G 28.05 0.86 osd.21
-14 9.29999 - 9301G 2628G 6672G 28.26 0.86 host hdd-node3
22 4.64999 1.00000 4650G 1396G 3254G 30.02 0.92 osd.22
23 4.64999 1.00000 4650G 1232G 3418G 26.50 0.81 osd.23
-15 9.29999 - 9301G 2431G 6869G 26.15 0.80 host hdd-node4
24 4.64999 1.00000 4650G 1218G 3432G 26.20 0.80 osd.24
25 4.64999 1.00000 4650G 1213G 3436G 26.09 0.80 osd.25
-1 35.14995 - 35284G 13512G 21771G 38.30 1.17 root default
-2 9.25000 - 9285G 3431G 5853G 36.96 1.13 host node1
0 1.84999 1.00000 1857G 765G 1091G 41.24 1.26 osd.0
1 1.84999 1.00000 1857G 633G 1224G 34.09 1.04 osd.1
6 1.84999 1.00000 1857G 777G 1079G 41.88 1.28 osd.6
7 1.84999 0.89999 1857G 752G 1104G 40.54 1.24 osd.7
8 1.84999 1.00000 1857G 502G 1354G 27.06 0.83 osd.8
-3 9.24995 - 9285G 3562G 5722G 38.37 1.17 host node2
2 1.84999 1.00000 1857G 766G 1090G 41.27 1.26 osd.2
3 1.84999 0.70000 1857G 674G 1182G 36.33 1.11 osd.3
9 1.84999 1.00000 1857G 580G 1276G 31.28 0.95 osd.9
10 1.84999 1.00000 1857G 814G 1042G 43.88 1.34 osd.10
11 1.84999 1.00000 1857G 725G 1131G 39.07 1.19 osd.11
-4 9.25000 - 9285G 3561G 5724G 38.35 1.17 host node3
4 1.84999 1.00000 1857G 684G 1172G 36.88 1.12 osd.4
5 1.84999 1.00000 1857G 633G 1223G 34.11 1.04 osd.5
12 1.84999 1.00000 1857G 696G 1160G 37.49 1.14 osd.12
13 1.84999 0.70000 1857G 741G 1116G 39.90 1.22 osd.13
14 1.84999 0.89999 1857G 805G 1051G 43.37 1.32 osd.14
-5 7.39999 - 7428G 2957G 4470G 39.81 1.21 host node4
15 1.84999 0.79999 1857G 742G 1115G 39.96 1.22 osd.15
16 1.84999 1.00000 1857G 634G 1222G 34.15 1.04 osd.16
17 1.84999 0.89999 1857G 803G 1053G 43.26 1.32 osd.17
26 1.84999 0.81000 1857G 777G 1079G 41.89 1.28 osd.26
TOTAL 72488G 23770G 48718G 32.79
MIN/MAX VAR: 0.80/1.34 STDDEV: 6.56
PG dump
version 62735224
stamp 2018-12-03 13:02:52.799643
last_osdmap_epoch 11755
last_pg_scan 9537
full_ratio 0.95
nearfull_ratio 0.85
pg_stat objects mip degr misp unf bytes log disklog state state_stamp v reported up up_primary acting acting_primary last_scrub scrub_stamp last_deep_scrub deep_scrub_stamp
///active+clean ones removed///
1.11d 3359 0 0 3359 0 14031434258 3034 3034 active+remapped 2018-11-09 10:54:06.861873 11755'60561357 11755:50924695 [6] 6 [6,26] 6 449'16483 2016-09-14 15:25:14.228231 448'16277 2016-09-13 06:11:45.633007
///active+clean ones removed///
pool 1 1738101 0 0 3359 0 7253679601802 1562466 1562466
pool 2 881071 0 0 0 0 3682217717410 1561924 1561924
sum 2619172 0 0 3359 0 10935897319212 3124390 3124390
osdstat kbused kbavail kb hb in hb out
0 803034868 1144251736 1947286604 [1,2,3,4,5,10,12,13,14,15,16,26] []
1 663754736 1283531868 1947286604 [0,2,3,5,10,12,13,15,16,17,26] []
2 803619260 1143667344 1947286604 [0,1,3,4,5,6,7,8,13,14,15,16,17,26] []
3 707438640 1239847964 1947286604 [0,1,2,4,5,6,7,8,12,13,14,15,17] []
4 718194072 1229092532 1947286604 [2,3,5,6,8,10,11,15,16,17,26] []
5 664279112 1283007492 1947286604 [0,1,2,3,4,6,8,10,11,15,17,26] []
6 815455088 1131831516 1947286604 [2,4,5,7,9,10,11,12,13,14,15,16,17,26] []
7 789396940 1157889664 1947286604 [2,4,6,8,9,10,11,12,13,14,15,16,17,26] []
8 526871252 1420415352 1947286604 [2,3,4,5,7,9,10,11,12,13,15,16,17] []
9 609147992 1338138612 1947286604 [0,1,4,7,8,10,13,14,15,16,17,26] []
10 854451916 1092834688 1947286604 [0,1,4,5,7,8,9,11,12,13,14,15,16,17,26] []
11 760893328 1186393276 1947286604 [1,4,5,6,7,8,10,12,13,14,15,16,17,26] []
12 730109256 1217177348 1947286604 [0,6,7,8,9,10,11,13,15,16,17,26] []
13 777029008 1170257596 1947286604 [0,1,2,3,6,7,8,9,10,11,12,14,15,16] []
14 844469760 1102816844 1947286604 [0,1,2,3,6,7,9,10,11,13,15,16,17,26] []
15 778122444 1169164160 1947286604 [0,2,3,4,6,9,10,11,14,16] []
16 664960388 1282326216 1947286604 [1,2,3,4,5,6,7,8,10,11,12,13,15,17] []
17 842428012 1104858592 1947286604 [0,1,2,3,4,5,8,9,10,11,12,13,14,16,18] []
18 1285869748 3590537232 4876406980 [0,1,17,20,21,22,23,24,25,26] []
19 1368764192 3507642788 4876406980 [0,1,18,20,21,22,23,24,25,26] []
20 1427417120 3448989860 4876406980 [4,17,18,19,21,22,23,24,25,26] []
21 1367928664 3508478316 4876406980 [4,5,18,19,20,22,23,24,25,26] []
22 1464361956 3412045024 4876406980 [4,5,18,19,20,21,23,24,25,26] []
23 1292415092 3583991888 4876406980 [4,5,18,19,20,21,22,24,25,26] []
24 1277731204 3598675776 4876406980 [4,5,18,19,20,21,22,23,25,26] []
25 1272703828 3603703152 4876406980 [4,5,18,19,20,21,22,23,24,26] []
26 815682292 1131604312 1947286604 [0,1,2,3,4,5,6,7,8,10,11,12,13,25] []
Ceph health detail
HEALTH_WARN 1 pgs stuck unclean; recovery 3359/6120420 objects misplaced (0.055%); noout,nodeep-scrub flag(s) set
pg 1.11d is stuck unclean for 2081576.511195, current state active+remapped, last acting [6,26]
recovery 3359/6120420 objects misplaced (0.055%)
noout,nodeep-scrub flag(s) set
Ceph version
ceph version 0.94.10 (b1e0532418e4631af01acbc0cedd426f1905f4af)
Regards,
Nasos Pan |
Attachment:
ceph.health.detail
Description: ceph.health.detail
Attachment:
ceph.status
Description: ceph.status
Attachment:
osd.tree
Description: osd.tree
Attachment:
pg.dump
Description: pg.dump
Attachment:
pg.query
Description: pg.query
_______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com