PG problem after reweight (1 PG active+remapped)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi all,

I am managing a typical small Ceph cluster consisting of 4 nodes, each having 7 OSDs (some in an HDD pool, some in an SSD pool).

The cluster was healthy, but following some space issues caused by poor PG distribution by Ceph, I tried reweighting some specific OSDs. Unfortunately, during the rebalancing after the reweight, one PG went into the active+remapped state, and I now have a number of misplaced objects (3359/6118678 objects misplaced (0.055%)).

The cluster had the scrub and deep-scrub options disabled (for IOPS reasons). I enabled a simple scrub on all PGs and waited for it to run. As far as I can tell, it finished completely without problems. I would like to avoid a deep-scrub, but if it is going to help I will run it once.

I am thinking of declaring OSD 26 as lost. This way, a new copy of the PG will be created from OSD 6, correct?
Are there any other, less harmful ways to fix it?

I have attached all the information I could provide. I am also pasting it raw below:

Query for faulty PG:

{
    "state": "active+remapped",
    "snap_trimq": "[]",
    "epoch": 11755,
    "up": [
        6
    ],
    "acting": [
        6,
        26
    ],
    "actingbackfill": [
        "6",
        "26"
    ],
    "info": {
        "pgid": "1.11d",
        "last_update": "11755'60561210",
        "last_complete": "11755'60561210",
        "log_tail": "11755'60558123",
        "last_user_version": 60561210,
        "last_backfill": "MAX",
        "purged_snaps": "[1~33,36~22]",
        "history": {
            "epoch_created": 31,
            "last_epoch_started": 11681,
            "last_epoch_clean": 11681,
            "last_epoch_split": 0,
            "same_up_since": 11679,
            "same_interval_since": 11680,
            "same_primary_since": 11510,
            "last_scrub": "449'16483",
            "last_scrub_stamp": "2016-09-14 15:25:14.228231",
            "last_deep_scrub": "448'16277",
            "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
            "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
        },
        "stats": {
            "version": "11755'60561210",
            "reported_seq": "50924585",
            "reported_epoch": "11755",
            "state": "active+remapped",
            "last_fresh": "2018-12-03 12:58:03.289251",
            "last_change": "2018-11-09 10:54:06.861873",
            "last_active": "2018-12-03 12:58:03.289251",
            "last_peered": "2018-12-03 12:58:03.289251",
            "last_clean": "2018-11-09 10:54:02.622866",
            "last_became_active": "0.000000",
            "last_became_peered": "0.000000",
            "last_unstale": "2018-12-03 12:58:03.289251",
            "last_undegraded": "2018-12-03 12:58:03.289251",
            "last_fullsized": "2018-12-03 12:58:03.289251",
            "mapping_epoch": 11679,
            "log_start": "11755'60558123",
            "ondisk_log_start": "11755'60558123",
            "created": 31,
            "last_epoch_clean": 11681,
            "parent": "0.0",
            "parent_split_bits": 0,
            "last_scrub": "449'16483",
            "last_scrub_stamp": "2016-09-14 15:25:14.228231",
            "last_deep_scrub": "448'16277",
            "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
            "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
            "log_size": 3087,
            "ondisk_log_size": 3087,
            "stats_invalid": "0",
            "stat_sum": {
                "num_bytes": 14031434258,
                "num_objects": 3359,
                "num_object_clones": 0,
                "num_object_copies": 6718,
                "num_objects_missing_on_primary": 0,
                "num_objects_degraded": 0,
                "num_objects_misplaced": 3359,
                "num_objects_unfound": 0,
                "num_objects_dirty": 3359,
                "num_whiteouts": 0,
                "num_read": 27359423,
                "num_read_kb": 1815932413,
                "num_write": 121113356,
                "num_write_kb": 2124776643,
                "num_scrub_errors": 0,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 0,
                "num_objects_recovered": 65218,
                "num_bytes_recovered": 271765903872,
                "num_keys_recovered": 0,
                "num_objects_omap": 0,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0
            },
            "up": [
                6
            ],
            "acting": [
                6,
                26
            ],
            "blocked_by": [],
            "up_primary": 6,
            "acting_primary": 6
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 11681,
        "hit_set_history": {
            "current_last_update": "0'0",
            "current_last_stamp": "0.000000",
            "current_info": {
                "begin": "0.000000",
                "end": "0.000000",
                "version": "0'0",
                "using_gmt": "1"
            },
            "history": []
        }
    },
    "peer_info": [
        {
            "peer": "26",
            "pgid": "1.11d",
            "last_update": "11755'60561210",
            "last_complete": "11755'60561210",
            "log_tail": "11649'58446601",
            "last_user_version": 58449647,
            "last_backfill": "MAX",
            "purged_snaps": "[1~33,36~22]",
            "history": {
                "epoch_created": 31,
                "last_epoch_started": 11681,
                "last_epoch_clean": 11681,
                "last_epoch_split": 0,
                "same_up_since": 11679,
                "same_interval_since": 11680,
                "same_primary_since": 11510,
                "last_scrub": "449'16483",
                "last_scrub_stamp": "2016-09-14 15:25:14.228231",
                "last_deep_scrub": "448'16277",
                "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
                "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
            },
            "stats": {
                "version": "11678'58449646",
                "reported_seq": "48950066",
                "reported_epoch": "11678",
                "state": "active+clean",
                "last_fresh": "2018-11-09 10:54:02.263168",
                "last_change": "2018-11-09 08:01:12.116827",
                "last_active": "2018-11-09 10:54:02.263168",
                "last_peered": "2018-11-09 10:54:02.263168",
                "last_clean": "2018-11-09 10:54:02.263168",
                "last_became_active": "0.000000",
                "last_became_peered": "0.000000",
                "last_unstale": "2018-11-09 10:54:02.263168",
                "last_undegraded": "2018-11-09 10:54:02.263168",
                "last_fullsized": "2018-11-09 10:54:02.263168",
                "mapping_epoch": 11679,
                "log_start": "11649'58446601",
                "ondisk_log_start": "11649'58446601",
                "created": 31,
                "last_epoch_clean": 11610,
                "parent": "0.0",
                "parent_split_bits": 0,
                "last_scrub": "449'16483",
                "last_scrub_stamp": "2016-09-14 15:25:14.228231",
                "last_deep_scrub": "448'16277",
                "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
                "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
                "log_size": 3045,
                "ondisk_log_size": 3045,
                "stats_invalid": "0",
                "stat_sum": {
                    "num_bytes": 18153595392,
                    "num_objects": 4344,
                    "num_object_clones": 0,
                    "num_object_copies": 8688,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 4344,
                    "num_whiteouts": 0,
                    "num_read": 26674601,
                    "num_read_kb": 1767105243,
                    "num_write": 116892449,
                    "num_write_kb": 2073693377,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 65218,
                    "num_bytes_recovered": 271765903872,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0
                },
                "up": [
                    6
                ],
                "acting": [
                    6,
                    26
                ],
                "blocked_by": [],
                "up_primary": 6,
                "acting_primary": 6
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 11681,
            "hit_set_history": {
                "current_last_update": "0'0",
                "current_last_stamp": "0.000000",
                "current_info": {
                    "begin": "0.000000",
                    "end": "0.000000",
                    "version": "0'0",
                    "using_gmt": "1"
                },
                "history": []
            }
        }
    ],
    "recovery_state": [
        {
            "name": "Started\/Primary\/Active",
            "enter_time": "2018-11-09 10:54:06.825830",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "-1\/0\/\/0",
                "backfill_info": {
                    "begin": "-1\/0\/\/0",
                    "end": "-1\/0\/\/0",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "0",
                "scrubber.active": 0,
                "scrubber.waiting_on": 0,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2018-11-09 10:54:05.789621"
        }
    ],
    "agent_state": {}
}

Ceph status

    health HEALTH_WARN
            1 pgs stuck unclean
            recovery 3359/6118678 objects misplaced (0.055%)
            noout,nodeep-scrub flag(s) set
     monmap e3: 3 mons at {0=192.168.1.1:6789/0,1=192.168.1.2:6789/0,2=192.168.1.3:6789/0}
            election epoch 4882, quorum 0,1,2 0,1,2
     osdmap e11755: 27 osds: 27 up, 27 in; 1 remapped pgs
            flags noout,nodeep-scrub
      pgmap v62734988: 1024 pgs, 2 pools, 10183 GB data, 2557 kobjects
            23768 GB used, 48720 GB / 72488 GB avail
            3359/6118678 objects misplaced (0.055%)
                1023 active+clean
                   1 active+remapped
  client io 141 kB/s rd, 14068 kB/s wr, 925 op/s

2018-12-03 12:58:52.109913 mon.0 [INF] pgmap v62734987: 1024 pgs: 1 active+remapped, 1023 active+clean; 10183 GB data, 23768 GB used, 48720 GB / 72488 GB avail; 8325 kB/s rd, 16182 kB/s wr, 1704 op/s; 3359/6118678 objects misplaced (0.055%)

OSD tree

ID  WEIGHT   REWEIGHT SIZE   USE    AVAIL  %USE  VAR  TYPE NAME              
-11 37.19995        - 37204G 10257G 26946G 27.57 0.84 root hdd              
-12  9.29999        -  9301G  2531G  6769G 27.22 0.83     host hdd-node1
 18  4.64999  1.00000  4650G  1226G  3424G 26.37 0.80         osd.18        
 19  4.64999  1.00000  4650G  1305G  3345G 28.06 0.86         osd.19        
-13  9.29999        -  9301G  2665G  6635G 28.66 0.87     host hdd-node2
 20  4.64999  1.00000  4650G  1361G  3289G 29.27 0.89         osd.20        
 21  4.64999  1.00000  4650G  1304G  3346G 28.05 0.86         osd.21        
-14  9.29999        -  9301G  2628G  6672G 28.26 0.86     host hdd-node3
 22  4.64999  1.00000  4650G  1396G  3254G 30.02 0.92         osd.22        
 23  4.64999  1.00000  4650G  1232G  3418G 26.50 0.81         osd.23        
-15  9.29999        -  9301G  2431G  6869G 26.15 0.80     host hdd-node4
 24  4.64999  1.00000  4650G  1218G  3432G 26.20 0.80         osd.24        
 25  4.64999  1.00000  4650G  1213G  3436G 26.09 0.80         osd.25        
 -1 35.14995        - 35284G 13512G 21771G 38.30 1.17 root default          
 -2  9.25000        -  9285G  3431G  5853G 36.96 1.13     host node1    
  0  1.84999  1.00000  1857G   765G  1091G 41.24 1.26         osd.0          
  1  1.84999  1.00000  1857G   633G  1224G 34.09 1.04         osd.1          
  6  1.84999  1.00000  1857G   777G  1079G 41.88 1.28         osd.6          
  7  1.84999  0.89999  1857G   752G  1104G 40.54 1.24         osd.7          
  8  1.84999  1.00000  1857G   502G  1354G 27.06 0.83         osd.8          
 -3  9.24995        -  9285G  3562G  5722G 38.37 1.17     host node2    
  2  1.84999  1.00000  1857G   766G  1090G 41.27 1.26         osd.2          
  3  1.84999  0.70000  1857G   674G  1182G 36.33 1.11         osd.3          
  9  1.84999  1.00000  1857G   580G  1276G 31.28 0.95         osd.9          
 10  1.84999  1.00000  1857G   814G  1042G 43.88 1.34         osd.10        
 11  1.84999  1.00000  1857G   725G  1131G 39.07 1.19         osd.11        
 -4  9.25000        -  9285G  3561G  5724G 38.35 1.17     host node3    
  4  1.84999  1.00000  1857G   684G  1172G 36.88 1.12         osd.4          
  5  1.84999  1.00000  1857G   633G  1223G 34.11 1.04         osd.5          
 12  1.84999  1.00000  1857G   696G  1160G 37.49 1.14         osd.12        
 13  1.84999  0.70000  1857G   741G  1116G 39.90 1.22         osd.13        
 14  1.84999  0.89999  1857G   805G  1051G 43.37 1.32         osd.14        
 -5  7.39999        -  7428G  2957G  4470G 39.81 1.21     host node4    
 15  1.84999  0.79999  1857G   742G  1115G 39.96 1.22         osd.15        
 16  1.84999  1.00000  1857G   634G  1222G 34.15 1.04         osd.16        
 17  1.84999  0.89999  1857G   803G  1053G 43.26 1.32         osd.17        
 26  1.84999  0.81000  1857G   777G  1079G 41.89 1.28         osd.26        
                TOTAL 72488G 23770G 48718G 32.79                            
MIN/MAX VAR: 0.80/1.34  STDDEV: 6.56

PG dump

version 62735224
stamp 2018-12-03 13:02:52.799643
last_osdmap_epoch 11755
last_pg_scan 9537
full_ratio 0.95
nearfull_ratio 0.85
pg_stat objects mip degr misp unf bytes log disklog state state_stamp v reported up up_primary acting acting_primary last_scrub scrub_stamp last_deep_scrub deep_scrub_stamp

///active+clean ones removed///

1.11d 3359 0 0 3359 0 14031434258 3034 3034 active+remapped 2018-11-09 10:54:06.861873 11755'60561357 11755:50924695 [6] 6 [6,26] 6 449'16483 2016-09-14 15:25:14.228231 448'16277 2016-09-13 06:11:45.633007

///active+clean ones removed///

pool 1 1738101 0 0 3359 0 7253679601802 1562466 1562466
pool 2 881071 0 0 0 0 3682217717410 1561924 1561924
 sum 2619172 0 0 3359 0 10935897319212 3124390 3124390
osdstat kbused kbavail kb hb in hb out
0 803034868 1144251736 1947286604 [1,2,3,4,5,10,12,13,14,15,16,26] []
1 663754736 1283531868 1947286604 [0,2,3,5,10,12,13,15,16,17,26] []
2 803619260 1143667344 1947286604 [0,1,3,4,5,6,7,8,13,14,15,16,17,26] []
3 707438640 1239847964 1947286604 [0,1,2,4,5,6,7,8,12,13,14,15,17] []
4 718194072 1229092532 1947286604 [2,3,5,6,8,10,11,15,16,17,26] []
5 664279112 1283007492 1947286604 [0,1,2,3,4,6,8,10,11,15,17,26] []
6 815455088 1131831516 1947286604 [2,4,5,7,9,10,11,12,13,14,15,16,17,26] []
7 789396940 1157889664 1947286604 [2,4,6,8,9,10,11,12,13,14,15,16,17,26] []
8 526871252 1420415352 1947286604 [2,3,4,5,7,9,10,11,12,13,15,16,17] []
9 609147992 1338138612 1947286604 [0,1,4,7,8,10,13,14,15,16,17,26] []
10 854451916 1092834688 1947286604 [0,1,4,5,7,8,9,11,12,13,14,15,16,17,26] []
11 760893328 1186393276 1947286604 [1,4,5,6,7,8,10,12,13,14,15,16,17,26] []
12 730109256 1217177348 1947286604 [0,6,7,8,9,10,11,13,15,16,17,26] []
13 777029008 1170257596 1947286604 [0,1,2,3,6,7,8,9,10,11,12,14,15,16] []
14 844469760 1102816844 1947286604 [0,1,2,3,6,7,9,10,11,13,15,16,17,26] []
15 778122444 1169164160 1947286604 [0,2,3,4,6,9,10,11,14,16] []
16 664960388 1282326216 1947286604 [1,2,3,4,5,6,7,8,10,11,12,13,15,17] []
17 842428012 1104858592 1947286604 [0,1,2,3,4,5,8,9,10,11,12,13,14,16,18] []
18 1285869748 3590537232 4876406980 [0,1,17,20,21,22,23,24,25,26] []
19 1368764192 3507642788 4876406980 [0,1,18,20,21,22,23,24,25,26] []
20 1427417120 3448989860 4876406980 [4,17,18,19,21,22,23,24,25,26] []
21 1367928664 3508478316 4876406980 [4,5,18,19,20,22,23,24,25,26] []
22 1464361956 3412045024 4876406980 [4,5,18,19,20,21,23,24,25,26] []
23 1292415092 3583991888 4876406980 [4,5,18,19,20,21,22,24,25,26] []
24 1277731204 3598675776 4876406980 [4,5,18,19,20,21,22,23,25,26] []
25 1272703828 3603703152 4876406980 [4,5,18,19,20,21,22,23,24,26] []
26 815682292 1131604312 1947286604 [0,1,2,3,4,5,6,7,8,10,11,12,13,25] []

Ceph health detail

HEALTH_WARN 1 pgs stuck unclean; recovery 3359/6120420 objects misplaced (0.055%); noout,nodeep-scrub flag(s) set
pg 1.11d is stuck unclean for 2081576.511195, current state active+remapped, last acting [6,26]
recovery 3359/6120420 objects misplaced (0.055%)
noout,nodeep-scrub flag(s) set

Ceph version

ceph version 0.94.10 (b1e0532418e4631af01acbc0cedd426f1905f4af)

Regards,
Nasos Pan

Attachment: ceph.health.detail
Description: ceph.health.detail

Attachment: ceph.status
Description: ceph.status

Attachment: osd.tree
Description: osd.tree

Attachment: pg.dump
Description: pg.dump

Attachment: pg.query
Description: pg.query

_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com

[Index of Archives]     [Information on CEPH]     [Linux Filesystem Development]     [Ceph Development]     [Ceph Large]     [Ceph Dev]     [Linux USB Development]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [xfs]


  Powered by Linux