According to the output you provided previously, OSDs 51 and 90 might have unfound objects (both show up under "might_have_unfound"). To get the PG to re-peer and probe those OSDs again, you could do:

# ceph osd set noout; ceph osd set nodown
# systemctl restart ceph-osd@51
  * wait for OSD.51's process to be up
# systemctl restart ceph-osd@90
  * wait for OSD.90's process to be up
# ceph osd unset noout; ceph osd unset nodown

If your cluster is large, please do this very slowly, one OSD at a time (a rough scripted sketch of the pacing is at the end of this message).

> "might_have_unfound": [
>     {
>         "osd": "51",
>         "status": "already probed"
>     },
>     {
>         "osd": "90",
>         "status": "already probed"
>     }

Or you could trigger scrubbing of the PG again:

# ceph pg scrub ${pg_id}

Regards,

On Tue, Jan 10, 2017 at 4:25 AM, Andras Pataki <apataki@xxxxxxxxxxxxxxxxxxxx> wrote:
> Yes, it doesn't cause issues, but I don't see any way to "repair" the
> problem. One possible idea that I might do eventually, if no solution is
> found, is to copy the CephFS files in question and remove the ones with
> inconsistencies (which should remove the underlying rados objects). But
> it'd perhaps be good to do some searching on how/why this problem came
> about before doing this.
>
> andras
>
> On 01/07/2017 06:48 PM, Shinobu Kinjo wrote:
>>
>> Sorry for the late reply.
>>
>> Are you still facing the inconsistent pg status?
>>
>> On Wed, Jan 4, 2017 at 11:39 PM, Andras Pataki
>> <apataki@xxxxxxxxxxxxxxxxxxxx> wrote:
>>>
>>> # ceph pg debug unfound_objects_exist
>>> FALSE
>>>
>>> Andras
>>>
>>> On 01/03/2017 11:38 PM, Shinobu Kinjo wrote:
>>>>
>>>> Would you run:
>>>>
>>>> # ceph pg debug unfound_objects_exist
>>>>
>>>> On Wed, Jan 4, 2017 at 5:31 AM, Andras Pataki
>>>> <apataki@xxxxxxxxxxxxxxxxxxxx> wrote:
>>>>>
>>>>> Here is the output of ceph pg query for one of the
>>>>> active+clean+inconsistent PGs:
>>>>>
>>>>> { >>>>> "state": "active+clean+inconsistent", >>>>> "snap_trimq": "[]", >>>>> "epoch": 342982, >>>>> "up": [ >>>>> 319, >>>>> 90, >>>>> 51 >>>>> ], >>>>> "acting": [ >>>>> 319, >>>>> 90, >>>>> 51 >>>>> ], >>>>> "actingbackfill": [ >>>>> "51", >>>>> "90", >>>>> "319" >>>>> ], >>>>> "info": { >>>>> "pgid": "6.92c", >>>>> "last_update": "342982'41304", >>>>> "last_complete": "342982'41304", >>>>> "log_tail": "342980'38259", >>>>> "last_user_version": 41304, >>>>> "last_backfill": "MAX", >>>>> "last_backfill_bitwise": 0, >>>>> "purged_snaps": "[]", >>>>> "history": { >>>>> "epoch_created": 262553, >>>>> "last_epoch_started": 342598, >>>>> "last_epoch_clean": 342613, >>>>> "last_epoch_split": 0, >>>>> "last_epoch_marked_full": 0, >>>>> "same_up_since": 342596, >>>>> "same_interval_since": 342597, >>>>> "same_primary_since": 342597, >>>>> "last_scrub": "342982'41177", >>>>> "last_scrub_stamp": "2017-01-02 18:19:48.081750", >>>>> "last_deep_scrub": "342965'37465", >>>>> "last_deep_scrub_stamp": "2016-12-20 16:31:06.438823", >>>>> "last_clean_scrub_stamp": "2016-12-11 12:51:19.258816" >>>>> }, >>>>> "stats": { >>>>> "version": "342982'41304", >>>>> "reported_seq": "43600", >>>>> "reported_epoch": "342982", >>>>> "state": "active+clean+inconsistent", >>>>> "last_fresh": "2017-01-03 15:27:15.075176", >>>>> "last_change": "2017-01-02 18:19:48.081806", >>>>> "last_active": "2017-01-03 15:27:15.075176", >>>>> "last_peered": "2017-01-03 15:27:15.075176", >>>>> "last_clean": "2017-01-03 15:27:15.075176", >>>>> "last_became_active": "2016-11-01 16:21:23.328639", >>>>> "last_became_peered": "2016-11-01 16:21:23.328639", >>>>> "last_unstale": "2017-01-03 15:27:15.075176", >>>>> "last_undegraded": "2017-01-03 15:27:15.075176", >>>>> "last_fullsized": "2017-01-03 15:27:15.075176",
"mapping_epoch": 342596, >>>>> "log_start": "342980'38259", >>>>> "ondisk_log_start": "342980'38259", >>>>> "created": 262553, >>>>> "last_epoch_clean": 342613, >>>>> "parent": "0.0", >>>>> "parent_split_bits": 0, >>>>> "last_scrub": "342982'41177", >>>>> "last_scrub_stamp": "2017-01-02 18:19:48.081750", >>>>> "last_deep_scrub": "342965'37465", >>>>> "last_deep_scrub_stamp": "2016-12-20 16:31:06.438823", >>>>> "last_clean_scrub_stamp": "2016-12-11 12:51:19.258816", >>>>> "log_size": 3045, >>>>> "ondisk_log_size": 3045, >>>>> "stats_invalid": false, >>>>> "dirty_stats_invalid": false, >>>>> "omap_stats_invalid": false, >>>>> "hitset_stats_invalid": false, >>>>> "hitset_bytes_stats_invalid": false, >>>>> "pin_stats_invalid": true, >>>>> "stat_sum": { >>>>> "num_bytes": 16929346269, >>>>> "num_objects": 4881, >>>>> "num_object_clones": 0, >>>>> "num_object_copies": 14643, >>>>> "num_objects_missing_on_primary": 0, >>>>> "num_objects_missing": 0, >>>>> "num_objects_degraded": 0, >>>>> "num_objects_misplaced": 0, >>>>> "num_objects_unfound": 0, >>>>> "num_objects_dirty": 4881, >>>>> "num_whiteouts": 0, >>>>> "num_read": 7592, >>>>> "num_read_kb": 19593996, >>>>> "num_write": 42541, >>>>> "num_write_kb": 47306915, >>>>> "num_scrub_errors": 1, >>>>> "num_shallow_scrub_errors": 1, >>>>> "num_deep_scrub_errors": 0, >>>>> "num_objects_recovered": 5807, >>>>> "num_bytes_recovered": 22691211916, >>>>> "num_keys_recovered": 0, >>>>> "num_objects_omap": 0, >>>>> "num_objects_hit_set_archive": 0, >>>>> "num_bytes_hit_set_archive": 0, >>>>> "num_flush": 0, >>>>> "num_flush_kb": 0, >>>>> "num_evict": 0, >>>>> "num_evict_kb": 0, >>>>> "num_promote": 0, >>>>> "num_flush_mode_high": 0, >>>>> "num_flush_mode_low": 0, >>>>> "num_evict_mode_some": 0, >>>>> "num_evict_mode_full": 0, >>>>> "num_objects_pinned": 0 >>>>> }, >>>>> "up": [ >>>>> 319, >>>>> 90, >>>>> 51 >>>>> ], >>>>> "acting": [ >>>>> 319, >>>>> 90, >>>>> 51 >>>>> ], >>>>> "blocked_by": [], >>>>> "up_primary": 319, >>>>> "acting_primary": 319 >>>>> }, >>>>> "empty": 0, >>>>> "dne": 0, >>>>> "incomplete": 0, >>>>> "last_epoch_started": 342598, >>>>> "hit_set_history": { >>>>> "current_last_update": "0'0", >>>>> "history": [] >>>>> } >>>>> }, >>>>> "peer_info": [ >>>>> { >>>>> "peer": "51", >>>>> "pgid": "6.92c", >>>>> "last_update": "342982'41304", >>>>> "last_complete": "342982'41304", >>>>> "log_tail": "341563'12014", >>>>> "last_user_version": 15033, >>>>> "last_backfill": "MAX", >>>>> "last_backfill_bitwise": 0, >>>>> "purged_snaps": "[]", >>>>> "history": { >>>>> "epoch_created": 262553, >>>>> "last_epoch_started": 342598, >>>>> "last_epoch_clean": 342613, >>>>> "last_epoch_split": 0, >>>>> "last_epoch_marked_full": 0, >>>>> "same_up_since": 342596, >>>>> "same_interval_since": 342597, >>>>> "same_primary_since": 342597, >>>>> "last_scrub": "342982'41177", >>>>> "last_scrub_stamp": "2017-01-02 18:19:48.081750", >>>>> "last_deep_scrub": "342965'37465", >>>>> "last_deep_scrub_stamp": "2016-12-20 >>>>> 16:31:06.438823", >>>>> "last_clean_scrub_stamp": "2016-12-11 >>>>> 12:51:19.258816" >>>>> }, >>>>> "stats": { >>>>> "version": "342541'15032", >>>>> "reported_seq": "21472", >>>>> "reported_epoch": "342597", >>>>> "state": "active+undersized+degraded", >>>>> "last_fresh": "2016-11-01 16:05:44.991004", >>>>> "last_change": "2016-11-01 16:05:44.990630", >>>>> "last_active": "2016-11-01 16:05:44.991004", >>>>> "last_peered": "2016-11-01 16:05:44.991004", >>>>> "last_clean": "2016-11-01 15:26:23.393984", >>>>> "last_became_active": "2016-11-01 
16:05:44.990630", >>>>> "last_became_peered": "2016-11-01 16:05:44.990630", >>>>> "last_unstale": "2016-11-01 16:05:44.991004", >>>>> "last_undegraded": "2016-11-01 16:05:44.021269", >>>>> "last_fullsized": "2016-11-01 16:05:44.021269", >>>>> "mapping_epoch": 342596, >>>>> "log_start": "341563'12014", >>>>> "ondisk_log_start": "341563'12014", >>>>> "created": 262553, >>>>> "last_epoch_clean": 342587, >>>>> "parent": "0.0", >>>>> "parent_split_bits": 0, >>>>> "last_scrub": "342266'14514", >>>>> "last_scrub_stamp": "2016-10-28 16:41:06.563820", >>>>> "last_deep_scrub": "342266'14514", >>>>> "last_deep_scrub_stamp": "2016-10-28 >>>>> 16:41:06.563820", >>>>> "last_clean_scrub_stamp": "2016-10-28 >>>>> 16:41:06.563820", >>>>> "log_size": 3018, >>>>> "ondisk_log_size": 3018, >>>>> "stats_invalid": false, >>>>> "dirty_stats_invalid": false, >>>>> "omap_stats_invalid": false, >>>>> "hitset_stats_invalid": false, >>>>> "hitset_bytes_stats_invalid": false, >>>>> "pin_stats_invalid": true, >>>>> "stat_sum": { >>>>> "num_bytes": 12528581359, >>>>> "num_objects": 3562, >>>>> "num_object_clones": 0, >>>>> "num_object_copies": 10683, >>>>> "num_objects_missing_on_primary": 0, >>>>> "num_objects_missing": 0, >>>>> "num_objects_degraded": 3561, >>>>> "num_objects_misplaced": 0, >>>>> "num_objects_unfound": 0, >>>>> "num_objects_dirty": 3562, >>>>> "num_whiteouts": 0, >>>>> "num_read": 3678, >>>>> "num_read_kb": 10197642, >>>>> "num_write": 15656, >>>>> "num_write_kb": 19564203, >>>>> "num_scrub_errors": 0, >>>>> "num_shallow_scrub_errors": 0, >>>>> "num_deep_scrub_errors": 0, >>>>> "num_objects_recovered": 5806, >>>>> "num_bytes_recovered": 22687335556, >>>>> "num_keys_recovered": 0, >>>>> "num_objects_omap": 0, >>>>> "num_objects_hit_set_archive": 0, >>>>> "num_bytes_hit_set_archive": 0, >>>>> "num_flush": 0, >>>>> "num_flush_kb": 0, >>>>> "num_evict": 0, >>>>> "num_evict_kb": 0, >>>>> "num_promote": 0, >>>>> "num_flush_mode_high": 0, >>>>> "num_flush_mode_low": 0, >>>>> "num_evict_mode_some": 0, >>>>> "num_evict_mode_full": 0, >>>>> "num_objects_pinned": 0 >>>>> }, >>>>> "up": [ >>>>> 319, >>>>> 90, >>>>> 51 >>>>> ], >>>>> "acting": [ >>>>> 319, >>>>> 90, >>>>> 51 >>>>> ], >>>>> "blocked_by": [], >>>>> "up_primary": 319, >>>>> "acting_primary": 319 >>>>> }, >>>>> "empty": 0, >>>>> "dne": 0, >>>>> "incomplete": 0, >>>>> "last_epoch_started": 342598, >>>>> "hit_set_history": { >>>>> "current_last_update": "0'0", >>>>> "history": [] >>>>> } >>>>> }, >>>>> { >>>>> "peer": "90", >>>>> "pgid": "6.92c", >>>>> "last_update": "342982'41304", >>>>> "last_complete": "342982'41304", >>>>> "log_tail": "341563'12014", >>>>> "last_user_version": 15033, >>>>> "last_backfill": "MAX", >>>>> "last_backfill_bitwise": 0, >>>>> "purged_snaps": "[]", >>>>> "history": { >>>>> "epoch_created": 262553, >>>>> "last_epoch_started": 342598, >>>>> "last_epoch_clean": 342613, >>>>> "last_epoch_split": 0, >>>>> "last_epoch_marked_full": 0, >>>>> "same_up_since": 342596, >>>>> "same_interval_since": 342597, >>>>> "same_primary_since": 342597, >>>>> "last_scrub": "342982'41177", >>>>> "last_scrub_stamp": "2017-01-02 18:19:48.081750", >>>>> "last_deep_scrub": "342965'37465", >>>>> "last_deep_scrub_stamp": "2016-12-20 >>>>> 16:31:06.438823", >>>>> "last_clean_scrub_stamp": "2016-12-11 >>>>> 12:51:19.258816" >>>>> }, >>>>> "stats": { >>>>> "version": "342589'15033", >>>>> "reported_seq": "21478", >>>>> "reported_epoch": "342596", >>>>> "state": "remapped+peering", >>>>> "last_fresh": "2016-11-01 16:21:20.584113", >>>>> "last_change": 
"2016-11-01 16:21:20.295685", >>>>> "last_active": "2016-11-01 16:14:02.694748", >>>>> "last_peered": "2016-11-01 16:14:02.694748", >>>>> "last_clean": "2016-11-01 15:26:23.393984", >>>>> "last_became_active": "2016-11-01 16:05:44.990630", >>>>> "last_became_peered": "2016-11-01 16:05:44.990630", >>>>> "last_unstale": "2016-11-01 16:21:20.584113", >>>>> "last_undegraded": "2016-11-01 16:21:20.584113", >>>>> "last_fullsized": "2016-11-01 16:21:20.584113", >>>>> "mapping_epoch": 342596, >>>>> "log_start": "341563'12014", >>>>> "ondisk_log_start": "341563'12014", >>>>> "created": 262553, >>>>> "last_epoch_clean": 342587, >>>>> "parent": "0.0", >>>>> "parent_split_bits": 0, >>>>> "last_scrub": "342266'14514", >>>>> "last_scrub_stamp": "2016-10-28 16:41:06.563820", >>>>> "last_deep_scrub": "342266'14514", >>>>> "last_deep_scrub_stamp": "2016-10-28 >>>>> 16:41:06.563820", >>>>> "last_clean_scrub_stamp": "2016-10-28 >>>>> 16:41:06.563820", >>>>> "log_size": 3019, >>>>> "ondisk_log_size": 3019, >>>>> "stats_invalid": false, >>>>> "dirty_stats_invalid": false, >>>>> "omap_stats_invalid": false, >>>>> "hitset_stats_invalid": false, >>>>> "hitset_bytes_stats_invalid": false, >>>>> "pin_stats_invalid": true, >>>>> "stat_sum": { >>>>> "num_bytes": 12528581359, >>>>> "num_objects": 3562, >>>>> "num_object_clones": 0, >>>>> "num_object_copies": 10686, >>>>> "num_objects_missing_on_primary": 0, >>>>> "num_objects_missing": 0, >>>>> "num_objects_degraded": 0, >>>>> "num_objects_misplaced": 0, >>>>> "num_objects_unfound": 0, >>>>> "num_objects_dirty": 3562, >>>>> "num_whiteouts": 0, >>>>> "num_read": 3678, >>>>> "num_read_kb": 10197642, >>>>> "num_write": 15656, >>>>> "num_write_kb": 19564203, >>>>> "num_scrub_errors": 0, >>>>> "num_shallow_scrub_errors": 0, >>>>> "num_deep_scrub_errors": 0, >>>>> "num_objects_recovered": 5806, >>>>> "num_bytes_recovered": 22687335556, >>>>> "num_keys_recovered": 0, >>>>> "num_objects_omap": 0, >>>>> "num_objects_hit_set_archive": 0, >>>>> "num_bytes_hit_set_archive": 0, >>>>> "num_flush": 0, >>>>> "num_flush_kb": 0, >>>>> "num_evict": 0, >>>>> "num_evict_kb": 0, >>>>> "num_promote": 0, >>>>> "num_flush_mode_high": 0, >>>>> "num_flush_mode_low": 0, >>>>> "num_evict_mode_some": 0, >>>>> "num_evict_mode_full": 0, >>>>> "num_objects_pinned": 0 >>>>> }, >>>>> "up": [ >>>>> 319, >>>>> 90, >>>>> 51 >>>>> ], >>>>> "acting": [ >>>>> 319, >>>>> 90, >>>>> 51 >>>>> ], >>>>> "blocked_by": [], >>>>> "up_primary": 319, >>>>> "acting_primary": 319 >>>>> }, >>>>> "empty": 0, >>>>> "dne": 0, >>>>> "incomplete": 0, >>>>> "last_epoch_started": 342598, >>>>> "hit_set_history": { >>>>> "current_last_update": "0'0", >>>>> "history": [] >>>>> } >>>>> } >>>>> ], >>>>> "recovery_state": [ >>>>> { >>>>> "name": "Started\/Primary\/Active", >>>>> "enter_time": "2016-11-01 16:21:23.007072", >>>>> "might_have_unfound": [ >>>>> { >>>>> "osd": "51", >>>>> "status": "already probed" >>>>> }, >>>>> { >>>>> "osd": "90", >>>>> "status": "already probed" >>>>> } >>>>> ], >>>>> "recovery_progress": { >>>>> "backfill_targets": [], >>>>> "waiting_on_backfill": [], >>>>> "last_backfill_started": "MIN", >>>>> "backfill_info": { >>>>> "begin": "MIN", >>>>> "end": "MIN", >>>>> "objects": [] >>>>> }, >>>>> "peer_backfill_info": [], >>>>> "backfills_in_flight": [], >>>>> "recovering": [], >>>>> "pg_backend": { >>>>> "pull_from_peer": [], >>>>> "pushing": [] >>>>> } >>>>> }, >>>>> "scrub": { >>>>> "scrubber.epoch_start": "342597", >>>>> "scrubber.active": 0, >>>>> "scrubber.state": "INACTIVE", >>>>> "scrubber.start": 
"MIN", >>>>> "scrubber.end": "MIN", >>>>> "scrubber.subset_last_update": "0'0", >>>>> "scrubber.deep": false, >>>>> "scrubber.seed": 0, >>>>> "scrubber.waiting_on": 0, >>>>> "scrubber.waiting_on_whom": [] >>>>> } >>>>> }, >>>>> { >>>>> "name": "Started", >>>>> "enter_time": "2016-11-01 16:21:21.763033" >>>>> } >>>>> ], >>>>> "agent_state": {} >>>>> } >>>>> >>>>> >>>>> Andras >>>>> >>>>> >>>>> >>>>> On 12/23/2016 01:27 AM, Shinobu Kinjo wrote: >>>>>> >>>>>> Would you be able to execute ``ceph pg ${PG ID} query`` against that >>>>>> particular PG? >>>>>> >>>>>> On Wed, Dec 21, 2016 at 11:44 PM, Andras Pataki >>>>>> <apataki@xxxxxxxxxxxxxxxxxxxx> wrote: >>>>>>> >>>>>>> Yes, size = 3, and I have checked that all three replicas are the >>>>>>> same >>>>>>> zero >>>>>>> length object on the disk. I think some metadata info is mismatching >>>>>>> what >>>>>>> the OSD log refers to as "object info size". But I'm not sure what >>>>>>> to >>>>>>> do >>>>>>> about it. pg repair does not fix it. In fact, the file this object >>>>>>> corresponds to in CephFS is shorter so this chunk shouldn't even >>>>>>> exist >>>>>>> I >>>>>>> think (details are in the original email). Although I may be >>>>>>> understanding >>>>>>> the situation wrong ... >>>>>>> >>>>>>> Andras >>>>>>> >>>>>>> >>>>>>> On 12/21/2016 07:17 AM, Mehmet wrote: >>>>>>> >>>>>>> Hi Andras, >>>>>>> >>>>>>> Iam not the experienced User but i guess you could have a look on >>>>>>> this >>>>>>> object on each related osd for the pg, compare them and delete the >>>>>>> Different >>>>>>> object. I assume you have size = 3. >>>>>>> >>>>>>> Then again pg repair. >>>>>>> >>>>>>> But be carefull iirc the replica will be recovered from the primary >>>>>>> pg. >>>>>>> >>>>>>> Hth >>>>>>> >>>>>>> Am 20. Dezember 2016 22:39:44 MEZ, schrieb Andras Pataki >>>>>>> <apataki@xxxxxxxxxxxxxxxxxxxx>: >>>>>>>> >>>>>>>> Hi cephers, >>>>>>>> >>>>>>>> Any ideas on how to proceed on the inconsistencies below? At the >>>>>>>> moment >>>>>>>> our ceph setup has 5 of these - in all cases it seems like some zero >>>>>>>> length >>>>>>>> objects that match across the three replicas, but do not match the >>>>>>>> object >>>>>>>> info size. I tried running pg repair on one of them, but it didn't >>>>>>>> repair >>>>>>>> the problem: >>>>>>>> >>>>>>>> 2016-12-20 16:24:40.870307 7f3e1a4b1700 0 log_channel(cluster) log >>>>>>>> [INF] >>>>>>>> : 6.92c repair starts >>>>>>>> 2016-12-20 16:27:06.183186 7f3e1a4b1700 -1 log_channel(cluster) log >>>>>>>> [ERR] >>>>>>>> : repair 6.92c 6:34932257:::1000187bbb5.00000009:head on disk size >>>>>>>> (0) >>>>>>>> does >>>>>>>> not match object info size (3014656) adjusted for ondisk to >>>>>>>> (3014656) >>>>>>>> 2016-12-20 16:27:35.885496 7f3e17cac700 -1 log_channel(cluster) log >>>>>>>> [ERR] >>>>>>>> : 6.92c repair 1 errors, 0 fixed >>>>>>>> >>>>>>>> >>>>>>>> Any help/hints would be appreciated. >>>>>>>> >>>>>>>> Thanks, >>>>>>>> >>>>>>>> Andras >>>>>>>> >>>>>>>> >>>>>>>> On 12/15/2016 10:13 AM, Andras Pataki wrote: >>>>>>>> >>>>>>>> Hi everyone, >>>>>>>> >>>>>>>> Yesterday scrubbing turned up an inconsistency in one of our >>>>>>>> placement >>>>>>>> groups. We are running ceph 10.2.3, using CephFS and RBD for some >>>>>>>> VM >>>>>>>> images. 
>>>>>>>>
>>>>>>>> [root@hyperv017 ~]# ceph -s
>>>>>>>>     cluster d7b33135-0940-4e48-8aa6-1d2026597c2f
>>>>>>>>      health HEALTH_ERR
>>>>>>>>             1 pgs inconsistent
>>>>>>>>             1 scrub errors
>>>>>>>>             noout flag(s) set
>>>>>>>>      monmap e15: 3 mons at
>>>>>>>> {hyperv029=10.4.36.179:6789/0,hyperv030=10.4.36.180:6789/0,hyperv031=10.4.36.181:6789/0}
>>>>>>>>             election epoch 27192, quorum 0,1,2 hyperv029,hyperv030,hyperv031
>>>>>>>>       fsmap e17181: 1/1/1 up {0=hyperv029=up:active}, 2 up:standby
>>>>>>>>      osdmap e342930: 385 osds: 385 up, 385 in
>>>>>>>>             flags noout
>>>>>>>>       pgmap v37580512: 34816 pgs, 5 pools, 673 TB data, 198 Mobjects
>>>>>>>>             1583 TB used, 840 TB / 2423 TB avail
>>>>>>>>                34809 active+clean
>>>>>>>>                    4 active+clean+scrubbing+deep
>>>>>>>>                    2 active+clean+scrubbing
>>>>>>>>                    1 active+clean+inconsistent
>>>>>>>>   client io 87543 kB/s rd, 671 MB/s wr, 23 op/s rd, 2846 op/s wr
>>>>>>>>
>>>>>>>> # ceph pg dump | grep inconsistent
>>>>>>>> 6.13f1  4692  0  0  0  0  16057314767  3087  3087
>>>>>>>> active+clean+inconsistent  2016-12-14 16:49:48.391572  342929'41011
>>>>>>>> 342929:43966  [158,215,364]  158  [158,215,364]  158  342928'40540
>>>>>>>> 2016-12-14 16:49:48.391511  342928'40540  2016-12-14 16:49:48.391511
>>>>>>>>
>>>>>>>> I tried a couple of other deep scrubs on pg 6.13f1 but got repeated
>>>>>>>> errors. In the OSD logs:
>>>>>>>>
>>>>>>>> 2016-12-14 16:48:07.733291 7f3b56e3a700 -1 log_channel(cluster) log [ERR]
>>>>>>>> : deep-scrub 6.13f1 6:8fc91b77:::1000187bb70.00000009:head on disk size (0)
>>>>>>>> does not match object info size (1835008) adjusted for ondisk to (1835008)
>>>>>>>>
>>>>>>>> I looked at the objects on the 3 OSD's on their respective hosts and
>>>>>>>> they are the same, zero length files:
>>>>>>>>
>>>>>>>> # cd ~ceph/osd/ceph-158/current/6.13f1_head
>>>>>>>> # find . -name *1000187bb70* -ls
>>>>>>>> 669738    0 -rw-r--r--   1 ceph  ceph  0 Dec 13 17:00
>>>>>>>> ./DIR_1/DIR_F/DIR_3/DIR_9/DIR_8/1000187bb70.00000009__head_EED893F1__6
>>>>>>>>
>>>>>>>> # cd ~ceph/osd/ceph-215/current/6.13f1_head
>>>>>>>> # find . -name *1000187bb70* -ls
>>>>>>>> 539815647    0 -rw-r--r--   1 ceph  ceph  0 Dec 13 17:00
>>>>>>>> ./DIR_1/DIR_F/DIR_3/DIR_9/DIR_8/1000187bb70.00000009__head_EED893F1__6
>>>>>>>>
>>>>>>>> # cd ~ceph/osd/ceph-364/current/6.13f1_head
>>>>>>>> # find . -name *1000187bb70* -ls
>>>>>>>> 1881432215    0 -rw-r--r--   1 ceph  ceph  0 Dec 13 17:00
>>>>>>>> ./DIR_1/DIR_F/DIR_3/DIR_9/DIR_8/1000187bb70.00000009__head_EED893F1__6
>>>>>>>>
>>>>>>>> At the time of the write, there wasn't anything unusual going on as
>>>>>>>> far as I can tell (no hardware/network issues, all processes were up, etc).
>>>>>>>>
>>>>>>>> This pool is a CephFS data pool, and the corresponding file (inode hex
>>>>>>>> 1000187bb70, decimal 1099537300336) looks like this:
>>>>>>>>
>>>>>>>> # ls -li chr4.tags.tsv
>>>>>>>> 1099537300336 -rw-r--r-- 1 xichen xichen 14469915 Dec 13 17:01 chr4.tags.tsv
>>>>>>>>
>>>>>>>> Reading the file is also ok (no errors, right number of bytes):
>>>>>>>> # cat chr4.tags.tsv > /dev/null
>>>>>>>> # wc chr4.tags.tsv
>>>>>>>>   592251  2961255 14469915 chr4.tags.tsv
>>>>>>>>
>>>>>>>> We are using the standard 4MB block size for CephFS, and if I
>>>>>>>> interpret this right, this is the 9th chunk, so there shouldn't be
>>>>>>>> any data (or even a 9th chunk), since the file is only 14MB.
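(A quick sanity check of that arithmetic, as a sketch only: it assumes the default 4 MiB CephFS object size mentioned above and the 14469915-byte size reported by ls; the shell variables are just for illustration.)

# file_bytes=14469915                  # size of chr4.tags.tsv from ls -li
# object_bytes=$((4 * 1024 * 1024))    # default CephFS object size, 4 MiB
# echo $(( (file_bytes - 1) / object_bytes ))   # highest chunk index that should exist
3
# echo $(( 9 * object_bytes ))                  # byte offset where chunk .00000009 would start
37748736

So only objects 1000187bb70.00000000 through .00000003 should exist for this file; an object with suffix .00000009 would start well past end-of-file, which is consistent with the zero-length object being spurious.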
>>>>>>>> Should I run pg repair on this? Any ideas on how this could come
>>>>>>>> about? Any other recommendations?
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>>
>>>>>>>> Andras
>>>>>>>> apataki@xxxxxxxxxxx
>>>>>>>>
>>>>>>>> ________________________________
>>>>>>>> ceph-users mailing list
>>>>>>>> ceph-users@xxxxxxxxxxxxxx
>>>>>>>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>>>>>>>
>>>>>>> _______________________________________________
>>>>>>> ceph-users mailing list
>>>>>>> ceph-users@xxxxxxxxxxxxxx
>>>>>>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>>>>>>>
>
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
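P.S. On the "please do this very slowly" advice at the top: one way to pace the restarts is to wait until each OSD is reported up again before touching the next one. A rough, untested sketch (it assumes systemd-managed OSDs named ceph-osd@<id>, as in the commands above, and that plain `ceph osd dump` prints one "osd.<id> up ..." line per OSD):

# ceph osd set noout; ceph osd set nodown
# for id in 51 90; do
>     systemctl restart ceph-osd@${id}
>     # block until the cluster reports this OSD as up again
>     until ceph osd dump | grep -Eq "^osd\.${id} +up "; do
>         sleep 10
>     done
> done
# ceph osd unset noout; ceph osd unset nodown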