_______________________________________________Hi Christian,
You haven't resharded any of your buckets have you? You can run the command below in v12.2.11 to list stale bucket instances.
radosgw-admin reshard stale-instances list
Can you also send the output from the following command on each rgw?
radosgw-admin period get
From: Christian Rice <crice@xxxxxxxxxxx>
Sent: Tuesday, March 5, 2019 1:46 AM
To: Matthew H; ceph-users
Subject: Re: radosgw sync falling behind regularly
Sure thing.
sv5-ceph-rgw1
zonegroup get
{
"id": "de6af748-1a2f-44a1-9d44-30799cf1313e",
"name": "us",
"api_name": "us",
"is_master": "true",
"endpoints": [
"http://sv5-ceph-rgw1.savagebeast.com:8080"
],
"hostnames": [],
"hostnames_s3website": [],
"master_zone": "1e27bf9c-3a2f-4845-85b6-33a24bbe1c04",
"zones": [
{
"id": "107d29a0-b732-4bf1-a26e-1f64f820e839",
"name": "dc11-prod",
"endpoints": [
],
"log_meta": "false",
"log_data": "true",
"bucket_index_max_shards": 0,
"read_only": "false",
"tier_type": "",
"sync_from_all": "true",
"sync_from": []
},
{
"id": "1e27bf9c-3a2f-4845-85b6-33a24bbe1c04",
"name": "sv5-corp",
"endpoints": [
"http://sv5-ceph-rgw1.savagebeast.com:8080"
],
"log_meta": "false",
"log_data": "true",
"bucket_index_max_shards": 0,
"read_only": "false",
"tier_type": "",
"sync_from_all": "true",
"sync_from": []
},
{
"id": "331d3f1e-1b72-4c56-bb5a-d1d0fcf6d0b8",
"name": "sv3-prod",
"endpoints": [
],
"log_meta": "false",
"log_data": "true",
"bucket_index_max_shards": 0,
"read_only": "false",
"tier_type": "",
"sync_from_all": "true",
"sync_from": []
}
],
"placement_targets": [
{
"name": "default-placement",
"tags": []
}
],
"default_placement": "default-placement",
"realm_id": "b3e2afe7-2254-494a-9a34-ce50358779fd"
}
zone get
{
"id": "1e27bf9c-3a2f-4845-85b6-33a24bbe1c04",
"name": "sv5-corp",
"domain_root": "sv5-corp.rgw.meta:root",
"control_pool": "sv5-corp.rgw.control",
"gc_pool": "sv5-corp.rgw.log:gc",
"lc_pool": "sv5-corp.rgw.log:lc",
"log_pool": "sv5-corp.rgw.log",
"intent_log_pool": "sv5-corp.rgw.log:intent",
"usage_log_pool": "sv5-corp.rgw.log:usage",
"reshard_pool": "sv5-corp.rgw.log:reshard",
"user_keys_pool": "sv5-corp.rgw.meta:users.keys",
"user_email_pool": "sv5-corp.rgw.meta:users.email",
"user_swift_pool": "sv5-corp.rgw.meta:users.swift",
"user_uid_pool": "sv5-corp.rgw.meta:users.uid",
"system_key": {
"access_key": "access_key_redacted",
"secret_key": "secret_key_redacted"
},
"placement_pools": [
{
"key": "default-placement",
"val": {
"index_pool": "sv5-corp.rgw.buckets.index",
"data_pool": "sv5-corp.rgw.buckets.data",
"data_extra_pool": "sv5-corp.rgw.buckets.non-ec",
"index_type": 0,
"compression": ""
}
}
],
"metadata_heap": "",
"tier_config": [],
"realm_id": "b3e2afe7-2254-494a-9a34-ce50358779fd"
}
sv3-ceph-rgw1
zonegroup get
{
"id": "de6af748-1a2f-44a1-9d44-30799cf1313e",
"name": "us",
"api_name": "us",
"is_master": "true",
"endpoints": [
"http://sv5-ceph-rgw1.savagebeast.com:8080"
],
"hostnames": [],
"hostnames_s3website": [],
"master_zone": "1e27bf9c-3a2f-4845-85b6-33a24bbe1c04",
"zones": [
{
"id": "107d29a0-b732-4bf1-a26e-1f64f820e839",
"name": "dc11-prod",
"endpoints": [
],
"log_meta": "false",
"log_data": "true",
"bucket_index_max_shards": 0,
"read_only": "false",
"tier_type": "",
"sync_from_all": "true",
"sync_from": []
},
{
"id": "1e27bf9c-3a2f-4845-85b6-33a24bbe1c04",
"name": "sv5-corp",
"endpoints": [
"http://sv5-ceph-rgw1.savagebeast.com:8080"
],
"log_meta": "false",
"log_data": "true",
"bucket_index_max_shards": 0,
"read_only": "false",
"tier_type": "",
"sync_from_all": "true",
"sync_from": []
},
{
"id": "331d3f1e-1b72-4c56-bb5a-d1d0fcf6d0b8",
"name": "sv3-prod",
"endpoints": [
],
"log_meta": "false",
"log_data": "true",
"bucket_index_max_shards": 0,
"read_only": "false",
"tier_type": "",
"sync_from_all": "true",
"sync_from": []
}
],
"placement_targets": [
{
"name": "default-placement",
"tags": []
}
],
"default_placement": "default-placement",
"realm_id": "b3e2afe7-2254-494a-9a34-ce50358779fd"
}
zone get
{
"id": "331d3f1e-1b72-4c56-bb5a-d1d0fcf6d0b8",
"name": "sv3-prod",
"domain_root": "sv3-prod.rgw.meta:root",
"control_pool": "sv3-prod.rgw.control",
"gc_pool": "sv3-prod.rgw.log:gc",
"lc_pool": "sv3-prod.rgw.log:lc",
"log_pool": "sv3-prod.rgw.log",
"intent_log_pool": "sv3-prod.rgw.log:intent",
"usage_log_pool": "sv3-prod.rgw.log:usage",
"reshard_pool": "sv3-prod.rgw.log:reshard",
"user_keys_pool": "sv3-prod.rgw.meta:users.keys",
"user_email_pool": "sv3-prod.rgw.meta:users.email",
"user_swift_pool": "sv3-prod.rgw.meta:users.swift",
"user_uid_pool": "sv3-prod.rgw.meta:users.uid",
"system_key": {
"access_key": "access_key_redacted",
"secret_key": "secret_key_redacted"
},
"placement_pools": [
{
"key": "default-placement",
"val": {
"index_pool": "sv3-prod.rgw.buckets.index",
"data_pool": "sv3-prod.rgw.buckets.data",
"data_extra_pool": "sv3-prod.rgw.buckets.non-ec",
"index_type": 0,
"compression": ""
}
}
],
"metadata_heap": "",
"tier_config": [],
"realm_id": "b3e2afe7-2254-494a-9a34-ce50358779fd"
}
dc11-ceph-rgw1
zonegroup get
{
"id": "de6af748-1a2f-44a1-9d44-30799cf1313e",
"name": "us",
"api_name": "us",
"is_master": "true",
"endpoints": [
"http://sv5-ceph-rgw1.savagebeast.com:8080"
],
"hostnames": [],
"hostnames_s3website": [],
"master_zone": "1e27bf9c-3a2f-4845-85b6-33a24bbe1c04",
"zones": [
{
"id": "107d29a0-b732-4bf1-a26e-1f64f820e839",
"name": "dc11-prod",
"endpoints": [
],
"log_meta": "false",
"log_data": "true",
"bucket_index_max_shards": 0,
"read_only": "false",
"tier_type": "",
"sync_from_all": "true",
"sync_from": []
},
{
"id": "1e27bf9c-3a2f-4845-85b6-33a24bbe1c04",
"name": "sv5-corp",
"endpoints": [
"http://sv5-ceph-rgw1.savagebeast.com:8080"
],
"log_meta": "false",
"log_data": "true",
"bucket_index_max_shards": 0,
"read_only": "false",
"tier_type": "",
"sync_from_all": "true",
"sync_from": []
},
{
"id": "331d3f1e-1b72-4c56-bb5a-d1d0fcf6d0b8",
"name": "sv3-prod",
"endpoints": [
],
"log_meta": "false",
"log_data": "true",
"bucket_index_max_shards": 0,
"read_only": "false",
"tier_type": "",
"sync_from_all": "true",
"sync_from": []
}
],
"placement_targets": [
{
"name": "default-placement",
"tags": []
}
],
"default_placement": "default-placement",
"realm_id": "b3e2afe7-2254-494a-9a34-ce50358779fd"
}
zone get
{
"id": "107d29a0-b732-4bf1-a26e-1f64f820e839",
"name": "dc11-prod",
"domain_root": "dc11-prod.rgw.meta:root",
"control_pool": "dc11-prod.rgw.control",
"gc_pool": "dc11-prod.rgw.log:gc",
"lc_pool": "dc11-prod.rgw.log:lc",
"log_pool": "dc11-prod.rgw.log",
"intent_log_pool": "dc11-prod.rgw.log:intent",
"usage_log_pool": "dc11-prod.rgw.log:usage",
"reshard_pool": "dc11-prod.rgw.log:reshard",
"user_keys_pool": "dc11-prod.rgw.meta:users.keys",
"user_email_pool": "dc11-prod.rgw.meta:users.email",
"user_swift_pool": "dc11-prod.rgw.meta:users.swift",
"user_uid_pool": "dc11-prod.rgw.meta:users.uid",
"system_key": {
"access_key": "access_key_redacted",
"secret_key": "secret_key_redacted"
},
"placement_pools": [
{
"key": "default-placement",
"val": {
"index_pool": "dc11-prod.rgw.buckets.index",
"data_pool": "dc11-prod.rgw.buckets.data",
"data_extra_pool": "dc11-prod.rgw.buckets.non-ec",
"index_type": 0,
"compression": ""
}
}
],
"metadata_heap": "",
"tier_config": [],
"realm_id": "b3e2afe7-2254-494a-9a34-ce50358779fd"
}
From: Matthew H <matthew.heler@xxxxxxxxxxx>
Date: Monday, March 4, 2019 at 7:44 PM
To: Christian Rice <crice@xxxxxxxxxxx>, ceph-users <ceph-users@xxxxxxxxxxxxxx>
Subject: Re: radosgw sync falling behind regularly
Christian,
Can you provide your zonegroup and zones configurations for all 3 rgw sites? (run the commands for each site please)
Thanks,
From: Christian Rice <crice@xxxxxxxxxxx>
Sent: Monday, March 4, 2019 5:34 PM
To: Matthew H; ceph-users
Subject: Re: radosgw sync falling behind regularly
So we upgraded everything from 12.2.8 to 12.2.11, and things have gone to hell. Lots of sync errors, like so:
sudo radosgw-admin sync error list
[
{
"shard_id": 0,
"entries": [
{
"id": "1_1549348245.870945_5163821.1",
"section": "data",
"name": "dora/catalogmaker-redis:1e27bf9c-3a2f-4845-85b6-33a24bbe1c04.18467.470/56fbc9685d609b4c8cdbd11dd60bf03bedcb613b438c663c9899d930b25f0405",
"timestamp": "2019-02-05 06:30:45.870945Z",
"info": {
"source_zone": "1e27bf9c-3a2f-4845-85b6-33a24bbe1c04",
"error_code": 5,
"message": "failed to sync object(5) Input/output error"
}
},
…
radosgw logs are full of:
2019-03-04 14:32:58.039467 7f90e81eb700 0 data sync: ERROR: failed to read remote data log info: ret=-2
2019-03-04 14:32:58.041296 7f90e81eb700 0 data sync: ERROR: init sync on escarpment/escarpment:1e27bf9c-3a2f-4845-85b6-33a24bbe1c04.18467.146 failed, retcode=-2
2019-03-04 14:32:58.041662 7f90e81eb700 0 meta sync: ERROR: RGWBackoffControlCR called coroutine returned -2
2019-03-04 14:32:58.042949 7f90e81eb700 0 data sync: WARNING: skipping data log entry for missing bucket escarpment/escarpment:1e27bf9c-3a2f-4845-85b6-33a24bbe1c04.18467.146
2019-03-04 14:32:58.823501 7f90e81eb700 0 data sync: ERROR: failed to read remote data log info: ret=-2
2019-03-04 14:32:58.825243 7f90e81eb700 0 meta sync: ERROR: RGWBackoffControlCR called coroutine returned -2
dc11-ceph-rgw2:~$ sudo radosgw-admin sync status
realm b3e2afe7-2254-494a-9a34-ce50358779fd (savagebucket)
zonegroup de6af748-1a2f-44a1-9d44-30799cf1313e (us)
zone 107d29a0-b732-4bf1-a26e-1f64f820e839 (dc11-prod)
2019-03-04 14:26:21.351372 7ff7ae042e40 0 meta sync: ERROR: failed to fetch mdlog info
metadata sync syncing
full sync: 0/64 shards
failed to fetch local sync status: (5) Input/output error
^C
Any advice? All three clusters on 12.2.11, Debian stretch.
From: Christian Rice <crice@xxxxxxxxxxx>
Date: Thursday, February 28, 2019 at 9:06 AM
To: Matthew H <matthew.heler@xxxxxxxxxxx>, ceph-users <ceph-users@xxxxxxxxxxxxxx>
Subject: Re: radosgw sync falling behind regularly
Yeah my bad on the typo, not running 12.8.8 ☺ It’s 12.2.8. We can upgrade and will attempt to do so asap. Thanks for that, I need to read my release notes more carefully, I guess!
From: Matthew H <matthew.heler@xxxxxxxxxxx>
Date: Wednesday, February 27, 2019 at 8:33 PM
To: Christian Rice <crice@xxxxxxxxxxx>, ceph-users <ceph-users@xxxxxxxxxxxxxx>
Subject: Re: radosgw sync falling behind regularly
Hey Christian,
I'm making a wild guess, but assuming this is 12.2.8. If so, is it possible that you can upgrade to 12.2.11? There have been rgw multisite bug fixes for metadata syncing and data syncing (both separate issues) that you could be hitting.
Thanks,
From: ceph-users <ceph-users-bounces@xxxxxxxxxxxxxx> on behalf of Christian Rice <crice@xxxxxxxxxxx>
Sent: Wednesday, February 27, 2019 7:05 PM
To: ceph-users
Subject: radosgw sync falling behind regularly
Debian 9; ceph 12.8.8-bpo90+1; no rbd or cephfs, just radosgw; three clusters in one zonegroup.
Often we find either metadata or data sync behind, and it doesn’t look to ever recover until…we restart the endpoint radosgw target service.
eg at 15:45:40:
dc11-ceph-rgw1:/var/log/ceph# radosgw-admin sync status
realm b3e2afe7-2254-494a-9a34-ce50358779fd (savagebucket)
zonegroup de6af748-1a2f-44a1-9d44-30799cf1313e (us)
zone 107d29a0-b732-4bf1-a26e-1f64f820e839 (dc11-prod)
metadata sync syncing
full sync: 0/64 shards
incremental sync: 64/64 shards
metadata is behind on 2 shards
behind shards: [19,41]
oldest incremental change not applied: 2019-02-27 14:42:24.0.408263s
data sync source: 1e27bf9c-3a2f-4845-85b6-33a24bbe1c04 (sv5-corp)
syncing
full sync: 0/128 shards
incremental sync: 128/128 shards
data is caught up with source
source: 331d3f1e-1b72-4c56-bb5a-d1d0fcf6d0b8 (sv3-prod)
syncing
full sync: 0/128 shards
incremental sync: 128/128 shards
data is caught up with source
so at 15:46:07:
dc11-ceph-rgw1:/var/log/ceph# sudo systemctl restart ceph-radosgw@rgw.dc11-ceph-rgw1.service
and by the time I checked at 15:48:08:
dc11-ceph-rgw1:/var/log/ceph# radosgw-admin sync status
realm b3e2afe7-2254-494a-9a34-ce50358779fd (savagebucket)
zonegroup de6af748-1a2f-44a1-9d44-30799cf1313e (us)
zone 107d29a0-b732-4bf1-a26e-1f64f820e839 (dc11-prod)
metadata sync syncing
full sync: 0/64 shards
incremental sync: 64/64 shards
metadata is caught up with master
data sync source: 1e27bf9c-3a2f-4845-85b6-33a24bbe1c04 (sv5-corp)
syncing
full sync: 0/128 shards
incremental sync: 128/128 shards
data is caught up with source
source: 331d3f1e-1b72-4c56-bb5a-d1d0fcf6d0b8 (sv3-prod)
syncing
full sync: 0/128 shards
incremental sync: 128/128 shards
data is caught up with source
There’s no way this is “lag.” It’s stuck, and happens frequently, though perhaps not daily. Any suggestions? Our cluster isn’t heavily used yet, but it’s production.
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
_______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com