Hi
Thanks for the reply. Yes, I restarted all of the mon servers in
sequence, and yesterday just the leader alone, without any success.
Reports:
root@monb01:~# ceph report | grep committed
report 4002437698
"monmap_first_committed": 1,
"monmap_last_committed": 6,
"osdmap_first_committed": 67114,
"osdmap_last_committed": 72592,
"mdsmap_first_committed": 1,
"mdsmap_last_committed": 1,
"first_committed": 609225,
"last_committed": 609251,
"first_committed": 180754137,
"last_committed": 180754777,
root@monb01:~#
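(Side note: the two unlabelled first/last_committed pairs above come from
other sections of the report that the grep also matches. If it's easier to
read, an untested jq sketch like the following should isolate just the
osdmap range and the size of the un-trimmed gap:)
ceph report | jq '{osdmap_first_committed, osdmap_last_committed}'
ceph report | jq '.osdmap_last_committed - .osdmap_first_committed'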
root@monb01:~# ceph report | jq .osdmap_clean_epochs
report 395175214
{
"min_last_epoch_clean": 72592,
"last_epoch_clean": {
"per_pool": [
{
"poolid": 0,
"floor": 72592
},
{
"poolid": 1,
"floor": 72592
},
{
"poolid": 2,
"floor": 72592
},
{
"poolid": 3,
"floor": 72592
},
{
"poolid": 4,
"floor": 72592
},
{
"poolid": 5,
"floor": 72592
},
{
"poolid": 26,
"floor": 72592
},
{
"poolid": 27,
"floor": 72592
},
{
"poolid": 28,
"floor": 72592
}
]
},
"osd_epochs": [
{
"id": 0,
"epoch": 72592
},
{
"id": 1,
"epoch": 72592
},
{
"id": 2,
"epoch": 72592
},
{
"id": 3,
"epoch": 72592
},
{
"id": 4,
"epoch": 72592
},
{
"id": 5,
"epoch": 72592
},
{
"id": 6,
"epoch": 72592
},
{
"id": 7,
"epoch": 72592
},
{
"id": 8,
"epoch": 72592
},
{
"id": 9,
"epoch": 72592
},
{
"id": 10,
"epoch": 72592
},
{
"id": 11,
"epoch": 72592
},
{
"id": 12,
"epoch": 72592
},
{
"id": 13,
"epoch": 72592
},
{
"id": 14,
"epoch": 72592
},
{
"id": 15,
"epoch": 72592
},
{
"id": 16,
"epoch": 72592
},
{
"id": 17,
"epoch": 72592
},
{
"id": 18,
"epoch": 72592
},
{
"id": 19,
"epoch": 72592
},
{
"id": 20,
"epoch": 72592
},
{
"id": 21,
"epoch": 72592
},
{
"id": 22,
"epoch": 72592
},
{
"id": 23,
"epoch": 72592
},
{
"id": 24,
"epoch": 72592
},
{
"id": 25,
"epoch": 72592
},
{
"id": 26,
"epoch": 72592
},
{
"id": 27,
"epoch": 72592
},
{
"id": 28,
"epoch": 72592
},
{
"id": 29,
"epoch": 72592
},
{
"id": 30,
"epoch": 72592
},
{
"id": 31,
"epoch": 72592
},
{
"id": 32,
"epoch": 72592
},
{
"id": 33,
"epoch": 72592
},
{
"id": 34,
"epoch": 72592
},
{
"id": 35,
"epoch": 72592
},
{
"id": 36,
"epoch": 72592
},
{
"id": 37,
"epoch": 72592
},
{
"id": 38,
"epoch": 72592
},
{
"id": 39,
"epoch": 72592
},
{
"id": 40,
"epoch": 72592
},
{
"id": 41,
"epoch": 72592
},
{
"id": 42,
"epoch": 72592
},
{
"id": 43,
"epoch": 72592
},
{
"id": 44,
"epoch": 72592
},
{
"id": 45,
"epoch": 72592
},
{
"id": 46,
"epoch": 72592
},
{
"id": 47,
"epoch": 72592
},
{
"id": 48,
"epoch": 72592
},
{
"id": 49,
"epoch": 72592
},
{
"id": 50,
"epoch": 72592
},
{
"id": 51,
"epoch": 72592
},
{
"id": 52,
"epoch": 72592
},
{
"id": 53,
"epoch": 72592
},
{
"id": 54,
"epoch": 72592
},
{
"id": 55,
"epoch": 72592
},
{
"id": 56,
"epoch": 72592
},
{
"id": 57,
"epoch": 72592
},
{
"id": 58,
"epoch": 72592
},
{
"id": 59,
"epoch": 72592
},
{
"id": 60,
"epoch": 72592
},
{
"id": 61,
"epoch": 72592
},
{
"id": 62,
"epoch": 72592
},
{
"id": 63,
"epoch": 72592
},
{
"id": 64,
"epoch": 72592
},
{
"id": 65,
"epoch": 72592
},
{
"id": 66,
"epoch": 72592
},
{
"id": 67,
"epoch": 72592
},
{
"id": 68,
"epoch": 72592
},
{
"id": 69,
"epoch": 72592
},
{
"id": 70,
"epoch": 72592
},
{
"id": 71,
"epoch": 72592
},
{
"id": 72,
"epoch": 72592
},
{
"id": 73,
"epoch": 72592
},
{
"id": 74,
"epoch": 72592
},
{
"id": 75,
"epoch": 72592
},
{
"id": 76,
"epoch": 72592
},
{
"id": 77,
"epoch": 72592
},
{
"id": 78,
"epoch": 72592
},
{
"id": 79,
"epoch": 72592
},
{
"id": 80,
"epoch": 72592
},
{
"id": 81,
"epoch": 72592
},
{
"id": 83,
"epoch": 72592
},
{
"id": 84,
"epoch": 72592
},
{
"id": 85,
"epoch": 72592
},
{
"id": 86,
"epoch": 72592
},
{
"id": 87,
"epoch": 72592
},
{
"id": 88,
"epoch": 72592
},
{
"id": 89,
"epoch": 72592
},
{
"id": 90,
"epoch": 72592
},
{
"id": 91,
"epoch": 72592
},
{
"id": 92,
"epoch": 72592
},
{
"id": 93,
"epoch": 72592
},
{
"id": 94,
"epoch": 72592
},
{
"id": 95,
"epoch": 72592
},
{
"id": 96,
"epoch": 72592
},
{
"id": 97,
"epoch": 72592
},
{
"id": 98,
"epoch": 72592
},
{
"id": 99,
"epoch": 72592
},
{
"id": 100,
"epoch": 72592
},
{
"id": 101,
"epoch": 72592
},
{
"id": 102,
"epoch": 72592
},
{
"id": 103,
"epoch": 72592
},
{
"id": 104,
"epoch": 72592
},
{
"id": 105,
"epoch": 72592
},
{
"id": 106,
"epoch": 72592
},
{
"id": 107,
"epoch": 72592
},
{
"id": 108,
"epoch": 72592
},
{
"id": 109,
"epoch": 72592
},
{
"id": 110,
"epoch": 72592
},
{
"id": 111,
"epoch": 72592
},
{
"id": 112,
"epoch": 72592
},
{
"id": 113,
"epoch": 72592
},
{
"id": 114,
"epoch": 72592
},
{
"id": 115,
"epoch": 72592
},
{
"id": 116,
"epoch": 72592
},
{
"id": 117,
"epoch": 72592
},
{
"id": 118,
"epoch": 72592
},
{
"id": 119,
"epoch": 72592
},
{
"id": 120,
"epoch": 72592
},
{
"id": 121,
"epoch": 72592
},
{
"id": 122,
"epoch": 72592
},
{
"id": 123,
"epoch": 72592
},
{
"id": 124,
"epoch": 72592
},
{
"id": 125,
"epoch": 72592
},
{
"id": 126,
"epoch": 72592
},
{
"id": 127,
"epoch": 72592
},
{
"id": 128,
"epoch": 72592
},
{
"id": 129,
"epoch": 72592
},
{
"id": 130,
"epoch": 72592
},
{
"id": 131,
"epoch": 72592
},
{
"id": 132,
"epoch": 72592
},
{
"id": 133,
"epoch": 72592
},
{
"id": 134,
"epoch": 72592
},
{
"id": 135,
"epoch": 72592
},
{
"id": 136,
"epoch": 72592
},
{
"id": 137,
"epoch": 72592
},
{
"id": 138,
"epoch": 72592
},
{
"id": 139,
"epoch": 72592
},
{
"id": 140,
"epoch": 72592
},
{
"id": 141,
"epoch": 72592
},
{
"id": 142,
"epoch": 72592
},
{
"id": 143,
"epoch": 72592
},
{
"id": 144,
"epoch": 72592
},
{
"id": 145,
"epoch": 72592
},
{
"id": 146,
"epoch": 72592
},
{
"id": 147,
"epoch": 72592
},
{
"id": 148,
"epoch": 72592
},
{
"id": 149,
"epoch": 72592
},
{
"id": 150,
"epoch": 72592
},
{
"id": 151,
"epoch": 72592
},
{
"id": 152,
"epoch": 72592
},
{
"id": 153,
"epoch": 72592
},
{
"id": 154,
"epoch": 72592
},
{
"id": 155,
"epoch": 72592
},
{
"id": 156,
"epoch": 72592
},
{
"id": 157,
"epoch": 72592
},
{
"id": 158,
"epoch": 72592
},
{
"id": 159,
"epoch": 72592
},
{
"id": 160,
"epoch": 72592
},
{
"id": 161,
"epoch": 72592
},
{
"id": 162,
"epoch": 72592
},
{
"id": 163,
"epoch": 72592
},
{
"id": 164,
"epoch": 72592
},
{
"id": 165,
"epoch": 72592
},
{
"id": 166,
"epoch": 72592
},
{
"id": 167,
"epoch": 72592
},
{
"id": 168,
"epoch": 72592
},
{
"id": 169,
"epoch": 72592
},
{
"id": 170,
"epoch": 72592
},
{
"id": 171,
"epoch": 72592
},
{
"id": 172,
"epoch": 72592
},
{
"id": 173,
"epoch": 72592
},
{
"id": 174,
"epoch": 72592
},
{
"id": 175,
"epoch": 72592
},
{
"id": 176,
"epoch": 72592
},
{
"id": 177,
"epoch": 72592
},
{
"id": 178,
"epoch": 72592
},
{
"id": 179,
"epoch": 72592
},
{
"id": 180,
"epoch": 72592
},
{
"id": 181,
"epoch": 72592
},
{
"id": 182,
"epoch": 72592
},
{
"id": 183,
"epoch": 72592
},
{
"id": 184,
"epoch": 72592
},
{
"id": 185,
"epoch": 72592
},
{
"id": 186,
"epoch": 72592
},
{
"id": 187,
"epoch": 72592
},
{
"id": 188,
"epoch": 72592
},
{
"id": 189,
"epoch": 72592
},
{
"id": 190,
"epoch": 72592
},
{
"id": 191,
"epoch": 72592
},
{
"id": 192,
"epoch": 72592
},
{
"id": 193,
"epoch": 72592
},
{
"id": 194,
"epoch": 72592
},
{
"id": 195,
"epoch": 72592
},
{
"id": 196,
"epoch": 72592
},
{
"id": 197,
"epoch": 72592
},
{
"id": 198,
"epoch": 72592
},
{
"id": 199,
"epoch": 72592
},
{
"id": 200,
"epoch": 72592
},
{
"id": 201,
"epoch": 72592
},
{
"id": 202,
"epoch": 72592
},
{
"id": 203,
"epoch": 72592
},
{
"id": 204,
"epoch": 72592
},
{
"id": 205,
"epoch": 72592
},
{
"id": 206,
"epoch": 72592
},
{
"id": 207,
"epoch": 72592
},
{
"id": 208,
"epoch": 72592
},
{
"id": 209,
"epoch": 72592
},
{
"id": 210,
"epoch": 72592
},
{
"id": 211,
"epoch": 72592
},
{
"id": 212,
"epoch": 72592
},
{
"id": 213,
"epoch": 72592
},
{
"id": 214,
"epoch": 72592
},
{
"id": 215,
"epoch": 72592
},
{
"id": 216,
"epoch": 72592
},
{
"id": 217,
"epoch": 72592
},
{
"id": 218,
"epoch": 72592
},
{
"id": 219,
"epoch": 72592
},
{
"id": 220,
"epoch": 72592
},
{
"id": 221,
"epoch": 72592
},
{
"id": 222,
"epoch": 72592
},
{
"id": 223,
"epoch": 72592
},
{
"id": 224,
"epoch": 72592
},
{
"id": 225,
"epoch": 72592
},
{
"id": 226,
"epoch": 72592
},
{
"id": 227,
"epoch": 72592
},
{
"id": 228,
"epoch": 72592
},
{
"id": 229,
"epoch": 72592
},
{
"id": 230,
"epoch": 72592
},
{
"id": 231,
"epoch": 72592
},
{
"id": 232,
"epoch": 72592
},
{
"id": 233,
"epoch": 72592
},
{
"id": 234,
"epoch": 72592
},
{
"id": 235,
"epoch": 72592
},
{
"id": 236,
"epoch": 72592
},
{
"id": 237,
"epoch": 72592
},
{
"id": 238,
"epoch": 72592
},
{
"id": 239,
"epoch": 72592
},
{
"id": 240,
"epoch": 72592
},
{
"id": 241,
"epoch": 72592
},
{
"id": 242,
"epoch": 72592
},
{
"id": 243,
"epoch": 72592
},
{
"id": 244,
"epoch": 72592
},
{
"id": 245,
"epoch": 72592
},
{
"id": 246,
"epoch": 72592
},
{
"id": 247,
"epoch": 72592
},
{
"id": 248,
"epoch": 72592
},
{
"id": 249,
"epoch": 72592
},
{
"id": 250,
"epoch": 72592
},
{
"id": 251,
"epoch": 72592
},
{
"id": 252,
"epoch": 72592
},
{
"id": 253,
"epoch": 72592
},
{
"id": 254,
"epoch": 72592
},
{
"id": 255,
"epoch": 72592
},
{
"id": 256,
"epoch": 72592
},
{
"id": 257,
"epoch": 72592
},
{
"id": 258,
"epoch": 72592
},
{
"id": 259,
"epoch": 72592
},
{
"id": 260,
"epoch": 72592
},
{
"id": 261,
"epoch": 72592
},
{
"id": 262,
"epoch": 72592
},
{
"id": 263,
"epoch": 72592
},
{
"id": 264,
"epoch": 72592
},
{
"id": 265,
"epoch": 72592
},
{
"id": 266,
"epoch": 72592
},
{
"id": 267,
"epoch": 72592
},
{
"id": 268,
"epoch": 72592
},
{
"id": 269,
"epoch": 72592
},
{
"id": 270,
"epoch": 72592
},
{
"id": 271,
"epoch": 72592
},
{
"id": 272,
"epoch": 72592
},
{
"id": 273,
"epoch": 72592
},
{
"id": 274,
"epoch": 72592
},
{
"id": 275,
"epoch": 72592
},
{
"id": 276,
"epoch": 72592
},
{
"id": 277,
"epoch": 72592
},
{
"id": 278,
"epoch": 72592
},
{
"id": 279,
"epoch": 72592
},
{
"id": 280,
"epoch": 72592
},
{
"id": 281,
"epoch": 72592
},
{
"id": 282,
"epoch": 72592
},
{
"id": 283,
"epoch": 72592
},
{
"id": 284,
"epoch": 72592
}
]
}
root@monb01:~#
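In case it helps narrow things down, here is an untested jq sketch to
summarize the currently pinned osdmap range straight from the report
(assuming the manifest layout is the same as in my earlier mails):
ceph report | jq '.osdmap_manifest.pinned_maps | {first: .[0], last: .[-1], count: length}'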
On 2020-11-12 11:58, Dan van der Ster wrote:
I found another possible trimming bug this morning, but I don't expect
it applies to you because you said you restarted the mon leader:
https://tracker.ceph.com/issues/48212
Otherwise, could you please share the output of:
ceph report | grep committed
ceph report | jq .osdmap_clean_epochs
Thanks,
Dan
On Thu, Nov 12, 2020 at 10:56 AM <m.sliwinski@xxxxx> wrote:
Hi
Thanks for the response. Our cluster is currently mostly on 14.2.13;
in particular, all MONs and MGRs are.
Some OSDs are still on 14.2.9, but I don't think that should block
osdmap trimming, because at the moment we don't have any down OSDs; I
checked for that when we first noticed the issue.
I'm of course working on bringing all OSDs to 14.2.13, but it will
take some time, as I have to build and test Debian packages for that.
Could there be any other reason? I found posts about PGs for a new
pool not being marked as created in the MON DB while the cluster
still reports everything as active+clean, but I don't know how to
debug that.
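(Guesswork on my part, but would something like the following be the
right way to check for PGs the mons still consider creating? I'm not
sure these commands apply here, so please correct me:
ceph pg ls creating
ceph pg dump_stuck)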
--
Best regards
Marcin
On 2020-11-11 16:50, Dan van der Ster wrote:
> Hi,
>
> v14.2.13 has an important fix in this area:
> https://tracker.ceph.com/issues/47290
> Without this fix, your cluster will not trim if there are any *down*
> osds in the cluster.
>
> On our clusters we are running v14.2.11 patched with commit
> "mon/OSDMonitor: only take in osd into consideration when trimming
> osdmaps" -- this trims maps perfectly afaict.
>
> I can't vouch for the rest of 14.2.13, so better test that adequately
> before upgrading.
>
> Cheers, Dan
>
>
> On Tue, Nov 10, 2020 at 6:57 PM <m.sliwinski@xxxxx> wrote:
>>
>> Hi
>>
>> We have a Ceph cluster running Nautilus, recently upgraded from
>> Mimic.
>> While on Mimic we noticed an issue with the osdmap not trimming,
>> which caused part of our cluster to crash due to osdmap cache
>> misses. We solved it by adding "osd_map_cache_size = 5000" to our
>> ceph.conf.
>> Because we had mixed OSD versions from both Mimic and Nautilus at
>> the time, we decided to finish the upgrade, but it didn't solve our
>> problem.
>> At the moment we have "oldest_map": 67114, "newest_map": 72588, and
>> the difference is not shrinking even though the cluster is in the
>> active+clean state. Restarting all mons didn't help. The bug seems
>> similar to https://tracker.ceph.com/issues/44184 but there's no
>> solution there. What else can I check or do?
>> I don't want to do dangerous things like mon_osd_force_trim_to or
>> something similar without finding the cause.
>>
>> I noticed in the MON debug log:
>>
>> 2020-11-10 17:11:14.612 7f9592d5b700 10 mon.monb01@0(leader).osd e72571
>> should_prune could only prune 4957 epochs (67114..72071), which is less
>> than the required minimum (10000)
>> 2020-11-10 17:11:19.612 7f9592d5b700 10 mon.monb01@0(leader).osd e72571
>> should_prune could only prune 4957 epochs (67114..72071), which is less
>> than the required minimum (10000)
>>
>> So I added config options to reduce those values:
>>
>> mon dev mon_debug_block_osdmap_trim false
>> mon advanced mon_min_osdmap_epochs 100
>> mon advanced mon_osdmap_full_prune_min 500
>> mon advanced paxos_service_trim_min 10
>>
>> But it didn't help:
>>
>> 2020-11-10 18:28:26.165 7f1b700ab700 20 mon.monb01@0(leader).osd e72588
>> load_osdmap_manifest osdmap manifest detected in store; reload.
>> 2020-11-10 18:28:26.169 7f1b700ab700 10 mon.monb01@0(leader).osd e72588
>> load_osdmap_manifest store osdmap manifest pinned (67114 .. 72484)
>> 2020-11-10 18:28:26.169 7f1b700ab700 10 mon.monb01@0(leader).osd e72588
>> should_prune not enough epochs to form an interval (last pinned: 72484,
>> last to pin: 72488, interval: 10)
>>
>> Command "ceph report | jq '.osdmap_manifest' |jq '.pinned_maps[]'"
>> shows
>> 67114 on the top, but i'm unable to determine why.
>>
>> Same with 'ceph report | jq .osdmap_first_committed':
>>
>> root@monb01:/var/log/ceph# ceph report | jq .osdmap_first_committed
>> report 4073203295
>> 67114
>> root@monb01:/var/log/ceph#
>>
>> When I try to determine whether a certain PG or OSD is keeping it this
>> low, I don't get anything.
>>
>> And in the MON debug log I get:
>>
>> 2020-11-10 18:42:41.767 7f1b74721700 10 mon.monb01@0(leader) e6
>> refresh_from_paxos
>> 2020-11-10 18:42:41.767 7f1b74721700 10
>> mon.monb01@0(leader).paxosservice(mdsmap 1..1) refresh
>> 2020-11-10 18:42:41.767 7f1b74721700 10
>> mon.monb01@0(leader).paxosservice(osdmap 67114..72588) refresh
>> 2020-11-10 18:42:41.767 7f1b74721700 20 mon.monb01@0(leader).osd e72588
>> load_osdmap_manifest osdmap manifest detected in store; reload.
>> 2020-11-10 18:42:41.767 7f1b74721700 10 mon.monb01@0(leader).osd e72588
>> load_osdmap_manifest store osdmap manifest pinned (67114 .. 72484)
>>
>> I also get:
>>
>> root@monb01:/var/log/ceph# ceph report |grep "min_last_epoch_clean"
>> report 2716976759
>> "min_last_epoch_clean": 0,
>> root@monb01:/var/log/ceph#
>>
>>
>> Additional info:
>> root@monb01:/var/log/ceph# ceph versions
>> {
>>     "mon": {
>>         "ceph version 14.2.13 (1778d63e55dbff6cedb071ab7d367f8f52a8699f) nautilus (stable)": 3
>>     },
>>     "mgr": {
>>         "ceph version 14.2.13 (1778d63e55dbff6cedb071ab7d367f8f52a8699f) nautilus (stable)": 3
>>     },
>>     "osd": {
>>         "ceph version 14.2.13 (1778d63e55dbff6cedb071ab7d367f8f52a8699f) nautilus (stable)": 120,
>>         "ceph version 14.2.9 (581f22da52345dba46ee232b73b990f06029a2a0) nautilus (stable)": 164
>>     },
>>     "mds": {},
>>     "overall": {
>>         "ceph version 14.2.13 (1778d63e55dbff6cedb071ab7d367f8f52a8699f) nautilus (stable)": 126,
>>         "ceph version 14.2.9 (581f22da52345dba46ee232b73b990f06029a2a0) nautilus (stable)": 164
>>     }
>> }
>>
>>
>> root@monb01:/var/log/ceph# ceph mon feature ls
>>
>> all features
>> supported: [kraken,luminous,mimic,osdmap-prune,nautilus]
>> persistent: [kraken,luminous,mimic,osdmap-prune,nautilus]
>> on current monmap (epoch 6)
>> persistent: [kraken,luminous,mimic,osdmap-prune,nautilus]
>> required: [kraken,luminous,mimic,osdmap-prune,nautilus]
>>
>>
>> root@monb01:/var/log/ceph# ceph osd dump | grep require
>> require_min_compat_client luminous
>> require_osd_release nautilus
>>
>>
>> root@monb01:/var/log/ceph# ceph report | jq
>> '.osdmap_manifest.pinned_maps | length'
>> report 1777129876
>> 538
>>
>> root@monb01:/var/log/ceph# ceph pg dump -f json | jq .osd_epochs
>> dumped all
>> null
>>
>> --
>> Best regards
>> Marcin
_______________________________________________
ceph-users mailing list -- ceph-users@xxxxxxx
To unsubscribe send an email to ceph-users-leave@xxxxxxx