Hi all,
At the University of Zurich we run a CephFS cluster of ~12 PB raw size.
We currently run Pacific 16.2.15, and our clients (Ubuntu 20.04) mount CephFS using the kernel driver.
The cluster was deployed on Mimic and subsequently upgraded to Nautilus (14.2.22) and then Pacific (16.2.15).
Last Wednesday (4 Dec.) the kernel clients were upgraded from 5.15.0-124 to 5.15.0-126.
We run 6 active MDSs and 2 standbys, with static pinning of directory subtrees.
Yesterday (it actually happened on Friday evening) we hit a problem where the MDS serving rank 4 crashed, and the two standby MDSs then crashed in turn while trying to replay the journal.
The other ranks are not affected.
We would appreciate any help debugging and recovering the failed filesystem.
Thank you.
Below is some info on the crashes and the status of the cluster.
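The crash dumps below were collected via the crash module, i.e. something like:
# list recent crashes and dump the metadata of a specific one
ceph crash ls
ceph crash info 2024-12-06T18:08:09.295408Z_e6f3560d-21aa-425d-a1b2-f0b5fb5bd103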
The oldest and initial crash info is:
{
    "assert_condition": "batch_ops.empty()",
    "assert_file": "/build/ceph-16.2.15/src/mds/CDentry.h",
    "assert_func": "virtual CDentry::~CDentry()",
    "assert_line": 130,
    "assert_msg": "/build/ceph-16.2.15/src/mds/CDentry.h: In function 'virtual CDentry::~CDentry()' thread 7f14fb70e700 time 2024-12-06T19:08:09.292770+0100\n/build/ceph-16.2.15/src/mds/CDentry.h: 130: FAILED ceph_assert(batch_ops.empty())\n",
    "assert_thread_name": "ms_dispatch",
    "backtrace": [
        "/lib/x86_64-linux-gnu/libpthread.so.0(+0x12980) [0x7f1507109980]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x19c) [0x7f15077b4a2e]",
        "(ceph::__ceph_assertf_fail(char const*, char const*, int, char const*, char const*, ...)+0) [0x7f15077b4bb8]",
        "(CDentry::~CDentry()+0x4d9) [0x564e03dec519]",
        "(CDir::remove_dentry(CDentry*)+0x17e) [0x564e03df672e]",
        "(MDCache::trim_dentry(CDentry*, std::map<int, boost::intrusive_ptr<MCacheExpire>, std::less<int>, std::allocator<std::pair<int const, boost::intrusive_ptr<MCacheExpire> > > >&)+0xd4) [0x564e03cb9b54]",
        "(MDCache::trim_lru(unsigned long, std::map<int, boost::intrusive_ptr<MCacheExpire>, std::less<int>, std::allocator<std::pair<int const, boost::intrusive_ptr<MCacheExpire> > > >&)+0x91a) [0x564e03cbb36a]",
        "(MDCache::trim(unsigned long)+0xc1) [0x564e03cbbde1]",
        "(MDCache::upkeep_main()+0x94a) [0x564e03cf8aea]",
        "/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xbd6df) [0x7f1506c276df]",
        "/lib/x86_64-linux-gnu/libpthread.so.0(+0x76db) [0x7f15070fe6db]",
        "clone()"
    ],
    "ceph_version": "16.2.15",
    "crash_id": "2024-12-06T18:08:09.295408Z_e6f3560d-21aa-425d-a1b2-f0b5fb5bd103",
    "entity_name": "mds.mds-l20-33",
    "os_id": "ubuntu",
    "os_name": "Ubuntu",
    "os_version": "18.04.6 LTS (Bionic Beaver)",
    "os_version_id": "18.04",
    "process_name": "ceph-mds",
    "stack_sig": "5e65f2db93b88e8fb398b21684d658a9ef1cec94845c09c581a0d8a80722960a",
    "timestamp": "2024-12-06T18:08:09.295408Z",
    "utsname_hostname": "mds-l20-33",
    "utsname_machine": "x86_64",
    "utsname_release": "5.4.0-146-generic",
    "utsname_sysname": "Linux",
    "utsname_version": "#163~18.04.1-Ubuntu SMP Mon Mar 20 15:02:59 UTC 2023"
}
It looks similar to https://tracker.ceph.com/issues/66967, even though the "assert_line" is different.
All the subsequent crashes are different and look like this:
{
    "assert_condition": "p->first <= start",
    "assert_file": "/build/ceph-16.2.15/src/include/interval_set.h",
    "assert_func": "void interval_set<T, C>::erase(T, T, std::function<bool(T, T)>) [with T = inodeno_t; C = std::map]",
    "assert_line": 568,
    "assert_msg": "/build/ceph-16.2.15/src/include/interval_set.h: In function 'void interval_set<T, C>::erase(T, T, std::function<bool(T, T)>) [with T = inodeno_t; C = std::map]' thread 7fdd353b9700 time 2024-12-06T19:08:24.550104+0100\n/build/ceph-16.2.15/src/include/interval_set.h: 568: FAILED ceph_assert(p->first <= start)\n",
    "assert_thread_name": "md_log_replay",
    "backtrace": [
        "/lib/x86_64-linux-gnu/libpthread.so.0(+0x12980) [0x7fdd445bb980]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x19c) [0x7fdd44c66a2e]",
        "(ceph::__ceph_assertf_fail(char const*, char const*, int, char const*, char const*, ...)+0) [0x7fdd44c66bb8]",
        "(interval_set<inodeno_t, std::map>::erase(inodeno_t, inodeno_t, std::function<bool (inodeno_t, inodeno_t)>)+0x296) [0x55876572c9f6]",
        "(interval_set<inodeno_t, std::map>::erase(inodeno_t)+0x2f) [0x55876572ca3f]",
        "(EMetaBlob::replay(MDSRank*, LogSegment*, int, MDPeerUpdate*)+0x588b) [0x558765a0868b]",
        "(EUpdate::replay(MDSRank*)+0x5d) [0x558765a0a0bd]",
        "(MDLog::_replay_thread()+0x78e) [0x55876598d64e]",
        "(MDLog::ReplayThread::entry()+0xd) [0x55876567f4ad]",
        "/lib/x86_64-linux-gnu/libpthread.so.0(+0x76db) [0x7fdd445b06db]",
        "clone()"
    ],
    "ceph_version": "16.2.15",
    "crash_id": "2024-12-06T18:08:24.552401Z_35179c40-1fb8-473e-84d6-c013b27f341f",
    "entity_name": "mds.mds-l20-33",
    "os_id": "ubuntu",
    "os_name": "Ubuntu",
    "os_version": "18.04.6 LTS (Bionic Beaver)",
    "os_version_id": "18.04",
    "process_name": "ceph-mds",
    "stack_sig": "96ed41e917b880ad1183c7acd926e3259e37e1511eb6de7730a21d14121679b2",
    "timestamp": "2024-12-06T18:08:24.552401Z",
    "utsname_hostname": "mds-l20-33",
    "utsname_machine": "x86_64",
    "utsname_release": "5.4.0-146-generic",
    "utsname_sysname": "Linux",
    "utsname_version": "#163~18.04.1-Ubuntu SMP Mon Mar 20 15:02:59 UTC 2023"
}
This one looks like https://tracker.ceph.com/issues/61009, even though that issue has not been reported for Pacific.
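Since these secondary crashes all happen in md_log_replay for rank 4, what we are looking at (nothing has been run yet; the export path below is just an example, the filesystem name is ours) is taking a raw backup of that rank's journal and inspecting it before attempting any of the disaster-recovery steps:
# back up the rank 4 journal objects before touching anything
cephfs-journal-tool --rank=spindlefs:4 journal export /root/spindlefs-rank4-journal.bin
# check the journal header and scan the journal for damaged/missing entries
cephfs-journal-tool --rank=spindlefs:4 header get
cephfs-journal-tool --rank=spindlefs:4 journal inspect
Any advice on whether this is the right direction, or whether "event recover_dentries" / "journal reset" will be needed, would be very welcome.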
Our current status is:
root@mon-l19-31:~# ceph -s
  cluster:
    id:     6ef3c8fe-33e9-4db3-80a7-c64d8991b9f0
    health: HEALTH_WARN
            1054 OSD(s) reporting legacy (not per-pool) BlueStore omap usage stats
            1 filesystem is degraded
            insufficient standby MDS daemons available
            4 MDSs report slow requests
            2 MDSs behind on trimming
            4 pgs not deep-scrubbed in time
            2 pools have too many placement groups
            33 daemons have recently crashed

  services:
    mon: 3 daemons, quorum mon-l19-31,mon-l20-41,mon-l15-41 (age 2M)
    mgr: mon-l20-41(active, since 2M), standbys: mon-l19-31, mon-l15-41
    mds: 6/6 daemons up
    osd: 1216 osds: 1213 up (since 11d), 1213 in (since 11d)

  data:
    volumes: 0/1 healthy, 1 recovering
    pools:   4 pools, 9729 pgs
    objects: 2.78G objects, 5.3 PiB
    usage:   8.4 PiB used, 3.3 PiB / 12 PiB avail
    pgs:     9655 active+clean
             74   active+clean+scrubbing+deep
root@mon-l19-31:~# ceph fs status
spindlefs - 611 clients
=========
RANK STATE MDS ACTIVITY DNS INOS DIRS CAPS
0 active mds-l21-34-2 Reqs: 1 /s 248k 222k 119k 5107
1 active mds-l19-33 Reqs: 1 /s 15.5M 14.5M 2049k 1836k
2 active mds-l21-33 Reqs: 0 /s 25.7M 20.5M 2765k 65.5k
3 active mds-l19-33-2 Reqs: 0 /s 20.4M 20.4M 582k 10.3k
4 replay(laggy) mds-l21-33-2 0 0 0 0
5 active mds-l21-34 Reqs: 1 /s 304k 274k 139k 9112
POOL TYPE USED AVAIL
spindle_metadata metadata 592G 4321G
spindle data 8384T 1092T
vhp data 119T 10.2T
MDS version: ceph version 16.2.15 (618f440892089921c3e944a991122ddc44e60516) pacific (stable)
root@mon-l19-31:~# ceph health detail
HEALTH_WARN 1054 OSD(s) reporting legacy (not per-pool) BlueStore omap usage stats; 1 filesystem is degraded; insufficient standby MDS daemons available; 4 MDSs report slow requests; 2 MDSs behind on trimming; 4 pgs not deep-scrubbed in time; 2 pools have too many placement groups; 33 daemons have recently crashed
[WRN] BLUESTORE_NO_PER_POOL_OMAP: 1054 OSD(s) reporting legacy (not per-pool) BlueStore omap usage stats
osd.13 legacy (not per-pool) omap detected, suggest to run store repair to benefit from per-pool omap usage statistics
...
...
...
osd.1226 legacy (not per-pool) omap detected, suggest to run store repair to benefit from per-pool omap usage statistics
[WRN] FS_DEGRADED: 1 filesystem is degraded
fs spindlefs is degraded
[WRN] MDS_INSUFFICIENT_STANDBY: insufficient standby MDS daemons available
have 0; want 1 more
[WRN] MDS_SLOW_REQUEST: 4 MDSs report slow requests
mds.mds-l21-33(mds.2): 953 slow requests are blocked > 30 secs
mds.mds-l19-33(mds.1): 35 slow requests are blocked > 30 secs
mds.mds-l21-34-2(mds.0): 19 slow requests are blocked > 30 secs
mds.mds-l19-33-2(mds.3): 3 slow requests are blocked > 30 secs
[WRN] MDS_TRIM: 2 MDSs behind on trimming
mds.mds-l19-33(mds.1): Behind on trimming (449/128) max_segments: 128, num_segments: 449
mds.mds-l19-33-2(mds.3): Behind on trimming (7099/128) max_segments: 128, num_segments: 7099
[WRN] PG_NOT_DEEP_SCRUBBED: 4 pgs not deep-scrubbed in time
pg 11.140a not deep-scrubbed since 2024-11-28T08:56:45.425135+0100
pg 11.694 not deep-scrubbed since 2024-11-27T22:18:56.908953+0100
pg 11.21b not deep-scrubbed since 2024-11-28T04:48:14.734599+0100
pg 11.ad8 not deep-scrubbed since 2024-11-27T19:59:32.851964+0100
[WRN] POOL_TOO_MANY_PGS: 2 pools have too many placement groups
Pool spindle has 8192 placement groups, should have 8192
Pool vhp has 1024 placement groups, should have 1024
[WRN] RECENT_CRASH: 33 daemons have recently crashed
mds.mds-l20-33 crashed on host mds-l20-33 at 2024-12-06T18:08:50.420719Z
mds.mds-l20-33-2 crashed on host mds-l20-33 at 2024-12-06T18:08:33.409537Z
mds.mds-l20-33-2 crashed on host mds-l20-33 at 2024-12-06T18:08:27.335401Z
mds.mds-l20-33 crashed on host mds-l20-33 at 2024-12-06T18:08:53.156894Z
mds.mds-l20-33-2 crashed on host mds-l20-33 at 2024-12-06T18:08:31.359872Z
mds.mds-l20-33-2 crashed on host mds-l20-33 at 2024-12-06T18:08:29.331676Z
mds.mds-l20-33 crashed on host mds-l20-33 at 2024-12-06T18:08:09.295408Z
mds.mds-l20-33 crashed on host mds-l20-33 at 2024-12-06T18:08:24.552401Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-06T18:09:13.029070Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-06T18:09:10.641714Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-06T18:09:18.205911Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-06T18:09:15.176043Z
mds.mds-l20-33 crashed on host mds-l20-33 at 2024-12-08T13:33:16.196251Z
mds.mds-l20-33 crashed on host mds-l20-33 at 2024-12-08T13:33:18.242351Z
mds.mds-l20-33 crashed on host mds-l20-33 at 2024-12-08T13:33:13.762433Z
mds.mds-l20-33-2 crashed on host mds-l20-33 at 2024-12-08T15:22:20.143608Z
mds.mds-l20-33-2 crashed on host mds-l20-33 at 2024-12-08T15:22:18.083712Z
mds.mds-l20-33-2 crashed on host mds-l20-33 at 2024-12-08T15:22:15.248451Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:19:10.277110Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:19:06.202423Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:23:24.469758Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:23:22.372727Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:19:08.271763Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:23:26.559707Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:50:21.508069Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:50:19.417250Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:50:17.469305Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:59:42.693519Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:59:40.494913Z
mds.mds-l21-33-2 crashed on host mds-l21-33 at 2024-12-09T09:59:44.763196Z
and 3 more
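If more detailed logs would help, we can raise the debug levels on the daemon that will next attempt the rank 4 replay (currently mds-l21-33-2) and capture the crash, e.g.:
# temporarily raise MDS/journaler debug levels on the replaying daemon
ceph config set mds.mds-l21-33-2 debug_mds 20
ceph config set mds.mds-l21-33-2 debug_journaler 20
# revert once a log of the failed replay has been captured
ceph config rm mds.mds-l21-33-2 debug_mds
ceph config rm mds.mds-l21-33-2 debug_journaler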
Best regards,
Enrico
--
Enrico Favero
S3IT Services and Support for Science IT
Office Y11 F 52
University of Zürich
Winterthurerstrasse 190, CH-8057 Zürich (Switzerland)
Tel: +41 44 635 42 22