So we now have a different error. I ran `ceph fs reset k8s` because the map was in a strange state. Now I'm getting the following error in the MDS log when it tries to 'join' the cluster (even though it's the only one):
0> 2018-09-03 07:59:05.143026 7f9f9381d700 -1 /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/12.2.7/rpm/el7/BUILD/ceph-12.2.7/src/mds/MDCache.cc: In function 'void MDCache::rejoin_send_rejoins()' thread 7f9f9381d700 time 2018-09-03 07:59:05.140564
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/12.2.7/rpm/el7/BUILD/ceph-12.2.7/src/mds/MDCache.cc: 4029: FAILED assert(auth >= 0)
ceph version 12.2.7 (3ec878d1e53e1aeb47a9f619c49d9e7c0aa384d5) luminous (stable)
1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x110) [0x5604f3f96510]
2: (MDCache::rejoin_send_rejoins()+0x29b4) [0x5604f3d623e4]
3: (MDCache::process_imported_caps()+0x12d8) [0x5604f3d66328]
4: (MDCache::rejoin_open_ino_finish(inodeno_t, int)+0x3ec) [0x5604f3d690dc]
5: (MDSInternalContextBase::complete(int)+0x1eb) [0x5604f3ee0e2b]
6: (void finish_contexts<MDSInternalContextBase>(CephContext*, std::list<MDSInternalContextBase*, std::allocator<MDSInternalContextBase*> >&, int)+0xac) [0x5604f3c6395c]
7: (MDCache::open_ino_finish(inodeno_t, MDCache::open_ino_info_t&, int)+0x15c) [0x5604f3d1f05c]
8: (MDCache::_open_ino_backtrace_fetched(inodeno_t, ceph::buffer::list&, int)+0x493) [0x5604f3d477a3]
9: (MDSIOContextBase::complete(int)+0xa4) [0x5604f3ee1144]
10: (Finisher::finisher_thread_entry()+0x198) [0x5604f3f95488]
11: (()+0x7e25) [0x7f9f9e870e25]
12: (clone()+0x6d) [0x7f9f9d950bad]
NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.
--- logging levels ---
0/ 5 none
0/ 1 lockdep
0/ 1 context
1/ 1 crush
20/20 mds
1/ 5 mds_balancer
1/ 5 mds_locker
1/ 5 mds_log
1/ 5 mds_log_expire
1/ 5 mds_migrator
0/ 1 buffer
0/ 1 timer
0/ 1 filer
0/ 1 striper
0/ 1 objecter
0/ 5 rados
0/ 5 rbd
0/ 5 rbd_mirror
0/ 5 rbd_replay
0/ 5 journaler
0/ 5 objectcacher
0/ 5 client
1/ 5 osd
0/ 5 optracker
0/ 5 objclass
1/ 3 filestore
1/ 3 journal
0/ 5 ms
1/ 5 mon
0/10 monc
1/ 5 paxos
0/ 5 tp
1/ 5 auth
1/ 5 crypto
1/ 1 finisher
1/ 1 reserver
1/ 5 heartbeatmap
1/ 5 perfcounter
1/ 5 rgw
1/10 civetweb
1/ 5 javaclient
1/ 5 asok
1/ 1 throttle
0/ 0 refs
1/ 5 xio
1/ 5 compressor
1/ 5 bluestore
1/ 5 bluefs
1/ 3 bdev
1/ 5 kstore
4/ 5 rocksdb
4/ 5 leveldb
4/ 5 memdb
1/ 5 kinetic
1/ 5 fuse
1/ 5 mgr
1/ 5 mgrc
1/ 5 dpdk
1/ 5 eventtrace
-2/-2 (syslog threshold)
99/99 (stderr threshold)
max_recent 10000
max_new 1000
log_file
--- end dump of recent events ---
*** Caught signal (Aborted) **
in thread 7f9f9381d700 thread_name:fn_anonymous
ceph version 12.2.7 (3ec878d1e53e1aeb47a9f619c49d9e7c0aa384d5) luminous (stable)
1: (()+0x5b7a11) [0x5604f3f55a11]
2: (()+0xf6d0) [0x7f9f9e8786d0]
3: (gsignal()+0x37) [0x7f9f9d888277]
4: (abort()+0x148) [0x7f9f9d889968]
5: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x284) [0x5604f3f96684]
6: (MDCache::rejoin_send_rejoins()+0x29b4) [0x5604f3d623e4]
7: (MDCache::process_imported_caps()+0x12d8) [0x5604f3d66328]
8: (MDCache::rejoin_open_ino_finish(inodeno_t, int)+0x3ec) [0x5604f3d690dc]
9: (MDSInternalContextBase::complete(int)+0x1eb) [0x5604f3ee0e2b]
10: (void finish_contexts<MDSInternalContextBase>(CephContext*, std::list<MDSInternalContextBase*, std::allocator<MDSInternalContextBase*> >&, int)+0xac) [0x5604f3c6395c]
11: (MDCache::open_ino_finish(inodeno_t, MDCache::open_ino_info_t&, int)+0x15c) [0x5604f3d1f05c]
12: (MDCache::_open_ino_backtrace_fetched(inodeno_t, ceph::buffer::list&, int)+0x493) [0x5604f3d477a3]
13: (MDSIOContextBase::complete(int)+0xa4) [0x5604f3ee1144]
14: (Finisher::finisher_thread_entry()+0x198) [0x5604f3f95488]
15: (()+0x7e25) [0x7f9f9e870e25]
16: (clone()+0x6d) [0x7f9f9d950bad]
2018-09-03 07:59:05.214546 7f9f9381d700 -1 *** Caught signal (Aborted) **
in thread 7f9f9381d700 thread_name:fn_anonymous
ceph version 12.2.7 (3ec878d1e53e1aeb47a9f619c49d9e7c0aa384d5) luminous (stable)
1: (()+0x5b7a11) [0x5604f3f55a11]
2: (()+0xf6d0) [0x7f9f9e8786d0]
3: (gsignal()+0x37) [0x7f9f9d888277]
4: (abort()+0x148) [0x7f9f9d889968]
5: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x284) [0x5604f3f96684]
6: (MDCache::rejoin_send_rejoins()+0x29b4) [0x5604f3d623e4]
7: (MDCache::process_imported_caps()+0x12d8) [0x5604f3d66328]
8: (MDCache::rejoin_open_ino_finish(inodeno_t, int)+0x3ec) [0x5604f3d690dc]
9: (MDSInternalContextBase::complete(int)+0x1eb) [0x5604f3ee0e2b]
10: (void finish_contexts<MDSInternalContextBase>(CephContext*, std::list<MDSInternalContextBase*, std::allocator<MDSInternalContextBase*> >&, int)+0xac) [0x5604f3c6395c]
11: (MDCache::open_ino_finish(inodeno_t, MDCache::open_ino_info_t&, int)+0x15c) [0x5604f3d1f05c]
12: (MDCache::_open_ino_backtrace_fetched(inodeno_t, ceph::buffer::list&, int)+0x493) [0x5604f3d477a3]
13: (MDSIOContextBase::complete(int)+0xa4) [0x5604f3ee1144]
14: (Finisher::finisher_thread_entry()+0x198) [0x5604f3f95488]
15: (()+0x7e25) [0x7f9f9e870e25]
16: (clone()+0x6d) [0x7f9f9d950bad]
NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.
--- begin dump of recent events ---
0> 2018-09-03 07:59:05.214546 7f9f9381d700 -1 *** Caught signal (Aborted) **
in thread 7f9f9381d700 thread_name:fn_anonymous
ceph version 12.2.7 (3ec878d1e53e1aeb47a9f619c49d9e7c0aa384d5) luminous (stable)
1: (()+0x5b7a11) [0x5604f3f55a11]
2: (()+0xf6d0) [0x7f9f9e8786d0]
3: (gsignal()+0x37) [0x7f9f9d888277]
4: (abort()+0x148) [0x7f9f9d889968]
5: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x284) [0x5604f3f96684]
6: (MDCache::rejoin_send_rejoins()+0x29b4) [0x5604f3d623e4]
7: (MDCache::process_imported_caps()+0x12d8) [0x5604f3d66328]
8: (MDCache::rejoin_open_ino_finish(inodeno_t, int)+0x3ec) [0x5604f3d690dc]
9: (MDSInternalContextBase::complete(int)+0x1eb) [0x5604f3ee0e2b]
10: (void finish_contexts<MDSInternalContextBase>(CephContext*, std::list<MDSInternalContextBase*, std::allocator<MDSInternalContextBase*> >&, int)+0xac) [0x5604f3c6395c]
11: (MDCache::open_ino_finish(inodeno_t, MDCache::open_ino_info_t&, int)+0x15c) [0x5604f3d1f05c]
12: (MDCache::_open_ino_backtrace_fetched(inodeno_t, ceph::buffer::list&, int)+0x493) [0x5604f3d477a3]
13: (MDSIOContextBase::complete(int)+0xa4) [0x5604f3ee1144]
14: (Finisher::finisher_thread_entry()+0x198) [0x5604f3f95488]
15: (()+0x7e25) [0x7f9f9e870e25]
16: (clone()+0x6d) [0x7f9f9d950bad]
NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.
--- logging levels ---
0/ 5 none
0/ 1 lockdep
0/ 1 context
1/ 1 crush
20/20 mds
1/ 5 mds_balancer
1/ 5 mds_locker
1/ 5 mds_log
1/ 5 mds_log_expire
1/ 5 mds_migrator
0/ 1 buffer
0/ 1 timer
0/ 1 filer
0/ 1 striper
0/ 1 objecter
0/ 5 rados
0/ 5 rbd
0/ 5 rbd_mirror
0/ 5 rbd_replay
0/ 5 journaler
0/ 5 objectcacher
0/ 5 client
1/ 5 osd
0/ 5 optracker
0/ 5 objclass
1/ 3 filestore
1/ 3 journal
0/ 5 ms
1/ 5 mon
0/10 monc
1/ 5 paxos
0/ 5 tp
1/ 5 auth
1/ 5 crypto
1/ 1 finisher
1/ 1 reserver
1/ 5 heartbeatmap
1/ 5 perfcounter
1/ 5 rgw
1/10 civetweb
1/ 5 javaclient
1/ 5 asok
1/ 1 throttle
0/ 0 refs
1/ 5 xio
1/ 5 compressor
1/ 5 bluestore
1/ 5 bluefs
1/ 3 bdev
1/ 5 kstore
4/ 5 rocksdb
4/ 5 leveldb
4/ 5 memdb
1/ 5 kinetic
1/ 5 fuse
1/ 5 mgr
1/ 5 mgrc
1/ 5 dpdk
1/ 5 eventtrace
-2/-2 (syslog threshold)
99/99 (stderr threshold)
max_recent 10000
max_new 1000
log_file
--- end dump of recent events ---
start_mds.sh: line 4: 133 Aborted (core dumped) /usr/bin/ceph-mds "${DAEMON_OPTS[@]}" -i "${MDS_NAME}"
Kind regards,
Marlinc
On Mon, 3 Sep 2018 at 03:11, Yan, Zheng <ukernel@xxxxxxxxx> wrote:
On Mon, Sep 3, 2018 at 1:57 AM Marlin Cremers
<m.cremers@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> Hey there,
>
> So I now have a problem since none of my MDSes can start anymore.
>
> They are stuck in the resolve state since Ceph thinks there are still MDSes alive, which I can see when I run:
>
need mds log to check why mds are stuck in resolve state.
> ceph mds deactivate k8s:0
> Error EEXIST: mds.4:0 not active (???)
> ceph mds deactivate k8s:1
> Error EEXIST: mds.4:1 not active (???)
>
> How can I remove the MDSes from Ceph's memory, as I currently have no running MDSes?
>
> When I run `ceph mds stat -f json` I get the following output:
> {
> "fsmap":{
> "epoch":3901,
> "compat":{
> "compat":{
>
> },
> "ro_compat":{
>
> },
> "incompat":{
> "feature_1":"base v0.20",
> "feature_2":"client writeable ranges",
> "feature_3":"default file layouts on dirs",
> "feature_4":"dir inode in separate object",
> "feature_5":"mds uses versioned encoding",
> "feature_6":"dirfrag is stored in omap",
> "feature_8":"no anchor table",
> "feature_9":"file layout v2"
> }
> },
> "feature_flags":{
> "enable_multiple":true,
> "ever_enabled_multiple":false
> },
> "standbys":[
>
> ],
> "filesystems":[
> {
> "mdsmap":{
> "epoch":3896,
> "flags":12,
> "ever_allowed_features":0,
> "explicitly_allowed_features":0,
> "created":"2018-04-21 16:55:37.625468",
> "modified":"2018-09-02 19:36:00.788965",
> "tableserver":0,
> "root":0,
> "session_timeout":60,
> "session_autoclose":300,
> "max_file_size":1099511627776,
> "last_failure":0,
> "last_failure_osd_epoch":17409,
> "compat":{
> "compat":{
>
> },
> "ro_compat":{
>
> },
> "incompat":{
> "feature_1":"base v0.20",
> "feature_2":"client writeable ranges",
> "feature_3":"default file layouts on dirs",
> "feature_4":"dir inode in separate object",
> "feature_5":"mds uses versioned encoding",
> "feature_6":"dirfrag is stored in omap",
> "feature_8":"no anchor table",
> "feature_9":"file layout v2"
> }
> },
> "max_mds":1,
> "in":[
> 0,
> 1
> ],
> "up":{
>
> },
> "failed":[
>
> ],
> "damaged":[
>
> ],
> "stopped":[
> 2,
> 3
> ],
> "info":{
>
> },
> "data_pools":[
> 16
> ],
> "metadata_pool":17,
> "enabled":true,
> "fs_name":"k8s",
> "balancer":"",
> "standby_count_wanted":1
> },
> "id":4
> },
> {
> "mdsmap":{
> "epoch":3901,
> "flags":12,
> "ever_allowed_features":0,
> "explicitly_allowed_features":0,
> "created":"2018-04-29 15:53:35.342750",
> "modified":"2018-09-02 19:37:33.823379",
> "tableserver":0,
> "root":0,
> "session_timeout":60,
> "session_autoclose":300,
> "max_file_size":1099511627776,
> "last_failure":0,
> "last_failure_osd_epoch":17341,
> "compat":{
> "compat":{
>
> },
> "ro_compat":{
>
> },
> "incompat":{
> "feature_1":"base v0.20",
> "feature_2":"client writeable ranges",
> "feature_3":"default file layouts on dirs",
> "feature_4":"dir inode in separate object",
> "feature_5":"mds uses versioned encoding",
> "feature_6":"dirfrag is stored in omap",
> "feature_8":"no anchor table",
> "feature_9":"file layout v2"
> }
> },
> "max_mds":1,
> "in":[
> 0
> ],
> "up":{
> "mds_0":201494080
> },
> "failed":[
>
> ],
> "damaged":[
>
> ],
> "stopped":[
>
> ],
> "info":{
> "gid_201494080":{
> "gid":201494080,
> "name":"node01",
> "rank":0,
> "incarnation":3898,
> "state":"up:active",
> "state_seq":5,
> "addr":"10.14.4.241:6800/1458476866",
> "standby_for_rank":-1,
> "standby_for_fscid":-1,
> "standby_for_name":"",
> "standby_replay":false,
> "export_targets":[
>
> ],
> "features":4611087853745930235
> }
> },
> "data_pools":[
> 19
> ],
> "metadata_pool":18,
> "enabled":true,
> "fs_name":"windoos_click",
> "balancer":"",
> "standby_count_wanted":1
> },
> "id":5
> }
> ]
> },
> "mdsmap_first_committed":3259,
> "mdsmap_last_committed":3901
> }
>
> Which seems to suggest that Ceph thinks there are two up MDSes in the k8s filesystem and two that are stopped.
>
> I hope someone who knows the internals of Ceph can help me, as this looks like something I'm not able to fix on my own.
>
> Kind regards,
> Marlinc
> _______________________________________________
> ceph-users mailing list
> ceph-users@xxxxxxxxxxxxxx
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
_______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com