Failing to create monitor in a working cluster.

pmestre@xxxxxxxxx · Fri, 31 Mar 2023 16:05:47 -0000

Hello, i've been running a 3 node proxmox cluster with 4 ceph osd for 3 years as a production cluster.
As a test for trying to move ceph cluster network, i destroyed one of the 3 working monitors and tried to recreate it.
After destroying it, the new monitor refuses to join the cluster, even in the old network. I've tried all steps in documentation "Troubleshooting monitors" section.

New monitor has this config extracted from ceph --admin-daemon file.asok 
{
    "name": "n3ceph",
    "rank": -1,
    "state": "probing",
    "election_epoch": 0,
    "quorum": [],
    "features": {
        "required_con": "2449958197560098820",
        "required_mon": [
            "kraken",
            "luminous",
            "mimic",
            "osdmap-prune",
            "nautilus",
            "octopus",
            "pacific",
            "elector-pinging"
        ],
        "quorum_con": "0",
        "quorum_mon": []
    },
    "outside_quorum": [],
    "extra_probe_peers": [],
    "sync_provider": [],
    "monmap": {
        "epoch": 6,
        "fsid": "5e60d0bb-33b4-42db-bbe7-7032c35ee605",
        "modified": "2023-03-31T11:54:44.616569Z",
        "created": "2019-12-02T13:50:38.097448Z",
        "min_mon_release": 16,
        "min_mon_release_name": "pacific",
        "election_strategy": 1,
        "disallowed_leaders: ": "",
        "stretch_mode": false,
        "tiebreaker_mon": "",
        "removed_ranks: ": "1",
        "features": {
            "persistent": [
                "kraken",
                "luminous",
                "mimic",
                "osdmap-prune",
                "nautilus",
                "octopus",
                "pacific",
                "elector-pinging"
            ],
            "optional": []
        },
        "mons": [
            {
                "rank": 0,
                "name": "node1",
                "public_addrs": {
                    "addrvec": [
                        {
                            "type": "v2",
                            "addr": "10.100.100.1:3300",
                            "nonce": 0
                        },
                        {
                            "type": "v1",
                            "addr": "10.100.100.1:6789",
                            "nonce": 0
                        }
                    ]
                },
                "addr": "10.100.100.1:6789/0",
                "public_addr": "10.100.100.1:6789/0",
                "priority": 0,
                "weight": 0,
                "crush_location": "{}"
            },
            {
                "rank": 1,
                "name": "node2",
                "public_addrs": {
                    "addrvec": [
                        {
                            "type": "v2",
                            "addr": "10.100.100.2:3300",
                            "nonce": 0
                        },
                        {
                            "type": "v1",
                            "addr": "10.100.100.2:6789",
                            "nonce": 0
                        }
                    ]
                },
                "addr": "10.100.100.2:6789/0",
                "public_addr": "10.100.100.2:6789/0",
                "priority": 0,
                "weight": 0,
                "crush_location": "{}"
            }
        ]
    },
    "feature_map": {
        "mon": [
            {
                "features": "0x3f01cfbdfffdffff",
                "release": "luminous",
                "num": 1
            }
        ]
    },
    "stretch_mode": false
}

The quorum mon stat is as follows:
{
    "name": "node1",
    "rank": 0,
    "state": "leader",
    "election_epoch": 340,
    "quorum": [
        0,
        1
    ],
    "quorum_age": 13090,
    "features": {
        "required_con": "2449958747317026820",
        "required_mon": [
            "kraken",
            "luminous",
            "mimic",
            "osdmap-prune",
            "nautilus",
            "octopus",
            "pacific",
            "elector-pinging"
        ],
        "quorum_con": "4540138314316775423",
        "quorum_mon": [
            "kraken",
            "luminous",
            "mimic",
            "osdmap-prune",
            "nautilus",
            "octopus",
            "pacific",
            "elector-pinging"
        ]
    },
    "outside_quorum": [],
    "extra_probe_peers": [],
    "sync_provider": [],
    "monmap": {
        "epoch": 6,
        "fsid": "5e60d0bb-33b4-42db-bbe7-7032c35ee605",
        "modified": "2023-03-31T11:54:44.616569Z",
        "created": "2019-12-02T13:50:38.097448Z",
        "min_mon_release": 16,
        "min_mon_release_name": "pacific",
        "election_strategy": 1,
        "disallowed_leaders: ": "",
        "stretch_mode": false,
        "tiebreaker_mon": "",
        "removed_ranks: ": "1",
        "features": {
            "persistent": [
                "kraken",
                "luminous",
                "mimic",
                "osdmap-prune",
                "nautilus",
                "octopus",
                "pacific",
                "elector-pinging"
            ],
            "optional": []
        },
        "mons": [
            {
                "rank": 0,
                "name": "node1",
                "public_addrs": {
                    "addrvec": [
                        {
                            "type": "v2",
                            "addr": "10.100.100.1:3300",
                            "nonce": 0
                        },
                        {
                            "type": "v1",
                            "addr": "10.100.100.1:6789",
                            "nonce": 0
                        }
                    ]
                },
                "addr": "10.100.100.1:6789/0",
                "public_addr": "10.100.100.1:6789/0",
                "priority": 0,
                "weight": 0,
                "crush_location": "{}"
            },
            {
                "rank": 1,
                "name": "node2",
                "public_addrs": {
                    "addrvec": [
                        {
                            "type": "v2",
                            "addr": "10.100.100.2:3300",
                            "nonce": 0
                        },
                        {
                            "type": "v1",
                            "addr": "10.100.100.2:6789",
                            "nonce": 0
                        }
                    ]
                },
                "addr": "10.100.100.2:6789/0",
                "public_addr": "10.100.100.2:6789/0",
                "priority": 0,
                "weight": 0,
                "crush_location": "{}"
            }
        ]
    },
    "feature_map": {
        "mon": [
            {
                "features": "0x3f01cfbdfffdffff",
                "release": "luminous",
                "num": 1
            }
        ],
        "osd": [
            {
                "features": "0x3f01cfbdfffdffff",
                "release": "luminous",
                "num": 5
            }
        ],
        "client": [
            {
                "features": "0x2f018fb87aa4aafe",
                "release": "luminous",
                "num": 1
            },
            {
                "features": "0x3f01cfbdfffdffff",
                "release": "luminous",
                "num": 12
            }
        ],
        "mgr": [
            {
                "features": "0x3f01cfbdfffdffff",
                "release": "luminous",
                "num": 1
            }
        ]
    },
    "stretch_mode": false

I tried to get a debug log with ceph daemon mon.n3ceph config set debug_mon 10/10
 and restarting the service, but the ceph log file stoped working after i tried that setting.

journalctl -u tells me:
mar 31 17:35:22 node3 ceph-mon[240916]: 2023-03-31T17:35:22.926+0200 7f49e0699700 -1 mon.n3ceph@-1(probing) e6 get_health_metrics reporting 4 slow ops, oldest is log(1 entries from seq 1 at 2023-03-31T17:30:19.347379+0200)
mar 31 17:35:27 node3 ceph-mon[240916]: 2023-03-31T17:35:27.926+0200 7f49e0699700 -1 mon.n3ceph@-1(probing) e6 get_health_metrics reporting 4 slow ops, oldest is log(1 entries from seq 1 at 2023-03-31T17:30:19.347379+0200)
mar 31 17:35:32 node3 ceph-mon[240916]: 2023-03-31T17:35:32.926+0200 7f49e0699700 -1 mon.n3ceph@-1(probing) e6 get_health_metrics reporting 4 slow ops, oldest is log(1 entries from seq 1 at 2023-03-31T17:30:19.347379+0200).

Any ideas? Cluster is running fine with two monitors, but a reboot in one of the nodes might be a big problem.
Kind regards and many thanks.
_______________________________________________
ceph-users mailing list -- ceph-users@xxxxxxx
To unsubscribe send an email to ceph-users-leave@xxxxxxx