Adam,

I posted a question related to upgrading earlier, and this thread is related to it. I opened a new one because I found that error in the logs and thought the upgrade might be stuck because of the duplicate OSDs.

root@ceph1:~# ls -l /var/lib/ceph/f270ad9e-1f6f-11ed-b6f8-a539d87379ea/
total 44
drwx------ 3 nobody nogroup 4096 Aug 19 05:37 alertmanager.ceph1
drwx------ 3 167    167     4096 Aug 19 05:36 crash
drwx------ 2 167    167     4096 Aug 19 05:37 crash.ceph1
drwx------ 4 998    996     4096 Aug 19 05:37 grafana.ceph1
drwx------ 2 167    167     4096 Aug 19 05:36 mgr.ceph1.xmbvsb
drwx------ 3 167    167     4096 Aug 19 05:36 mon.ceph1
drwx------ 2 nobody nogroup 4096 Aug 19 05:37 node-exporter.ceph1
drwx------ 2 167    167     4096 Aug 19 17:55 osd.0
drwx------ 2 167    167     4096 Aug 19 18:03 osd.1
drwx------ 2 167    167     4096 Aug 31 05:20 osd.4
drwx------ 4 nobody nogroup 4096 Aug 19 05:38 prometheus.ceph1

Here is the output of cephadm ls:

root@ceph1:~# cephadm ls
[
  {
    "style": "cephadm:v1",
    "name": "alertmanager.ceph1",
    "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea",
    "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@alertmanager.ceph1",
    "enabled": true,
    "state": "running",
    "container_id": "97403cf9799711461216b7f83e88c574da2b631c7c65233ebd82d8a216a48924",
    "container_image_name": "quay.io/prometheus/alertmanager:v0.20.0",
    "container_image_id": "0881eb8f169f5556a292b4e2c01d683172b12830a62a9225a98a8e206bb734f0",
    "version": "0.20.0",
    "started": "2022-08-19T16:59:02.461978Z",
    "created": "2022-08-19T03:37:16.403605Z",
    "deployed": "2022-08-19T03:37:15.815605Z",
    "configured": "2022-08-19T16:59:02.117607Z"
  },
  {
    "style": "cephadm:v1",
    "name": "grafana.ceph1",
    "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea",
    "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@grafana.ceph1",
    "enabled": true,
    "state": "running",
    "container_id": "c7136aea8349a37dd9b320acd926c4bcbed95bc4549779e9580ed4290edc2117",
    "container_image_name": "quay.io/ceph/ceph-grafana:6.7.4",
    "container_image_id": "557c83e11646f123a27b5e4b62ac6c45e7bb8b2e90d6044034d0db5b7019415c",
    "version": "6.7.4",
    "started": "2022-08-19T03:38:05.481992Z",
    "created": "2022-08-19T03:37:46.823604Z",
    "deployed": "2022-08-19T03:37:46.239604Z",
    "configured": "2022-08-19T03:38:05.163603Z"
  },
  {
    "style": "cephadm:v1",
    "name": "osd.1",
    "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea",
    "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@osd.1",
    "enabled": true,
    "state": "running",
    "container_id": "51586b775bda0485c8b27b8401ac2430570e6f42cb7e12bae3eea05064f1fd20",
    "container_image_name": "quay.io/ceph/ceph:v15",
    "container_image_id": "93146564743febec815d6a764dad93fc07ce971e88315403ac508cb5da6d35f4",
    "version": "15.2.17",
    "started": "2022-08-19T16:03:10.612432Z",
    "created": "2022-08-19T16:03:09.765746Z",
    "deployed": "2022-08-19T16:03:09.141746Z",
    "configured": "2022-08-31T02:53:34.224643Z"
  },
  {
    "style": "cephadm:v1",
    "name": "prometheus.ceph1",
    "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea",
    "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@prometheus.ceph1",
    "enabled": true,
    "state": "running",
    "container_id": "ba305236e5db9f2095b23b86a2340924909e9e8e54e5cdbe1d51c14dc4c8587a",
    "container_image_name": "quay.io/prometheus/prometheus:v2.18.1",
    "container_image_id": "de242295e2257c37c8cadfd962369228f8f10b2d48a44259b65fef44ad4f6490",
    "version": "2.18.1",
    "started": "2022-08-19T16:59:03.538981Z",
    "created": "2022-08-19T03:38:01.567604Z",
    "deployed": "2022-08-19T03:38:00.983603Z",
    "configured": "2022-08-19T16:59:03.193607Z"
  },
  {
    "style": "cephadm:v1",
    "name": "node-exporter.ceph1",
"fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@node-exporter.ceph1", "enabled": true, "state": "running", "container_id": "00bf3ad29cce79e905e8533648ef38cbd232990fa9616aff1c0020b7b66d0cc0", "container_image_name": "quay.io/prometheus/node-exporter:v0.18.1", "container_image_id": "e5a616e4b9cf68dfcad7782b78e118be4310022e874d52da85c55923fb615f87", "version": "0.18.1", "started": "2022-08-19T03:37:55.232032Z", "created": "2022-08-19T03:37:47.711604Z", "deployed": "2022-08-19T03:37:47.155604Z", "configured": "2022-08-19T03:37:47.711604Z" }, { "style": "cephadm:v1", "name": "osd.0", "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@osd.0", "enabled": true, "state": "running", "container_id": "6b69046972dfbdb53665228258a15b13bc13a462ca4e066a4eca0cd593442d2d", "container_image_name": "quay.io/ceph/ceph:v15", "container_image_id": "93146564743febec815d6a764dad93fc07ce971e88315403ac508cb5da6d35f4", "version": "15.2.17", "started": "2022-08-19T15:55:20.580157Z", "created": "2022-08-19T15:55:19.725766Z", "deployed": "2022-08-19T15:55:19.125766Z", "configured": "2022-08-31T02:53:34.760643Z" }, { "style": "cephadm:v1", "name": "crash.ceph1", "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@crash.ceph1", "enabled": true, "state": "running", "container_id": "6bc56f478ccb96841fe86a540e284c175300b83dad9e906ae3230f22341c8293", "container_image_name": "quay.io/ceph/ceph:v15", "container_image_id": "93146564743febec815d6a764dad93fc07ce971e88315403ac508cb5da6d35f4", "version": "15.2.17", "started": "2022-08-19T03:37:17.660080Z", "created": "2022-08-19T03:37:17.559605Z", "deployed": "2022-08-19T03:37:16.991605Z", "configured": "2022-08-19T03:37:17.559605Z" }, { "style": "cephadm:v1", "name": "mon.ceph1", "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@mon.ceph1 ", "enabled": true, "state": "running", "container_id": "d0f03130491daebbe783c4990c6a4383d49e7a0e2bdf8c5d1eed012865e5d875", "container_image_name": "quay.io/ceph/ceph:v15", "container_image_id": "93146564743febec815d6a764dad93fc07ce971e88315403ac508cb5da6d35f4", "version": "15.2.17", "started": "2022-08-19T03:36:21.804129Z", "created": "2022-08-19T03:36:19.743608Z", "deployed": "2022-08-19T03:36:18.439608Z", "configured": "2022-08-19T03:38:05.931603Z" }, { "style": "cephadm:v1", "name": "mgr.ceph1.xmbvsb", "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@mgr.ceph1.xmbvsb", "enabled": true, "state": "stopped", "container_id": null, "container_image_name": "quay.io/ceph/ceph:v15", "container_image_id": null, "version": null, "started": null, "created": "2022-08-19T03:36:22.815608Z", "deployed": "2022-08-19T03:36:22.239608Z", "configured": "2022-08-19T03:38:06.487603Z" }, { "style": "cephadm:v1", "name": "osd.4", "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@osd.4", "enabled": true, "state": "running", "container_id": "938840fe7fd0cb45cc26d077837c9847d7c7a7a68c7e1588d4bb4343c695a071", "container_image_name": "quay.io/ceph/ceph:v15", "container_image_id": "93146564743febec815d6a764dad93fc07ce971e88315403ac508cb5da6d35f4", "version": "15.2.17", "started": "2022-08-31T03:20:55.416219Z", "created": "2022-08-23T21:46:49.458533Z", "deployed": "2022-08-23T21:46:48.818533Z", "configured": "2022-08-31T02:53:41.196643Z" 
  }
]

I have noticed one more thing: I ran docker stop <container_id_of_mgr> on the ceph1 node, and now my mgr container has disappeared. I can't see it anywhere, and I'm not sure how to bring the mgr back, because the upgrade won't let me do anything unless I have two mgr instances. (I have put a rough sketch of the commands I am thinking of trying at the very bottom of this mail, below the quoted thread.)

root@ceph1:~# ceph -s
  cluster:
    id:     f270ad9e-1f6f-11ed-b6f8-a539d87379ea
    health: HEALTH_WARN
            4 stray daemon(s) not managed by cephadm

  services:
    mon: 1 daemons, quorum ceph1 (age 17h)
    mgr: ceph2.hmbdla(active, since 5h)
    osd: 6 osds: 6 up (since 40h), 6 in (since 8d)

  data:
    pools:   6 pools, 161 pgs
    objects: 20.59k objects, 85 GiB
    usage:   174 GiB used, 826 GiB / 1000 GiB avail
    pgs:     161 active+clean

  io:
    client:   0 B/s rd, 12 KiB/s wr, 0 op/s rd, 2 op/s wr

  progress:
    Upgrade to quay.io/ceph/ceph:16.2.10 (0s)
      [............................]

I can see mgr count:2 in the service list below, but I'm not sure how to bring the second mgr back.

root@ceph1:~# ceph orch ls
NAME                       PORTS        RUNNING  REFRESHED  AGE  PLACEMENT
alertmanager               ?:9093,9094      1/1  20s ago    13d  count:1
crash                                       2/2  20s ago    13d  *
grafana                    ?:3000           1/1  20s ago    13d  count:1
mgr                                         2/2  20s ago    13d  count:2
mon                                         0/5  -          13d  <unmanaged>
node-exporter              ?:9100           2/2  20s ago    13d  *
osd                                           6  20s ago    -    <unmanaged>
osd.all-available-devices                     0  -          13d  *
osd.osd_spec_default                          0  -          8d   *
prometheus                 ?:9095           1/1  20s ago    13d  count:1

On Thu, Sep 1, 2022 at 12:28 PM Adam King <adking@xxxxxxxxxx> wrote:

> Are there any extra directories in /var/lib/ceph or /var/lib/ceph/<fsid>
> that appear to be for those OSDs on that host? When cephadm builds the info
> it uses for "ceph orch ps" it's actually scraping those directories. The
> output of "cephadm ls" on the host with the duplicates could also
> potentially have some insights.
>
> On Thu, Sep 1, 2022 at 12:15 PM Satish Patel <satish.txt@xxxxxxxxx> wrote:
>
>> Folks,
>>
>> I am playing with cephadm, and life was good until I started upgrading from
>> Octopus to Pacific. My upgrade process got stuck after upgrading the mgr, and
>> in the logs I can now see the following error:
>>
>> root@ceph1:~# ceph log last cephadm
>> 2022-09-01T14:40:45.739804+0000 mgr.ceph2.hmbdla (mgr.265806) 8 :
>> cephadm [INF] Deploying daemon grafana.ceph1 on ceph1
>> 2022-09-01T14:40:56.115693+0000 mgr.ceph2.hmbdla (mgr.265806) 14 :
>> cephadm [INF] Deploying daemon prometheus.ceph1 on ceph1
>> 2022-09-01T14:41:11.856725+0000 mgr.ceph2.hmbdla (mgr.265806) 25 :
>> cephadm [INF] Reconfiguring alertmanager.ceph1 (dependencies changed)...
>> 2022-09-01T14:41:11.861535+0000 mgr.ceph2.hmbdla (mgr.265806) 26 :
>> cephadm [INF] Reconfiguring daemon alertmanager.ceph1 on ceph1
>> 2022-09-01T14:41:12.927852+0000 mgr.ceph2.hmbdla (mgr.265806) 27 :
>> cephadm [INF] Reconfiguring grafana.ceph1 (dependencies changed)...
>> 2022-09-01T14:41:12.940615+0000 mgr.ceph2.hmbdla (mgr.265806) 28 :
>> cephadm [INF] Reconfiguring daemon grafana.ceph1 on ceph1
>> 2022-09-01T14:41:14.056113+0000 mgr.ceph2.hmbdla (mgr.265806) 33 :
>> cephadm [INF] Found duplicate OSDs: osd.2 in status running on ceph1,
>> osd.2 in status running on ceph2
>> 2022-09-01T14:41:14.056437+0000 mgr.ceph2.hmbdla (mgr.265806) 34 :
>> cephadm [INF] Found duplicate OSDs: osd.5 in status running on ceph1,
>> osd.5 in status running on ceph2
>> 2022-09-01T14:41:14.056630+0000 mgr.ceph2.hmbdla (mgr.265806) 35 :
>> cephadm [INF] Found duplicate OSDs: osd.3 in status running on ceph1,
>> osd.3 in status running on ceph2
>>
>> I'm not sure where the duplicate names came from or how that happened.
>> In the following output I can't see any duplication:
>>
>> root@ceph1:~# ceph osd tree
>> ID  CLASS  WEIGHT   TYPE NAME       STATUS  REWEIGHT  PRI-AFF
>> -1         0.97656  root default
>> -3         0.48828      host ceph1
>>  4    hdd  0.09769          osd.4       up   1.00000  1.00000
>>  0    ssd  0.19530          osd.0       up   1.00000  1.00000
>>  1    ssd  0.19530          osd.1       up   1.00000  1.00000
>> -5         0.48828      host ceph2
>>  5    hdd  0.09769          osd.5       up   1.00000  1.00000
>>  2    ssd  0.19530          osd.2       up   1.00000  1.00000
>>  3    ssd  0.19530          osd.3       up   1.00000  1.00000
>>
>> But at the same time I can see duplicate OSD entries on ceph1 and ceph2:
>>
>> root@ceph1:~# ceph orch ps
>> NAME                 HOST   PORTS        STATUS         REFRESHED  AGE  MEM USE  MEM LIM  VERSION  IMAGE ID      CONTAINER ID
>> alertmanager.ceph1   ceph1  *:9093,9094  running (20s)  2s ago     20s    17.1M        -           ba2b418f427c  856a4fe641f1
>> alertmanager.ceph1   ceph2  *:9093,9094  running (20s)  3s ago     20s    17.1M        -           ba2b418f427c  856a4fe641f1
>> crash.ceph2          ceph1               running (12d)  2s ago     12d    10.0M        -  15.2.17  93146564743f  0a009254afb0
>> crash.ceph2          ceph2               running (12d)  3s ago     12d    10.0M        -  15.2.17  93146564743f  0a009254afb0
>> grafana.ceph1        ceph1  *:3000       running (18s)  2s ago     19s    47.9M        -  8.3.5    dad864ee21e9  7d7a70b8ab7f
>> grafana.ceph1        ceph2  *:3000       running (18s)  3s ago     19s    47.9M        -  8.3.5    dad864ee21e9  7d7a70b8ab7f
>> mgr.ceph2.hmbdla     ceph1               running (13h)  2s ago     12d     506M        -  16.2.10  0d668911f040  6274723c35f7
>> mgr.ceph2.hmbdla     ceph2               running (13h)  3s ago     12d     506M        -  16.2.10  0d668911f040  6274723c35f7
>> node-exporter.ceph2  ceph1               running (91m)  2s ago     12d    60.7M        -  0.18.1   e5a616e4b9cf  d0ba04bb977c
>> node-exporter.ceph2  ceph2               running (91m)  3s ago     12d    60.7M        -  0.18.1   e5a616e4b9cf  d0ba04bb977c
>> osd.2                ceph1               running (12h)  2s ago     12d     867M    4096M  15.2.17  93146564743f  e286fb1c6302
>> osd.2                ceph2               running (12h)  3s ago     12d     867M    4096M  15.2.17  93146564743f  e286fb1c6302
>> osd.3                ceph1               running (12h)  2s ago     12d     978M    4096M  15.2.17  93146564743f  d3ae5d9f694f
>> osd.3                ceph2               running (12h)  3s ago     12d     978M    4096M  15.2.17  93146564743f  d3ae5d9f694f
>> osd.5                ceph1               running (12h)  2s ago     8d      225M    4096M  15.2.17  93146564743f  405068fb474e
>> osd.5                ceph2               running (12h)  3s ago     8d      225M    4096M  15.2.17  93146564743f  405068fb474e
>> prometheus.ceph1     ceph1  *:9095       running (8s)   2s ago     8s     30.4M        -           514e6a882f6e  9031dbe30cae
>> prometheus.ceph1     ceph2  *:9095       running (8s)   3s ago     8s     30.4M        -           514e6a882f6e  9031dbe30cae
>>
>> Is this a bug or did I do something wrong? Is there any workaround to get out
>> of this condition?
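
Coming back to Adam's point that cephadm builds the "ceph orch ps" view by scraping /var/lib/ceph/<fsid>: on ceph1 the only OSD directories are osd.0, osd.1 and osd.4 (see the listing at the top of this mail), so I don't see an obvious stale directory for osd.2, osd.3 or osd.5 there. For the record, this is the rough checklist I am planning to work through; these are only my guesses, not something confirmed in this thread, so corrections welcome:

# run on each host and compare against "ceph osd tree" to spot daemon
# directories that ended up on the wrong host
cephadm ls | grep '"name"'

# ask the mgr to re-scrape the hosts instead of showing cached data
ceph orch ps --refresh

# only if a stale directory for e.g. osd.2 really does show up on ceph1
# (i.e. on the host where that OSD does NOT actually run) -- my assumption
# is that it could be removed with:
cephadm rm-daemon --name osd.2 --fsid f270ad9e-1f6f-11ed-b6f8-a539d87379ea --force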
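
And here is the sketch I mentioned above for the missing second mgr. Since "cephadm ls" on ceph1 still lists mgr.ceph1.xmbvsb with an enabled systemd unit in state "stopped", my assumption is that one of the following should bring it back; these are untested guesses on my side, so please tell me if this is the wrong approach:

# on ceph1: try starting the existing unit that "cephadm ls" reports as stopped
systemctl start ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@mgr.ceph1.xmbvsb

# or ask the orchestrator to start/redeploy that daemon
ceph orch daemon start mgr.ceph1.xmbvsb
ceph orch daemon redeploy mgr.ceph1.xmbvsb

# the mgr service spec already says count:2, so re-applying the placement
# should also recreate a second mgr if the old one is really gone
ceph orch apply mgr 2

# then check whether the upgrade continues
ceph orch upgrade status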