Adam,

I posted a question related to upgrading earlier, and this thread is related to it. I opened a new one because I found that error in the logs and thought the upgrade might be stuck because of the duplicate OSDs.

root@ceph1:~# ls -l /var/lib/ceph/f270ad9e-1f6f-11ed-b6f8-a539d87379ea/
total 44
drwx------ 3 nobody nogroup 4096 Aug 19 05:37 alertmanager.ceph1
drwx------ 3 167    167     4096 Aug 19 05:36 crash
drwx------ 2 167    167     4096 Aug 19 05:37 crash.ceph1
drwx------ 4 998    996     4096 Aug 19 05:37 grafana.ceph1
drwx------ 2 167    167     4096 Aug 19 05:36 mgr.ceph1.xmbvsb
drwx------ 3 167    167     4096 Aug 19 05:36 mon.ceph1
drwx------ 2 nobody nogroup 4096 Aug 19 05:37 node-exporter.ceph1
drwx------ 2 167    167     4096 Aug 19 17:55 osd.0
drwx------ 2 167    167     4096 Aug 19 18:03 osd.1
drwx------ 2 167    167     4096 Aug 31 05:20 osd.4
drwx------ 4 nobody nogroup 4096 Aug 19 05:38 prometheus.ceph1

Here is the output of cephadm ls:

root@ceph1:~# cephadm ls
[
  {
    "style": "cephadm:v1",
    "name": "alertmanager.ceph1",
    "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea",
    "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@alertmanager.ceph1",
    "enabled": true,
    "state": "running",
    "container_id": "97403cf9799711461216b7f83e88c574da2b631c7c65233ebd82d8a216a48924",
    "container_image_name": "quay.io/prometheus/alertmanager:v0.20.0",
    "container_image_id": "0881eb8f169f5556a292b4e2c01d683172b12830a62a9225a98a8e206bb734f0",
    "version": "0.20.0",
    "started": "2022-08-19T16:59:02.461978Z",
    "created": "2022-08-19T03:37:16.403605Z",
    "deployed": "2022-08-19T03:37:15.815605Z",
    "configured": "2022-08-19T16:59:02.117607Z"
  },
  {
    "style": "cephadm:v1",
    "name": "grafana.ceph1",
    "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea",
    "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@grafana.ceph1",
    "enabled": true,
    "state": "running",
    "container_id": "c7136aea8349a37dd9b320acd926c4bcbed95bc4549779e9580ed4290edc2117",
    "container_image_name": "quay.io/ceph/ceph-grafana:6.7.4",
    "container_image_id": "557c83e11646f123a27b5e4b62ac6c45e7bb8b2e90d6044034d0db5b7019415c",
    "version": "6.7.4",
    "started": "2022-08-19T03:38:05.481992Z",
    "created": "2022-08-19T03:37:46.823604Z",
    "deployed": "2022-08-19T03:37:46.239604Z",
    "configured": "2022-08-19T03:38:05.163603Z"
  },
  {
    "style": "cephadm:v1",
    "name": "osd.1",
    "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea",
    "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@osd.1",
    "enabled": true,
    "state": "running",
    "container_id": "51586b775bda0485c8b27b8401ac2430570e6f42cb7e12bae3eea05064f1fd20",
    "container_image_name": "quay.io/ceph/ceph:v15",
    "container_image_id": "93146564743febec815d6a764dad93fc07ce971e88315403ac508cb5da6d35f4",
    "version": "15.2.17",
    "started": "2022-08-19T16:03:10.612432Z",
    "created": "2022-08-19T16:03:09.765746Z",
    "deployed": "2022-08-19T16:03:09.141746Z",
    "configured": "2022-08-31T02:53:34.224643Z"
  },
  {
    "style": "cephadm:v1",
    "name": "prometheus.ceph1",
    "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea",
    "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@prometheus.ceph1",
    "enabled": true,
    "state": "running",
    "container_id": "ba305236e5db9f2095b23b86a2340924909e9e8e54e5cdbe1d51c14dc4c8587a",
    "container_image_name": "quay.io/prometheus/prometheus:v2.18.1",
    "container_image_id": "de242295e2257c37c8cadfd962369228f8f10b2d48a44259b65fef44ad4f6490",
    "version": "2.18.1",
    "started": "2022-08-19T16:59:03.538981Z",
    "created": "2022-08-19T03:38:01.567604Z",
    "deployed": "2022-08-19T03:38:00.983603Z",
    "configured": "2022-08-19T16:59:03.193607Z"
  },
  {
    "style": "cephadm:v1",
    "name": "node-exporter.ceph1",
"fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@node-exporter.ceph1", "enabled": true, "state": "running", "container_id": "00bf3ad29cce79e905e8533648ef38cbd232990fa9616aff1c0020b7b66d0cc0", "container_image_name": "quay.io/prometheus/node-exporter:v0.18.1", "container_image_id": "e5a616e4b9cf68dfcad7782b78e118be4310022e874d52da85c55923fb615f87", "version": "0.18.1", "started": "2022-08-19T03:37:55.232032Z", "created": "2022-08-19T03:37:47.711604Z", "deployed": "2022-08-19T03:37:47.155604Z", "configured": "2022-08-19T03:37:47.711604Z" }, { "style": "cephadm:v1", "name": "osd.0", "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@osd.0", "enabled": true, "state": "running", "container_id": "6b69046972dfbdb53665228258a15b13bc13a462ca4e066a4eca0cd593442d2d", "container_image_name": "quay.io/ceph/ceph:v15", "container_image_id": "93146564743febec815d6a764dad93fc07ce971e88315403ac508cb5da6d35f4", "version": "15.2.17", "started": "2022-08-19T15:55:20.580157Z", "created": "2022-08-19T15:55:19.725766Z", "deployed": "2022-08-19T15:55:19.125766Z", "configured": "2022-08-31T02:53:34.760643Z" }, { "style": "cephadm:v1", "name": "crash.ceph1", "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@crash.ceph1", "enabled": true, "state": "running", "container_id": "6bc56f478ccb96841fe86a540e284c175300b83dad9e906ae3230f22341c8293", "container_image_name": "quay.io/ceph/ceph:v15", "container_image_id": "93146564743febec815d6a764dad93fc07ce971e88315403ac508cb5da6d35f4", "version": "15.2.17", "started": "2022-08-19T03:37:17.660080Z", "created": "2022-08-19T03:37:17.559605Z", "deployed": "2022-08-19T03:37:16.991605Z", "configured": "2022-08-19T03:37:17.559605Z" }, { "style": "cephadm:v1", "name": "mon.ceph1", "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@mon.ceph1 ", "enabled": true, "state": "running", "container_id": "d0f03130491daebbe783c4990c6a4383d49e7a0e2bdf8c5d1eed012865e5d875", "container_image_name": "quay.io/ceph/ceph:v15", "container_image_id": "93146564743febec815d6a764dad93fc07ce971e88315403ac508cb5da6d35f4", "version": "15.2.17", "started": "2022-08-19T03:36:21.804129Z", "created": "2022-08-19T03:36:19.743608Z", "deployed": "2022-08-19T03:36:18.439608Z", "configured": "2022-08-19T03:38:05.931603Z" }, { "style": "cephadm:v1", "name": "mgr.ceph1.xmbvsb", "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@mgr.ceph1.xmbvsb", "enabled": true, "state": "stopped", "container_id": null, "container_image_name": "quay.io/ceph/ceph:v15", "container_image_id": null, "version": null, "started": null, "created": "2022-08-19T03:36:22.815608Z", "deployed": "2022-08-19T03:36:22.239608Z", "configured": "2022-08-19T03:38:06.487603Z" }, { "style": "cephadm:v1", "name": "osd.4", "fsid": "f270ad9e-1f6f-11ed-b6f8-a539d87379ea", "systemd_unit": "ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@osd.4", "enabled": true, "state": "running", "container_id": "938840fe7fd0cb45cc26d077837c9847d7c7a7a68c7e1588d4bb4343c695a071", "container_image_name": "quay.io/ceph/ceph:v15", "container_image_id": "93146564743febec815d6a764dad93fc07ce971e88315403ac508cb5da6d35f4", "version": "15.2.17", "started": "2022-08-31T03:20:55.416219Z", "created": "2022-08-23T21:46:49.458533Z", "deployed": "2022-08-23T21:46:48.818533Z", "configured": "2022-08-31T02:53:41.196643Z" 
  }
]

I have noticed one more thing: I ran docker stop <container_id_of_mgr> on the ceph1 node, and now my mgr container has disappeared. I can't see it anywhere, and I'm not sure how to bring the mgr back, because the upgrade won't let me do anything unless I have two mgr instances. (I have put a rough sketch of the commands I am thinking of trying at the very bottom of this mail, below the quoted thread.)

root@ceph1:~# ceph -s
  cluster:
    id:     f270ad9e-1f6f-11ed-b6f8-a539d87379ea
    health: HEALTH_WARN
            4 stray daemon(s) not managed by cephadm

  services:
    mon: 1 daemons, quorum ceph1 (age 17h)
    mgr: ceph2.hmbdla(active, since 5h)
    osd: 6 osds: 6 up (since 40h), 6 in (since 8d)

  data:
    pools:   6 pools, 161 pgs
    objects: 20.59k objects, 85 GiB
    usage:   174 GiB used, 826 GiB / 1000 GiB avail
    pgs:     161 active+clean

  io:
    client:   0 B/s rd, 12 KiB/s wr, 0 op/s rd, 2 op/s wr

  progress:
    Upgrade to quay.io/ceph/ceph:16.2.10 (0s)
      [............................]

I can see mgr count:2 in the service list below, but I'm not sure how to bring the second mgr back.

root@ceph1:~# ceph orch ls
NAME                       PORTS        RUNNING  REFRESHED  AGE  PLACEMENT
alertmanager               ?:9093,9094      1/1  20s ago    13d  count:1
crash                                       2/2  20s ago    13d  *
grafana                    ?:3000           1/1  20s ago    13d  count:1
mgr                                         2/2  20s ago    13d  count:2
mon                                         0/5  -          13d  <unmanaged>
node-exporter              ?:9100           2/2  20s ago    13d  *
osd                                           6  20s ago    -    <unmanaged>
osd.all-available-devices                     0  -          13d  *
osd.osd_spec_default                          0  -          8d   *
prometheus                 ?:9095           1/1  20s ago    13d  count:1

On Thu, Sep 1, 2022 at 12:28 PM Adam King <adking@xxxxxxxxxx> wrote:

> Are there any extra directories in /var/lib/ceph or /var/lib/ceph/<fsid>
> that appear to be for those OSDs on that host? When cephadm builds the info
> it uses for "ceph orch ps" it's actually scraping those directories. The
> output of "cephadm ls" on the host with the duplicates could also
> potentially have some insights.
>
> On Thu, Sep 1, 2022 at 12:15 PM Satish Patel <satish.txt@xxxxxxxxx> wrote:
>
>> Folks,
>>
>> I am playing with cephadm, and life was good until I started upgrading from
>> Octopus to Pacific. My upgrade process got stuck after upgrading the mgr, and
>> in the logs I can now see the following error:
>>
>> root@ceph1:~# ceph log last cephadm
>> 2022-09-01T14:40:45.739804+0000 mgr.ceph2.hmbdla (mgr.265806) 8 :
>> cephadm [INF] Deploying daemon grafana.ceph1 on ceph1
>> 2022-09-01T14:40:56.115693+0000 mgr.ceph2.hmbdla (mgr.265806) 14 :
>> cephadm [INF] Deploying daemon prometheus.ceph1 on ceph1
>> 2022-09-01T14:41:11.856725+0000 mgr.ceph2.hmbdla (mgr.265806) 25 :
>> cephadm [INF] Reconfiguring alertmanager.ceph1 (dependencies changed)...
>> 2022-09-01T14:41:11.861535+0000 mgr.ceph2.hmbdla (mgr.265806) 26 :
>> cephadm [INF] Reconfiguring daemon alertmanager.ceph1 on ceph1
>> 2022-09-01T14:41:12.927852+0000 mgr.ceph2.hmbdla (mgr.265806) 27 :
>> cephadm [INF] Reconfiguring grafana.ceph1 (dependencies changed)...
>> 2022-09-01T14:41:12.940615+0000 mgr.ceph2.hmbdla (mgr.265806) 28 :
>> cephadm [INF] Reconfiguring daemon grafana.ceph1 on ceph1
>> 2022-09-01T14:41:14.056113+0000 mgr.ceph2.hmbdla (mgr.265806) 33 :
>> cephadm [INF] Found duplicate OSDs: osd.2 in status running on ceph1,
>> osd.2 in status running on ceph2
>> 2022-09-01T14:41:14.056437+0000 mgr.ceph2.hmbdla (mgr.265806) 34 :
>> cephadm [INF] Found duplicate OSDs: osd.5 in status running on ceph1,
>> osd.5 in status running on ceph2
>> 2022-09-01T14:41:14.056630+0000 mgr.ceph2.hmbdla (mgr.265806) 35 :
>> cephadm [INF] Found duplicate OSDs: osd.3 in status running on ceph1,
>> osd.3 in status running on ceph2
>>
>> I'm not sure where the duplicate names came from or how that happened.
>> In the following output I can't see any duplication:
>>
>> root@ceph1:~# ceph osd tree
>> ID  CLASS  WEIGHT   TYPE NAME       STATUS  REWEIGHT  PRI-AFF
>> -1         0.97656  root default
>> -3         0.48828      host ceph1
>>  4    hdd  0.09769          osd.4       up   1.00000  1.00000
>>  0    ssd  0.19530          osd.0       up   1.00000  1.00000
>>  1    ssd  0.19530          osd.1       up   1.00000  1.00000
>> -5         0.48828      host ceph2
>>  5    hdd  0.09769          osd.5       up   1.00000  1.00000
>>  2    ssd  0.19530          osd.2       up   1.00000  1.00000
>>  3    ssd  0.19530          osd.3       up   1.00000  1.00000
>>
>> But at the same time I can see duplicate OSD entries on ceph1 and ceph2:
>>
>> root@ceph1:~# ceph orch ps
>> NAME                 HOST   PORTS        STATUS         REFRESHED  AGE  MEM USE  MEM LIM  VERSION  IMAGE ID      CONTAINER ID
>> alertmanager.ceph1   ceph1  *:9093,9094  running (20s)  2s ago     20s    17.1M        -           ba2b418f427c  856a4fe641f1
>> alertmanager.ceph1   ceph2  *:9093,9094  running (20s)  3s ago     20s    17.1M        -           ba2b418f427c  856a4fe641f1
>> crash.ceph2          ceph1               running (12d)  2s ago     12d    10.0M        -  15.2.17  93146564743f  0a009254afb0
>> crash.ceph2          ceph2               running (12d)  3s ago     12d    10.0M        -  15.2.17  93146564743f  0a009254afb0
>> grafana.ceph1        ceph1  *:3000       running (18s)  2s ago     19s    47.9M        -  8.3.5    dad864ee21e9  7d7a70b8ab7f
>> grafana.ceph1        ceph2  *:3000       running (18s)  3s ago     19s    47.9M        -  8.3.5    dad864ee21e9  7d7a70b8ab7f
>> mgr.ceph2.hmbdla     ceph1               running (13h)  2s ago     12d     506M        -  16.2.10  0d668911f040  6274723c35f7
>> mgr.ceph2.hmbdla     ceph2               running (13h)  3s ago     12d     506M        -  16.2.10  0d668911f040  6274723c35f7
>> node-exporter.ceph2  ceph1               running (91m)  2s ago     12d    60.7M        -  0.18.1   e5a616e4b9cf  d0ba04bb977c
>> node-exporter.ceph2  ceph2               running (91m)  3s ago     12d    60.7M        -  0.18.1   e5a616e4b9cf  d0ba04bb977c
>> osd.2                ceph1               running (12h)  2s ago     12d     867M    4096M  15.2.17  93146564743f  e286fb1c6302
>> osd.2                ceph2               running (12h)  3s ago     12d     867M    4096M  15.2.17  93146564743f  e286fb1c6302
>> osd.3                ceph1               running (12h)  2s ago     12d     978M    4096M  15.2.17  93146564743f  d3ae5d9f694f
>> osd.3                ceph2               running (12h)  3s ago     12d     978M    4096M  15.2.17  93146564743f  d3ae5d9f694f
>> osd.5                ceph1               running (12h)  2s ago     8d      225M    4096M  15.2.17  93146564743f  405068fb474e
>> osd.5                ceph2               running (12h)  3s ago     8d      225M    4096M  15.2.17  93146564743f  405068fb474e
>> prometheus.ceph1     ceph1  *:9095       running (8s)   2s ago     8s     30.4M        -           514e6a882f6e  9031dbe30cae
>> prometheus.ceph1     ceph2  *:9095       running (8s)   3s ago     8s     30.4M        -           514e6a882f6e  9031dbe30cae
>>
>> Is this a bug or did I do something wrong? Is there any workaround to get out
>> of this condition?
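
Coming back to Adam's point that cephadm builds the "ceph orch ps" view by scraping /var/lib/ceph/<fsid>: on ceph1 the only OSD directories are osd.0, osd.1 and osd.4 (see the listing at the top of this mail), so I don't see an obvious stale directory for osd.2, osd.3 or osd.5 there. For the record, this is the rough checklist I am planning to work through; these are only my guesses, not something confirmed in this thread, so corrections welcome:

# run on each host and compare against "ceph osd tree" to spot daemon
# directories that ended up on the wrong host
cephadm ls | grep '"name"'

# ask the mgr to re-scrape the hosts instead of showing cached data
ceph orch ps --refresh

# only if a stale directory for e.g. osd.2 really does show up on ceph1
# (i.e. on the host where that OSD does NOT actually run) -- my assumption
# is that it could be removed with:
cephadm rm-daemon --name osd.2 --fsid f270ad9e-1f6f-11ed-b6f8-a539d87379ea --force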
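
And here is the sketch I mentioned above for the missing second mgr. Since "cephadm ls" on ceph1 still lists mgr.ceph1.xmbvsb with an enabled systemd unit in state "stopped", my assumption is that one of the following should bring it back; these are untested guesses on my side, so please tell me if this is the wrong approach:

# on ceph1: try starting the existing unit that "cephadm ls" reports as stopped
systemctl start ceph-f270ad9e-1f6f-11ed-b6f8-a539d87379ea@mgr.ceph1.xmbvsb

# or ask the orchestrator to start/redeploy that daemon
ceph orch daemon start mgr.ceph1.xmbvsb
ceph orch daemon redeploy mgr.ceph1.xmbvsb

# the mgr service spec already says count:2, so re-applying the placement
# should also recreate a second mgr if the old one is really gone
ceph orch apply mgr 2

# then check whether the upgrade continues
ceph orch upgrade status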