Hi,

we are currently testing LRC codes and I have a cluster set up with 3 racks and 4 hosts in each of them. What I want to achieve is a storage-efficient erasure code (raw usage <= 200%) that also stays available during a rack outage. In my theory, that should work with an LRC profile k=6, m=3, l=3 with crush-locality=rack and crush-failure-domain=host. But when I tested it, all PGs of the pool go into the "down" state.

My reasoning: with k=6 data chunks and m=3 coding chunks, the data should be reconstructable from any 6 of these 9 chunks. With l=3, LRC splits these 9 chunks into 3 groups of 3 and creates one additional locality chunk per group, so we end up with 3 groups of 4 chunks. These 3 groups get distributed over the 3 racks, and the 4 chunks of each group get distributed over the 4 hosts of a rack. (I have tried to spell this grouping out in the low-level LRC notation below, right after the crush rule dump.) I thought that on a full rack outage the 6 remaining k/m chunks in the other 2 racks should still be enough to keep the data available, and the cluster could proceed in a degraded state. But it does not, so I guess my thinking is wrong :)

I wonder what the reason for this is - is it maybe some min_size setting? The default min_size of this pool becomes 7; I also changed it to 6 (yes, I know one shouldn't do that in production) but got the same result.

Below I've added some details about the cluster, the pool creation and the pg dumps. Any ideas? Can someone explain why this does not work, or suggest another way to achieve the described specifications?

Thanks!

############
Ceph version:
############

ceph --version
ceph version 16.2.11 (3cf40e2dca667f68c6ce3ff5cd94f01e711af894) pacific (stable)

#################
Creation of the pool:
#################

ceph osd erasure-code-profile set lrc_individual_profile plugin=lrc k=6 m=3 l=3 crush-failure-domain=host crush-locality=rack crush-root=default
ceph osd pool create lrc_individual_pool 1024 1024 erasure lrc_individual_profile
ceph osd pool set lrc_individual_pool pg_num 1024
ceph osd pool set lrc_individual_pool pg_num_min 1024
ceph osd pool set lrc_individual_pool pgp_num 1024
ceph osd pool set lrc_individual_pool pg_autoscale_mode warn
ceph osd pool set lrc_individual_pool bulk true

##################
Resulting pool details:
##################

ceph osd pool ls detail
pool 72 'lrc_individual_pool' erasure profile lrc_individual_profile size 12 min_size 7 crush_rule 1 object_hash rjenkins pg_num 1024 pgp_num 1024 autoscale_mode warn last_change 140484 flags hashpspool,bulk stripe_width 24576 pg_num_min 1024

ceph osd pool get lrc_individual_pool all
size: 12
min_size: 7
pg_num: 1024
pgp_num: 1024
crush_rule: lrc_individual_pool
hashpspool: true
allow_ec_overwrites: false
nodelete: false
nopgchange: false
nosizechange: false
write_fadvise_dontneed: false
noscrub: false
nodeep-scrub: false
use_gmt_hitset: 1
erasure_code_profile: lrc_individual_profile
fast_read: 0
pg_autoscale_mode: warn
pg_num_min: 1024
bulk: true

#################
Resulting crush rule:
#################

ceph osd crush rule dump lrc_individual_pool
{
    "rule_id": 1,
    "rule_name": "lrc_individual_pool",
    "ruleset": 1,
    "type": 3,
    "min_size": 3,
    "max_size": 12,
    "steps": [
        {
            "op": "set_chooseleaf_tries",
            "num": 5
        },
        {
            "op": "set_choose_tries",
            "num": 100
        },
        {
            "op": "take",
            "item": -1,
            "item_name": "default"
        },
        {
            "op": "choose_indep",
            "num": 3,
            "type": "rack"
        },
        {
            "op": "chooseleaf_indep",
            "num": 4,
            "type": "host"
        },
        {
            "op": "emit"
        }
    ]
}
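As mentioned further up, this is how I picture the grouping in the low-level LRC notation. This is purely my own reconstruction, extrapolated from the k=4/m=2/l=3 example in the LRC plugin docs - I have not verified that the plugin really generates these layers from the simple profile, and the profile name here is just a placeholder:

# My (untested) guess at the low-level equivalent of k=6 m=3 l=3 with
# crush-locality=rack and crush-failure-domain=host.
# Per rack: 1 local parity chunk, 1 global coding chunk, 2 data chunks.
ceph osd erasure-code-profile set lrc_lowlevel_guess_profile \
    plugin=lrc \
    mapping=__DD__DD__DD \
    layers='[
              [ "_cDD_cDD_cDD", "" ],
              [ "cDDD________", "" ],
              [ "____cDDD____", "" ],
              [ "________cDDD", "" ],
            ]' \
    crush-steps='[
                   [ "choose", "rack", 3 ],
                   [ "chooseleaf", "host", 4 ],
                 ]'

If that reading is correct, a full rack outage takes out exactly one group of 4 chunks, and my expectation was that the remaining 8 chunks (6 of them k/m chunks) would still be enough to serve the data.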
############################
Ceph status after the rack outage:
############################

  cluster:
    id:     ...
    health: HEALTH_WARN
            96 osds down
            4 hosts (96 osds) down
            1 rack (96 osds) down
            Reduced data availability: 1024 pgs inactive, 1024 pgs down

  services:
    mon: 3 daemons, quorum ...,...,... (age 4d)
    mgr: ...(active, since 4d), standbys: ...,...
    osd: 288 osds: 192 up (since 116s), 288 in (since 21h)

  data:
    pools:   2 pools, 1025 pgs
    objects: 291 objects, 0 B
    usage:   199 GiB used, 524 TiB / 524 TiB avail
    pgs:     99.902% pgs not active
             1024 down
             1    active+clean

#################
Section of pg dump:
#################

72.32 0 0 0 0 0 0 0 0 0 0 down 2023-03-08T09:04:02.992141+0100 0'0 140549:52 [NONE,NONE,NONE,NONE,246,116,170,275,112,41,238,40] 246 [NONE,NONE,NONE,NONE,246,116,170,275,112,41,238,40] 246 0'0 2023-03-08T08:48:49.712787+0100 0'0 2023-03-08T08:48:49.712787+0100 0
72.33 0 0 0 0 0 0 0 0 0 0 down 2023-03-08T09:04:02.988083+0100 0'0 140549:134 [36,263,162,73,NONE,NONE,NONE,NONE,99,155,74,282] 36 [36,263,162,73,NONE,NONE,NONE,NONE,99,155,74,282] 36 0'0 2023-03-08T08:48:49.712787+0100 0'0 2023-03-08T08:48:49.712787+0100 0
72.31 0 0 0 0 0 0 0 0 0 0 down 2023-03-08T09:04:06.225241+0100 0'0 140549:136 [116,259,194,170,84,52,98,198,NONE,NONE,NONE,NONE] 116 [116,259,194,170,84,52,98,198,NONE,NONE,NONE,NONE] 116 0'0 2023-03-08T08:48:49.712787+0100 0'0 2023-03-08T08:48:49.712787+0100 0
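As far as I can tell, every down PG looks like the three above, i.e. it is missing a run of four shards matching the four hosts of the failed rack. A quick-and-dirty way to check that against the plain pg dump (just pattern matching on four consecutive NONE entries, so treat it as a rough check only):

# Rough check: count pool-72 PG lines whose up/acting set contains a run of
# four NONE shards (one whole rack's worth of chunks missing).
ceph pg dump pgs 2>/dev/null | grep '^72\.' | grep -c 'NONE,NONE,NONE,NONE'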
"pin_stats_invalid": false, "manifest_stats_invalid": false, "snaptrimq_len": 0, "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0, "num_large_omap_objects": 0, "num_objects_manifest": 0, "num_omap_bytes": 0, "num_omap_keys": 0, "num_objects_repaired": 0 }, "up": [ 36, 263, 162, 73, 2147483647, 2147483647, 2147483647, 2147483647, 99, 155, 74, 282 ], "acting": [ 36, 263, 162, 73, 2147483647, 2147483647, 2147483647, 2147483647, 99, 155, 74, 282 ], "avail_no_missing": [], "object_location_counts": [], "blocked_by": [ 173, 219, 236, 253 ], "up_primary": 36, "acting_primary": 36, "purged_snaps": [] }, "empty": 1, "dne": 0, "incomplete": 0, "last_epoch_started": 140516, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, "peer_info": [], "recovery_state": [ { "name": "Started/Primary/Peering/Down", "enter_time": "2023-03-08T09:04:02.988076+0100", "comment": "not enough up instances of this PG to go active" }, { "name": "Started/Primary/Peering", "enter_time": "2023-03-08T09:04:02.987998+0100", "past_intervals": [ { "first": "140515", "last": "140537", "all_participants": [ { "osd": 36, "shard": 0 }, { "osd": 73, "shard": 3 }, { "osd": 74, "shard": 10 }, { "osd": 99, "shard": 8 }, { "osd": 155, "shard": 9 }, { "osd": 162, "shard": 2 }, { "osd": 173, "shard": 7 }, { "osd": 219, "shard": 6 }, { "osd": 236, "shard": 5 }, { "osd": 253, "shard": 4 }, { "osd": 263, "shard": 1 }, { "osd": 282, "shard": 11 } ], "intervals": [ { "first": "140515", "last": "140536", "acting": "36(0),73(3),74(10),99(8),155(9),162(2),173(7),219(6),236(5),253(4),263(1),282(11)" } ] } ], "probing_osds": [ "36(0)", "73(3)", "74(10)", "99(8)", "155(9)", "162(2)", "263(1)", "282(11)" ], "blocked": "peering is blocked due to down osds", "down_osds_we_would_probe": [ 173, 219, 236, 253 ], "peering_blocked_by": [ { "osd": 173, "current_lost_at": 0, "comment": "starting or marking this osd lost may let us proceed" }, { "osd": 219, "current_lost_at": 0, "comment": "starting or marking this osd lost may let us proceed" }, { "osd": 236, "current_lost_at": 0, "comment": "starting or marking this osd lost may let us proceed" }, { "osd": 253, "current_lost_at": 0, "comment": "starting or marking this osd lost may let us proceed" } ] }, { "name": "Started", "enter_time": "2023-03-08T09:04:02.987942+0100" } ], "agent_state": {} } _______________________________________________ ceph-users mailing list -- ceph-users@xxxxxxx To unsubscribe send an email to ceph-users-leave@xxxxxxx