Hi, we are currently testing LRC codes. I have a cluster set up with
3 racks and 4 hosts in each of them. What I want to achieve is a
storage-efficient erasure code (<= 200% raw usage) that also stays
available during a full rack outage. In (my) theory, that should work
with LRC k=6 m=3 l=3, crush-locality=rack and
crush-failure-domain=host. But when I tested it, the PGs of the pool
all go into the "down" state.

My reasoning: with k=6 data chunks and m=3 coding chunks, the data
should be reconstructable from any 6 of these 9 chunks. With l=3, LRC
splits these 9 chunks into 3 groups of 3 and creates one additional
locality chunk per group, so we end up with 3 groups of 4 chunks.
These 3 groups get distributed over the 3 racks, and the 4 chunks of
each group get distributed over the 4 hosts of a rack. I thought that
on a full rack outage, the 6 remaining k/m chunks in the other 2
racks should still be enough to keep the pool available, and the
cluster could proceed in a degraded state.
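For reference, here is my mental model expressed in the LRC plugin's
low-level syntax (the mapping/layers parameters from the
erasure-code-lrc docs). The layer strings below are my own assumption
of what the simple k=6 m=3 l=3 profile expands to, by analogy with
the k=4 m=2 l=3 example in the docs; I have not verified the
generated profile, and the crush placement steps are omitted:

# Assumed (unverified) low-level equivalent of k=6 m=3 l=3:
# 12 chunks, 'D' = data, 'c' = coding, '_' = unused in that layer.
# group 1 = chunks 0-3, group 2 = chunks 4-7, group 3 = chunks 8-11
# global layer  _cDD_cDD_cDD   (k=6/m=3 over the 9 k+m chunks)
# local layers  cDDD________ / ____cDDD____ / ________cDDD
#               (one locality chunk per group)
ceph osd erasure-code-profile set lrc_lowlevel_profile plugin=lrc \
    mapping=__DD__DD__DD \
    layers='[
        [ "_cDD_cDD_cDD", "" ],
        [ "cDDD________", "" ],
        [ "____cDDD____", "" ],
        [ "________cDDD", "" ]
    ]'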
But in practice it does not survive the rack outage, so I guess my
thinking is wrong :) I wonder what the reason is. Is it maybe some
min_size setting? The default min_size of this pool comes out as 7.
I also changed it to 6 (yes, I know one shouldn't do that in
production) but got the same result.
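For completeness, these are the usual pool commands for checking and
changing that setting:

ceph osd pool get lrc_individual_pool min_size
ceph osd pool set lrc_individual_pool min_size 6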
Below I've added some details about the cluster, the pool creation
and some pg dumps. Any ideas? Can someone explain why this does not
work, or suggest another way to achieve the described specifications?
Thanks!
############
Ceph version:
############
ceph --version
ceph version 16.2.11 (3cf40e2dca667f68c6ce3ff5cd94f01e711af894) pacific (stable)
#################
Creation of the pool:
#################
ceph osd erasure-code-profile set lrc_individual_profile plugin=lrc \
    k=6 m=3 l=3 crush-failure-domain=host crush-locality=rack \
    crush-root=default
ceph osd pool create lrc_individual_pool 1024 1024 erasure \
    lrc_individual_profile
ceph osd pool set lrc_individual_pool pg_num 1024
ceph osd pool set lrc_individual_pool pg_num_min 1024
ceph osd pool set lrc_individual_pool pgp_num 1024
ceph osd pool set lrc_individual_pool pg_autoscale_mode warn
ceph osd pool set lrc_individual_pool bulk true
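To double-check what ended up in the profile, it can be dumped like
this (output omitted for brevity):

ceph osd erasure-code-profile get lrc_individual_profile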
##################
Resulting pool details:
##################
ceph osd pool ls detail
pool 72 'lrc_individual_pool' erasure profile lrc_individual_profile
size 12 min_size 7 crush_rule 1 object_hash rjenkins pg_num 1024
pgp_num 1024 autoscale_mode warn last_change 140484 flags
hashpspool,bulk stripe_width 24576 pg_num_min 1024
ceph osd pool get lrc_individual_pool all
size: 12
min_size: 7
pg_num: 1024
pgp_num: 1024
crush_rule: lrc_individual_pool
hashpspool: true
allow_ec_overwrites: false
nodelete: false
nopgchange: false
nosizechange: false
write_fadvise_dontneed: false
noscrub: false
nodeep-scrub: false
use_gmt_hitset: 1
erasure_code_profile: lrc_individual_profile
fast_read: 0
pg_autoscale_mode: warn
pg_num_min: 1024
bulk: true
#################
Resulting crush rule:
#################
ceph osd crush rule dump lrc_individual_pool
{
    "rule_id": 1,
    "rule_name": "lrc_individual_pool",
    "ruleset": 1,
    "type": 3,
    "min_size": 3,
    "max_size": 12,
    "steps": [
        {
            "op": "set_chooseleaf_tries",
            "num": 5
        },
        {
            "op": "set_choose_tries",
            "num": 100
        },
        {
            "op": "take",
            "item": -1,
            "item_name": "default"
        },
        {
            "op": "choose_indep",
            "num": 3,
            "type": "rack"
        },
        {
            "op": "chooseleaf_indep",
            "num": 4,
            "type": "host"
        },
        {
            "op": "emit"
        }
    ]
}
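The rule looks as expected to me: pick 3 racks, then 4 hosts in each.
To sanity-check the resulting mappings offline, something like this
should work (crushtool ships with ceph; rule id 1 and 12 shards as
above):

ceph osd getcrushmap -o /tmp/crushmap
crushtool -i /tmp/crushmap --test --rule 1 --num-rep 12 --show-mappings
# or --show-bad-mappings to only print incomplete mappings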
############################
Ceph status after the rack outage:
############################
cluster:
id: ...
health: HEALTH_WARN
96 osds down
4 hosts (96 osds) down
1 rack (96 osds) down
Reduced data availability: 1024 pgs inactive, 1024 pgs down
services:
mon: 3 daemons, quorum ...,...,... (age 4d)
mgr: ...(active, since 4d), standbys: ...,...
osd: 288 osds: 192 up (since 116s), 288 in (since 21h)
data:
pools: 2 pools, 1025 pgs
objects: 291 objects, 0 B
usage: 199 GiB used, 524 TiB / 524 TiB avail
pgs: 99.902% pgs not active
1024 down
1 active+clean
#################
Section of pg dump:
#################
PG     STATE  STATE_STAMP                      REPORTED    UP = ACTING                                           PRIMARY
72.32  down   2023-03-08T09:04:02.992141+0100  140549:52   [NONE,NONE,NONE,NONE,246,116,170,275,112,41,238,40]  246
72.33  down   2023-03-08T09:04:02.988083+0100  140549:134  [36,263,162,73,NONE,NONE,NONE,NONE,99,155,74,282]    36
72.31  down   2023-03-08T09:04:06.225241+0100  140549:136  [116,259,194,170,84,52,98,198,NONE,NONE,NONE,NONE]   116
(all object/byte/log counters are 0; versions and scrub versions are
0'0, scrub stamps 2023-03-08T08:48:49.712787+0100)
##############
Single pg query:
##############
ceph pg 72.33 query
{
"snap_trimq": "[]",
"snap_trimq_len": 0,
"state": "down",
"epoch": 140550,
"up": [
36,
263,
162,
73,
2147483647,
2147483647,
2147483647,
2147483647,
99,
155,
74,
282
],
"acting": [
36,
263,
162,
73,
2147483647,
2147483647,
2147483647,
2147483647,
99,
155,
74,
282
],
"info": {
"pgid": "72.33s0",
"last_update": "0'0",
"last_complete": "0'0",
"log_tail": "0'0",
"last_user_version": 0,
"last_backfill": "MAX",
"purged_snaps": [],
"history": {
"epoch_created": 140477,
"epoch_pool_created": 140477,
"last_epoch_started": 140516,
"last_interval_started": 140515,
"last_epoch_clean": 140516,
"last_interval_clean": 140515,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 140538,
"same_interval_since": 140538,
"same_primary_since": 140477,
"last_scrub": "0'0",
"last_scrub_stamp": "2023-03-08T08:48:49.712787+0100",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2023-03-08T08:48:49.712787+0100",
"last_clean_scrub_stamp": "2023-03-08T08:48:49.712787+0100",
"prior_readable_until_ub": 0
},
"stats": {
"version": "0'0",
"reported_seq": 135,
"reported_epoch": 140550,
"state": "down",
"last_fresh": "2023-03-08T09:07:15.104685+0100",
"last_change": "2023-03-08T09:04:02.988083+0100",
"last_active": "2023-03-08T09:04:02.337985+0100",
"last_peered": "2023-03-08T09:04:01.288586+0100",
"last_clean": "2023-03-08T09:04:01.288586+0100",
"last_became_active": "2023-03-08T08:57:08.464085+0100",
"last_became_peered": "2023-03-08T08:57:08.464085+0100",
"last_unstale": "2023-03-08T09:07:15.104685+0100",
"last_undegraded": "2023-03-08T09:07:15.104685+0100",
"last_fullsized": "2023-03-08T09:07:15.104685+0100",
"mapping_epoch": 140538,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 140477,
"last_epoch_clean": 140516,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "0'0",
"last_scrub_stamp": "2023-03-08T08:48:49.712787+0100",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2023-03-08T08:48:49.712787+0100",
"last_clean_scrub_stamp": "2023-03-08T08:48:49.712787+0100",
"log_size": 0,
"ondisk_log_size": 0,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0,
"num_omap_bytes": 0,
"num_omap_keys": 0,
"num_objects_repaired": 0
},
"up": [
36,
263,
162,
73,
2147483647,
2147483647,
2147483647,
2147483647,
99,
155,
74,
282
],
"acting": [
36,
263,
162,
73,
2147483647,
2147483647,
2147483647,
2147483647,
99,
155,
74,
282
],
"avail_no_missing": [],
"object_location_counts": [],
"blocked_by": [
173,
219,
236,
253
],
"up_primary": 36,
"acting_primary": 36,
"purged_snaps": []
},
"empty": 1,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 140516,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
"peer_info": [],
"recovery_state": [
{
"name": "Started/Primary/Peering/Down",
"enter_time": "2023-03-08T09:04:02.988076+0100",
"comment": "not enough up instances of this PG to go active"
},
{
"name": "Started/Primary/Peering",
"enter_time": "2023-03-08T09:04:02.987998+0100",
"past_intervals": [
{
"first": "140515",
"last": "140537",
"all_participants": [
{
"osd": 36,
"shard": 0
},
{
"osd": 73,
"shard": 3
},
{
"osd": 74,
"shard": 10
},
{
"osd": 99,
"shard": 8
},
{
"osd": 155,
"shard": 9
},
{
"osd": 162,
"shard": 2
},
{
"osd": 173,
"shard": 7
},
{
"osd": 219,
"shard": 6
},
{
"osd": 236,
"shard": 5
},
{
"osd": 253,
"shard": 4
},
{
"osd": 263,
"shard": 1
},
{
"osd": 282,
"shard": 11
}
],
"intervals": [
{
"first": "140515",
"last": "140536",
"acting":
"36(0),73(3),74(10),99(8),155(9),162(2),173(7),219(6),236(5),253(4),263(1),282(11)"
}
]
}
],
"probing_osds": [
"36(0)",
"73(3)",
"74(10)",
"99(8)",
"155(9)",
"162(2)",
"263(1)",
"282(11)"
],
"blocked": "peering is blocked due to down osds",
"down_osds_we_would_probe": [
173,
219,
236,
253
],
"peering_blocked_by": [
{
"osd": 173,
"current_lost_at": 0,
"comment": "starting or marking this osd lost
may let us proceed"
},
{
"osd": 219,
"current_lost_at": 0,
"comment": "starting or marking this osd lost
may let us proceed"
},
{
"osd": 236,
"current_lost_at": 0,
"comment": "starting or marking this osd lost
may let us proceed"
},
{
"osd": 253,
"current_lost_at": 0,
"comment": "starting or marking this osd lost
may let us proceed"
}
]
},
{
"name": "Started",
"enter_time": "2023-03-08T09:04:02.987942+0100"
}
],
"agent_state": {}
}
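As the query shows, peering is blocked by the four down OSDs of the
failed rack (173, 219, 236, 253). I did not mark them lost, since in
a real outage that would declare their data permanently gone; for
reference, the command the "peering_blocked_by" comment refers to
would be something like:

# WARNING: tells the cluster the OSD's data is gone for good
ceph osd lost 173 --yes-i-really-mean-it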