``` Subvolumes do not "inherit" the distributed ephemeral pin. What you should expect below is that the "csi" subvolumegroup will be fragmented and distributed across the ranks. Consequently, the subvolumes will also be distributed across ranks as part of the subtrees rooted at each fragment of the "csi" subvolumegroup (directory). ``` How is subvolumegroup fragmentation handled? Are the subvolumes equally distributed across all available active MDS? In the following scenario, will 3 of the subvolumes be mapped to each of the MDS? Will setting the ephemeral distributed pin on Subvolumegroup ensure that the subvolumes in it will be equally distributed across MDS ? We are looking at ceph-csi use case for Kubernetes. PVs (subvolumes) are dynamically created by Kubernetes. # Ceph FS configuration ceph fs subvolumegroup create midline-a csi ceph fs subvolumegroup pin midline-a csi distributed 1 ceph fs subvolume create midline-a subvol1 csi ceph fs subvolume create midline-a subvol2 csi ceph fs subvolume create midline-a subvol3 csi ceph fs subvolume create midline-a subvol4 csi ceph fs subvolume create midline-a subvol5 csi ceph fs subvolume create midline-a subvol6 csi # ceph fs ls name: midline-a, metadata pool: fs-midline-metadata-a, data pools: [ fs-midline-data-a ] # ceph fs subvolumegroup ls midline-a [ { "name": "csi" } ] # ceph fs subvolume ls midline-a csi [ { "name": "subvol4" }, { "name": "subvol2" }, { "name": "subvol3" }, { "name": "subvol5" }, { "name": "subvol6" }, { "name": "subvol1" } ] # ceph fs status midline-a - 2 clients ========= RANK STATE MDS ACTIVITY DNS INOS DIRS CAPS 0 active midline.server1.njyfcn Reqs: 0 /s 514 110 228 36 1 active midline.server2.lpnjmx Reqs: 0 /s 47 22 17 6 POOL TYPE USED AVAIL fs-midline-metadata-a metadata 25.4M 25.9T fs-midline-data-a data 216k 25.9T STANDBY MDS midline.server3.wsbxsh MDS version: ceph version 19.2.0 (16063ff2022298c9300e49a547a16ffda59baf13) squid (stable) Following are the subtrees output from the 
MDSs. The directory fragments do not seem to be equally mapped to MDS. # ceph tell mds.midline.server1.njyfcn get subtrees | jq [ { "is_auth": true, "auth_first": 0, "auth_second": -2, "export_pin": -1, "distributed_ephemeral_pin": false, "random_ephemeral_pin": false, "export_pin_target": -1, "dir": { "path": "", "dirfrag": "0x1", "snapid_first": 2, "projected_version": "1240", "version": "1240", "committing_version": "0", "committed_version": "0", "is_rep": false, "dir_auth": "0", "states": [ "auth", "dirty", "complete" ], "is_auth": true, "auth_state": { "replicas": { "1": 1 } }, "replica_state": { "authority": [ 0, -2 ], "replica_nonce": 0 }, "auth_pins": 0, "is_frozen": false, "is_freezing": false, "pins": { "child": 1, "subtree": 1, "subtreetemp": 0, "replicated": 1, "dirty": 1, "waiter": 0, "authpin": 0 }, "nref": 4 } }, { "is_auth": true, "auth_first": 0, "auth_second": -2, "export_pin": -1, "distributed_ephemeral_pin": false, "random_ephemeral_pin": false, "export_pin_target": -1, "dir": { "path": "~mds0", "dirfrag": "0x100", "snapid_first": 2, "projected_version": "1232", "version": "1232", "committing_version": "0", "committed_version": "0", "is_rep": false, "dir_auth": "0", "states": [ "auth", "dirty", "complete" ], "is_auth": true, "auth_state": { "replicas": {} }, "replica_state": { "authority": [ 0, -2 ], "replica_nonce": 0 }, "auth_pins": 0, "is_frozen": false, "is_freezing": false, "pins": { "child": 1, "subtree": 1, "subtreetemp": 0, "dirty": 1, "waiter": 0, "authpin": 0 }, "nref": 3 } }, { "is_auth": false, "auth_first": 1, "auth_second": -2, "export_pin": -1, "distributed_ephemeral_pin": true, "random_ephemeral_pin": false, "export_pin_target": 1, "dir": { "path": "/volumes/csi", "dirfrag": "0x100000006ae.11*", "snapid_first": 2, "projected_version": "50", "version": "50", "committing_version": "50", "committed_version": "50", "is_rep": false, "dir_auth": "1", "states": [], "is_auth": false, "auth_state": { "replicas": {} }, "replica_state": { 
"authority": [ 1, -2 ], "replica_nonce": 1 }, "auth_pins": 0, "is_frozen": false, "is_freezing": false, "pins": { "ptrwaiter": 0, "request": 0, "child": 0, "frozen": 0, "subtree": 1, "replicated": 0, "dirty": 0, "waiter": 0, "authpin": 0, "tempexporting": 0 }, "nref": 1 } }, { "is_auth": true, "auth_first": 0, "auth_second": -2, "export_pin": -1, "distributed_ephemeral_pin": true, "random_ephemeral_pin": false, "export_pin_target": 0, "dir": { "path": "/volumes/csi", "dirfrag": "0x100000006ae.10*", "snapid_first": 2, "projected_version": "52", "version": "52", "committing_version": "50", "committed_version": "50", "is_rep": false, "dir_auth": "0", "states": [ "auth", "dirty", "complete" ], "is_auth": true, "auth_state": { "replicas": {} }, "replica_state": { "authority": [ 0, -2 ], "replica_nonce": 0 }, "auth_pins": 0, "is_frozen": false, "is_freezing": false, "pins": { "subtree": 1, "dirty": 1, "waiter": 0, "authpin": 0 }, "nref": 2 } }, { "is_auth": true, "auth_first": 0, "auth_second": -2, "export_pin": -1, "distributed_ephemeral_pin": true, "random_ephemeral_pin": false, "export_pin_target": 0, "dir": { "path": "/volumes/csi", "dirfrag": "0x100000006ae.01*", "snapid_first": 2, "projected_version": "136", "version": "136", "committing_version": "82", "committed_version": "82", "is_rep": false, "dir_auth": "0", "states": [ "auth", "dirty", "complete" ], "is_auth": true, "auth_state": { "replicas": { "1": 1 } }, "replica_state": { "authority": [ 0, -2 ], "replica_nonce": 0 }, "auth_pins": 0, "is_frozen": false, "is_freezing": false, "pins": { "child": 1, "frozen": 0, "subtree": 1, "replicated": 1, "dirty": 1, "authpin": 0 }, "nref": 4 } } ] # ceph tell mds.midline.server2.lpnjmx get subtrees | jq [ { "is_auth": true, "auth_first": 1, "auth_second": -2, "export_pin": -1, "distributed_ephemeral_pin": false, "random_ephemeral_pin": false, "export_pin_target": -1, "dir": { "path": "~mds1", "dirfrag": "0x101", "snapid_first": 2, "projected_version": "332", "version": 
"332", "committing_version": "0", "committed_version": "0", "is_rep": false, "dir_auth": "1", "states": [ "auth", "dirty", "complete" ], "is_auth": true, "auth_state": { "replicas": {} }, "replica_state": { "authority": [ 1, -2 ], "replica_nonce": 0 }, "auth_pins": 0, "is_frozen": false, "is_freezing": false, "pins": { "child": 1, "subtree": 1, "subtreetemp": 0, "dirty": 1, "waiter": 0, "authpin": 0 }, "nref": 3 } }, { "is_auth": true, "auth_first": 1, "auth_second": -2, "export_pin": -1, "distributed_ephemeral_pin": true, "random_ephemeral_pin": false, "export_pin_target": 1, "dir": { "path": "/volumes/csi", "dirfrag": "0x100000006ae.11*", "snapid_first": 2, "projected_version": "66", "version": "66", "committing_version": "50", "committed_version": "50", "is_rep": false, "dir_auth": "1", "states": [ "auth", "dirty", "complete" ], "is_auth": true, "auth_state": { "replicas": { "0": 1 } }, "replica_state": { "authority": [ 1, -2 ], "replica_nonce": 0 }, "auth_pins": 0, "is_frozen": false, "is_freezing": false, "pins": { "ptrwaiter": 0, "child": 1, "frozen": 0, "subtree": 1, "importing": 0, "replicated": 1, "dirty": 1, "authpin": 0 }, "nref": 4 } }, { "is_auth": false, "auth_first": 0, "auth_second": -2, "export_pin": -1, "distributed_ephemeral_pin": false, "random_ephemeral_pin": false, "export_pin_target": -1, "dir": { "path": "", "dirfrag": "0x1", "snapid_first": 2, "projected_version": "0", "version": "1216", "committing_version": "0", "committed_version": "0", "is_rep": false, "dir_auth": "0", "states": [], "is_auth": false, "auth_state": { "replicas": {} }, "replica_state": { "authority": [ 0, -2 ], "replica_nonce": 1 }, "auth_pins": 0, "is_frozen": false, "is_freezing": false, "pins": { "child": 1, "subtree": 1 }, "nref": 2 } } ] Regards, Rajmohan R On Wed, Nov 20, 2024 at 12:48 PM Patrick Donnelly <pdonnell@xxxxxxxxxx> wrote: > On Tue, Nov 19, 2024 at 8:38 PM Rajmohan Ramamoorthy > <ram.rajmohanr@xxxxxxxxx> wrote: > > > > CephFS subvolumes not inheriting 
ephemeral distributed pin configured on > > the subvolumegroups. > > Subvolumes do not "inherit" the distributed ephemeral pin. What you > should expect below is that the "csi" subvolumegroup will be > fragmented and distributed across the ranks. Consequently, the > subvolumes will also be distributed across ranks as part of the > subtrees rooted at each fragment of the "csi" subvolumegroup > (directory). > > > References: > > > > https://github.com/ceph/ceph/pull/35759 > > https://github.com/ceph/ceph/pull/43896 > > > > # Environment setup > > > > Ceph version: 18.2.4 and 19.2.0 > > Meaning you tested two versions? > > > CephFS with multiple MDS (2 active + 1 standby MDS) > > RHEL 9.4 > > Linux myserver 5.14.0-427.37.1.el9_4.x86_64 #1 SMP PREEMPT_DYNAMIC Fri > Sep > > 13 12:41:50 EDT 2024 x86_64 x86_64 x86_64 GNU/Linux > > > > # Ceph FS configuration > > ``` > > ceph config set mds mds_export_ephemeral_distributed true > > ceph fs set midline-a balance_automate false > > ``` > > # Subvolume group (csi) creation > > ``` > > ceph fs subvolumegroup create midline-a csi > > ``` > > # Subvolume group (csi) pinning (tried both methods) > > ``` > > ceph fs subvolumegroup pin midline-a csi distributed 1 > > > > setfattr -n ceph.dir.pin.distributed -v 1 setfattr -n > > ceph.dir.pin.distributed -v 1 /mnt/cephfs/volumes/csi/ > > ``` > > You ran setfattr and ` ceph fs subvolumegroup pin midline-a csi > distributed 1` ? 
> > > # Subvolume creation > > ``` > > ceph fs subvolume create midline-a subvol1 csi > > ceph fs subvolume create midline-a subvol2 csi > > ``` > > > > # Get attribute (subvolume group) > > ``` > > getfattr -n ceph.dir.pin.distributed /mnt/cephfs/volumes/csi > > > > getfattr: Removing leading '/' from absolute path names > > # file: mnt/cephfs/volumes/csi > > ceph.dir.pin.distributed="1" > > ``` > > > > # Get attribute (subvolume) - Subvolumes not inheriting the distributed > pin > > attribute > > > > ``` > > getfattr -n ceph.dir.pin.distributed /mnt/cephfs/volumes/csi/subvol1 > > > > getfattr: Removing leading '/' from absolute path names > > # file: mnt/cephfs/volumes/csi/subvol1 > > ceph.dir.pin.distributed="0" > > ``` > > > > ``` > > getfattr -n ceph.dir.pin.distributed /mnt/cephfs/volumes/csi/subvol2 > > > > getfattr: Removing leading '/' from absolute path names > > # file: mnt/cephfs/volumes/csi/subvol2 > > ceph.dir.pin.distributed="0" > > ``` > > these last two outputs are expected. > > > # MDS logs (subtree information) > > ``` > > ceph tell mds.abc123 get subtrees | jq '.[] | [.dir.path, .auth_first, > > .export_pin, .export_pin_target]' > > > > [ > > "~mds1", > > 1, > > -1, > > -1 > > ] > > [ > > "", > > 0, > > -1, > > -1 > > ] > > [ > > "/volumes/csi", > > 1, > > -1, > > 1 > > ] > > ``` > > > > ``` > > ceph tell mds.xyz123 get subtrees | jq '.[] | [.dir.path, .auth_first, > > .export_pin, .export_pin_target]' > > > > [ > > "/volumes/csi", > > 1, > > -1, > > 1 > > ] > > [ > > "", > > 0, > > -1, > > -1 > > ] > > [ > > "~mds0", > > 0, > > -1, > > -1 > > ] > > [ > > "/volumes/csi", > > 0, > > -1, > > 0 > > ] > > ``` > > It looks like the "csi" directory was split but the output is > incomplete. Please paste the full (anonymized) `get subtrees` output. > > -- > Patrick Donnelly, Ph.D. > He / Him / His > Red Hat Partner Engineer > IBM, Inc. 
> GPG: 19F28A586F808C2402351B93C3301A3E258DD79D > > _______________________________________________ ceph-users mailing list -- ceph-users@xxxxxxx To unsubscribe send an email to ceph-users-leave@xxxxxxx