On Tue, 24 Jul 2012, Yehuda Sadeh wrote: > On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@xxxxxxxxxxx> wrote: > > From: caleb miles <caleb.miles@xxxxxxxxxxx> > > > > The server side recently added support for tuning some magic > > crush variables. Decode these variables if they are present, or use the > > default values if they are not present. > > > > Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5. > > > > Signed-off-by: caleb miles <caleb.miles@xxxxxxxxxxx> > > Reviewed-by: Sage Weil <sage@xxxxxxxxxxx> > > --- > > include/linux/ceph/ceph_features.h | 4 ++- > > include/linux/crush/crush.h | 8 +++++++ > > net/ceph/crush/mapper.c | 13 ++++++----- > > net/ceph/osdmap.c | 39 ++++++++++++++++++++++++++++++++++++ > > 4 files changed, 57 insertions(+), 7 deletions(-) > > > > diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h > > index 342f93d..df25dcf 100644 > > --- a/include/linux/ceph/ceph_features.h > > +++ b/include/linux/ceph/ceph_features.h > > @@ -12,12 +12,14 @@ > > #define CEPH_FEATURE_MONNAMES (1<<5) > > #define CEPH_FEATURE_RECONNECT_SEQ (1<<6) > > #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) > > +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) > > any reason why this is 18 and not 8? 9-17 are used.. just not implemented/used by the kernel code. > > > > > /* > > * Features supported. > > */ > > #define CEPH_FEATURES_SUPPORTED_DEFAULT \ > > - (CEPH_FEATURE_NOSRCADDR) > > + (CEPH_FEATURE_NOSRCADDR | \ > > + CEPH_FEATURE_CRUSH_TUNABLES) > > > > #define CEPH_FEATURES_REQUIRED_DEFAULT \ > > (CEPH_FEATURE_NOSRCADDR) > > diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h > > index 7c47508..25baa28 100644 > > --- a/include/linux/crush/crush.h > > +++ b/include/linux/crush/crush.h > > @@ -154,6 +154,14 @@ struct crush_map { > > __s32 max_buckets; > > __u32 max_rules; > > __s32 max_devices; > > + > > + /* choose local retries before re-descent */ > > + __u32 choose_local_tries; > > + /* choose local attempts using a fallback permutation before > > + * re-descent */ > > + __u32 choose_local_fallback_tries; > > + /* choose attempts before giving up */ > > + __u32 choose_total_tries; > > }; > > > > > > diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c > > index d7edc24..35fce75 100644 > > --- a/net/ceph/crush/mapper.c > > +++ b/net/ceph/crush/mapper.c > > @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map, > > int item = 0; > > int itemtype; > > int collide, reject; > > - const unsigned int orig_tries = 5; /* attempts before we fall back to search */ > > > > dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", > > bucket->id, x, outpos, numrep); > > @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map, > > reject = 1; > > goto reject; > > } > > - if (flocal >= (in->size>>1) && > > - flocal > orig_tries) > > + if (map->choose_local_fallback_tries > 0 && > > + flocal >= (in->size>>1) && > > + flocal > map->choose_local_fallback_tries) > > is flocal right here or should it be ftotal? > > > item = bucket_perm_choose(in, x, r); > > else > > item = crush_bucket_choose(in, x, r); > > @@ -422,13 +422,14 @@ reject: > > ftotal++; > > flocal++; > > > > - if (collide && flocal < 3) > > + if (collide && flocal <= map->choose_local_tries) > > /* retry locally a few times */ > > retry_bucket = 1; > > - else if (flocal <= in->size + orig_tries) > > + else if (map->choose_local_fallback_tries > 0 && > > + flocal <= in->size + map->choose_local_fallback_tries) > > /* exhaustive bucket search */ > > retry_bucket = 1; > > - else if (ftotal < 20) > > + else if (ftotal <= map->choose_total_tries) > > /* then retry descent */ > > retry_descent = 1; > > else > > diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c > > index 9600674..3124b71 100644 > > --- a/net/ceph/osdmap.c > > +++ b/net/ceph/osdmap.c > > @@ -135,6 +135,21 @@ bad: > > return -EINVAL; > > } > > > > +static int skip_name_map(void **p, void *end) > > +{ > > + int len; > > + ceph_decode_32_safe(p, end, len ,bad); > > + while (len--) { > > + int strlen; > use u32 for strlen > > > + *p += sizeof(u32); > > + ceph_decode_32_safe(p, end, strlen, bad); > > + *p += strlen; > > +} > > + return 0; > > +bad: > > + return -EINVAL; > > +} > > + > > static struct crush_map *crush_decode(void *pbyval, void *end) > > { > > struct crush_map *c; > > @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) > > void **p = &pbyval; > > void *start = pbyval; > > u32 magic; > > + u32 num_name_maps; > > > > dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); > > > > @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) > > if (c == NULL) > > return ERR_PTR(-ENOMEM); > > > > + /* set tunables to default values */ > > + c->choose_local_tries = 2; > > + c->choose_local_fallback_tries = 5; > > + c->choose_total_tries = 19; > > + > > ceph_decode_need(p, end, 4*sizeof(u32), bad); > > magic = ceph_decode_32(p); > > if (magic != CRUSH_MAGIC) { > > @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end) > > } > > > > /* ignore trailing name maps. */ > > + for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { > > + err = skip_name_map(p, end); > > + if (err < 0) > > + goto done; > > + } > > + > > + /* tunables */ > > + ceph_decode_need(p, end, 3*sizeof(u32), done); > > + c->choose_local_tries = ceph_decode_32(p); > > + c->choose_local_fallback_tries = ceph_decode_32(p); > > + c->choose_total_tries = ceph_decode_32(p); > > + dout("crush decode tunable choose_local_tries = %d", > > + c->choose_local_tries); > > + dout("crush decode tunable choose_local_fallback_tries = %d", > > + c->choose_local_fallback_tries); > > + dout("crush decode tunable choose_total_tries = %d", > > + c->choose_total_tries); > > > > +done: > > dout("crush_decode success\n"); > > return c; > > > > -- > > 1.7.9 > > > > -- > > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > > the body of a message to majordomo@xxxxxxxxxxxxxxx > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html