Re: [PATCH 2/9] libceph: support crush tunables

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, 24 Jul 2012, Yehuda Sadeh wrote:
> On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <sage@xxxxxxxxxxx> wrote:
> > From: caleb miles <caleb.miles@xxxxxxxxxxx>
> >
> > The server side recently added support for tuning some magic
> > crush variables. Decode these variables if they are present, or use the
> > default values if they are not present.
> >
> > Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
> >
> > Signed-off-by: caleb miles <caleb.miles@xxxxxxxxxxx>
> > Reviewed-by: Sage Weil <sage@xxxxxxxxxxx>
> > ---
> >  include/linux/ceph/ceph_features.h |    4 ++-
> >  include/linux/crush/crush.h        |    8 +++++++
> >  net/ceph/crush/mapper.c            |   13 ++++++-----
> >  net/ceph/osdmap.c                  |   39 ++++++++++++++++++++++++++++++++++++
> >  4 files changed, 57 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
> > index 342f93d..df25dcf 100644
> > --- a/include/linux/ceph/ceph_features.h
> > +++ b/include/linux/ceph/ceph_features.h
> > @@ -12,12 +12,14 @@
> >  #define CEPH_FEATURE_MONNAMES       (1<<5)
> >  #define CEPH_FEATURE_RECONNECT_SEQ  (1<<6)
> >  #define CEPH_FEATURE_DIRLAYOUTHASH  (1<<7)
> > +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
> 
> any reason why this is 18 and not 8?

Bits 9-17 are already in use; they are just not implemented/used by the kernel code.

> 
> >
> >  /*
> >   * Features supported.
> >   */
> >  #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
> > -       (CEPH_FEATURE_NOSRCADDR)
> > +       (CEPH_FEATURE_NOSRCADDR |        \
> > +        CEPH_FEATURE_CRUSH_TUNABLES)
> >
> >  #define CEPH_FEATURES_REQUIRED_DEFAULT   \
> >         (CEPH_FEATURE_NOSRCADDR)
> > diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> > index 7c47508..25baa28 100644
> > --- a/include/linux/crush/crush.h
> > +++ b/include/linux/crush/crush.h
> > @@ -154,6 +154,14 @@ struct crush_map {
> >         __s32 max_buckets;
> >         __u32 max_rules;
> >         __s32 max_devices;
> > +
> > +       /* choose local retries before re-descent */
> > +       __u32 choose_local_tries;
> > +       /* choose local attempts using a fallback permutation before
> > +        * re-descent */
> > +       __u32 choose_local_fallback_tries;
> > +       /* choose attempts before giving up */
> > +       __u32 choose_total_tries;
> >  };
> >
> >
> > diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
> > index d7edc24..35fce75 100644
> > --- a/net/ceph/crush/mapper.c
> > +++ b/net/ceph/crush/mapper.c
> > @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
> >         int item = 0;
> >         int itemtype;
> >         int collide, reject;
> > -       const unsigned int orig_tries = 5; /* attempts before we fall back to search */
> >
> >         dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
> >                 bucket->id, x, outpos, numrep);
> > @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
> >                                         reject = 1;
> >                                         goto reject;
> >                                 }
> > -                               if (flocal >= (in->size>>1) &&
> > -                                   flocal > orig_tries)
> > +                               if (map->choose_local_fallback_tries > 0 &&
> > +                                   flocal >= (in->size>>1) &&
> > +                                   flocal > map->choose_local_fallback_tries)
> 
> is flocal right here or should it be ftotal?
> 
> >                                         item = bucket_perm_choose(in, x, r);
> >                                 else
> >                                         item = crush_bucket_choose(in, x, r);
> > @@ -422,13 +422,14 @@ reject:
> >                                         ftotal++;
> >                                         flocal++;
> >
> > -                                       if (collide && flocal < 3)
> > +                                       if (collide && flocal <= map->choose_local_tries)
> >                                                 /* retry locally a few times */
> >                                                 retry_bucket = 1;
> > -                                       else if (flocal <= in->size + orig_tries)
> > +                                       else if (map->choose_local_fallback_tries > 0 &&
> > +                                                flocal <= in->size + map->choose_local_fallback_tries)
> >                                                 /* exhaustive bucket search */
> >                                                 retry_bucket = 1;
> > -                                       else if (ftotal < 20)
> > +                                       else if (ftotal <= map->choose_total_tries)
> >                                                 /* then retry descent */
> >                                                 retry_descent = 1;
> >                                         else
> > diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> > index 9600674..3124b71 100644
> > --- a/net/ceph/osdmap.c
> > +++ b/net/ceph/osdmap.c
> > @@ -135,6 +135,21 @@ bad:
> >         return -EINVAL;
> >  }
> >
> > +static int skip_name_map(void **p, void *end)
> > +{
> > +        int len;
> > +        ceph_decode_32_safe(p, end, len ,bad);
> > +        while (len--) {
> > +                int strlen;
> Use u32 instead of int for strlen.
> 
> > +                *p += sizeof(u32);
> > +                ceph_decode_32_safe(p, end, strlen, bad);
> > +                *p += strlen;
> > +}
> > +        return 0;
> > +bad:
> > +        return -EINVAL;
> > +}
> > +
> >  static struct crush_map *crush_decode(void *pbyval, void *end)
> >  {
> >         struct crush_map *c;
> > @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
> >         void **p = &pbyval;
> >         void *start = pbyval;
> >         u32 magic;
> > +       u32 num_name_maps;
> >
> >         dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
> >
> > @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
> >         if (c == NULL)
> >                 return ERR_PTR(-ENOMEM);
> >
> > +        /* set tunables to default values */
> > +        c->choose_local_tries = 2;
> > +        c->choose_local_fallback_tries = 5;
> > +        c->choose_total_tries = 19;
> > +
> >         ceph_decode_need(p, end, 4*sizeof(u32), bad);
> >         magic = ceph_decode_32(p);
> >         if (magic != CRUSH_MAGIC) {
> > @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
> >         }
> >
> >         /* ignore trailing name maps. */
> > +        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
> > +                err = skip_name_map(p, end);
> > +                if (err < 0)
> > +                        goto done;
> > +        }
> > +
> > +        /* tunables */
> > +        ceph_decode_need(p, end, 3*sizeof(u32), done);
> > +        c->choose_local_tries = ceph_decode_32(p);
> > +        c->choose_local_fallback_tries =  ceph_decode_32(p);
> > +        c->choose_total_tries = ceph_decode_32(p);
> > +        dout("crush decode tunable choose_local_tries = %d",
> > +             c->choose_local_tries);
> > +        dout("crush decode tunable choose_local_fallback_tries = %d",
> > +             c->choose_local_fallback_tries);
> > +        dout("crush decode tunable choose_total_tries = %d",
> > +             c->choose_total_tries);
> >
> > +done:
> >         dout("crush_decode success\n");
> >         return c;
> >
> > --
> > 1.7.9
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> > the body of a message to majordomo@xxxxxxxxxxxxxxx
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux